From a73826dc1db20b7e9a7c51fa7bce04c12b52e5b9 Mon Sep 17 00:00:00 2001 From: wren <“porlong@qq.com”> Date: Sun, 17 May 2026 19:24:16 +0800 Subject: [PATCH] feat: integrate govdoc module into leaudit platform --- fastapi_admin/celery_app.py | 7 +- .../controllers/govdocController.py | 4 +- .../govdoc_bridge/result_adapter.py | 6 + .../fastapi_leaudit/govdoc_bridge/runner.py | 280 ++- .../govdoc_bridge/storage_adapter.py | 72 +- .../fastapi_leaudit/govdoc_bridge/tasks.py | 14 +- .../fastapi_leaudit/govdoc_engine/__init__.py | 52 +- .../govdoc_engine/llm/client.py | 25 +- .../fastapi_leaudit/govdoc_engine/pipeline.py | 21 +- .../models/govdocRuleResult.py | 1 + .../fastapi_leaudit/models/govdocRun.py | 1 + .../fastapi_leaudit/services/govdocService.py | 1 + .../services/impl/documentServiceImpl.py | 44 +- .../services/impl/govdocServiceImpl.py | 1528 +++++++++++++++-- rules/govdoc/govdoc_general/rules.yaml | 546 ++++++ scripts/创建sql/schema_add_govdoc_module.sql | 12 +- 16 files changed, 2334 insertions(+), 280 deletions(-) create mode 100644 rules/govdoc/govdoc_general/rules.yaml diff --git a/fastapi_admin/celery_app.py b/fastapi_admin/celery_app.py index 636dd50..615448b 100644 --- a/fastapi_admin/celery_app.py +++ b/fastapi_admin/celery_app.py @@ -33,7 +33,10 @@ celery_app = Celery( celery_app.conf.update( task_default_queue=LEAUDIT_WORKER_QUEUE_NORMAL, - imports=("fastapi_modules.fastapi_leaudit.leaudit_bridge.tasks",), + imports=( + "fastapi_modules.fastapi_leaudit.leaudit_bridge.tasks", + "fastapi_modules.fastapi_leaudit.govdoc_bridge.tasks", + ), task_queues=( Queue(LEAUDIT_WORKER_QUEUE_URGENT), Queue(LEAUDIT_WORKER_QUEUE_NORMAL), @@ -56,9 +59,11 @@ celery_app.conf.update( celery_app.autodiscover_tasks( [ "fastapi_modules.fastapi_leaudit.leaudit_bridge", + "fastapi_modules.fastapi_leaudit.govdoc_bridge", ], force=True, ) # 显式导入任务模块,避免 worker 在某些启动方式下漏注册 bridge tasks。 from fastapi_modules.fastapi_leaudit.leaudit_bridge import tasks as _leaudit_bridge_tasks # noqa: F401,E402 +from fastapi_modules.fastapi_leaudit.govdoc_bridge import tasks as _govdoc_bridge_tasks # noqa: F401,E402 diff --git a/fastapi_modules/fastapi_leaudit/controllers/govdocController.py b/fastapi_modules/fastapi_leaudit/controllers/govdocController.py index 0161258..75f645c 100644 --- a/fastapi_modules/fastapi_leaudit/controllers/govdocController.py +++ b/fastapi_modules/fastapi_leaudit/controllers/govdocController.py @@ -31,7 +31,7 @@ class GovdocController(BaseController): file: UploadFile = File(...), typeId: int | None = Form(default=None), region: str = Form(default="default"), - autoRun: bool = Form(default=False), + autoRun: bool = Form(default=True), speed: str = Form(default="normal"), ruleVersionId: int | None = Form(default=None), payload: dict[str, Any] = Depends(verify_access_token), @@ -56,6 +56,7 @@ class GovdocController(BaseController): page: int = Query(default=1, ge=1), pageSize: int = Query(default=20, ge=1, le=100), keyword: str | None = Query(default=None), + fileExt: str | None = Query(default=None), region: str | None = Query(default=None), status: str | None = Query(default=None), resultStatus: str | None = Query(default=None), @@ -72,6 +73,7 @@ class GovdocController(BaseController): page=page, pageSize=pageSize, keyword=keyword, + fileExt=fileExt, region=region, status=status, resultStatus=resultStatus, diff --git a/fastapi_modules/fastapi_leaudit/govdoc_bridge/result_adapter.py b/fastapi_modules/fastapi_leaudit/govdoc_bridge/result_adapter.py index 85b526d..1601877 100644 --- a/fastapi_modules/fastapi_leaudit/govdoc_bridge/result_adapter.py +++ b/fastapi_modules/fastapi_leaudit/govdoc_bridge/result_adapter.py @@ -26,6 +26,7 @@ class ResultAdapter: EngineResult: AuditResult, Structure: list[dict[str, Any]] | None = None, Outline: list[dict[str, Any]] | None = None, + Entities: list[dict[str, Any]] | None = None, ) -> dict[str, Any]: """从 AuditResult.summary 提取 run 汇总字段。 @@ -40,6 +41,10 @@ class ResultAdapter: aux["structure"] = Structure if Outline is not None: aux["outline"] = Outline + if Entities is not None: + aux["entities"] = { + entity["name"]: entity for entity in Entities if entity.get("name") + } return { "totalScore": s.score, @@ -100,6 +105,7 @@ class ResultAdapter: "primaryRole": entity.primary_role, "source": entity.source, "confidence": entity.confidence, + "extra": entity.extra, }) return entities diff --git a/fastapi_modules/fastapi_leaudit/govdoc_bridge/runner.py b/fastapi_modules/fastapi_leaudit/govdoc_bridge/runner.py index b75022d..c8ecdd6 100644 --- a/fastapi_modules/fastapi_leaudit/govdoc_bridge/runner.py +++ b/fastapi_modules/fastapi_leaudit/govdoc_bridge/runner.py @@ -9,14 +9,26 @@ from __future__ import annotations +import hashlib +import shutil +import tempfile from dataclasses import dataclass, field +from pathlib import Path from typing import Any +from sqlalchemy import text + from fastapi_common.fastapi_common_logger import logger +from fastapi_common.fastapi_common_sqlalchemy.database import GetAsyncSession +from fastapi_common.fastapi_common_storage.oss_path_utils import OssPathUtils from fastapi_modules.fastapi_leaudit.govdoc_bridge.input_resolver import InputResolver -from fastapi_modules.fastapi_leaudit.govdoc_bridge.result_adapter import ResultAdapter from fastapi_modules.fastapi_leaudit.govdoc_bridge.storage_adapter import StorageAdapter +from fastapi_modules.fastapi_leaudit.govdoc_engine.parser.docx_parser import parse_docx +from fastapi_modules.fastapi_leaudit.govdoc_engine.reporter.docx_annotator import annotate_docx +from fastapi_modules.fastapi_leaudit.govdoc_engine.reporter.html_paragraph import paragraphs_to_html +from fastapi_modules.fastapi_leaudit.govdoc_engine.reporter.html_renderer import render_html +from fastapi_modules.fastapi_leaudit.services.impl.ossServiceImpl import OssServiceImpl log = logger @@ -33,13 +45,30 @@ class GovdocRunner: InputResolver: InputResolver = field(default_factory=InputResolver) Storage: StorageAdapter = field(default_factory=StorageAdapter) - ResultAdapter: ResultAdapter = field(default_factory=ResultAdapter) + OssService: OssServiceImpl = field(default_factory=OssServiceImpl) + ResultAdapter: Any | None = None + + def ResolveRulesPath(self, RulesPath: str | None) -> str: + """解析并校验执行所需规则文件路径。""" + candidate = (RulesPath or "").strip() + if not candidate: + raise ValueError("未提供 govdoc rules_path,当前任务无法执行规则审查") + + path = Path(candidate).expanduser() + if not path.is_absolute(): + path = Path.cwd() / path + path = path.resolve() + + if not path.is_file(): + raise FileNotFoundError(f"govdoc 规则文件不存在: {path}") + + return str(path) async def Execute( self, DocumentId: int, RunId: int, - RulesPath: str, + RulesPath: str | None = None, TriggerUserId: int | None = None, Speed: str = "normal", ) -> dict[str, Any]: @@ -54,57 +83,216 @@ class GovdocRunner: Returns: 执行摘要 dict。 """ + resolvedRulesPath = self.ResolveRulesPath(RulesPath) log.info(f"[Govdoc] Starting execution: runId={RunId}, documentId={DocumentId}") + artifactTempDir: str | None = None + inputPayload = None + try: + # 1. 更新 run 状态 → processing + await self.Storage.UpdateRunStatus(RunId, "processing", Phase="parsing") + await self.Storage.UpdateDocumentStatus(DocumentId, "processing", RunId) - # 1. 更新 run 状态 → processing - await self.Storage.UpdateRunStatus(RunId, "processing", phase="parsing") - await self.Storage.UpdateDocumentStatus(DocumentId, "processing", RunId) + # 2. 解析输入文件 + inputPayload = await self.InputResolver.ResolveForDocument(DocumentId) + log.info(f"[Govdoc] Input resolved: {inputPayload.fileName} → {inputPayload.localPath}") - # 2. 解析输入文件 - inputPayload = await self.InputResolver.ResolveForDocument(DocumentId) - log.info(f"[Govdoc] Input resolved: {inputPayload.fileName} → {inputPayload.localPath}") + # 3. 调用 govdoc_engine 执行审查 + from fastapi_modules.fastapi_leaudit.govdoc_bridge.result_adapter import ResultAdapter + from fastapi_modules.fastapi_leaudit.govdoc_engine.pipeline import run as engine_run - # 3. 调用 govdoc_engine 执行审查 - from fastapi_modules.fastapi_leaudit.govdoc_engine.pipeline import run as engine_run + if self.ResultAdapter is None: + self.ResultAdapter = ResultAdapter() - engineResult = await engine_run( - file_path=inputPayload.localPath, - rules_path=RulesPath, - llm_client=None, # 使用默认 LlmClient (从平台配置加载) + engineResult = await engine_run( + file_path=inputPayload.localPath, + rules_path=resolvedRulesPath, + llm_client=None, # 使用默认 LlmClient (从平台配置加载) + ) + engineResult.document["filename"] = inputPayload.fileName + + # 4. 适配引擎结果 + structure = self.ResultAdapter.AdaptStructure(engineResult) + outline = self.ResultAdapter.AdaptOutline(engineResult) + entities = self.ResultAdapter.AdaptEntities(engineResult) + runSummary = self.ResultAdapter.AdaptRunSummary( + engineResult, + Structure=structure, + Outline=outline, + Entities=entities, + ) + ruleResults = self.ResultAdapter.AdaptRuleResults(engineResult) + checkedRuleResults = self.ResultAdapter.AdaptCheckedRules(engineResult) + artifactTempDir, artifacts = await self._GenerateArtifacts( + DocumentId=DocumentId, + RunId=RunId, + InputPath=inputPayload.localPath, + InputFileName=inputPayload.fileName, + EngineResult=engineResult, + RuleResults=ruleResults, + ) + + failedRuleIds = {str(row.get("ruleId") or "") for row in ruleResults} + for checkedRule in checkedRuleResults: + ruleId = str(checkedRule.get("ruleId") or "") + if checkedRule.get("result") == "fail" and ruleId in failedRuleIds: + continue + ruleResults.append(checkedRule) + + # 将 rules_path 附带到 runSummary 中,供 GetRuleDetail 后续解析 + runSummary["rulesPath"] = resolvedRulesPath + + # 5. 持久化结果 + await self.Storage.UpdateRunResult(RunId, runSummary) + await self.Storage.SaveRuleResults(RunId, ruleResults) + await self.Storage.SaveArtifacts(RunId, artifacts) + + # 6. 更新终态 + await self.Storage.UpdateRunStatus(RunId, "completed", Phase="reporting") + await self.Storage.UpdateDocumentStatus(DocumentId, "completed", RunId) + + log.info(f"[Govdoc] Execution completed: runId={RunId}") + + return { + "runId": RunId, + "documentId": DocumentId, + "status": "completed", + "ruleResultsCount": len(ruleResults), + "structureCount": len(structure), + "outlineCount": len(outline), + "artifactCount": len(artifacts), + } + finally: + if artifactTempDir: + shutil.rmtree(artifactTempDir, ignore_errors=True) + if inputPayload and inputPayload.tempDir: + shutil.rmtree(inputPayload.tempDir, ignore_errors=True) + + async def _GenerateArtifacts( + self, + DocumentId: int, + RunId: int, + InputPath: str, + InputFileName: str, + EngineResult: Any, + RuleResults: list[dict[str, Any]], + ) -> tuple[str, list[dict[str, Any]]]: + """生成报告产物并上传到 OSS。""" + artifactDir = tempfile.mkdtemp(prefix=f"govdoc_artifacts_{RunId}_") + sourcePath = Path(InputPath) + baseName = OssPathUtils.BuildSafeFileStem(InputFileName) + region = await self._GetDocumentRegion(DocumentId) + + annotatedPath = Path(artifactDir) / f"{baseName}.annotated.docx" + annotate_docx(sourcePath, annotatedPath, EngineResult) + + htmlReport = render_html(EngineResult) + + doc = parse_docx(sourcePath) + findingMap: dict[int, list[str]] = {} + for row in RuleResults: + if row.get("result") != "fail": + continue + paragraphIndex = row.get("paragraphIndex") + if paragraphIndex is None: + continue + findingId = f"{row.get('ruleId') or 'finding'}-{paragraphIndex}" + findingMap.setdefault(int(paragraphIndex), []).append(findingId) + paragraphsHtml = paragraphs_to_html(doc, findingMap) + + annotatedUrl = await self.OssService.UploadBytes( + ObjectKey=OssPathUtils.BuildArtifactKey( + Region=region, + RunId=RunId, + ArtifactType="annotated_docx", + Detail=f"{baseName}.annotated.docx", + ), + Content=annotatedPath.read_bytes(), + ContentType="application/vnd.openxmlformats-officedocument.wordprocessingml.document", + ) + htmlUrl = await self.OssService.UploadText( + ObjectKey=OssPathUtils.BuildArtifactKey( + Region=region, + RunId=RunId, + ArtifactType="html_report", + Detail=f"{baseName}.report.html", + ), + Content=htmlReport, + ContentType="text/html; charset=utf-8", + ) + paragraphUrl = await self.OssService.UploadText( + ObjectKey=OssPathUtils.BuildArtifactKey( + Region=region, + RunId=RunId, + ArtifactType="paragraph_html", + Detail=f"{baseName}.paragraphs.html", + ), + Content=paragraphsHtml, + ContentType="text/html; charset=utf-8", ) - # 4. 适配引擎结果 - structure = self.ResultAdapter.AdaptStructure(engineResult) - outline = self.ResultAdapter.AdaptOutline(engineResult) - runSummary = self.ResultAdapter.AdaptRunSummary( - engineResult, - Structure=structure, - Outline=outline, - ) - ruleResults = self.ResultAdapter.AdaptRuleResults(engineResult) - entities = self.ResultAdapter.AdaptEntities(engineResult) - artifacts = self.ResultAdapter.AdaptArtifacts(engineResult, RunId) + return artifactDir, [ + self._BuildArtifactRow( + artifactType="annotated_docx", + fileName=f"{baseName}.annotated.docx", + mimeType="application/vnd.openxmlformats-officedocument.wordprocessingml.document", + content=annotatedPath.read_bytes(), + ossUrl=annotatedUrl, + description="批注 DOCX", + ), + self._BuildArtifactRow( + artifactType="html_report", + fileName=f"{baseName}.report.html", + mimeType="text/html; charset=utf-8", + content=htmlReport.encode("utf-8"), + ossUrl=htmlUrl, + description="HTML 审查报告", + ), + self._BuildArtifactRow( + artifactType="paragraph_html", + fileName=f"{baseName}.paragraphs.html", + mimeType="text/html; charset=utf-8", + content=paragraphsHtml.encode("utf-8"), + ossUrl=paragraphUrl, + description="段落联动视图", + ), + ] - # 将 rules_path 附带到 runSummary 中,供 GetRuleDetail 后续解析 - runSummary["rulesPath"] = RulesPath - - # 5. 持久化结果 - await self.Storage.UpdateRunResult(RunId, runSummary) - await self.Storage.SaveRuleResults(RunId, ruleResults) - await self.Storage.SaveArtifacts(RunId, artifacts) - - # 6. 更新终态 - await self.Storage.UpdateRunStatus(RunId, "completed", phase="reporting") - await self.Storage.UpdateDocumentStatus(DocumentId, "completed", RunId) - - log.info(f"[Govdoc] Execution completed: runId={RunId}") + async def _GetDocumentRegion(self, DocumentId: int) -> str: + async with GetAsyncSession() as session: + row = ( + await session.execute( + text( + """ + SELECT COALESCE(region, 'default') AS region + FROM leaudit_documents + WHERE id = :document_id + LIMIT 1 + """ + ), + {"document_id": DocumentId}, + ) + ).mappings().first() + return str(row["region"] or "default") if row else "default" + def _BuildArtifactRow( + self, + *, + artifactType: str, + fileName: str, + mimeType: str, + content: bytes, + ossUrl: str, + description: str, + ) -> dict[str, Any]: + fileExt = Path(fileName).suffix.lstrip(".").lower() return { - "runId": RunId, - "documentId": DocumentId, - "status": "completed", - "ruleResultsCount": len(ruleResults), - "structureCount": len(structure), - "outlineCount": len(outline), - "artifactCount": len(artifacts), + "artifactType": artifactType, + "fileName": fileName, + "fileExt": fileExt, + "mimeType": mimeType, + "fileSize": len(content), + "sha256": hashlib.sha256(content).hexdigest(), + "ossUrl": ossUrl, + "storageProvider": "minio", + "description": description, } diff --git a/fastapi_modules/fastapi_leaudit/govdoc_bridge/storage_adapter.py b/fastapi_modules/fastapi_leaudit/govdoc_bridge/storage_adapter.py index b0d716e..0d52001 100644 --- a/fastapi_modules/fastapi_leaudit/govdoc_bridge/storage_adapter.py +++ b/fastapi_modules/fastapi_leaudit/govdoc_bridge/storage_adapter.py @@ -6,6 +6,7 @@ govdoc_report_artifacts 表,并更新 leaudit_documents 状态。 from __future__ import annotations +import json from datetime import datetime, timezone from typing import Any @@ -16,6 +17,33 @@ from sqlalchemy import text log = logger +_RUN_RESULT_FALLBACK_KEYS = { + "total_score": ["totalScore", "score"], + "passed_count": ["passedCount", "passed_count"], + "failed_count": ["failedCount", "failed_count"], + "skipped_count": ["skippedCount", "skipped_count"], + "result_status": ["resultStatus", "result_status"], + "result_summary_json": ["resultSummaryJson", "result_summary_json"], +} + + +def _pick_value(payload: dict[str, Any], *keys: str) -> Any: + for key in keys: + if key in payload: + return payload[key] + return None + + +def _to_text_payload(value: Any) -> str | None: + if value is None: + return None + if isinstance(value, str): + return value + if isinstance(value, (dict, list, tuple, bool, int, float)): + return json.dumps(value, ensure_ascii=False) + return str(value) + + class StorageAdapter: """Govdoc 结果持久化适配器。 @@ -51,11 +79,23 @@ class StorageAdapter: log.info(f"[Govdoc] Run created: runId={run_id}, documentId={RunData['documentId']}") return run_id - async def UpdateRunStatus(self, RunId: int, Status: str, Phase: str | None = None, **Extra: Any) -> None: - """更新 run 状态和阶段。""" + async def UpdateRunStatus( + self, + RunId: int, + Status: str, + Phase: str | None = None, + **Extra: Any, + ) -> None: + """更新 run 状态和阶段。 + + 兼容调用方传 ``phase=`` 或 ``Phase=``,避免因大小写不一致导致阶段不落库。 + """ set_clauses = ["status = :status", "updated_at = now()"] params: dict[str, Any] = {"rid": RunId, "status": Status} + if Phase is None: + Phase = _pick_value(Extra, "phase", "Phase") + if Phase is not None: set_clauses.append("phase = :phase") params["phase"] = Phase @@ -63,6 +103,9 @@ class StorageAdapter: if Status == "completed" or Status == "failed": set_clauses.append("finished_at = :finished_at") params["finished_at"] = datetime.now(timezone.utc) + elif Status == "processing": + set_clauses.append("started_at = COALESCE(started_at, :started_at)") + params["started_at"] = datetime.now(timezone.utc) async with GetAsyncSession() as session: await session.execute( @@ -74,7 +117,7 @@ class StorageAdapter: async def UpdateRunResult(self, RunId: int, Summary: dict[str, Any]) -> None: """写入 run 结果汇总字段(含 rules_path / structure / outline)。""" - rules_path = Summary.get("rulesPath") + rules_path = _pick_value(Summary, "rulesPath", "rules_path") set_clauses = [ "total_score = :total_score", "passed_count = :passed_count", @@ -86,12 +129,12 @@ class StorageAdapter: ] params: dict[str, Any] = { "rid": RunId, - "total_score": Summary.get("totalScore"), - "passed_count": Summary.get("passedCount", 0), - "failed_count": Summary.get("failedCount", 0), - "skipped_count": Summary.get("skippedCount", 0), - "result_status": Summary.get("resultStatus"), - "result_summary_json": Summary.get("resultSummaryJson"), + "total_score": _pick_value(Summary, *_RUN_RESULT_FALLBACK_KEYS["total_score"]), + "passed_count": _pick_value(Summary, *_RUN_RESULT_FALLBACK_KEYS["passed_count"]) or 0, + "failed_count": _pick_value(Summary, *_RUN_RESULT_FALLBACK_KEYS["failed_count"]) or 0, + "skipped_count": _pick_value(Summary, *_RUN_RESULT_FALLBACK_KEYS["skipped_count"]) or 0, + "result_status": _pick_value(Summary, *_RUN_RESULT_FALLBACK_KEYS["result_status"]), + "result_summary_json": _pick_value(Summary, *_RUN_RESULT_FALLBACK_KEYS["result_summary_json"]), } if rules_path: set_clauses.append("rules_path = :rules_path") @@ -137,11 +180,11 @@ class StorageAdapter: (run_id, rule_id, rule_name, severity, category, message, suggestion, actual, expected, evidence, paragraph_index, paragraph_text, location_path, - result, score, created_at, updated_at) + result, skip_reason, score, created_at, updated_at) VALUES (:run_id, :rule_id, :rule_name, :severity, :category, :message, :suggestion, :actual, :expected, :evidence, :paragraph_index, :paragraph_text, :location_path, - :result, :score, now(), now())""" + :result, :skip_reason, :score, now(), now())""" ), { "run_id": RunId, @@ -151,13 +194,14 @@ class StorageAdapter: "category": row.get("category"), "message": row.get("message"), "suggestion": row.get("suggestion"), - "actual": row.get("actual"), - "expected": row.get("expected"), - "evidence": row.get("evidence"), + "actual": _to_text_payload(row.get("actual")), + "expected": _to_text_payload(row.get("expected")), + "evidence": _to_text_payload(row.get("evidence")), "paragraph_index": row.get("paragraphIndex"), "paragraph_text": row.get("paragraphText"), "location_path": row.get("locationPath"), "result": row.get("result", "pass"), + "skip_reason": _pick_value(row, "skipReason", "skip_reason"), "score": row.get("score"), }, ) diff --git a/fastapi_modules/fastapi_leaudit/govdoc_bridge/tasks.py b/fastapi_modules/fastapi_leaudit/govdoc_bridge/tasks.py index be4bc37..2f9634c 100644 --- a/fastapi_modules/fastapi_leaudit/govdoc_bridge/tasks.py +++ b/fastapi_modules/fastapi_leaudit/govdoc_bridge/tasks.py @@ -11,13 +11,17 @@ from typing import Any from fastapi_common.fastapi_common_logger import logger from fastapi_admin.celery_app import celery_app +from fastapi_admin.config import ( + LEAUDIT_WORKER_QUEUE_NORMAL, + LEAUDIT_WORKER_QUEUE_URGENT, +) from fastapi_modules.fastapi_leaudit.govdoc_bridge.runner import GovdocRunner from fastapi_modules.fastapi_leaudit.govdoc_bridge.storage_adapter import StorageAdapter log = logger -GOVDOC_WORKER_QUEUE = "govdoc" -GOVDOC_WORKER_QUEUE_URGENT = "govdoc_urgent" +GOVDOC_WORKER_QUEUE = LEAUDIT_WORKER_QUEUE_NORMAL +GOVDOC_WORKER_QUEUE_URGENT = LEAUDIT_WORKER_QUEUE_URGENT def resolve_govdoc_queue(speed: str = "normal") -> str: @@ -30,6 +34,7 @@ def resolve_govdoc_queue(speed: str = "normal") -> str: def dispatch_govdoc_task( documentId: int, runId: int, + rulesPath: str | None = None, triggerUserId: int | None = None, speed: str = "normal", ) -> Any: @@ -52,6 +57,7 @@ def dispatch_govdoc_task( kwargs={ "documentId": documentId, "runId": runId, + "rulesPath": rulesPath, "triggerUserId": triggerUserId, "speed": speed, }, @@ -73,6 +79,7 @@ def govdoc_execute_task( self, documentId: int, runId: int, + rulesPath: str | None = None, triggerUserId: int | None = None, speed: str = "normal", ) -> dict[str, Any]: @@ -89,7 +96,7 @@ def govdoc_execute_task( try: # 更新 run 状态 → running - loop.run_until_complete(storage.UpdateRunStatus(runId, "processing", phase="parsing")) + loop.run_until_complete(storage.UpdateRunStatus(runId, "processing", Phase="parsing")) # 执行完整审查链路 runner = GovdocRunner() @@ -97,6 +104,7 @@ def govdoc_execute_task( runner.Execute( DocumentId=documentId, RunId=runId, + RulesPath=rulesPath, TriggerUserId=triggerUserId, Speed=speed, ) diff --git a/fastapi_modules/fastapi_leaudit/govdoc_engine/__init__.py b/fastapi_modules/fastapi_leaudit/govdoc_engine/__init__.py index 287cc8d..854e2a9 100644 --- a/fastapi_modules/fastapi_leaudit/govdoc_engine/__init__.py +++ b/fastapi_modules/fastapi_leaudit/govdoc_engine/__init__.py @@ -1,30 +1,12 @@ """Govdoc 公文格式审查引擎内核。 -从旧 govdoc-audit 项目裁剪迁入,去除独立 API 层、SQLite 存储层、 -本地运行记录器 (RunRecorder) 和旧配置系统。 - -导出: - - pipeline.run() — 异步审查入口 (bridge 层主调用) - - pipeline.audit_file() — 同步审查入口 (兼容) - - models — 核心数据模型 (Pydantic) - - parser — 文档解析与实体抽取 - - dsl — YAML 规则 DSL 定义与加载 - - engine — 规则执行引擎与结果模型 - - reporter — 报告生成 (HTML/DOCX/JSON) - - llm — LLM 客户端 (OpenAI 兼容协议) +保持包级导入轻量,避免在控制器注册阶段提前拉起 LLM/OpenAI 依赖。 +真正执行审查时再按需导入 pipeline / result 模块。 """ from __future__ import annotations -from fastapi_modules.fastapi_leaudit.govdoc_engine.pipeline import ( - audit_file, - run, -) -from fastapi_modules.fastapi_leaudit.govdoc_engine.engine.result import ( - AuditResult, - AuditSummary, - CheckedRule, -) +from typing import Any __all__ = [ "audit_file", @@ -33,3 +15,31 @@ __all__ = [ "AuditSummary", "CheckedRule", ] + + +def audit_file(*args: Any, **kwargs: Any): + from fastapi_modules.fastapi_leaudit.govdoc_engine.pipeline import audit_file as _audit_file + + return _audit_file(*args, **kwargs) + + +async def run(*args: Any, **kwargs: Any): + from fastapi_modules.fastapi_leaudit.govdoc_engine.pipeline import run as _run + + return await _run(*args, **kwargs) + + +def __getattr__(name: str): + if name in {"AuditResult", "AuditSummary", "CheckedRule"}: + from fastapi_modules.fastapi_leaudit.govdoc_engine.engine.result import ( + AuditResult, + AuditSummary, + CheckedRule, + ) + + return { + "AuditResult": AuditResult, + "AuditSummary": AuditSummary, + "CheckedRule": CheckedRule, + }[name] + raise AttributeError(name) diff --git a/fastapi_modules/fastapi_leaudit/govdoc_engine/llm/client.py b/fastapi_modules/fastapi_leaudit/govdoc_engine/llm/client.py index 0dd379d..6044a00 100644 --- a/fastapi_modules/fastapi_leaudit/govdoc_engine/llm/client.py +++ b/fastapi_modules/fastapi_leaudit/govdoc_engine/llm/client.py @@ -11,7 +11,22 @@ import re import time from typing import Any -from openai import AsyncOpenAI, OpenAI, APIError, APIConnectionError, RateLimitError +try: + from openai import AsyncOpenAI, OpenAI, APIError, APIConnectionError, RateLimitError + _OPENAI_IMPORT_ERROR: Exception | None = None +except ModuleNotFoundError as exc: # pragma: no cover - optional dependency + AsyncOpenAI = None # type: ignore[assignment] + OpenAI = None # type: ignore[assignment] + _OPENAI_IMPORT_ERROR = exc + + class APIError(Exception): + status_code: int | None = None + + class APIConnectionError(Exception): + pass + + class RateLimitError(Exception): + pass from fastapi_admin.config import ( LLM_API_KEY, @@ -125,7 +140,13 @@ class LlmClient: ): key = api_key or LLM_API_KEY self._misconfigured_error: LlmConfigError | None = None - if not key: + if OpenAI is None or AsyncOpenAI is None: + self._client = None + self._aclient = None + self._misconfigured_error = LlmConfigError( + "python package 'openai' is not installed; govdoc LLM features are unavailable." + ) + elif not key: self._client = None self._aclient = None self._misconfigured_error = LlmConfigError( diff --git a/fastapi_modules/fastapi_leaudit/govdoc_engine/pipeline.py b/fastapi_modules/fastapi_leaudit/govdoc_engine/pipeline.py index 12af0c1..3c492f5 100644 --- a/fastapi_modules/fastapi_leaudit/govdoc_engine/pipeline.py +++ b/fastapi_modules/fastapi_leaudit/govdoc_engine/pipeline.py @@ -130,12 +130,12 @@ def _merge_llm_into_entities( # ── 实体构建 (同步,供 sync 入口使用) ────────────────── def _build_entities( - doc, ruleset: RuleSet, llm: LlmClient, + doc, ruleset: RuleSet, llm: LlmClient | None, ) -> dict[str, SemanticEntity | None]: """构建实体 + 差量 LLM 抽取(同步)。""" entities = EntityBuilder().build(doc) spec = _compute_missing_spec(entities, ruleset.extract.entities) - if spec: + if spec and llm is not None: llm_vals = FieldExtractor(llm).extract_missing(doc, spec) _merge_llm_into_entities(entities, llm_vals) return entities @@ -144,12 +144,12 @@ def _build_entities( # ── 实体构建 (异步,供 async 入口使用) ────────────────── async def _build_entities_async( - doc, ruleset: RuleSet, llm: LlmClient, + doc, ruleset: RuleSet, llm: LlmClient | None, ) -> dict[str, SemanticEntity | None]: """构建实体 + 差量 LLM 抽取(异步)。""" entities = EntityBuilder().build(doc) spec = _compute_missing_spec(entities, ruleset.extract.entities) - if spec: + if spec and llm is not None: llm_vals = await FieldExtractor(llm).extract_missing_async(doc, spec) _merge_llm_into_entities(entities, llm_vals) return entities @@ -174,7 +174,7 @@ def audit_file( """ docx_path = Path(docx_path) rules_path = Path(rules_path) - llm = llm_client or LlmClient() + llm = llm_client doc = parse_docx(docx_path) RoleTagger(llm_client=llm).tag(doc) @@ -210,7 +210,7 @@ async def run( """ file_path = Path(file_path) rules_path = Path(rules_path) - llm = llm_client or LlmClient() + llm = llm_client _log.info("Govdoc pipeline start: %s", file_path.name) @@ -219,18 +219,21 @@ async def run( _log.info(" parsed: %d paragraphs", len(doc.paragraphs)) # 2. 段落角色标注 - RoleTagger(llm_client=llm).tag(doc) + if llm is not None: + await RoleTagger(llm_client=llm).tag_async(doc) + else: + RoleTagger(llm_client=None).tag(doc) # 3. 加载规则 ruleset = load_rules(rules_path) - _log.info(" rules: %d groups, %d rules", len(ruleset.groups), len(ruleset.all_rules())) + _log.info(" rules: %d groups, %d rules", len(ruleset.rules), len(ruleset.all_rules())) # 4. 实体抽取 (含差量 LLM) entities = await _build_entities_async(doc, ruleset, llm) _log.info(" entities: %d/%d resolved", sum(1 for v in entities.values() if v), len(entities)) # 5. 规则评估 - findings, outcomes = RuleRunner(llm_client=llm).evaluate( + findings, outcomes = await RuleRunner(llm_client=llm).evaluate_async( ruleset.all_rules(), doc, entities ) _log.info(" evaluated: %d findings from %d rules", len(findings), len(outcomes)) diff --git a/fastapi_modules/fastapi_leaudit/models/govdocRuleResult.py b/fastapi_modules/fastapi_leaudit/models/govdocRuleResult.py index 1468df2..1abc6c8 100644 --- a/fastapi_modules/fastapi_leaudit/models/govdocRuleResult.py +++ b/fastapi_modules/fastapi_leaudit/models/govdocRuleResult.py @@ -36,4 +36,5 @@ class GovdocRuleResult(BaseModel): # 判定 result: Mapped[str] = mapped_column("result", String(32), default="pass", comment="执行结果:pass/fail/skipped/error") + skipReason: Mapped[str | None] = mapped_column("skip_reason", Text, comment="跳过原因,仅 skipped/error 时使用") score: Mapped[float | None] = mapped_column("score", Numeric(10, 2), comment="本条得分") diff --git a/fastapi_modules/fastapi_leaudit/models/govdocRun.py b/fastapi_modules/fastapi_leaudit/models/govdocRun.py index 59ca654..a1f71a2 100644 --- a/fastapi_modules/fastapi_leaudit/models/govdocRun.py +++ b/fastapi_modules/fastapi_leaudit/models/govdocRun.py @@ -31,6 +31,7 @@ class GovdocRun(BaseModel): engineVersion: Mapped[str | None] = mapped_column("engine_version", String(64), comment="引擎版本号") llmProvider: Mapped[str | None] = mapped_column("llm_provider", String(64), comment="LLM 提供商") llmModel: Mapped[str | None] = mapped_column("llm_model", String(128), comment="LLM 模型名") + rulesPath: Mapped[str | None] = mapped_column("rules_path", String(1024), comment="本次运行使用的规则文件路径") # 结果汇总 totalScore: Mapped[float | None] = mapped_column("total_score", Numeric(10, 2), comment="总分") diff --git a/fastapi_modules/fastapi_leaudit/services/govdocService.py b/fastapi_modules/fastapi_leaudit/services/govdocService.py index cca7f6d..6084de1 100644 --- a/fastapi_modules/fastapi_leaudit/services/govdocService.py +++ b/fastapi_modules/fastapi_leaudit/services/govdocService.py @@ -31,6 +31,7 @@ class IGovdocService(ABC): page: int = 1, pageSize: int = 20, keyword: str | None = None, + fileExt: str | None = None, region: str | None = None, status: str | None = None, resultStatus: str | None = None, diff --git a/fastapi_modules/fastapi_leaudit/services/impl/documentServiceImpl.py b/fastapi_modules/fastapi_leaudit/services/impl/documentServiceImpl.py index faf4ee4..9be7f12 100644 --- a/fastapi_modules/fastapi_leaudit/services/impl/documentServiceImpl.py +++ b/fastapi_modules/fastapi_leaudit/services/impl/documentServiceImpl.py @@ -163,6 +163,7 @@ class DocumentServiceImpl(IDocumentService): root_group_id=resolvedRootGroupId, region=normalizedRegion, normalized_name=normalizedName, + file_ext=fileExt, ) internalDocumentNo = time.time_ns() @@ -2712,16 +2713,25 @@ async def _find_latest_version_candidate( root_group_id: int | None, region: str, normalized_name: str, + file_ext: str | None = None, ) -> dict | None: - """Find the latest primary document version candidate by normalized name. + """Find the latest primary document version candidate by normalized name + extension.""" + ext_clause = "" + ext_params: dict[str, object] = {} + if file_ext: + ext_clause = " AND f.file_ext = :file_ext" + ext_params["file_ext"] = file_ext - Preferred rule: same region + same root group + same normalized name. - Fallback rule: when a root group cannot be resolved, keep the old same-type behavior. - """ if root_group_id is not None: + params: dict[str, object] = { + "root_group_id": root_group_id, + "region": region, + "normalized_name": normalized_name, + **ext_params, + } result = await session.execute( text( - """ + f""" SELECT d.id AS document_id, d.version_group_key, @@ -2751,7 +2761,7 @@ async def _find_latest_version_candidate( WHERE d.region = :region AND d.normalized_name = :normalized_name AND d.is_latest_version = true - AND d.deleted_at IS NULL + AND d.deleted_at IS NULL{ext_clause} AND COALESCE( CASE WHEN eg.id IS NULL THEN NULL @@ -2764,19 +2774,21 @@ async def _find_latest_version_candidate( LIMIT 1 """ ), - { - "root_group_id": root_group_id, - "region": region, - "normalized_name": normalized_name, - }, + params, ) row = result.mappings().first() if row: return dict(row) + params = { + "type_id": type_id, + "region": region, + "normalized_name": normalized_name, + **ext_params, + } result = await session.execute( text( - """ + f""" SELECT d.id AS document_id, d.version_group_key, @@ -2791,18 +2803,14 @@ async def _find_latest_version_candidate( AND f.file_role = 'primary' WHERE d.type_id = :type_id AND d.region = :region - AND d.normalized_name = :normalized_name + AND d.normalized_name = :normalized_name{ext_clause} AND d.is_latest_version = true AND d.deleted_at IS NULL ORDER BY d.version_no DESC, d.id DESC LIMIT 1 """ ), - { - "type_id": type_id, - "region": region, - "normalized_name": normalized_name, - }, + params, ) row = result.mappings().first() return dict(row) if row else None diff --git a/fastapi_modules/fastapi_leaudit/services/impl/govdocServiceImpl.py b/fastapi_modules/fastapi_leaudit/services/impl/govdocServiceImpl.py index 065479f..2e658e4 100644 --- a/fastapi_modules/fastapi_leaudit/services/impl/govdocServiceImpl.py +++ b/fastapi_modules/fastapi_leaudit/services/impl/govdocServiceImpl.py @@ -1,26 +1,65 @@ -"""Govdoc 公文模块服务实现(阶段骨架)。 - -本文件为 Phase 1 骨架实现,所有方法暂返回占位结果。 -后续步骤将逐步接入: - - govdoc_bridge 执行桥接 - - govdoc_engine 引擎内核 - - 文档主档复用 - - OSS / Celery 集成 -""" +"""Govdoc 公文模块服务实现。""" from __future__ import annotations +import hashlib +import json +import mimetypes +import time +from dataclasses import dataclass +from datetime import datetime +from pathlib import Path from typing import Any from fastapi import UploadFile +from sqlalchemy import text from fastapi_common.fastapi_common_logger import logger -from fastapi_modules.fastapi_leaudit.services import IGovdocService +from fastapi_common.fastapi_common_sqlalchemy.database import GetAsyncSession +from fastapi_common.fastapi_common_storage.oss_path_utils import OssPathUtils +from fastapi_common.fastapi_common_web.domain.responses import StatusCodeEnum +from fastapi_common.fastapi_common_web.exception.LeauditException import LeauditException + +from fastapi_modules.fastapi_leaudit.govdoc_bridge.storage_adapter import StorageAdapter +from fastapi_modules.fastapi_leaudit.govdoc_bridge.tasks import dispatch_govdoc_task +from fastapi_modules.fastapi_leaudit.govdoc_engine.parser.docx_parser import parse_docx +from fastapi_modules.fastapi_leaudit.govdoc_engine.reporter.html_paragraph import paragraphs_to_html +from fastapi_modules.fastapi_leaudit.models import LeauditDocument, LeauditDocumentFile +from fastapi_modules.fastapi_leaudit.services import IGovdocService, IOssService +from fastapi_modules.fastapi_leaudit.services.impl.ossServiceImpl import OssServiceImpl + + +@dataclass(frozen=True) +class _GovdocDocumentRow: + documentId: int + region: str + processingStatus: str + currentRunId: int | None + createdAt: Any + updatedAt: Any + fileId: int + fileName: str + fileExt: str | None + mimeType: str | None + fileSize: int | None + ossUrl: str | None + createdBy: int | None + resultStatus: str | None + totalScore: float | None + passedCount: int | None + failedCount: int | None + skippedCount: int | None + hasHtmlReport: bool + hasDocxReport: bool class GovdocServiceImpl(IGovdocService): """公文处理与格式审查服务实现。""" + def __init__(self, OssService: IOssService | None = None) -> None: + self.OssService = OssService or OssServiceImpl() + self.Storage = StorageAdapter() + # ── 文档 ────────────────────────────────────────────── async def UploadDocument( @@ -28,19 +67,126 @@ class GovdocServiceImpl(IGovdocService): file: UploadFile, typeId: int | None = None, region: str = "default", - autoRun: bool = False, + autoRun: bool = True, speed: str = "normal", ruleVersionId: int | None = None, createdBy: int | None = None, ) -> dict[str, Any]: - logger.info("[Govdoc] UploadDocument placeholder — file=%s region=%s", file.filename, region) + if createdBy is None: + raise LeauditException(StatusCodeEnum.HTTP_401_UNAUTHORIZED, "当前用户未登录") + if file is None or not file.filename: + raise LeauditException(StatusCodeEnum.HTTP_400_BAD_REQUEST, "上传文件不能为空") + + content = await file.read() + if not content: + raise LeauditException(StatusCodeEnum.HTTP_400_BAD_REQUEST, "上传文件内容不能为空") + + normalizedRegion = (region or "default").strip() or "default" + fileName = file.filename + fileExt = Path(fileName).suffix.lstrip(".").lower() or None + if fileExt != "docx": + raise LeauditException( + StatusCodeEnum.HTTP_400_BAD_REQUEST, + "当前内部公文模块仅支持上传 DOCX 文件", + ) + mimeType = file.content_type or mimetypes.guess_type(fileName)[0] or "application/octet-stream" + fileSha256 = hashlib.sha256(content).hexdigest() + uploadedAt = datetime.now() + normalizedName = self._normalize_document_name(fileName) + + async with GetAsyncSession() as session: + await self._ensureGovdocSchema(session) + currentUser = await self._getCurrentUserContext(createdBy) + resolvedRegion = self._resolve_upload_region(currentUser, normalizedRegion) + + document = await LeauditDocument.create_new( + session, + bizDocumentId=time.time_ns(), + typeId=typeId, + groupId=None, + region=resolvedRegion, + processingStatus="waiting", + currentRunId=None, + versionGroupKey=None, + versionNo=1, + previousVersionId=None, + rootVersionId=None, + isLatestVersion=True, + normalizedName=normalizedName, + reviewScope="govdoc", + ) + document.rootVersionId = document.Id + await session.flush() + + objectKey = OssPathUtils.BuildBusinessDocKey( + Region=resolvedRegion, + TypeCode="govdoc", + DocumentId=document.Id, + Version="v1", + FileRole="original", + FileName=fileName, + Year=uploadedAt.year, + Month=uploadedAt.month, + ) + ossUrl = await self.OssService.UploadBytes( + ObjectKey=objectKey, + Content=content, + ContentType=mimeType, + ) + + documentFile = LeauditDocumentFile( + documentId=document.Id, + fileRole="original", + fileName=fileName, + fileExt=fileExt, + mimeType=mimeType, + fileSize=len(content), + sha256=fileSha256, + localPath=None, + ossUrl=ossUrl, + storageProvider="minio", + isActive=True, + createdBy=createdBy, + ) + session.add(documentFile) + await session.flush() + + await session.execute( + text( + """ + UPDATE leaudit_documents + SET engine_type = 'govdoc' + WHERE id = :document_id + """ + ), + {"document_id": document.Id}, + ) + await session.commit() + + await session.refresh(document) + await session.refresh(documentFile) + + runPayload: dict[str, Any] | None = None + shouldAutoRun = bool(autoRun) + if shouldAutoRun: + runPayload = await self.CreateRun( + documentId=document.Id, + ruleVersionId=ruleVersionId, + speed=speed, + force=False, + triggerUserId=createdBy, + ) + return { - "documentId": 0, - "fileId": 0, - "fileName": file.filename, - "region": region, + "documentId": document.Id, + "fileId": documentFile.Id, + "fileName": documentFile.fileName, + "region": resolvedRegion, "engineType": "govdoc", - "autoRunTriggered": autoRun, + "processingStatus": "processing" if runPayload else (document.processingStatus or "waiting"), + "autoRunTriggered": shouldAutoRun, + "latestRunId": runPayload["runId"] if runPayload else None, + "run": runPayload, } async def ListDocuments( @@ -48,6 +194,7 @@ class GovdocServiceImpl(IGovdocService): page: int = 1, pageSize: int = 20, keyword: str | None = None, + fileExt: str | None = None, region: str | None = None, status: str | None = None, resultStatus: str | None = None, @@ -56,19 +203,374 @@ class GovdocServiceImpl(IGovdocService): dateTo: str | None = None, userId: int | None = None, ) -> dict[str, Any]: - logger.info("[Govdoc] ListDocuments placeholder — page=%s pageSize=%s", page, pageSize) - return {"items": [], "total": 0, "page": page, "pageSize": pageSize} + if userId is None: + raise LeauditException(StatusCodeEnum.HTTP_401_UNAUTHORIZED, "当前用户未登录") + + currentUser = await self._getCurrentUserContext(userId) + page = max(1, int(page)) + pageSize = max(1, min(int(pageSize), 100)) + offset = (page - 1) * pageSize + + async with GetAsyncSession() as session: + await self._ensureGovdocSchema(session) + + params: dict[str, Any] = { + "limit": pageSize, + "offset": offset, + } + filters = [ + "d.deleted_at IS NULL", + "f.deleted_at IS NULL", + "f.is_active = true", + "f.file_role = 'original'", + "COALESCE(d.engine_type, 'leaudit') = 'govdoc'", + ] + filters.extend( + self._buildDocumentScopeFilters( + CurrentUserId=userId, + CurrentUser=currentUser, + Params=params, + DocumentAlias="d", + FileAlias="f", + RequestedRegion=region, + RequestedUserId=createdBy, + ) + ) + if keyword: + filters.append("(f.file_name ILIKE :keyword OR COALESCE(d.normalized_name, '') ILIKE :keyword)") + params["keyword"] = f"%{keyword.strip()}%" + if fileExt: + normalizedExt = fileExt.strip().lstrip(".").lower() + if normalizedExt: + filters.append("LOWER(COALESCE(f.file_ext, '')) = :file_ext") + params["file_ext"] = normalizedExt + if status: + filters.append("COALESCE(d.processing_status, '') = :status") + params["status"] = status.strip() + if resultStatus: + filters.append("COALESCE(gr.result_status, '') = :result_status") + params["result_status"] = resultStatus.strip() + if dateFrom: + filters.append("d.created_at >= CAST(:date_from AS date)") + params["date_from"] = dateFrom.strip() + if dateTo: + filters.append("d.created_at < (CAST(:date_to AS date) + INTERVAL '1 day')") + params["date_to"] = dateTo.strip() + + whereClause = " AND ".join(filters) + + async with GetAsyncSession() as session: + rows = ( + await session.execute( + text( + f""" + SELECT + d.id AS document_id, + COALESCE(d.region, 'default') AS region, + COALESCE(d.processing_status, 'waiting') AS processing_status, + d.current_run_id, + d.created_at, + d.updated_at, + f.id AS file_id, + f.file_name, + f.file_ext, + f.mime_type, + f.file_size, + f.oss_url, + f.created_by, + gr.result_status, + gr.total_score, + gr.passed_count, + gr.failed_count, + gr.skipped_count, + EXISTS( + SELECT 1 + FROM govdoc_report_artifacts gra + WHERE gra.run_id = d.current_run_id + AND gra.artifact_type = 'html_report' + AND gra.deleted_at IS NULL + ) AS has_html_report, + EXISTS( + SELECT 1 + FROM govdoc_report_artifacts gra + WHERE gra.run_id = d.current_run_id + AND gra.artifact_type = 'annotated_docx' + AND gra.deleted_at IS NULL + ) AS has_docx_report + FROM leaudit_documents d + JOIN leaudit_document_files f + ON f.document_id = d.id + AND f.is_active = true + AND f.file_role = 'original' + AND f.deleted_at IS NULL + LEFT JOIN govdoc_runs gr + ON gr.id = d.current_run_id + WHERE {whereClause} + ORDER BY d.created_at DESC + LIMIT :limit OFFSET :offset + """ + ), + params, + ) + ).mappings().all() + + total = int( + ( + await session.execute( + text( + f""" + SELECT COUNT(1) + FROM leaudit_documents d + JOIN leaudit_document_files f + ON f.document_id = d.id + AND f.is_active = true + AND f.file_role = 'original' + AND f.deleted_at IS NULL + LEFT JOIN govdoc_runs gr + ON gr.id = d.current_run_id + WHERE {whereClause} + """ + ), + params, + ) + ).scalar_one() + ) + + items = [] + for row in rows: + mapped = self._map_document_row(row) + summary = self._build_summary_payload( + mapped.totalScore, + mapped.passedCount, + mapped.failedCount, + mapped.skippedCount, + ) + items.append( + { + "documentId": mapped.documentId, + "fileId": mapped.fileId, + "fileName": mapped.fileName, + "fileExt": mapped.fileExt, + "mimeType": mapped.mimeType, + "fileSize": mapped.fileSize, + "region": mapped.region, + "processingStatus": mapped.processingStatus, + "currentRunId": mapped.currentRunId, + "latestRunId": mapped.currentRunId, + "resultStatus": mapped.resultStatus, + "score": float(mapped.totalScore) if mapped.totalScore is not None else None, + "passedCount": mapped.passedCount or 0, + "failedCount": mapped.failedCount or 0, + "skippedCount": mapped.skippedCount or 0, + "latestRun": { + "runId": mapped.currentRunId, + "summary": summary, + } if mapped.currentRunId else None, + "reports": { + "hasHtmlReport": mapped.hasHtmlReport, + "hasDocxReport": mapped.hasDocxReport, + }, + "createdAt": self._iso(mapped.createdAt), + "updatedAt": self._iso(mapped.updatedAt), + } + ) + + return {"items": items, "total": total, "page": page, "pageSize": pageSize} async def GetDocumentDetail(self, documentId: int, userId: int | None = None) -> dict[str, Any]: - logger.info("[Govdoc] GetDocumentDetail placeholder — id=%s", documentId) - return {"documentId": documentId} + if userId is None: + raise LeauditException(StatusCodeEnum.HTTP_401_UNAUTHORIZED, "当前用户未登录") + + currentUser = await self._getCurrentUserContext(userId) + params: dict[str, Any] = {"document_id": documentId, "limit": 20} + filters = [ + "d.id = :document_id", + "d.deleted_at IS NULL", + "f.deleted_at IS NULL", + "f.is_active = true", + "f.file_role = 'original'", + "COALESCE(d.engine_type, 'leaudit') = 'govdoc'", + ] + filters.extend( + self._buildDocumentScopeFilters( + CurrentUserId=userId, + CurrentUser=currentUser, + Params=params, + DocumentAlias="d", + FileAlias="f", + ) + ) + whereClause = " AND ".join(filters) + + async with GetAsyncSession() as session: + await self._ensureGovdocSchema(session) + row = ( + await session.execute( + text( + f""" + SELECT + d.id AS document_id, + COALESCE(d.region, 'default') AS region, + COALESCE(d.processing_status, 'waiting') AS processing_status, + d.current_run_id, + d.created_at, + d.updated_at, + f.id AS file_id, + f.file_name, + f.file_ext, + f.mime_type, + f.file_size, + f.oss_url, + f.created_by, + gr.result_status, + gr.total_score, + gr.passed_count, + gr.failed_count, + gr.skipped_count + FROM leaudit_documents d + JOIN leaudit_document_files f + ON f.document_id = d.id + AND f.is_active = true + AND f.file_role = 'original' + AND f.deleted_at IS NULL + LEFT JOIN govdoc_runs gr + ON gr.id = d.current_run_id + WHERE {whereClause} + LIMIT 1 + """ + ), + params, + ) + ).mappings().first() + if not row: + raise LeauditException(StatusCodeEnum.HTTP_404_NOT_FOUND, "公文文档不存在或无权访问") + + runs = ( + await session.execute( + text( + """ + SELECT + id, + document_id, + status, + phase, + result_status, + total_score, + passed_count, + failed_count, + skipped_count, + error_message, + created_at, + updated_at, + started_at, + finished_at + FROM govdoc_runs + WHERE document_id = :document_id + AND deleted_at IS NULL + ORDER BY id DESC + LIMIT :limit + """ + ), + params, + ) + ).mappings().all() + + artifactRows = ( + await session.execute( + text( + """ + SELECT run_id, artifact_type, file_name, file_ext, mime_type, oss_url, description + FROM govdoc_report_artifacts + WHERE run_id = ANY( + SELECT id + FROM govdoc_runs + WHERE document_id = :document_id + AND deleted_at IS NULL + ) + AND deleted_at IS NULL + ORDER BY id DESC + """ + ), + {"document_id": documentId}, + ) + ).mappings().all() + + mapped = self._map_document_row(row) + runItems = [self._build_run_summary(item) for item in runs] + latestRunId = mapped.currentRunId or (runItems[0]["runId"] if runItems else None) + latestRun = next((item for item in runItems if item["runId"] == latestRunId), runItems[0] if runItems else None) + artifactsByRun = self._group_artifacts_by_run(artifactRows) + + return { + "documentId": mapped.documentId, + "latestRunId": latestRunId, + "document": { + "documentId": mapped.documentId, + "fileId": mapped.fileId, + "filename": mapped.fileName, + "fileExt": mapped.fileExt, + "mimeType": mapped.mimeType, + "fileSize": mapped.fileSize, + "region": mapped.region, + "processingStatus": mapped.processingStatus, + "createdAt": self._iso(mapped.createdAt), + "updatedAt": self._iso(mapped.updatedAt), + }, + "latestRun": latestRun, + "currentRun": latestRun, + "runs": runItems, + "reports": artifactsByRun.get(int(latestRunId), {}) if latestRunId else {}, + } async def UpdateDocument(self, documentId: int, body: dict[str, Any], userId: int | None = None) -> dict[str, Any]: - logger.info("[Govdoc] UpdateDocument placeholder — id=%s", documentId) - return {"documentId": documentId, **body} + raise LeauditException(StatusCodeEnum.HTTP_403_FORBIDDEN, "当前阶段暂不开放修改公文信息") async def DeleteDocument(self, documentId: int, userId: int | None = None) -> dict[str, Any]: - logger.info("[Govdoc] DeleteDocument placeholder — id=%s", documentId) + if userId is None: + raise LeauditException(StatusCodeEnum.HTTP_401_UNAUTHORIZED, "当前用户未登录") + + currentUser = await self._getCurrentUserContext(userId) + params: dict[str, Any] = {"document_id": documentId} + filters = ["d.id = :document_id", "d.deleted_at IS NULL", "f.is_active = true", "f.file_role = 'original'"] + filters.extend( + self._buildDocumentScopeFilters( + CurrentUserId=userId, + CurrentUser=currentUser, + Params=params, + DocumentAlias="d", + FileAlias="f", + ) + ) + whereClause = " AND ".join(filters) + async with GetAsyncSession() as session: + await self._ensureGovdocSchema(session) + row = ( + await session.execute( + text( + f""" + SELECT d.id + FROM leaudit_documents d + JOIN leaudit_document_files f + ON f.document_id = d.id + AND f.is_active = true + AND f.file_role = 'original' + AND f.deleted_at IS NULL + WHERE {whereClause} + LIMIT 1 + """ + ), + params, + ) + ).first() + if not row: + raise LeauditException(StatusCodeEnum.HTTP_404_NOT_FOUND, "公文文档不存在或无权访问") + + await session.execute( + text("UPDATE leaudit_documents SET deleted_at = now(), updated_at = now() WHERE id = :document_id"), + {"document_id": documentId}, + ) + await session.commit() + return {"documentId": documentId, "deleted": True} # ── 审查运行 ────────────────────────────────────────── @@ -81,157 +583,354 @@ class GovdocServiceImpl(IGovdocService): force: bool = False, triggerUserId: int | None = None, ) -> dict[str, Any]: - logger.info("[Govdoc] CreateRun placeholder — documentId=%s", documentId) + if triggerUserId is None: + raise LeauditException(StatusCodeEnum.HTTP_401_UNAUTHORIZED, "当前用户未登录") + + async with GetAsyncSession() as session: + await self._ensureGovdocSchema(session) + + currentUser = await self._getCurrentUserContext(triggerUserId) + documentMeta = await self._get_document_for_run(documentId, triggerUserId, currentUser) + if documentMeta.currentRunId and not force: + currentRun = await self.GetRunStatus(documentMeta.currentRunId) + if currentRun["status"] in {"pending", "processing"}: + return { + "runId": documentMeta.currentRunId, + "documentId": documentId, + "status": currentRun["status"], + "phase": currentRun.get("phase"), + "reused": True, + } + + runId = await self.Storage.CreateRun( + { + "documentId": documentId, + "documentFileId": documentMeta.fileId, + "runNo": await self._next_run_no(documentId), + "triggerSource": "upload" if not documentMeta.currentRunId else "manual", + "triggerUserId": triggerUserId, + } + ) + rulesPath = await self._resolve_rules_path() + await self.Storage.UpdateDocumentStatus(documentId, "processing", runId) + task = dispatch_govdoc_task( + documentId=documentId, + runId=runId, + rulesPath=rulesPath, + triggerUserId=triggerUserId, + speed=speed, + ) + + async with GetAsyncSession() as session: + await session.execute( + text("UPDATE govdoc_runs SET task_id = :task_id, started_at = now(), updated_at = now() WHERE id = :run_id"), + {"task_id": str(getattr(task, "id", "") or ""), "run_id": runId}, + ) + await session.commit() + return { - "runId": 0, + "runId": runId, "documentId": documentId, - "status": "queued", + "status": "pending", "phase": "dispatch", + "taskId": str(getattr(task, "id", "") or ""), } async def GetRunStatus(self, runId: int) -> dict[str, Any]: - logger.info("[Govdoc] GetRunStatus placeholder — runId=%s", runId) - return {"runId": runId, "status": "pending"} + async with GetAsyncSession() as session: + await self._ensureGovdocSchema(session) + row = ( + await session.execute( + text( + """ + SELECT + id, + document_id, + status, + phase, + result_status, + total_score, + passed_count, + failed_count, + skipped_count, + error_message, + task_id, + created_at, + updated_at, + started_at, + finished_at + FROM govdoc_runs + WHERE id = :run_id + AND deleted_at IS NULL + LIMIT 1 + """ + ), + {"run_id": runId}, + ) + ).mappings().first() + if not row: + raise LeauditException(StatusCodeEnum.HTTP_404_NOT_FOUND, "审查运行不存在") + return self._build_run_summary(row) # ── 结果与报告 ──────────────────────────────────────── async def GetRunResult(self, runId: int) -> dict[str, Any]: """从 govdoc_runs + govdoc_rule_results 读取审查结果,含 structure/outline。""" - import json as _json - from fastapi_common.fastapi_common_sqlalchemy.database import GetAsyncSession - from sqlalchemy import text - async with GetAsyncSession() as session: - run_row = await session.execute( - text( - """SELECT id, document_id, status, phase, total_score, passed_count, - failed_count, skipped_count, result_status, result_summary_json, - rules_path, started_at, finished_at - FROM govdoc_runs WHERE id = :rid""" - ), - {"rid": runId}, + await self._ensureGovdocSchema(session) + runRow = ( + await session.execute( + text( + """ + SELECT + id, + document_id, + status, + phase, + total_score, + passed_count, + failed_count, + skipped_count, + result_status, + result_summary_json, + started_at, + finished_at + FROM govdoc_runs + WHERE id = :run_id + AND deleted_at IS NULL + LIMIT 1 + """ + ), + {"run_id": runId}, + ) + ).mappings().first() + if not runRow: + raise LeauditException(StatusCodeEnum.HTTP_404_NOT_FOUND, "审查运行不存在") + + documentRow = ( + await session.execute( + text( + """ + SELECT + d.id AS document_id, + f.file_name + FROM leaudit_documents d + JOIN leaudit_document_files f + ON f.document_id = d.id + AND f.is_active = true + AND f.file_role = 'original' + AND f.deleted_at IS NULL + WHERE d.id = :document_id + AND d.deleted_at IS NULL + LIMIT 1 + """ + ), + {"document_id": int(runRow["document_id"])}, + ) + ).mappings().first() + + rulesRows = ( + await session.execute( + text( + """ + SELECT + rule_id, + rule_name, + severity, + category, + result, + skip_reason, + message, + suggestion, + actual, + expected, + evidence, + paragraph_index, + paragraph_text, + location_path, + score + FROM govdoc_rule_results + WHERE run_id = :run_id + AND deleted_at IS NULL + ORDER BY id ASC + """ + ), + {"run_id": runId}, + ) + ).mappings().all() + + aux = self._parse_json(runRow.get("result_summary_json")) + findings = [] + checkedRules = [] + severityStats: dict[str, int] = {} + categoryStats: dict[str, int] = {} + seenRuleIds: set[str] = set() + for rr in rulesRows: + location = { + "paragraph_index": rr["paragraph_index"] or 0, + "role": rr.get("location_path"), + "char_start": 0, + "char_end": 0, + "context": rr.get("paragraph_text") or "", + } + if rr.get("result") == "fail": + severity = str(rr.get("severity") or "info") + category = str(rr.get("category") or "") + severityStats[severity] = severityStats.get(severity, 0) + 1 + if category: + categoryStats[category] = categoryStats.get(category, 0) + 1 + findings.append( + { + "finding_id": f"{rr['rule_id']}-{rr.get('paragraph_index') or len(findings)}", + "rule_id": rr["rule_id"], + "rule_name": rr.get("rule_name") or rr["rule_id"], + "severity": severity, + "category": category, + "location": location, + "actual": self._parse_json(rr.get("actual")) or {}, + "expected": self._parse_json(rr.get("expected")) or {}, + "message": rr.get("message") or "", + "suggestion": rr.get("suggestion") or "", + "evidence": rr.get("evidence") or "", + "confidence": 1.0, + } + ) + ruleId = str(rr["rule_id"]) + if ruleId in seenRuleIds: + continue + seenRuleIds.add(ruleId) + status = str(rr.get("result") or "pass") + checkedRules.append( + { + "rule_id": ruleId, + "name": rr.get("rule_name") or ruleId, + "severity": rr.get("severity") or "info", + "category": rr.get("category") or "", + "status": status if status in {"pass", "fail", "skipped"} else "pass", + "skip_reason": rr.get("skip_reason") or "", + } ) - run_data = run_row.mappings().first() - if not run_data: - return {"runId": runId, "summary": {}, "checkedRules": [], "findings": [], - "structure": [], "outline": [], "entities": {}} - - rules_rows = await session.execute( - text( - """SELECT rule_id, rule_name, severity, category, result, skip_reason, - message, suggestion, actual, expected, evidence, - paragraph_index, paragraph_text, location_path, score - FROM govdoc_rule_results WHERE run_id = :rid""" - ), - {"rid": runId}, - ) - rule_results = [dict(r._mapping) for r in rules_rows.fetchall()] - - aux_raw = run_data.get("result_summary_json") - aux = {} - if aux_raw: - try: - aux = _json.loads(aux_raw) if isinstance(aux_raw, str) else aux_raw - except (TypeError, _json.JSONDecodeError): - pass - - findings = [] - for rr in rule_results: - loc = {} - if rr.get("paragraph_index") is not None: - loc["paragraph_index"] = rr["paragraph_index"] - if rr.get("paragraph_text"): - loc["context"] = rr["paragraph_text"] - if rr.get("location_path"): - loc["role"] = rr["location_path"] - findings.append({ - "finding_id": f"{rr['rule_id']}-{rr['paragraph_index'] or 0}", - "rule_id": rr["rule_id"], - "rule_name": rr["rule_name"], - "severity": rr["severity"], - "category": rr["category"], - "location": loc if loc else None, - "actual": rr.get("actual") or {}, - "expected": rr.get("expected") or {}, - "message": rr.get("message") or "", - "suggestion": rr.get("suggestion") or "", - "evidence": rr.get("evidence") or "", - "confidence": 1.0, - }) - - checked_rules = [] - seen = set() - for rr in rule_results: - rid = rr["rule_id"] - if rid in seen: - continue - seen.add(rid) - status = rr.get("result", "pass") - checked_rules.append({ - "rule_id": rid, - "name": rr["rule_name"], - "severity": rr["severity"], - "category": rr["category"], - "status": status if status in ("pass", "fail", "skipped") else "pass", - "skip_reason": rr.get("skip_reason"), - }) + totalScore = float(runRow.get("total_score") or 0) return { "runId": runId, - "summary": { - "score": run_data.get("total_score", 100), - "total_findings": len(findings), - "by_severity": {}, - "by_category": {}, - "passed_count": run_data.get("passed_count", 0), - "failed_count": run_data.get("failed_count", 0), - "skipped_count": run_data.get("skipped_count", 0), + "documentId": int(runRow["document_id"]), + "document": { + "documentId": int(runRow["document_id"]), + "filename": str(documentRow["file_name"] or "") if documentRow else "", }, - "checkedRules": checked_rules, + "summary": { + "score": totalScore, + "total_findings": len(findings), + "by_severity": severityStats, + "by_category": categoryStats, + "passed_count": int(runRow.get("passed_count") or 0), + "failed_count": int(runRow.get("failed_count") or 0), + "skipped_count": int(runRow.get("skipped_count") or 0), + }, + "checkedRules": checkedRules, "findings": findings, "structure": aux.get("structure", []), "outline": aux.get("outline", []), - "entities": {}, + "entities": aux.get("entities", {}), } async def GetRunFindings(self, runId: int) -> dict[str, Any]: - logger.info("[Govdoc] GetRunFindings placeholder — runId=%s", runId) - return {"runId": runId, "findings": []} + result = await self.GetRunResult(runId) + return {"runId": runId, "findings": result["findings"]} async def GetRunEntities(self, runId: int) -> dict[str, Any]: - logger.info("[Govdoc] GetRunEntities placeholder — runId=%s", runId) - return {"runId": runId, "entities": []} + result = await self.GetRunResult(runId) + return {"runId": runId, "entities": result.get("entities", {})} - async def GetRunParagraphs(self, runId: int) -> dict[str, Any]: - logger.info("[Govdoc] GetRunParagraphs placeholder — runId=%s", runId) - return {"runId": runId, "paragraphs": []} + async def GetRunParagraphs(self, runId: int) -> str: + runStatus = await self.GetRunStatus(runId) + if runStatus["status"] != "completed": + raise LeauditException( + StatusCodeEnum.HTTP_409_CONFLICT, + "当前审查尚未完成,暂时无法加载文档视图", + ) + + paragraphArtifact = await self._get_report_artifact(runId, "paragraph_html") + if paragraphArtifact: + content = await self.OssService.DownloadBytes(str(paragraphArtifact["oss_url"])) + return content.decode("utf-8") + + documentMeta = await self._get_document_for_read(int(runStatus["documentId"])) + fileRow = await self._get_active_original_file(documentMeta.documentId) + + ossUrl = getattr(fileRow, "ossUrl", None) or fileRow.get("oss_url") + fileExt = getattr(fileRow, "fileExt", None) or fileRow.get("file_ext") + + if not ossUrl: + raise LeauditException(StatusCodeEnum.HTTP_404_NOT_FOUND, "原始文档未找到可用存储地址") + if (fileExt or "").strip().lower() != "docx": + raise LeauditException( + StatusCodeEnum.HTTP_400_BAD_REQUEST, + "当前文档视图仅支持 DOCX 原文预览,请上传 DOCX 文件", + ) + + tempPath = await self.OssService.DownloadToTempFile( + Source=ossUrl, + Suffix=f".{fileExt or 'docx'}", + Prefix="govdoc-run-", + ) + try: + doc = parse_docx(tempPath) + findingsResult = await self.GetRunFindings(runId) + findingMap: dict[int, list[str]] = {} + for finding in findingsResult["findings"]: + pi = int(finding.get("location", {}).get("paragraph_index") or 0) + findingMap.setdefault(pi, []).append(str(finding["finding_id"])) + return paragraphs_to_html(doc, findingMap) + finally: + try: + Path(tempPath).unlink(missing_ok=True) + except Exception: + pass async def GetRunStructure(self, runId: int) -> dict[str, Any]: - logger.info("[Govdoc] GetRunStructure placeholder — runId=%s", runId) - return {"runId": runId, "structure": []} + result = await self.GetRunResult(runId) + return {"runId": runId, "structure": result.get("structure", [])} async def GetRunOutline(self, runId: int) -> dict[str, Any]: - logger.info("[Govdoc] GetRunOutline placeholder — runId=%s", runId) - return {"runId": runId, "outline": []} + result = await self.GetRunResult(runId) + return {"runId": runId, "outline": result.get("outline", [])} async def GetReportHtml(self, runId: int) -> dict[str, Any]: - logger.info("[Govdoc] GetReportHtml placeholder — runId=%s", runId) - return {"runId": runId, "htmlUrl": ""} + artifact = await self._get_report_artifact(runId, "html_report") + if not artifact: + return {"runId": runId, "htmlUrl": ""} + return { + "runId": runId, + "htmlUrl": await self.OssService.PresignGetUrl(str(artifact["oss_url"])), + } async def GetReportDocx(self, runId: int) -> dict[str, Any]: - logger.info("[Govdoc] GetReportDocx placeholder — runId=%s", runId) - return {"runId": runId, "docxUrl": ""} + artifact = await self._get_report_artifact(runId, "annotated_docx") + if not artifact: + return {"runId": runId, "docxUrl": ""} + return { + "runId": runId, + "docxUrl": await self.OssService.PresignGetUrl(str(artifact["oss_url"])), + } async def DownloadOriginal(self, documentId: int) -> dict[str, Any]: - logger.info("[Govdoc] DownloadOriginal placeholder — documentId=%s", documentId) - return {"documentId": documentId, "downloadUrl": ""} + fileRow = await self._get_active_original_file(documentId) + ossUrl = getattr(fileRow, "ossUrl", None) or fileRow.get("oss_url") + if not ossUrl: + raise LeauditException(StatusCodeEnum.HTTP_404_NOT_FOUND, "原始文档不存在") + return { + "documentId": documentId, + "downloadUrl": await self.OssService.PresignGetUrl(str(ossUrl)), + } # ── 规则 ────────────────────────────────────────────── async def ListRules(self, rulesPath: str | None = None) -> dict[str, Any]: """从 govdoc 规则 YAML 文件加载规则清单。""" rules = await self._load_rules_list(rulesPath) - return {"rules": rules, "total_rules": len(rules)} + return {"metadata": {}, "rules": rules, "total_rules": len(rules)} async def GetRuleDetail(self, ruleId: str, rulesPath: str | None = None) -> dict[str, Any]: """获取单条规则完整详情(名称、严重度、stages、消息等)。""" @@ -254,55 +953,556 @@ class GovdocServiceImpl(IGovdocService): } return {"rule_id": ruleId, "name": ruleId, "severity": "info", "category": "", "group": ""} - # ── 规则加载助手 ──────────────────────────────────── + # ── 助手 ────────────────────────────────────────────── async def _resolve_rules_path(self, rulesPath: str | None = None) -> str | None: - """解析规则 YAML 文件路径。 - - 优先级:传入参数 > govdoc_runs 表记录 > None - """ + """解析规则 YAML 文件路径。""" if rulesPath: return rulesPath - # 尝试从最近的 completed run 中获取 rules_path - try: - from fastapi_common.fastapi_common_sqlalchemy.database import GetAsyncSession - from sqlalchemy import text - async with GetAsyncSession() as session: - row = await session.execute( - text( - """SELECT rules_path FROM govdoc_runs - WHERE rules_path IS NOT NULL AND status = 'completed' - ORDER BY id DESC LIMIT 1""" - ) - ) - result = row.mappings().first() - if result and result.get("rules_path"): - return result["rules_path"] - except Exception: - pass + candidates = [ + Path("/home/wren-dev/Porject/leaudit-platform/rules/govdoc/govdoc_general/rules.yaml"), + Path("/home/wren-dev/Porject/leaudit-platform/rules/govdoc_general/rules.yaml"), + ] + for candidate in candidates: + if candidate.is_file(): + return str(candidate) return None async def _load_ruleset(self, rulesPath: str | None = None): - """加载 rules.yaml 为 RuleSet 对象。""" resolved = await self._resolve_rules_path(rulesPath) if not resolved: logger.warning("[Govdoc] Cannot resolve rules path for GetRuleDetail/ListRules") return None from fastapi_modules.fastapi_leaudit.govdoc_engine.dsl.loader import load_rules + return load_rules(resolved) async def _load_rules_list(self, rulesPath: str | None = None) -> list[dict[str, Any]]: - """加载规则列表(简要信息)。""" ruleset = await self._load_ruleset(rulesPath) if ruleset is None: return [] result = [] for rule in ruleset.all_rules(): - result.append({ - "rule_id": rule.rule_id, - "name": rule.name, - "severity": rule.severity, - "category": rule.category, - "group": "", - }) + result.append( + { + "rule_id": rule.rule_id, + "name": rule.name, + "severity": rule.severity, + "category": rule.category, + "group": "", + } + ) return result + + async def _ensureGovdocSchema(self, session) -> None: + statements = [ + """ + ALTER TABLE leaudit_documents + ADD COLUMN IF NOT EXISTS engine_type VARCHAR(32) NOT NULL DEFAULT 'leaudit' + """, + """ + CREATE TABLE IF NOT EXISTS public.govdoc_runs ( + id BIGINT GENERATED ALWAYS AS IDENTITY PRIMARY KEY, + document_id BIGINT NOT NULL, + document_file_id BIGINT, + run_no INTEGER NOT NULL DEFAULT 1, + trigger_source VARCHAR(64) NOT NULL DEFAULT 'upload', + trigger_user_id BIGINT, + task_id VARCHAR(128), + status VARCHAR(64) NOT NULL DEFAULT 'pending', + phase VARCHAR(32), + engine_version VARCHAR(64), + llm_provider VARCHAR(64), + llm_model VARCHAR(128), + rules_path VARCHAR(1024), + total_score NUMERIC(10, 2), + passed_count INTEGER, + failed_count INTEGER, + skipped_count INTEGER, + result_status VARCHAR(32), + result_summary_json TEXT, + error_message TEXT, + started_at TIMESTAMPTZ, + finished_at TIMESTAMPTZ, + created_at TIMESTAMPTZ NOT NULL DEFAULT NOW(), + updated_at TIMESTAMPTZ NOT NULL DEFAULT NOW(), + deleted_at TIMESTAMPTZ DEFAULT NULL + ) + """, + """ + CREATE TABLE IF NOT EXISTS public.govdoc_rule_results ( + id BIGINT GENERATED ALWAYS AS IDENTITY PRIMARY KEY, + run_id BIGINT NOT NULL, + rule_id VARCHAR(128) NOT NULL, + rule_name VARCHAR(256), + severity VARCHAR(32), + category VARCHAR(128), + message TEXT, + suggestion TEXT, + actual TEXT, + expected TEXT, + evidence TEXT, + paragraph_index INTEGER, + paragraph_text TEXT, + location_path VARCHAR(512), + result VARCHAR(32) NOT NULL DEFAULT 'pass', + skip_reason TEXT, + score NUMERIC(10, 2), + created_at TIMESTAMPTZ NOT NULL DEFAULT NOW(), + updated_at TIMESTAMPTZ NOT NULL DEFAULT NOW(), + deleted_at TIMESTAMPTZ DEFAULT NULL + ) + """, + """ + CREATE TABLE IF NOT EXISTS public.govdoc_report_artifacts ( + id BIGINT GENERATED ALWAYS AS IDENTITY PRIMARY KEY, + run_id BIGINT NOT NULL, + artifact_type VARCHAR(64) NOT NULL, + file_name VARCHAR(512) NOT NULL, + file_ext VARCHAR(32), + mime_type VARCHAR(128), + file_size BIGINT, + sha256 VARCHAR(64), + oss_url VARCHAR(2048), + storage_provider VARCHAR(32), + description VARCHAR(512), + created_at TIMESTAMPTZ NOT NULL DEFAULT NOW(), + updated_at TIMESTAMPTZ NOT NULL DEFAULT NOW(), + deleted_at TIMESTAMPTZ DEFAULT NULL + ) + """, + """ + CREATE INDEX IF NOT EXISTS idx_leaudit_documents_engine_type + ON public.leaudit_documents(engine_type) WHERE deleted_at IS NULL + """, + """ + CREATE INDEX IF NOT EXISTS idx_govdoc_runs_document_id + ON public.govdoc_runs(document_id) WHERE deleted_at IS NULL + """, + """ + CREATE INDEX IF NOT EXISTS idx_govdoc_runs_status + ON public.govdoc_runs(status) WHERE deleted_at IS NULL + """, + """ + CREATE INDEX IF NOT EXISTS idx_govdoc_runs_trigger_user_id + ON public.govdoc_runs(trigger_user_id) + """, + """ + CREATE INDEX IF NOT EXISTS idx_govdoc_rule_results_run_id + ON public.govdoc_rule_results(run_id) WHERE deleted_at IS NULL + """, + """ + CREATE INDEX IF NOT EXISTS idx_govdoc_rule_results_rule_id + ON public.govdoc_rule_results(rule_id) WHERE deleted_at IS NULL + """, + """ + CREATE INDEX IF NOT EXISTS idx_govdoc_rule_results_result + ON public.govdoc_rule_results(result) WHERE deleted_at IS NULL + """, + """ + CREATE INDEX IF NOT EXISTS idx_govdoc_rule_results_paragraph + ON public.govdoc_rule_results(run_id, paragraph_index) WHERE deleted_at IS NULL + """, + """ + CREATE INDEX IF NOT EXISTS idx_govdoc_report_artifacts_run_id + ON public.govdoc_report_artifacts(run_id) WHERE deleted_at IS NULL + """, + """ + CREATE INDEX IF NOT EXISTS idx_govdoc_report_artifacts_type + ON public.govdoc_report_artifacts(run_id, artifact_type) WHERE deleted_at IS NULL + """, + ] + for statement in statements: + await session.execute(text(statement)) + await session.commit() + + async def _getCurrentUserContext(self, CurrentUserId: int) -> dict[str, Any]: + async with GetAsyncSession() as session: + row = ( + await session.execute( + text( + """ + SELECT + u.id, + COALESCE(u.area, '') AS area, + COALESCE(bool_or(r.role_key IN ('super_admin', 'provincial_admin')), FALSE) AS is_global, + COALESCE(bool_or(r.role_key IN ('super_admin', 'provincial_admin', 'admin')), FALSE) AS can_manage, + COALESCE(bool_or(r.role_key = 'super_admin'), FALSE) AS is_super_admin + FROM sso_users u + LEFT JOIN user_role ur ON ur.user_id = u.id + LEFT JOIN roles r ON r.id = ur.role_id + WHERE u.id = :user_id + GROUP BY u.id, u.area + """ + ), + {"user_id": CurrentUserId}, + ) + ).mappings().first() + if not row: + raise LeauditException(StatusCodeEnum.HTTP_404_NOT_FOUND, "当前用户不存在") + return { + "area": str(row["area"] or ""), + "is_global": bool(row["is_global"]), + "can_manage": bool(row["can_manage"]), + "is_super_admin": bool(row["is_super_admin"]), + } + + def _buildDocumentScopeFilters( + self, + CurrentUserId: int, + CurrentUser: dict[str, Any], + Params: dict[str, object], + DocumentAlias: str, + FileAlias: str, + RequestedRegion: str | None = None, + RequestedUserId: int | None = None, + ) -> list[str]: + filters: list[str] = [] + requestedRegion = (RequestedRegion or "").strip() + area = str(CurrentUser["area"] or "").strip() + + if CurrentUser["is_global"]: + if requestedRegion: + filters.append(f"{DocumentAlias}.region = :requested_region") + Params["requested_region"] = requestedRegion + if RequestedUserId is not None: + filters.append(f"{FileAlias}.created_by = :requested_user_id") + Params["requested_user_id"] = RequestedUserId + return filters + + if CurrentUser["can_manage"]: + if not area: + filters.append("1 = 0") + return filters + if requestedRegion and requestedRegion != area: + filters.append("1 = 0") + return filters + filters.append(f"{DocumentAlias}.region = :scope_region") + Params["scope_region"] = area + if RequestedUserId is not None: + filters.append(f"{FileAlias}.created_by = :requested_user_id") + Params["requested_user_id"] = RequestedUserId + return filters + + filters.append(f"{FileAlias}.created_by = :scope_user_id") + Params["scope_user_id"] = CurrentUserId + if requestedRegion: + filters.append(f"{DocumentAlias}.region = :requested_region") + Params["requested_region"] = requestedRegion + if RequestedUserId is not None and RequestedUserId != CurrentUserId: + filters.append("1 = 0") + return filters + + def _resolve_upload_region(self, currentUser: dict[str, Any], requestedRegion: str) -> str: + area = str(currentUser["area"] or "").strip() + if currentUser["is_global"]: + return requestedRegion or area or "default" + if currentUser["can_manage"]: + if area and requestedRegion and requestedRegion != area: + raise LeauditException(StatusCodeEnum.HTTP_403_FORBIDDEN, "不能上传到非本地区") + return area or requestedRegion or "default" + if area and requestedRegion and requestedRegion != area: + raise LeauditException(StatusCodeEnum.HTTP_403_FORBIDDEN, "不能上传到非本人地区") + return area or requestedRegion or "default" + + def _normalize_document_name(self, fileName: str) -> str: + suffix = Path(fileName).suffix + return fileName[: -len(suffix)] if suffix else fileName + + async def _get_document_for_run( + self, + documentId: int, + userId: int, + currentUser: dict[str, Any], + ) -> _GovdocDocumentRow: + params: dict[str, Any] = {"document_id": documentId} + filters = [ + "d.id = :document_id", + "d.deleted_at IS NULL", + "f.deleted_at IS NULL", + "f.is_active = true", + "f.file_role = 'original'", + "COALESCE(d.engine_type, 'leaudit') = 'govdoc'", + ] + filters.extend( + self._buildDocumentScopeFilters( + CurrentUserId=userId, + CurrentUser=currentUser, + Params=params, + DocumentAlias="d", + FileAlias="f", + ) + ) + whereClause = " AND ".join(filters) + async with GetAsyncSession() as session: + row = ( + await session.execute( + text( + f""" + SELECT + d.id AS document_id, + COALESCE(d.region, 'default') AS region, + COALESCE(d.processing_status, 'waiting') AS processing_status, + d.current_run_id, + d.created_at, + d.updated_at, + f.id AS file_id, + f.file_name, + f.file_ext, + f.mime_type, + f.file_size, + f.oss_url, + f.created_by, + gr.result_status, + gr.total_score, + gr.passed_count, + gr.failed_count, + gr.skipped_count + FROM leaudit_documents d + JOIN leaudit_document_files f + ON f.document_id = d.id + AND f.is_active = true + AND f.file_role = 'original' + AND f.deleted_at IS NULL + LEFT JOIN govdoc_runs gr + ON gr.id = d.current_run_id + WHERE {whereClause} + LIMIT 1 + """ + ), + params, + ) + ).mappings().first() + if not row: + raise LeauditException(StatusCodeEnum.HTTP_404_NOT_FOUND, "公文文档不存在或无权访问") + return self._map_document_row(row) + + async def _get_document_for_read(self, documentId: int) -> _GovdocDocumentRow: + async with GetAsyncSession() as session: + row = ( + await session.execute( + text( + """ + SELECT + d.id AS document_id, + COALESCE(d.region, 'default') AS region, + COALESCE(d.processing_status, 'waiting') AS processing_status, + d.current_run_id, + d.created_at, + d.updated_at, + f.id AS file_id, + f.file_name, + f.file_ext, + f.mime_type, + f.file_size, + f.oss_url, + f.created_by, + gr.result_status, + gr.total_score, + gr.passed_count, + gr.failed_count, + gr.skipped_count + FROM leaudit_documents d + JOIN leaudit_document_files f + ON f.document_id = d.id + AND f.is_active = true + AND f.file_role = 'original' + AND f.deleted_at IS NULL + LEFT JOIN govdoc_runs gr + ON gr.id = d.current_run_id + WHERE d.id = :document_id + AND d.deleted_at IS NULL + AND COALESCE(d.engine_type, 'leaudit') = 'govdoc' + LIMIT 1 + """ + ), + {"document_id": documentId}, + ) + ).mappings().first() + if not row: + raise LeauditException(StatusCodeEnum.HTTP_404_NOT_FOUND, "公文文档不存在") + return self._map_document_row(row) + + async def _get_active_original_file(self, documentId: int): + async with GetAsyncSession() as session: + row = ( + await session.execute( + text( + """ + SELECT id, document_id, file_name, file_ext, mime_type, file_size, oss_url, created_by + FROM leaudit_document_files + WHERE document_id = :document_id + AND is_active = true + AND file_role = 'original' + AND deleted_at IS NULL + ORDER BY id DESC + LIMIT 1 + """ + ), + {"document_id": documentId}, + ) + ).mappings().first() + if not row: + raise LeauditException(StatusCodeEnum.HTTP_404_NOT_FOUND, "原始文档不存在") + return row + + async def _next_run_no(self, documentId: int) -> int: + async with GetAsyncSession() as session: + current = ( + await session.execute( + text( + """ + SELECT COALESCE(MAX(run_no), 0) + FROM govdoc_runs + WHERE document_id = :document_id + AND deleted_at IS NULL + """ + ), + {"document_id": documentId}, + ) + ).scalar_one() + return int(current or 0) + 1 + + def _build_run_summary(self, row: Any) -> dict[str, Any]: + summary = self._build_summary_payload( + row.get("total_score"), + row.get("passed_count"), + row.get("failed_count"), + row.get("skipped_count"), + ) + return { + "runId": int(row["id"]), + "documentId": int(row["document_id"]), + "status": str(row.get("status") or "pending"), + "phase": row.get("phase"), + "resultStatus": row.get("result_status"), + "score": float(row.get("total_score") or 0), + "passedCount": int(row.get("passed_count") or 0), + "failedCount": int(row.get("failed_count") or 0), + "skippedCount": int(row.get("skipped_count") or 0), + "summary": summary, + "errorMessage": row.get("error_message"), + "taskId": row.get("task_id"), + "createdAt": self._iso(row.get("created_at")), + "updatedAt": self._iso(row.get("updated_at")), + "startedAt": self._iso(row.get("started_at")), + "finishedAt": self._iso(row.get("finished_at")), + } + + def _build_summary_payload( + self, + totalScore: Any, + passedCount: Any, + failedCount: Any, + skippedCount: Any, + ) -> dict[str, Any]: + return { + "score": float(totalScore or 0), + "total_findings": int(failedCount or 0), + "by_severity": {}, + "by_category": {}, + "passed_count": int(passedCount or 0), + "failed_count": int(failedCount or 0), + "skipped_count": int(skippedCount or 0), + } + + def _group_artifacts_by_run(self, rows: list[Any]) -> dict[int, dict[str, Any]]: + grouped: dict[int, dict[str, Any]] = {} + artifactTypeMap = { + "html_report": "htmlUrl", + "annotated_docx": "docxUrl", + "paragraph_html": "paragraphHtmlUrl", + } + for row in rows: + runId = int(row["run_id"]) + payload = grouped.setdefault( + runId, + { + "artifacts": [], + "hasHtmlReport": False, + "hasDocxReport": False, + "hasParagraphHtml": False, + }, + ) + artifactType = str(row.get("artifact_type") or "") + if artifactType in artifactTypeMap and row.get("oss_url") and artifactTypeMap[artifactType] not in payload: + payload[artifactTypeMap[artifactType]] = row["oss_url"] + if artifactType == "html_report": + payload["hasHtmlReport"] = True + elif artifactType == "annotated_docx": + payload["hasDocxReport"] = True + elif artifactType == "paragraph_html": + payload["hasParagraphHtml"] = True + payload["artifacts"].append( + { + "artifactType": artifactType, + "fileName": row.get("file_name") or "", + "fileExt": row.get("file_ext") or "", + "mimeType": row.get("mime_type") or "", + "ossUrl": row.get("oss_url") or "", + "description": row.get("description") or "", + } + ) + return grouped + + def _map_document_row(self, row: Any) -> _GovdocDocumentRow: + return _GovdocDocumentRow( + documentId=int(row["document_id"]), + region=str(row["region"] or "default"), + processingStatus=str(row["processing_status"] or "waiting"), + currentRunId=int(row["current_run_id"]) if row.get("current_run_id") is not None else None, + createdAt=row.get("created_at"), + updatedAt=row.get("updated_at"), + fileId=int(row["file_id"]), + fileName=str(row["file_name"] or ""), + fileExt=str(row["file_ext"]) if row.get("file_ext") else None, + mimeType=str(row["mime_type"]) if row.get("mime_type") else None, + fileSize=int(row["file_size"]) if row.get("file_size") is not None else None, + ossUrl=str(row["oss_url"]) if row.get("oss_url") else None, + createdBy=int(row["created_by"]) if row.get("created_by") is not None else None, + resultStatus=str(row["result_status"]) if row.get("result_status") else None, + totalScore=float(row["total_score"]) if row.get("total_score") is not None else None, + passedCount=int(row["passed_count"]) if row.get("passed_count") is not None else None, + failedCount=int(row["failed_count"]) if row.get("failed_count") is not None else None, + skippedCount=int(row["skipped_count"]) if row.get("skipped_count") is not None else None, + hasHtmlReport=bool(row.get("has_html_report")), + hasDocxReport=bool(row.get("has_docx_report")), + ) + + async def _get_report_artifact(self, runId: int, artifactType: str) -> Any | None: + async with GetAsyncSession() as session: + await self._ensureGovdocSchema(session) + return ( + await session.execute( + text( + """ + SELECT oss_url + FROM govdoc_report_artifacts + WHERE run_id = :run_id + AND artifact_type = :artifact_type + AND deleted_at IS NULL + AND COALESCE(oss_url, '') <> '' + ORDER BY id DESC + LIMIT 1 + """ + ), + {"run_id": runId, "artifact_type": artifactType}, + ) + ).mappings().first() + + def _parse_json(self, raw: Any) -> Any: + if raw is None or raw == "": + return None + if isinstance(raw, (dict, list)): + return raw + try: + return json.loads(raw) + except Exception: + return None + + def _iso(self, value: Any) -> str | None: + if value is None: + return None + if isinstance(value, datetime): + return value.isoformat() + return str(value) diff --git a/rules/govdoc/govdoc_general/rules.yaml b/rules/govdoc/govdoc_general/rules.yaml new file mode 100644 index 0000000..f052154 --- /dev/null +++ b/rules/govdoc/govdoc_general/rules.yaml @@ -0,0 +1,546 @@ +metadata: + type_id: govdoc_general + name: 内部公文通用规则 + version: "0.1.0" + source: 公文文稿常见错误汇编(第一期)·2025-11 + description: 基于旧内部公文正式规则语义整理的当前平台规则集。 + +extract: + # 8 个内置实体(title / doc_number / recipient / date / + # signature / attachments / wenzhong / issuer)由代码自动产出。 + entities: [] + +rules: + - group: 标题(错误汇编 一) + rules: + - rule_id: GW-T-001 + name: 标题文种合规性 + severity: error + category: 标题 + target: title + on_missing: fail + stages: + - check: ai + prompt: | + 审查公文标题是否符合规范。 + 标题:{{title.text}} + + 15 种合法文种:决议、决定、命令(令)、公报、公告、通告、意见、 + 通知、通报、报告、请示、批复、议案、函、纪要 + + 检查要点: + 1. 是否使用了合法文种 + 2. 方案/规划/办法/细则等是否以"通知"形式下发(应为"关于印发〈xxx〉的通知") + 3. 标题中是否有"印发"等动词 + messages: + pass: 标题文种合规 + fail: 标题文种不合规 + + - rule_id: GW-T-002 + name: 标题不可有"请求"+"请示"重复 + severity: error + category: 标题 + target: title + on_missing: skip + stages: + - check: regex_forbid + pattern: '关于请求.*的请示' + messages: + pass: ok + fail: '"请示"已包含"请求"之意,应删去"请求"' + + - rule_id: GW-T-003 + name: 标题不可有"上报"+"报告"重复 + severity: error + category: 标题 + target: title + on_missing: skip + stages: + - check: regex_forbid + pattern: '关于上报.*的报告' + messages: + pass: ok + fail: '"报告"已包含"上报"之意,应删去"上报"' + + - rule_id: GW-T-004 + name: 标题介词连用 + severity: warning + category: 标题 + target: title + on_missing: skip + stages: + - check: regex_forbid + pattern: '关于对.*的(批复|通知|通报)' + messages: + pass: ok + fail: '"关于"+"对" 介词连用不规范' + + - rule_id: GW-T-005 + name: 标题文种白名单 + severity: error + category: 文种 + target: wenzhong + on_missing: skip + stages: + - check: wenzhong_whitelist + messages: + pass: 文种合规 + fail: 非法定文种(出现"工作情况""汇报""方案""办法"等当文种) + + - rule_id: GW-T-006 + name: 标题回行词意完整 + severity: warning + category: 标题 + target: title + on_missing: skip + stages: + - check: ai + prompt: | + 只在标题里**明确出现破词**时才报错。 + 破词示例:「广东省烟草专卖局关于xx的通知」如果在"专"和"卖"之间有换行 → fail + 其它情况(单行标题、合理换行点、词意完整)→ **必须 pass** + + 判断准则: + - 标题已经是单行字符串,没有明显断点 → pass + - 不要凭直觉揣测,只判断是否能在原文中**逐字定位**破词位置 + - 找不到具体破词位置就 pass + + 标题原文: + {{title.text}} + messages: + pass: 标题回行合规 + fail: 标题回行破词 + + - group: 发文字号(错误汇编 三、六.3) + rules: + - rule_id: GW-N-001 + name: 发文字号必须用六角括号 + severity: error + category: 发文 + target: doc_number + on_missing: fail + stages: + - check: forbid_chars + chars: ["[", "]"] + messages: + pass: ok + fail: 发文字号年份应用六角括号「〔〕」,不得使用方括号 + + - rule_id: GW-N-002 + name: 发文字号不可加"第"字 + severity: error + category: 发文 + target: doc_number + on_missing: fail + stages: + - check: regex_forbid + pattern: '〔\d{4}〕第\d+号' + messages: + pass: ok + fail: 发文字号顺序号前不应加"第"字 + + - rule_id: GW-N-003 + name: 发文字号顺序号不编虚位 + severity: error + category: 发文 + target: doc_number + on_missing: fail + stages: + - check: regex_forbid + pattern: '〔\d{4}〕0\d+号' + messages: + pass: ok + fail: 发文字号顺序号不编虚位(如"02号"应为"2号") + + - group: 格式(错误汇编 二) + rules: + - rule_id: GW-F-001 + name: 主标题用方正小标宋简体二号 + severity: error + category: 格式 + target: title + on_missing: fail + stages: + - check: font + expect: + eastasia: 方正小标宋简体 + size_pt: 22 + messages: + pass: ok + fail: 主标题应使用方正小标宋简体二号 + + - rule_id: GW-F-002 + name: 一级标题用黑体三号 + severity: error + category: 格式 + applies_to: + role: heading_1 + on_missing: skip + stages: + - check: font + expect: + eastasia: 黑体 + size_pt: 16 + messages: + pass: ok + fail: 一级标题应使用黑体三号 + + - rule_id: GW-F-003 + name: 二级标题用楷体三号 + severity: error + category: 格式 + applies_to: + role: heading_2 + on_missing: skip + stages: + - check: font + expect: + eastasia: 楷体 + size_pt: 16 + messages: + pass: ok + fail: 二级标题应使用楷体三号 + + - rule_id: GW-F-004 + name: 正文用仿宋三号 + severity: warning + category: 格式 + applies_to: + role: body + on_missing: skip + stages: + - check: font + expect: + eastasia: 仿宋 + size_pt: 16 + messages: + pass: ok + fail: 正文应使用仿宋(GB2312)三号 + + - rule_id: GW-F-005 + name: 附件后不加冒号 + severity: error + category: 格式 + applies_to: + role: attachment_marker + on_missing: skip + stages: + - check: regex_forbid + pattern: '^附件\d+:' + messages: + pass: ok + fail: '"附件1"等字样后不应加冒号' + + - rule_id: GW-F-006 + name: 不使用"(此页无正文)" + severity: warning + category: 格式 + applies_to: + role: any + on_missing: skip + stages: + - check: forbid_phrase + phrases: + - (此页无正文) + - (此页无正文) + messages: + pass: ok + fail: 应通过编辑排版避免出现"(此页无正文)" + + - rule_id: GW-F-007 + name: 附件项末尾不加标点 + severity: warning + category: 格式 + applies_to: + role: any + on_missing: skip + stages: + - check: cross_role + rules: + - type: attachment_item_no_trailing_punct + messages: + pass: ok + fail: 附件名称(内容)后不应使用标点符号 + + - rule_id: GW-F-008 + name: 三级标题用仿宋三号 + severity: warning + category: 格式 + applies_to: + role: heading_3 + on_missing: skip + stages: + - check: font + expect: + eastasia: 仿宋 + size_pt: 16 + messages: + pass: ok + fail: 三级标题应使用仿宋(GB2312)三号 + + - rule_id: GW-F-009 + name: 四级标题用仿宋三号 + severity: warning + category: 格式 + applies_to: + role: heading_4 + on_missing: skip + stages: + - check: font + expect: + eastasia: 仿宋 + size_pt: 16 + messages: + pass: ok + fail: 四级标题应使用仿宋(GB2312)三号 + + - rule_id: GW-F-010 + name: 附件标记用黑体三号不加粗 + severity: error + category: 格式 + applies_to: + role: attachment_marker + on_missing: skip + stages: + - check: attachment_marker_style + expect: + eastasia: 黑体 + size_pt: 16 + bold: false + messages: + pass: ok + fail: '"附件:"或"附件1"等标记应使用黑体三号,且不加粗' + + - group: 层级序号(错误汇编 四) + rules: + - rule_id: GW-H-001 + name: 层级序号格式 + severity: error + category: 层级 + applies_to: + role: any + on_missing: skip + stages: + - check: hierarchy + forbid_patterns: + - '^[一二三四五六七八九十]+、.*[、。]$' + - '^\d+、' + - '^([一二三四五六七八九十]+)、' + messages: + pass: ok + fail: 层级序号格式错误 + + - rule_id: GW-H-002 + name: 二级标题换行不带句号 + severity: warning + category: 层级 + applies_to: + role: heading_2 + on_missing: skip + stages: + - check: cross_role + rules: + - type: h2_no_period_then_break + messages: + pass: ok + fail: 二级标题在换行分段时不应使用句号 + + - group: 标点符号(错误汇编 六) + rules: + - rule_id: GW-P-001 + name: 多书名号/引号并列不加顿号 + severity: warning + category: 标点 + applies_to: + role: any + on_missing: skip + stages: + - check: punctuation + rules: + - type: no_dunhao_between_quotes + messages: + pass: ok + fail: 多个书名号/引号并列时不应用顿号分隔 + + - rule_id: GW-P-002 + name: 句内括号末尾不加标点 + severity: warning + category: 标点 + applies_to: + role: any + on_missing: skip + stages: + - check: punctuation + rules: + - type: no_punct_inside_inline_paren + messages: + pass: ok + fail: 句内括号行文末尾通常不应含标点 + + - rule_id: GW-P-003 + name: 引号嵌套不规范 + severity: warning + category: 标点 + applies_to: + role: any + on_missing: skip + stages: + - check: punctuation + rules: + - type: no_outer_quote_when_inner_quote + messages: + pass: ok + fail: 双引号内已含单引号强调时,外层不应再加双引号(如"卓'粤'创一流"应为 卓"粤"创一流) + + - group: 文字表述与提法(错误汇编 七、八、九) + rules: + - rule_id: GW-W-001 + name: 易混淆词使用 + severity: warning + category: 文字 + applies_to: + role: any + on_missing: skip + stages: + - check: confused_pair + pairs: + - wrong: 截至到 + correct: 截止到 + reason: '"截至" 已含"到"之意' + - wrong: 下称 + correct: 以下简称 + reason: 标注简称应用"以下简称" + - wrong_pattern: '截止\d{4}年' + suggest: 截至YYYY年 + reason: 用于到某时点应为"截至" + messages: + pass: ok + fail: 易混淆词使用不当 + + - rule_id: GW-W-002 + name: 简称使用规范 + severity: warning + category: 简称 + applies_to: + role: body + on_missing: skip + stages: + - check: ai + prompt: | + 只在文中出现以下两种省级职务简称错误时才报错,否则一律 pass: + - "X省省委书记" 错误(应为 "X省委书记",省字不重复) + - "X省长" 错误(应为 "X省省长",省字不可省略) + + 若文中没有"省委书记"或"省长"等省级职务字样,**必须 pass**。 + 若不能在文中找到准确的错误原文,**必须 pass**。 + 不要做语气、措辞、其它简称的检查。 + + 全文片段: + {{paragraphs[0]}} + messages: + pass: 简称规范 + fail: 简称使用不规范 + + - rule_id: GW-W-003 + name: 成文日期用阿拉伯数字 + severity: error + category: 提法 + target: date + on_missing: fail + stages: + - check: regex_forbid + pattern: '[一二三四五六七八九十○〇零]+年' + messages: + pass: ok + fail: 成文日期应使用阿拉伯数字(如"2023年10月9日") + + - rule_id: GW-W-004 + name: 成文日期不编虚位 + severity: warning + category: 提法 + target: date + on_missing: fail + stages: + - check: regex_forbid + pattern: '\d{4}年0\d月|\d{4}年\d{1,2}月0\d日' + messages: + pass: ok + fail: 成文日期月、日不编虚位 + + - group: 发文机关(错误汇编 十) + rules: + - rule_id: GW-S-001 + name: 发文机关署名不能用简称 + severity: error + category: 机关 + target: signature + on_missing: fail + stages: + - check: ai + prompt: | + 判断署名是否含**明确的简称错误**。 + 典型错误: + - "广东省烟草专卖局(公司)" — 用括号缩短两个机关 → 错 + - "省局" / "粤烟" 等单独缩写 → 错 + 典型正确: + - "广东省烟草专卖局" 单独出现 → pass(即使可能存在配套总公司,但单独存在不算简称) + - "广东省烟草专卖局 中国烟草总公司广东省公司" → pass + + 判断准则: + - 若署名是一个完整、官方、可独立成立的机关名 → **必须 pass** + - 若署名带"(公司)"、"省局"、明显缩写、行业内部代号 → fail + + 署名原文: + {{signature.text}} + messages: + pass: 署名规范 + fail: 发文机关署名使用了简称 + + - rule_id: GW-S-002 + name: 发文机关确定严谨性 + severity: warning + category: 机关 + target: signature + on_missing: fail + stages: + - check: ai + prompt: | + 只判断**这一个明确条件**: + - 标题或正文里明确涉及"党组""党的xx工作""组织部""纪委"等党务事项, + 但署名是行政机关(局/公司/委员会等),未署"党组"或党务机构 → fail + - 其它情况(行政事务、缺乏证据、性质模糊)→ **必须 pass** + + 判断时需要看到**明确的党务关键词**(党组/党委/党的xx会议/党风/反腐倡廉等), + 没有这些关键词就 pass。 + + 署名原文:{{signature.text}} + 标题:{{title.text}} + messages: + pass: 发文机关一致 + fail: 发文机关与文稿性质不一致 + + - group: 标题字体(target 通道示例) + rules: + - rule_id: GW-T-008 + name: 标题字体(语义实体通道) + severity: warning + category: 标题 + target: title + on_missing: warn + stages: + - check: ai + prompt: | + 判断公文标题的字体与字号是否合规。 + 要求:字体 = 方正小标宋简体;字号 = 22pt(或 22.0)。 + + 实际: + - 标题:{{title.text}} + - 字体:{{title.style.font_eastasia}} + - 字号:{{title.style.font_size_pt}}pt + + 若实际字体为空或与要求一致 → pass + 若字体明显不符(例如 仿宋/楷体/黑体)→ fail + 若仅字号轻微差异 → warn + messages: + pass: 标题字体字号合规 + fail: 标题字体或字号不符合 GB/T 9704 diff --git a/scripts/创建sql/schema_add_govdoc_module.sql b/scripts/创建sql/schema_add_govdoc_module.sql index 2f31778..a322aa3 100644 --- a/scripts/创建sql/schema_add_govdoc_module.sql +++ b/scripts/创建sql/schema_add_govdoc_module.sql @@ -33,6 +33,7 @@ CREATE TABLE IF NOT EXISTS public.govdoc_runs ( engine_version VARCHAR(64), llm_provider VARCHAR(64), llm_model VARCHAR(128), + rules_path VARCHAR(1024), -- 结果汇总 total_score NUMERIC(10, 2), @@ -52,6 +53,9 @@ CREATE TABLE IF NOT EXISTS public.govdoc_runs ( deleted_at TIMESTAMPTZ DEFAULT NULL ); +ALTER TABLE public.govdoc_runs + ADD COLUMN IF NOT EXISTS rules_path VARCHAR(1024); + COMMENT ON TABLE public.govdoc_runs IS '公文审查运行主表'; COMMENT ON COLUMN public.govdoc_runs.id IS '自增主键'; COMMENT ON COLUMN public.govdoc_runs.document_id IS '关联 leaudit_documents.id'; @@ -65,6 +69,7 @@ COMMENT ON COLUMN public.govdoc_runs.phase IS '当前阶段:parsing/executing/ COMMENT ON COLUMN public.govdoc_runs.engine_version IS '引擎版本号'; COMMENT ON COLUMN public.govdoc_runs.llm_provider IS 'LLM 提供商'; COMMENT ON COLUMN public.govdoc_runs.llm_model IS 'LLM 模型名'; +COMMENT ON COLUMN public.govdoc_runs.rules_path IS '本次运行使用的规则文件路径'; COMMENT ON COLUMN public.govdoc_runs.total_score IS '总分'; COMMENT ON COLUMN public.govdoc_runs.passed_count IS '通过规则数'; COMMENT ON COLUMN public.govdoc_runs.failed_count IS '未通过规则数'; @@ -106,6 +111,7 @@ CREATE TABLE IF NOT EXISTS public.govdoc_rule_results ( -- 判定 result VARCHAR(32) NOT NULL DEFAULT 'pass', + skip_reason TEXT, score NUMERIC(10, 2), created_at TIMESTAMPTZ NOT NULL DEFAULT NOW(), @@ -113,6 +119,9 @@ CREATE TABLE IF NOT EXISTS public.govdoc_rule_results ( deleted_at TIMESTAMPTZ DEFAULT NULL ); +ALTER TABLE public.govdoc_rule_results + ADD COLUMN IF NOT EXISTS skip_reason TEXT; + COMMENT ON TABLE public.govdoc_rule_results IS '公文规则执行结果明细表'; COMMENT ON COLUMN public.govdoc_rule_results.id IS '自增主键'; COMMENT ON COLUMN public.govdoc_rule_results.run_id IS '关联 govdoc_runs.id'; @@ -129,6 +138,7 @@ COMMENT ON COLUMN public.govdoc_rule_results.paragraph_index IS '段落索引'; COMMENT ON COLUMN public.govdoc_rule_results.paragraph_text IS '段落原文'; COMMENT ON COLUMN public.govdoc_rule_results.location_path IS '文档结构位置路径'; COMMENT ON COLUMN public.govdoc_rule_results.result IS '执行结果:pass/fail/skipped/error'; +COMMENT ON COLUMN public.govdoc_rule_results.skip_reason IS '跳过原因,仅 skipped/error 时使用'; COMMENT ON COLUMN public.govdoc_rule_results.score IS '本条得分'; CREATE INDEX IF NOT EXISTS idx_govdoc_rule_results_run_id ON public.govdoc_rule_results(run_id) WHERE deleted_at IS NULL; @@ -195,4 +205,4 @@ END $$; -- 为 engine_type 加索引,方便按模块过滤文档列表 CREATE INDEX IF NOT EXISTS idx_leaudit_documents_engine_type ON public.leaudit_documents(engine_type) WHERE deleted_at IS NULL; -COMMIT; \ No newline at end of file +COMMIT;