feat: integrate govdoc module into leaudit platform

2026-05-17 19:24:16 +08:00
parent cb13e61d3d
commit a73826dc1d
16 changed files with 2334 additions and 280 deletions
@@ -26,6 +26,7 @@ class ResultAdapter:
        EngineResult: AuditResult,
        Structure: list[dict[str, Any]] | None = None,
        Outline: list[dict[str, Any]] | None = None,
+        Entities: list[dict[str, Any]] | None = None,
    ) -> dict[str, Any]:
        """从 AuditResult.summary 提取 run 汇总字段。

@@ -40,6 +41,10 @@ class ResultAdapter:
            aux["structure"] = Structure
        if Outline is not None:
            aux["outline"] = Outline
+        if Entities is not None:
+            aux["entities"] = {
+                entity["name"]: entity for entity in Entities if entity.get("name")
+            }

        return {
            "totalScore": s.score,
@@ -100,6 +105,7 @@ class ResultAdapter:
                "primaryRole": entity.primary_role,
                "source": entity.source,
                "confidence": entity.confidence,
+                "extra": entity.extra,
            })
        return entities

@@ -9,14 +9,26 @@

 from __future__ import annotations

+import hashlib
+import shutil
+import tempfile
 from dataclasses import dataclass, field
+from pathlib import Path
 from typing import Any

+from sqlalchemy import text
+
 from fastapi_common.fastapi_common_logger import logger
+from fastapi_common.fastapi_common_sqlalchemy.database import GetAsyncSession
+from fastapi_common.fastapi_common_storage.oss_path_utils import OssPathUtils

 from fastapi_modules.fastapi_leaudit.govdoc_bridge.input_resolver import InputResolver
-from fastapi_modules.fastapi_leaudit.govdoc_bridge.result_adapter import ResultAdapter
 from fastapi_modules.fastapi_leaudit.govdoc_bridge.storage_adapter import StorageAdapter
+from fastapi_modules.fastapi_leaudit.govdoc_engine.parser.docx_parser import parse_docx
+from fastapi_modules.fastapi_leaudit.govdoc_engine.reporter.docx_annotator import annotate_docx
+from fastapi_modules.fastapi_leaudit.govdoc_engine.reporter.html_paragraph import paragraphs_to_html
+from fastapi_modules.fastapi_leaudit.govdoc_engine.reporter.html_renderer import render_html
+from fastapi_modules.fastapi_leaudit.services.impl.ossServiceImpl import OssServiceImpl

 log = logger

@@ -33,13 +45,30 @@ class GovdocRunner:

    InputResolver: InputResolver = field(default_factory=InputResolver)
    Storage: StorageAdapter = field(default_factory=StorageAdapter)
-    ResultAdapter: ResultAdapter = field(default_factory=ResultAdapter)
+    OssService: OssServiceImpl = field(default_factory=OssServiceImpl)
+    ResultAdapter: Any | None = None
+
+    def ResolveRulesPath(self, RulesPath: str | None) -> str:
+        """Normalize and validate the rules file path needed for a run.
+
+        Args:
+            RulesPath: Path to the govdoc rules file. May be relative, may
+                start with ``~``, and may be ``None`` or blank.
+
+        Returns:
+            The fully resolved path of the rules file, as a string.
+
+        Raises:
+            ValueError: If no path was supplied (``None``, empty or blank).
+            FileNotFoundError: If the resolved path is not an existing file.
+        """
+        rawPath = RulesPath.strip() if RulesPath else ""
+        if rawPath == "":
+            raise ValueError("未提供 govdoc rules_path，当前任务无法执行规则审查")
+
+        expanded = Path(rawPath).expanduser()
+        # Relative paths are anchored at the current working directory.
+        anchored = expanded if expanded.is_absolute() else Path.cwd() / expanded
+        resolved = anchored.resolve()
+
+        if resolved.is_file():
+            return str(resolved)
+        raise FileNotFoundError(f"govdoc 规则文件不存在: {resolved}")

    async def Execute(
        self,
        DocumentId: int,
        RunId: int,
-        RulesPath: str,
+        RulesPath: str | None = None,
        TriggerUserId: int | None = None,
        Speed: str = "normal",
    ) -> dict[str, Any]:
@@ -54,57 +83,216 @@ class GovdocRunner:
        Returns:
            执行摘要 dict。
        """
+        resolvedRulesPath = self.ResolveRulesPath(RulesPath)
        log.info(f"[Govdoc] Starting execution: runId={RunId}, documentId={DocumentId}")
+        artifactTempDir: str | None = None
+        inputPayload = None
+        try:
+            # 1. 更新 run 状态 → processing
+            await self.Storage.UpdateRunStatus(RunId, "processing", Phase="parsing")
+            await self.Storage.UpdateDocumentStatus(DocumentId, "processing", RunId)

-        # 1. 更新 run 状态 → processing
-        await self.Storage.UpdateRunStatus(RunId, "processing", phase="parsing")
-        await self.Storage.UpdateDocumentStatus(DocumentId, "processing", RunId)
+            # 2. 解析输入文件
+            inputPayload = await self.InputResolver.ResolveForDocument(DocumentId)
+            log.info(f"[Govdoc] Input resolved: {inputPayload.fileName} → {inputPayload.localPath}")

-        # 2. 解析输入文件
-        inputPayload = await self.InputResolver.ResolveForDocument(DocumentId)
-        log.info(f"[Govdoc] Input resolved: {inputPayload.fileName} → {inputPayload.localPath}")
+            # 3. 调用 govdoc_engine 执行审查
+            from fastapi_modules.fastapi_leaudit.govdoc_bridge.result_adapter import ResultAdapter
+            from fastapi_modules.fastapi_leaudit.govdoc_engine.pipeline import run as engine_run

-        # 3. 调用 govdoc_engine 执行审查
-        from fastapi_modules.fastapi_leaudit.govdoc_engine.pipeline import run as engine_run
+            if self.ResultAdapter is None:
+                self.ResultAdapter = ResultAdapter()

-        engineResult = await engine_run(
-            file_path=inputPayload.localPath,
-            rules_path=RulesPath,
-            llm_client=None,  # 使用默认 LlmClient (从平台配置加载)
+            engineResult = await engine_run(
+                file_path=inputPayload.localPath,
+                rules_path=resolvedRulesPath,
+                llm_client=None,  # 使用默认 LlmClient (从平台配置加载)
+            )
+            engineResult.document["filename"] = inputPayload.fileName
+
+            # 4. 适配引擎结果
+            structure = self.ResultAdapter.AdaptStructure(engineResult)
+            outline = self.ResultAdapter.AdaptOutline(engineResult)
+            entities = self.ResultAdapter.AdaptEntities(engineResult)
+            runSummary = self.ResultAdapter.AdaptRunSummary(
+                engineResult,
+                Structure=structure,
+                Outline=outline,
+                Entities=entities,
+            )
+            ruleResults = self.ResultAdapter.AdaptRuleResults(engineResult)
+            checkedRuleResults = self.ResultAdapter.AdaptCheckedRules(engineResult)
+            artifactTempDir, artifacts = await self._GenerateArtifacts(
+                DocumentId=DocumentId,
+                RunId=RunId,
+                InputPath=inputPayload.localPath,
+                InputFileName=inputPayload.fileName,
+                EngineResult=engineResult,
+                RuleResults=ruleResults,
+            )
+
+            failedRuleIds = {str(row.get("ruleId") or "") for row in ruleResults}
+            for checkedRule in checkedRuleResults:
+                ruleId = str(checkedRule.get("ruleId") or "")
+                if checkedRule.get("result") == "fail" and ruleId in failedRuleIds:
+                    continue
+                ruleResults.append(checkedRule)
+
+            # 将 rules_path 附带到 runSummary 中，供 GetRuleDetail 后续解析
+            runSummary["rulesPath"] = resolvedRulesPath
+
+            # 5. 持久化结果
+            await self.Storage.UpdateRunResult(RunId, runSummary)
+            await self.Storage.SaveRuleResults(RunId, ruleResults)
+            await self.Storage.SaveArtifacts(RunId, artifacts)
+
+            # 6. 更新终态
+            await self.Storage.UpdateRunStatus(RunId, "completed", Phase="reporting")
+            await self.Storage.UpdateDocumentStatus(DocumentId, "completed", RunId)
+
+            log.info(f"[Govdoc] Execution completed: runId={RunId}")
+
+            return {
+                "runId": RunId,
+                "documentId": DocumentId,
+                "status": "completed",
+                "ruleResultsCount": len(ruleResults),
+                "structureCount": len(structure),
+                "outlineCount": len(outline),
+                "artifactCount": len(artifacts),
+            }
+        finally:
+            if artifactTempDir:
+                shutil.rmtree(artifactTempDir, ignore_errors=True)
+            if inputPayload and inputPayload.tempDir:
+                shutil.rmtree(inputPayload.tempDir, ignore_errors=True)
+
+    async def _GenerateArtifacts(
+        self,
+        DocumentId: int,
+        RunId: int,
+        InputPath: str,
+        InputFileName: str,
+        EngineResult: Any,
+        RuleResults: list[dict[str, Any]],
+    ) -> tuple[str, list[dict[str, Any]]]:
+        """Build the report artifacts for a run and upload them to OSS.
+
+        Three artifacts are produced: an annotated DOCX, an HTML report and a
+        per-paragraph HTML view. Each one is uploaded through ``OssService``.
+
+        Args:
+            DocumentId: Document whose region is used in the object keys.
+            RunId: Run identifier; used in the temp dir prefix and object keys.
+            InputPath: Local path of the source document.
+            InputFileName: Original file name; the artifact base name is
+                derived from it via ``OssPathUtils.BuildSafeFileStem``.
+            EngineResult: Engine output passed to the annotator and renderer.
+            RuleResults: Adapted rule rows; only rows whose ``result`` is
+                ``"fail"`` and that carry a ``paragraphIndex`` are linked to
+                paragraphs.
+
+        Returns:
+            A tuple ``(artifactDir, rows)``: the temporary directory created
+            here (the caller removes it) and one artifact row per upload.
+
+        NOTE(review): the temporary directory is only handed back on success.
+        If any step after ``mkdtemp`` raises, the caller never learns the path
+        and its ``finally`` cleanup cannot remove the directory.
+        """
+        artifactDir = tempfile.mkdtemp(prefix=f"govdoc_artifacts_{RunId}_")
+        sourcePath = Path(InputPath)
+        baseName = OssPathUtils.BuildSafeFileStem(InputFileName)
+        region = await self._GetDocumentRegion(DocumentId)
+
+        # presumably annotate_docx writes its output to annotatedPath; the file
+        # is read back from that path below -- TODO confirm.
+        annotatedPath = Path(artifactDir) / f"{baseName}.annotated.docx"
+        annotate_docx(sourcePath, annotatedPath, EngineResult)
+
+        htmlReport = render_html(EngineResult)
+
+        # Map paragraph index -> finding ids for failed rules, so the paragraph
+        # view can reference the findings attached to each paragraph.
+        doc = parse_docx(sourcePath)
+        findingMap: dict[int, list[str]] = {}
+        for row in RuleResults:
+            if row.get("result") != "fail":
+                continue
+            paragraphIndex = row.get("paragraphIndex")
+            if paragraphIndex is None:
+                continue
+            findingId = f"{row.get('ruleId') or 'finding'}-{paragraphIndex}"
+            findingMap.setdefault(int(paragraphIndex), []).append(findingId)
+        paragraphsHtml = paragraphs_to_html(doc, findingMap)
+
+        annotatedUrl = await self.OssService.UploadBytes(
+            ObjectKey=OssPathUtils.BuildArtifactKey(
+                Region=region,
+                RunId=RunId,
+                ArtifactType="annotated_docx",
+                Detail=f"{baseName}.annotated.docx",
+            ),
+            Content=annotatedPath.read_bytes(),
+            ContentType="application/vnd.openxmlformats-officedocument.wordprocessingml.document",
+        )
+        htmlUrl = await self.OssService.UploadText(
+            ObjectKey=OssPathUtils.BuildArtifactKey(
+                Region=region,
+                RunId=RunId,
+                ArtifactType="html_report",
+                Detail=f"{baseName}.report.html",
+            ),
+            Content=htmlReport,
+            ContentType="text/html; charset=utf-8",
+        )
+        paragraphUrl = await self.OssService.UploadText(
+            ObjectKey=OssPathUtils.BuildArtifactKey(
+                Region=region,
+                RunId=RunId,
+                ArtifactType="paragraph_html",
+                Detail=f"{baseName}.paragraphs.html",
+            ),
+            Content=paragraphsHtml,
+            ContentType="text/html; charset=utf-8",
        )

-        # 4. 适配引擎结果
-        structure = self.ResultAdapter.AdaptStructure(engineResult)
-        outline = self.ResultAdapter.AdaptOutline(engineResult)
-        runSummary = self.ResultAdapter.AdaptRunSummary(
-            engineResult,
-            Structure=structure,
-            Outline=outline,
-        )
-        ruleResults = self.ResultAdapter.AdaptRuleResults(engineResult)
-        entities = self.ResultAdapter.AdaptEntities(engineResult)
-        artifacts = self.ResultAdapter.AdaptArtifacts(engineResult, RunId)
+        # The annotated DOCX is read from disk a second time here so the row's
+        # size and hash are computed from the file content.
+        return artifactDir, [
+            self._BuildArtifactRow(
+                artifactType="annotated_docx",
+                fileName=f"{baseName}.annotated.docx",
+                mimeType="application/vnd.openxmlformats-officedocument.wordprocessingml.document",
+                content=annotatedPath.read_bytes(),
+                ossUrl=annotatedUrl,
+                description="批注 DOCX",
+            ),
+            self._BuildArtifactRow(
+                artifactType="html_report",
+                fileName=f"{baseName}.report.html",
+                mimeType="text/html; charset=utf-8",
+                content=htmlReport.encode("utf-8"),
+                ossUrl=htmlUrl,
+                description="HTML 审查报告",
+            ),
+            self._BuildArtifactRow(
+                artifactType="paragraph_html",
+                fileName=f"{baseName}.paragraphs.html",
+                mimeType="text/html; charset=utf-8",
+                content=paragraphsHtml.encode("utf-8"),
+                ossUrl=paragraphUrl,
+                description="段落联动视图",
+            ),
+        ]

-        # 将 rules_path 附带到 runSummary 中，供 GetRuleDetail 后续解析
-        runSummary["rulesPath"] = RulesPath
-
-        # 5. 持久化结果
-        await self.Storage.UpdateRunResult(RunId, runSummary)
-        await self.Storage.SaveRuleResults(RunId, ruleResults)
-        await self.Storage.SaveArtifacts(RunId, artifacts)
-
-        # 6. 更新终态
-        await self.Storage.UpdateRunStatus(RunId, "completed", phase="reporting")
-        await self.Storage.UpdateDocumentStatus(DocumentId, "completed", RunId)
-
-        log.info(f"[Govdoc] Execution completed: runId={RunId}")
+    async def _GetDocumentRegion(self, DocumentId: int) -> str:
+        """Look up the region stored for a document.
+
+        Args:
+            DocumentId: Primary key of the row in ``leaudit_documents``.
+
+        Returns:
+            The document's region, or ``"default"`` when the row is missing or
+            its region is NULL or empty.
+        """
+        bindParams = {"document_id": DocumentId}
+        async with GetAsyncSession() as session:
+            result = await session.execute(
+                text(
+                    """
+                        SELECT COALESCE(region, 'default') AS region
+                        FROM leaudit_documents
+                        WHERE id = :document_id
+                        LIMIT 1
+                        """
+                ),
+                bindParams,
+            )
+            record = result.mappings().first()
+        if not record:
+            return "default"
+        return str(record["region"] or "default")

+    def _BuildArtifactRow(
+        self,
+        *,
+        artifactType: str,
+        fileName: str,
+        mimeType: str,
+        content: bytes,
+        ossUrl: str,
+        description: str,
+    ) -> dict[str, Any]:
+        """Assemble the metadata row describing one uploaded artifact.
+
+        Args:
+            artifactType: Artifact kind, e.g. ``"annotated_docx"``.
+            fileName: Artifact file name; its extension becomes ``fileExt``.
+            mimeType: MIME type recorded for the artifact.
+            content: Raw artifact bytes, used only for size and SHA-256.
+            ossUrl: URL returned by the upload.
+            description: Human-readable label for the artifact.
+
+        Returns:
+            A dict with the artifact metadata. ``storageProvider`` is always
+            ``"minio"``.
+        """
+        # Extension without the leading dot, lower-cased ("" if there is none).
+        fileExt = Path(fileName).suffix.lstrip(".").lower()
        return {
-            "runId": RunId,
-            "documentId": DocumentId,
-            "status": "completed",
-            "ruleResultsCount": len(ruleResults),
-            "structureCount": len(structure),
-            "outlineCount": len(outline),
-            "artifactCount": len(artifacts),
+            "artifactType": artifactType,
+            "fileName": fileName,
+            "fileExt": fileExt,
+            "mimeType": mimeType,
+            "fileSize": len(content),
+            "sha256": hashlib.sha256(content).hexdigest(),
+            "ossUrl": ossUrl,
+            "storageProvider": "minio",
+            "description": description,
        }
@@ -6,6 +6,7 @@ govdoc_report_artifacts 表，并更新 leaudit_documents 状态。

 from __future__ import annotations

+import json
 from datetime import datetime, timezone
 from typing import Any

@@ -16,6 +17,33 @@ from sqlalchemy import text
 log = logger


+# Maps each run-result bind parameter to the summary keys that may carry its
+# value. Keys are tried in the listed order and the first one present wins,
+# so both camelCase and snake_case summaries are accepted.
+_RUN_RESULT_FALLBACK_KEYS = {
+    "total_score": ["totalScore", "score"],
+    "passed_count": ["passedCount", "passed_count"],
+    "failed_count": ["failedCount", "failed_count"],
+    "skipped_count": ["skippedCount", "skipped_count"],
+    "result_status": ["resultStatus", "result_status"],
+    "result_summary_json": ["resultSummaryJson", "result_summary_json"],
+}
+
+
+def _pick_value(payload: dict[str, Any], *keys: str) -> Any:
+    """Return the value stored under the first of ``keys`` found in ``payload``.
+
+    Presence is what counts, not truthiness: a key that maps to ``None``,
+    ``0`` or ``""`` still wins over later keys. Returns ``None`` when none of
+    the keys is present.
+    """
+    return next((payload[name] for name in keys if name in payload), None)
+
+
+def _to_text_payload(value: Any) -> str | None:
+    """Convert an arbitrary value into text suitable for a text column.
+
+    Args:
+        value: The value to store. May be ``None``, a string, a JSON-like
+            container or scalar, or any other object.
+
+    Returns:
+        ``None`` for ``None``; strings unchanged; containers and JSON scalars
+        as JSON text with non-ASCII characters kept; anything else via
+        ``str()``.
+    """
+    if value is None:
+        return None
+    if isinstance(value, str):
+        return value
+    if isinstance(value, (dict, list, tuple, bool, int, float)):
+        # default=str gives nested non-JSON values (dates, Decimals, ...) the
+        # same str() fallback that top-level objects get below, instead of
+        # raising TypeError and aborting the whole save.
+        return json.dumps(value, ensure_ascii=False, default=str)
+    return str(value)
+
+
 class StorageAdapter:
    """Govdoc 结果持久化适配器。

@@ -51,11 +79,23 @@ class StorageAdapter:
        log.info(f"[Govdoc] Run created: runId={run_id}, documentId={RunData['documentId']}")
        return run_id

-    async def UpdateRunStatus(self, RunId: int, Status: str, Phase: str | None = None, **Extra: Any) -> None:
-        """更新 run 状态和阶段。"""
+    async def UpdateRunStatus(
+        self,
+        RunId: int,
+        Status: str,
+        Phase: str | None = None,
+        **Extra: Any,
+    ) -> None:
+        """更新 run 状态和阶段。
+
+        兼容调用方传 ``phase=`` 或 ``Phase=``，避免因大小写不一致导致阶段不落库。
+        """
        set_clauses = ["status = :status", "updated_at = now()"]
        params: dict[str, Any] = {"rid": RunId, "status": Status}

+        if Phase is None:
+            Phase = _pick_value(Extra, "phase", "Phase")
+
        if Phase is not None:
            set_clauses.append("phase = :phase")
            params["phase"] = Phase
@@ -63,6 +103,9 @@ class StorageAdapter:
        if Status == "completed" or Status == "failed":
            set_clauses.append("finished_at = :finished_at")
            params["finished_at"] = datetime.now(timezone.utc)
+        elif Status == "processing":
+            set_clauses.append("started_at = COALESCE(started_at, :started_at)")
+            params["started_at"] = datetime.now(timezone.utc)

        async with GetAsyncSession() as session:
            await session.execute(
@@ -74,7 +117,7 @@ class StorageAdapter:

    async def UpdateRunResult(self, RunId: int, Summary: dict[str, Any]) -> None:
        """写入 run 结果汇总字段（含 rules_path / structure / outline）。"""
-        rules_path = Summary.get("rulesPath")
+        rules_path = _pick_value(Summary, "rulesPath", "rules_path")
        set_clauses = [
            "total_score = :total_score",
            "passed_count = :passed_count",
@@ -86,12 +129,12 @@ class StorageAdapter:
        ]
        params: dict[str, Any] = {
            "rid": RunId,
-            "total_score": Summary.get("totalScore"),
-            "passed_count": Summary.get("passedCount", 0),
-            "failed_count": Summary.get("failedCount", 0),
-            "skipped_count": Summary.get("skippedCount", 0),
-            "result_status": Summary.get("resultStatus"),
-            "result_summary_json": Summary.get("resultSummaryJson"),
+            "total_score": _pick_value(Summary, *_RUN_RESULT_FALLBACK_KEYS["total_score"]),
+            "passed_count": _pick_value(Summary, *_RUN_RESULT_FALLBACK_KEYS["passed_count"]) or 0,
+            "failed_count": _pick_value(Summary, *_RUN_RESULT_FALLBACK_KEYS["failed_count"]) or 0,
+            "skipped_count": _pick_value(Summary, *_RUN_RESULT_FALLBACK_KEYS["skipped_count"]) or 0,
+            "result_status": _pick_value(Summary, *_RUN_RESULT_FALLBACK_KEYS["result_status"]),
+            "result_summary_json": _pick_value(Summary, *_RUN_RESULT_FALLBACK_KEYS["result_summary_json"]),
        }
        if rules_path:
            set_clauses.append("rules_path = :rules_path")
@@ -137,11 +180,11 @@ class StorageAdapter:
                        (run_id, rule_id, rule_name, severity, category,
                         message, suggestion, actual, expected, evidence,
                         paragraph_index, paragraph_text, location_path,
-                         result, score, created_at, updated_at)
+                         result, skip_reason, score, created_at, updated_at)
                        VALUES (:run_id, :rule_id, :rule_name, :severity, :category,
                         :message, :suggestion, :actual, :expected, :evidence,
                         :paragraph_index, :paragraph_text, :location_path,
-                         :result, :score, now(), now())"""
+                         :result, :skip_reason, :score, now(), now())"""
                    ),
                    {
                        "run_id": RunId,
@@ -151,13 +194,14 @@ class StorageAdapter:
                        "category": row.get("category"),
                        "message": row.get("message"),
                        "suggestion": row.get("suggestion"),
-                        "actual": row.get("actual"),
-                        "expected": row.get("expected"),
-                        "evidence": row.get("evidence"),
+                        "actual": _to_text_payload(row.get("actual")),
+                        "expected": _to_text_payload(row.get("expected")),
+                        "evidence": _to_text_payload(row.get("evidence")),
                        "paragraph_index": row.get("paragraphIndex"),
                        "paragraph_text": row.get("paragraphText"),
                        "location_path": row.get("locationPath"),
                        "result": row.get("result", "pass"),
+                        "skip_reason": _pick_value(row, "skipReason", "skip_reason"),
                        "score": row.get("score"),
                    },
                )
@@ -11,13 +11,17 @@ from typing import Any
 from fastapi_common.fastapi_common_logger import logger

 from fastapi_admin.celery_app import celery_app
+from fastapi_admin.config import (
+    LEAUDIT_WORKER_QUEUE_NORMAL,
+    LEAUDIT_WORKER_QUEUE_URGENT,
+)
 from fastapi_modules.fastapi_leaudit.govdoc_bridge.runner import GovdocRunner
 from fastapi_modules.fastapi_leaudit.govdoc_bridge.storage_adapter import StorageAdapter

 log = logger

-GOVDOC_WORKER_QUEUE = "govdoc"
-GOVDOC_WORKER_QUEUE_URGENT = "govdoc_urgent"
+GOVDOC_WORKER_QUEUE = LEAUDIT_WORKER_QUEUE_NORMAL
+GOVDOC_WORKER_QUEUE_URGENT = LEAUDIT_WORKER_QUEUE_URGENT


 def resolve_govdoc_queue(speed: str = "normal") -> str:
@@ -30,6 +34,7 @@ def resolve_govdoc_queue(speed: str = "normal") -> str:
 def dispatch_govdoc_task(
    documentId: int,
    runId: int,
+    rulesPath: str | None = None,
    triggerUserId: int | None = None,
    speed: str = "normal",
 ) -> Any:
@@ -52,6 +57,7 @@ def dispatch_govdoc_task(
        kwargs={
            "documentId": documentId,
            "runId": runId,
+            "rulesPath": rulesPath,
            "triggerUserId": triggerUserId,
            "speed": speed,
        },
@@ -73,6 +79,7 @@ def govdoc_execute_task(
    self,
    documentId: int,
    runId: int,
+    rulesPath: str | None = None,
    triggerUserId: int | None = None,
    speed: str = "normal",
 ) -> dict[str, Any]:
@@ -89,7 +96,7 @@ def govdoc_execute_task(

    try:
        # 更新 run 状态 → running
-        loop.run_until_complete(storage.UpdateRunStatus(runId, "processing", phase="parsing"))
+        loop.run_until_complete(storage.UpdateRunStatus(runId, "processing", Phase="parsing"))

        # 执行完整审查链路
        runner = GovdocRunner()
@@ -97,6 +104,7 @@ def govdoc_execute_task(
            runner.Execute(
                DocumentId=documentId,
                RunId=runId,
+                RulesPath=rulesPath,
                TriggerUserId=triggerUserId,
                Speed=speed,
            )