"""Govdoc Bridge — 执行编排器。 负责组织一次完整的 govdoc 审查执行链路: 1. 解析输入文件 (input_resolver) 2. 调用 govdoc_engine.pipeline 执行审查 3. 收集并适配结果 (result_adapter) 4. 持久化结果 (storage_adapter) """ from __future__ import annotations import hashlib import shutil import tempfile from dataclasses import dataclass, field from pathlib import Path from typing import Any from sqlalchemy import text from fastapi_common.fastapi_common_logger import logger from fastapi_common.fastapi_common_sqlalchemy.database import GetAsyncSession from fastapi_common.fastapi_common_storage.oss_path_utils import OssPathUtils from fastapi_modules.fastapi_leaudit.govdoc_bridge.input_resolver import InputResolver from fastapi_modules.fastapi_leaudit.govdoc_bridge.storage_adapter import StorageAdapter from fastapi_modules.fastapi_leaudit.govdoc_engine.parser.docx_parser import parse_docx from fastapi_modules.fastapi_leaudit.govdoc_engine.reporter.docx_annotator import annotate_docx from fastapi_modules.fastapi_leaudit.govdoc_engine.reporter.html_paragraph import paragraphs_to_html from fastapi_modules.fastapi_leaudit.govdoc_engine.reporter.html_renderer import render_html from fastapi_modules.fastapi_leaudit.services.impl.ossServiceImpl import OssServiceImpl log = logger @dataclass class GovdocRunner: """Govdoc 引擎一次完整审查的执行编排器。 当前为 Phase 1 骨架: - engine 集成点已标注 - 可先跑通 run 创建 → 状态更新 → 结果持久化 的完整生命周期 - 引擎执行部分待 govdoc_engine 迁入后接入 """ InputResolver: InputResolver = field(default_factory=InputResolver) Storage: StorageAdapter = field(default_factory=StorageAdapter) OssService: OssServiceImpl = field(default_factory=OssServiceImpl) ResultAdapter: Any | None = None def ResolveRulesPath(self, RulesPath: str | None) -> str: """解析并校验执行所需规则文件路径。""" candidate = (RulesPath or "").strip() if not candidate: raise ValueError("未提供 govdoc rules_path,当前任务无法执行规则审查") path = Path(candidate).expanduser() if not path.is_absolute(): path = Path.cwd() / path path = path.resolve() if not path.is_file(): raise FileNotFoundError(f"govdoc 规则文件不存在: {path}") return str(path) async def Execute( self, DocumentId: int, RunId: int, RulesPath: str | None = None, TriggerUserId: int | None = None, Speed: str = "normal", ) -> dict[str, Any]: """执行一次完整的 govdoc 审查。 Args: DocumentId: 文档 ID。 RunId: 已创建的 govdoc_runs.id。 TriggerUserId: 触发人。 Speed: 执行速度 ('normal' / 'urgent')。 Returns: 执行摘要 dict。 """ resolvedRulesPath = self.ResolveRulesPath(RulesPath) log.info(f"[Govdoc] Starting execution: runId={RunId}, documentId={DocumentId}") artifactTempDir: str | None = None inputPayload = None try: # 1. 更新 run 状态 → processing await self.Storage.UpdateRunStatus(RunId, "processing", Phase="parsing") await self.Storage.UpdateDocumentStatus(DocumentId, "processing", RunId) # 2. 解析输入文件 inputPayload = await self.InputResolver.ResolveForDocument(DocumentId) log.info(f"[Govdoc] Input resolved: {inputPayload.fileName} → {inputPayload.localPath}") # 3. 调用 govdoc_engine 执行审查 from fastapi_modules.fastapi_leaudit.govdoc_bridge.result_adapter import ResultAdapter from fastapi_modules.fastapi_leaudit.govdoc_engine.pipeline import run as engine_run if self.ResultAdapter is None: self.ResultAdapter = ResultAdapter() engineResult = await engine_run( file_path=inputPayload.localPath, rules_path=resolvedRulesPath, llm_client=None, # 使用默认 LlmClient (从平台配置加载) ) engineResult.document["filename"] = inputPayload.fileName # 4. 适配引擎结果 structure = self.ResultAdapter.AdaptStructure(engineResult) outline = self.ResultAdapter.AdaptOutline(engineResult) entities = self.ResultAdapter.AdaptEntities(engineResult) runSummary = self.ResultAdapter.AdaptRunSummary( engineResult, Structure=structure, Outline=outline, Entities=entities, ) ruleResults = self.ResultAdapter.AdaptRuleResults(engineResult) checkedRuleResults = self.ResultAdapter.AdaptCheckedRules(engineResult) artifactTempDir, artifacts = await self._GenerateArtifacts( DocumentId=DocumentId, RunId=RunId, InputPath=inputPayload.localPath, InputFileName=inputPayload.fileName, EngineResult=engineResult, RuleResults=ruleResults, ) failedRuleIds = {str(row.get("ruleId") or "") for row in ruleResults} for checkedRule in checkedRuleResults: ruleId = str(checkedRule.get("ruleId") or "") if checkedRule.get("result") == "fail" and ruleId in failedRuleIds: continue ruleResults.append(checkedRule) # 将 rules_path 附带到 runSummary 中,供 GetRuleDetail 后续解析 runSummary["rulesPath"] = resolvedRulesPath # 5. 持久化结果 await self.Storage.UpdateRunResult(RunId, runSummary) await self.Storage.SaveRuleResults(RunId, ruleResults) await self.Storage.SaveArtifacts(RunId, artifacts) # 6. 更新终态 await self.Storage.UpdateRunStatus(RunId, "completed", Phase="reporting") await self.Storage.UpdateDocumentStatus(DocumentId, "completed", RunId) log.info(f"[Govdoc] Execution completed: runId={RunId}") return { "runId": RunId, "documentId": DocumentId, "status": "completed", "ruleResultsCount": len(ruleResults), "structureCount": len(structure), "outlineCount": len(outline), "artifactCount": len(artifacts), } finally: if artifactTempDir: shutil.rmtree(artifactTempDir, ignore_errors=True) if inputPayload and inputPayload.tempDir: shutil.rmtree(inputPayload.tempDir, ignore_errors=True) async def _GenerateArtifacts( self, DocumentId: int, RunId: int, InputPath: str, InputFileName: str, EngineResult: Any, RuleResults: list[dict[str, Any]], ) -> tuple[str, list[dict[str, Any]]]: """生成报告产物并上传到 OSS。""" artifactDir = tempfile.mkdtemp(prefix=f"govdoc_artifacts_{RunId}_") sourcePath = Path(InputPath) baseName = OssPathUtils.BuildSafeFileStem(InputFileName) region = await self._GetDocumentRegion(DocumentId) annotatedPath = Path(artifactDir) / f"{baseName}.annotated.docx" annotate_docx(sourcePath, annotatedPath, EngineResult) htmlReport = render_html(EngineResult) doc = parse_docx(sourcePath) findingMap: dict[int, list[str]] = {} for row in RuleResults: if row.get("result") != "fail": continue paragraphIndex = row.get("paragraphIndex") if paragraphIndex is None: continue findingId = f"{row.get('ruleId') or 'finding'}-{paragraphIndex}" findingMap.setdefault(int(paragraphIndex), []).append(findingId) paragraphsHtml = paragraphs_to_html(doc, findingMap) annotatedUrl = await self.OssService.UploadBytes( ObjectKey=OssPathUtils.BuildArtifactKey( Region=region, RunId=RunId, ArtifactType="annotated_docx", Detail=f"{baseName}.annotated.docx", ), Content=annotatedPath.read_bytes(), ContentType="application/vnd.openxmlformats-officedocument.wordprocessingml.document", ) htmlUrl = await self.OssService.UploadText( ObjectKey=OssPathUtils.BuildArtifactKey( Region=region, RunId=RunId, ArtifactType="html_report", Detail=f"{baseName}.report.html", ), Content=htmlReport, ContentType="text/html; charset=utf-8", ) paragraphUrl = await self.OssService.UploadText( ObjectKey=OssPathUtils.BuildArtifactKey( Region=region, RunId=RunId, ArtifactType="paragraph_html", Detail=f"{baseName}.paragraphs.html", ), Content=paragraphsHtml, ContentType="text/html; charset=utf-8", ) return artifactDir, [ self._BuildArtifactRow( artifactType="annotated_docx", fileName=f"{baseName}.annotated.docx", mimeType="application/vnd.openxmlformats-officedocument.wordprocessingml.document", content=annotatedPath.read_bytes(), ossUrl=annotatedUrl, description="批注 DOCX", ), self._BuildArtifactRow( artifactType="html_report", fileName=f"{baseName}.report.html", mimeType="text/html; charset=utf-8", content=htmlReport.encode("utf-8"), ossUrl=htmlUrl, description="HTML 审查报告", ), self._BuildArtifactRow( artifactType="paragraph_html", fileName=f"{baseName}.paragraphs.html", mimeType="text/html; charset=utf-8", content=paragraphsHtml.encode("utf-8"), ossUrl=paragraphUrl, description="段落联动视图", ), ] async def _GetDocumentRegion(self, DocumentId: int) -> str: async with GetAsyncSession() as session: row = ( await session.execute( text( """ SELECT COALESCE(region, 'default') AS region FROM leaudit_documents WHERE id = :document_id LIMIT 1 """ ), {"document_id": DocumentId}, ) ).mappings().first() return str(row["region"] or "default") if row else "default" def _BuildArtifactRow( self, *, artifactType: str, fileName: str, mimeType: str, content: bytes, ossUrl: str, description: str, ) -> dict[str, Any]: fileExt = Path(fileName).suffix.lstrip(".").lower() return { "artifactType": artifactType, "fileName": fileName, "fileExt": fileExt, "mimeType": mimeType, "fileSize": len(content), "sha256": hashlib.sha256(content).hexdigest(), "ossUrl": ossUrl, "storageProvider": "minio", "description": description, }