Files

299 lines
12 KiB
Python

"""Govdoc Bridge — 执行编排器。
负责组织一次完整的 govdoc 审查执行链路:
1. 解析输入文件 (input_resolver)
2. 调用 govdoc_engine.pipeline 执行审查
3. 收集并适配结果 (result_adapter)
4. 持久化结果 (storage_adapter)
"""
from __future__ import annotations
import hashlib
import shutil
import tempfile
from dataclasses import dataclass, field
from pathlib import Path
from typing import Any
from sqlalchemy import text
from fastapi_common.fastapi_common_logger import logger
from fastapi_common.fastapi_common_sqlalchemy.database import GetAsyncSession
from fastapi_common.fastapi_common_storage.oss_path_utils import OssPathUtils
from fastapi_modules.fastapi_leaudit.govdoc_bridge.input_resolver import InputResolver
from fastapi_modules.fastapi_leaudit.govdoc_bridge.storage_adapter import StorageAdapter
from fastapi_modules.fastapi_leaudit.govdoc_engine.parser.docx_parser import parse_docx
from fastapi_modules.fastapi_leaudit.govdoc_engine.reporter.docx_annotator import annotate_docx
from fastapi_modules.fastapi_leaudit.govdoc_engine.reporter.html_paragraph import paragraphs_to_html
from fastapi_modules.fastapi_leaudit.govdoc_engine.reporter.html_renderer import render_html
from fastapi_modules.fastapi_leaudit.services.impl.ossServiceImpl import OssServiceImpl
log = logger
@dataclass
class GovdocRunner:
"""Govdoc 引擎一次完整审查的执行编排器。
当前为 Phase 1 骨架:
- engine 集成点已标注
- 可先跑通 run 创建 → 状态更新 → 结果持久化 的完整生命周期
- 引擎执行部分待 govdoc_engine 迁入后接入
"""
InputResolver: InputResolver = field(default_factory=InputResolver)
Storage: StorageAdapter = field(default_factory=StorageAdapter)
OssService: OssServiceImpl = field(default_factory=OssServiceImpl)
ResultAdapter: Any | None = None
def ResolveRulesPath(self, RulesPath: str | None) -> str:
"""解析并校验执行所需规则文件路径。"""
candidate = (RulesPath or "").strip()
if not candidate:
raise ValueError("未提供 govdoc rules_path,当前任务无法执行规则审查")
path = Path(candidate).expanduser()
if not path.is_absolute():
path = Path.cwd() / path
path = path.resolve()
if not path.is_file():
raise FileNotFoundError(f"govdoc 规则文件不存在: {path}")
return str(path)
async def Execute(
self,
DocumentId: int,
RunId: int,
RulesPath: str | None = None,
TriggerUserId: int | None = None,
Speed: str = "normal",
) -> dict[str, Any]:
"""执行一次完整的 govdoc 审查。
Args:
DocumentId: 文档 ID。
RunId: 已创建的 govdoc_runs.id。
TriggerUserId: 触发人。
Speed: 执行速度 ('normal' / 'urgent')。
Returns:
执行摘要 dict。
"""
resolvedRulesPath = self.ResolveRulesPath(RulesPath)
log.info(f"[Govdoc] Starting execution: runId={RunId}, documentId={DocumentId}")
artifactTempDir: str | None = None
inputPayload = None
try:
# 1. 更新 run 状态 → processing
await self.Storage.UpdateRunStatus(RunId, "processing", Phase="parsing")
await self.Storage.UpdateDocumentStatus(DocumentId, "processing", RunId)
# 2. 解析输入文件
inputPayload = await self.InputResolver.ResolveForDocument(DocumentId)
log.info(f"[Govdoc] Input resolved: {inputPayload.fileName}{inputPayload.localPath}")
# 3. 调用 govdoc_engine 执行审查
from fastapi_modules.fastapi_leaudit.govdoc_bridge.result_adapter import ResultAdapter
from fastapi_modules.fastapi_leaudit.govdoc_engine.pipeline import run as engine_run
if self.ResultAdapter is None:
self.ResultAdapter = ResultAdapter()
engineResult = await engine_run(
file_path=inputPayload.localPath,
rules_path=resolvedRulesPath,
llm_client=None, # 使用默认 LlmClient (从平台配置加载)
)
engineResult.document["filename"] = inputPayload.fileName
# 4. 适配引擎结果
structure = self.ResultAdapter.AdaptStructure(engineResult)
outline = self.ResultAdapter.AdaptOutline(engineResult)
entities = self.ResultAdapter.AdaptEntities(engineResult)
runSummary = self.ResultAdapter.AdaptRunSummary(
engineResult,
Structure=structure,
Outline=outline,
Entities=entities,
)
ruleResults = self.ResultAdapter.AdaptRuleResults(engineResult)
checkedRuleResults = self.ResultAdapter.AdaptCheckedRules(engineResult)
artifactTempDir, artifacts = await self._GenerateArtifacts(
DocumentId=DocumentId,
RunId=RunId,
InputPath=inputPayload.localPath,
InputFileName=inputPayload.fileName,
EngineResult=engineResult,
RuleResults=ruleResults,
)
failedRuleIds = {str(row.get("ruleId") or "") for row in ruleResults}
for checkedRule in checkedRuleResults:
ruleId = str(checkedRule.get("ruleId") or "")
if checkedRule.get("result") == "fail" and ruleId in failedRuleIds:
continue
ruleResults.append(checkedRule)
# 将 rules_path 附带到 runSummary 中,供 GetRuleDetail 后续解析
runSummary["rulesPath"] = resolvedRulesPath
# 5. 持久化结果
await self.Storage.UpdateRunResult(RunId, runSummary)
await self.Storage.SaveRuleResults(RunId, ruleResults)
await self.Storage.SaveArtifacts(RunId, artifacts)
# 6. 更新终态
await self.Storage.UpdateRunStatus(RunId, "completed", Phase="reporting")
await self.Storage.UpdateDocumentStatus(DocumentId, "completed", RunId)
log.info(f"[Govdoc] Execution completed: runId={RunId}")
return {
"runId": RunId,
"documentId": DocumentId,
"status": "completed",
"ruleResultsCount": len(ruleResults),
"structureCount": len(structure),
"outlineCount": len(outline),
"artifactCount": len(artifacts),
}
finally:
if artifactTempDir:
shutil.rmtree(artifactTempDir, ignore_errors=True)
if inputPayload and inputPayload.tempDir:
shutil.rmtree(inputPayload.tempDir, ignore_errors=True)
async def _GenerateArtifacts(
self,
DocumentId: int,
RunId: int,
InputPath: str,
InputFileName: str,
EngineResult: Any,
RuleResults: list[dict[str, Any]],
) -> tuple[str, list[dict[str, Any]]]:
"""生成报告产物并上传到 OSS。"""
artifactDir = tempfile.mkdtemp(prefix=f"govdoc_artifacts_{RunId}_")
sourcePath = Path(InputPath)
baseName = OssPathUtils.BuildSafeFileStem(InputFileName)
region = await self._GetDocumentRegion(DocumentId)
annotatedPath = Path(artifactDir) / f"{baseName}.annotated.docx"
annotate_docx(sourcePath, annotatedPath, EngineResult)
htmlReport = render_html(EngineResult)
doc = parse_docx(sourcePath)
findingMap: dict[int, list[str]] = {}
for row in RuleResults:
if row.get("result") != "fail":
continue
paragraphIndex = row.get("paragraphIndex")
if paragraphIndex is None:
continue
findingId = f"{row.get('ruleId') or 'finding'}-{paragraphIndex}"
findingMap.setdefault(int(paragraphIndex), []).append(findingId)
paragraphsHtml = paragraphs_to_html(doc, findingMap)
annotatedUrl = await self.OssService.UploadBytes(
ObjectKey=OssPathUtils.BuildArtifactKey(
Region=region,
RunId=RunId,
ArtifactType="annotated_docx",
Detail=f"{baseName}.annotated.docx",
),
Content=annotatedPath.read_bytes(),
ContentType="application/vnd.openxmlformats-officedocument.wordprocessingml.document",
)
htmlUrl = await self.OssService.UploadText(
ObjectKey=OssPathUtils.BuildArtifactKey(
Region=region,
RunId=RunId,
ArtifactType="html_report",
Detail=f"{baseName}.report.html",
),
Content=htmlReport,
ContentType="text/html; charset=utf-8",
)
paragraphUrl = await self.OssService.UploadText(
ObjectKey=OssPathUtils.BuildArtifactKey(
Region=region,
RunId=RunId,
ArtifactType="paragraph_html",
Detail=f"{baseName}.paragraphs.html",
),
Content=paragraphsHtml,
ContentType="text/html; charset=utf-8",
)
return artifactDir, [
self._BuildArtifactRow(
artifactType="annotated_docx",
fileName=f"{baseName}.annotated.docx",
mimeType="application/vnd.openxmlformats-officedocument.wordprocessingml.document",
content=annotatedPath.read_bytes(),
ossUrl=annotatedUrl,
description="批注 DOCX",
),
self._BuildArtifactRow(
artifactType="html_report",
fileName=f"{baseName}.report.html",
mimeType="text/html; charset=utf-8",
content=htmlReport.encode("utf-8"),
ossUrl=htmlUrl,
description="HTML 审查报告",
),
self._BuildArtifactRow(
artifactType="paragraph_html",
fileName=f"{baseName}.paragraphs.html",
mimeType="text/html; charset=utf-8",
content=paragraphsHtml.encode("utf-8"),
ossUrl=paragraphUrl,
description="段落联动视图",
),
]
async def _GetDocumentRegion(self, DocumentId: int) -> str:
async with GetAsyncSession() as session:
row = (
await session.execute(
text(
"""
SELECT COALESCE(NULLIF(region, ''), '公共') AS region
FROM leaudit_documents
WHERE id = :document_id
LIMIT 1
"""
),
{"document_id": DocumentId},
)
).mappings().first()
return str(row["region"] or "公共") if row else "公共"
def _BuildArtifactRow(
self,
*,
artifactType: str,
fileName: str,
mimeType: str,
content: bytes,
ossUrl: str,
description: str,
) -> dict[str, Any]:
fileExt = Path(fileName).suffix.lstrip(".").lower()
return {
"artifactType": artifactType,
"fileName": fileName,
"fileExt": fileExt,
"mimeType": mimeType,
"fileSize": len(content),
"sha256": hashlib.sha256(content).hexdigest(),
"ossUrl": ossUrl,
"storageProvider": "minio",
"description": description,
}