feat(govdoc): 新增内部公文模块全链路(后端58+前端11文件)

This commit is contained in:
wren
2026-05-13 14:37:12 +08:00
parent 99699e20e1
commit 5d777599bf
63 changed files with 7608 additions and 0 deletions
@@ -0,0 +1,11 @@
"""Govdoc 执行桥接层 —— govdoc_engine ↔ leaudit-platform 适配器。
本层负责把 govdoc_engine 接入当前平台的基础设施:
- OSS/MinIO 文件下载与上传
- 文档主档 (leaudit_documents / leaudit_document_files) 查询
- GovdocRun / GovdocRuleResult / GovdocReportArtifact 持久化
- Celery 异步任务调度
- 临时文件管理与安全校验
"""
from __future__ import annotations
@@ -0,0 +1,133 @@
"""Govdoc Bridge — 输入文件解析器。
从 leaudit_document_files 中定位输入文件,从 OSS 下载到本地临时路径。
"""
from __future__ import annotations
import hashlib
import os
import tempfile
from dataclasses import dataclass
from pathlib import Path
from fastapi_common.fastapi_common_logger import logger
from fastapi_common.fastapi_common_sqlalchemy.database import GetAsyncSession
from fastapi_common.fastapi_common_storage.oss_client import OssClient
from sqlalchemy import select
from fastapi_modules.fastapi_leaudit.models.leauditDocumentFile import LeauditDocumentFile
log = logger
@dataclass(frozen=True)
class InputPayload:
"""Govdoc 引擎执行所需的输入载荷。"""
fileName: str
fileExt: str
localPath: str
sha256: str | None = None
fileSize: int | None = None
documentFileId: int | None = None
tempDir: str | None = None # 需调用方在任务结束时清理
class InputResolver:
"""解析 govdoc 引擎输入文件。
从 leaudit_document_files 中定位输入文件 (file_role='original')
优先使用本地缓存路径,否则从 OSS 下载到临时目录。
"""
def __init__(self, Oss: OssClient | None = None) -> None:
self.Oss = Oss or OssClient()
async def ResolveForDocument(self, documentId: int) -> InputPayload:
"""为指定文档解析输入文件载荷。
查找该文档最近一次激活的 original 文件记录。
"""
async with GetAsyncSession() as session:
result = await session.execute(
select(LeauditDocumentFile)
.where(
LeauditDocumentFile.documentId == documentId,
LeauditDocumentFile.fileRole == "original",
LeauditDocumentFile.isActive.is_(True),
)
.order_by(LeauditDocumentFile.Id.desc())
.limit(1)
)
fileRow = result.scalar_one_or_none()
if fileRow is None:
raise ValueError(f"未找到文档 {documentId} 的原始文件记录")
return await self.ResolveFromRow(fileRow)
async def ResolveFromRow(self, FileRow: LeauditDocumentFile) -> InputPayload:
"""从文件记录解析输入载荷。"""
# 优先本地路径
if FileRow.localPath:
LocalPath = Path(FileRow.localPath)
if LocalPath.is_file():
return InputPayload(
fileName=FileRow.fileName,
fileExt=FileRow.fileExt or _ext_from_name(FileRow.fileName),
localPath=str(LocalPath),
sha256=FileRow.sha256,
fileSize=FileRow.fileSize,
documentFileId=FileRow.Id,
)
# 否则从 OSS 下载
if FileRow.ossUrl:
return await self._DownloadFromOss(FileRow)
raise ValueError(
f"文件 {FileRow.Id} ({FileRow.fileName}) 既无可用 localPath 也无 ossUrl"
)
async def _DownloadFromOss(self, FileRow: LeauditDocumentFile) -> InputPayload:
"""从 OSS 下载文件到临时目录。"""
try:
content = self.Oss.DownloadBytes(FileRow.ossUrl)
except Exception as e:
log.error(f"从 OSS 下载文件失败: url={FileRow.ossUrl}, error={e}")
raise
tempDir = tempfile.mkdtemp(prefix="govdoc_input_")
ext = FileRow.fileExt or _ext_from_name(FileRow.fileName)
safeName = f"input_{FileRow.Id}{ext}"
localPath = os.path.join(tempDir, safeName)
with open(localPath, "wb") as f:
f.write(content)
computedSha = hashlib.sha256(content).hexdigest()
if FileRow.sha256 and computedSha != FileRow.sha256:
log.warning(
f"文件 SHA256 不匹配: expected={FileRow.sha256}, computed={computedSha}"
)
log.info(
f"从 OSS 下载文件: {FileRow.fileName}{localPath} ({len(content)} bytes)"
)
return InputPayload(
fileName=FileRow.fileName,
fileExt=ext,
localPath=localPath,
sha256=computedSha,
fileSize=len(content),
documentFileId=FileRow.Id,
tempDir=tempDir,
)
def _ext_from_name(fileName: str) -> str:
"""从文件名提取扩展名。"""
_, ext = os.path.splitext(fileName)
return ext if ext else ".docx"
@@ -0,0 +1,112 @@
"""Govdoc Bridge — 结果适配器。
将 govdoc_engine 原始结果对象 (AuditResult / Finding / SemanticEntity)
映射为 ORM 模型字段和前端 VO 字典。
"""
from __future__ import annotations
from typing import Any
from fastapi_modules.fastapi_leaudit.govdoc_engine.engine.result import AuditResult
class ResultAdapter:
"""Govdoc 引擎结果 → 平台数据模型适配器。
负责将 govdoc_engine 的原始执行结果转换为:
- GovdocRun 状态字段更新
- GovdocRuleResult 列表
- GovdocReportArtifact 清单
- 前端 VO 字典
"""
def AdaptRunSummary(self, EngineResult: AuditResult) -> dict[str, Any]:
"""从 AuditResult.summary 提取 run 汇总字段。"""
s = EngineResult.summary
return {
"totalScore": s.score,
"passedCount": s.passed_count,
"failedCount": s.failed_count,
"skippedCount": s.skipped_count,
"resultStatus": "pass" if s.failed_count == 0 else "fail" if s.passed_count == 0 else "partial",
"resultSummaryJson": None, # 可为后续扩展预留
}
def AdaptRuleResults(self, EngineResult: AuditResult) -> list[dict[str, Any]]:
"""从 AuditResult.findings 提取规则执行明细列表。"""
results: list[dict[str, Any]] = []
for f in EngineResult.findings:
results.append({
"ruleId": f.rule_id,
"ruleName": f.rule_name,
"severity": f.severity,
"category": f.category,
"message": f.message,
"suggestion": f.suggestion,
"actual": f.actual,
"expected": f.expected,
"evidence": f.evidence,
"paragraphIndex": f.location.paragraph_index if f.location else None,
"paragraphText": f.location.context if f.location else None,
"locationPath": f.location.role if f.location else None,
"result": "fail",
"score": None,
})
return results
def AdaptCheckedRules(self, EngineResult: AuditResult) -> list[dict[str, Any]]:
"""从 AuditResult.checked_rules 提取规则检查状态列表。"""
results: list[dict[str, Any]] = []
for cr in EngineResult.checked_rules:
results.append({
"ruleId": cr.rule_id,
"ruleName": cr.name,
"severity": cr.severity,
"category": cr.category,
"result": cr.status, # pass/fail/skipped
"skipReason": cr.skip_reason,
"score": None,
})
return results
def AdaptEntities(self, EngineResult: AuditResult) -> list[dict[str, Any]]:
"""从 AuditResult.entities 提取实体识别结果。"""
entities: list[dict[str, Any]] = []
for name, entity in EngineResult.entities.items():
if entity is None:
continue
entities.append({
"name": entity.name,
"text": entity.text,
"paragraphIndices": entity.paragraph_indices,
"primaryRole": entity.primary_role,
"source": entity.source,
"confidence": entity.confidence,
})
return entities
def AdaptArtifacts(self, _EngineResult: AuditResult, _RunId: int) -> list[dict[str, Any]]:
"""从引擎结果提取报告产物清单。
报告文件由 reporter 模块生成后上传 OSS。
当前返回空列表,待 report_adapter 实现后补齐。
"""
return []
def BuildDetailVO(
self,
Document: Any,
Run: Any,
RuleResults: list[dict[str, Any]],
Entities: list[dict[str, Any]],
Artifacts: list[dict[str, Any]],
) -> dict[str, Any]:
"""构建前端详情页 VO。"""
return {
"document": Document,
"run": Run,
"findings": RuleResults,
"entities": Entities,
"reports": Artifacts,
}
@@ -0,0 +1,99 @@
"""Govdoc Bridge — 执行编排器。
负责组织一次完整的 govdoc 审查执行链路:
1. 解析输入文件 (input_resolver)
2. 调用 govdoc_engine.pipeline 执行审查
3. 收集并适配结果 (result_adapter)
4. 持久化结果 (storage_adapter)
"""
from __future__ import annotations
from dataclasses import dataclass, field
from typing import Any
from fastapi_common.fastapi_common_logger import logger
from fastapi_modules.fastapi_leaudit.govdoc_bridge.input_resolver import InputResolver
from fastapi_modules.fastapi_leaudit.govdoc_bridge.result_adapter import ResultAdapter
from fastapi_modules.fastapi_leaudit.govdoc_bridge.storage_adapter import StorageAdapter
log = logger
@dataclass
class GovdocRunner:
"""Govdoc 引擎一次完整审查的执行编排器。
当前为 Phase 1 骨架:
- engine 集成点已标注
- 可先跑通 run 创建 → 状态更新 → 结果持久化 的完整生命周期
- 引擎执行部分待 govdoc_engine 迁入后接入
"""
InputResolver: InputResolver = field(default_factory=InputResolver)
Storage: StorageAdapter = field(default_factory=StorageAdapter)
ResultAdapter: ResultAdapter = field(default_factory=ResultAdapter)
async def Execute(
self,
DocumentId: int,
RunId: int,
RulesPath: str,
TriggerUserId: int | None = None,
Speed: str = "normal",
) -> dict[str, Any]:
"""执行一次完整的 govdoc 审查。
Args:
DocumentId: 文档 ID。
RunId: 已创建的 govdoc_runs.id。
TriggerUserId: 触发人。
Speed: 执行速度 ('normal' / 'urgent')。
Returns:
执行摘要 dict。
"""
log.info(f"[Govdoc] Starting execution: runId={RunId}, documentId={DocumentId}")
# 1. 更新 run 状态 → processing
await self.Storage.UpdateRunStatus(RunId, "processing", phase="parsing")
await self.Storage.UpdateDocumentStatus(DocumentId, "processing", RunId)
# 2. 解析输入文件
inputPayload = await self.InputResolver.ResolveForDocument(DocumentId)
log.info(f"[Govdoc] Input resolved: {inputPayload.fileName}{inputPayload.localPath}")
# 3. 调用 govdoc_engine 执行审查
from fastapi_modules.fastapi_leaudit.govdoc_engine.pipeline import run as engine_run
engineResult = await engine_run(
file_path=inputPayload.localPath,
rules_path=RulesPath,
llm_client=None, # 使用默认 LlmClient (从平台配置加载)
)
# 4. 适配引擎结果
runSummary = self.ResultAdapter.AdaptRunSummary(engineResult)
ruleResults = self.ResultAdapter.AdaptRuleResults(engineResult)
entities = self.ResultAdapter.AdaptEntities(engineResult)
artifacts = self.ResultAdapter.AdaptArtifacts(engineResult, RunId)
# 5. 持久化结果
await self.Storage.UpdateRunResult(RunId, runSummary)
await self.Storage.SaveRuleResults(RunId, ruleResults)
await self.Storage.SaveArtifacts(RunId, artifacts)
# 6. 更新终态
await self.Storage.UpdateRunStatus(RunId, "completed", phase="reporting")
await self.Storage.UpdateDocumentStatus(DocumentId, "completed", RunId)
log.info(f"[Govdoc] Execution completed: runId={RunId}")
return {
"runId": RunId,
"documentId": DocumentId,
"status": "completed",
"ruleResultsCount": len(ruleResults),
"artifactCount": len(artifacts),
}
@@ -0,0 +1,221 @@
"""Govdoc Bridge — 存储适配器。
将 govdoc_engine 执行结果写入 govdoc_runs / govdoc_rule_results /
govdoc_report_artifacts 表,并更新 leaudit_documents 状态。
"""
from __future__ import annotations
from datetime import datetime, timezone
from typing import Any
from fastapi_common.fastapi_common_logger import logger
from fastapi_common.fastapi_common_sqlalchemy.database import GetAsyncSession
from sqlalchemy import text
log = logger
class StorageAdapter:
"""Govdoc 结果持久化适配器。
将 bridge 层产生的结构化结果写入 PostgreSQL。
使用原生 SQL (text) 以保持与现有 leaudit_bridge/storage_adapter 风格一致。
"""
# ── Run 状态 ─────────────────────────────────────────
async def CreateRun(self, RunData: dict[str, Any]) -> int:
"""创建 govdoc_runs 记录,返回 run_id。"""
async with GetAsyncSession() as session:
result = await session.execute(
text(
"""INSERT INTO govdoc_runs
(document_id, document_file_id, run_no, trigger_source,
trigger_user_id, status, phase, created_at, updated_at)
VALUES (:document_id, :document_file_id, :run_no, :trigger_source,
:trigger_user_id, 'pending', 'parsing', now(), now())
RETURNING id"""
),
{
"document_id": RunData["documentId"],
"document_file_id": RunData.get("documentFileId"),
"run_no": RunData.get("runNo", 1),
"trigger_source": RunData.get("triggerSource", "manual"),
"trigger_user_id": RunData.get("triggerUserId"),
},
)
row = result.fetchone()
run_id = row[0] if row else 0
await session.commit()
log.info(f"[Govdoc] Run created: runId={run_id}, documentId={RunData['documentId']}")
return run_id
async def UpdateRunStatus(self, RunId: int, Status: str, Phase: str | None = None, **Extra: Any) -> None:
"""更新 run 状态和阶段。"""
set_clauses = ["status = :status", "updated_at = now()"]
params: dict[str, Any] = {"rid": RunId, "status": Status}
if Phase is not None:
set_clauses.append("phase = :phase")
params["phase"] = Phase
if Status == "completed" or Status == "failed":
set_clauses.append("finished_at = :finished_at")
params["finished_at"] = datetime.now(timezone.utc)
async with GetAsyncSession() as session:
await session.execute(
text(f"UPDATE govdoc_runs SET {', '.join(set_clauses)} WHERE id = :rid"),
params,
)
await session.commit()
log.info(f"[Govdoc] Run status updated: runId={RunId}, status={Status}")
async def UpdateRunResult(self, RunId: int, Summary: dict[str, Any]) -> None:
"""写入 run 结果汇总字段。"""
async with GetAsyncSession() as session:
await session.execute(
text(
"""UPDATE govdoc_runs SET
total_score = :total_score,
passed_count = :passed_count,
failed_count = :failed_count,
skipped_count = :skipped_count,
result_status = :result_status,
result_summary_json = :result_summary_json,
updated_at = now()
WHERE id = :rid"""
),
{
"rid": RunId,
"total_score": Summary.get("totalScore"),
"passed_count": Summary.get("passedCount", 0),
"failed_count": Summary.get("failedCount", 0),
"skipped_count": Summary.get("skippedCount", 0),
"result_status": Summary.get("resultStatus"),
"result_summary_json": Summary.get("resultSummaryJson"),
},
)
await session.commit()
log.info(f"[Govdoc] Run result saved: runId={RunId}")
async def UpdateRunError(self, RunId: int, ErrorMessage: str) -> None:
"""记录运行失败的错误信息。"""
async with GetAsyncSession() as session:
await session.execute(
text(
"""UPDATE govdoc_runs SET
status = 'failed',
error_message = :error_message,
finished_at = now(),
updated_at = now()
WHERE id = :rid"""
),
{"rid": RunId, "error_message": ErrorMessage},
)
await session.commit()
log.error(f"[Govdoc] Run failed: runId={RunId}, error={ErrorMessage[:200]}")
# ── 规则结果 ─────────────────────────────────────────
async def SaveRuleResults(self, RunId: int, Results: list[dict[str, Any]]) -> None:
"""批量写入 govdoc_rule_results。"""
if not Results:
return
async with GetAsyncSession() as session:
for row in Results:
await session.execute(
text(
"""INSERT INTO govdoc_rule_results
(run_id, rule_id, rule_name, severity, category,
message, suggestion, actual, expected, evidence,
paragraph_index, paragraph_text, location_path,
result, score, created_at, updated_at)
VALUES (:run_id, :rule_id, :rule_name, :severity, :category,
:message, :suggestion, :actual, :expected, :evidence,
:paragraph_index, :paragraph_text, :location_path,
:result, :score, now(), now())"""
),
{
"run_id": RunId,
"rule_id": row.get("ruleId"),
"rule_name": row.get("ruleName"),
"severity": row.get("severity"),
"category": row.get("category"),
"message": row.get("message"),
"suggestion": row.get("suggestion"),
"actual": row.get("actual"),
"expected": row.get("expected"),
"evidence": row.get("evidence"),
"paragraph_index": row.get("paragraphIndex"),
"paragraph_text": row.get("paragraphText"),
"location_path": row.get("locationPath"),
"result": row.get("result", "pass"),
"score": row.get("score"),
},
)
await session.commit()
log.info(f"[Govdoc] Rule results saved: runId={RunId}, count={len(Results)}")
# ── 报告产物 ─────────────────────────────────────────
async def SaveArtifacts(self, RunId: int, Artifacts: list[dict[str, Any]]) -> None:
"""批量写入 govdoc_report_artifacts。"""
if not Artifacts:
return
async with GetAsyncSession() as session:
for row in Artifacts:
await session.execute(
text(
"""INSERT INTO govdoc_report_artifacts
(run_id, artifact_type, file_name, file_ext, mime_type,
file_size, sha256, oss_url, storage_provider, description,
created_at, updated_at)
VALUES (:run_id, :artifact_type, :file_name, :file_ext, :mime_type,
:file_size, :sha256, :oss_url, :storage_provider, :description,
now(), now())"""
),
{
"run_id": RunId,
"artifact_type": row.get("artifactType"),
"file_name": row.get("fileName"),
"file_ext": row.get("fileExt"),
"mime_type": row.get("mimeType"),
"file_size": row.get("fileSize"),
"sha256": row.get("sha256"),
"oss_url": row.get("ossUrl"),
"storage_provider": row.get("storageProvider"),
"description": row.get("description"),
},
)
await session.commit()
log.info(f"[Govdoc] Artifacts saved: runId={RunId}, count={len(Artifacts)}")
# ── 文档状态 ─────────────────────────────────────────
async def UpdateDocumentStatus(self, DocumentId: int, ProcessingStatus: str, RunId: int | None = None) -> None:
"""更新 leaudit_documents 的处理状态和当前 run_id。"""
async with GetAsyncSession() as session:
if RunId is not None:
await session.execute(
text(
"""UPDATE leaudit_documents SET
processing_status = :s, current_run_id = :rid, updated_at = now()
WHERE id = :did"""
),
{"s": ProcessingStatus, "rid": RunId, "did": DocumentId},
)
else:
await session.execute(
text(
"""UPDATE leaudit_documents SET
processing_status = :s, updated_at = now()
WHERE id = :did"""
),
{"s": ProcessingStatus, "did": DocumentId},
)
await session.commit()
log.info(f"[Govdoc] Document status updated: documentId={DocumentId}, status={ProcessingStatus}")
@@ -0,0 +1,115 @@
"""Govdoc Bridge — Celery 任务入口。
将 govdoc 审查执行投递到 Celery worker 队列。
"""
from __future__ import annotations
import asyncio
from typing import Any
from fastapi_common.fastapi_common_logger import logger
from fastapi_admin.celery_app import celery_app
from fastapi_modules.fastapi_leaudit.govdoc_bridge.runner import GovdocRunner
from fastapi_modules.fastapi_leaudit.govdoc_bridge.storage_adapter import StorageAdapter
log = logger
GOVDOC_WORKER_QUEUE = "govdoc"
GOVDOC_WORKER_QUEUE_URGENT = "govdoc_urgent"
def resolve_govdoc_queue(speed: str = "normal") -> str:
"""根据优先级返回对应的 worker 队列名。"""
if (speed or "").strip().lower() in {"urgent", "high", "fast", "紧急"}:
return GOVDOC_WORKER_QUEUE_URGENT
return GOVDOC_WORKER_QUEUE
def dispatch_govdoc_task(
documentId: int,
runId: int,
triggerUserId: int | None = None,
speed: str = "normal",
) -> Any:
"""投递 govdoc 审查任务到 Celery 队列。
Args:
documentId: 文档 ID。
runId: 已创建的 govdoc_runs.id。
triggerUserId: 触发人。
speed: 优先级 ('normal' / 'urgent')。
Returns:
Celery AsyncResult。
"""
queue = resolve_govdoc_queue(speed)
log.info(
f"[Govdoc] Dispatching task: runId={runId}, documentId={documentId}, queue={queue}"
)
return govdoc_execute_task.apply_async(
kwargs={
"documentId": documentId,
"runId": runId,
"triggerUserId": triggerUserId,
"speed": speed,
},
queue=queue,
)
@celery_app.task(
bind=True,
name="govdoc_execute_task",
max_retries=0,
default_retry_delay=60,
acks_late=True,
reject_on_worker_lost=True,
task_time_limit=30 * 60, # 单次执行 30 分钟超时
task_soft_time_limit=25 * 60,
)
def govdoc_execute_task(
self,
documentId: int,
runId: int,
triggerUserId: int | None = None,
speed: str = "normal",
) -> dict[str, Any]:
"""Celery 任务:执行一次 govdoc 公文格式审查。
此任务由 dispatch_govdoc_task 投递,worker 消费后执行完整审查链路。
"""
taskId = self.request.id or "unknown"
log.info(f"[Govdoc] Task started: taskId={taskId}, runId={runId}, documentId={documentId}")
storage = StorageAdapter()
loop = asyncio.new_event_loop()
asyncio.set_event_loop(loop)
try:
# 更新 run 状态 → running
loop.run_until_complete(storage.UpdateRunStatus(runId, "processing", phase="parsing"))
# 执行完整审查链路
runner = GovdocRunner()
result = loop.run_until_complete(
runner.Execute(
DocumentId=documentId,
RunId=runId,
TriggerUserId=triggerUserId,
Speed=speed,
)
)
log.info(f"[Govdoc] Task completed: taskId={taskId}, runId={runId}")
return result
except Exception as exc:
errorMessage = str(exc)[:2000]
log.exception(f"[Govdoc] Task failed: taskId={taskId}, runId={runId}, error={errorMessage[:200]}")
loop.run_until_complete(storage.UpdateRunError(runId, errorMessage))
loop.run_until_complete(storage.UpdateDocumentStatus(documentId, "failed", runId))
raise
finally:
loop.close()