feat: integrate govdoc module into leaudit platform

This commit is contained in:
wren
2026-05-17 19:24:16 +08:00
parent cb13e61d3d
commit a73826dc1d
16 changed files with 2334 additions and 280 deletions
@@ -31,7 +31,7 @@ class GovdocController(BaseController):
file: UploadFile = File(...),
typeId: int | None = Form(default=None),
region: str = Form(default="default"),
autoRun: bool = Form(default=False),
autoRun: bool = Form(default=True),
speed: str = Form(default="normal"),
ruleVersionId: int | None = Form(default=None),
payload: dict[str, Any] = Depends(verify_access_token),
@@ -56,6 +56,7 @@ class GovdocController(BaseController):
page: int = Query(default=1, ge=1),
pageSize: int = Query(default=20, ge=1, le=100),
keyword: str | None = Query(default=None),
fileExt: str | None = Query(default=None),
region: str | None = Query(default=None),
status: str | None = Query(default=None),
resultStatus: str | None = Query(default=None),
@@ -72,6 +73,7 @@ class GovdocController(BaseController):
page=page,
pageSize=pageSize,
keyword=keyword,
fileExt=fileExt,
region=region,
status=status,
resultStatus=resultStatus,
@@ -26,6 +26,7 @@ class ResultAdapter:
EngineResult: AuditResult,
Structure: list[dict[str, Any]] | None = None,
Outline: list[dict[str, Any]] | None = None,
Entities: list[dict[str, Any]] | None = None,
) -> dict[str, Any]:
"""从 AuditResult.summary 提取 run 汇总字段。
@@ -40,6 +41,10 @@ class ResultAdapter:
aux["structure"] = Structure
if Outline is not None:
aux["outline"] = Outline
if Entities is not None:
aux["entities"] = {
entity["name"]: entity for entity in Entities if entity.get("name")
}
return {
"totalScore": s.score,
@@ -100,6 +105,7 @@ class ResultAdapter:
"primaryRole": entity.primary_role,
"source": entity.source,
"confidence": entity.confidence,
"extra": entity.extra,
})
return entities
@@ -9,14 +9,26 @@
from __future__ import annotations
import hashlib
import shutil
import tempfile
from dataclasses import dataclass, field
from pathlib import Path
from typing import Any
from sqlalchemy import text
from fastapi_common.fastapi_common_logger import logger
from fastapi_common.fastapi_common_sqlalchemy.database import GetAsyncSession
from fastapi_common.fastapi_common_storage.oss_path_utils import OssPathUtils
from fastapi_modules.fastapi_leaudit.govdoc_bridge.input_resolver import InputResolver
from fastapi_modules.fastapi_leaudit.govdoc_bridge.result_adapter import ResultAdapter
from fastapi_modules.fastapi_leaudit.govdoc_bridge.storage_adapter import StorageAdapter
from fastapi_modules.fastapi_leaudit.govdoc_engine.parser.docx_parser import parse_docx
from fastapi_modules.fastapi_leaudit.govdoc_engine.reporter.docx_annotator import annotate_docx
from fastapi_modules.fastapi_leaudit.govdoc_engine.reporter.html_paragraph import paragraphs_to_html
from fastapi_modules.fastapi_leaudit.govdoc_engine.reporter.html_renderer import render_html
from fastapi_modules.fastapi_leaudit.services.impl.ossServiceImpl import OssServiceImpl
log = logger
@@ -33,13 +45,30 @@ class GovdocRunner:
InputResolver: InputResolver = field(default_factory=InputResolver)
Storage: StorageAdapter = field(default_factory=StorageAdapter)
ResultAdapter: ResultAdapter = field(default_factory=ResultAdapter)
OssService: OssServiceImpl = field(default_factory=OssServiceImpl)
ResultAdapter: Any | None = None
def ResolveRulesPath(self, RulesPath: str | None) -> str:
    """Resolve and validate the rules file path required for execution.

    Args:
        RulesPath: Path to the govdoc rules file. Surrounding whitespace is
            ignored, ``~`` is expanded, and a relative path is anchored at
            the current working directory.

    Returns:
        The fully resolved absolute path as a string.

    Raises:
        ValueError: If no path (or only whitespace) was supplied.
        FileNotFoundError: If the resolved path is not an existing file.
    """
    rawPath = RulesPath.strip() if RulesPath else ""
    if rawPath == "":
        raise ValueError("未提供 govdoc rules_path,当前任务无法执行规则审查")

    expanded = Path(rawPath).expanduser()
    anchored = expanded if expanded.is_absolute() else Path.cwd() / expanded
    resolved = anchored.resolve()

    if resolved.is_file():
        return str(resolved)
    raise FileNotFoundError(f"govdoc 规则文件不存在: {resolved}")
async def Execute(
self,
DocumentId: int,
RunId: int,
RulesPath: str,
RulesPath: str | None = None,
TriggerUserId: int | None = None,
Speed: str = "normal",
) -> dict[str, Any]:
@@ -54,57 +83,216 @@ class GovdocRunner:
Returns:
执行摘要 dict。
"""
resolvedRulesPath = self.ResolveRulesPath(RulesPath)
log.info(f"[Govdoc] Starting execution: runId={RunId}, documentId={DocumentId}")
artifactTempDir: str | None = None
inputPayload = None
try:
# 1. 更新 run 状态 → processing
await self.Storage.UpdateRunStatus(RunId, "processing", Phase="parsing")
await self.Storage.UpdateDocumentStatus(DocumentId, "processing", RunId)
# 1. 更新 run 状态 → processing
await self.Storage.UpdateRunStatus(RunId, "processing", phase="parsing")
await self.Storage.UpdateDocumentStatus(DocumentId, "processing", RunId)
# 2. 解析输入文件
inputPayload = await self.InputResolver.ResolveForDocument(DocumentId)
log.info(f"[Govdoc] Input resolved: {inputPayload.fileName}{inputPayload.localPath}")
# 2. 解析输入文件
inputPayload = await self.InputResolver.ResolveForDocument(DocumentId)
log.info(f"[Govdoc] Input resolved: {inputPayload.fileName}{inputPayload.localPath}")
# 3. 调用 govdoc_engine 执行审查
from fastapi_modules.fastapi_leaudit.govdoc_bridge.result_adapter import ResultAdapter
from fastapi_modules.fastapi_leaudit.govdoc_engine.pipeline import run as engine_run
# 3. 调用 govdoc_engine 执行审查
from fastapi_modules.fastapi_leaudit.govdoc_engine.pipeline import run as engine_run
if self.ResultAdapter is None:
self.ResultAdapter = ResultAdapter()
engineResult = await engine_run(
file_path=inputPayload.localPath,
rules_path=RulesPath,
llm_client=None, # 使用默认 LlmClient (从平台配置加载)
engineResult = await engine_run(
file_path=inputPayload.localPath,
rules_path=resolvedRulesPath,
llm_client=None, # 使用默认 LlmClient (从平台配置加载)
)
engineResult.document["filename"] = inputPayload.fileName
# 4. 适配引擎结果
structure = self.ResultAdapter.AdaptStructure(engineResult)
outline = self.ResultAdapter.AdaptOutline(engineResult)
entities = self.ResultAdapter.AdaptEntities(engineResult)
runSummary = self.ResultAdapter.AdaptRunSummary(
engineResult,
Structure=structure,
Outline=outline,
Entities=entities,
)
ruleResults = self.ResultAdapter.AdaptRuleResults(engineResult)
checkedRuleResults = self.ResultAdapter.AdaptCheckedRules(engineResult)
artifactTempDir, artifacts = await self._GenerateArtifacts(
DocumentId=DocumentId,
RunId=RunId,
InputPath=inputPayload.localPath,
InputFileName=inputPayload.fileName,
EngineResult=engineResult,
RuleResults=ruleResults,
)
failedRuleIds = {str(row.get("ruleId") or "") for row in ruleResults}
for checkedRule in checkedRuleResults:
ruleId = str(checkedRule.get("ruleId") or "")
if checkedRule.get("result") == "fail" and ruleId in failedRuleIds:
continue
ruleResults.append(checkedRule)
# 将 rules_path 附带到 runSummary 中,供 GetRuleDetail 后续解析
runSummary["rulesPath"] = resolvedRulesPath
# 5. 持久化结果
await self.Storage.UpdateRunResult(RunId, runSummary)
await self.Storage.SaveRuleResults(RunId, ruleResults)
await self.Storage.SaveArtifacts(RunId, artifacts)
# 6. 更新终态
await self.Storage.UpdateRunStatus(RunId, "completed", Phase="reporting")
await self.Storage.UpdateDocumentStatus(DocumentId, "completed", RunId)
log.info(f"[Govdoc] Execution completed: runId={RunId}")
return {
"runId": RunId,
"documentId": DocumentId,
"status": "completed",
"ruleResultsCount": len(ruleResults),
"structureCount": len(structure),
"outlineCount": len(outline),
"artifactCount": len(artifacts),
}
finally:
if artifactTempDir:
shutil.rmtree(artifactTempDir, ignore_errors=True)
if inputPayload and inputPayload.tempDir:
shutil.rmtree(inputPayload.tempDir, ignore_errors=True)
async def _GenerateArtifacts(
    self,
    DocumentId: int,
    RunId: int,
    InputPath: str,
    InputFileName: str,
    EngineResult: Any,
    RuleResults: list[dict[str, Any]],
) -> tuple[str, list[dict[str, Any]]]:
    """Render the report artifacts for a run and upload them to OSS.

    Three artifacts are produced: an annotated DOCX, an HTML audit report
    and a per-paragraph HTML view in which failed findings are attached to
    their paragraph index.

    Args:
        DocumentId: Document whose region is used to build the object keys.
        RunId: Run the artifacts belong to.
        InputPath: Local path of the source DOCX.
        InputFileName: Original file name; its sanitized stem names the artifacts.
        EngineResult: Result object handed to the annotator and HTML renderer.
        RuleResults: Adapted rule rows; only rows with ``result == "fail"``
            and a ``paragraphIndex`` contribute to the paragraph view.

    Returns:
        ``(artifactDir, rows)`` where ``artifactDir`` is the temporary working
        directory (the caller removes it) and ``rows`` are the artifact
        records built by ``_BuildArtifactRow``.

    Raises:
        Any exception from rendering or uploading is re-raised after the
        temporary directory has been removed.
    """
    docxMime = "application/vnd.openxmlformats-officedocument.wordprocessingml.document"
    htmlMime = "text/html; charset=utf-8"

    artifactDir = tempfile.mkdtemp(prefix=f"govdoc_artifacts_{RunId}_")
    try:
        sourcePath = Path(InputPath)
        baseName = OssPathUtils.BuildSafeFileStem(InputFileName)
        region = await self._GetDocumentRegion(DocumentId)

        annotatedName = f"{baseName}.annotated.docx"
        reportName = f"{baseName}.report.html"
        paragraphsName = f"{baseName}.paragraphs.html"

        annotatedPath = Path(artifactDir) / annotatedName
        annotate_docx(sourcePath, annotatedPath, EngineResult)
        htmlReport = render_html(EngineResult)

        doc = parse_docx(sourcePath)
        # paragraph index -> ids of the failed findings located on that paragraph
        findingMap: dict[int, list[str]] = {}
        for row in RuleResults:
            if row.get("result") != "fail":
                continue
            paragraphIndex = row.get("paragraphIndex")
            if paragraphIndex is None:
                continue
            findingId = f"{row.get('ruleId') or 'finding'}-{paragraphIndex}"
            findingMap.setdefault(int(paragraphIndex), []).append(findingId)
        paragraphsHtml = paragraphs_to_html(doc, findingMap)

        # Read the annotated file once; the same bytes feed the upload and the
        # size/hash metadata of the artifact row.
        annotatedBytes = annotatedPath.read_bytes()

        annotatedUrl = await self.OssService.UploadBytes(
            ObjectKey=OssPathUtils.BuildArtifactKey(
                Region=region,
                RunId=RunId,
                ArtifactType="annotated_docx",
                Detail=annotatedName,
            ),
            Content=annotatedBytes,
            ContentType=docxMime,
        )
        htmlUrl = await self.OssService.UploadText(
            ObjectKey=OssPathUtils.BuildArtifactKey(
                Region=region,
                RunId=RunId,
                ArtifactType="html_report",
                Detail=reportName,
            ),
            Content=htmlReport,
            ContentType=htmlMime,
        )
        paragraphUrl = await self.OssService.UploadText(
            ObjectKey=OssPathUtils.BuildArtifactKey(
                Region=region,
                RunId=RunId,
                ArtifactType="paragraph_html",
                Detail=paragraphsName,
            ),
            Content=paragraphsHtml,
            ContentType=htmlMime,
        )

        return artifactDir, [
            self._BuildArtifactRow(
                artifactType="annotated_docx",
                fileName=annotatedName,
                mimeType=docxMime,
                content=annotatedBytes,
                ossUrl=annotatedUrl,
                description="批注 DOCX",
            ),
            self._BuildArtifactRow(
                artifactType="html_report",
                fileName=reportName,
                mimeType=htmlMime,
                content=htmlReport.encode("utf-8"),
                ossUrl=htmlUrl,
                description="HTML 审查报告",
            ),
            self._BuildArtifactRow(
                artifactType="paragraph_html",
                fileName=paragraphsName,
                mimeType=htmlMime,
                content=paragraphsHtml.encode("utf-8"),
                ossUrl=paragraphUrl,
                description="段落联动视图",
            ),
        ]
    except BaseException:
        # The caller only learns the directory path from the return value, so
        # on any failure (including task cancellation) it cannot clean up;
        # remove the directory here before propagating.
        shutil.rmtree(artifactDir, ignore_errors=True)
        raise
# 将 rules_path 附带到 runSummary 中,供 GetRuleDetail 后续解析
runSummary["rulesPath"] = RulesPath
# 5. 持久化结果
await self.Storage.UpdateRunResult(RunId, runSummary)
await self.Storage.SaveRuleResults(RunId, ruleResults)
await self.Storage.SaveArtifacts(RunId, artifacts)
# 6. 更新终态
await self.Storage.UpdateRunStatus(RunId, "completed", phase="reporting")
await self.Storage.UpdateDocumentStatus(DocumentId, "completed", RunId)
log.info(f"[Govdoc] Execution completed: runId={RunId}")
async def _GetDocumentRegion(self, DocumentId: int) -> str:
    """Look up the region stored for a document.

    Args:
        DocumentId: Primary key in ``leaudit_documents``.

    Returns:
        The document's region, or ``"default"`` when the row does not exist
        or its region is NULL/empty.
    """
    async with GetAsyncSession() as session:
        regionQuery = text(
            """
SELECT COALESCE(region, 'default') AS region
FROM leaudit_documents
WHERE id = :document_id
LIMIT 1
"""
        )
        queryResult = await session.execute(regionQuery, {"document_id": DocumentId})
        record = queryResult.mappings().first()
    if not record:
        return "default"
    return str(record["region"] or "default")
def _BuildArtifactRow(
self,
*,
artifactType: str,
fileName: str,
mimeType: str,
content: bytes,
ossUrl: str,
description: str,
) -> dict[str, Any]:
fileExt = Path(fileName).suffix.lstrip(".").lower()
return {
"runId": RunId,
"documentId": DocumentId,
"status": "completed",
"ruleResultsCount": len(ruleResults),
"structureCount": len(structure),
"outlineCount": len(outline),
"artifactCount": len(artifacts),
"artifactType": artifactType,
"fileName": fileName,
"fileExt": fileExt,
"mimeType": mimeType,
"fileSize": len(content),
"sha256": hashlib.sha256(content).hexdigest(),
"ossUrl": ossUrl,
"storageProvider": "minio",
"description": description,
}
@@ -6,6 +6,7 @@ govdoc_report_artifacts 表,并更新 leaudit_documents 状态。
from __future__ import annotations
import json
from datetime import datetime, timezone
from typing import Any
@@ -16,6 +17,33 @@ from sqlalchemy import text
log = logger
# For each run-result SQL parameter, the summary-dict keys to try, in order.
# Passed to _pick_value so a summary may use either camelCase or snake_case
# names (and "score" as an alternative to "totalScore").
_RUN_RESULT_FALLBACK_KEYS = {
"total_score": ["totalScore", "score"],
"passed_count": ["passedCount", "passed_count"],
"failed_count": ["failedCount", "failed_count"],
"skipped_count": ["skippedCount", "skipped_count"],
"result_status": ["resultStatus", "result_status"],
"result_summary_json": ["resultSummaryJson", "result_summary_json"],
}
def _pick_value(payload: dict[str, Any], *keys: str) -> Any:
for key in keys:
if key in payload:
return payload[key]
return None
def _to_text_payload(value: Any) -> str | None:
if value is None:
return None
if isinstance(value, str):
return value
if isinstance(value, (dict, list, tuple, bool, int, float)):
return json.dumps(value, ensure_ascii=False)
return str(value)
class StorageAdapter:
"""Govdoc 结果持久化适配器。
@@ -51,11 +79,23 @@ class StorageAdapter:
log.info(f"[Govdoc] Run created: runId={run_id}, documentId={RunData['documentId']}")
return run_id
async def UpdateRunStatus(self, RunId: int, Status: str, Phase: str | None = None, **Extra: Any) -> None:
"""更新 run 状态和阶段。"""
async def UpdateRunStatus(
self,
RunId: int,
Status: str,
Phase: str | None = None,
**Extra: Any,
) -> None:
"""更新 run 状态和阶段。
兼容调用方传 ``phase=`` 或 ``Phase=``,避免因大小写不一致导致阶段不落库。
"""
set_clauses = ["status = :status", "updated_at = now()"]
params: dict[str, Any] = {"rid": RunId, "status": Status}
if Phase is None:
Phase = _pick_value(Extra, "phase", "Phase")
if Phase is not None:
set_clauses.append("phase = :phase")
params["phase"] = Phase
@@ -63,6 +103,9 @@ class StorageAdapter:
if Status == "completed" or Status == "failed":
set_clauses.append("finished_at = :finished_at")
params["finished_at"] = datetime.now(timezone.utc)
elif Status == "processing":
set_clauses.append("started_at = COALESCE(started_at, :started_at)")
params["started_at"] = datetime.now(timezone.utc)
async with GetAsyncSession() as session:
await session.execute(
@@ -74,7 +117,7 @@ class StorageAdapter:
async def UpdateRunResult(self, RunId: int, Summary: dict[str, Any]) -> None:
"""写入 run 结果汇总字段(含 rules_path / structure / outline)。"""
rules_path = Summary.get("rulesPath")
rules_path = _pick_value(Summary, "rulesPath", "rules_path")
set_clauses = [
"total_score = :total_score",
"passed_count = :passed_count",
@@ -86,12 +129,12 @@ class StorageAdapter:
]
params: dict[str, Any] = {
"rid": RunId,
"total_score": Summary.get("totalScore"),
"passed_count": Summary.get("passedCount", 0),
"failed_count": Summary.get("failedCount", 0),
"skipped_count": Summary.get("skippedCount", 0),
"result_status": Summary.get("resultStatus"),
"result_summary_json": Summary.get("resultSummaryJson"),
"total_score": _pick_value(Summary, *_RUN_RESULT_FALLBACK_KEYS["total_score"]),
"passed_count": _pick_value(Summary, *_RUN_RESULT_FALLBACK_KEYS["passed_count"]) or 0,
"failed_count": _pick_value(Summary, *_RUN_RESULT_FALLBACK_KEYS["failed_count"]) or 0,
"skipped_count": _pick_value(Summary, *_RUN_RESULT_FALLBACK_KEYS["skipped_count"]) or 0,
"result_status": _pick_value(Summary, *_RUN_RESULT_FALLBACK_KEYS["result_status"]),
"result_summary_json": _pick_value(Summary, *_RUN_RESULT_FALLBACK_KEYS["result_summary_json"]),
}
if rules_path:
set_clauses.append("rules_path = :rules_path")
@@ -137,11 +180,11 @@ class StorageAdapter:
(run_id, rule_id, rule_name, severity, category,
message, suggestion, actual, expected, evidence,
paragraph_index, paragraph_text, location_path,
result, score, created_at, updated_at)
result, skip_reason, score, created_at, updated_at)
VALUES (:run_id, :rule_id, :rule_name, :severity, :category,
:message, :suggestion, :actual, :expected, :evidence,
:paragraph_index, :paragraph_text, :location_path,
:result, :score, now(), now())"""
:result, :skip_reason, :score, now(), now())"""
),
{
"run_id": RunId,
@@ -151,13 +194,14 @@ class StorageAdapter:
"category": row.get("category"),
"message": row.get("message"),
"suggestion": row.get("suggestion"),
"actual": row.get("actual"),
"expected": row.get("expected"),
"evidence": row.get("evidence"),
"actual": _to_text_payload(row.get("actual")),
"expected": _to_text_payload(row.get("expected")),
"evidence": _to_text_payload(row.get("evidence")),
"paragraph_index": row.get("paragraphIndex"),
"paragraph_text": row.get("paragraphText"),
"location_path": row.get("locationPath"),
"result": row.get("result", "pass"),
"skip_reason": _pick_value(row, "skipReason", "skip_reason"),
"score": row.get("score"),
},
)
@@ -11,13 +11,17 @@ from typing import Any
from fastapi_common.fastapi_common_logger import logger
from fastapi_admin.celery_app import celery_app
from fastapi_admin.config import (
LEAUDIT_WORKER_QUEUE_NORMAL,
LEAUDIT_WORKER_QUEUE_URGENT,
)
from fastapi_modules.fastapi_leaudit.govdoc_bridge.runner import GovdocRunner
from fastapi_modules.fastapi_leaudit.govdoc_bridge.storage_adapter import StorageAdapter
log = logger
GOVDOC_WORKER_QUEUE = "govdoc"
GOVDOC_WORKER_QUEUE_URGENT = "govdoc_urgent"
GOVDOC_WORKER_QUEUE = LEAUDIT_WORKER_QUEUE_NORMAL
GOVDOC_WORKER_QUEUE_URGENT = LEAUDIT_WORKER_QUEUE_URGENT
def resolve_govdoc_queue(speed: str = "normal") -> str:
@@ -30,6 +34,7 @@ def resolve_govdoc_queue(speed: str = "normal") -> str:
def dispatch_govdoc_task(
documentId: int,
runId: int,
rulesPath: str | None = None,
triggerUserId: int | None = None,
speed: str = "normal",
) -> Any:
@@ -52,6 +57,7 @@ def dispatch_govdoc_task(
kwargs={
"documentId": documentId,
"runId": runId,
"rulesPath": rulesPath,
"triggerUserId": triggerUserId,
"speed": speed,
},
@@ -73,6 +79,7 @@ def govdoc_execute_task(
self,
documentId: int,
runId: int,
rulesPath: str | None = None,
triggerUserId: int | None = None,
speed: str = "normal",
) -> dict[str, Any]:
@@ -89,7 +96,7 @@ def govdoc_execute_task(
try:
# 更新 run 状态 → running
loop.run_until_complete(storage.UpdateRunStatus(runId, "processing", phase="parsing"))
loop.run_until_complete(storage.UpdateRunStatus(runId, "processing", Phase="parsing"))
# 执行完整审查链路
runner = GovdocRunner()
@@ -97,6 +104,7 @@ def govdoc_execute_task(
runner.Execute(
DocumentId=documentId,
RunId=runId,
RulesPath=rulesPath,
TriggerUserId=triggerUserId,
Speed=speed,
)
@@ -1,30 +1,12 @@
"""Govdoc 公文格式审查引擎内核。
从旧 govdoc-audit 项目裁剪迁入,去除独立 API 层、SQLite 存储层、
本地运行记录器 (RunRecorder) 和旧配置系统
导出:
- pipeline.run() — 异步审查入口 (bridge 层主调用)
- pipeline.audit_file() — 同步审查入口 (兼容)
- models — 核心数据模型 (Pydantic)
- parser — 文档解析与实体抽取
- dsl — YAML 规则 DSL 定义与加载
- engine — 规则执行引擎与结果模型
- reporter — 报告生成 (HTML/DOCX/JSON)
- llm — LLM 客户端 (OpenAI 兼容协议)
保持包级导入轻量,避免在控制器注册阶段提前拉起 LLM/OpenAI 依赖。
真正执行审查时再按需导入 pipeline / result 模块
"""
from __future__ import annotations
from fastapi_modules.fastapi_leaudit.govdoc_engine.pipeline import (
audit_file,
run,
)
from fastapi_modules.fastapi_leaudit.govdoc_engine.engine.result import (
AuditResult,
AuditSummary,
CheckedRule,
)
from typing import Any
__all__ = [
"audit_file",
@@ -33,3 +15,31 @@ __all__ = [
"AuditSummary",
"CheckedRule",
]
def audit_file(*args: Any, **kwargs: Any):
    """Forward to ``pipeline.audit_file``, importing the pipeline on first use.

    The import is done inside the function so that importing this package
    does not load the pipeline module.
    """
    from fastapi_modules.fastapi_leaudit.govdoc_engine.pipeline import (
        audit_file as pipeline_audit_file,
    )

    outcome = pipeline_audit_file(*args, **kwargs)
    return outcome
async def run(*args: Any, **kwargs: Any):
    """Await ``pipeline.run`` with the given arguments and return its result.

    The pipeline module is imported inside the coroutine so that importing
    this package does not load it.
    """
    from fastapi_modules.fastapi_leaudit.govdoc_engine.pipeline import (
        run as pipeline_run,
    )

    pending = pipeline_run(*args, **kwargs)
    return await pending
def __getattr__(name: str):
if name in {"AuditResult", "AuditSummary", "CheckedRule"}:
from fastapi_modules.fastapi_leaudit.govdoc_engine.engine.result import (
AuditResult,
AuditSummary,
CheckedRule,
)
return {
"AuditResult": AuditResult,
"AuditSummary": AuditSummary,
"CheckedRule": CheckedRule,
}[name]
raise AttributeError(name)
@@ -11,7 +11,22 @@ import re
import time
from typing import Any
from openai import AsyncOpenAI, OpenAI, APIError, APIConnectionError, RateLimitError
try:
from openai import AsyncOpenAI, OpenAI, APIError, APIConnectionError, RateLimitError
_OPENAI_IMPORT_ERROR: Exception | None = None
except ModuleNotFoundError as exc: # pragma: no cover - optional dependency
AsyncOpenAI = None # type: ignore[assignment]
OpenAI = None # type: ignore[assignment]
_OPENAI_IMPORT_ERROR = exc
class APIError(Exception):
status_code: int | None = None
class APIConnectionError(Exception):
pass
class RateLimitError(Exception):
pass
from fastapi_admin.config import (
LLM_API_KEY,
@@ -125,7 +140,13 @@ class LlmClient:
):
key = api_key or LLM_API_KEY
self._misconfigured_error: LlmConfigError | None = None
if not key:
if OpenAI is None or AsyncOpenAI is None:
self._client = None
self._aclient = None
self._misconfigured_error = LlmConfigError(
"python package 'openai' is not installed; govdoc LLM features are unavailable."
)
elif not key:
self._client = None
self._aclient = None
self._misconfigured_error = LlmConfigError(
@@ -130,12 +130,12 @@ def _merge_llm_into_entities(
# ── 实体构建 (同步,供 sync 入口使用) ──────────────────
def _build_entities(
doc, ruleset: RuleSet, llm: LlmClient,
doc, ruleset: RuleSet, llm: LlmClient | None,
) -> dict[str, SemanticEntity | None]:
"""构建实体 + 差量 LLM 抽取(同步)。"""
entities = EntityBuilder().build(doc)
spec = _compute_missing_spec(entities, ruleset.extract.entities)
if spec:
if spec and llm is not None:
llm_vals = FieldExtractor(llm).extract_missing(doc, spec)
_merge_llm_into_entities(entities, llm_vals)
return entities
@@ -144,12 +144,12 @@ def _build_entities(
# ── 实体构建 (异步,供 async 入口使用) ──────────────────
async def _build_entities_async(
doc, ruleset: RuleSet, llm: LlmClient,
doc, ruleset: RuleSet, llm: LlmClient | None,
) -> dict[str, SemanticEntity | None]:
"""构建实体 + 差量 LLM 抽取(异步)。"""
entities = EntityBuilder().build(doc)
spec = _compute_missing_spec(entities, ruleset.extract.entities)
if spec:
if spec and llm is not None:
llm_vals = await FieldExtractor(llm).extract_missing_async(doc, spec)
_merge_llm_into_entities(entities, llm_vals)
return entities
@@ -174,7 +174,7 @@ def audit_file(
"""
docx_path = Path(docx_path)
rules_path = Path(rules_path)
llm = llm_client or LlmClient()
llm = llm_client
doc = parse_docx(docx_path)
RoleTagger(llm_client=llm).tag(doc)
@@ -210,7 +210,7 @@ async def run(
"""
file_path = Path(file_path)
rules_path = Path(rules_path)
llm = llm_client or LlmClient()
llm = llm_client
_log.info("Govdoc pipeline start: %s", file_path.name)
@@ -219,18 +219,21 @@ async def run(
_log.info(" parsed: %d paragraphs", len(doc.paragraphs))
# 2. 段落角色标注
RoleTagger(llm_client=llm).tag(doc)
if llm is not None:
await RoleTagger(llm_client=llm).tag_async(doc)
else:
RoleTagger(llm_client=None).tag(doc)
# 3. 加载规则
ruleset = load_rules(rules_path)
_log.info(" rules: %d groups, %d rules", len(ruleset.groups), len(ruleset.all_rules()))
_log.info(" rules: %d groups, %d rules", len(ruleset.rules), len(ruleset.all_rules()))
# 4. 实体抽取 (含差量 LLM)
entities = await _build_entities_async(doc, ruleset, llm)
_log.info(" entities: %d/%d resolved", sum(1 for v in entities.values() if v), len(entities))
# 5. 规则评估
findings, outcomes = RuleRunner(llm_client=llm).evaluate(
findings, outcomes = await RuleRunner(llm_client=llm).evaluate_async(
ruleset.all_rules(), doc, entities
)
_log.info(" evaluated: %d findings from %d rules", len(findings), len(outcomes))
@@ -36,4 +36,5 @@ class GovdocRuleResult(BaseModel):
# 判定
result: Mapped[str] = mapped_column("result", String(32), default="pass", comment="执行结果:pass/fail/skipped/error")
skipReason: Mapped[str | None] = mapped_column("skip_reason", Text, comment="跳过原因,仅 skipped/error 时使用")
score: Mapped[float | None] = mapped_column("score", Numeric(10, 2), comment="本条得分")
@@ -31,6 +31,7 @@ class GovdocRun(BaseModel):
engineVersion: Mapped[str | None] = mapped_column("engine_version", String(64), comment="引擎版本号")
llmProvider: Mapped[str | None] = mapped_column("llm_provider", String(64), comment="LLM 提供商")
llmModel: Mapped[str | None] = mapped_column("llm_model", String(128), comment="LLM 模型名")
rulesPath: Mapped[str | None] = mapped_column("rules_path", String(1024), comment="本次运行使用的规则文件路径")
# 结果汇总
totalScore: Mapped[float | None] = mapped_column("total_score", Numeric(10, 2), comment="总分")
@@ -31,6 +31,7 @@ class IGovdocService(ABC):
page: int = 1,
pageSize: int = 20,
keyword: str | None = None,
fileExt: str | None = None,
region: str | None = None,
status: str | None = None,
resultStatus: str | None = None,
@@ -163,6 +163,7 @@ class DocumentServiceImpl(IDocumentService):
root_group_id=resolvedRootGroupId,
region=normalizedRegion,
normalized_name=normalizedName,
file_ext=fileExt,
)
internalDocumentNo = time.time_ns()
@@ -2712,16 +2713,25 @@ async def _find_latest_version_candidate(
root_group_id: int | None,
region: str,
normalized_name: str,
file_ext: str | None = None,
) -> dict | None:
"""Find the latest primary document version candidate by normalized name.
"""Find the latest primary document version candidate by normalized name + extension."""
ext_clause = ""
ext_params: dict[str, object] = {}
if file_ext:
ext_clause = " AND f.file_ext = :file_ext"
ext_params["file_ext"] = file_ext
Preferred rule: same region + same root group + same normalized name.
Fallback rule: when a root group cannot be resolved, keep the old same-type behavior.
"""
if root_group_id is not None:
params: dict[str, object] = {
"root_group_id": root_group_id,
"region": region,
"normalized_name": normalized_name,
**ext_params,
}
result = await session.execute(
text(
"""
f"""
SELECT
d.id AS document_id,
d.version_group_key,
@@ -2751,7 +2761,7 @@ async def _find_latest_version_candidate(
WHERE d.region = :region
AND d.normalized_name = :normalized_name
AND d.is_latest_version = true
AND d.deleted_at IS NULL
AND d.deleted_at IS NULL{ext_clause}
AND COALESCE(
CASE
WHEN eg.id IS NULL THEN NULL
@@ -2764,19 +2774,21 @@ async def _find_latest_version_candidate(
LIMIT 1
"""
),
{
"root_group_id": root_group_id,
"region": region,
"normalized_name": normalized_name,
},
params,
)
row = result.mappings().first()
if row:
return dict(row)
params = {
"type_id": type_id,
"region": region,
"normalized_name": normalized_name,
**ext_params,
}
result = await session.execute(
text(
"""
f"""
SELECT
d.id AS document_id,
d.version_group_key,
@@ -2791,18 +2803,14 @@ async def _find_latest_version_candidate(
AND f.file_role = 'primary'
WHERE d.type_id = :type_id
AND d.region = :region
AND d.normalized_name = :normalized_name
AND d.normalized_name = :normalized_name{ext_clause}
AND d.is_latest_version = true
AND d.deleted_at IS NULL
ORDER BY d.version_no DESC, d.id DESC
LIMIT 1
"""
),
{
"type_id": type_id,
"region": region,
"normalized_name": normalized_name,
},
params,
)
row = result.mappings().first()
return dict(row) if row else None
File diff suppressed because it is too large Load Diff