feat: integrate govdoc module into leaudit platform

This commit is contained in:
wren
2026-05-17 19:24:16 +08:00
parent cb13e61d3d
commit a73826dc1d
16 changed files with 2334 additions and 280 deletions
+6 -1
View File
@@ -33,7 +33,10 @@ celery_app = Celery(
celery_app.conf.update( celery_app.conf.update(
task_default_queue=LEAUDIT_WORKER_QUEUE_NORMAL, task_default_queue=LEAUDIT_WORKER_QUEUE_NORMAL,
imports=("fastapi_modules.fastapi_leaudit.leaudit_bridge.tasks",), imports=(
"fastapi_modules.fastapi_leaudit.leaudit_bridge.tasks",
"fastapi_modules.fastapi_leaudit.govdoc_bridge.tasks",
),
task_queues=( task_queues=(
Queue(LEAUDIT_WORKER_QUEUE_URGENT), Queue(LEAUDIT_WORKER_QUEUE_URGENT),
Queue(LEAUDIT_WORKER_QUEUE_NORMAL), Queue(LEAUDIT_WORKER_QUEUE_NORMAL),
@@ -56,9 +59,11 @@ celery_app.conf.update(
celery_app.autodiscover_tasks( celery_app.autodiscover_tasks(
[ [
"fastapi_modules.fastapi_leaudit.leaudit_bridge", "fastapi_modules.fastapi_leaudit.leaudit_bridge",
"fastapi_modules.fastapi_leaudit.govdoc_bridge",
], ],
force=True, force=True,
) )
# 显式导入任务模块,避免 worker 在某些启动方式下漏注册 bridge tasks。 # 显式导入任务模块,避免 worker 在某些启动方式下漏注册 bridge tasks。
from fastapi_modules.fastapi_leaudit.leaudit_bridge import tasks as _leaudit_bridge_tasks # noqa: F401,E402 from fastapi_modules.fastapi_leaudit.leaudit_bridge import tasks as _leaudit_bridge_tasks # noqa: F401,E402
from fastapi_modules.fastapi_leaudit.govdoc_bridge import tasks as _govdoc_bridge_tasks # noqa: F401,E402
@@ -31,7 +31,7 @@ class GovdocController(BaseController):
file: UploadFile = File(...), file: UploadFile = File(...),
typeId: int | None = Form(default=None), typeId: int | None = Form(default=None),
region: str = Form(default="default"), region: str = Form(default="default"),
autoRun: bool = Form(default=False), autoRun: bool = Form(default=True),
speed: str = Form(default="normal"), speed: str = Form(default="normal"),
ruleVersionId: int | None = Form(default=None), ruleVersionId: int | None = Form(default=None),
payload: dict[str, Any] = Depends(verify_access_token), payload: dict[str, Any] = Depends(verify_access_token),
@@ -56,6 +56,7 @@ class GovdocController(BaseController):
page: int = Query(default=1, ge=1), page: int = Query(default=1, ge=1),
pageSize: int = Query(default=20, ge=1, le=100), pageSize: int = Query(default=20, ge=1, le=100),
keyword: str | None = Query(default=None), keyword: str | None = Query(default=None),
fileExt: str | None = Query(default=None),
region: str | None = Query(default=None), region: str | None = Query(default=None),
status: str | None = Query(default=None), status: str | None = Query(default=None),
resultStatus: str | None = Query(default=None), resultStatus: str | None = Query(default=None),
@@ -72,6 +73,7 @@ class GovdocController(BaseController):
page=page, page=page,
pageSize=pageSize, pageSize=pageSize,
keyword=keyword, keyword=keyword,
fileExt=fileExt,
region=region, region=region,
status=status, status=status,
resultStatus=resultStatus, resultStatus=resultStatus,
@@ -26,6 +26,7 @@ class ResultAdapter:
EngineResult: AuditResult, EngineResult: AuditResult,
Structure: list[dict[str, Any]] | None = None, Structure: list[dict[str, Any]] | None = None,
Outline: list[dict[str, Any]] | None = None, Outline: list[dict[str, Any]] | None = None,
Entities: list[dict[str, Any]] | None = None,
) -> dict[str, Any]: ) -> dict[str, Any]:
"""从 AuditResult.summary 提取 run 汇总字段。 """从 AuditResult.summary 提取 run 汇总字段。
@@ -40,6 +41,10 @@ class ResultAdapter:
aux["structure"] = Structure aux["structure"] = Structure
if Outline is not None: if Outline is not None:
aux["outline"] = Outline aux["outline"] = Outline
if Entities is not None:
aux["entities"] = {
entity["name"]: entity for entity in Entities if entity.get("name")
}
return { return {
"totalScore": s.score, "totalScore": s.score,
@@ -100,6 +105,7 @@ class ResultAdapter:
"primaryRole": entity.primary_role, "primaryRole": entity.primary_role,
"source": entity.source, "source": entity.source,
"confidence": entity.confidence, "confidence": entity.confidence,
"extra": entity.extra,
}) })
return entities return entities
@@ -9,14 +9,26 @@
from __future__ import annotations from __future__ import annotations
import hashlib
import shutil
import tempfile
from dataclasses import dataclass, field from dataclasses import dataclass, field
from pathlib import Path
from typing import Any from typing import Any
from sqlalchemy import text
from fastapi_common.fastapi_common_logger import logger from fastapi_common.fastapi_common_logger import logger
from fastapi_common.fastapi_common_sqlalchemy.database import GetAsyncSession
from fastapi_common.fastapi_common_storage.oss_path_utils import OssPathUtils
from fastapi_modules.fastapi_leaudit.govdoc_bridge.input_resolver import InputResolver from fastapi_modules.fastapi_leaudit.govdoc_bridge.input_resolver import InputResolver
from fastapi_modules.fastapi_leaudit.govdoc_bridge.result_adapter import ResultAdapter
from fastapi_modules.fastapi_leaudit.govdoc_bridge.storage_adapter import StorageAdapter from fastapi_modules.fastapi_leaudit.govdoc_bridge.storage_adapter import StorageAdapter
from fastapi_modules.fastapi_leaudit.govdoc_engine.parser.docx_parser import parse_docx
from fastapi_modules.fastapi_leaudit.govdoc_engine.reporter.docx_annotator import annotate_docx
from fastapi_modules.fastapi_leaudit.govdoc_engine.reporter.html_paragraph import paragraphs_to_html
from fastapi_modules.fastapi_leaudit.govdoc_engine.reporter.html_renderer import render_html
from fastapi_modules.fastapi_leaudit.services.impl.ossServiceImpl import OssServiceImpl
log = logger log = logger
@@ -33,13 +45,30 @@ class GovdocRunner:
InputResolver: InputResolver = field(default_factory=InputResolver) InputResolver: InputResolver = field(default_factory=InputResolver)
Storage: StorageAdapter = field(default_factory=StorageAdapter) Storage: StorageAdapter = field(default_factory=StorageAdapter)
ResultAdapter: ResultAdapter = field(default_factory=ResultAdapter) OssService: OssServiceImpl = field(default_factory=OssServiceImpl)
ResultAdapter: Any | None = None
def ResolveRulesPath(self, RulesPath: str | None) -> str:
    """Resolve and validate the rules file path required for an execution.

    The input is stripped of surrounding whitespace, ``~`` is expanded, and
    a relative path is anchored at the current working directory before
    being fully resolved.

    Args:
        RulesPath: Path to the govdoc rules file; may be ``None`` or blank.

    Returns:
        The absolute, resolved path as a string.

    Raises:
        ValueError: If ``RulesPath`` is ``None``, empty, or whitespace only.
        FileNotFoundError: If the resolved path is not an existing file.
    """
    rawPath = "" if not RulesPath else RulesPath.strip()
    if rawPath == "":
        raise ValueError("未提供 govdoc rules_path,当前任务无法执行规则审查")

    resolved = Path(rawPath).expanduser()
    if not resolved.is_absolute():
        # Relative paths are interpreted against the process working directory.
        resolved = Path.cwd() / resolved
    resolved = resolved.resolve()

    if resolved.is_file():
        return str(resolved)
    raise FileNotFoundError(f"govdoc 规则文件不存在: {resolved}")
async def Execute( async def Execute(
self, self,
DocumentId: int, DocumentId: int,
RunId: int, RunId: int,
RulesPath: str, RulesPath: str | None = None,
TriggerUserId: int | None = None, TriggerUserId: int | None = None,
Speed: str = "normal", Speed: str = "normal",
) -> dict[str, Any]: ) -> dict[str, Any]:
@@ -54,57 +83,216 @@ class GovdocRunner:
Returns: Returns:
执行摘要 dict。 执行摘要 dict。
""" """
resolvedRulesPath = self.ResolveRulesPath(RulesPath)
log.info(f"[Govdoc] Starting execution: runId={RunId}, documentId={DocumentId}") log.info(f"[Govdoc] Starting execution: runId={RunId}, documentId={DocumentId}")
artifactTempDir: str | None = None
inputPayload = None
try:
# 1. 更新 run 状态 → processing
await self.Storage.UpdateRunStatus(RunId, "processing", Phase="parsing")
await self.Storage.UpdateDocumentStatus(DocumentId, "processing", RunId)
# 1. 更新 run 状态 → processing # 2. 解析输入文件
await self.Storage.UpdateRunStatus(RunId, "processing", phase="parsing") inputPayload = await self.InputResolver.ResolveForDocument(DocumentId)
await self.Storage.UpdateDocumentStatus(DocumentId, "processing", RunId) log.info(f"[Govdoc] Input resolved: {inputPayload.fileName}{inputPayload.localPath}")
# 2. 解析输入文件 # 3. 调用 govdoc_engine 执行审查
inputPayload = await self.InputResolver.ResolveForDocument(DocumentId) from fastapi_modules.fastapi_leaudit.govdoc_bridge.result_adapter import ResultAdapter
log.info(f"[Govdoc] Input resolved: {inputPayload.fileName}{inputPayload.localPath}") from fastapi_modules.fastapi_leaudit.govdoc_engine.pipeline import run as engine_run
# 3. 调用 govdoc_engine 执行审查 if self.ResultAdapter is None:
from fastapi_modules.fastapi_leaudit.govdoc_engine.pipeline import run as engine_run self.ResultAdapter = ResultAdapter()
engineResult = await engine_run( engineResult = await engine_run(
file_path=inputPayload.localPath, file_path=inputPayload.localPath,
rules_path=RulesPath, rules_path=resolvedRulesPath,
llm_client=None, # 使用默认 LlmClient (从平台配置加载) llm_client=None, # 使用默认 LlmClient (从平台配置加载)
)
engineResult.document["filename"] = inputPayload.fileName
# 4. 适配引擎结果
structure = self.ResultAdapter.AdaptStructure(engineResult)
outline = self.ResultAdapter.AdaptOutline(engineResult)
entities = self.ResultAdapter.AdaptEntities(engineResult)
runSummary = self.ResultAdapter.AdaptRunSummary(
engineResult,
Structure=structure,
Outline=outline,
Entities=entities,
)
ruleResults = self.ResultAdapter.AdaptRuleResults(engineResult)
checkedRuleResults = self.ResultAdapter.AdaptCheckedRules(engineResult)
artifactTempDir, artifacts = await self._GenerateArtifacts(
DocumentId=DocumentId,
RunId=RunId,
InputPath=inputPayload.localPath,
InputFileName=inputPayload.fileName,
EngineResult=engineResult,
RuleResults=ruleResults,
)
failedRuleIds = {str(row.get("ruleId") or "") for row in ruleResults}
for checkedRule in checkedRuleResults:
ruleId = str(checkedRule.get("ruleId") or "")
if checkedRule.get("result") == "fail" and ruleId in failedRuleIds:
continue
ruleResults.append(checkedRule)
# 将 rules_path 附带到 runSummary 中,供 GetRuleDetail 后续解析
runSummary["rulesPath"] = resolvedRulesPath
# 5. 持久化结果
await self.Storage.UpdateRunResult(RunId, runSummary)
await self.Storage.SaveRuleResults(RunId, ruleResults)
await self.Storage.SaveArtifacts(RunId, artifacts)
# 6. 更新终态
await self.Storage.UpdateRunStatus(RunId, "completed", Phase="reporting")
await self.Storage.UpdateDocumentStatus(DocumentId, "completed", RunId)
log.info(f"[Govdoc] Execution completed: runId={RunId}")
return {
"runId": RunId,
"documentId": DocumentId,
"status": "completed",
"ruleResultsCount": len(ruleResults),
"structureCount": len(structure),
"outlineCount": len(outline),
"artifactCount": len(artifacts),
}
finally:
if artifactTempDir:
shutil.rmtree(artifactTempDir, ignore_errors=True)
if inputPayload and inputPayload.tempDir:
shutil.rmtree(inputPayload.tempDir, ignore_errors=True)
async def _GenerateArtifacts(
self,
DocumentId: int,
RunId: int,
InputPath: str,
InputFileName: str,
EngineResult: Any,
RuleResults: list[dict[str, Any]],
) -> tuple[str, list[dict[str, Any]]]:
"""生成报告产物并上传到 OSS。"""
artifactDir = tempfile.mkdtemp(prefix=f"govdoc_artifacts_{RunId}_")
sourcePath = Path(InputPath)
baseName = OssPathUtils.BuildSafeFileStem(InputFileName)
region = await self._GetDocumentRegion(DocumentId)
annotatedPath = Path(artifactDir) / f"{baseName}.annotated.docx"
annotate_docx(sourcePath, annotatedPath, EngineResult)
htmlReport = render_html(EngineResult)
doc = parse_docx(sourcePath)
findingMap: dict[int, list[str]] = {}
for row in RuleResults:
if row.get("result") != "fail":
continue
paragraphIndex = row.get("paragraphIndex")
if paragraphIndex is None:
continue
findingId = f"{row.get('ruleId') or 'finding'}-{paragraphIndex}"
findingMap.setdefault(int(paragraphIndex), []).append(findingId)
paragraphsHtml = paragraphs_to_html(doc, findingMap)
annotatedUrl = await self.OssService.UploadBytes(
ObjectKey=OssPathUtils.BuildArtifactKey(
Region=region,
RunId=RunId,
ArtifactType="annotated_docx",
Detail=f"{baseName}.annotated.docx",
),
Content=annotatedPath.read_bytes(),
ContentType="application/vnd.openxmlformats-officedocument.wordprocessingml.document",
)
htmlUrl = await self.OssService.UploadText(
ObjectKey=OssPathUtils.BuildArtifactKey(
Region=region,
RunId=RunId,
ArtifactType="html_report",
Detail=f"{baseName}.report.html",
),
Content=htmlReport,
ContentType="text/html; charset=utf-8",
)
paragraphUrl = await self.OssService.UploadText(
ObjectKey=OssPathUtils.BuildArtifactKey(
Region=region,
RunId=RunId,
ArtifactType="paragraph_html",
Detail=f"{baseName}.paragraphs.html",
),
Content=paragraphsHtml,
ContentType="text/html; charset=utf-8",
) )
# 4. 适配引擎结果 return artifactDir, [
structure = self.ResultAdapter.AdaptStructure(engineResult) self._BuildArtifactRow(
outline = self.ResultAdapter.AdaptOutline(engineResult) artifactType="annotated_docx",
runSummary = self.ResultAdapter.AdaptRunSummary( fileName=f"{baseName}.annotated.docx",
engineResult, mimeType="application/vnd.openxmlformats-officedocument.wordprocessingml.document",
Structure=structure, content=annotatedPath.read_bytes(),
Outline=outline, ossUrl=annotatedUrl,
) description="批注 DOCX",
ruleResults = self.ResultAdapter.AdaptRuleResults(engineResult) ),
entities = self.ResultAdapter.AdaptEntities(engineResult) self._BuildArtifactRow(
artifacts = self.ResultAdapter.AdaptArtifacts(engineResult, RunId) artifactType="html_report",
fileName=f"{baseName}.report.html",
mimeType="text/html; charset=utf-8",
content=htmlReport.encode("utf-8"),
ossUrl=htmlUrl,
description="HTML 审查报告",
),
self._BuildArtifactRow(
artifactType="paragraph_html",
fileName=f"{baseName}.paragraphs.html",
mimeType="text/html; charset=utf-8",
content=paragraphsHtml.encode("utf-8"),
ossUrl=paragraphUrl,
description="段落联动视图",
),
]
# 将 rules_path 附带到 runSummary 中,供 GetRuleDetail 后续解析 async def _GetDocumentRegion(self, DocumentId: int) -> str:
runSummary["rulesPath"] = RulesPath async with GetAsyncSession() as session:
row = (
# 5. 持久化结果 await session.execute(
await self.Storage.UpdateRunResult(RunId, runSummary) text(
await self.Storage.SaveRuleResults(RunId, ruleResults) """
await self.Storage.SaveArtifacts(RunId, artifacts) SELECT COALESCE(region, 'default') AS region
FROM leaudit_documents
# 6. 更新终态 WHERE id = :document_id
await self.Storage.UpdateRunStatus(RunId, "completed", phase="reporting") LIMIT 1
await self.Storage.UpdateDocumentStatus(DocumentId, "completed", RunId) """
),
log.info(f"[Govdoc] Execution completed: runId={RunId}") {"document_id": DocumentId},
)
).mappings().first()
return str(row["region"] or "default") if row else "default"
def _BuildArtifactRow(
self,
*,
artifactType: str,
fileName: str,
mimeType: str,
content: bytes,
ossUrl: str,
description: str,
) -> dict[str, Any]:
fileExt = Path(fileName).suffix.lstrip(".").lower()
return { return {
"runId": RunId, "artifactType": artifactType,
"documentId": DocumentId, "fileName": fileName,
"status": "completed", "fileExt": fileExt,
"ruleResultsCount": len(ruleResults), "mimeType": mimeType,
"structureCount": len(structure), "fileSize": len(content),
"outlineCount": len(outline), "sha256": hashlib.sha256(content).hexdigest(),
"artifactCount": len(artifacts), "ossUrl": ossUrl,
"storageProvider": "minio",
"description": description,
} }
@@ -6,6 +6,7 @@ govdoc_report_artifacts 表,并更新 leaudit_documents 状态。
from __future__ import annotations from __future__ import annotations
import json
from datetime import datetime, timezone from datetime import datetime, timezone
from typing import Any from typing import Any
@@ -16,6 +17,33 @@ from sqlalchemy import text
log = logger log = logger
# Maps each run-result SQL parameter name to the summary-dict keys that may
# carry its value. The keys are passed to ``_pick_value`` in list order, so the
# first key present in the summary wins.
_RUN_RESULT_FALLBACK_KEYS = {
"total_score": ["totalScore", "score"],
"passed_count": ["passedCount", "passed_count"],
"failed_count": ["failedCount", "failed_count"],
"skipped_count": ["skippedCount", "skipped_count"],
"result_status": ["resultStatus", "result_status"],
"result_summary_json": ["resultSummaryJson", "result_summary_json"],
}
def _pick_value(payload: dict[str, Any], *keys: str) -> Any:
for key in keys:
if key in payload:
return payload[key]
return None
def _to_text_payload(value: Any) -> str | None:
if value is None:
return None
if isinstance(value, str):
return value
if isinstance(value, (dict, list, tuple, bool, int, float)):
return json.dumps(value, ensure_ascii=False)
return str(value)
class StorageAdapter: class StorageAdapter:
"""Govdoc 结果持久化适配器。 """Govdoc 结果持久化适配器。
@@ -51,11 +79,23 @@ class StorageAdapter:
log.info(f"[Govdoc] Run created: runId={run_id}, documentId={RunData['documentId']}") log.info(f"[Govdoc] Run created: runId={run_id}, documentId={RunData['documentId']}")
return run_id return run_id
async def UpdateRunStatus(self, RunId: int, Status: str, Phase: str | None = None, **Extra: Any) -> None: async def UpdateRunStatus(
"""更新 run 状态和阶段。""" self,
RunId: int,
Status: str,
Phase: str | None = None,
**Extra: Any,
) -> None:
"""更新 run 状态和阶段。
兼容调用方传 ``phase=`` 或 ``Phase=``,避免因大小写不一致导致阶段不落库。
"""
set_clauses = ["status = :status", "updated_at = now()"] set_clauses = ["status = :status", "updated_at = now()"]
params: dict[str, Any] = {"rid": RunId, "status": Status} params: dict[str, Any] = {"rid": RunId, "status": Status}
if Phase is None:
Phase = _pick_value(Extra, "phase", "Phase")
if Phase is not None: if Phase is not None:
set_clauses.append("phase = :phase") set_clauses.append("phase = :phase")
params["phase"] = Phase params["phase"] = Phase
@@ -63,6 +103,9 @@ class StorageAdapter:
if Status == "completed" or Status == "failed": if Status == "completed" or Status == "failed":
set_clauses.append("finished_at = :finished_at") set_clauses.append("finished_at = :finished_at")
params["finished_at"] = datetime.now(timezone.utc) params["finished_at"] = datetime.now(timezone.utc)
elif Status == "processing":
set_clauses.append("started_at = COALESCE(started_at, :started_at)")
params["started_at"] = datetime.now(timezone.utc)
async with GetAsyncSession() as session: async with GetAsyncSession() as session:
await session.execute( await session.execute(
@@ -74,7 +117,7 @@ class StorageAdapter:
async def UpdateRunResult(self, RunId: int, Summary: dict[str, Any]) -> None: async def UpdateRunResult(self, RunId: int, Summary: dict[str, Any]) -> None:
"""写入 run 结果汇总字段(含 rules_path / structure / outline)。""" """写入 run 结果汇总字段(含 rules_path / structure / outline)。"""
rules_path = Summary.get("rulesPath") rules_path = _pick_value(Summary, "rulesPath", "rules_path")
set_clauses = [ set_clauses = [
"total_score = :total_score", "total_score = :total_score",
"passed_count = :passed_count", "passed_count = :passed_count",
@@ -86,12 +129,12 @@ class StorageAdapter:
] ]
params: dict[str, Any] = { params: dict[str, Any] = {
"rid": RunId, "rid": RunId,
"total_score": Summary.get("totalScore"), "total_score": _pick_value(Summary, *_RUN_RESULT_FALLBACK_KEYS["total_score"]),
"passed_count": Summary.get("passedCount", 0), "passed_count": _pick_value(Summary, *_RUN_RESULT_FALLBACK_KEYS["passed_count"]) or 0,
"failed_count": Summary.get("failedCount", 0), "failed_count": _pick_value(Summary, *_RUN_RESULT_FALLBACK_KEYS["failed_count"]) or 0,
"skipped_count": Summary.get("skippedCount", 0), "skipped_count": _pick_value(Summary, *_RUN_RESULT_FALLBACK_KEYS["skipped_count"]) or 0,
"result_status": Summary.get("resultStatus"), "result_status": _pick_value(Summary, *_RUN_RESULT_FALLBACK_KEYS["result_status"]),
"result_summary_json": Summary.get("resultSummaryJson"), "result_summary_json": _pick_value(Summary, *_RUN_RESULT_FALLBACK_KEYS["result_summary_json"]),
} }
if rules_path: if rules_path:
set_clauses.append("rules_path = :rules_path") set_clauses.append("rules_path = :rules_path")
@@ -137,11 +180,11 @@ class StorageAdapter:
(run_id, rule_id, rule_name, severity, category, (run_id, rule_id, rule_name, severity, category,
message, suggestion, actual, expected, evidence, message, suggestion, actual, expected, evidence,
paragraph_index, paragraph_text, location_path, paragraph_index, paragraph_text, location_path,
result, score, created_at, updated_at) result, skip_reason, score, created_at, updated_at)
VALUES (:run_id, :rule_id, :rule_name, :severity, :category, VALUES (:run_id, :rule_id, :rule_name, :severity, :category,
:message, :suggestion, :actual, :expected, :evidence, :message, :suggestion, :actual, :expected, :evidence,
:paragraph_index, :paragraph_text, :location_path, :paragraph_index, :paragraph_text, :location_path,
:result, :score, now(), now())""" :result, :skip_reason, :score, now(), now())"""
), ),
{ {
"run_id": RunId, "run_id": RunId,
@@ -151,13 +194,14 @@ class StorageAdapter:
"category": row.get("category"), "category": row.get("category"),
"message": row.get("message"), "message": row.get("message"),
"suggestion": row.get("suggestion"), "suggestion": row.get("suggestion"),
"actual": row.get("actual"), "actual": _to_text_payload(row.get("actual")),
"expected": row.get("expected"), "expected": _to_text_payload(row.get("expected")),
"evidence": row.get("evidence"), "evidence": _to_text_payload(row.get("evidence")),
"paragraph_index": row.get("paragraphIndex"), "paragraph_index": row.get("paragraphIndex"),
"paragraph_text": row.get("paragraphText"), "paragraph_text": row.get("paragraphText"),
"location_path": row.get("locationPath"), "location_path": row.get("locationPath"),
"result": row.get("result", "pass"), "result": row.get("result", "pass"),
"skip_reason": _pick_value(row, "skipReason", "skip_reason"),
"score": row.get("score"), "score": row.get("score"),
}, },
) )
@@ -11,13 +11,17 @@ from typing import Any
from fastapi_common.fastapi_common_logger import logger from fastapi_common.fastapi_common_logger import logger
from fastapi_admin.celery_app import celery_app from fastapi_admin.celery_app import celery_app
from fastapi_admin.config import (
LEAUDIT_WORKER_QUEUE_NORMAL,
LEAUDIT_WORKER_QUEUE_URGENT,
)
from fastapi_modules.fastapi_leaudit.govdoc_bridge.runner import GovdocRunner from fastapi_modules.fastapi_leaudit.govdoc_bridge.runner import GovdocRunner
from fastapi_modules.fastapi_leaudit.govdoc_bridge.storage_adapter import StorageAdapter from fastapi_modules.fastapi_leaudit.govdoc_bridge.storage_adapter import StorageAdapter
log = logger log = logger
GOVDOC_WORKER_QUEUE = "govdoc" GOVDOC_WORKER_QUEUE = LEAUDIT_WORKER_QUEUE_NORMAL
GOVDOC_WORKER_QUEUE_URGENT = "govdoc_urgent" GOVDOC_WORKER_QUEUE_URGENT = LEAUDIT_WORKER_QUEUE_URGENT
def resolve_govdoc_queue(speed: str = "normal") -> str: def resolve_govdoc_queue(speed: str = "normal") -> str:
@@ -30,6 +34,7 @@ def resolve_govdoc_queue(speed: str = "normal") -> str:
def dispatch_govdoc_task( def dispatch_govdoc_task(
documentId: int, documentId: int,
runId: int, runId: int,
rulesPath: str | None = None,
triggerUserId: int | None = None, triggerUserId: int | None = None,
speed: str = "normal", speed: str = "normal",
) -> Any: ) -> Any:
@@ -52,6 +57,7 @@ def dispatch_govdoc_task(
kwargs={ kwargs={
"documentId": documentId, "documentId": documentId,
"runId": runId, "runId": runId,
"rulesPath": rulesPath,
"triggerUserId": triggerUserId, "triggerUserId": triggerUserId,
"speed": speed, "speed": speed,
}, },
@@ -73,6 +79,7 @@ def govdoc_execute_task(
self, self,
documentId: int, documentId: int,
runId: int, runId: int,
rulesPath: str | None = None,
triggerUserId: int | None = None, triggerUserId: int | None = None,
speed: str = "normal", speed: str = "normal",
) -> dict[str, Any]: ) -> dict[str, Any]:
@@ -89,7 +96,7 @@ def govdoc_execute_task(
try: try:
# 更新 run 状态 → running # 更新 run 状态 → running
loop.run_until_complete(storage.UpdateRunStatus(runId, "processing", phase="parsing")) loop.run_until_complete(storage.UpdateRunStatus(runId, "processing", Phase="parsing"))
# 执行完整审查链路 # 执行完整审查链路
runner = GovdocRunner() runner = GovdocRunner()
@@ -97,6 +104,7 @@ def govdoc_execute_task(
runner.Execute( runner.Execute(
DocumentId=documentId, DocumentId=documentId,
RunId=runId, RunId=runId,
RulesPath=rulesPath,
TriggerUserId=triggerUserId, TriggerUserId=triggerUserId,
Speed=speed, Speed=speed,
) )
@@ -1,30 +1,12 @@
"""Govdoc 公文格式审查引擎内核。 """Govdoc 公文格式审查引擎内核。
从旧 govdoc-audit 项目裁剪迁入,去除独立 API 层、SQLite 存储层、 保持包级导入轻量,避免在控制器注册阶段提前拉起 LLM/OpenAI 依赖。
本地运行记录器 (RunRecorder) 和旧配置系统 真正执行审查时再按需导入 pipeline / result 模块
导出:
- pipeline.run() — 异步审查入口 (bridge 层主调用)
- pipeline.audit_file() — 同步审查入口 (兼容)
- models — 核心数据模型 (Pydantic)
- parser — 文档解析与实体抽取
- dsl — YAML 规则 DSL 定义与加载
- engine — 规则执行引擎与结果模型
- reporter — 报告生成 (HTML/DOCX/JSON)
- llm — LLM 客户端 (OpenAI 兼容协议)
""" """
from __future__ import annotations from __future__ import annotations
from fastapi_modules.fastapi_leaudit.govdoc_engine.pipeline import ( from typing import Any
audit_file,
run,
)
from fastapi_modules.fastapi_leaudit.govdoc_engine.engine.result import (
AuditResult,
AuditSummary,
CheckedRule,
)
__all__ = [ __all__ = [
"audit_file", "audit_file",
@@ -33,3 +15,31 @@ __all__ = [
"AuditSummary", "AuditSummary",
"CheckedRule", "CheckedRule",
] ]
def audit_file(*args: Any, **kwargs: Any):
    """Forward to ``pipeline.audit_file``, importing the pipeline on first use.

    The import happens inside the function so that importing this package
    does not load the pipeline module; all arguments and the return value
    are passed through unchanged.
    """
    from fastapi_modules.fastapi_leaudit.govdoc_engine.pipeline import audit_file as _pipeline_audit_file

    result = _pipeline_audit_file(*args, **kwargs)
    return result
async def run(*args: Any, **kwargs: Any):
    """Await ``pipeline.run``, importing the pipeline on first use.

    The import happens inside the coroutine so that importing this package
    does not load the pipeline module; all arguments and the awaited result
    are passed through unchanged.
    """
    from fastapi_modules.fastapi_leaudit.govdoc_engine.pipeline import run as _pipeline_run

    result = await _pipeline_run(*args, **kwargs)
    return result
def __getattr__(name: str):
    """Module-level attribute hook that lazily exposes the result models.

    Only ``AuditResult``, ``AuditSummary`` and ``CheckedRule`` are served;
    the ``engine.result`` module is imported at lookup time rather than at
    package import time.

    Raises:
        AttributeError: For any other attribute name.
    """
    if name not in {"AuditResult", "AuditSummary", "CheckedRule"}:
        raise AttributeError(name)

    from fastapi_modules.fastapi_leaudit.govdoc_engine.engine.result import (
        AuditResult,
        AuditSummary,
        CheckedRule,
    )

    lazyExports = dict(
        AuditResult=AuditResult,
        AuditSummary=AuditSummary,
        CheckedRule=CheckedRule,
    )
    return lazyExports[name]
@@ -11,7 +11,22 @@ import re
import time import time
from typing import Any from typing import Any
from openai import AsyncOpenAI, OpenAI, APIError, APIConnectionError, RateLimitError try:
from openai import AsyncOpenAI, OpenAI, APIError, APIConnectionError, RateLimitError
_OPENAI_IMPORT_ERROR: Exception | None = None
except ModuleNotFoundError as exc: # pragma: no cover - optional dependency
AsyncOpenAI = None # type: ignore[assignment]
OpenAI = None # type: ignore[assignment]
_OPENAI_IMPORT_ERROR = exc
class APIError(Exception):
status_code: int | None = None
class APIConnectionError(Exception):
pass
class RateLimitError(Exception):
pass
from fastapi_admin.config import ( from fastapi_admin.config import (
LLM_API_KEY, LLM_API_KEY,
@@ -125,7 +140,13 @@ class LlmClient:
): ):
key = api_key or LLM_API_KEY key = api_key or LLM_API_KEY
self._misconfigured_error: LlmConfigError | None = None self._misconfigured_error: LlmConfigError | None = None
if not key: if OpenAI is None or AsyncOpenAI is None:
self._client = None
self._aclient = None
self._misconfigured_error = LlmConfigError(
"python package 'openai' is not installed; govdoc LLM features are unavailable."
)
elif not key:
self._client = None self._client = None
self._aclient = None self._aclient = None
self._misconfigured_error = LlmConfigError( self._misconfigured_error = LlmConfigError(
@@ -130,12 +130,12 @@ def _merge_llm_into_entities(
# ── 实体构建 (同步,供 sync 入口使用) ────────────────── # ── 实体构建 (同步,供 sync 入口使用) ──────────────────
def _build_entities( def _build_entities(
doc, ruleset: RuleSet, llm: LlmClient, doc, ruleset: RuleSet, llm: LlmClient | None,
) -> dict[str, SemanticEntity | None]: ) -> dict[str, SemanticEntity | None]:
"""构建实体 + 差量 LLM 抽取(同步)。""" """构建实体 + 差量 LLM 抽取(同步)。"""
entities = EntityBuilder().build(doc) entities = EntityBuilder().build(doc)
spec = _compute_missing_spec(entities, ruleset.extract.entities) spec = _compute_missing_spec(entities, ruleset.extract.entities)
if spec: if spec and llm is not None:
llm_vals = FieldExtractor(llm).extract_missing(doc, spec) llm_vals = FieldExtractor(llm).extract_missing(doc, spec)
_merge_llm_into_entities(entities, llm_vals) _merge_llm_into_entities(entities, llm_vals)
return entities return entities
@@ -144,12 +144,12 @@ def _build_entities(
# ── 实体构建 (异步,供 async 入口使用) ────────────────── # ── 实体构建 (异步,供 async 入口使用) ──────────────────
async def _build_entities_async( async def _build_entities_async(
doc, ruleset: RuleSet, llm: LlmClient, doc, ruleset: RuleSet, llm: LlmClient | None,
) -> dict[str, SemanticEntity | None]: ) -> dict[str, SemanticEntity | None]:
"""构建实体 + 差量 LLM 抽取(异步)。""" """构建实体 + 差量 LLM 抽取(异步)。"""
entities = EntityBuilder().build(doc) entities = EntityBuilder().build(doc)
spec = _compute_missing_spec(entities, ruleset.extract.entities) spec = _compute_missing_spec(entities, ruleset.extract.entities)
if spec: if spec and llm is not None:
llm_vals = await FieldExtractor(llm).extract_missing_async(doc, spec) llm_vals = await FieldExtractor(llm).extract_missing_async(doc, spec)
_merge_llm_into_entities(entities, llm_vals) _merge_llm_into_entities(entities, llm_vals)
return entities return entities
@@ -174,7 +174,7 @@ def audit_file(
""" """
docx_path = Path(docx_path) docx_path = Path(docx_path)
rules_path = Path(rules_path) rules_path = Path(rules_path)
llm = llm_client or LlmClient() llm = llm_client
doc = parse_docx(docx_path) doc = parse_docx(docx_path)
RoleTagger(llm_client=llm).tag(doc) RoleTagger(llm_client=llm).tag(doc)
@@ -210,7 +210,7 @@ async def run(
""" """
file_path = Path(file_path) file_path = Path(file_path)
rules_path = Path(rules_path) rules_path = Path(rules_path)
llm = llm_client or LlmClient() llm = llm_client
_log.info("Govdoc pipeline start: %s", file_path.name) _log.info("Govdoc pipeline start: %s", file_path.name)
@@ -219,18 +219,21 @@ async def run(
_log.info(" parsed: %d paragraphs", len(doc.paragraphs)) _log.info(" parsed: %d paragraphs", len(doc.paragraphs))
# 2. 段落角色标注 # 2. 段落角色标注
RoleTagger(llm_client=llm).tag(doc) if llm is not None:
await RoleTagger(llm_client=llm).tag_async(doc)
else:
RoleTagger(llm_client=None).tag(doc)
# 3. 加载规则 # 3. 加载规则
ruleset = load_rules(rules_path) ruleset = load_rules(rules_path)
_log.info(" rules: %d groups, %d rules", len(ruleset.groups), len(ruleset.all_rules())) _log.info(" rules: %d groups, %d rules", len(ruleset.rules), len(ruleset.all_rules()))
# 4. 实体抽取 (含差量 LLM) # 4. 实体抽取 (含差量 LLM)
entities = await _build_entities_async(doc, ruleset, llm) entities = await _build_entities_async(doc, ruleset, llm)
_log.info(" entities: %d/%d resolved", sum(1 for v in entities.values() if v), len(entities)) _log.info(" entities: %d/%d resolved", sum(1 for v in entities.values() if v), len(entities))
# 5. 规则评估 # 5. 规则评估
findings, outcomes = RuleRunner(llm_client=llm).evaluate( findings, outcomes = await RuleRunner(llm_client=llm).evaluate_async(
ruleset.all_rules(), doc, entities ruleset.all_rules(), doc, entities
) )
_log.info(" evaluated: %d findings from %d rules", len(findings), len(outcomes)) _log.info(" evaluated: %d findings from %d rules", len(findings), len(outcomes))
@@ -36,4 +36,5 @@ class GovdocRuleResult(BaseModel):
# 判定 # 判定
result: Mapped[str] = mapped_column("result", String(32), default="pass", comment="执行结果:pass/fail/skipped/error") result: Mapped[str] = mapped_column("result", String(32), default="pass", comment="执行结果:pass/fail/skipped/error")
skipReason: Mapped[str | None] = mapped_column("skip_reason", Text, comment="跳过原因,仅 skipped/error 时使用")
score: Mapped[float | None] = mapped_column("score", Numeric(10, 2), comment="本条得分") score: Mapped[float | None] = mapped_column("score", Numeric(10, 2), comment="本条得分")
@@ -31,6 +31,7 @@ class GovdocRun(BaseModel):
engineVersion: Mapped[str | None] = mapped_column("engine_version", String(64), comment="引擎版本号") engineVersion: Mapped[str | None] = mapped_column("engine_version", String(64), comment="引擎版本号")
llmProvider: Mapped[str | None] = mapped_column("llm_provider", String(64), comment="LLM 提供商") llmProvider: Mapped[str | None] = mapped_column("llm_provider", String(64), comment="LLM 提供商")
llmModel: Mapped[str | None] = mapped_column("llm_model", String(128), comment="LLM 模型名") llmModel: Mapped[str | None] = mapped_column("llm_model", String(128), comment="LLM 模型名")
rulesPath: Mapped[str | None] = mapped_column("rules_path", String(1024), comment="本次运行使用的规则文件路径")
# 结果汇总 # 结果汇总
totalScore: Mapped[float | None] = mapped_column("total_score", Numeric(10, 2), comment="总分") totalScore: Mapped[float | None] = mapped_column("total_score", Numeric(10, 2), comment="总分")
@@ -31,6 +31,7 @@ class IGovdocService(ABC):
page: int = 1, page: int = 1,
pageSize: int = 20, pageSize: int = 20,
keyword: str | None = None, keyword: str | None = None,
fileExt: str | None = None,
region: str | None = None, region: str | None = None,
status: str | None = None, status: str | None = None,
resultStatus: str | None = None, resultStatus: str | None = None,
@@ -163,6 +163,7 @@ class DocumentServiceImpl(IDocumentService):
root_group_id=resolvedRootGroupId, root_group_id=resolvedRootGroupId,
region=normalizedRegion, region=normalizedRegion,
normalized_name=normalizedName, normalized_name=normalizedName,
file_ext=fileExt,
) )
internalDocumentNo = time.time_ns() internalDocumentNo = time.time_ns()
@@ -2712,16 +2713,25 @@ async def _find_latest_version_candidate(
root_group_id: int | None, root_group_id: int | None,
region: str, region: str,
normalized_name: str, normalized_name: str,
file_ext: str | None = None,
) -> dict | None: ) -> dict | None:
"""Find the latest primary document version candidate by normalized name. """Find the latest primary document version candidate by normalized name + extension."""
ext_clause = ""
ext_params: dict[str, object] = {}
if file_ext:
ext_clause = " AND f.file_ext = :file_ext"
ext_params["file_ext"] = file_ext
Preferred rule: same region + same root group + same normalized name.
Fallback rule: when a root group cannot be resolved, keep the old same-type behavior.
"""
if root_group_id is not None: if root_group_id is not None:
params: dict[str, object] = {
"root_group_id": root_group_id,
"region": region,
"normalized_name": normalized_name,
**ext_params,
}
result = await session.execute( result = await session.execute(
text( text(
""" f"""
SELECT SELECT
d.id AS document_id, d.id AS document_id,
d.version_group_key, d.version_group_key,
@@ -2751,7 +2761,7 @@ async def _find_latest_version_candidate(
WHERE d.region = :region WHERE d.region = :region
AND d.normalized_name = :normalized_name AND d.normalized_name = :normalized_name
AND d.is_latest_version = true AND d.is_latest_version = true
AND d.deleted_at IS NULL AND d.deleted_at IS NULL{ext_clause}
AND COALESCE( AND COALESCE(
CASE CASE
WHEN eg.id IS NULL THEN NULL WHEN eg.id IS NULL THEN NULL
@@ -2764,19 +2774,21 @@ async def _find_latest_version_candidate(
LIMIT 1 LIMIT 1
""" """
), ),
{ params,
"root_group_id": root_group_id,
"region": region,
"normalized_name": normalized_name,
},
) )
row = result.mappings().first() row = result.mappings().first()
if row: if row:
return dict(row) return dict(row)
params = {
"type_id": type_id,
"region": region,
"normalized_name": normalized_name,
**ext_params,
}
result = await session.execute( result = await session.execute(
text( text(
""" f"""
SELECT SELECT
d.id AS document_id, d.id AS document_id,
d.version_group_key, d.version_group_key,
@@ -2791,18 +2803,14 @@ async def _find_latest_version_candidate(
AND f.file_role = 'primary' AND f.file_role = 'primary'
WHERE d.type_id = :type_id WHERE d.type_id = :type_id
AND d.region = :region AND d.region = :region
AND d.normalized_name = :normalized_name AND d.normalized_name = :normalized_name{ext_clause}
AND d.is_latest_version = true AND d.is_latest_version = true
AND d.deleted_at IS NULL AND d.deleted_at IS NULL
ORDER BY d.version_no DESC, d.id DESC ORDER BY d.version_no DESC, d.id DESC
LIMIT 1 LIMIT 1
""" """
), ),
{ params,
"type_id": type_id,
"region": region,
"normalized_name": normalized_name,
},
) )
row = result.mappings().first() row = result.mappings().first()
return dict(row) if row else None return dict(row) if row else None
File diff suppressed because it is too large Load Diff
+546
View File
@@ -0,0 +1,546 @@
metadata:
type_id: govdoc_general
name: 内部公文通用规则
version: "0.1.0"
source: 公文文稿常见错误汇编(第一期)·2025-11
description: 基于旧内部公文正式规则语义整理的当前平台规则集。
extract:
# 8 个内置实体(title / doc_number / recipient / date /
# signature / attachments / wenzhong / issuer)由代码自动产出。
entities: []
rules:
- group: 标题(错误汇编 一)
rules:
- rule_id: GW-T-001
name: 标题文种合规性
severity: error
category: 标题
target: title
on_missing: fail
stages:
- check: ai
prompt: |
审查公文标题是否符合规范。
标题:{{title.text}}
15 种合法文种:决议、决定、命令(令)、公报、公告、通告、意见、
通知、通报、报告、请示、批复、议案、函、纪要
检查要点:
1. 是否使用了合法文种
2. 方案/规划/办法/细则等是否以"通知"形式下发(应为"关于印发〈xxx〉的通知")
3. 标题中是否有"印发"等动词
messages:
pass: 标题文种合规
fail: 标题文种不合规
- rule_id: GW-T-002
name: 标题不可有"请求"+"请示"重复
severity: error
category: 标题
target: title
on_missing: skip
stages:
- check: regex_forbid
pattern: '关于请求.*的请示'
messages:
pass: ok
fail: '"请示"已包含"请求"之意,应删去"请求"'
- rule_id: GW-T-003
name: 标题不可有"上报"+"报告"重复
severity: error
category: 标题
target: title
on_missing: skip
stages:
- check: regex_forbid
pattern: '关于上报.*的报告'
messages:
pass: ok
fail: '"报告"已包含"上报"之意,应删去"上报"'
- rule_id: GW-T-004
name: 标题介词连用
severity: warning
category: 标题
target: title
on_missing: skip
stages:
- check: regex_forbid
pattern: '关于对.*的(批复|通知|通报)'
messages:
pass: ok
fail: '"关于"+"对" 介词连用不规范'
- rule_id: GW-T-005
name: 标题文种白名单
severity: error
category: 文种
target: wenzhong
on_missing: skip
stages:
- check: wenzhong_whitelist
messages:
pass: 文种合规
fail: 非法定文种(出现"工作情况""汇报""方案""办法"等当文种)
- rule_id: GW-T-006
name: 标题回行词意完整
severity: warning
category: 标题
target: title
on_missing: skip
stages:
- check: ai
prompt: |
只在标题里**明确出现破词**时才报错。
破词示例:「广东省烟草专卖局关于xx的通知」如果在"专"和"卖"之间有换行 → fail
其它情况(单行标题、合理换行点、词意完整)→ **必须 pass**
判断准则:
- 标题已经是单行字符串,没有明显断点 → pass
- 不要凭直觉揣测,只判断是否能在原文中**逐字定位**破词位置
- 找不到具体破词位置就 pass
标题原文:
{{title.text}}
messages:
pass: 标题回行合规
fail: 标题回行破词
- group: 发文字号(错误汇编 三、六.3)
rules:
- rule_id: GW-N-001
name: 发文字号必须用六角括号
severity: error
category: 发文
target: doc_number
on_missing: fail
stages:
- check: forbid_chars
chars: ["[", "]"]
messages:
pass: ok
fail: 发文字号年份应用六角括号「〔〕」,不得使用方括号
- rule_id: GW-N-002
name: 发文字号不可加"第"字
severity: error
category: 发文
target: doc_number
on_missing: fail
stages:
- check: regex_forbid
pattern: '\d{4}〕第\d+号'
messages:
pass: ok
fail: 发文字号顺序号前不应加"第"字
- rule_id: GW-N-003
name: 发文字号顺序号不编虚位
severity: error
category: 发文
target: doc_number
on_missing: fail
stages:
- check: regex_forbid
# The sequence number follows the closing hexagonal bracket of the year
# (e.g. "〔2023〕02号"), so the bracket must be part of the pattern; without
# it the leading zero would have to follow the year digits directly and a
# padded number would never be detected. Mirrors the anchor used by GW-N-002.
pattern: '\d{4}〕0\d+号'
messages:
pass: ok
fail: 发文字号顺序号不编虚位(如"02号"应为"2号")
- group: 格式(错误汇编 二)
rules:
- rule_id: GW-F-001
name: 主标题用方正小标宋简体二号
severity: error
category: 格式
target: title
on_missing: fail
stages:
- check: font
expect:
eastasia: 方正小标宋简体
size_pt: 22
messages:
pass: ok
fail: 主标题应使用方正小标宋简体二号
- rule_id: GW-F-002
name: 一级标题用黑体三号
severity: error
category: 格式
applies_to:
role: heading_1
on_missing: skip
stages:
- check: font
expect:
eastasia: 黑体
size_pt: 16
messages:
pass: ok
fail: 一级标题应使用黑体三号
- rule_id: GW-F-003
name: 二级标题用楷体三号
severity: error
category: 格式
applies_to:
role: heading_2
on_missing: skip
stages:
- check: font
expect:
eastasia: 楷体
size_pt: 16
messages:
pass: ok
fail: 二级标题应使用楷体三号
- rule_id: GW-F-004
name: 正文用仿宋三号
severity: warning
category: 格式
applies_to:
role: body
on_missing: skip
stages:
- check: font
expect:
eastasia: 仿宋
size_pt: 16
messages:
pass: ok
fail: 正文应使用仿宋(GB2312)三号
- rule_id: GW-F-005
name: 附件后不加冒号
severity: error
category: 格式
applies_to:
role: attachment_marker
on_missing: skip
stages:
- check: regex_forbid
# Only a numbered marker that is followed by a colon is an error. The bare
# marker "附件1" is the correct form and must not be flagged, so the colon
# (full-width or half-width) is required for the pattern to match.
pattern: '^附件\d+\s*[::]'
messages:
pass: ok
fail: '"附件1"等字样后不应加冒号'
- rule_id: GW-F-006
name: 不使用"(此页无正文)"
severity: warning
category: 格式
applies_to:
role: any
on_missing: skip
stages:
- check: forbid_phrase
phrases:
- (此页无正文)
- (此页无正文)
messages:
pass: ok
fail: 应通过编辑排版避免出现"(此页无正文)"
- rule_id: GW-F-007
name: 附件项末尾不加标点
severity: warning
category: 格式
applies_to:
role: any
on_missing: skip
stages:
- check: cross_role
rules:
- type: attachment_item_no_trailing_punct
messages:
pass: ok
fail: 附件名称(内容)后不应使用标点符号
- rule_id: GW-F-008
name: 三级标题用仿宋三号
severity: warning
category: 格式
applies_to:
role: heading_3
on_missing: skip
stages:
- check: font
expect:
eastasia: 仿宋
size_pt: 16
messages:
pass: ok
fail: 三级标题应使用仿宋(GB2312)三号
- rule_id: GW-F-009
name: 四级标题用仿宋三号
severity: warning
category: 格式
applies_to:
role: heading_4
on_missing: skip
stages:
- check: font
expect:
eastasia: 仿宋
size_pt: 16
messages:
pass: ok
fail: 四级标题应使用仿宋(GB2312)三号
- rule_id: GW-F-010
name: 附件标记用黑体三号不加粗
severity: error
category: 格式
applies_to:
role: attachment_marker
on_missing: skip
stages:
- check: attachment_marker_style
expect:
eastasia: 黑体
size_pt: 16
bold: false
messages:
pass: ok
fail: '"附件:"或"附件1"等标记应使用黑体三号,且不加粗'
- group: 层级序号(错误汇编 四)
rules:
- rule_id: GW-H-001
name: 层级序号格式
severity: error
category: 层级
applies_to:
role: any
on_missing: skip
stages:
- check: hierarchy
# Forbidden numbering forms:
#   1. a "一、" style heading whose text ends with 、 or 。
#   2. an Arabic numeral followed by 、 (e.g. "1、")
#   3. a bracketed numeral followed by 、 (e.g. "(一)、")
# In form 3 the brackets are matched literally, in both full-width and
# half-width variants. Written as a bare (...) they would be a regex group
# and the pattern would reject the legitimate "一、" heading.
forbid_patterns:
- '^[一二三四五六七八九十]+、.*[、。]$'
- '^\d+、'
- '^[((][一二三四五六七八九十]+[))]、'
messages:
pass: ok
fail: 层级序号格式错误
- rule_id: GW-H-002
name: 二级标题换行不带句号
severity: warning
category: 层级
applies_to:
role: heading_2
on_missing: skip
stages:
- check: cross_role
rules:
- type: h2_no_period_then_break
messages:
pass: ok
fail: 二级标题在换行分段时不应使用句号
- group: 标点符号(错误汇编 六)
rules:
- rule_id: GW-P-001
name: 多书名号/引号并列不加顿号
severity: warning
category: 标点
applies_to:
role: any
on_missing: skip
stages:
- check: punctuation
rules:
- type: no_dunhao_between_quotes
messages:
pass: ok
fail: 多个书名号/引号并列时不应用顿号分隔
- rule_id: GW-P-002
name: 句内括号末尾不加标点
severity: warning
category: 标点
applies_to:
role: any
on_missing: skip
stages:
- check: punctuation
rules:
- type: no_punct_inside_inline_paren
messages:
pass: ok
fail: 句内括号行文末尾通常不应含标点
- rule_id: GW-P-003
name: 引号嵌套不规范
severity: warning
category: 标点
applies_to:
role: any
on_missing: skip
stages:
- check: punctuation
rules:
- type: no_outer_quote_when_inner_quote
messages:
pass: ok
fail: 双引号内已含单引号强调时,外层不应再加双引号(如"卓'粤'创一流"应为 卓"粤"创一流)
- group: 文字表述与提法(错误汇编 七、八、九)
rules:
- rule_id: GW-W-001
name: 易混淆词使用
severity: warning
category: 文字
applies_to:
role: any
on_missing: skip
stages:
- check: confused_pair
pairs:
- wrong: 截至到
correct: 截止到
reason: '"截至" 已含"到"之意'
- wrong: 下称
correct: 以下简称
reason: 标注简称应用"以下简称"
- wrong_pattern: '截止\d{4}年'
suggest: 截至YYYY年
reason: 用于到某时点应为"截至"
messages:
pass: ok
fail: 易混淆词使用不当
- rule_id: GW-W-002
name: 简称使用规范
severity: warning
category: 简称
applies_to:
role: body
on_missing: skip
stages:
- check: ai
prompt: |
只在文中出现以下两种省级职务简称错误时才报错,否则一律 pass:
- "X省省委书记" 错误(应为 "X省委书记",省字不重复)
- "X省长" 错误(应为 "X省省长",省字不可省略)
若文中没有"省委书记"或"省长"等省级职务字样,**必须 pass**。
若不能在文中找到准确的错误原文,**必须 pass**。
不要做语气、措辞、其它简称的检查。
全文片段:
{{paragraphs[0]}}
messages:
pass: 简称规范
fail: 简称使用不规范
- rule_id: GW-W-003
name: 成文日期用阿拉伯数字
severity: error
category: 提法
target: date
on_missing: fail
stages:
- check: regex_forbid
pattern: '[一二三四五六七八九十○〇零]+年'
messages:
pass: ok
fail: 成文日期应使用阿拉伯数字(如"2023年10月9日")
- rule_id: GW-W-004
name: 成文日期不编虚位
severity: warning
category: 提法
target: date
on_missing: fail
stages:
- check: regex_forbid
pattern: '\d{4}年0\d月|\d{4}年\d{1,2}月0\d日'
messages:
pass: ok
fail: 成文日期月、日不编虚位
- group: 发文机关(错误汇编 十)
rules:
- rule_id: GW-S-001
name: 发文机关署名不能用简称
severity: error
category: 机关
target: signature
on_missing: fail
stages:
- check: ai
prompt: |
判断署名是否含**明确的简称错误**。
典型错误:
- "广东省烟草专卖局(公司)" — 用括号缩短两个机关 → 错
- "省局" / "粤烟" 等单独缩写 → 错
典型正确:
- "广东省烟草专卖局" 单独出现 → pass(即使可能存在配套总公司,但单独存在不算简称)
- "广东省烟草专卖局 中国烟草总公司广东省公司" → pass
判断准则:
- 若署名是一个完整、官方、可独立成立的机关名 → **必须 pass**
- 若署名带"(公司)"、"省局"、明显缩写、行业内部代号 → fail
署名原文:
{{signature.text}}
messages:
pass: 署名规范
fail: 发文机关署名使用了简称
- rule_id: GW-S-002
name: 发文机关确定严谨性
severity: warning
category: 机关
target: signature
on_missing: fail
stages:
- check: ai
prompt: |
只判断**这一个明确条件**:
- 标题或正文里明确涉及"党组""党的xx工作""组织部""纪委"等党务事项,
但署名是行政机关(局/公司/委员会等),未署"党组"或党务机构 → fail
- 其它情况(行政事务、缺乏证据、性质模糊)→ **必须 pass**
判断时需要看到**明确的党务关键词**(党组/党委/党的xx会议/党风/反腐倡廉等),
没有这些关键词就 pass。
署名原文:{{signature.text}}
标题:{{title.text}}
messages:
pass: 发文机关一致
fail: 发文机关与文稿性质不一致
- group: 标题字体(target 通道示例)
rules:
- rule_id: GW-T-008
name: 标题字体(语义实体通道)
severity: warning
category: 标题
target: title
on_missing: warn
stages:
- check: ai
prompt: |
判断公文标题的字体与字号是否合规。
要求:字体 = 方正小标宋简体;字号 = 22pt(或 22.0)。
实际:
- 标题:{{title.text}}
- 字体:{{title.style.font_eastasia}}
- 字号:{{title.style.font_size_pt}}pt
若实际字体为空或与要求一致 → pass
若字体明显不符(例如 仿宋/楷体/黑体)→ fail
若仅字号轻微差异 → warn
messages:
pass: 标题字体字号合规
fail: 标题字体或字号不符合 GB/T 9704
+11 -1
View File
@@ -33,6 +33,7 @@ CREATE TABLE IF NOT EXISTS public.govdoc_runs (
engine_version VARCHAR(64), engine_version VARCHAR(64),
llm_provider VARCHAR(64), llm_provider VARCHAR(64),
llm_model VARCHAR(128), llm_model VARCHAR(128),
rules_path VARCHAR(1024),
-- 结果汇总 -- 结果汇总
total_score NUMERIC(10, 2), total_score NUMERIC(10, 2),
@@ -52,6 +53,9 @@ CREATE TABLE IF NOT EXISTS public.govdoc_runs (
deleted_at TIMESTAMPTZ DEFAULT NULL deleted_at TIMESTAMPTZ DEFAULT NULL
); );
ALTER TABLE public.govdoc_runs
ADD COLUMN IF NOT EXISTS rules_path VARCHAR(1024);
COMMENT ON TABLE public.govdoc_runs IS '公文审查运行主表'; COMMENT ON TABLE public.govdoc_runs IS '公文审查运行主表';
COMMENT ON COLUMN public.govdoc_runs.id IS '自增主键'; COMMENT ON COLUMN public.govdoc_runs.id IS '自增主键';
COMMENT ON COLUMN public.govdoc_runs.document_id IS '关联 leaudit_documents.id'; COMMENT ON COLUMN public.govdoc_runs.document_id IS '关联 leaudit_documents.id';
@@ -65,6 +69,7 @@ COMMENT ON COLUMN public.govdoc_runs.phase IS '当前阶段:parsing/executing/
COMMENT ON COLUMN public.govdoc_runs.engine_version IS '引擎版本号'; COMMENT ON COLUMN public.govdoc_runs.engine_version IS '引擎版本号';
COMMENT ON COLUMN public.govdoc_runs.llm_provider IS 'LLM 提供商'; COMMENT ON COLUMN public.govdoc_runs.llm_provider IS 'LLM 提供商';
COMMENT ON COLUMN public.govdoc_runs.llm_model IS 'LLM 模型名'; COMMENT ON COLUMN public.govdoc_runs.llm_model IS 'LLM 模型名';
COMMENT ON COLUMN public.govdoc_runs.rules_path IS '本次运行使用的规则文件路径';
COMMENT ON COLUMN public.govdoc_runs.total_score IS '总分'; COMMENT ON COLUMN public.govdoc_runs.total_score IS '总分';
COMMENT ON COLUMN public.govdoc_runs.passed_count IS '通过规则数'; COMMENT ON COLUMN public.govdoc_runs.passed_count IS '通过规则数';
COMMENT ON COLUMN public.govdoc_runs.failed_count IS '未通过规则数'; COMMENT ON COLUMN public.govdoc_runs.failed_count IS '未通过规则数';
@@ -106,6 +111,7 @@ CREATE TABLE IF NOT EXISTS public.govdoc_rule_results (
-- 判定 -- 判定
result VARCHAR(32) NOT NULL DEFAULT 'pass', result VARCHAR(32) NOT NULL DEFAULT 'pass',
skip_reason TEXT,
score NUMERIC(10, 2), score NUMERIC(10, 2),
created_at TIMESTAMPTZ NOT NULL DEFAULT NOW(), created_at TIMESTAMPTZ NOT NULL DEFAULT NOW(),
@@ -113,6 +119,9 @@ CREATE TABLE IF NOT EXISTS public.govdoc_rule_results (
deleted_at TIMESTAMPTZ DEFAULT NULL deleted_at TIMESTAMPTZ DEFAULT NULL
); );
ALTER TABLE public.govdoc_rule_results
ADD COLUMN IF NOT EXISTS skip_reason TEXT;
COMMENT ON TABLE public.govdoc_rule_results IS '公文规则执行结果明细表'; COMMENT ON TABLE public.govdoc_rule_results IS '公文规则执行结果明细表';
COMMENT ON COLUMN public.govdoc_rule_results.id IS '自增主键'; COMMENT ON COLUMN public.govdoc_rule_results.id IS '自增主键';
COMMENT ON COLUMN public.govdoc_rule_results.run_id IS '关联 govdoc_runs.id'; COMMENT ON COLUMN public.govdoc_rule_results.run_id IS '关联 govdoc_runs.id';
@@ -129,6 +138,7 @@ COMMENT ON COLUMN public.govdoc_rule_results.paragraph_index IS '段落索引';
COMMENT ON COLUMN public.govdoc_rule_results.paragraph_text IS '段落原文'; COMMENT ON COLUMN public.govdoc_rule_results.paragraph_text IS '段落原文';
COMMENT ON COLUMN public.govdoc_rule_results.location_path IS '文档结构位置路径'; COMMENT ON COLUMN public.govdoc_rule_results.location_path IS '文档结构位置路径';
COMMENT ON COLUMN public.govdoc_rule_results.result IS '执行结果:pass/fail/skipped/error'; COMMENT ON COLUMN public.govdoc_rule_results.result IS '执行结果:pass/fail/skipped/error';
COMMENT ON COLUMN public.govdoc_rule_results.skip_reason IS '跳过原因,仅 skipped/error 时使用';
COMMENT ON COLUMN public.govdoc_rule_results.score IS '本条得分'; COMMENT ON COLUMN public.govdoc_rule_results.score IS '本条得分';
CREATE INDEX IF NOT EXISTS idx_govdoc_rule_results_run_id ON public.govdoc_rule_results(run_id) WHERE deleted_at IS NULL; CREATE INDEX IF NOT EXISTS idx_govdoc_rule_results_run_id ON public.govdoc_rule_results(run_id) WHERE deleted_at IS NULL;
@@ -195,4 +205,4 @@ END $$;
-- 为 engine_type 加索引,方便按模块过滤文档列表 -- 为 engine_type 加索引,方便按模块过滤文档列表
CREATE INDEX IF NOT EXISTS idx_leaudit_documents_engine_type ON public.leaudit_documents(engine_type) WHERE deleted_at IS NULL; CREATE INDEX IF NOT EXISTS idx_leaudit_documents_engine_type ON public.leaudit_documents(engine_type) WHERE deleted_at IS NULL;
COMMIT; COMMIT;