feat: integrate govdoc module into leaudit platform

2026-05-17 19:24:16 +08:00
parent cb13e61d3d
commit a73826dc1d
16 changed files with 2334 additions and 280 deletions
@@ -1,30 +1,12 @@
 """Govdoc 公文格式审查引擎内核。

-从旧 govdoc-audit 项目裁剪迁入，去除独立 API 层、SQLite 存储层、
-本地运行记录器 (RunRecorder) 和旧配置系统。
-
-导出:
-  - pipeline.run() — 异步审查入口 (bridge 层主调用)
-  - pipeline.audit_file() — 同步审查入口 (兼容)
-  - models — 核心数据模型 (Pydantic)
-  - parser — 文档解析与实体抽取
-  - dsl — YAML 规则 DSL 定义与加载
-  - engine — 规则执行引擎与结果模型
-  - reporter — 报告生成 (HTML/DOCX/JSON)
-  - llm — LLM 客户端 (OpenAI 兼容协议)
+保持包级导入轻量，避免在控制器注册阶段提前拉起 LLM/OpenAI 依赖。
+真正执行审查时再按需导入 pipeline / result 模块。
 """

 from __future__ import annotations

-from fastapi_modules.fastapi_leaudit.govdoc_engine.pipeline import (
-    audit_file,
-    run,
-)
-from fastapi_modules.fastapi_leaudit.govdoc_engine.engine.result import (
-    AuditResult,
-    AuditSummary,
-    CheckedRule,
-)
+from typing import Any

 __all__ = [
    "audit_file",
@@ -33,3 +15,31 @@ __all__ = [
    "AuditSummary",
    "CheckedRule",
 ]
+
+
+def audit_file(*args: Any, **kwargs: Any):
+    """Forward a synchronous audit call to ``pipeline.audit_file``.
+
+    The pipeline module is imported inside the function body rather than at
+    package import time, so importing this package alone does not import it.
+    All positional and keyword arguments are passed through unchanged, and
+    whatever the pipeline function returns is returned as-is.
+    """
+    from fastapi_modules.fastapi_leaudit.govdoc_engine.pipeline import (
+        audit_file as _impl,
+    )
+
+    outcome = _impl(*args, **kwargs)
+    return outcome
+
+
+async def run(*args: Any, **kwargs: Any):
+    """Forward an asynchronous audit call to ``pipeline.run``.
+
+    The pipeline module is imported on first use inside the function body,
+    keeping the package import itself free of that dependency. Arguments are
+    passed through unchanged and the awaited result is returned as-is.
+    """
+    from fastapi_modules.fastapi_leaudit.govdoc_engine.pipeline import (
+        run as _impl,
+    )
+
+    pending = _impl(*args, **kwargs)
+    return await pending
+
+
+def __getattr__(name: str):
+    """Resolve the result-model exports lazily (PEP 562 module hook).
+
+    Python calls this only when normal module attribute lookup fails.
+    ``AuditResult``, ``AuditSummary`` and ``CheckedRule`` are imported from
+    ``engine.result`` on first access and then stored in the module
+    namespace, so later lookups no longer reach this function.
+
+    Raises:
+        AttributeError: for any other name, using the same message format
+            as Python's built-in module attribute error.
+    """
+    if name not in {"AuditResult", "AuditSummary", "CheckedRule"}:
+        raise AttributeError(f"module {__name__!r} has no attribute {name!r}")
+
+    from fastapi_modules.fastapi_leaudit.govdoc_engine.engine.result import (
+        AuditResult,
+        AuditSummary,
+        CheckedRule,
+    )
+
+    lazy_exports = {
+        "AuditResult": AuditResult,
+        "AuditSummary": AuditSummary,
+        "CheckedRule": CheckedRule,
+    }
+    # Cache on the module so repeated access bypasses this hook entirely.
+    globals().update(lazy_exports)
+    return lazy_exports[name]
@@ -11,7 +11,22 @@ import re
 import time
 from typing import Any

-from openai import AsyncOpenAI, OpenAI, APIError, APIConnectionError, RateLimitError
+# ``openai`` is handled as an optional dependency: a missing package must not
+# make importing this module fail. Only ModuleNotFoundError is caught here;
+# any other import failure still propagates.
+try:
+    from openai import AsyncOpenAI, OpenAI, APIError, APIConnectionError, RateLimitError
+    _OPENAI_IMPORT_ERROR: Exception | None = None
+except ModuleNotFoundError as exc:  # pragma: no cover - optional dependency
+    # The client classes become None so availability can be tested with ``is None``.
+    AsyncOpenAI = None  # type: ignore[assignment]
+    OpenAI = None  # type: ignore[assignment]
+    # Keep the original import failure around for later inspection.
+    _OPENAI_IMPORT_ERROR = exc
+
+    # Stand-in exception types carrying the same names as the openai ones.
+    # NOTE(review): presumably these keep ``except`` clauses elsewhere in this
+    # module valid when openai is absent - confirm against the rest of the file.
+    class APIError(Exception):
+        status_code: int | None = None
+
+    class APIConnectionError(Exception):
+        pass
+
+    class RateLimitError(Exception):
+        pass

 from fastapi_admin.config import (
    LLM_API_KEY,
@@ -125,7 +140,13 @@ class LlmClient:
    ):
        key = api_key or LLM_API_KEY
        self._misconfigured_error: LlmConfigError | None = None
-        if not key:
+        if OpenAI is None or AsyncOpenAI is None:
+            self._client = None
+            self._aclient = None
+            self._misconfigured_error = LlmConfigError(
+                "python package 'openai' is not installed; govdoc LLM features are unavailable."
+            )
+        elif not key:
            self._client = None
            self._aclient = None
            self._misconfigured_error = LlmConfigError(
@@ -130,12 +130,12 @@ def _merge_llm_into_entities(
 # ── 实体构建 (同步，供 sync 入口使用) ──────────────────

 def _build_entities(
-    doc, ruleset: RuleSet, llm: LlmClient,
+    doc, ruleset: RuleSet, llm: LlmClient | None,
 ) -> dict[str, SemanticEntity | None]:
     """构建实体 + 差量 LLM 抽取（同步）。"""
+    # Synchronous entity construction: build entities from the parsed
+    # document, then fill only the still-missing ones through the LLM.
+    # Returns the entity mapping produced by EntityBuilder, updated in place
+    # by the merge step when LLM extraction runs.
     entities = EntityBuilder().build(doc)
+    # Work out which entities requested by the ruleset still need extraction.
     spec = _compute_missing_spec(entities, ruleset.extract.entities)
-    if spec:
+    # LLM extraction is skipped when nothing is missing or no client was given.
+    if spec and llm is not None:
         llm_vals = FieldExtractor(llm).extract_missing(doc, spec)
         _merge_llm_into_entities(entities, llm_vals)
     return entities
@@ -144,12 +144,12 @@ def _build_entities(
 # ── 实体构建 (异步，供 async 入口使用) ──────────────────

 async def _build_entities_async(
-    doc, ruleset: RuleSet, llm: LlmClient,
+    doc, ruleset: RuleSet, llm: LlmClient | None,
 ) -> dict[str, SemanticEntity | None]:
     """构建实体 + 差量 LLM 抽取（异步）。"""
+    # Asynchronous entity construction: build entities from the parsed
+    # document, then await LLM extraction for only the still-missing ones.
+    # Returns the entity mapping produced by EntityBuilder, updated in place
+    # by the merge step when LLM extraction runs.
     entities = EntityBuilder().build(doc)
+    # Work out which entities requested by the ruleset still need extraction.
     spec = _compute_missing_spec(entities, ruleset.extract.entities)
-    if spec:
+    # LLM extraction is skipped when nothing is missing or no client was given.
+    if spec and llm is not None:
         llm_vals = await FieldExtractor(llm).extract_missing_async(doc, spec)
         _merge_llm_into_entities(entities, llm_vals)
     return entities
@@ -174,7 +174,7 @@ def audit_file(
    """
    docx_path = Path(docx_path)
    rules_path = Path(rules_path)
-    llm = llm_client or LlmClient()
+    llm = llm_client

    doc = parse_docx(docx_path)
    RoleTagger(llm_client=llm).tag(doc)
@@ -210,7 +210,7 @@ async def run(
    """
    file_path = Path(file_path)
    rules_path = Path(rules_path)
-    llm = llm_client or LlmClient()
+    llm = llm_client

    _log.info("Govdoc pipeline start: %s", file_path.name)

@@ -219,18 +219,21 @@ async def run(
    _log.info("  parsed: %d paragraphs", len(doc.paragraphs))

    # 2. 段落角色标注
-    RoleTagger(llm_client=llm).tag(doc)
+    if llm is not None:
+        await RoleTagger(llm_client=llm).tag_async(doc)
+    else:
+        RoleTagger(llm_client=None).tag(doc)

    # 3. 加载规则
    ruleset = load_rules(rules_path)
-    _log.info("  rules: %d groups, %d rules", len(ruleset.groups), len(ruleset.all_rules()))
+    _log.info("  rules: %d groups, %d rules", len(ruleset.rules), len(ruleset.all_rules()))

    # 4. 实体抽取 (含差量 LLM)
    entities = await _build_entities_async(doc, ruleset, llm)
    _log.info("  entities: %d/%d resolved", sum(1 for v in entities.values() if v), len(entities))

    # 5. 规则评估
-    findings, outcomes = RuleRunner(llm_client=llm).evaluate(
+    findings, outcomes = await RuleRunner(llm_client=llm).evaluate_async(
        ruleset.all_rules(), doc, entities
    )
    _log.info("  evaluated: %d findings from %d rules", len(findings), len(outcomes))