feat: integrate govdoc module into leaudit platform

This commit is contained in:
wren
2026-05-17 19:24:16 +08:00
parent cb13e61d3d
commit a73826dc1d
16 changed files with 2334 additions and 280 deletions
@@ -130,12 +130,12 @@ def _merge_llm_into_entities(
# ── 实体构建 (同步,供 sync 入口使用) ──────────────────
def _build_entities(
    doc, ruleset: RuleSet, llm: LlmClient | None,
) -> dict[str, SemanticEntity | None]:
    """Build entities, topping up missing fields via the LLM (synchronous).

    Entities are first built from ``doc`` with ``EntityBuilder``. The spec of
    fields still missing relative to ``ruleset.extract.entities`` is then
    computed; only when that spec is non-empty *and* an LLM client was
    supplied are the missing values requested and merged.

    Args:
        doc: Document object handed to ``EntityBuilder.build`` and to the
            field extractor.
        ruleset: Rule set whose ``extract.entities`` describes the wanted
            entities.
        llm: LLM client used for the incremental extraction, or ``None`` to
            skip the LLM step.

    Returns:
        The entity mapping produced by ``EntityBuilder`` (the merge helper's
        return value is not used, so it presumably updates the mapping in
        place — confirm against ``_merge_llm_into_entities``).
    """
    entities = EntityBuilder().build(doc)
    spec = _compute_missing_spec(entities, ruleset.extract.entities)
    # No client means nothing to ask: keep the rule-based result as is.
    if spec and llm is not None:
        llm_vals = FieldExtractor(llm).extract_missing(doc, spec)
        _merge_llm_into_entities(entities, llm_vals)
    return entities
@@ -144,12 +144,12 @@ def _build_entities(
# ── 实体构建 (异步,供 async 入口使用) ──────────────────
async def _build_entities_async(
    doc, ruleset: RuleSet, llm: LlmClient | None,
) -> dict[str, SemanticEntity | None]:
    """Build entities, topping up missing fields via the LLM (asynchronous).

    Entities are first built from ``doc`` with ``EntityBuilder``. The spec of
    fields still missing relative to ``ruleset.extract.entities`` is then
    computed; only when that spec is non-empty *and* an LLM client was
    supplied is the extraction awaited and its values merged.

    Args:
        doc: Document object handed to ``EntityBuilder.build`` and to the
            field extractor.
        ruleset: Rule set whose ``extract.entities`` describes the wanted
            entities.
        llm: LLM client used for the incremental extraction, or ``None`` to
            skip the LLM step.

    Returns:
        The entity mapping produced by ``EntityBuilder`` (the merge helper's
        return value is not used, so it presumably updates the mapping in
        place — confirm against ``_merge_llm_into_entities``).
    """
    entities = EntityBuilder().build(doc)
    spec = _compute_missing_spec(entities, ruleset.extract.entities)
    # No client means nothing to ask: keep the rule-based result as is.
    if spec and llm is not None:
        llm_vals = await FieldExtractor(llm).extract_missing_async(doc, spec)
        _merge_llm_into_entities(entities, llm_vals)
    return entities
@@ -174,7 +174,7 @@ def audit_file(
"""
docx_path = Path(docx_path)
rules_path = Path(rules_path)
llm = llm_client or LlmClient()
llm = llm_client
doc = parse_docx(docx_path)
RoleTagger(llm_client=llm).tag(doc)
@@ -210,7 +210,7 @@ async def run(
"""
file_path = Path(file_path)
rules_path = Path(rules_path)
llm = llm_client or LlmClient()
llm = llm_client
_log.info("Govdoc pipeline start: %s", file_path.name)
@@ -219,18 +219,21 @@ async def run(
_log.info(" parsed: %d paragraphs", len(doc.paragraphs))
# 2. 段落角色标注
RoleTagger(llm_client=llm).tag(doc)
if llm is not None:
await RoleTagger(llm_client=llm).tag_async(doc)
else:
RoleTagger(llm_client=None).tag(doc)
# 3. 加载规则
ruleset = load_rules(rules_path)
_log.info(" rules: %d groups, %d rules", len(ruleset.groups), len(ruleset.all_rules()))
_log.info(" rules: %d groups, %d rules", len(ruleset.rules), len(ruleset.all_rules()))
# 4. 实体抽取 (含差量 LLM)
entities = await _build_entities_async(doc, ruleset, llm)
_log.info(" entities: %d/%d resolved", sum(1 for v in entities.values() if v), len(entities))
# 5. 规则评估
findings, outcomes = RuleRunner(llm_client=llm).evaluate(
findings, outcomes = await RuleRunner(llm_client=llm).evaluate_async(
ruleset.all_rules(), doc, entities
)
_log.info(" evaluated: %d findings from %d rules", len(findings), len(outcomes))