feat(govdoc): 新增内部公文模块全链路(后端58+前端11文件)
This commit is contained in:
@@ -0,0 +1,24 @@
|
||||
"""Check 原语注册中心:通过 register 装饰器收集,runner 通过 get 查找。"""
|
||||
|
||||
from __future__ import annotations
|
||||
from fastapi_modules.fastapi_leaudit.govdoc_engine.engine.checks.base import CheckBase
|
||||
|
||||
# Maps a check name to the CheckBase subclass registered under it.
# Written by the `register` decorator; read by `get_check` and `all_checks`.
_REGISTRY: dict[str, type[CheckBase]] = {}
|
||||
|
||||
|
||||
def register(name: str):
    """Return a class decorator that registers the decorated class as *name*.

    The decorator writes *name* to the class's ``name`` attribute, stores the
    class in ``_REGISTRY`` (replacing any entry already stored under the same
    name) and returns the class itself.
    """

    def _bind(check_cls):
        setattr(check_cls, "name", name)
        _REGISTRY.update({name: check_cls})
        return check_cls

    return _bind
|
||||
|
||||
|
||||
def get_check(name: str) -> type[CheckBase]:
    """Return the check class registered under *name*.

    Raises:
        KeyError: when nothing is registered under *name*; the message lists
            every registered name.
    """
    # Registry values are classes, so None can only mean "not registered".
    found = _REGISTRY.get(name)
    if found is None:
        raise KeyError(f"unknown check: {name}; known: {list(_REGISTRY)}")
    return found
|
||||
|
||||
|
||||
def all_checks() -> list[str]:
    """Return a new list holding the name of every registered check."""
    return [check_name for check_name in _REGISTRY]
|
||||
@@ -0,0 +1,151 @@
|
||||
"""LLM 语义检查。三级输出:pass / warn / fail。"""
|
||||
|
||||
import logging
|
||||
import re
|
||||
from typing import Any
|
||||
from pydantic import BaseModel
|
||||
from fastapi_modules.fastapi_leaudit.govdoc_engine.engine.checks import register
|
||||
from fastapi_modules.fastapi_leaudit.govdoc_engine.engine.checks.base import (
|
||||
CheckBase, CheckContext, CheckHit, CheckResult,
|
||||
)
|
||||
from fastapi_modules.fastapi_leaudit.govdoc_engine.llm.client import LlmClient, LlmJsonError, _format_exc
|
||||
|
||||
# Module-level logger named after this module.
_log = logging.getLogger(__name__)


# Output-format instruction that AiCheck appends to every rendered prompt;
# it asks the model for a JSON object with "result", "reason" and "suggestion".
_OUT_FORMAT = """
请以 JSON 输出:
{"result": "pass|warn|fail", "reason": "<简短理由>", "suggestion": "<改进建议;pass 时填空>"}
"""

# Matches a ``{{ key }}`` placeholder; group 1 is the key text between the braces.
_VAR_RE = re.compile(r"\{\{\s*([^}]+?)\s*\}\}")
|
||||
|
||||
|
||||
def _resolve_dot_path(root: Any, path: str) -> str:
|
||||
"""点语法属性访问:title.style.font_eastasia → entities['title'].style.font_eastasia。"""
|
||||
cur: Any = root
|
||||
for seg in path.split("."):
|
||||
if cur is None:
|
||||
return ""
|
||||
if isinstance(cur, dict):
|
||||
cur = cur.get(seg)
|
||||
elif isinstance(cur, BaseModel):
|
||||
cur = getattr(cur, seg, None)
|
||||
else:
|
||||
cur = getattr(cur, seg, None)
|
||||
if cur is None:
|
||||
return ""
|
||||
if isinstance(cur, (dict, list)):
|
||||
return str(cur)
|
||||
return str(cur)
|
||||
|
||||
|
||||
def _interpolate(
    template: str,
    paragraphs: list,
    entities: dict | None = None,
    target: Any = None,
) -> str:
    """Replace every ``{{ key }}`` placeholder in *template*.

    A key is resolved by the first rule that applies:

    1. ``paragraphs[N]`` - text of the N-th paragraph; a non-integer or
       out-of-range index renders as an empty string.
    2. ``<entity>`` or ``<entity>.<dotted.path>`` when the first segment is a
       key of *entities* - the entity's ``text`` or the dotted attribute; an
       entity stored as ``None`` renders as an empty string.
    3. Otherwise the whole key is resolved as a dotted path on *target*
       (when *target* is not ``None``).

    Anything that cannot be resolved renders as an empty string.
    """
    known = entities or {}
    prefix = "paragraphs["

    def _render(match):
        key = match.group(1).strip()

        # 1. paragraphs[N] index
        if key.startswith(prefix):
            try:
                return paragraphs[int(key.removeprefix(prefix).rstrip("]"))].text
            except (ValueError, IndexError):
                return ""

        # 2. entity dot syntax: title.text / title.style.font_eastasia
        first, _sep, remainder = key.partition(".")
        if first in known:
            entity = known[first]
            if entity is None:
                return ""
            if not remainder:
                return entity.text
            return _resolve_dot_path(entity, remainder)

        # 3. implicit target: no entity prefix matched and a target exists
        if target is None:
            return ""
        return _resolve_dot_path(target, key)

    return _VAR_RE.sub(_render, template)
|
||||
|
||||
|
||||
@register("ai")
class AiCheck(CheckBase):
    """LLM-backed semantic check with a three-level verdict: pass / warn / fail.

    The stage prompt is rendered with ``_interpolate``, ``_OUT_FORMAT`` is
    appended, and the JSON reply is converted into a ``CheckResult``.  Any
    exception raised while building the prompt or calling the client yields a
    *skipped* result instead of a finding.
    """

    def __init__(self, llm_client: LlmClient | None = None):
        """Use *llm_client* when given, otherwise build a default ``LlmClient``."""
        self.client = llm_client or LlmClient()

    def _build_prompt(self, ctx: CheckContext) -> str:
        """Render the stage prompt for *ctx* and append the output-format instruction."""
        prompt = _interpolate(
            ctx.stage.prompt or "",
            ctx.paragraphs,
            ctx.entities,
            ctx.target,
        )
        return prompt + "\n\n" + _OUT_FORMAT

    @staticmethod
    def _label(ctx: CheckContext) -> str:
        """Return the label passed to the LLM client for this rule."""
        return f"ai_{ctx.rule_id or 'unknown'}"

    @staticmethod
    def _skipped(exc: Exception) -> CheckResult:
        """Log *exc* and turn it into a skipped (non-failing) result.

        ``LlmJsonError`` gets its own log line and skip reason; every other
        exception is reported as a failed LLM call.
        """
        if isinstance(exc, LlmJsonError):
            _log.warning("AI check skipped (LLM JSON error): %s", _format_exc(exc))
            reason = f"LLM 返回内容无法解析为 JSON:{exc}"
        else:
            _log.warning("AI check skipped (LLM error): %s", _format_exc(exc))
            reason = f"LLM 调用失败:{exc}"
        return CheckResult(passed=True, hits=[], skipped=True, skip_reason=reason)

    def _interpret(self, ctx: CheckContext, resp: dict) -> CheckResult:
        """Convert the model's JSON reply into a ``CheckResult``.

        ``"pass"`` yields a passing result.  Any other verdict yields a single
        hit anchored on the first selected paragraph (or on no paragraph when
        none was selected), with confidence 0.95 for ``"fail"`` and 0.7
        otherwise.  A missing ``result`` key counts as ``"fail"``.
        """
        # Normalise the verdict so replies such as "Pass" or " pass " are not
        # misreported as failures.
        verdict = str(resp.get("result", "fail")).strip().lower()
        reason = resp.get("reason", "")
        suggestion = resp.get("suggestion", "")
        if verdict == "pass":
            return CheckResult(passed=True, hits=[])
        target_p = ctx.paragraphs[0] if ctx.paragraphs else None
        confidence = 0.95 if verdict == "fail" else 0.7
        return CheckResult(passed=False, hits=[CheckHit(
            paragraph=target_p,
            char_start=0,
            char_end=len(target_p.text) if target_p else 0,
            actual={"llm_reason": reason, "llm_suggestion": suggestion},
            expected={},
            message=reason or "LLM 判定不通过",
            confidence=confidence,
        )])

    def run(self, ctx: CheckContext) -> CheckResult:
        """Run the check synchronously through ``client.chat_json``."""
        label = self._label(ctx)
        try:
            resp = self.client.chat_json(
                [{"role": "user", "content": self._build_prompt(ctx)}],
                label=label,
            )
        except Exception as exc:  # deliberate best-effort: an LLM failure must not fail the rule
            return self._skipped(exc)
        return self._interpret(ctx, resp)

    async def run_async(self, ctx: CheckContext) -> CheckResult:
        """Run the check asynchronously through ``client.chat_json_async``."""
        label = self._label(ctx)
        try:
            resp = await self.client.chat_json_async(
                [{"role": "user", "content": self._build_prompt(ctx)}],
                label=label,
            )
        except Exception as exc:  # deliberate best-effort: an LLM failure must not fail the rule
            return self._skipped(exc)
        return self._interpret(ctx, resp)
|
||||
@@ -0,0 +1,48 @@
|
||||
"""Check 原语基类与上下文。"""
|
||||
|
||||
from __future__ import annotations
|
||||
from dataclasses import dataclass, field
|
||||
from typing import Any, TYPE_CHECKING
|
||||
from fastapi_modules.fastapi_leaudit.govdoc_engine.models import Document, Paragraph
|
||||
from fastapi_modules.fastapi_leaudit.govdoc_engine.dsl.schema import RuleStage
|
||||
|
||||
if TYPE_CHECKING:
|
||||
from fastapi_modules.fastapi_leaudit.govdoc_engine.parser.entities import SemanticEntity
|
||||
|
||||
|
||||
@dataclass
class CheckContext:
    """Input handed to a check's ``run`` method."""

    # The parsed document being audited.
    document: Document
    # Paragraphs the check should inspect.
    paragraphs: list[Paragraph]
    # Stage configuration; checks read their own settings from it.
    stage: RuleStage
    # Semantic entities by name; a value may be None.
    entities: dict[str, "SemanticEntity | None"] = field(default_factory=dict)
    # Target entity of the rule, or None when there is none.
    target: "SemanticEntity | None" = None
    # Identifier of the rule this context belongs to.
    rule_id: str = ""
|
||||
|
||||
|
||||
@dataclass
class CheckHit:
    """One violation reported by a check."""

    # Paragraph the hit is anchored on, or None when there is no anchor.
    paragraph: Paragraph | None
    # Character span of the hit inside the paragraph text.
    char_start: int = 0
    char_end: int = 0
    # What was found / what was expected; free-form dictionaries.
    actual: dict[str, Any] | None = None
    expected: dict[str, Any] | None = None
    # Human-readable description of the violation.
    message: str | None = None
    # Confidence attached to the hit; defaults to 1.0.
    confidence: float = 1.0
|
||||
|
||||
|
||||
@dataclass
class CheckResult:
    """Outcome of running one check."""

    # True when the check found no violation.
    passed: bool
    # Violations found by the check.
    hits: list[CheckHit] = field(default_factory=list)
    # True when the check could not be evaluated and was skipped.
    skipped: bool = False
    # Explanation of why the check was skipped.
    skip_reason: str = ""
|
||||
|
||||
|
||||
class CheckBase:
    """Abstract base class of all check primitives."""

    # Name of the check; empty on the base class.
    name: str = ""

    def run(self, ctx: CheckContext) -> CheckResult:
        """Evaluate the check for *ctx*; subclasses must override this."""
        raise NotImplementedError
|
||||
@@ -0,0 +1,34 @@
|
||||
"""易混淆词对(字面 + 正则)。"""
|
||||
|
||||
import re
|
||||
from fastapi_modules.fastapi_leaudit.govdoc_engine.engine.checks import register
|
||||
from fastapi_modules.fastapi_leaudit.govdoc_engine.engine.checks.base import CheckBase, CheckContext, CheckHit, CheckResult
|
||||
|
||||
|
||||
@register("confused_pair")
class ConfusedPairCheck(CheckBase):
    """Flag easily confused wording, given as literal strings or regex patterns.

    Each entry of ``stage.pairs`` is a mapping that may hold ``wrong`` (a
    literal), ``wrong_pattern`` (a regex), ``correct`` or ``suggest`` (the
    replacement) and ``reason``.
    """

    def run(self, ctx: CheckContext) -> CheckResult:
        """Scan every paragraph against every pair.

        When the literal occurs in the paragraph only its first occurrence is
        reported and the pattern is not tried; otherwise every match of the
        pattern (if one is configured) is reported.
        """
        pairs = ctx.stage.pairs or []
        found: list[CheckHit] = []
        for para in ctx.paragraphs:
            body = para.text
            for entry in pairs:
                literal = entry.get("wrong")
                pattern = entry.get("wrong_pattern")
                fix = entry.get("correct") or entry.get("suggest", "")
                why = entry.get("reason", "")

                at = body.find(literal) if literal else -1
                if at >= 0:
                    found.append(CheckHit(
                        paragraph=para,
                        char_start=at,
                        char_end=at + len(literal),
                        actual={"text": literal},
                        expected={"text": fix},
                        message=f"\"{literal}\" 应为 \"{fix}\"。{why}",
                    ))
                    continue

                if not pattern:
                    continue
                found.extend(
                    CheckHit(
                        paragraph=para,
                        char_start=m.start(),
                        char_end=m.end(),
                        actual={"text": m.group(0)},
                        expected={"text": fix},
                        message=f"\"{m.group(0)}\" 应为 \"{fix}\"。{why}",
                    )
                    for m in re.finditer(pattern, body)
                )
        return CheckResult(passed=not found, hits=found)
|
||||
@@ -0,0 +1,69 @@
|
||||
"""跨段关系 check:例如二级标题以句号结尾后又新起一段。"""
|
||||
|
||||
import re
|
||||
from fastapi_modules.fastapi_leaudit.govdoc_engine.engine.checks import register
|
||||
from fastapi_modules.fastapi_leaudit.govdoc_engine.engine.checks.base import CheckBase, CheckContext, CheckHit, CheckResult
|
||||
|
||||
|
||||
# Trailing punctuation of a single attachment item: number (digits + dot) + content + closing punctuation.
_ATTACH_ITEM_TRAIL_PUNCT = re.compile(r"\d+[\..][^\d;。,;,.]+?[;。,;,.]")
# A whole paragraph that consists of one attachment item ending in punctuation.
_ATTACH_ITEM_LINE = re.compile(r"^\d+[\..].+[;。,;,.]\s*$")
|
||||
|
||||
|
||||
@register("cross_role")
class CrossRoleCheck(CheckBase):
    """Checks that relate one paragraph to its neighbours.

    ``stage.rules`` is a list of mappings whose ``type`` selects the rule:

    * ``h2_no_period_then_break`` - a ``heading_2`` paragraph that ends with a
      period and is followed by a non-blank paragraph.
    * ``attachment_item_no_trailing_punct`` - delegated to
      ``_attachment_item_hits``.

    The check scans ``ctx.document.paragraphs`` (the whole document), not
    ``ctx.paragraphs``.
    """

    def run(self, ctx: CheckContext) -> CheckResult:
        """Apply every configured rule and collect the hits."""
        rules = ctx.stage.rules or []
        paras = ctx.document.paragraphs
        hits: list[CheckHit] = []
        for r in rules:
            t = r.get("type")
            if t == "h2_no_period_then_break":
                for i, p in enumerate(paras):
                    if p.role != "heading_2":
                        continue
                    trimmed = p.text.rstrip()
                    if not trimmed.endswith(("。", ".")):
                        continue
                    if i + 1 < len(paras) and paras[i + 1].text.strip():
                        # The period is the last character of the text with
                        # trailing whitespace removed, so anchor the hit there
                        # rather than on the raw end of the paragraph.
                        end = len(trimmed)
                        hits.append(CheckHit(
                            paragraph=p,
                            char_start=end - 1, char_end=end,
                            actual={"text": p.text},
                            message="二级标题在换行分段时不应使用句号;如使用句号则应紧接正文",
                        ))
            elif t == "attachment_item_no_trailing_punct":
                hits.extend(_attachment_item_hits(paras))
        return CheckResult(passed=not hits, hits=hits)
|
||||
|
||||
|
||||
def _attachment_item_hits(paras):
    """Find attachment items that end with punctuation.

    Scanning switches on at a paragraph whose role is ``attachment_marker``
    and switches off at a ``signature``, ``date`` or ``heading_1`` paragraph.
    Blank paragraphs are ignored.  Inside the marker paragraph itself every
    ``_ATTACH_ITEM_TRAIL_PUNCT`` match is reported (several items may share
    one line); in later paragraphs a hit is reported when the whole paragraph
    matches ``_ATTACH_ITEM_LINE``.

    Returns:
        list[CheckHit]: hits whose offsets refer to the unstripped ``p.text``.
    """
    hits: list[CheckHit] = []
    in_attachment = False
    for p in paras:
        text = p.text.strip()
        if not text:
            continue
        # The regexes run on the stripped text; shift by the amount of leading
        # whitespace so offsets stay valid for the original paragraph text.
        lead = len(p.text) - len(p.text.lstrip())

        if p.role == "attachment_marker":
            in_attachment = True
            # The marker paragraph may hold several items on one line.
            for m in _ATTACH_ITEM_TRAIL_PUNCT.finditer(text):
                hits.append(CheckHit(
                    paragraph=p,
                    char_start=lead + m.start(), char_end=lead + m.end(),
                    actual={"snippet": m.group(0)},
                    message=f'附件项末尾不应有标点:"{m.group(0)}"',
                ))
            continue

        if p.role in ("signature", "date", "heading_1"):
            in_attachment = False
            continue

        if in_attachment and _ATTACH_ITEM_LINE.match(text):
            # The offending punctuation is the last non-whitespace character.
            end = lead + len(text)
            hits.append(CheckHit(
                paragraph=p,
                char_start=end - 1, char_end=end,
                actual={"text": p.text},
                message=f'附件项末尾不应有标点:"{text}"',
            ))
    return hits
|
||||
@@ -0,0 +1,162 @@
|
||||
"""字体/字号/复合样式/行距 check。"""
|
||||
|
||||
import re
|
||||
|
||||
from fastapi_modules.fastapi_leaudit.govdoc_engine.engine.checks import register
|
||||
from fastapi_modules.fastapi_leaudit.govdoc_engine.engine.checks.base import CheckBase, CheckContext, CheckHit, CheckResult
|
||||
from fastapi_modules.fastapi_leaudit.govdoc_engine.models import Paragraph, ParagraphStyle
|
||||
|
||||
|
||||
def _font_match(actual: str | None, expect: str) -> bool:
|
||||
if not actual:
|
||||
return False
|
||||
return expect in actual or actual in expect
|
||||
|
||||
|
||||
def _size_match(actual: float | None, expect: float, tol: float = 0.5) -> bool:
|
||||
if actual is None:
|
||||
return False
|
||||
return abs(actual - expect) <= tol
|
||||
|
||||
|
||||
def _style_matches(style: ParagraphStyle, expect: dict) -> bool:
|
||||
if "eastasia" in expect and not _font_match(style.font_eastasia, expect["eastasia"]):
|
||||
return False
|
||||
if "size_pt" in expect and not _size_match(
|
||||
style.font_size_pt, float(expect["size_pt"])
|
||||
):
|
||||
return False
|
||||
if "bold" in expect and bool(style.bold) != bool(expect["bold"]):
|
||||
return False
|
||||
return True
|
||||
|
||||
|
||||
@register("font")
class FontCheck(CheckBase):
    """Check the East Asian font name and the font size of each paragraph."""

    def run(self, ctx: CheckContext) -> CheckResult:
        """Report every paragraph whose font or size differs from ``stage.expect``.

        Only the ``eastasia`` and ``size_pt`` keys of ``stage.expect`` are
        consulted; a key that is absent is not checked.
        """
        wanted = ctx.stage.expect or {}
        violations: list[CheckHit] = []
        for para in ctx.paragraphs:
            st = para.style
            observed = {"font": st.font_eastasia, "size": st.font_size_pt}
            font_bad = "eastasia" in wanted and not _font_match(
                st.font_eastasia, wanted["eastasia"]
            )
            size_bad = "size_pt" in wanted and not _size_match(
                st.font_size_pt, float(wanted["size_pt"])
            )
            if not (font_bad or size_bad):
                continue
            violations.append(CheckHit(
                paragraph=para,
                char_start=0,
                char_end=len(para.text),
                actual=observed,
                expected=wanted,
                message=f"字体或字号不符合(实际 {observed['font']} {observed['size']}pt,期望 {wanted})",
            ))
        return CheckResult(passed=not violations, hits=violations)
|
||||
|
||||
|
||||
@register("style_match")
class StyleMatchCheck(CheckBase):
    """Check font, size, bold and alignment of each paragraph together."""

    def run(self, ctx: CheckContext) -> CheckResult:
        """Report every paragraph that violates a constraint in ``stage.expect``.

        The keys ``eastasia``, ``size_pt``, ``bold`` and ``alignment`` are
        checked when present.  ``italic`` is included in the reported actual
        values but is not compared against anything.
        """
        wanted = ctx.stage.expect or {}
        violations: list[CheckHit] = []
        for para in ctx.paragraphs:
            st = para.style
            observed = {
                "font": st.font_eastasia,
                "size": st.font_size_pt,
                "bold": st.bold,
                "italic": st.italic,
                "alignment": st.alignment,
            }
            # Every constraint is evaluated; one failure is enough for a hit.
            failures = [
                "eastasia" in wanted
                and not _font_match(st.font_eastasia, wanted["eastasia"]),
                "size_pt" in wanted
                and not _size_match(st.font_size_pt, float(wanted["size_pt"])),
                "bold" in wanted and bool(st.bold) != bool(wanted["bold"]),
                "alignment" in wanted and st.alignment != wanted["alignment"],
            ]
            if any(failures):
                violations.append(CheckHit(
                    paragraph=para,
                    char_start=0,
                    char_end=len(para.text),
                    actual=observed,
                    expected=wanted,
                    message="样式不符合",
                ))
        return CheckResult(passed=not violations, hits=violations)
|
||||
|
||||
|
||||
# Matches an attachment marker at the start of a paragraph (after optional
# whitespace); group 1 is the marker itself: "附件" followed by a colon, or
# "附件" followed by digits.
_ATTACHMENT_MARKER_RE = re.compile(r"^\s*(附件[::]|附件\d+)")
|
||||
|
||||
|
||||
@register("attachment_marker_style")
class AttachmentMarkerStyleCheck(CheckBase):
    """Validate only the attachment marker itself, not the attachment name after it."""

    DEFAULT_EXPECT = {"eastasia": "黑体", "size_pt": 16, "bold": False}

    def run(self, ctx: CheckContext) -> CheckResult:
        """Report paragraphs whose attachment marker has the wrong style.

        Paragraphs that do not start with a marker are ignored.  The styles
        of the runs covering the marker are checked; when no such run is
        found the paragraph style is used instead.  At most one hit is
        reported per paragraph, for the first style that fails.
        ``stage.expect`` overrides ``DEFAULT_EXPECT`` when it is non-empty.
        """
        wanted = ctx.stage.expect or self.DEFAULT_EXPECT
        problems: list[CheckHit] = []
        for para in ctx.paragraphs:
            found = _ATTACHMENT_MARKER_RE.match(para.text)
            if found is None:
                continue
            start, stop = found.span(1)
            candidates = _marker_run_styles(para, stop) or [para.style]
            for style in candidates:
                if _style_matches(style, wanted):
                    continue
                problems.append(CheckHit(
                    paragraph=para,
                    char_start=start,
                    char_end=stop,
                    actual={
                        "font": style.font_eastasia,
                        "size": style.font_size_pt,
                        "bold": style.bold,
                    },
                    expected=wanted,
                    message="附件标记样式不符合",
                ))
                break
        return CheckResult(passed=not problems, hits=problems)
|
||||
|
||||
|
||||
def _marker_run_styles(p: Paragraph, marker_end: int) -> list[ParagraphStyle]:
|
||||
styles: list[ParagraphStyle] = []
|
||||
cursor = 0
|
||||
for run in p.runs:
|
||||
run_start = cursor
|
||||
run_end = cursor + len(run.text)
|
||||
cursor = run_end
|
||||
if run_end <= 0 or run_start >= marker_end:
|
||||
continue
|
||||
if run.text.strip():
|
||||
styles.append(run.style)
|
||||
return styles
|
||||
|
||||
|
||||
@register("line_spacing")
class LineSpacingCheck(CheckBase):
    """Check the line spacing of each paragraph against an expected value."""

    def run(self, ctx: CheckContext) -> CheckResult:
        """Report paragraphs whose line spacing is missing or out of tolerance.

        ``stage.expect`` may hold ``multiple`` (default 1.5) and ``tol``
        (default 0.05).
        """
        wanted = ctx.stage.expect or {}
        goal = float(wanted.get("multiple", 1.5))
        slack = float(wanted.get("tol", 0.05))
        flagged: list[CheckHit] = []
        for para in ctx.paragraphs:
            spacing = para.style.line_spacing
            if spacing is not None and abs(spacing - goal) <= slack:
                continue
            flagged.append(CheckHit(
                paragraph=para,
                char_start=0,
                char_end=len(para.text),
                actual={"line_spacing": spacing},
                expected={"line_spacing": goal},
                message=f"行距应为 {goal},实际 {spacing}",
            ))
        return CheckResult(passed=not flagged, hits=flagged)
|
||||
@@ -0,0 +1,42 @@
|
||||
"""短语/字符黑名单。"""
|
||||
|
||||
from fastapi_modules.fastapi_leaudit.govdoc_engine.engine.checks import register
|
||||
from fastapi_modules.fastapi_leaudit.govdoc_engine.engine.checks.base import CheckBase, CheckContext, CheckHit, CheckResult
|
||||
|
||||
|
||||
@register("forbid_phrase")
class ForbidPhraseCheck(CheckBase):
    """Flag paragraphs that contain a phrase from ``stage.phrases``."""

    def run(self, ctx: CheckContext) -> CheckResult:
        """Report the first occurrence of each forbidden phrase in each paragraph.

        Empty entries in ``stage.phrases`` are ignored: an empty string is
        found at offset 0 of every paragraph and would produce a bogus hit.
        """
        phrases = [phr for phr in (ctx.stage.phrases or []) if phr]
        hits: list[CheckHit] = []
        for p in ctx.paragraphs:
            for phr in phrases:
                start = p.text.find(phr)
                if start >= 0:
                    hits.append(CheckHit(
                        paragraph=p, char_start=start, char_end=start + len(phr),
                        actual={"text": phr}, expected={"forbid": phr},
                        message=f"出现禁用短语 \"{phr}\"",
                    ))
        return CheckResult(passed=not hits, hits=hits)
|
||||
|
||||
|
||||
@register("forbid_chars")
class ForbidCharsCheck(CheckBase):
    """Flag every occurrence of a string listed in ``stage.chars``."""

    def run(self, ctx: CheckContext) -> CheckResult:
        """Report each non-overlapping occurrence of each forbidden string.

        Empty entries in ``stage.chars`` are ignored: ``str.find("", start)``
        returns ``start`` itself, so the scan position would never advance
        and the loop would never terminate.
        """
        chars = [c for c in (ctx.stage.chars or []) if c]
        hits: list[CheckHit] = []
        for p in ctx.paragraphs:
            text = p.text
            for c in chars:
                idx = text.find(c)
                while idx >= 0:
                    hits.append(CheckHit(
                        paragraph=p, char_start=idx, char_end=idx + len(c),
                        actual={"char": c}, expected={"forbid": c},
                        message=f"禁用字符 \"{c}\" 出现在 idx {idx}",
                    ))
                    # Resume after this occurrence (non-overlapping scan).
                    idx = text.find(c, idx + len(c))
        return CheckResult(passed=not hits, hits=hits)
|
||||
@@ -0,0 +1,29 @@
|
||||
"""层级序号格式 check。"""
|
||||
|
||||
import re
|
||||
from fastapi_modules.fastapi_leaudit.govdoc_engine.engine.checks import register
|
||||
from fastapi_modules.fastapi_leaudit.govdoc_engine.engine.checks.base import CheckBase, CheckContext, CheckHit, CheckResult
|
||||
|
||||
|
||||
@register("hierarchy")
class HierarchyCheck(CheckBase):
    """Check the format of hierarchical numbering.

    - expected_order: patterns allowed at each level (whitelist, ascending by
      level).  NOTE: this implementation does not read it.
    - forbid_patterns: forbidden patterns (blacklist); any match is an error.
    """

    def run(self, ctx: CheckContext) -> CheckResult:
        """Report the first match of each forbidden pattern in each paragraph.

        Patterns are searched in the paragraph text with surrounding
        whitespace removed; the reported offsets are shifted back so they
        refer to the original ``p.text``.
        """
        forbid = [re.compile(p) for p in (ctx.stage.forbid_patterns or [])]
        hits: list[CheckHit] = []
        for p in ctx.paragraphs:
            text = p.text.strip()
            # Number of leading whitespace characters removed by strip().
            lead = len(p.text) - len(p.text.lstrip())
            for f in forbid:
                m = f.search(text)
                if m:
                    hits.append(CheckHit(
                        paragraph=p,
                        char_start=lead + m.start(), char_end=lead + m.end(),
                        actual={"text": m.group(0)},
                        expected={"forbid_pattern": f.pattern},
                        message=f"层级序号格式错误:命中禁用模式 {f.pattern}",
                    ))
        return CheckResult(passed=not hits, hits=hits)
|
||||
@@ -0,0 +1,46 @@
|
||||
"""标点符号专项规则。"""
|
||||
|
||||
import re
|
||||
from fastapi_modules.fastapi_leaudit.govdoc_engine.engine.checks import register
|
||||
from fastapi_modules.fastapi_leaudit.govdoc_engine.engine.checks.base import CheckBase, CheckContext, CheckHit, CheckResult
|
||||
|
||||
|
||||
# Adjacent book-title marks or quotations must not be separated by an enumeration comma (Chinese punctuation).
_QUOTE_DUNHAO_RE = re.compile(r"([”》])、([“《])")
# An in-sentence parenthesis should not end with punctuation (question mark, exclamation mark and ellipsis excepted).
_PAREN_PUNCT_RE = re.compile(r"[((][^))]*?[,。;:、][))]")
# Nested quotes: a double-quoted span containing a single-quoted emphasised phrase.
_NESTED_QUOTE_RE = re.compile(r"“[^“”]*?‘[^‘’]+’[^“”]*?”")
|
||||
|
||||
|
||||
@register("punctuation")
class PunctuationCheck(CheckBase):
    """Punctuation rules selected by the ``type`` of each entry in ``stage.rules``."""

    def run(self, ctx: CheckContext) -> CheckResult:
        """Report every match of each configured rule in each paragraph.

        Supported types: ``no_dunhao_between_quotes``,
        ``no_punct_inside_inline_paren`` and
        ``no_outer_quote_when_inner_quote``.  Unknown types are ignored.
        Only the first type also reports an expected text (the match with
        the enumeration comma removed).
        """
        # (rule type, pattern, message, whether an expected text is reported)
        specs = (
            ("no_dunhao_between_quotes", _QUOTE_DUNHAO_RE,
             "多个引号/书名号并列不应用顿号分隔", True),
            ("no_punct_inside_inline_paren", _PAREN_PUNCT_RE,
             "句内括号末尾通常不应含标点", False),
            ("no_outer_quote_when_inner_quote", _NESTED_QUOTE_RE,
             "双引号内已含单引号强调时,外层不应再加双引号", False),
        )
        rules = ctx.stage.rules or []
        found: list[CheckHit] = []
        for para in ctx.paragraphs:
            for rule in rules:
                kind = rule.get("type")
                spec = next((s for s in specs if kind == s[0]), None)
                if spec is None:
                    continue
                _name, pattern, note, with_fix = spec
                for m in pattern.finditer(para.text):
                    snippet = m.group(0)
                    found.append(CheckHit(
                        paragraph=para,
                        char_start=m.start(),
                        char_end=m.end(),
                        actual={"text": snippet},
                        expected={"text": snippet.replace("、", "")} if with_fix else None,
                        message=note,
                    ))
        return CheckResult(passed=not found, hits=found)
|
||||
@@ -0,0 +1,36 @@
|
||||
"""regex_require / regex_forbid。"""
|
||||
|
||||
import re
|
||||
from fastapi_modules.fastapi_leaudit.govdoc_engine.engine.checks import register
|
||||
from fastapi_modules.fastapi_leaudit.govdoc_engine.engine.checks.base import CheckBase, CheckContext, CheckHit, CheckResult
|
||||
|
||||
|
||||
@register("regex_require")
class RegexRequireCheck(CheckBase):
    """Require every selected paragraph to match ``stage.pattern`` somewhere."""

    def run(self, ctx: CheckContext) -> CheckResult:
        """Report each paragraph in which the pattern is not found."""
        source = ctx.stage.pattern
        matcher = re.compile(source or "")
        missing = [
            CheckHit(
                paragraph=para,
                char_start=0,
                char_end=len(para.text),
                actual={"text": para.text},
                expected={"pattern": source},
                message=f"未匹配模式 {source}",
            )
            for para in ctx.paragraphs
            if matcher.search(para.text) is None
        ]
        return CheckResult(passed=not missing, hits=missing)
|
||||
|
||||
|
||||
@register("regex_forbid")
class RegexForbidCheck(CheckBase):
    """Forbid any match of ``stage.pattern`` in the selected paragraphs."""

    def run(self, ctx: CheckContext) -> CheckResult:
        """Report every match of the forbidden pattern.

        When no pattern is configured nothing is forbidden and the check
        passes.  Compiling an empty pattern instead would match at every
        offset of every paragraph and flood the result with empty hits.
        """
        if not ctx.stage.pattern:
            return CheckResult(passed=True, hits=[])
        pat = re.compile(ctx.stage.pattern)
        hits: list[CheckHit] = []
        for p in ctx.paragraphs:
            for m in pat.finditer(p.text):
                hits.append(CheckHit(
                    paragraph=p, char_start=m.start(), char_end=m.end(),
                    actual={"text": m.group(0)},
                    expected={"forbid_pattern": ctx.stage.pattern},
                    message=f"出现禁止模式 {ctx.stage.pattern}(命中 \"{m.group(0)}\")",
                ))
        return CheckResult(passed=not hits, hits=hits)
|
||||
@@ -0,0 +1,28 @@
|
||||
"""required check:目标实体或选中段落必须有非空文本。"""
|
||||
|
||||
from fastapi_modules.fastapi_leaudit.govdoc_engine.engine.checks import register
|
||||
from fastapi_modules.fastapi_leaudit.govdoc_engine.engine.checks.base import (
|
||||
CheckBase, CheckContext, CheckHit, CheckResult,
|
||||
)
|
||||
|
||||
|
||||
@register("required")
class RequiredCheck(CheckBase):
    """Require non-empty text on the target entity or on the selected paragraphs."""

    def run(self, ctx: CheckContext) -> CheckResult:
        """Run the target branch when a target exists, else the paragraph branch.

        Target branch: passes when the entity text is non-blank, otherwise
        reports one hit anchored on the first selected paragraph (or on no
        paragraph).  Paragraph branch: reports one hit per blank paragraph.
        """
        entity = ctx.target
        if entity is None:
            # applies_to channel: every selected paragraph must be non-blank.
            blanks = [
                CheckHit(paragraph=para, message="段落为空")
                for para in ctx.paragraphs
                if not para.text.strip()
            ]
            return CheckResult(passed=not blanks, hits=blanks)

        # target channel: the entity text must be non-blank.
        has_text = bool(entity.text) and bool(entity.text.strip())
        if has_text:
            return CheckResult(passed=True, hits=[])
        anchor = ctx.paragraphs[0] if ctx.paragraphs else None
        miss = CheckHit(paragraph=anchor, message=f"实体 {ctx.target.name} 缺失或为空")
        return CheckResult(passed=False, hits=[miss])
|
||||
@@ -0,0 +1,42 @@
|
||||
"""文种白名单(15 种法定公文文种)。"""
|
||||
|
||||
from fastapi_modules.fastapi_leaudit.govdoc_engine.engine.checks import register
|
||||
from fastapi_modules.fastapi_leaudit.govdoc_engine.engine.checks.base import (
|
||||
CheckBase, CheckContext, CheckHit, CheckResult,
|
||||
)
|
||||
|
||||
|
||||
# Whitelist of accepted document types (wenzhong).
# NOTE(review): the set holds 16 strings while the module speaks of 15 types;
# "命令" and "令" are both listed - presumably two spellings of one type, confirm.
LEGAL_WENZHONG = {
    "决议", "决定", "命令", "令", "公报", "公告", "通告",
    "意见", "通知", "通报", "报告", "请示", "批复",
    "议案", "函", "纪要",
}
|
||||
|
||||
|
||||
@register("wenzhong_whitelist")
class WenzhongWhitelistCheck(CheckBase):
    """Check that the document type is one of the whitelisted statutory types.

    Data source:
      1. ctx.entities["wenzhong"].text   <- preferred
      2. ctx.target.text                 (when the target is named "wenzhong")
    """

    def run(self, ctx: CheckContext) -> CheckResult:
        """Pass when no document type is known or when it is in ``LEGAL_WENZHONG``.

        Otherwise report a single hit that is not anchored on any paragraph.
        """
        from_entities = ctx.entities.get("wenzhong") if ctx.entities else None
        if from_entities is not None:
            source = from_entities
        elif ctx.target is not None and ctx.target.name == "wenzhong":
            source = ctx.target
        else:
            source = None

        doc_type = "" if source is None else (source.text or "").strip()
        if not doc_type or doc_type in LEGAL_WENZHONG:
            return CheckResult(passed=True, hits=[])

        violation = CheckHit(
            paragraph=None,
            actual={"wenzhong": doc_type},
            expected={"wenzhong_whitelist": sorted(LEGAL_WENZHONG)},
            message=f"非法定文种 \"{doc_type}\",应为 15 种法定公文文种之一",
        )
        return CheckResult(passed=False, hits=[violation])
|
||||
@@ -0,0 +1,81 @@
|
||||
"""审查结果数据结构。"""
|
||||
|
||||
from __future__ import annotations
|
||||
from collections import Counter
|
||||
from typing import Literal
|
||||
from pydantic import BaseModel, Field
|
||||
from fastapi_modules.fastapi_leaudit.govdoc_engine.models import Finding
|
||||
from fastapi_modules.fastapi_leaudit.govdoc_engine.parser.entities import SemanticEntity
|
||||
|
||||
|
||||
class CheckedRule(BaseModel):
    # One evaluated rule and its status, as listed in AuditResult.checked_rules.
    rule_id: str
    name: str
    severity: str
    category: str
    # Status of the rule; restricted to these three values.
    status: Literal["pass", "fail", "skipped"]
    # Free-text reason; empty by default.
    skip_reason: str = ""
|
||||
|
||||
|
||||
class StructureItem(BaseModel):
    """Statistics for one role in the document structure."""

    role: str
    label: str
    count: int
    expected: bool
    # Per-role details; how they are filled is not visible in this file.
    paragraph_indices: list[int] = Field(default_factory=list)
    samples: list[str] = Field(default_factory=list)
    char_total: int = 0
    dominant_font: str | None = None
    dominant_size_pt: float | None = None
    style_uniform: bool = True
|
||||
|
||||
|
||||
class OutlineNode(BaseModel):
    """Outline node (hierarchy tree of heading_1 to heading_4)."""

    paragraph_index: int
    level: int
    text: str
    # Child nodes; the model is recursive.
    children: list["OutlineNode"] = Field(default_factory=list)
|
||||
|
||||
|
||||
class AuditSummary(BaseModel):
    # Aggregate figures of one audit; AuditResult.compute_summary builds it.
    score: int = 100
    total_findings: int = 0
    # Number of findings per severity / per category.
    by_severity: dict[str, int] = Field(default_factory=dict)
    by_category: dict[str, int] = Field(default_factory=dict)
    # Number of checked rules per status.
    passed_count: int = 0
    failed_count: int = 0
    skipped_count: int = 0
|
||||
|
||||
|
||||
class AuditResult(BaseModel):
    # Complete result of auditing one document.
    audit_id: str
    document: dict = Field(default_factory=dict)
    summary: AuditSummary = Field(default_factory=AuditSummary)
    findings: list[Finding] = Field(default_factory=list)
    checked_rules: list[CheckedRule] = Field(default_factory=list)
    structure: list[StructureItem] = Field(default_factory=list)
    outline: list[OutlineNode] = Field(default_factory=list)
    entities: dict[str, SemanticEntity | None] = Field(default_factory=dict)

    def compute_summary(self) -> None:
        """Rebuild ``self.summary`` from ``findings`` and ``checked_rules``.

        The score starts at 100, loses 10 per ``error`` finding and 3 per
        ``warning`` finding, and never goes below 0.
        """
        severities: Counter = Counter()
        categories: Counter = Counter()
        for item in self.findings:
            severities[item.severity] += 1
            categories[item.category] += 1

        # Reading a missing Counter key yields 0 and does not insert the key.
        penalty = 10 * severities["error"] + 3 * severities["warning"]
        statuses = Counter(rule.status for rule in self.checked_rules)

        self.summary = AuditSummary(
            score=max(0, 100 - penalty),
            total_findings=len(self.findings),
            by_severity=dict(severities),
            by_category=dict(categories),
            passed_count=statuses["pass"],
            failed_count=statuses["fail"],
            skipped_count=statuses["skipped"],
        )
|
||||
@@ -0,0 +1,242 @@
|
||||
"""规则评估引擎:跑一条规则的多 stage。"""
|
||||
|
||||
from __future__ import annotations
|
||||
import asyncio
|
||||
import uuid
|
||||
from dataclasses import dataclass, field
|
||||
from fastapi_modules.fastapi_leaudit.govdoc_engine.models import Document, Finding, Location
|
||||
from fastapi_modules.fastapi_leaudit.govdoc_engine.dsl.schema import Rule
|
||||
from fastapi_modules.fastapi_leaudit.govdoc_engine.engine.checks import get_check # noqa: F401 (确保注册)
|
||||
from fastapi_modules.fastapi_leaudit.govdoc_engine.engine.checks.base import CheckContext, CheckResult, CheckHit
|
||||
from fastapi_modules.fastapi_leaudit.govdoc_engine.engine.checks.ai_check import AiCheck
|
||||
from fastapi_modules.fastapi_leaudit.govdoc_engine.engine.selector import select_paragraphs
|
||||
from fastapi_modules.fastapi_leaudit.govdoc_engine.parser.entities import SemanticEntity
|
||||
from fastapi_modules.fastapi_leaudit.govdoc_engine.llm.client import LlmClient
|
||||
|
||||
# 触发所有 check 类的 @register
|
||||
from fastapi_modules.fastapi_leaudit.govdoc_engine.engine.checks import required as _r # noqa: F401
|
||||
from fastapi_modules.fastapi_leaudit.govdoc_engine.engine.checks import font as _f # noqa: F401
|
||||
from fastapi_modules.fastapi_leaudit.govdoc_engine.engine.checks import regex_check as _rc # noqa: F401
|
||||
from fastapi_modules.fastapi_leaudit.govdoc_engine.engine.checks import confused_pair as _cp # noqa: F401
|
||||
from fastapi_modules.fastapi_leaudit.govdoc_engine.engine.checks import forbid as _fb # noqa: F401
|
||||
from fastapi_modules.fastapi_leaudit.govdoc_engine.engine.checks import wenzhong as _wz # noqa: F401
|
||||
from fastapi_modules.fastapi_leaudit.govdoc_engine.engine.checks import hierarchy as _h # noqa: F401
|
||||
from fastapi_modules.fastapi_leaudit.govdoc_engine.engine.checks import punctuation as _p # noqa: F401
|
||||
from fastapi_modules.fastapi_leaudit.govdoc_engine.engine.checks import cross_role as _cr # noqa: F401
|
||||
from fastapi_modules.fastapi_leaudit.govdoc_engine.engine.checks import ai_check as _ai # noqa: F401
|
||||
|
||||
|
||||
@dataclass
class RuleOutcome:
    """Execution result of a single rule (including the skipped state)."""

    # The rule that was executed.
    rule: Rule
    # Findings produced by the rule; empty when there are none.
    findings: list[Finding] = field(default_factory=list)
    # True when the rule, or one of its stages, was skipped.
    skipped: bool = False
    # Reason for the skip; empty when nothing was skipped.
    skip_reason: str = ""
|
||||
|
||||
|
||||
class RuleRunner:
|
||||
def __init__(self, llm_client: LlmClient | None = None):
    """Store the optional LLM client for later use by the runner."""
    self.llm = llm_client
|
||||
|
||||
# -- 上下文装配 -----------------------------------------------------
|
||||
def _resolve_target(
    self,
    rule: Rule,
    doc: Document,
    entities: dict[str, SemanticEntity | None],
) -> tuple[list, SemanticEntity | None, RuleOutcome | None]:
    """Select the paragraphs a rule works on, via ``rule.target`` or ``rule.applies_to``.

    Returns ``(paragraphs, target_entity, early_outcome)``.  When
    ``early_outcome`` is not ``None`` the caller should return it directly:
    the target entity is missing and ``on_missing`` decided the outcome.
    """
    if not rule.target:
        # applies_to channel (multi-paragraph scan)
        return select_paragraphs(doc, rule.applies_to), None, None

    entity = entities.get(rule.target)
    if entity is None:
        return [], None, self._handle_missing(rule)

    every = doc.paragraphs
    total = len(every)
    picked = []
    for idx in entity.paragraph_indices:
        # Indices outside the document are dropped silently.
        if 0 <= idx < total:
            picked.append(every[idx])
    return picked, entity, None
|
||||
|
||||
def _handle_missing(self, rule: Rule) -> RuleOutcome:
|
||||
mode = rule.on_missing
|
||||
if mode == "pass":
|
||||
return RuleOutcome(rule=rule)
|
||||
reason = f"目标实体「{rule.target}」未识别到"
|
||||
if mode == "skip":
|
||||
return RuleOutcome(rule=rule, skipped=True, skip_reason=reason)
|
||||
severity = "error" if mode == "fail" else "warning"
|
||||
finding = Finding(
|
||||
finding_id=f"F-{uuid.uuid4().hex[:8]}",
|
||||
rule_id=rule.rule_id,
|
||||
rule_name=rule.name,
|
||||
severity=severity,
|
||||
category=rule.category,
|
||||
location=Location(paragraph_index=-1),
|
||||
message=reason,
|
||||
suggestion=rule.messages.fail or "",
|
||||
evidence="", confidence=0.9,
|
||||
)
|
||||
return RuleOutcome(rule=rule, findings=[finding])
|
||||
|
||||
@staticmethod
|
||||
def _merge_skip(outcome: RuleOutcome, result: CheckResult) -> None:
|
||||
if not outcome.skip_reason:
|
||||
outcome.skip_reason = result.skip_reason or "stage skipped"
|
||||
outcome.skipped = True
|
||||
|
||||
# -- 同步路径 -------------------------------------------------------
|
||||
def run_rule(
|
||||
self,
|
||||
rule: Rule,
|
||||
doc: Document,
|
||||
entities: dict[str, SemanticEntity | None] | None = None,
|
||||
) -> RuleOutcome:
|
||||
entities = entities or {}
|
||||
paragraphs, target, early = self._resolve_target(rule, doc, entities)
|
||||
if early is not None:
|
||||
return early
|
||||
|
||||
outcome = RuleOutcome(rule=rule)
|
||||
for stage in rule.stages:
|
||||
if stage.check == "ai":
|
||||
check = AiCheck(llm_client=self.llm)
|
||||
else:
|
||||
check_cls = get_check(stage.check)
|
||||
check = check_cls()
|
||||
|
||||
ctx = CheckContext(
|
||||
document=doc,
|
||||
paragraphs=paragraphs,
|
||||
stage=stage,
|
||||
entities=entities,
|
||||
target=target,
|
||||
rule_id=rule.rule_id,
|
||||
)
|
||||
result: CheckResult = check.run(ctx)
|
||||
if result.skipped:
|
||||
self._merge_skip(outcome, result)
|
||||
continue
|
||||
if not result.passed:
|
||||
outcome.findings = [self._hit_to_finding(rule, h) for h in result.hits]
|
||||
outcome.skipped = False
|
||||
outcome.skip_reason = ""
|
||||
return outcome
|
||||
return outcome
|
||||
|
||||
def run_all(
|
||||
self,
|
||||
rules: list[Rule],
|
||||
doc: Document,
|
||||
entities: dict[str, SemanticEntity | None] | None = None,
|
||||
) -> list[Finding]:
|
||||
flat, _ = self.evaluate(rules, doc, entities)
|
||||
return flat
|
||||
|
||||
def evaluate(
|
||||
self,
|
||||
rules: list[Rule],
|
||||
doc: Document,
|
||||
entities: dict[str, SemanticEntity | None] | None = None,
|
||||
) -> tuple[list[Finding], list[RuleOutcome]]:
|
||||
flat: list[Finding] = []
|
||||
outcomes: list[RuleOutcome] = []
|
||||
for r in rules:
|
||||
o = self.run_rule(r, doc, entities)
|
||||
flat.extend(o.findings)
|
||||
outcomes.append(o)
|
||||
return flat, outcomes
|
||||
|
||||
# -- 异步路径 -------------------------------------------------------
|
||||
async def run_rule_async(
|
||||
self,
|
||||
rule: Rule,
|
||||
doc: Document,
|
||||
entities: dict[str, SemanticEntity | None] | None = None,
|
||||
) -> RuleOutcome:
|
||||
entities = entities or {}
|
||||
paragraphs, target, early = self._resolve_target(rule, doc, entities)
|
||||
if early is not None:
|
||||
return early
|
||||
|
||||
outcome = RuleOutcome(rule=rule)
|
||||
for stage in rule.stages:
|
||||
ctx = CheckContext(
|
||||
document=doc,
|
||||
paragraphs=paragraphs,
|
||||
stage=stage,
|
||||
entities=entities,
|
||||
target=target,
|
||||
rule_id=rule.rule_id,
|
||||
)
|
||||
if stage.check == "ai":
|
||||
result = await AiCheck(llm_client=self.llm).run_async(ctx)
|
||||
else:
|
||||
check_cls = get_check(stage.check)
|
||||
result = check_cls().run(ctx)
|
||||
if result.skipped:
|
||||
self._merge_skip(outcome, result)
|
||||
continue
|
||||
if not result.passed:
|
||||
outcome.findings = [self._hit_to_finding(rule, h) for h in result.hits]
|
||||
outcome.skipped = False
|
||||
outcome.skip_reason = ""
|
||||
return outcome
|
||||
return outcome
|
||||
|
||||
async def run_all_async(
|
||||
self,
|
||||
rules: list[Rule],
|
||||
doc: Document,
|
||||
entities: dict[str, SemanticEntity | None] | None = None,
|
||||
) -> list[Finding]:
|
||||
flat, _ = await self.evaluate_async(rules, doc, entities)
|
||||
return flat
|
||||
|
||||
async def evaluate_async(
|
||||
self,
|
||||
rules: list[Rule],
|
||||
doc: Document,
|
||||
entities: dict[str, SemanticEntity | None] | None = None,
|
||||
) -> tuple[list[Finding], list[RuleOutcome]]:
|
||||
outcomes_list = await asyncio.gather(
|
||||
*(self.run_rule_async(r, doc, entities) for r in rules)
|
||||
)
|
||||
flat: list[Finding] = []
|
||||
outcomes: list[RuleOutcome] = []
|
||||
for o in outcomes_list:
|
||||
flat.extend(o.findings)
|
||||
outcomes.append(o)
|
||||
return flat, outcomes
|
||||
|
||||
def _hit_to_finding(self, rule: Rule, hit: CheckHit) -> Finding:
|
||||
para = hit.paragraph
|
||||
loc = Location(
|
||||
paragraph_index=para.index if para else -1,
|
||||
role=para.role if para else None,
|
||||
char_start=hit.char_start,
|
||||
char_end=hit.char_end,
|
||||
context=para.text if para else "",
|
||||
)
|
||||
msg = hit.message or rule.messages.fail
|
||||
return Finding(
|
||||
finding_id=f"F-{uuid.uuid4().hex[:8]}",
|
||||
rule_id=rule.rule_id,
|
||||
rule_name=rule.name,
|
||||
severity=rule.severity,
|
||||
category=rule.category,
|
||||
location=loc,
|
||||
actual=hit.actual or {},
|
||||
expected=hit.expected or {},
|
||||
message=msg,
|
||||
suggestion=rule.messages.fail or "",
|
||||
evidence=rule.messages.fail or "",
|
||||
confidence=hit.confidence,
|
||||
)
|
||||
@@ -0,0 +1,27 @@
|
||||
"""applies_to → 段落集合。"""
|
||||
|
||||
from __future__ import annotations
|
||||
from fastapi_modules.fastapi_leaudit.govdoc_engine.models import Document, Paragraph
|
||||
from fastapi_modules.fastapi_leaudit.govdoc_engine.dsl.schema import AppliesTo
|
||||
|
||||
|
||||
def select_paragraphs(doc: Document, applies_to: AppliesTo) -> list[Paragraph]:
    """Return the paragraphs of ``doc`` selected by an ``applies_to`` spec.

    Selection precedence:

    1. ``paragraph_index`` set: that single paragraph, or an empty list when
       the index is outside the document.
    2. ``role == "any"``: every paragraph.
    3. Otherwise the union of ``role`` and ``roles``. When that union is
       empty every paragraph is returned.
    """
    everything = doc.paragraphs

    pinned = applies_to.paragraph_index
    if pinned is not None:
        return [everything[pinned]] if 0 <= pinned < len(everything) else []

    if applies_to.role == "any":
        return list(everything)

    wanted: set[str] = set(applies_to.roles) if applies_to.roles else set()
    if applies_to.role:
        wanted.add(applies_to.role)

    # No role constraint at all means "scan the whole document".
    if not wanted:
        return list(everything)

    return [para for para in everything if para.role in wanted]
|
||||
@@ -0,0 +1,93 @@
|
||||
"""从 Document 派生出 structure(按 role 分类统计)+ outline(heading 层级树)。"""
|
||||
|
||||
from __future__ import annotations
|
||||
from collections import Counter
|
||||
from fastapi_modules.fastapi_leaudit.govdoc_engine.models import Document, Paragraph
|
||||
from fastapi_modules.fastapi_leaudit.govdoc_engine.engine.result import OutlineNode, StructureItem
|
||||
|
||||
|
||||
# Display order of paragraph roles in the structure panel.
# Each entry is (role, Chinese display label, expected in a regular document).
_ROLE_LABELS: list[tuple[str, str, bool]] = [
    ("title", "标题", True),
    ("doc_number", "发文字号", True),
    ("recipient", "主送机关", True),
    ("heading_1", "一级标题", False),
    ("heading_2", "二级标题", False),
    ("heading_3", "三级标题", False),
    ("heading_4", "四级标题", False),
    ("body", "正文", True),
    ("attachment_marker", "附件标记", False),
    ("attachment_title", "附件标题", False),
    ("signature", "署名", True),
    ("date", "成文日期", True),
    ("no_text_marker", "(此页无正文)", False),
    ("unknown", "未识别", False),
]

# Heading role -> nesting depth: heading_1 .. heading_4 map to 1 .. 4.
_HEADING_LEVELS = {f"heading_{depth}": depth for depth in range(1, 5)}
|
||||
|
||||
|
||||
def _dominant_style(paragraphs: list[Paragraph]) -> tuple[str | None, float | None, bool]:
    """Summarise the prevailing font and size of a group of paragraphs.

    Returns:
        ``(most common East Asian font, most common font size in pt,
        uniform)``. Paragraphs with a falsy font or a ``None`` size are left
        out of the respective tally. ``uniform`` is true when at most one
        distinct font and at most one distinct size were tallied. An empty
        input yields ``(None, None, True)``.
    """
    if not paragraphs:
        return None, None, True

    font_tally: Counter = Counter()
    size_tally: Counter = Counter()
    for para in paragraphs:
        if para.style.font_eastasia:
            font_tally[para.style.font_eastasia] += 1
        if para.style.font_size_pt is not None:
            size_tally[para.style.font_size_pt] += 1

    def _mode(tally: Counter):
        # most_common(1) is [(value, count)]; an empty tally has no mode.
        return tally.most_common(1)[0][0] if tally else None

    is_uniform = len(font_tally) <= 1 and len(size_tally) <= 1
    return _mode(font_tally), _mode(size_tally), is_uniform
|
||||
|
||||
|
||||
def build_structure(doc: Document) -> list[StructureItem]:
    """Summarise the document per paragraph role, in ``_ROLE_LABELS`` order.

    A role flagged as expected is always listed, even with zero paragraphs.
    A role that is not expected is listed only when at least one paragraph
    carries it.
    """
    summary: list[StructureItem] = []
    for role, label, expected in _ROLE_LABELS:
        matched = [para for para in doc.paragraphs if para.role == role]
        # Hide absent optional roles to keep the panel compact.
        if not (matched or expected):
            continue

        font, size_pt, uniform = _dominant_style(matched)
        item = StructureItem(
            role=role,
            label=label,
            count=len(matched),
            expected=expected,
            paragraph_indices=[para.index for para in matched],
            # Preview: the first 60 characters of up to three paragraphs.
            samples=[para.text[:60] for para in matched[:3]],
            char_total=sum(len(para.text) for para in matched),
            dominant_font=font,
            dominant_size_pt=size_pt,
            style_uniform=uniform,
        )
        summary.append(item)
    return summary
|
||||
|
||||
|
||||
def build_outline(doc: Document) -> list[OutlineNode]:
    """Build the heading tree from paragraph order and heading level.

    Paragraphs whose role is not a key of ``_HEADING_LEVELS`` are ignored.
    Each heading is attached to the nearest preceding heading with a
    strictly smaller level, or becomes a root when there is none.
    """
    roots: list[OutlineNode] = []
    # Chain of currently open ancestors, shallowest first.
    open_path: list[OutlineNode] = []

    for para in doc.paragraphs:
        if para.role not in _HEADING_LEVELS:
            continue
        depth = _HEADING_LEVELS[para.role]
        node = OutlineNode(paragraph_index=para.index, level=depth, text=para.text)

        # Close every open heading at the same level or deeper.
        while open_path and open_path[-1].level >= depth:
            open_path.pop()

        siblings = open_path[-1].children if open_path else roots
        siblings.append(node)
        open_path.append(node)

    return roots
|
||||
Reference in New Issue
Block a user