feat(govdoc): 新增内部公文模块全链路（后端58+前端11文件）

2026-05-13 14:37:12 +08:00
parent 99699e20e1
commit 5d777599bf
63 changed files with 7608 additions and 0 deletions
@@ -0,0 +1,152 @@
+"""解析 .docx → Document 对象。
+
+文档顺序遍历 body：顶级段落 + 表格内段落都纳入 paragraphs，
+后续 role tagging 与规则评估都能扫到表格内的内容。
+"""
+
+from __future__ import annotations
+from pathlib import Path
+from docx import Document as DocxDocument
+from docx.oxml.ns import qn
+from docx.text.paragraph import Paragraph as DocxParagraph
+from lxml import etree
+
+from fastapi_modules.fastapi_leaudit.govdoc_engine.models import Document, Paragraph, ParagraphStyle, Run, Table
+from fastapi_modules.fastapi_leaudit.govdoc_engine.parser.style_resolver import StyleResolver
+
+
+# Integer value of ``paragraph_format.alignment`` -> alignment name stored in ParagraphStyle.
+# Values missing from this map are reported as "left" by ``_read_paragraph_style``.
+_ALIGN_MAP = {0: "left", 1: "center", 2: "right", 3: "justify"}
+
+
+def _read_run_style(run, p_elem, resolver: StyleResolver) -> ParagraphStyle:
+    """Build a ``ParagraphStyle`` from the resolved character formatting of one run.
+
+    The resolver is asked for the effective formatting of ``run`` in the
+    context of its paragraph element ``p_elem``. Bold/italic values that the
+    resolver leaves unset (``None``) are reported as ``False``.
+    """
+    resolved = resolver.resolve_run(p_elem, run._element)
+    is_bold = False if resolved.bold is None else bool(resolved.bold)
+    is_italic = False if resolved.italic is None else bool(resolved.italic)
+    return ParagraphStyle(
+        font_eastasia=resolved.font_eastasia,
+        font_ascii=resolved.font_ascii,
+        font_size_pt=resolved.size_pt,
+        bold=is_bold,
+        italic=is_italic,
+    )
+
+
+def _read_paragraph_style(p, resolver: StyleResolver) -> ParagraphStyle:
+    """Collect paragraph-level formatting plus the paragraph's base font.
+
+    The base font comes from the first run when the paragraph has runs,
+    otherwise from the paragraph-level resolution. Alignment, line spacing
+    and first-line indent are then filled in from ``p.paragraph_format``.
+
+    Returns:
+        A ``ParagraphStyle`` whose ``line_spacing`` is ``None`` when unset,
+        the multiplier for proportional spacing, or the height in points
+        for fixed spacing.
+    """
+    pf = p.paragraph_format
+    alignment = (
+        _ALIGN_MAP.get(pf.alignment, "left") if pf.alignment is not None else "left"
+    )
+
+    spacing = pf.line_spacing
+    if spacing is None:
+        spacing_pt = None
+    elif hasattr(spacing, "pt"):
+        # Fixed line spacing is returned as a Length (an int counting EMU).
+        # float() on it would yield EMU (e.g. 355600 for 28 pt), so convert
+        # through the .pt property instead.
+        spacing_pt = float(spacing.pt)
+    else:
+        # Proportional spacing is a plain float multiplier (e.g. 1.5).
+        spacing_pt = float(spacing)
+
+    indent = pf.first_line_indent
+    indent_pt = float(indent.pt) if indent is not None else 0.0
+
+    if p.runs:
+        base = _read_run_style(p.runs[0], p._element, resolver)
+    else:
+        rs = resolver.resolve_paragraph(p._element)
+        base = ParagraphStyle(
+            font_eastasia=rs.font_eastasia,
+            font_ascii=rs.font_ascii,
+            font_size_pt=rs.size_pt,
+            bold=bool(rs.bold) if rs.bold is not None else False,
+            italic=bool(rs.italic) if rs.italic is not None else False,
+        )
+    base.alignment = alignment
+    base.line_spacing = spacing_pt
+    base.first_line_indent_pt = indent_pt
+    return base
+
+
+def _is_in_table(p_elem) -> bool:
+    """Return True when any ancestor of ``p_elem`` has the local tag name ``tbl``."""
+    return any(
+        etree.QName(ancestor).localname == "tbl"
+        for ancestor in p_elem.iterancestors()
+    )
+
+
+def _iter_body_paragraphs(docx):
+    """Yield every ``w:p`` element under the body in document order (table cells included)."""
+    yield from docx.element.body.iter(qn("w:p"))
+
+
+def _iter_header_footer_paragraphs(docx):
+    """Yield ``(paragraph, p_elem, in_header, in_footer)`` for header/footer paragraphs.
+
+    All six header/footer slots of every section are visited. Slots that are
+    linked to the previous section are skipped, and a paragraph element is
+    yielded at most once even if it is reachable from several sections.
+    """
+    # id(element) -> element. The element is stored as well so that it stays
+    # alive for the whole traversal: a bare id() of an object that has been
+    # garbage-collected can be handed out again to a different element, which
+    # would make the dedup check drop (or fail to drop) the wrong paragraph.
+    seen: dict[int, object] = {}
+    for section in docx.sections:
+        targets = [
+            ("header", section.header),
+            ("first_header", section.first_page_header),
+            ("even_header", section.even_page_header),
+            ("footer", section.footer),
+            ("first_footer", section.first_page_footer),
+            ("even_footer", section.even_page_footer),
+        ]
+        for kind, hf in targets:
+            if hf is None:
+                continue
+            try:
+                if hf.is_linked_to_previous:
+                    continue
+            except Exception:
+                # Best effort: if the link state cannot be read, fall through
+                # and still try to read the paragraphs of this slot.
+                pass
+            in_header = "header" in kind
+            for p in hf.paragraphs:
+                elem = p._element
+                if id(elem) in seen:
+                    continue
+                seen[id(elem)] = elem
+                yield p, elem, in_header, not in_header
+
+
+def _build_paragraph(p, p_elem, index: int, resolver: StyleResolver, **flags) -> Paragraph:
+    """Convert one python-docx paragraph into the engine's ``Paragraph`` model.
+
+    ``flags`` carries optional location markers (``in_header`` / ``in_footer``);
+    when omitted the model's own defaults apply.
+    """
+    runs = [
+        Run(text=r.text, style=_read_run_style(r, p_elem, resolver))
+        for r in p.runs
+    ]
+    style = _read_paragraph_style(p, resolver)
+    return Paragraph(
+        index=index,
+        text=p.text,
+        runs=runs,
+        style=style,
+        in_table=_is_in_table(p_elem),
+        **flags,
+    )
+
+
+def parse_docx(path: str | Path) -> Document:
+    """Parse a .docx file into a ``Document``.
+
+    Paragraphs are numbered consecutively: first every body paragraph in
+    document order (including those inside tables), then header/footer
+    paragraphs appended at the end so later role tagging can see them too.
+    Tables are taken from ``docx.tables`` as rows of cell text.
+    """
+    path = Path(path)
+    docx = DocxDocument(path)
+    resolver = StyleResolver(docx)
+
+    paragraphs: list[Paragraph] = []
+    # 1) Body paragraphs, table cells included.
+    for p_elem in _iter_body_paragraphs(docx):
+        p = DocxParagraph(p_elem, docx.part)
+        paragraphs.append(_build_paragraph(p, p_elem, len(paragraphs), resolver))
+    # 2) Header/footer paragraphs, appended after the body.
+    for p, p_elem, in_header, in_footer in _iter_header_footer_paragraphs(docx):
+        paragraphs.append(_build_paragraph(
+            p, p_elem, len(paragraphs), resolver,
+            in_header=in_header, in_footer=in_footer,
+        ))
+
+    tables = [
+        Table(index=tidx, rows=[[cell.text for cell in row.cells] for row in t.rows])
+        for tidx, t in enumerate(docx.tables)
+    ]
+
+    return Document(
+        # NOTE(review): "page_count" holds the number of sections, not the
+        # number of rendered pages; the key is kept for existing consumers.
+        meta={"path": str(path), "page_count": len(docx.sections)},
+        paragraphs=paragraphs,
+        tables=tables,
+    )
@@ -0,0 +1,27 @@
+"""语义实体：把段落 + 字段值 + 样式合在一起。"""
+
+from __future__ import annotations
+from typing import Any, Literal
+from pydantic import BaseModel, Field
+from fastapi_modules.fastapi_leaudit.govdoc_engine.models import ParagraphStyle
+
+
+EntitySource = Literal["structural", "llm", "derived"]
+
+
+class SemanticEntity(BaseModel):
+    """One semantic unit of an official document (title, document number, main recipient, ...).
+
+    - structural: ``name`` corresponds one-to-one to a paragraph role;
+      ``paragraph_indices`` is non-empty and ``style`` is available.
+    - derived: inferred from another entity (e.g. ``wenzhong`` from the end of
+      the title); ``paragraph_indices`` is borrowed from the source paragraph.
+    - llm: used only when both the structural and the derived paths fail;
+      ``paragraph_indices`` may be empty.
+    """
+
+    # Entity name, e.g. "title" or "doc_number".
+    name: str
+    # Extracted text of the entity.
+    text: str = ""
+    # Indices of the paragraphs the entity was taken from.
+    paragraph_indices: list[int] = Field(default_factory=list)
+    # Paragraph role the entity is primarily based on, if any.
+    primary_role: str | None = None
+    # Style of the source paragraph, when one is available.
+    style: ParagraphStyle | None = None
+    # Free-form additional data attached by the producer of the entity.
+    extra: dict[str, Any] = Field(default_factory=dict)
+    # How the entity was obtained: "structural", "llm" or "derived".
+    source: EntitySource = "structural"
+    # Confidence attached to the entity; defaults to 1.0.
+    confidence: float = 1.0
@@ -0,0 +1,195 @@
+"""从已 tag 的 Document 抽取语义实体（结构化优先）。"""
+
+from __future__ import annotations
+import re
+from fastapi_modules.fastapi_leaudit.govdoc_engine.models import Document
+from fastapi_modules.fastapi_leaudit.govdoc_engine.parser.entities import SemanticEntity
+
+
+# The 8 built-in entity names (also used to detect name clashes during schema validation).
+BUILTIN_ENTITY_NAMES: frozenset[str] = frozenset({
+    "title", "doc_number", "recipient", "date",
+    "signature", "attachments", "wenzhong", "issuer",
+})
+
+
+# Prompt descriptions of the built-in entities for the LLM fallback (used in Phase B).
+BUILTIN_LLM_DESCRIPTION: dict[str, str] = {
+    "title":       "公文主标题（不含发文字号）",
+    "doc_number":  "X发〔YYYY〕N号 形式的发文字号",
+    "recipient":   "公文抬头的接收机关名称",
+    "date":        "末尾的成文日期原文",
+    "signature":   "末尾的发文机关署名",
+    "attachments": "附件清单（数组，每项含 序号 与 名称）",
+    "wenzhong":    "公文文种（决议/决定/通知/通报/请示/批复 等 15 种之一）",
+    "issuer":      "发文机关全称",
+}
+
+
+# One-to-one mapping from paragraph role to entity name.
+_ROLE_ENTITY_MAP = {
+    "title": "title",
+    "doc_number": "doc_number",
+    "recipient": "recipient",
+    "date": "date",
+    "signature": "signature",
+}
+
+
+# Leading "附件" + optional digits + colon (ASCII or full-width) on the marker line.
+_ATTACHMENT_HEAD_RE = re.compile(r"^附件\d*[:：]\s*")
+# A numbered list item: group 1 is the number, group 2 the item name.
+_ATTACHMENT_ITEM_RE = re.compile(r"^\s*(\d+)[\.．、)]\s*(.+)$")
+
+# The 15 statutory document types (per the "Regulations on the Handling of
+# Official Documents by Party and Government Organs").
+_WENZHONG_LIST = (
+    "决议", "决定", "命令", "公报", "公告", "通告",
+    "意见", "通知", "通报", "报告", "请示", "批复",
+    "议案", "函", "纪要",
+)
+# Matches one of the document types at the very end of a string.
+_WENZHONG_RE = re.compile("(" + "|".join(_WENZHONG_LIST) + ")$")
+
+# Title pattern "XX关于...的YY" -> issuer = XX (shortest prefix before "关于").
+_ISSUER_PREFIX_RE = re.compile(r"^(.+?)关于")
+
+
+class EntityBuilder:
+    """Extract the eight built-in semantic entities from a role-tagged Document."""
+
+    # Roles a paragraph may carry and still be read as a follow-up attachment
+    # item. "heading_3" is included because a line such as "2. xxx" is tagged
+    # heading_3 by the rule tagger's numbered-heading pattern, which otherwise
+    # cuts the attachment list off after its first entry.
+    _ATTACHMENT_CONTINUATION_ROLES = (
+        "body", "unknown", "attachment_marker", "heading_3",
+    )
+
+    def build(self, doc: Document) -> dict[str, SemanticEntity | None]:
+        """Return a mapping of every built-in entity name to its entity or ``None``.
+
+        Structural entities are read from paragraph roles, ``attachments``
+        from the attachment marker and the lines after it, and ``wenzhong`` /
+        ``issuer`` are derived from the title (issuer falls back to the
+        signature).
+        """
+        entities: dict[str, SemanticEntity | None] = {
+            name: None for name in BUILTIN_ENTITY_NAMES
+        }
+
+        # 1) One-to-one role -> entity.
+        for role, name in _ROLE_ENTITY_MAP.items():
+            paras = [p for p in doc.paragraphs if p.role == role]
+            if not paras:
+                continue
+            # The signature is taken from the last match, everything else
+            # from the first.
+            target = paras[-1] if name == "signature" else paras[0]
+            entities[name] = SemanticEntity(
+                name=name,
+                text=target.text.strip(),
+                paragraph_indices=[target.index],
+                primary_role=role,
+                style=target.style,
+                source="structural",
+                confidence=target.role_confidence,
+            )
+
+        # 2) attachments: marker line plus the lines that follow it.
+        entities["attachments"] = self._build_attachments(doc)
+
+        # 3) Derived entities: wenzhong / issuer.
+        title_e = entities.get("title")
+        if title_e:
+            entities["wenzhong"] = self._derive_wenzhong(title_e)
+            entities["issuer"] = self._derive_issuer(
+                title_e, entities.get("signature")
+            )
+        elif entities.get("signature"):
+            entities["issuer"] = self._derive_issuer(
+                None, entities["signature"]
+            )
+
+        return entities
+
+    # ---------- attachments ----------
+    def _build_attachments(self, doc: Document) -> SemanticEntity | None:
+        """Build the ``attachments`` entity from the first attachment marker.
+
+        Returns ``None`` when there is no marker or no item could be read.
+        Each item is a dict with the keys ``序号`` (number) and ``名称`` (name).
+        """
+        markers = [
+            i for i, p in enumerate(doc.paragraphs)
+            if p.role == "attachment_marker"
+        ]
+        if not markers:
+            return None
+
+        m = markers[0]
+        marker = doc.paragraphs[m]
+        items: list[dict] = []
+        # Record the paragraph's own index, as is done for follow-up lines.
+        para_idxs: list[int] = [marker.index]
+
+        first = marker.text.strip()
+        head = _ATTACHMENT_HEAD_RE.sub("", first)
+        if head:
+            mt = _ATTACHMENT_ITEM_RE.match(head)
+            if mt:
+                items.append(
+                    {"序号": int(mt.group(1)), "名称": mt.group(2).strip()}
+                )
+            else:
+                # Unnumbered text after the marker counts as item 1.
+                items.append({"序号": 1, "名称": head})
+
+        # Follow-up lines: stop at the first paragraph whose role cannot
+        # belong to the list or whose text is not a numbered item.
+        for p in doc.paragraphs[m + 1:]:
+            if p.role and p.role not in self._ATTACHMENT_CONTINUATION_ROLES:
+                break
+            t = p.text.strip()
+            if not t:
+                continue
+            mt = _ATTACHMENT_ITEM_RE.match(t)
+            if not mt:
+                break
+            number = int(mt.group(1))
+            # A heading_3 line is only accepted while it continues the
+            # numbering; a restart (e.g. "1. ...") is treated as a real heading.
+            if p.role == "heading_3" and items and number <= items[-1]["序号"]:
+                break
+            items.append({"序号": number, "名称": mt.group(2).strip()})
+            para_idxs.append(p.index)
+
+        if not items:
+            return None
+
+        text = "; ".join(f"{it['序号']}. {it['名称']}" for it in items)
+        return SemanticEntity(
+            name="attachments",
+            text=text,
+            paragraph_indices=para_idxs,
+            primary_role="attachment_marker",
+            style=marker.style,
+            extra={"items": items},
+            source="structural",
+            confidence=0.9,
+        )
+
+    # ---------- derived ----------
+    def _derive_wenzhong(
+        self, title: SemanticEntity
+    ) -> SemanticEntity | None:
+        """Derive the document type from the suffix of the title text, if any."""
+        m = _WENZHONG_RE.search(title.text)
+        if not m:
+            return None
+        return SemanticEntity(
+            name="wenzhong",
+            text=m.group(1),
+            paragraph_indices=list(title.paragraph_indices),
+            primary_role="title",
+            extra={"derived_from": "title.suffix"},
+            source="derived",
+            confidence=0.95,
+        )
+
+    def _derive_issuer(
+        self,
+        title: SemanticEntity | None,
+        signature: SemanticEntity | None,
+    ) -> SemanticEntity | None:
+        """Derive the issuer from the title prefix before "关于", else from the signature."""
+        if title:
+            m = _ISSUER_PREFIX_RE.match(title.text)
+            if m:
+                return SemanticEntity(
+                    name="issuer",
+                    text=m.group(1),
+                    paragraph_indices=list(title.paragraph_indices),
+                    primary_role="title",
+                    extra={"derived_from": "title.prefix"},
+                    source="derived",
+                    confidence=0.9,
+                )
+        if signature:
+            return SemanticEntity(
+                name="issuer",
+                text=signature.text,
+                paragraph_indices=list(signature.paragraph_indices),
+                primary_role="signature",
+                style=signature.style,
+                extra={"derived_from": "signature"},
+                source="derived",
+                confidence=0.8,
+            )
+        return None
@@ -0,0 +1,104 @@
+"""LLM 字段抽取：差量模式（仅对未知字段构造 prompt）。"""
+
+from __future__ import annotations
+import logging
+from typing import Any
+from fastapi_modules.fastapi_leaudit.govdoc_engine.models import Document
+from fastapi_modules.fastapi_leaudit.govdoc_engine.llm.client import LlmClient, _format_exc
+
+_log = logging.getLogger(__name__)
+
+
+# Prompt template for differential extraction. Filled via str.format with
+# ``text``, ``spec_block`` and ``example``; "{{{example}}}" renders as the
+# example wrapped in one literal pair of braces.
+_PROMPT_HEAD = """从下面的公文中抽取以下指定字段，仅以 JSON 输出。
+
+【公文内容（顺序段落）】
+{text}
+
+【需要抽取的字段】
+{spec_block}
+
+【输出格式】
+仅 JSON：{{{example}}}
+未识别的字段填 ""（list 类型填 []）。
+"""
+
+
+def _build_doc_text(doc: Document) -> str:
+    """Render the document as one ``[index] text`` line per paragraph."""
+    lines = []
+    for para in doc.paragraphs:
+        lines.append(f"[{para.index}] {para.text}")
+    return "\n".join(lines)
+
+
+def _example_for(spec: dict[str, dict]) -> str:
+    """Return the inner part of the example JSON object for ``spec``.
+
+    Fields of type ``list`` get ``[]`` as placeholder, all others ``""``.
+    The surrounding braces are added by the prompt template.
+    """
+    return ", ".join(
+        f'"{name}": []' if meta.get("type", "string") == "list" else f'"{name}": ""'
+        for name, meta in spec.items()
+    )
+
+
+class FieldExtractor:
+    """LLM-based differential field extraction.
+
+    ``extract_missing(doc, spec)``: ``spec`` names the fields to extract; an
+    empty spec (or a missing document) never calls the LLM.
+    """
+
+    def __init__(self, llm_client: LlmClient):
+        self.client = llm_client
+
+    def _build_messages_for_spec(
+        self, doc: Document, spec: dict[str, dict]
+    ) -> list[dict[str, str]]:
+        """Build the single user message asking for exactly the fields in ``spec``."""
+        spec_lines = [
+            f"- {name}: {meta.get('description', name)}"
+            f"（{meta.get('type', 'string')}）"
+            for name, meta in spec.items()
+        ]
+        prompt = _PROMPT_HEAD.format(
+            text=_build_doc_text(doc),
+            spec_block="\n".join(spec_lines) or "（无）",
+            example=_example_for(spec),
+        )
+        return [{"role": "user", "content": prompt}]
+
+    def _shape_missing(
+        self, spec: dict[str, dict], resp: dict
+    ) -> dict[str, Any]:
+        """Project the LLM response onto ``spec``.
+
+        Every field of ``spec`` is present in the result; missing or falsy
+        values become ``[]`` for list fields and ``""`` otherwise. A response
+        that is not a JSON object (e.g. a list or ``None``) is treated as
+        empty instead of raising outside the callers' error handling.
+        """
+        if not isinstance(resp, dict):
+            resp = {}
+        out: dict[str, Any] = {}
+        for name, meta in spec.items():
+            empty: Any = [] if meta.get("type") == "list" else ""
+            out[name] = resp.get(name) or empty
+        return out
+
+    def extract_missing(
+        self, doc: Document | None, spec: dict[str, dict]
+    ) -> dict[str, Any]:
+        """Extract the fields in ``spec`` synchronously.
+
+        Returns ``{}`` without calling the LLM when ``spec`` is empty or
+        ``doc`` is ``None``. LLM errors are logged and yield empty values.
+        """
+        if not spec or doc is None:
+            return {}
+        label = "extract_missing__" + ",".join(spec.keys())
+        try:
+            resp = self.client.chat_json(
+                self._build_messages_for_spec(doc, spec), label=label,
+            )
+        except Exception as e:
+            _log.warning("Differential extraction failed: %s", _format_exc(e))
+            resp = {}
+        return self._shape_missing(spec, resp)
+
+    async def extract_missing_async(
+        self, doc: Document | None, spec: dict[str, dict]
+    ) -> dict[str, Any]:
+        """Async counterpart of ``extract_missing`` with the same contract."""
+        if not spec or doc is None:
+            return {}
+        label = "extract_missing__" + ",".join(spec.keys())
+        try:
+            resp = await self.client.chat_json_async(
+                self._build_messages_for_spec(doc, spec), label=label,
+            )
+        except Exception as e:
+            _log.warning("Differential extraction failed: %s", _format_exc(e))
+            resp = {}
+        return self._shape_missing(spec, resp)
@@ -0,0 +1,83 @@
+"""doc / wps → docx 转换。"""
+
+from __future__ import annotations
+
+import shutil
+import subprocess
+from pathlib import Path
+
+from fastapi_modules.fastapi_leaudit.govdoc_engine.config import get_settings
+
+
+class UnsupportedFormat(Exception):
+    """Raised by ``load_to_docx`` when the file extension is not .docx, .doc or .wps."""
+    pass
+
+
+class ConversionError(Exception):
+    """Raised when soffice cannot be found, times out, fails, or produces no output."""
+    pass
+
+
+# Extensions returned as-is by ``load_to_docx``.
+_SUPPORTED_DIRECT = {".docx"}
+# Extensions that are converted to .docx through soffice.
+_SUPPORTED_CONVERT = {".doc", ".wps"}
+# Locations tried, in order, after the configured soffice path.
+_SOFFICE_FALLBACK_PATHS = (
+    "/opt/homebrew/bin/soffice",
+    "/usr/local/bin/soffice",
+    "/Applications/LibreOffice.app/Contents/MacOS/soffice",
+    "/usr/bin/soffice",
+)
+
+
+def load_to_docx(src: Path) -> Path:
+    """Return a .docx path for ``src``.
+
+    A .docx file is returned unchanged; .doc/.wps files are converted with
+    soffice. Any other extension raises ``UnsupportedFormat``. The extension
+    is compared case-insensitively.
+    """
+    suffix = src.suffix.lower()
+    if suffix in _SUPPORTED_CONVERT:
+        return _convert_via_soffice(src)
+    if suffix not in _SUPPORTED_DIRECT:
+        raise UnsupportedFormat(f"unsupported file type: {suffix}")
+    return src
+
+
+def _convert_via_soffice(src: Path) -> Path:
+    """Convert ``src`` to .docx with headless soffice, next to the source file.
+
+    Returns:
+        Path of the produced ``<stem>.docx`` in ``src.parent``.
+
+    Raises:
+        ConversionError: soffice is missing, runs longer than 60 seconds,
+            exits non-zero, or leaves no freshly written output file.
+    """
+    soffice = _resolve_soffice_path(get_settings().soffice_path)
+
+    out_dir = src.parent
+    out = out_dir / (src.stem + ".docx")
+    # Remember the timestamp of an output file that is already there, so a
+    # run that writes nothing cannot pass a stale file off as its result.
+    try:
+        stale_mtime = out.stat().st_mtime_ns
+    except OSError:
+        stale_mtime = None
+
+    cmd = [
+        soffice, "--headless", "--convert-to", "docx",
+        "--outdir", str(out_dir), str(src),
+    ]
+    try:
+        result = subprocess.run(
+            cmd, capture_output=True, timeout=60,
+        )
+    except subprocess.TimeoutExpired as e:
+        raise ConversionError("soffice timeout") from e
+
+    if result.returncode != 0:
+        raise ConversionError(
+            f"soffice exit {result.returncode}: {result.stderr.decode(errors='ignore')}"
+        )
+
+    # A zero exit code alone is not taken as success: the file must exist...
+    try:
+        fresh_mtime = out.stat().st_mtime_ns
+    except OSError:
+        raise ConversionError(f"expected output not found: {out}") from None
+    # ...and, if it existed before, must have been rewritten by this run.
+    if fresh_mtime == stale_mtime:
+        raise ConversionError(f"soffice did not rewrite existing output: {out}")
+    return out
+
+
+def _resolve_soffice_path(configured: str) -> str:
+    """Return a usable soffice executable path.
+
+    The configured value is tried first, then the fallback locations. Each
+    candidate is resolved through ``shutil.which`` and, failing that,
+    accepted if it names an existing file.
+
+    Raises:
+        ConversionError: no candidate could be resolved.
+    """
+    candidates = [configured, *_SOFFICE_FALLBACK_PATHS]
+    checked: list[str] = []
+    for candidate in candidates:
+        # An unset/empty setting must not count as a hit: Path("") is the
+        # current directory and therefore "exists"; None breaks shutil.which.
+        if not candidate or candidate in checked:
+            continue
+        checked.append(candidate)
+
+        resolved = shutil.which(candidate)
+        if resolved:
+            return resolved
+        # Only a file can be executed; a directory of that name is no match.
+        if Path(candidate).is_file():
+            return candidate
+
+    raise ConversionError(
+        f"soffice not found; checked: {', '.join(checked)}. "
+        "Install LibreOffice or set SOFFICE_PATH."
+    )
@@ -0,0 +1,50 @@
+"""组合规则 tagger + LLM tagger 的总入口。"""
+
+from __future__ import annotations
+import asyncio
+from fastapi_modules.fastapi_leaudit.govdoc_engine.models import Document
+from fastapi_modules.fastapi_leaudit.govdoc_engine.parser.role_tagger_rule import RuleBasedTagger
+from fastapi_modules.fastapi_leaudit.govdoc_engine.parser.role_tagger_llm import LlmTagger
+from fastapi_modules.fastapi_leaudit.govdoc_engine.llm.client import LlmClient
+
+
+class RoleTagger:
+    """Two-stage tagger: rules first, then the LLM for paragraphs below ``threshold``."""
+
+    def __init__(
+        self,
+        llm_client: LlmClient | None = None,
+        threshold: float = 0.8,
+    ):
+        self.rule = RuleBasedTagger()
+        self.llm = LlmTagger(llm_client) if llm_client else None
+        self.threshold = threshold
+
+    def _low_conf_indices(self, doc: Document) -> list[int]:
+        """Return positions of paragraphs worth sending to the LLM.
+
+        Paragraphs without visible text are left out: the rule tagger gives
+        them ("unknown", 0.5), which is below the default threshold, so each
+        blank line would otherwise cost one LLM call with nothing to classify.
+        """
+        return [
+            i for i, p in enumerate(doc.paragraphs)
+            if p.role_confidence < self.threshold and p.text.strip()
+        ]
+
+    def tag(self, doc: Document) -> None:
+        """Tag ``doc`` in place, resolving low-confidence paragraphs one by one."""
+        self.rule.tag(doc)
+        if self.llm is None:
+            return
+        for i in self._low_conf_indices(doc):
+            role, conf = self.llm.disambiguate(doc, i)
+            doc.paragraphs[i].role = role
+            doc.paragraphs[i].role_confidence = conf
+
+    async def tag_async(self, doc: Document) -> None:
+        """Tag ``doc`` in place, resolving low-confidence paragraphs concurrently."""
+        self.rule.tag(doc)
+        if self.llm is None:
+            return
+        targets = self._low_conf_indices(doc)
+        if not targets:
+            return
+        results = await asyncio.gather(
+            *(self.llm.disambiguate_async(doc, i) for i in targets)
+        )
+        for i, (role, conf) in zip(targets, results):
+            doc.paragraphs[i].role = role
+            doc.paragraphs[i].role_confidence = conf
@@ -0,0 +1,90 @@
+"""LLM 兜底打 role：对低置信段落做二次确认。"""
+
+from __future__ import annotations
+import logging
+from fastapi_modules.fastapi_leaudit.govdoc_engine.models import Document, Role
+from fastapi_modules.fastapi_leaudit.govdoc_engine.llm.client import LlmClient, _format_exc
+
+_log = logging.getLogger(__name__)
+
+
+# Roles the LLM may answer with; anything else is coerced to "unknown".
+# "attachment_title" is listed because the rule tagger emits it, so it can
+# show up in the prompt context and should be accepted as an answer.
+VALID_ROLES = [
+    "title", "doc_number", "recipient",
+    "heading_1", "heading_2", "heading_3", "heading_4",
+    "body", "attachment_marker", "attachment_title", "signature", "date",
+    "no_text_marker", "unknown",
+]
+
+
+# Prompt template for role disambiguation. Filled via str.format with
+# ``context``, ``idx``, ``text``, ``current_role``, ``conf`` and ``roles``;
+# the doubled braces on the last line render as a literal JSON object.
+_PROMPT = """你是公文格式专家。下面是一份公文的段落列表，请为指定的"待定段落"判断其角色。
+
+【全文段落（带索引和当前规则推测）】
+{context}
+
+【待定段落 idx={idx}】
+文本: {text}
+当前推测角色: {current_role}（置信度 {conf:.2f}）
+
+【角色取值范围】
+{roles}
+
+请综合公文结构判断该段落最可能的角色。
+
+仅以 JSON 输出：
+{{"role": "<角色>", "confidence": <0-1 浮点数>, "reason": "<简短理由>"}}
+"""
+
+
+class LlmTagger:
+    """Ask the LLM to confirm or correct the role of a single paragraph."""
+
+    def __init__(self, client: LlmClient):
+        self.client = client
+
+    def _build_prompt(self, doc: Document, target_idx: int) -> tuple[str, "object"]:
+        """Return the prompt for paragraph ``target_idx`` and that paragraph.
+
+        The context lists every paragraph with its current role and the first
+        60 characters of its text; the target line is flagged.
+        """
+        ctx_lines = []
+        for p in doc.paragraphs:
+            tag = "← 待定" if p.index == target_idx else ""
+            ctx_lines.append(f"[{p.index}] role={p.role} text={p.text[:60]} {tag}")
+        ctx = "\n".join(ctx_lines)
+        target = doc.paragraphs[target_idx]
+        prompt = _PROMPT.format(
+            context=ctx,
+            idx=target_idx,
+            text=target.text,
+            current_role=target.role or "unknown",
+            conf=target.role_confidence,
+            roles=", ".join(VALID_ROLES),
+        )
+        return prompt, target
+
+    @staticmethod
+    def _keep_current(target) -> tuple[Role, float]:
+        """Return the paragraph's existing role and confidence unchanged."""
+        return target.role or "unknown", target.role_confidence  # type: ignore[return-value]
+
+    def _interpret(self, resp: dict, target) -> tuple[Role, float]:
+        """Turn the LLM response into ``(role, confidence)``.
+
+        A response that is not a JSON object keeps the paragraph's current
+        tagging, like an LLM error does. An unknown role becomes "unknown".
+        A missing, non-numeric or NaN confidence becomes 0.5, and the value
+        is clamped to the 0-1 range the prompt asks for.
+        """
+        if not isinstance(resp, dict):
+            return self._keep_current(target)
+        role = resp.get("role", "unknown")
+        if role not in VALID_ROLES:
+            role = "unknown"
+        try:
+            conf = float(resp.get("confidence", 0.5))
+        except (TypeError, ValueError):
+            conf = 0.5
+        if conf != conf:  # NaN never equals itself
+            conf = 0.5
+        conf = min(1.0, max(0.0, conf))
+        return role, conf  # type: ignore[return-value]
+
+    def disambiguate(self, doc: Document, target_idx: int) -> tuple[Role, float]:
+        """Classify paragraph ``target_idx``; on LLM errors keep its current tagging."""
+        prompt, target = self._build_prompt(doc, target_idx)
+        label = f"role_tag_p{target_idx}"
+        try:
+            resp = self.client.chat_json(
+                [{"role": "user", "content": prompt}], label=label,
+            )
+        except Exception as e:
+            _log.warning("Role disambiguation skipped (LLM error): %s", _format_exc(e))
+            return self._keep_current(target)
+        return self._interpret(resp, target)
+
+    async def disambiguate_async(
+        self, doc: Document, target_idx: int
+    ) -> tuple[Role, float]:
+        """Async counterpart of ``disambiguate`` with the same contract."""
+        prompt, target = self._build_prompt(doc, target_idx)
+        label = f"role_tag_p{target_idx}"
+        try:
+            resp = await self.client.chat_json_async(
+                [{"role": "user", "content": prompt}], label=label,
+            )
+        except Exception as e:
+            _log.warning("Role disambiguation skipped (LLM error): %s", _format_exc(e))
+            return self._keep_current(target)
+        return self._interpret(resp, target)
@@ -0,0 +1,132 @@
+"""基于位置 + 文字模式 + 字体样式的段落角色识别。"""
+
+from __future__ import annotations
+import re
+from fastapi_modules.fastapi_leaudit.govdoc_engine.models import Document, Paragraph, Role
+
+
+# Heading level 1: Chinese numeral followed by "、", e.g. "一、".
+HEADING_1_RE = re.compile(r"^[一二三四五六七八九十百]+、")
+# Heading level 2: Chinese numeral in full-width parentheses, e.g. "（一）".
+HEADING_2_RE = re.compile(r"^（[一二三四五六七八九十]+）")
+# Heading level 3: digits followed by an ASCII or full-width dot, e.g. "1.".
+HEADING_3_RE = re.compile(r"^\d+[\.．]")
+# Heading level 4: digits in full-width parentheses, e.g. "（1）".
+HEADING_4_RE = re.compile(r"^（\d+）")
+# Document number: CJK text, a 4-digit year in 〔〕 or [], optional "第", digits, "号".
+DOC_NUMBER_RE = re.compile(r"[一-龥]+[〔\[]\d{4}[〕\]]第?\d+号")
+# A line consisting only of a date, written with Arabic or with Chinese numerals.
+DATE_RE = re.compile(
+    r"^\d{4}年\d{1,2}月\d{1,2}日$"
+    r"|^[一二三四五六七八九十○〇零]+年[一二三四五六七八九十○〇零]+月[一二三四五六七八九十○〇零]+日$"
+)
+# Attachment marker: "附件" followed by a colon or a digit 1-9.
+ATTACHMENT_RE = re.compile(r"^附件[:：1-9]")
+# "(此页无正文)" marker in ASCII or full-width parentheses.
+NO_TEXT_RE = re.compile(r"^[\(（]\s*此页无正文\s*[\)）]")
+# Recipient lines end with a colon (optionally followed by whitespace).
+RECIPIENT_TAIL_RE = re.compile(r"[：:]\s*$")
+# At least one of these must occur in a recipient line.
+RECIPIENT_HINTS = (
+    "局", "委", "府", "厅", "办", "公司", "各", "处室",
+    "委员会", "署", "院", "部", "司", "处",
+)
+# None of these may occur in a recipient line.
+RECIPIENT_BLOCKLIST = (
+    "现将", "兹", "经研究", "为做好", "为深入", "为进一步",
+    "根据", "如下", "汇报", "通知如下", "请示如下",
+)
+
+
+class RuleBasedTagger:
+    """Assign a role and confidence to every paragraph from position, text pattern and font."""
+
+    @staticmethod
+    def _in_header_footer(p: Paragraph) -> bool:
+        """Return True when the paragraph is flagged as header or footer content."""
+        return bool(getattr(p, "in_header", False) or getattr(p, "in_footer", False))
+
+    def tag(self, doc: Document) -> None:
+        """Tag all paragraphs of ``doc`` in place."""
+        # Header/footer paragraphs are appended after the body by the parser.
+        # Counting them would push the real end of the document out of the
+        # "last three paragraphs" window used to find the signature, so only
+        # body paragraphs make up the total.
+        n = sum(1 for p in doc.paragraphs if not self._in_header_footer(p))
+        for i, p in enumerate(doc.paragraphs):
+            role, conf = self._classify(p, i, n, doc)
+            p.role = role
+            p.role_confidence = conf
+
+    def _classify(
+        self, p: Paragraph, idx: int, total: int, doc: Document
+    ) -> tuple[Role, float]:
+        """Return ``(role, confidence)`` for paragraph ``p`` at position ``idx``.
+
+        ``total`` is the number of body paragraphs. The checks run in a fixed
+        order and the first match wins.
+        """
+        text = p.text.strip()
+
+        if not text:
+            return ("unknown", 0.5)
+
+        if NO_TEXT_RE.match(text):
+            return ("no_text_marker", 1.0)
+
+        if ATTACHMENT_RE.match(text):
+            return ("attachment_marker", 0.95)
+
+        if DATE_RE.match(text):
+            return ("date", 0.9)
+
+        if DOC_NUMBER_RE.search(text) and idx <= 5:
+            return ("doc_number", 0.95)
+
+        # The first paragraph, or an early centered paragraph of 18 pt or more.
+        if idx == 0 or (
+            idx <= 2
+            and p.style.alignment == "center"
+            and (p.style.font_size_pt or 0) >= 18
+        ):
+            return ("title", 0.95)
+
+        font = (p.style.font_eastasia or "").strip()
+        size = p.style.font_size_pt or 0
+
+        if self._is_attachment_title(p, idx, doc):
+            return ("attachment_title", 0.9)
+
+        # Headings: the expected font raises the confidence.
+        if HEADING_1_RE.match(text):
+            conf = 0.95 if "黑体" in font else 0.7
+            return ("heading_1", conf)
+
+        if HEADING_2_RE.match(text):
+            conf = 0.95 if "楷体" in font else 0.7
+            return ("heading_2", conf)
+
+        if HEADING_3_RE.match(text):
+            conf = 0.9 if "仿宋" in font else 0.65
+            return ("heading_3", conf)
+
+        if HEADING_4_RE.match(text):
+            return ("heading_4", 0.85)
+
+        if (
+            idx <= 6
+            and 3 <= len(text) <= 50
+            and RECIPIENT_TAIL_RE.search(text)
+            and any(kw in text for kw in RECIPIENT_HINTS)
+            and not any(kw in text for kw in RECIPIENT_BLOCKLIST)
+        ):
+            return ("recipient", 0.9)
+
+        # Signature: a short organisation-like line among the last three body
+        # paragraphs. Header/footer text is never a signature.
+        if (
+            not self._in_header_footer(p)
+            and total - idx <= 3
+            and 5 <= len(text) <= 30
+            and any(
+                kw in text
+                for kw in ["局", "公司", "委员会", "人民政府", "办公厅", "办公室"]
+            )
+        ):
+            return ("signature", 0.7)
+
+        if size >= 14 or font:
+            return ("body", 0.85)
+
+        return ("unknown", 0.4)
+
+    @staticmethod
+    def _is_attachment_title(p: Paragraph, idx: int, doc: Document) -> bool:
+        """Detect the title on the first page of an attachment body.
+
+        This keeps such a title from being checked as ordinary body text
+        under rule GW-F-004. The paragraph must be centered, at least 18 pt,
+        set in a "小标宋" font, and follow an attachment marker by at most
+        12 paragraphs; if the marker names the attachment, the texts must
+        overlap.
+        """
+        if idx <= 0:
+            return False
+        text = p.text.strip()
+        font = (p.style.font_eastasia or "").strip()
+        if (
+            p.style.alignment != "center"
+            or (p.style.font_size_pt or 0) < 18
+            or "小标宋" not in font
+        ):
+            return False
+
+        # Nearest preceding attachment marker, by role or by text pattern
+        # (earlier paragraphs may not carry a role yet).
+        marker_index = None
+        marker_text = ""
+        for prev in reversed(doc.paragraphs[:idx]):
+            if prev.role == "attachment_marker" or ATTACHMENT_RE.match(prev.text.strip()):
+                marker_index = prev.index
+                marker_text = prev.text.strip()
+                break
+        if marker_index is None or idx - marker_index > 12:
+            return False
+
+        # Strip "附件N：" and a leading item number to get the attachment name.
+        attachment_name = re.sub(r"^附件\d*[:：]\s*", "", marker_text).strip()
+        attachment_name = re.sub(r"^\d+[\.．、)]\s*", "", attachment_name).strip()
+        return not attachment_name or text == attachment_name or text in attachment_name or attachment_name in text
@@ -0,0 +1,241 @@
+"""OOXML 字体解析：处理样式继承链 + 主题字体。
+
+Word 把字体属性分散在四个层级：
+1. 直接 run rPr：`<w:r><w:rPr><w:rFonts/></w:rPr>...`
+2. 段落 rPr（段落标记字体）：`<w:p><w:pPr><w:rPr><w:rFonts/></w:rPr></w:pPr>`
+3. 段落引用样式：`<w:p><w:pPr><w:pStyle w:val="Heading1"/></w:pPr>`
+   样式定义在 styles.xml，可经 `<w:basedOn>` 链向上继承
+4. 全局默认：styles.xml 的 `<w:docDefaults>`
+
+此外 `<w:rFonts>` 的 `*Theme` 属性指向 theme1.xml 中的字体方案
+（majorEastAsia / minorEastAsia 等），需要做二次解析。
+"""
+
+from __future__ import annotations
+from dataclasses import dataclass
+from docx.oxml.ns import qn
+from lxml import etree
+
+# DrawingML "main" XML namespace, used when querying theme1.xml
+_DML_NS = "http://schemas.openxmlformats.org/drawingml/2006/main"
+
+
+@dataclass
+class ResolvedRunStyle:
+    """Run formatting values collected while resolving a style cascade.
+
+    Every field defaults to ``None``, which the resolver treats as
+    "not specified yet": such fields are filled from lower-priority levels.
+    """
+
+    # East Asian font name (from w:rFonts eastAsia / its theme reference)
+    font_eastasia: str | None = None
+    # ASCII font name (from w:rFonts ascii / its theme reference)
+    font_ascii: str | None = None
+    # Font size in points (w:sz stores half-points; the resolver divides by 2)
+    size_pt: float | None = None
+    # Bold flag; None when no level specified it
+    bold: bool | None = None
+    # Italic flag; None when no level specified it
+    italic: bool | None = None
+
+
+def _empty_to_none(s: str | None) -> str | None:
+    if s is None:
+        return None
+    s = s.strip()
+    return s or None
+
+
+class StyleResolver:
+    """构造时一次性解析样式表 + 主题；之后 resolve_run() 是 O(链长)。"""
+
+    def __init__(self, docx):
+        self._theme = self._load_theme(docx)
+        self._styles, self._doc_defaults = self._load_styles(docx)
+
+    # ---- 主题 ---------------------------------------------------------
+    def _load_theme(self, docx) -> dict[tuple[str, str], str | None]:
+        """返回 {(axis, scheme_attr): font_name}。
+
+        scheme_attr 形如 'majorEastAsia' / 'minorAscii'，axis 是 rFonts 的轴。
+        """
+        out: dict[tuple[str, str], str | None] = {}
+        try:
+            theme_part = next(
+                p for p in docx.part.package.parts
+                if p.partname.endswith("/theme/theme1.xml")
+            )
+        except StopIteration:
+            return out
+        try:
+            root = etree.fromstring(theme_part.blob)
+        except etree.XMLSyntaxError:
+            return out
+
+        ns = {"a": _DML_NS}
+        for kind, font_tag in (("major", "majorFont"), ("minor", "minorFont")):
+            font_elem = root.find(f".//a:fontScheme/a:{font_tag}", ns)
+            if font_elem is None:
+                continue
+            latin = font_elem.find("a:latin", ns)
+            ea = font_elem.find("a:ea", ns)
+            cs = font_elem.find("a:cs", ns)
+            # ea 为空时用简中 Hans 兜底
+            ea_val = _empty_to_none(ea.get("typeface")) if ea is not None else None
+            if ea_val is None:
+                hans = font_elem.find('a:font[@script="Hans"]', ns)
+                if hans is not None:
+                    ea_val = _empty_to_none(hans.get("typeface"))
+            latin_val = _empty_to_none(latin.get("typeface")) if latin is not None else None
+            cs_val = _empty_to_none(cs.get("typeface")) if cs is not None else None
+
+            out[("ascii", f"{kind}Ascii")] = latin_val
+            out[("ascii", f"{kind}HAnsi")] = latin_val  # asciiTheme=majorHAnsi 也可能出现
+            out[("hAnsi", f"{kind}HAnsi")] = latin_val
+            out[("hAnsi", f"{kind}Ascii")] = latin_val
+            out[("eastAsia", f"{kind}EastAsia")] = ea_val
+            out[("cs", f"{kind}Bidi")] = cs_val
+        return out
+
+    # ---- 样式表 -------------------------------------------------------
+    def _load_styles(
+        self, docx
+    ) -> tuple[dict[str, dict], ResolvedRunStyle | None]:
+        out: dict[str, dict] = {}
+        defaults: ResolvedRunStyle | None = None
+        try:
+            styles_root = docx.part._styles_part.element
+        except (AttributeError, KeyError):
+            return out, defaults
+        if styles_root is None:
+            return out, defaults
+
+        # docDefaults
+        ddef = styles_root.find(qn("w:docDefaults"))
+        if ddef is not None:
+            rdef = ddef.find(qn("w:rPrDefault"))
+            if rdef is not None:
+                defaults = self._read_rpr(rdef.find(qn("w:rPr")))
+
+        # 各 style
+        for style in styles_root.findall(qn("w:style")):
+            sid = style.get(qn("w:styleId"))
+            if not sid:
+                continue
+            rpr = style.find(qn("w:rPr"))
+            ppr = style.find(qn("w:pPr"))
+            ppr_rpr = ppr.find(qn("w:rPr")) if ppr is not None else None
+            based_on = None
+            bo = style.find(qn("w:basedOn"))
+            if bo is not None:
+                based_on = bo.get(qn("w:val"))
+            link = style.find(qn("w:link"))
+            link_id = link.get(qn("w:val")) if link is not None else None
+            out[sid] = {
+                "rpr": rpr,
+                "ppr_rpr": ppr_rpr,
+                "based_on": based_on,
+                "link": link_id,
+            }
+        return out, defaults
+
+    # ---- 读 rPr -------------------------------------------------------
+    def _read_rpr(self, rpr) -> ResolvedRunStyle | None:
+        if rpr is None:
+            return None
+        rs = ResolvedRunStyle()
+        rfonts = rpr.find(qn("w:rFonts"))
+        if rfonts is not None:
+            rs.font_eastasia = self._resolve_font_axis(rfonts, "eastAsia")
+            rs.font_ascii = self._resolve_font_axis(rfonts, "ascii")
+        sz = rpr.find(qn("w:sz"))
+        if sz is not None and sz.get(qn("w:val")):
+            try:
+                rs.size_pt = float(sz.get(qn("w:val"))) / 2.0
+            except ValueError:
+                pass
+        if rpr.find(qn("w:b")) is not None:
+            rs.bold = True
+        if rpr.find(qn("w:i")) is not None:
+            rs.italic = True
+        return rs
+
+    def _resolve_font_axis(self, rfonts, axis: str) -> str | None:
+        """同一根 rFonts 上 explicit > theme。"""
+        explicit = _empty_to_none(rfonts.get(qn(f"w:{axis}")))
+        if explicit:
+            return explicit
+        theme_attr = "cstheme" if axis == "cs" else f"{axis}Theme"
+        theme = _empty_to_none(rfonts.get(qn(f"w:{theme_attr}")))
+        if theme:
+            return self._theme.get((axis, theme))
+        return None
+
+    # ---- 合并 ---------------------------------------------------------
+    @staticmethod
+    def _fill(target: ResolvedRunStyle, source: ResolvedRunStyle | None) -> None:
+        """target 已有的字段保留；缺的从 source 取。"""
+        if source is None:
+            return
+        if target.font_eastasia is None:
+            target.font_eastasia = source.font_eastasia
+        if target.font_ascii is None:
+            target.font_ascii = source.font_ascii
+        if target.size_pt is None:
+            target.size_pt = source.size_pt
+        if target.bold is None:
+            target.bold = source.bold
+        if target.italic is None:
+            target.italic = source.italic
+
+    def _resolve_style_chain(
+        self, sid: str | None, _seen: set[str] | None = None
+    ) -> ResolvedRunStyle | None:
+        """段落样式 → 链向 basedOn → 沿途累积 rPr 与 pPr 的 rPr。"""
+        if sid is None:
+            return None
+        seen = _seen or set()
+        if sid in seen:
+            return None
+        seen = seen | {sid}
+        info = self._styles.get(sid)
+        if info is None:
+            return None
+        # 当前 style 的两个 rPr
+        rs = ResolvedRunStyle()
+        self._fill(rs, self._read_rpr(info.get("rpr")))
+        self._fill(rs, self._read_rpr(info.get("ppr_rpr")))
+        # 链接的 character style（如果有）
+        if info.get("link"):
+            self._fill(rs, self._resolve_style_chain(info["link"], seen))
+        # 父样式
+        if info.get("based_on"):
+            self._fill(rs, self._resolve_style_chain(info["based_on"], seen))
+        return rs
+
+    # ---- 主入口 -------------------------------------------------------
+    def resolve_run(self, p_elem, run_elem) -> ResolvedRunStyle:
+        """解析单个 run 的最终样式。p_elem 可为 None。"""
+        rs = ResolvedRunStyle()
+        # 1. 直接 run rPr
+        if run_elem is not None:
+            self._fill(rs, self._read_rpr(run_elem.find(qn("w:rPr"))))
+        # 2. 段落 rPr（段落标记字体）+ pStyle 链
+        if p_elem is not None:
+            ppr = p_elem.find(qn("w:pPr"))
+            if ppr is not None:
+                self._fill(rs, self._read_rpr(ppr.find(qn("w:rPr"))))
+                pstyle = ppr.find(qn("w:pStyle"))
+                if pstyle is not None and pstyle.get(qn("w:val")):
+                    self._fill(rs, self._resolve_style_chain(pstyle.get(qn("w:val"))))
+        # 3. 默认 style "Normal"（中文文档常见）
+        if "Normal" in self._styles:
+            self._fill(rs, self._resolve_style_chain("Normal"))
+        # 4. docDefaults
+        self._fill(rs, self._doc_defaults)
+        return rs
+
+    def resolve_paragraph(self, p_elem) -> ResolvedRunStyle:
+        """段落整体样式（不读 run，仅 pPr/style/默认）。"""
+        rs = ResolvedRunStyle()
+        if p_elem is not None:
+            ppr = p_elem.find(qn("w:pPr"))
+            if ppr is not None:
+                self._fill(rs, self._read_rpr(ppr.find(qn("w:rPr"))))
+                pstyle = ppr.find(qn("w:pStyle"))
+                if pstyle is not None and pstyle.get(qn("w:val")):
+                    self._fill(rs, self._resolve_style_chain(pstyle.get(qn("w:val"))))
+        if "Normal" in self._styles:
+            self._fill(rs, self._resolve_style_chain("Normal"))
+        self._fill(rs, self._doc_defaults)
+        return rs