# leaudit-platform-backend/fastapi_modules/fastapi_leaudit/govdoc_engine/engine/structure.py

"""从 Document 派生出 structure（按 role 分类统计）+ outline（heading 层级树）。"""

from __future__ import annotations
from collections import Counter
from fastapi_modules.fastapi_leaudit.govdoc_engine.models import Document, Paragraph
from fastapi_modules.fastapi_leaudit.govdoc_engine.engine.result import OutlineNode, StructureItem


# Ordered table of the paragraph roles reported by build_structure().
# The list order is the order in which StructureItem entries are emitted.
_ROLE_LABELS: list[tuple[str, str, bool]] = [
    # (role, Chinese display label, whether a regular official document must contain it)
    ("title", "标题", True),
    ("doc_number", "发文字号", True),
    ("recipient", "主送机关", True),
    ("heading_1", "一级标题", False),
    ("heading_2", "二级标题", False),
    ("heading_3", "三级标题", False),
    ("heading_4", "四级标题", False),
    ("body", "正文", True),
    ("attachment_marker", "附件标记", False),
    ("attachment_title", "附件标题", False),
    ("signature", "署名", True),
    ("date", "成文日期", True),
    ("no_text_marker", "(此页无正文)", False),
    ("unknown", "未识别", False),
]

# Maps each heading role name ("heading_1" .. "heading_4") to its numeric
# nesting depth (1 .. 4); build_outline() uses it to pick out heading paragraphs.
_HEADING_LEVELS = {f"heading_{depth}": depth for depth in range(1, 5)}


def _dominant_style(paragraphs: list[Paragraph]) -> tuple[str | None, float | None, bool]:
    """Return the most frequent East Asian font and font size, plus a uniformity flag.

    Paragraphs whose ``style.font_eastasia`` is falsy are left out of the font
    tally, and paragraphs whose ``style.font_size_pt`` is ``None`` are left out
    of the size tally.

    Returns:
        A ``(font, size, uniform)`` tuple. ``font`` / ``size`` are ``None`` when
        no paragraph supplies a value. ``uniform`` is ``True`` when at most one
        distinct font and at most one distinct size were seen; paragraphs with
        a missing value do not count as a difference.
    """
    if not paragraphs:
        return None, None, True

    font_tally = Counter()
    size_tally = Counter()
    for para in paragraphs:
        if para.style.font_eastasia:
            font_tally[para.style.font_eastasia] += 1
        if para.style.font_size_pt is not None:
            size_tally[para.style.font_size_pt] += 1

    top_font = None
    if font_tally:
        (top_font, _), = font_tally.most_common(1)
    top_size = None
    if size_tally:
        (top_size, _), = size_tally.most_common(1)

    has_mixed_styles = len(font_tally) > 1 or len(size_tally) > 1
    return top_font, top_size, not has_mixed_styles


def build_structure(doc: Document) -> list[StructureItem]:
    """Summarise the document's paragraphs per role, in ``_ROLE_LABELS`` order.

    For every role in ``_ROLE_LABELS`` the paragraphs of ``doc.paragraphs``
    carrying that role are collected and condensed into one ``StructureItem``:
    count, paragraph indices, up to three text samples (first 60 characters
    each), total character count and the dominant font / size.

    Roles flagged as expected always produce an item, even with zero matching
    paragraphs. Roles that are not expected are omitted when they do not occur.

    Args:
        doc: The document whose ``paragraphs`` are classified by ``role``.

    Returns:
        One ``StructureItem`` per reported role.
    """
    summary: list[StructureItem] = []
    for role, label, expected in _ROLE_LABELS:
        matched = [para for para in doc.paragraphs if para.role == role]
        # Optional roles that never occur are hidden to keep the panel compact.
        if not (matched or expected):
            continue

        preview = [para.text[:60] for para in matched[:3]]
        dominant_font, dominant_size, is_uniform = _dominant_style(matched)
        entry = StructureItem(
            role=role,
            label=label,
            count=len(matched),
            expected=expected,
            paragraph_indices=[para.index for para in matched],
            samples=preview,
            char_total=sum(len(para.text) for para in matched),
            dominant_font=dominant_font,
            dominant_size_pt=dominant_size,
            style_uniform=is_uniform,
        )
        summary.append(entry)
    return summary


def build_outline(doc: Document) -> list[OutlineNode]:
    """Build the heading tree from paragraph order and heading level.

    Only paragraphs whose ``role`` is a key of ``_HEADING_LEVELS`` take part.
    Each heading is attached to the nearest preceding heading with a strictly
    smaller level. A heading with no such predecessor becomes a root.

    Args:
        doc: The document whose ``paragraphs`` are scanned in order.

    Returns:
        The root ``OutlineNode`` objects; an empty list when the document
        contains no heading paragraphs.
    """
    top_level: list[OutlineNode] = []
    # Chain of currently open headings, outermost first. Levels strictly
    # increase along the chain, so its last element is the candidate parent.
    open_chain: list[OutlineNode] = []
    for para in doc.paragraphs:
        level = _HEADING_LEVELS.get(para.role)
        if level is None:
            continue  # not a heading paragraph

        node = OutlineNode(paragraph_index=para.index, level=level, text=para.text)
        # Close every open heading at the same level or deeper: none of them
        # can be the parent of this node.
        while open_chain and open_chain[-1].level >= level:
            open_chain.pop()

        siblings = open_chain[-1].children if open_chain else top_level
        siblings.append(node)
        open_chain.append(node)
    return top_level