"""从 Document 派生出 structure(按 role 分类统计)+ outline(heading 层级树)。""" from __future__ import annotations from collections import Counter from fastapi_modules.fastapi_leaudit.govdoc_engine.models import Document, Paragraph from fastapi_modules.fastapi_leaudit.govdoc_engine.engine.result import OutlineNode, StructureItem _ROLE_LABELS: list[tuple[str, str, bool]] = [ # (role, 中文标签, 是否常规公文必备) ("title", "标题", True), ("doc_number", "发文字号", True), ("recipient", "主送机关", True), ("heading_1", "一级标题", False), ("heading_2", "二级标题", False), ("heading_3", "三级标题", False), ("heading_4", "四级标题", False), ("body", "正文", True), ("attachment_marker", "附件标记", False), ("attachment_title", "附件标题", False), ("signature", "署名", True), ("date", "成文日期", True), ("no_text_marker", "(此页无正文)", False), ("unknown", "未识别", False), ] _HEADING_LEVELS = { "heading_1": 1, "heading_2": 2, "heading_3": 3, "heading_4": 4, } def _dominant_style(paragraphs: list[Paragraph]) -> tuple[str | None, float | None, bool]: """返回 (字体众数, 字号众数, 是否所有段落样式一致)。""" if not paragraphs: return None, None, True fonts = Counter(p.style.font_eastasia for p in paragraphs if p.style.font_eastasia) sizes = Counter(p.style.font_size_pt for p in paragraphs if p.style.font_size_pt is not None) dom_font = fonts.most_common(1)[0][0] if fonts else None dom_size = sizes.most_common(1)[0][0] if sizes else None uniform = len(fonts) <= 1 and len(sizes) <= 1 return dom_font, dom_size, uniform def build_structure(doc: Document) -> list[StructureItem]: items: list[StructureItem] = [] for role, label, expected in _ROLE_LABELS: paragraphs = [p for p in doc.paragraphs if p.role == role] if not paragraphs and not expected: # 非必备 role 没出现就不展示,保持面板紧凑 continue samples = [p.text[:60] for p in paragraphs[:3]] font, size, uniform = _dominant_style(paragraphs) items.append(StructureItem( role=role, label=label, count=len(paragraphs), expected=expected, paragraph_indices=[p.index for p in paragraphs], samples=samples, char_total=sum(len(p.text) for p in paragraphs), dominant_font=font, dominant_size_pt=size, style_uniform=uniform, )) return items def build_outline(doc: Document) -> list[OutlineNode]: """按段落顺序 + heading 层级生成树。""" headings = [ (p.index, _HEADING_LEVELS[p.role], p.text) for p in doc.paragraphs if p.role in _HEADING_LEVELS ] if not headings: return [] roots: list[OutlineNode] = [] stack: list[OutlineNode] = [] for idx, level, text in headings: node = OutlineNode(paragraph_index=idx, level=level, text=text) # 弹出比当前 level 更深的祖先 while stack and stack[-1].level >= level: stack.pop() if stack: stack[-1].children.append(node) else: roots.append(node) stack.append(node) return roots