94 lines
3.3 KiB
Python
94 lines
3.3 KiB
Python
"""从 Document 派生出 structure(按 role 分类统计)+ outline(heading 层级树)。"""
|
||
|
||
from __future__ import annotations
|
||
from collections import Counter
|
||
from fastapi_modules.fastapi_leaudit.govdoc_engine.models import Document, Paragraph
|
||
from fastapi_modules.fastapi_leaudit.govdoc_engine.engine.result import OutlineNode, StructureItem
|
||
|
||
|
||
# Paragraph roles in the order they are reported by build_structure().
# Each entry is (role key, Chinese display label, required in a regular official document).
_ROLE_LABELS: list[tuple[str, str, bool]] = [
    # Document head
    ("title", "标题", True),
    ("doc_number", "发文字号", True),
    ("recipient", "主送机关", True),
    # Heading levels
    ("heading_1", "一级标题", False),
    ("heading_2", "二级标题", False),
    ("heading_3", "三级标题", False),
    ("heading_4", "四级标题", False),
    # Body and attachments
    ("body", "正文", True),
    ("attachment_marker", "附件标记", False),
    ("attachment_title", "附件标题", False),
    # Document tail
    ("signature", "署名", True),
    ("date", "成文日期", True),
    # Special markers and fallback
    ("no_text_marker", "(此页无正文)", False),
    ("unknown", "未识别", False),
]
|
||
|
||
# Maps each heading role key to its outline depth: "heading_1" -> 1 ... "heading_4" -> 4.
_HEADING_LEVELS = {f"heading_{depth}": depth for depth in range(1, 5)}
|
||
|
||
|
||
def _dominant_style(paragraphs: list[Paragraph]) -> tuple[str | None, float | None, bool]:
    """Summarize the prevailing font and font size of a group of paragraphs.

    Args:
        paragraphs: Paragraphs whose ``style.font_eastasia`` and
            ``style.font_size_pt`` values are tallied.

    Returns:
        A ``(font, size, uniform)`` tuple: the most frequent font, the most
        frequent size, and whether at most one distinct font and at most one
        distinct size were seen. Falsy fonts and ``None`` sizes are left out
        of the tallies, so they never break uniformity. An empty input yields
        ``(None, None, True)``.
    """
    if not paragraphs:
        return None, None, True

    font_tally: Counter = Counter()
    size_tally: Counter = Counter()
    for para in paragraphs:
        style = para.style
        font_name = style.font_eastasia
        if font_name:
            font_tally[font_name] += 1
        point_size = style.font_size_pt
        if point_size is not None:
            size_tally[point_size] += 1

    def _mode(tally):
        # Most frequent key, or None when nothing was counted; ties go to the first key seen.
        if not tally:
            return None
        (winner, _count), = tally.most_common(1)
        return winner

    is_uniform = not (len(font_tally) > 1 or len(size_tally) > 1)
    return _mode(font_tally), _mode(size_tally), is_uniform
|
||
|
||
|
||
def build_structure(doc: Document) -> list[StructureItem]:
    """Build one per-role summary of the document's paragraphs.

    Roles are reported in ``_ROLE_LABELS`` order. A role flagged as expected
    always produces an item, even with zero paragraphs; any other role is
    omitted when no paragraph carries it. Paragraphs whose role is not listed
    in ``_ROLE_LABELS`` are not reported.

    Args:
        doc: Document whose ``paragraphs`` are grouped by their ``role``.

    Returns:
        A list of ``StructureItem`` objects, each holding the paragraph count,
        the paragraph indices, up to three text samples (first 60 characters
        each), the total character count, and the dominant font / size /
        uniformity reported by ``_dominant_style``.
    """
    # Bucket paragraphs by role in a single pass (document order is preserved
    # inside each bucket) instead of rescanning the whole document per role.
    by_role: dict[str, list[Paragraph]] = {}
    for para in doc.paragraphs:
        by_role.setdefault(para.role, []).append(para)

    items: list[StructureItem] = []
    for role, label, expected in _ROLE_LABELS:
        paragraphs = by_role.get(role, [])
        if not paragraphs and not expected:
            # Skip optional roles that never occur so the panel stays compact.
            continue
        font, size, uniform = _dominant_style(paragraphs)
        items.append(StructureItem(
            role=role,
            label=label,
            count=len(paragraphs),
            expected=expected,
            paragraph_indices=[p.index for p in paragraphs],
            samples=[p.text[:60] for p in paragraphs[:3]],
            char_total=sum(len(p.text) for p in paragraphs),
            dominant_font=font,
            dominant_size_pt=size,
            style_uniform=uniform,
        ))
    return items
|
||
|
||
|
||
def build_outline(doc: Document) -> list[OutlineNode]:
    """Assemble the heading paragraphs into a nested outline, in document order.

    Paragraphs whose role is a key of ``_HEADING_LEVELS`` become nodes; all
    other paragraphs are ignored. Each node is attached to the closest
    preceding node with a strictly smaller level, or becomes a top-level
    entry when no such node exists.

    Args:
        doc: Document whose ``paragraphs`` are scanned in order.

    Returns:
        The top-level outline nodes; an empty list when there are no headings.
    """
    top_level: list[OutlineNode] = []
    # Chain of currently open ancestors, shallowest first, with strictly increasing levels.
    open_path: list[OutlineNode] = []

    for para in doc.paragraphs:
        depth = _HEADING_LEVELS.get(para.role)
        if depth is None:
            # Not a heading paragraph.
            continue

        entry = OutlineNode(paragraph_index=para.index, level=depth, text=para.text)

        # Close every open heading at the same or a deeper level; whatever
        # remains on top (if anything) is the parent of the new entry.
        while open_path and open_path[-1].level >= depth:
            open_path.pop()

        siblings = open_path[-1].children if open_path else top_level
        siblings.append(entry)
        open_path.append(entry)

    return top_level
|