Files
leaudit-platform-backend/fastapi_modules/fastapi_leaudit/govdoc_engine/engine/structure.py
T

94 lines
3.3 KiB
Python
Raw Blame History

This file contains ambiguous Unicode characters
This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.
"""Derive structure (per-role statistics) and outline (heading hierarchy tree) from a Document."""
from __future__ import annotations
from collections import Counter
from fastapi_modules.fastapi_leaudit.govdoc_engine.models import Document, Paragraph
from fastapi_modules.fastapi_leaudit.govdoc_engine.engine.result import OutlineNode, StructureItem
# Role table iterated by build_structure: one entry per paragraph role, and the
# order of the entries here is the order of the resulting structure items.
_ROLE_LABELS: list[tuple[str, str, bool]] = [
    # (role, Chinese display label, whether a regular official document must contain it)
    ("title", "标题", True),
    ("doc_number", "发文字号", True),
    ("recipient", "主送机关", True),
    ("heading_1", "一级标题", False),
    ("heading_2", "二级标题", False),
    ("heading_3", "三级标题", False),
    ("heading_4", "四级标题", False),
    ("body", "正文", True),
    ("attachment_marker", "附件标记", False),
    ("attachment_title", "附件标题", False),
    ("signature", "署名", True),
    ("date", "成文日期", True),
    ("no_text_marker", "(此页无正文)", False),
    ("unknown", "未识别", False),
]
# Maps each heading role to its numeric nesting level. build_outline uses the
# keys to pick out heading paragraphs and the values to decide parent/child
# relationships (a smaller number sits higher in the tree).
_HEADING_LEVELS: dict[str, int] = {
    "heading_1": 1,
    "heading_2": 2,
    "heading_3": 3,
    "heading_4": 4,
}
def _dominant_style(paragraphs: list[Paragraph]) -> tuple[str | None, float | None, bool]:
    """Summarise the prevailing font and font size of *paragraphs*.

    Returns:
        A ``(font, size_pt, uniform)`` tuple. ``font`` is the most frequent
        truthy ``style.font_eastasia`` value and ``size_pt`` the most frequent
        non-None ``style.font_size_pt`` value; either is None when no paragraph
        supplies one. On a tie the value seen first wins. ``uniform`` is True
        when at most one distinct font and at most one distinct size were
        counted, so paragraphs lacking a font or size never break uniformity
        (an empty input is therefore uniform).
    """
    font_tally = Counter()
    size_tally = Counter()
    for para in paragraphs:
        style = para.style
        font_name = style.font_eastasia
        if font_name:
            font_tally[font_name] += 1
        size_pt = style.font_size_pt
        if size_pt is not None:
            size_tally[size_pt] += 1

    def _mode(tally):
        # most_common(1) yields a single (value, count) pair for a non-empty tally.
        if not tally:
            return None
        ((value, _count),) = tally.most_common(1)
        return value

    is_uniform = not (len(font_tally) > 1 or len(size_tally) > 1)
    return _mode(font_tally), _mode(size_tally), is_uniform
def build_structure(doc: Document) -> list[StructureItem]:
    """Build the per-role statistics for *doc*.

    Walks ``_ROLE_LABELS`` in order and produces one ``StructureItem`` per
    role, carrying the paragraph count, the paragraph indices, text samples
    (first 60 characters of up to the first three paragraphs), the total
    character count, and the dominant font/size reported by
    ``_dominant_style``.

    Roles that are not marked as expected are left out when no paragraph has
    that role, which keeps the result compact. Expected roles are always
    emitted, even with a count of zero.
    """
    result: list[StructureItem] = []
    for role, label, expected in _ROLE_LABELS:
        matched = [para for para in doc.paragraphs if para.role == role]
        if matched or expected:
            previews = [para.text[:60] for para in matched[:3]]
            dominant_font, dominant_size, is_uniform = _dominant_style(matched)
            item = StructureItem(
                role=role,
                label=label,
                count=len(matched),
                expected=expected,
                paragraph_indices=[para.index for para in matched],
                samples=previews,
                char_total=sum(len(para.text) for para in matched),
                dominant_font=dominant_font,
                dominant_size_pt=dominant_size,
                style_uniform=is_uniform,
            )
            result.append(item)
    return result
def build_outline(doc: Document) -> list[OutlineNode]:
    """Assemble the heading tree of *doc*, following paragraph order.

    Only paragraphs whose role is a key of ``_HEADING_LEVELS`` take part. Each
    heading becomes a child of the nearest preceding heading whose level is
    strictly smaller; a heading with no such predecessor becomes a root.
    Returns the list of root nodes (empty when the document has no headings).
    """
    top_level: list[OutlineNode] = []
    # Chain of headings that can still receive children, shallowest first.
    open_path: list[OutlineNode] = []
    for para in doc.paragraphs:
        depth = _HEADING_LEVELS.get(para.role)
        if depth is None:
            # Not a heading paragraph.
            continue
        node = OutlineNode(paragraph_index=para.index, level=depth, text=para.text)
        # Close every open heading at the same or a deeper level: none of them
        # can be the parent of the new node.
        while open_path and depth <= open_path[-1].level:
            open_path.pop()
        siblings = open_path[-1].children if open_path else top_level
        siblings.append(node)
        open_path.append(node)
    return top_level