feat(govdoc): 新增内部公文模块全链路(后端58+前端11文件)
This commit is contained in:
@@ -0,0 +1,93 @@
|
||||
"""从 Document 派生出 structure(按 role 分类统计)+ outline(heading 层级树)。"""
|
||||
|
||||
from __future__ import annotations
|
||||
from collections import Counter
|
||||
from fastapi_modules.fastapi_leaudit.govdoc_engine.models import Document, Paragraph
|
||||
from fastapi_modules.fastapi_leaudit.govdoc_engine.engine.result import OutlineNode, StructureItem
|
||||
|
||||
|
||||
# Roles reported by build_structure, in the order they are emitted.
# Each entry is (role id, label shown to the user, whether a regular official
# document is expected to contain the role). The labels are runtime strings.
_ROLE_LABELS: list[tuple[str, str, bool]] = [
    ("title", "标题", True),
    ("doc_number", "发文字号", True),
    ("recipient", "主送机关", True),
    ("heading_1", "一级标题", False),
    ("heading_2", "二级标题", False),
    ("heading_3", "三级标题", False),
    ("heading_4", "四级标题", False),
    ("body", "正文", True),
    ("attachment_marker", "附件标记", False),
    ("attachment_title", "附件标题", False),
    ("signature", "署名", True),
    ("date", "成文日期", True),
    ("no_text_marker", "(此页无正文)", False),
    ("unknown", "未识别", False),
]
|
||||
|
||||
# Heading role -> nesting depth (1 is the outermost level). build_outline uses
# this both to recognise heading paragraphs and to place them in the tree.
_HEADING_LEVELS: dict[str, int] = {
    "heading_1": 1,
    "heading_2": 2,
    "heading_3": 3,
    "heading_4": 4,
}
|
||||
|
||||
|
||||
def _dominant_style(paragraphs: list[Paragraph]) -> tuple[str | None, float | None, bool]:
    """Return the most common East Asian font and font size of *paragraphs*.

    Paragraphs whose ``style.font_eastasia`` is falsy are left out of the font
    tally, and paragraphs whose ``style.font_size_pt`` is ``None`` are left out
    of the size tally, so missing style data never counts as a distinct value.

    Args:
        paragraphs: Paragraphs to inspect; may be empty.

    Returns:
        A ``(font, size, uniform)`` tuple. ``font`` and ``size`` are the modal
        values, or ``None`` when no paragraph supplied one; on a tie the value
        encountered first wins. ``uniform`` is ``True`` when at most one
        distinct font and at most one distinct size were seen, which is also
        the case for empty input.
    """
    font_counts = Counter()
    size_counts = Counter()
    for para in paragraphs:
        style = para.style
        if style.font_eastasia:
            font_counts[style.font_eastasia] += 1
        # 0.0 is a recorded size; only None means "no size information".
        if style.font_size_pt is not None:
            size_counts[style.font_size_pt] += 1

    dominant_font = font_counts.most_common(1)[0][0] if font_counts else None
    dominant_size = size_counts.most_common(1)[0][0] if size_counts else None
    is_uniform = len(font_counts) <= 1 and len(size_counts) <= 1
    return dominant_font, dominant_size, is_uniform
|
||||
|
||||
|
||||
def build_structure(doc: Document) -> list[StructureItem]:
    """Summarise the document's paragraphs per role, in ``_ROLE_LABELS`` order.

    For every role in ``_ROLE_LABELS`` the paragraphs carrying that role are
    collected and condensed into one ``StructureItem``: how many there are,
    their indices, up to three text samples (each cut to 60 characters), the
    total character count and the dominant font / size as reported by
    ``_dominant_style``.

    Roles flagged as expected are always reported, even with a count of zero.
    Roles that are not expected are reported only when at least one paragraph
    has them. Paragraphs whose role is not listed in ``_ROLE_LABELS`` do not
    appear in the result.

    Args:
        doc: Document whose ``paragraphs`` are classified by ``role``.

    Returns:
        One ``StructureItem`` per reported role.
    """
    # Bucket paragraphs by role in a single pass rather than re-scanning
    # doc.paragraphs once per role; document order is kept inside each bucket.
    by_role: dict[str, list[Paragraph]] = {}
    for para in doc.paragraphs:
        by_role.setdefault(para.role, []).append(para)

    items: list[StructureItem] = []
    for role, label, expected in _ROLE_LABELS:
        paragraphs = by_role.get(role, [])
        if not paragraphs and not expected:
            # An optional role that never occurs is omitted to keep the list compact.
            continue
        samples = [p.text[:60] for p in paragraphs[:3]]
        font, size, uniform = _dominant_style(paragraphs)
        items.append(
            StructureItem(
                role=role,
                label=label,
                count=len(paragraphs),
                expected=expected,
                paragraph_indices=[p.index for p in paragraphs],
                samples=samples,
                char_total=sum(len(p.text) for p in paragraphs),
                dominant_font=font,
                dominant_size_pt=size,
                style_uniform=uniform,
            )
        )
    return items
|
||||
|
||||
|
||||
def build_outline(doc: Document) -> list[OutlineNode]:
    """Build the heading tree of *doc* from its paragraphs, in document order.

    Only paragraphs whose ``role`` is a key of ``_HEADING_LEVELS`` take part.
    Each heading is attached to the closest preceding heading with a strictly
    smaller level. A heading that skips levels (for example level 3 directly
    after level 1) therefore hangs off that shallower heading, and a heading
    with no shallower predecessor becomes a root.

    Args:
        doc: Document whose ``paragraphs`` carry ``role``, ``index`` and ``text``.

    Returns:
        The root nodes of the outline; an empty list when the document has no
        heading paragraphs.
    """
    roots: list[OutlineNode] = []
    # Chain of currently open headings, outermost first, with strictly
    # increasing levels; the last entry is the candidate parent.
    open_path: list[OutlineNode] = []
    for para in doc.paragraphs:
        level = _HEADING_LEVELS.get(para.role)
        if level is None:
            continue
        node = OutlineNode(paragraph_index=para.index, level=level, text=para.text)
        # Close every open heading at the same or a deeper level: none of them
        # can be the parent of this node.
        while open_path and open_path[-1].level >= level:
            open_path.pop()
        siblings = open_path[-1].children if open_path else roots
        siblings.append(node)
        open_path.append(node)
    return roots
|
||||
Reference in New Issue
Block a user