From 5d777599bf226bee1c52286e35705804507f0f53 Mon Sep 17 00:00:00 2001 From: wren <“porlong@qq.com”> Date: Wed, 13 May 2026 14:37:12 +0800 Subject: [PATCH] =?UTF-8?q?feat(govdoc):=20=E6=96=B0=E5=A2=9E=E5=86=85?= =?UTF-8?q?=E9=83=A8=E5=85=AC=E6=96=87=E6=A8=A1=E5=9D=97=E5=85=A8=E9=93=BE?= =?UTF-8?q?=E8=B7=AF=EF=BC=88=E5=90=8E=E7=AB=AF58+=E5=89=8D=E7=AB=AF11?= =?UTF-8?q?=E6=96=87=E4=BB=B6=EF=BC=89?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- .../内部公文模块接口与权限设计.md | 731 ++++++++++++++++++ .../内部公文模块数据表复用与新增建议.md | 368 +++++++++ .../内部公文模块目录结构与代码落点清单.md | 617 +++++++++++++++ docs/内部公文模块/内部公文模块迁移方案.md | 503 ++++++++++++ .../controllers/govdocController.py | 228 ++++++ .../fastapi_leaudit/govdoc_bridge/__init__.py | 11 + .../govdoc_bridge/input_resolver.py | 133 ++++ .../govdoc_bridge/result_adapter.py | 112 +++ .../fastapi_leaudit/govdoc_bridge/runner.py | 99 +++ .../govdoc_bridge/storage_adapter.py | 221 ++++++ .../fastapi_leaudit/govdoc_bridge/tasks.py | 115 +++ .../fastapi_leaudit/govdoc_engine/__init__.py | 35 + .../govdoc_engine/dsl/__init__.py | 0 .../govdoc_engine/dsl/loader.py | 24 + .../govdoc_engine/dsl/schema.py | 141 ++++ .../govdoc_engine/engine/__init__.py | 0 .../govdoc_engine/engine/checks/__init__.py | 24 + .../govdoc_engine/engine/checks/ai_check.py | 151 ++++ .../govdoc_engine/engine/checks/base.py | 48 ++ .../engine/checks/confused_pair.py | 34 + .../govdoc_engine/engine/checks/cross_role.py | 69 ++ .../govdoc_engine/engine/checks/font.py | 162 ++++ .../govdoc_engine/engine/checks/forbid.py | 42 + .../govdoc_engine/engine/checks/hierarchy.py | 29 + .../engine/checks/punctuation.py | 46 ++ .../engine/checks/regex_check.py | 36 + .../govdoc_engine/engine/checks/required.py | 28 + .../govdoc_engine/engine/checks/wenzhong.py | 42 + .../govdoc_engine/engine/result.py | 81 ++ .../govdoc_engine/engine/runner.py | 242 ++++++ .../govdoc_engine/engine/selector.py | 27 + .../govdoc_engine/engine/structure.py | 93 
+++ .../govdoc_engine/llm/__init__.py | 0 .../govdoc_engine/llm/cache.py | 101 +++ .../govdoc_engine/llm/client.py | 258 +++++++ .../fastapi_leaudit/govdoc_engine/models.py | 77 ++ .../govdoc_engine/parser/__init__.py | 0 .../govdoc_engine/parser/docx_parser.py | 152 ++++ .../govdoc_engine/parser/entities.py | 27 + .../govdoc_engine/parser/entity_builder.py | 195 +++++ .../govdoc_engine/parser/extractor.py | 104 +++ .../govdoc_engine/parser/loader.py | 83 ++ .../govdoc_engine/parser/role_tagger.py | 50 ++ .../govdoc_engine/parser/role_tagger_llm.py | 90 +++ .../govdoc_engine/parser/role_tagger_rule.py | 132 ++++ .../govdoc_engine/parser/style_resolver.py | 241 ++++++ .../fastapi_leaudit/govdoc_engine/pipeline.py | 248 ++++++ .../govdoc_engine/reporter/__init__.py | 0 .../govdoc_engine/reporter/docx_annotator.py | 105 +++ .../govdoc_engine/reporter/html_paragraph.py | 42 + .../govdoc_engine/reporter/html_renderer.py | 76 ++ .../govdoc_engine/reporter/json_report.py | 12 + .../fastapi_leaudit/models/__init__.py | 6 + .../models/govdocReportArtifact.py | 27 + .../models/govdocRuleResult.py | 39 + .../fastapi_leaudit/models/govdocRun.py | 46 ++ .../fastapi_leaudit/services/__init__.py | 2 + .../fastapi_leaudit/services/govdocService.py | 121 +++ .../services/impl/govdocServiceImpl.py | 130 ++++ scripts/创建sql/schema_add_govdoc_module.sql | 198 +++++ scripts/创建sql/seed_govdoc_entry_module.sql | 140 ++++ scripts/创建sql/seed_govdoc_permissions.sql | 138 ++++ scripts/创建sql/seed_govdoc_routes.sql | 276 +++++++ 63 files changed, 7608 insertions(+) create mode 100644 docs/内部公文模块/内部公文模块接口与权限设计.md create mode 100644 docs/内部公文模块/内部公文模块数据表复用与新增建议.md create mode 100644 docs/内部公文模块/内部公文模块目录结构与代码落点清单.md create mode 100644 docs/内部公文模块/内部公文模块迁移方案.md create mode 100644 fastapi_modules/fastapi_leaudit/controllers/govdocController.py create mode 100644 fastapi_modules/fastapi_leaudit/govdoc_bridge/__init__.py create mode 100644 
fastapi_modules/fastapi_leaudit/govdoc_bridge/input_resolver.py create mode 100644 fastapi_modules/fastapi_leaudit/govdoc_bridge/result_adapter.py create mode 100644 fastapi_modules/fastapi_leaudit/govdoc_bridge/runner.py create mode 100644 fastapi_modules/fastapi_leaudit/govdoc_bridge/storage_adapter.py create mode 100644 fastapi_modules/fastapi_leaudit/govdoc_bridge/tasks.py create mode 100644 fastapi_modules/fastapi_leaudit/govdoc_engine/__init__.py create mode 100644 fastapi_modules/fastapi_leaudit/govdoc_engine/dsl/__init__.py create mode 100644 fastapi_modules/fastapi_leaudit/govdoc_engine/dsl/loader.py create mode 100644 fastapi_modules/fastapi_leaudit/govdoc_engine/dsl/schema.py create mode 100644 fastapi_modules/fastapi_leaudit/govdoc_engine/engine/__init__.py create mode 100644 fastapi_modules/fastapi_leaudit/govdoc_engine/engine/checks/__init__.py create mode 100644 fastapi_modules/fastapi_leaudit/govdoc_engine/engine/checks/ai_check.py create mode 100644 fastapi_modules/fastapi_leaudit/govdoc_engine/engine/checks/base.py create mode 100644 fastapi_modules/fastapi_leaudit/govdoc_engine/engine/checks/confused_pair.py create mode 100644 fastapi_modules/fastapi_leaudit/govdoc_engine/engine/checks/cross_role.py create mode 100644 fastapi_modules/fastapi_leaudit/govdoc_engine/engine/checks/font.py create mode 100644 fastapi_modules/fastapi_leaudit/govdoc_engine/engine/checks/forbid.py create mode 100644 fastapi_modules/fastapi_leaudit/govdoc_engine/engine/checks/hierarchy.py create mode 100644 fastapi_modules/fastapi_leaudit/govdoc_engine/engine/checks/punctuation.py create mode 100644 fastapi_modules/fastapi_leaudit/govdoc_engine/engine/checks/regex_check.py create mode 100644 fastapi_modules/fastapi_leaudit/govdoc_engine/engine/checks/required.py create mode 100644 fastapi_modules/fastapi_leaudit/govdoc_engine/engine/checks/wenzhong.py create mode 100644 fastapi_modules/fastapi_leaudit/govdoc_engine/engine/result.py create mode 100644 
fastapi_modules/fastapi_leaudit/govdoc_engine/engine/runner.py create mode 100644 fastapi_modules/fastapi_leaudit/govdoc_engine/engine/selector.py create mode 100644 fastapi_modules/fastapi_leaudit/govdoc_engine/engine/structure.py create mode 100644 fastapi_modules/fastapi_leaudit/govdoc_engine/llm/__init__.py create mode 100644 fastapi_modules/fastapi_leaudit/govdoc_engine/llm/cache.py create mode 100644 fastapi_modules/fastapi_leaudit/govdoc_engine/llm/client.py create mode 100644 fastapi_modules/fastapi_leaudit/govdoc_engine/models.py create mode 100644 fastapi_modules/fastapi_leaudit/govdoc_engine/parser/__init__.py create mode 100644 fastapi_modules/fastapi_leaudit/govdoc_engine/parser/docx_parser.py create mode 100644 fastapi_modules/fastapi_leaudit/govdoc_engine/parser/entities.py create mode 100644 fastapi_modules/fastapi_leaudit/govdoc_engine/parser/entity_builder.py create mode 100644 fastapi_modules/fastapi_leaudit/govdoc_engine/parser/extractor.py create mode 100644 fastapi_modules/fastapi_leaudit/govdoc_engine/parser/loader.py create mode 100644 fastapi_modules/fastapi_leaudit/govdoc_engine/parser/role_tagger.py create mode 100644 fastapi_modules/fastapi_leaudit/govdoc_engine/parser/role_tagger_llm.py create mode 100644 fastapi_modules/fastapi_leaudit/govdoc_engine/parser/role_tagger_rule.py create mode 100644 fastapi_modules/fastapi_leaudit/govdoc_engine/parser/style_resolver.py create mode 100644 fastapi_modules/fastapi_leaudit/govdoc_engine/pipeline.py create mode 100644 fastapi_modules/fastapi_leaudit/govdoc_engine/reporter/__init__.py create mode 100644 fastapi_modules/fastapi_leaudit/govdoc_engine/reporter/docx_annotator.py create mode 100644 fastapi_modules/fastapi_leaudit/govdoc_engine/reporter/html_paragraph.py create mode 100644 fastapi_modules/fastapi_leaudit/govdoc_engine/reporter/html_renderer.py create mode 100644 fastapi_modules/fastapi_leaudit/govdoc_engine/reporter/json_report.py create mode 100644 
fastapi_modules/fastapi_leaudit/models/govdocReportArtifact.py create mode 100644 fastapi_modules/fastapi_leaudit/models/govdocRuleResult.py create mode 100644 fastapi_modules/fastapi_leaudit/models/govdocRun.py create mode 100644 fastapi_modules/fastapi_leaudit/services/govdocService.py create mode 100644 fastapi_modules/fastapi_leaudit/services/impl/govdocServiceImpl.py create mode 100644 scripts/创建sql/schema_add_govdoc_module.sql create mode 100644 scripts/创建sql/seed_govdoc_entry_module.sql create mode 100644 scripts/创建sql/seed_govdoc_permissions.sql create mode 100644 scripts/创建sql/seed_govdoc_routes.sql diff --git a/docs/内部公文模块/内部公文模块接口与权限设计.md b/docs/内部公文模块/内部公文模块接口与权限设计.md new file mode 100644 index 0000000..2cc8e76 --- /dev/null +++ b/docs/内部公文模块/内部公文模块接口与权限设计.md @@ -0,0 +1,731 @@ +# 内部公文模块接口与权限设计 + +## 1. 目标 + +本文档定义 `govdoc` 模块接入当前 `leaudit-platform` 后端后的: + +- 接口边界 +- 路由设计 +- 权限键设计 +- 角色可见范围 +- 数据隔离规则 +- 前后端联动约定 + +该模块的正式中文名建议为: + +- `内部公文处理` + +模块编码建议为: + +- `govdoc` + +--- + +## 2. 设计原则 + +### 2.1 平台统一原则 + +`govdoc` 不作为独立系统存在,而是当前平台中的一个业务模块,因此必须统一复用: + +- JWT 鉴权 +- RBAC 权限模型 +- 地区隔离 +- 文档主档与文件版本体系 +- OSS/MinIO +- Celery 异步任务 +- Result 统一响应格式 + +### 2.2 接口职责边界 + +接口层只负责: + +- 收参 +- 鉴权 +- 权限校验 +- 数据范围校验 +- 调 service +- 返回 DTO + +接口层不负责: + +- 文档解析 +- 规则执行 +- 本地落盘 +- 同步长耗时审查 + +### 2.3 全异步执行原则 + +公文审查必须走异步任务,不能在 HTTP 请求内同步完成整条处理链。 + +即: + +- 上传文档 ≠ 立即完成审查 +- 发起审查 = 创建 run + 投递 worker +- 结果查询 = 前端轮询或详情页进入后查询 + +--- + +## 3. 模块路由建议 + +建议前端页面路由: + +- `/govdoc` +- `/govdoc/upload` +- `/govdoc/list` +- `/govdoc/detail/:documentId` +- `/govdoc/rules` +- `/govdoc/settings`(可选) + +说明: + +- 前端实际详情页建议使用 `/govdoc/detail/:documentId` +- `sys_routes` 中可注册隐藏详情模板路由 `/govdoc/detail`,供菜单/RBAC 使用 + +建议在 `sys_routes` 中注册为一组完整模块菜单。 + +--- + +## 4. 
权限键设计 + +建议单独定义 `govdoc` 模块权限,不与 `rag:*`、`cross_review:*`、`document:*` 混用。 + +### 4.1 模块级权限键 + +- `govdoc:module:read` + +语义: + +- 是否可见内部公文处理模块菜单 +- 是否可进入 `/govdoc` 相关页面 + +### 4.2 文档权限键 + +- `govdoc:document:create` +- `govdoc:document:read` +- `govdoc:document:update` +- `govdoc:document:delete` + +语义: + +- 上传公文 +- 查看公文文档列表与详情 +- 更新文档基础信息 +- 删除公文文档 + +### 4.3 审查运行权限键 + +- `govdoc:run:create` +- `govdoc:run:read` +- `govdoc:run:retry` +- `govdoc:run:cancel`(若后续支持) + +语义: + +- 发起公文审查 +- 查看 run 状态 +- 失败后重试 +- 取消执行中的任务 + +### 4.4 报告与结果权限键 + +- `govdoc:report:read` +- `govdoc:result:read` + +语义: + +- 下载 HTML 报告 / 批注 DOCX / 原文 +- 查看 findings / entities / summary 等结果 + +### 4.5 规则权限键 + +- `govdoc:rule:read` +- `govdoc:rule:manage` + +语义: + +- 查看规则清单与规则详情 +- 发布/更新/切换规则版本 + +### 4.6 配置权限键(可选) + +- `govdoc:settings:read` +- `govdoc:settings:update` + +语义: + +- 查看模块配置 +- 修改模块配置,例如默认规则版本、模型开关、执行策略等 + +--- + +## 5. 角色建议与默认授权 + +当前平台已收口角色: + +- `provincial_admin` +- `admin` +- `common` +- `super_admin`(可选,仅系统级) + +建议 `govdoc` 默认授权如下。 + +### 5.1 `super_admin` + +建议权限: + +- 全部 `govdoc:*:*` + +数据范围: + +- 全量 + +### 5.2 `provincial_admin` + +建议权限: + +- `govdoc:module:read` +- `govdoc:document:create` +- `govdoc:document:read` +- `govdoc:document:update` +- `govdoc:document:delete` +- `govdoc:run:create` +- `govdoc:run:read` +- `govdoc:run:retry` +- `govdoc:report:read` +- `govdoc:result:read` +- `govdoc:rule:read` +- `govdoc:rule:manage` +- `govdoc:settings:read` +- `govdoc:settings:update` + +数据范围: + +- 全省 + +### 5.3 `admin` + +建议权限: + +- `govdoc:module:read` +- `govdoc:document:create` +- `govdoc:document:read` +- `govdoc:document:update` +- `govdoc:document:delete` +- `govdoc:run:create` +- `govdoc:run:read` +- `govdoc:run:retry` +- `govdoc:report:read` +- `govdoc:result:read` +- `govdoc:rule:read` + +是否授予: + +- `govdoc:rule:manage` +- `govdoc:settings:update` + +取决于业务是否允许地区管理员维护本地区规则版本。 + +数据范围: + +- 本地区 + +### 5.4 `common` + +建议权限: + +- `govdoc:module:read` +- 
`govdoc:document:create` +- `govdoc:document:read` +- `govdoc:run:create` +- `govdoc:run:read` +- `govdoc:report:read` +- `govdoc:result:read` +- `govdoc:rule:read` + +不建议授予: + +- `govdoc:document:delete` +- `govdoc:document:update` +- `govdoc:run:retry` +- `govdoc:rule:manage` +- `govdoc:settings:update` + +数据范围: + +- 本人 + +--- + +## 6. 数据范围隔离规则 + +`govdoc` 模块的数据范围必须严格复用当前平台模型,不单独搞第二套。 + +建议规则: + +### 6.1 全局角色 + +角色: + +- `super_admin` +- `provincial_admin` + +规则: + +- 可看全量数据 +- 若请求中带 `region` 筛选,则按筛选值缩小范围 + +### 6.2 地区管理角色 + +角色: + +- `admin` + +规则: + +- 只能看 `region = 当前用户 area` +- 即便前端传入其他地区,也必须由后端拦截,返回 0 结果或直接拒绝 + +### 6.3 普通用户 + +角色: + +- `common` + +规则: + +- 只能看本人上传 / 本人创建 / 本人触发的文档与 run +- 不能通过 `userId` 参数查看他人数据 + +### 6.4 结果与报告的权限继承 + +以下资源不单独放宽: + +- findings +- entities +- HTML 报告 +- annotated DOCX +- 原文下载 + +这些资源的查看权限必须继承文档主档可见范围。 + +即: + +- 能否查看结果 = 能否查看该文档 + +--- + +## 7. 接口设计建议 + +建议全部放在 `/api/v3/govdoc/` 命名空间下。 + +--- + +## 8. 文档接口 + +### 8.1 上传公文 + +`POST /api/v3/govdoc/documents` + +功能: + +- 上传一份公文文档 +- 创建 `leaudit_documents` / `leaudit_document_files` 主记录 +- `engine_type` 标记为 `govdoc` +- 可选自动触发审查 + +请求建议: + +- `file`: 主文件 +- `typeId` / `typeCode` +- `region`(最终以后端数据范围校正为准) +- `autoRun` +- `speed` +- `ruleVersionId`(可选) + +所需权限: + +- `govdoc:document:create` + +返回建议: + +- `documentId` +- `fileId` +- `region` +- `engineType` +- `autoRunTriggered` +- `run`(若自动触发) + +### 8.2 获取公文列表 + +`GET /api/v3/govdoc/documents` + +功能: + +- 获取公文模块的文档列表 + +查询参数建议: + +- `page` +- `pageSize` +- `keyword` +- `region` +- `status` +- `resultStatus` +- `createdBy` +- `dateFrom` +- `dateTo` + +后端必须附加固定过滤: + +- `engine_type = 'govdoc'` + +所需权限: + +- `govdoc:document:read` + +### 8.3 获取公文详情 + +`GET /api/v3/govdoc/documents/{DocumentId}` + +功能: + +- 获取文档基础信息 +- 获取当前最新 run 摘要 +- 获取报告资源引用 + +所需权限: + +- `govdoc:document:read` + +### 8.4 更新文档信息 + +`PATCH /api/v3/govdoc/documents/{DocumentId}` + +功能: + +- 修改公文标题、文号、备注、类型等基础信息 + +所需权限: + +- `govdoc:document:update` + +### 8.5 删除文档 + 
+`DELETE /api/v3/govdoc/documents/{DocumentId}` + +功能: + +- 软删除文档 +- 默认只删模块侧展示,不立即物理删除 OSS 产物 + +所需权限: + +- `govdoc:document:delete` + +--- + +## 9. 审查运行接口 + +### 9.1 发起审查 + +`POST /api/v3/govdoc/runs` + +功能: + +- 对已存在文档发起一次公文审查 run +- 创建 `govdoc_runs` +- 投递 Celery worker + +请求建议: + +- `documentId` +- `ruleVersionId`(可选) +- `speed` +- `force` + +所需权限: + +- `govdoc:run:create` + +返回建议: + +- `runId` +- `documentId` +- `status=queued` +- `phase=dispatch` + +### 9.2 获取 run 状态 + +`GET /api/v3/govdoc/runs/{RunId}` + +功能: + +- 查询当前 run 的状态、阶段、耗时、错误摘要 + +所需权限: + +- `govdoc:run:read` + +### 9.3 重试 run + +`POST /api/v3/govdoc/runs/{RunId}/retry` + +功能: + +- 对失败或已完成的 run 重新发起一次新 run +- 原 run 不覆盖 + +所需权限: + +- `govdoc:run:retry` + +--- + +## 10. 结果与报告接口 + +### 10.1 获取审查结果摘要 + +`GET /api/v3/govdoc/runs/{RunId}/result` + +功能: + +- 返回 summary +- 返回 checked rules +- 返回 findings 统计 +- 返回 entities 摘要 + +所需权限: + +- `govdoc:result:read` + +### 10.2 获取 findings 明细 + +`GET /api/v3/govdoc/runs/{RunId}/findings` + +功能: + +- 返回段落级问题列表 + +所需权限: + +- `govdoc:result:read` + +### 10.3 获取 entities 明细 + +`GET /api/v3/govdoc/runs/{RunId}/entities` + +功能: + +- 返回识别出的标题、文号、收文机关、署名、文种、附件等实体 + +所需权限: + +- `govdoc:result:read` + +### 10.4 获取段落 HTML + +`GET /api/v3/govdoc/runs/{RunId}/paragraphs` + +功能: + +- 返回前端文档联动视图所需的段落 HTML + +所需权限: + +- `govdoc:report:read` + +### 10.5 下载 HTML 报告 + +`GET /api/v3/govdoc/runs/{RunId}/report/html` + +功能: + +- 获取 HTML 报告内容或下载地址 + +所需权限: + +- `govdoc:report:read` + +### 10.6 下载批注 DOCX + +`GET /api/v3/govdoc/runs/{RunId}/report/docx` + +功能: + +- 下载带批注的 DOCX + +所需权限: + +- `govdoc:report:read` + +### 10.7 下载原文 + +`GET /api/v3/govdoc/documents/{DocumentId}/original` + +功能: + +- 下载原始上传文档 + +所需权限: + +- `govdoc:report:read` +- 同时必须满足文档数据范围校验 + +--- + +## 11. 
规则接口 + +### 11.1 获取规则清单 + +`GET /api/v3/govdoc/rules` + +功能: + +- 查看当前生效规则集摘要 +- 查看总规则数、group、severity、category + +所需权限: + +- `govdoc:rule:read` + +### 11.2 获取规则详情 + +`GET /api/v3/govdoc/rules/{RuleId}` + +功能: + +- 查看单条规则详情,包括 stages、messages、target / applies_to 等 + +所需权限: + +- `govdoc:rule:read` + +### 11.3 发布规则版本(后续) + +`POST /api/v3/govdoc/rule-versions/publish` + +功能: + +- 发布新的规则版本 + +所需权限: + +- `govdoc:rule:manage` + +--- + +## 12. 配置接口(可选) + +### 12.1 获取模块配置 + +`GET /api/v3/govdoc/settings` + +所需权限: + +- `govdoc:settings:read` + +### 12.2 更新模块配置 + +`PATCH /api/v3/govdoc/settings` + +所需权限: + +- `govdoc:settings:update` + +配置项示例: + +- 默认规则版本 +- 是否允许自动审查 +- 是否启用外部 LLM +- 最大文件大小 +- 最大排队任务数 + +--- + +## 13. 前后端联动约定 + +### 13.1 前端菜单显示 + +前端展示 `govdoc` 模块菜单的条件建议为: + +- 路由可见 +- 且拥有 `govdoc:module:read` + +### 13.2 页面按钮显示 + +建议按动作权限键控制: + +- 上传按钮:`govdoc:document:create` +- 删除按钮:`govdoc:document:delete` +- 重新发起审查:`govdoc:run:create` +- 重试按钮:`govdoc:run:retry` +- 规则管理入口:`govdoc:rule:manage` + +### 13.3 数据不可仅靠前端隐藏 + +即使按钮被隐藏,后端仍必须做: + +- 权限校验 +- 数据范围校验 +- 文档归属校验 + +前端永远不是安全边界。 + +--- + +## 14. 错误码与返回建议 + +建议继续复用当前平台 Result 风格,错误语义明确: + +常见错误建议: + +- `401`:未登录 / token 无效 +- `403`:无权限 +- `404`:文档 / run / 规则不存在 +- `409`:已有运行中任务,不允许重复发起 +- `422`:文件类型不支持 / 参数非法 +- `500`:执行失败 + +建议返回可读 message,例如: + +- `您没有查看内部公文处理模块的权限` +- `您没有查看该公文的权限` +- `当前文档已有执行中的审查任务` +- `当前规则版本不可用,请联系管理员` + +--- + +## 15. SQL 与初始化建议 + +建议至少补以下脚本: + +- `scripts/创建sql/schema_add_govdoc_module.sql` +- `scripts/创建sql/seed_govdoc_permissions.sql` +- `scripts/创建sql/seed_govdoc_entry_module.sql` +- `scripts/创建sql/seed_govdoc_routes.sql` + +建议初始化内容: + +- `permissions` 中加入 `govdoc:*` +- `sys_routes` 中加入 `/govdoc*` +- `role_permissions` 中给 `provincial_admin/admin/common` 分配默认权限 +- `role_route` 中分配菜单可见性 + +--- + +## 16. 
最终建议 + +`govdoc` 模块的接口和权限设计,应严格遵守当前平台边界: + +- 接口统一走 `/api/v3/govdoc/*` +- 权限统一走 `govdoc:*:*` +- 数据范围统一走当前地区隔离逻辑 +- 结果下载权限继承文档可见范围 +- 公文模块不单独再造第二套用户、权限、地区模型 + +这样才能保证: + +- 模块边界清晰 +- 前后端联动简单 +- 后续统计和审计一致 +- 长期维护成本最低 diff --git a/docs/内部公文模块/内部公文模块数据表复用与新增建议.md b/docs/内部公文模块/内部公文模块数据表复用与新增建议.md new file mode 100644 index 0000000..2da81d4 --- /dev/null +++ b/docs/内部公文模块/内部公文模块数据表复用与新增建议.md @@ -0,0 +1,368 @@ +# 内部公文模块数据表复用与新增建议 + +## 1. 结论 + +数据表不需要“全部重新建”,但也不建议“全部硬复用当前项目现有表”。 + +最合理的方式是: + +- 复用当前项目已有的“平台公共表” +- 新建 `govdoc` 自己的“结果域表” +- 不要把公文模块的运行和结果强塞进现有 `leaudit` 专属结果表 + +一句话: + +- 主档能复用 +- 结果别乱复用 + +--- + +## 2. 建议直接复用的表 + +以下属于平台级公共能力,建议直接复用: + +### 2.1 用户与权限相关 + +- `sso_users` +- `roles` +- `user_role` +- `permissions` +- `role_permissions` +- `sys_routes` +- `role_route` + +这些负责: + +- 用户身份 +- JWT 登录态 +- RBAC 授权 +- 菜单显示 +- 接口动作权限 + +### 2.2 文档主档相关 + +- `leaudit_documents` +- `leaudit_document_files` + +这些负责: + +- 文档主记录 +- 文件版本 +- 文件角色 +- 所属地区 +- 创建人 +- OSS 路径关联 + +### 2.3 存储与任务基础设施 + +虽然不是数据表,但这些平台能力也必须复用: + +- OSS / MinIO +- Celery +- 当前统一配置体系 + +--- + +## 3. 为什么文档主档可以复用 + +从平台视角看,公文本质上也是文档。 + +当前 `DocumentServiceImpl` 已经处理了很多平台公共问题: + +- 上传 +- 文档类型归属 +- 地区归属 +- 文件版本 +- OSS 存储 +- 创建人 +- 列表查询 +- 数据隔离 + +这说明: + +- `leaudit_documents` 更像平台级文档主表 +- 不只是某一个引擎专属表 + +所以公文模块完全可以这样接: + +- 公文先入 `leaudit_documents` +- 文件入 `leaudit_document_files` +- 再通过模块标识或引擎标识决定后续执行链 + +--- + +## 4. 建议补充的主档字段 + +建议在 `leaudit_documents` 增加一个模块或引擎标识字段,例如: + +- `engine_type` + +可选值例如: + +- `leaudit` +- `govdoc` +- `rag` + +或者使用: + +- `biz_module` + +作用是让平台明确知道: + +- 这份文档属于哪个处理模块 +- 应走哪条执行链 +- 应展示哪套结果详情页面 + +这个字段非常关键。 + +--- + +## 5. 
不建议直接复用的现有结果表 + +不建议直接复用的主要是“运行结果型表”,例如: + +- `leaudit_audit_runs` +- `leaudit_rule_results` +- 以及现有 `leaudit` 引擎专属的 metrics / extraction / rescue 一类结果表 + +原因如下。 + +### 5.1 字段语义会越来越脏 + +当前 `leaudit` 的运行结果表,天然围绕现有审查引擎设计,可能包含: + +- phase +- OCR / extract / rescue 阶段 +- ruleSetId / ruleVersionId +- resultStatus +- rescueApplied +- 当前引擎专属 timing / metrics + +而 `govdoc` 运行时更关注: + +- 公文结构解析 +- 标题 / 发文字号 / 文种 / 附件识别 +- 段落级 findings +- HTML / annotated docx / paragraph html 报告 + +表面上都叫 run,但含义并不完全相同。 + +### 5.2 结果明细模型不完全兼容 + +`govdoc-audit` 的单条 finding 更像: + +- `rule_id` +- `rule_name` +- `severity` +- `category` +- `location.paragraph_index` +- `message` +- `suggestion` +- `actual` +- `expected` +- `evidence` + +它非常强调: + +- 段落定位 +- 文档结构位置 +- 格式问题 + +而当前 `leaudit_rule_results` 更可能偏: + +- 评查点 +- 提取字段 +- 风险 +- rescue 翻转 +- 审核结果 + +两者的结果语义并不属于同一套体系。 + +### 5.3 后续统计口径容易混乱 + +如果共用结果表,后面做统计时会不断出现问题: + +- 这个 `passed_count` 是公文规则通过数,还是合同规则通过数? +- 这个 `failed_count` 是格式错误项数,还是评查点失败数? +- 这个 `score` 的计算口径是否一致? + +最终会逼着所有查询都加一层 `engine_type` 判断。 + +### 5.4 迁移与回滚成本更高 + +架构上: + +- 从专属表合并为公共表,容易 +- 从硬复用公共表再拆出来,很痛苦 + +所以结果域隔离更稳。 + +--- + +## 6. 最推荐的折中方案 + +最佳折中不是“全新建”,也不是“全复用”,而是: + +- 1 套平台主档 +- 1 个模块标识字段 +- 1 套 `govdoc` 结果域表 + +### 6.1 第一层:复用公共文档主档 + +继续使用: + +- `leaudit_documents` +- `leaudit_document_files` + +负责: + +- 上传文档主记录 +- 保存文件版本 +- 地区归属 +- 创建人 +- OSS 路径 + +### 6.2 第二层:补充模块标识 + +在 `leaudit_documents` 增加: + +- `engine_type = 'govdoc'` + +这样主档表知道: + +- 这是一份由公文模块消费的文档 + +### 6.3 第三层:新增公文结果域表 + +建议新增: + +- `govdoc_runs` +- `govdoc_rule_results` +- `govdoc_entities` +- `govdoc_report_artifacts` + +好处: + +- 上传、权限、地区、OSS 全复用 +- 结果和运行态不污染原系统 +- 前端查询和统计都清晰 + +--- + +## 7. 如果想极限少建表,最少也要新建哪几张 + +如果希望第一阶段尽量少建表,建议最少也要新建 3 张: + +- `govdoc_runs` +- `govdoc_rule_results` +- `govdoc_report_artifacts` + +可选第 4 张: + +- `govdoc_entities` + +如果想更快推进,`entities` 也可以先临时塞进 `govdoc_runs.result_json` 之类字段里,但长期仍建议拆表。 + +--- + +## 8. 
什么情况下可以复用当前 `leaudit_audit_runs` + +只有一种情况可以考虑: + +先把 `leaudit_audit_runs` 正式重构为“平台通用引擎运行表”,明确它不再是 `leaudit` 私有运行表。 + +此时至少要补齐类似字段: + +- `engine_type` +- `biz_module` +- `result_summary_json` +- `artifact_manifest_json` + +然后让: + +- `leaudit` +- `govdoc` + +都成为这张“通用运行表”的消费者。 + +但要注意,这已经不是“直接复用原表”,而是: + +- 先重构当前表 +- 再让它变成通用表 + +这条路能走,但复杂度明显高于“新建 `govdoc_runs`”。 + +当前阶段,更推荐后者。 + +--- + +## 9. 明确建议 + +我的明确建议是: + +- 不用全部重建 +- 但也不要为了省事而硬复用全部现有结果表 + +最佳实践: + +- 复用 `用户 / 权限 / 地区 / 文档主档 / 文件主档 / OSS / 任务系统` +- 新建 `govdoc` 的 `run + result + artifact` 结果域表 + +即: + +**复用公共底座,新建领域结果。** + +--- + +## 10. 推荐最终表划分 + +### 10.1 复用 + +- `sso_users` +- `roles` +- `user_role` +- `permissions` +- `role_permissions` +- `sys_routes` +- `role_route` +- `leaudit_documents` +- `leaudit_document_files` + +### 10.2 建议补字段 + +- `leaudit_documents.engine_type` +- 或 `leaudit_documents.biz_module` + +### 10.3 新增 + +- `govdoc_runs` +- `govdoc_rule_results` +- `govdoc_entities` +- `govdoc_report_artifacts` + +后续规则平台化时再补: + +- `govdoc_rule_sets` +- `govdoc_rule_versions` + +--- + +## 11. 最终落地建议 + +如果现在立刻开始实施,建议按下面这版定: + +1. `leaudit_documents` / `leaudit_document_files` 继续复用 +2. 在 `leaudit_documents` 增加 `engine_type='govdoc'` +3. 新建 `govdoc_runs` +4. 新建 `govdoc_rule_results` +5. 新建 `govdoc_report_artifacts` +6. 后续视情况补 `govdoc_entities` + +这是一版: + +- 改动面适中 +- 风险最小 +- 迁移成本可控 +- 长期仍可扩展 + +非常适合当前项目阶段。 diff --git a/docs/内部公文模块/内部公文模块目录结构与代码落点清单.md b/docs/内部公文模块/内部公文模块目录结构与代码落点清单.md new file mode 100644 index 0000000..32216ad --- /dev/null +++ b/docs/内部公文模块/内部公文模块目录结构与代码落点清单.md @@ -0,0 +1,617 @@ +# 内部公文模块目录结构与代码落点清单 + +## 1. 目标 + +本文档用于在把 `govdoc-audit` 迁入当前 `leaudit-platform` 时,明确: + +- 模块目录应该放在哪里 +- 哪些旧代码要迁 +- 哪些当前项目代码要扩展 +- 每一层应该承担什么职责 +- 第一阶段、第二阶段分别落哪些文件 + +这份文档不是讲业务方案,而是讲“代码最终应该落到仓库哪里”。 + +--- + +## 2. 
总体落点原则 + +迁移后的 `govdoc` 不应再是一个独立系统,而应成为当前仓库中的一个标准业务模块。 + +总体原则: + +- 内核代码进 `fastapi_modules/fastapi_leaudit/govdoc_engine/` +- 平台适配代码进 `fastapi_modules/fastapi_leaudit/govdoc_bridge/` +- 接口代码进 `controllers/` +- 业务编排代码进 `services/impl/` +- ORM 模型进 `models/` +- SQL 进 `scripts/创建sql/` +- 文档进 `docs/内部公文模块/` +- 前端模块进 `legal-platform-frontend/` + +--- + +## 3. 后端目录结构建议 + +建议最终目录结构如下: + +```text +fastapi_modules/fastapi_leaudit/ + controllers/ + govdocController.py + + services/ + govdocService.py + impl/ + govdocServiceImpl.py + + govdoc_bridge/ + __init__.py + tasks.py + runner.py + input_resolver.py + rules_resolver.py + storage_adapter.py + result_adapter.py + report_adapter.py + context_builder.py + security_guard.py + + govdoc_engine/ + __init__.py + models.py + pipeline.py + parser/ + dsl/ + engine/ + reporter/ + llm/ + + models/ + govdocRun.py + govdocRuleResult.py + govdocEntity.py + govdocReportArtifact.py + govdocRuleSet.py + govdocRuleVersion.py +``` + +--- + +## 4. 旧项目代码迁移落点 + +旧项目路径: + +- `/home/wren-dev/Porject/govdoc-audit/` + +### 4.1 建议直接迁入 `govdoc_engine/` 的内容 + +建议从旧项目迁入这些目录/文件: + +```text +src/govdoc_audit/models.py +src/govdoc_audit/pipeline.py +src/govdoc_audit/parser/ +src/govdoc_audit/dsl/ +src/govdoc_audit/engine/ +src/govdoc_audit/reporter/ +src/govdoc_audit/llm/ +``` + +迁入后落到: + +```text +fastapi_modules/fastapi_leaudit/govdoc_engine/ +``` + +### 4.2 不建议迁入的旧项目内容 + +这些不要直接迁: + +```text +src/govdoc_audit/api/ +src/govdoc_audit/storage/ +src/govdoc_audit/services/storage_service.py +web/ +``` + +原因: + +- API 已由当前平台统一承接 +- 存储必须改走当前数据库 + OSS +- 前端要融入当前平台,而不是保留一套独立页面 + +### 4.3 旧规则文件建议落点 + +旧规则文件: + +```text +rules/govdoc_general/rules.yaml +``` + +短期建议: + +- 先迁到当前仓库本地规则目录,例如: + +```text +rules/govdoc/govdoc_general/rules.yaml +``` + +中期建议: + +- 纳入规则版本管理体系 +- 上传到 OSS +- 使用 `govdoc_rule_sets` / `govdoc_rule_versions` 控制生效版本 + +--- + +## 5. 
新增后端文件清单 + +下面是建议新增的后端文件。 + +### 5.1 控制器层 + +新增: + +- `fastapi_modules/fastapi_leaudit/controllers/govdocController.py` + +职责: + +- 上传公文 +- 查询公文列表 +- 查询文档详情 +- 发起审查 +- 查询 run 状态 +- 查询 findings / entities / reports +- 下载 HTML / DOCX / 原文 +- 查询规则清单 + +同时需要: + +- 在 `fastapi_admin/bootstrap_parts/controllers.py` 中注册 `govdocController` + +### 5.2 服务接口层 + +新增: + +- `fastapi_modules/fastapi_leaudit/services/govdocService.py` + +职责: + +- 定义 `IGovdocService` 抽象接口 + +### 5.3 服务实现层 + +新增: + +- `fastapi_modules/fastapi_leaudit/services/impl/govdocServiceImpl.py` + +职责: + +- 调用文档主档服务 +- 创建 `govdoc_runs` +- 编排 worker 执行 +- 聚合详情返回前端 +- 权限/地区作用域后的结果查询 + +### 5.4 bridge 层 + +新增目录: + +- `fastapi_modules/fastapi_leaudit/govdoc_bridge/` + +建议文件职责如下。 + +#### `tasks.py` + +职责: + +- Celery 任务入口 +- 抢占 run +- 加载执行上下文 +- 调用 `runner.py` + +参考现有: + +- `fastapi_modules/fastapi_leaudit/leaudit_bridge/tasks.py` + +#### `runner.py` + +职责: + +- 调用 `govdoc_engine.pipeline` 执行完整审查 +- 组织执行顺序 +- 统一异常转换 + +#### `input_resolver.py` + +职责: + +- 从 `leaudit_documents` / `leaudit_document_files` 中定位输入文件 +- 从 OSS 下载到本地临时路径 +- 附加必要的文件元信息 + +#### `rules_resolver.py` + +职责: + +- 解析当前应使用的公文规则版本 +- 读取本地规则或从 OSS 下载规则文件 +- 校验规则版本可用性 + +#### `storage_adapter.py` + +职责: + +- 把 run 状态、结果摘要、规则结果、实体、报告路径写回数据库 +- 更新文档处理状态 + +#### `result_adapter.py` + +职责: + +- 将 `govdoc_engine` 的原始结果对象映射成数据库模型和前端 VO + +#### `report_adapter.py` + +职责: + +- 处理 HTML、annotated docx、paragraph html、json 结果等产物上传 OSS +- 返回 artifact 清单 + +#### `context_builder.py` + +职责: + +- 统一构造一次 run 的执行上下文 +- 避免在多个文件中反复拼上下文 + +#### `security_guard.py` + +职责: + +- 文件扩展名 / MIME / 大小 / 页数 / 压缩体积 / 临时路径安全校验 +- 调用前的执行保护 + +--- + +## 6. 
建议新增的 ORM 模型文件 + +建议新增: + +- `fastapi_modules/fastapi_leaudit/models/govdocRun.py` +- `fastapi_modules/fastapi_leaudit/models/govdocRuleResult.py` +- `fastapi_modules/fastapi_leaudit/models/govdocEntity.py` +- `fastapi_modules/fastapi_leaudit/models/govdocReportArtifact.py` +- `fastapi_modules/fastapi_leaudit/models/govdocRuleSet.py` +- `fastapi_modules/fastapi_leaudit/models/govdocRuleVersion.py` + +还需要: + +- 更新 `fastapi_modules/fastapi_leaudit/models/__init__.py` + +职责建议如下。 + +### 6.1 `govdocRun.py` + +负责: + +- 一次公文审查运行记录 +- 记录文档、触发人、状态、阶段、得分、摘要、错误信息 + +### 6.2 `govdocRuleResult.py` + +负责: + +- 单条规则执行结果 +- 记录 rule_id / severity / category / message / location / pass/fail/skipped + +### 6.3 `govdocEntity.py` + +负责: + +- 标题、发文字号、文种、署名、附件、发文机关等实体结果 + +### 6.4 `govdocReportArtifact.py` + +负责: + +- HTML 报告、批注 DOCX、段落 HTML、JSON 报告等 OSS 产物索引 + +### 6.5 `govdocRuleSet.py` + +负责: + +- 公文规则集定义 + +### 6.6 `govdocRuleVersion.py` + +负责: + +- 规则版本与 OSS 路径、sha256、发布态等 + +--- + +## 7. 建议扩展的现有后端文件 + +除了新增文件,还建议扩展这些现有文件。 + +### 7.1 `fastapi_modules/fastapi_leaudit/models/leauditDocument.py` + +建议增加: + +- `engineType` / `engine_type` + +用途: + +- 标识该文档属于 `govdoc` 还是 `leaudit` + +### 7.2 `fastapi_modules/fastapi_leaudit/services/impl/documentServiceImpl.py` + +建议扩展: + +- 上传时支持传入 `engine_type='govdoc'` +- 查询列表时支持按 `engine_type` 过滤 +- 详情页聚合时能分发到 `govdoc` 详情构建逻辑 + +如果不想动太多,也可先由 `govdocServiceImpl` 单独包一层调用文档主档逻辑。 + +### 7.3 `fastapi_admin/bootstrap_parts/controllers.py` + +建议扩展: + +- 注册 `govdocController` + +### 7.4 `fastapi_modules/fastapi_leaudit/services/impl/rbacAdminServiceImpl.py` + +建议扩展: + +- 加入 `govdoc:*` 权限种子定义 +- 加入 `/govdoc` 路由绑定定义 + +### 7.5 `fastapi_modules/fastapi_leaudit/services/impl/rbacServiceImpl.py` + +建议扩展: + +- 为 `/govdoc` 前端入口提供兼容 route blueprint(如果当前系统仍依赖该机制) + +--- + +## 8. 
SQL 文件落点建议 + +建议新增以下 SQL 草案文件: + +- `scripts/创建sql/schema_add_govdoc_module.sql` +- `scripts/创建sql/seed_govdoc_permissions.sql` +- `scripts/创建sql/seed_govdoc_entry_module.sql` +- `scripts/创建sql/seed_govdoc_routes.sql` + +职责建议: + +### 8.1 `schema_add_govdoc_module.sql` + +负责: + +- 新增 `govdoc_runs` +- 新增 `govdoc_rule_results` +- 新增 `govdoc_entities` +- 新增 `govdoc_report_artifacts` +- 给 `leaudit_documents` 增加 `engine_type` + +### 8.2 `seed_govdoc_permissions.sql` + +负责: + +- 插入 `govdoc:*:*` 权限点 + +### 8.3 `seed_govdoc_entry_module.sql` + +负责: + +- 在入口模块配置中加入 `govdoc` + +### 8.4 `seed_govdoc_routes.sql` + +负责: + +- 加入 `/govdoc` 模块菜单与页面路由绑定 + +--- + +## 9. 文档目录落点建议 + +建议所有迁移文档统一放在: + +```text +docs/内部公文模块/ +``` + +建议至少保留: + +- `内部公文模块迁移方案.md` +- `内部公文模块数据表复用与新增建议.md` +- `内部公文模块接口与权限设计.md` +- `内部公文模块目录结构与代码落点清单.md` + +后续还可以继续补: + +- `内部公文模块前端页面设计.md` +- `内部公文模块执行链与安全控制点.md` +- `内部公文模块测试清单.md` + +--- + +## 10. 前端目录结构建议 + +前端项目在: + +- `legal-platform-frontend/` + +建议新增目录结构类似: + +```text +legal-platform-frontend/ + app/(audit)/govdoc/ + page.tsx + upload/page.tsx + list/page.tsx + detail/[documentId]/page.tsx + + components/govdoc/ + upload-form.tsx + govdoc-list.tsx + govdoc-detail.tsx + findings-panel.tsx + entity-panel.tsx + report-toolbar.tsx + + hooks/govdoc/ + useGovdocList.ts + useGovdocDetail.ts + useGovdocRun.ts + + lib/api/govdoc/ + client.ts + types.ts +``` + +职责建议: + +### 10.1 页面层 + +- 上传页 +- 列表页 +- 详情页 + +### 10.2 组件层 + +- 列表组件 +- 详情组件 +- findings 面板 +- entities 面板 +- 报告下载工具栏 + +### 10.3 hooks 层 + +- 列表请求 +- 详情请求 +- run 状态轮询 + +### 10.4 API 层 + +- 对接 `/api/v3/govdoc/*` + +--- + +## 11. 
第一阶段最小落地文件集 + +如果先只做一个最小可运行版本,建议第一批先落这些文件: + +### 后端 + +- `controllers/govdocController.py` +- `services/govdocService.py` +- `services/impl/govdocServiceImpl.py` +- `govdoc_bridge/tasks.py` +- `govdoc_bridge/runner.py` +- `govdoc_bridge/input_resolver.py` +- `govdoc_bridge/storage_adapter.py` +- `govdoc_bridge/result_adapter.py` +- `govdoc_engine/`(裁剪内核) +- `models/govdocRun.py` +- `models/govdocRuleResult.py` +- `models/govdocReportArtifact.py` + +### SQL + +- `schema_add_govdoc_module.sql` +- `seed_govdoc_permissions.sql` + +### 前端 + +- 上传页 +- 列表页 +- 详情页 +- `lib/api/govdoc/client.ts` + +这样就可以先跑通: + +- 上传 +- 发起审查 +- 查询 run +- 看结果 +- 下载报告 + +--- + +## 12. 第二阶段建议补充文件 + +第二阶段再补: + +### 后端 + +- `models/govdocEntity.py` +- `models/govdocRuleSet.py` +- `models/govdocRuleVersion.py` +- `govdoc_bridge/rules_resolver.py` +- `govdoc_bridge/report_adapter.py` +- `govdoc_bridge/security_guard.py` + +### SQL + +- `seed_govdoc_entry_module.sql` +- `seed_govdoc_routes.sql` + +### 前端 + +- 规则查看页 +- 模块配置页 +- 规则版本管理页(如果做) + +--- + +## 13. 与现有 `leaudit_bridge` 的关系 + +`govdoc_bridge` 不应替换 `leaudit_bridge`,而应与之并行存在。 + +建议关系: + +- `leaudit_bridge` 处理现有评查引擎 +- `govdoc_bridge` 处理公文格式审查引擎 + +共同复用: + +- Celery +- OSS +- JWT/RBAC +- 文档主档 +- 地区隔离 + +这样当前平台最终就形成多引擎结构: + +- `leaudit` +- `govdoc` +- `rag`(知识库/对话是另一类模块,不完全同构) + +--- + +## 14. 最终建议 + +如果目标是把 `govdoc-audit` 真正“变成当前项目里的一个模块”,那代码落点必须明确: + +- 内核进 `govdoc_engine` +- 平台适配进 `govdoc_bridge` +- 业务编排进 `govdocServiceImpl` +- 对外接口进 `govdocController` +- 运行与结果落 `govdoc_*` 表 +- 页面落当前前端 `govdoc` 目录 + +不要保留成: + +- 一个独立 API 项目 +- 一个独立前端项目 +- 一套独立 SQLite 持久化 + +那样不叫模块化迁移,只叫外挂系统代理。 + +当前最稳的落法,就是按本文档这套目录与代码边界执行。 diff --git a/docs/内部公文模块/内部公文模块迁移方案.md b/docs/内部公文模块/内部公文模块迁移方案.md new file mode 100644 index 0000000..ae4ef5b --- /dev/null +++ b/docs/内部公文模块/内部公文模块迁移方案.md @@ -0,0 +1,503 @@ +# 内部公文模块迁移方案 + +## 1. 
目标定位 + +`govdoc-audit` 不应作为独立系统继续外挂,而应正式收口为当前 `leaudit-platform` 中的一个一等业务模块。 + +建议模块名: + +- `govdoc` +- 中文显示名:`内部公文处理` + +它在当前项目中的定位应为: + +- 一个“公文处理与格式审查模块” +- 共享当前平台的用户、权限、文档、OSS、任务系统 +- 拥有自己独立的规则、运行、结果、报告模型 + +--- + +## 2. 总体迁移原则 + +一句话原则: + +- 迁“引擎” +- 不迁“壳子” + +应迁入当前项目的内容: + +- 公文解析器 +- 段落角色识别 +- 语义实体提取 +- YAML DSL +- 规则执行器 +- HTML / DOCX / JSON 报告生成 + +不建议直接迁入的内容: + +- 原 `FastAPI` API 层 +- 原 `SQLite` 存储层 +- 原 `web` 前端 +- 原本地 `var/documents` 落盘模式 + +原因是当前平台已经具备更成熟的基础设施: + +- 统一 FastAPI 应用 +- JWT / RBAC +- 地区隔离 +- 文档主档与文件版本 +- MinIO / OSS +- Celery 异步任务 +- bridge 执行链 + +因此正确方案不是再造第二个平台,而是把 `govdoc-audit` 变成当前平台中的一个能力模块。 + +--- + +## 3. 模块化后的目标形态 + +### 3.1 后端模块职责 + +该模块应负责: + +- 公文上传 +- 公文格式审查 +- 审查结果查询 +- 规则查看 +- 报告下载 +- 审查过程留痕 + +不负责: + +- 通用 RAG +- 合同评审 +- 交叉评查 + +### 3.2 前端模块职责 + +前端中应新增一个独立入口模块,页面建议包括: + +- 公文上传页 +- 审查列表页 +- 审查详情页 +- 规则查看页 +- 报告下载入口 +- 模块配置页(可选) + +路由建议: + +- `/govdoc` +- `/govdoc/upload` +- `/govdoc/list` +- `/govdoc/detail/:documentId` + +--- + +## 4. 推荐架构分层 + +建议按 6 层拆分: + +1. 接口层 +2. 服务层 +3. 执行桥接层 +4. 引擎内核层 +5. 持久化层 +6. 
展示层 + +### 4.1 接口层 + +建议新增: + +- `fastapi_modules/fastapi_leaudit/controllers/govdocController.py` + +职责: + +- 上传公文 +- 查询公文列表 +- 发起审查 +- 查询运行状态 +- 查询结果详情 +- 下载 HTML / DOCX / 原文 +- 查询规则清单 + +控制器只做: + +- JWT 鉴权 +- DTO 收参 +- 调 service +- 统一返回 Result + +### 4.2 服务层 + +建议新增: + +- `fastapi_modules/fastapi_leaudit/services/govdocService.py` +- `fastapi_modules/fastapi_leaudit/services/impl/govdocServiceImpl.py` + +职责: + +- 复用当前文档服务做上传与元数据管理 +- 创建公文审查 run +- 调度异步任务 +- 聚合查询结果 +- 组织前端详情数据 + +### 4.3 执行桥接层 + +建议新增目录: + +```text +fastapi_modules/fastapi_leaudit/govdoc_bridge/ + __init__.py + tasks.py + runner.py + input_resolver.py + rules_resolver.py + storage_adapter.py + result_adapter.py + report_adapter.py + context_builder.py + security_guard.py +``` + +职责: + +- 从当前平台读取文档与文件元数据 +- 从 OSS 下载文档到本地临时文件 +- 解析规则版本并准备本地规则文件 +- 调用 `govdoc` 引擎执行 +- 将结果适配回当前平台数据库和 OSS + +这是整个模块接入的关键层。 + +### 4.4 引擎内核层 + +建议把 `govdoc-audit` 裁剪后并入当前仓库: + +```text +fastapi_modules/fastapi_leaudit/govdoc_engine/ + __init__.py + models.py + pipeline.py + parser/ + dsl/ + engine/ + reporter/ + llm/ +``` + +建议迁入: + +- `parser/` +- `dsl/` +- `engine/` +- `reporter/` +- `llm/` +- `models.py` +- `pipeline.py` + +明确不迁: + +- 原 `api/` +- 原 `storage/` +- 原 `services/storage_service.py` +- 原 `web/` + +### 4.5 持久化层 + +不建议继续使用原项目自己的 `SQLite + var/documents` 模式。 + +应基于当前平台数据库新增公文模块结果域表。 + +### 4.6 展示层 + +不迁原 `govdoc-audit/web`。 + +应直接在当前前端项目中新增 `govdoc` 模块页面,复用现有: + +- 登录态 +- 权限体系 +- 侧边栏菜单 +- UI 样式体系 + +--- + +## 5. 
数据模型建议 + +### 5.1 应复用的平台公共表 + +建议直接复用: + +- `sso_users` +- `roles` +- `user_role` +- `permissions` +- `role_permissions` +- `sys_routes` +- `role_route` +- `leaudit_documents` +- `leaudit_document_files` + +这些表负责: + +- 用户身份 +- 权限控制 +- 地区隔离 +- 文档主档 +- 文件版本 +- OSS 关联 + +### 5.2 建议补充的主档字段 + +建议在 `leaudit_documents` 增加一个模块标识字段,例如: + +- `engine_type` + +可选值: + +- `leaudit` +- `govdoc` +- `rag` + +这样平台可以明确知道某份文档应走哪条处理链。 + +### 5.3 建议新增的公文模块结果域表 + +第一阶段建议至少新增: + +- `govdoc_runs` +- `govdoc_rule_results` +- `govdoc_report_artifacts` + +后续可补: + +- `govdoc_entities` +- `govdoc_rule_sets` +- `govdoc_rule_versions` + +推荐理解方式: + +- 公共主档复用 +- 领域结果隔离 + +不要把公文模块结果硬塞进现有 `leaudit_audit_runs` / `leaudit_rule_results`。 + +原因: + +- 语义不完全一致 +- 结果模型不完全兼容 +- 统计口径会混乱 +- 后期拆分成本更高 + +--- + +## 6. 模块权限设计建议 + +建议一开始就单独定义 `govdoc` 模块权限键: + +- `govdoc:module:read` +- `govdoc:document:create` +- `govdoc:document:read` +- `govdoc:document:update` +- `govdoc:document:delete` +- `govdoc:run:create` +- `govdoc:run:read` +- `govdoc:report:read` +- `govdoc:rule:read` +- `govdoc:rule:manage` + +权限层继续复用当前平台: + +- JWT 鉴权 +- RBAC +- 地区隔离 + +数据范围继续遵守当前平台规则: + +- 省级管理员:全量 +- 地区管理员:本地区 +- 普通用户:本人数据 + +--- + +## 7. 
迁移实施步骤 + +建议分 5 期推进。 + +### 第一期:内核迁入 + +目标: + +- 把 `govdoc-audit` 裁成当前项目内可调用的纯引擎内核 + +动作: + +- 新建 `govdoc_engine/` +- 迁入 `parser / dsl / engine / reporter / llm` +- 去掉原 `api / storage / web` 依赖 +- 改成本项目统一 import 结构 +- 规则文件暂放本地规则目录 + +产出: + +- 当前项目内可以本地调用一份 `.docx` 执行公文审查 + +### 第二期:bridge 接入 + +目标: + +- 接入当前 worker 执行链 + +动作: + +- 新建 `govdoc_bridge/tasks.py` +- 新建 `govdoc_bridge/storage_adapter.py` +- 新建 `govdoc_bridge/result_adapter.py` +- 复用 OSS 下载到本地临时文件 +- 复用 Celery 异步执行 +- 结果回写 DB + 上传 OSS + +产出: + +- 当前项目可异步跑公文审查任务 + +### 第三期:服务与接口接入 + +目标: + +- 正式形成后端模块 API + +动作: + +- 新增 `govdocController.py` +- 新增 `govdocServiceImpl.py` +- 接入 JWT +- 接入地区隔离 +- 接入上传、列表、详情、运行状态、报告下载 + +产出: + +- 前端可以通过当前后端 API 正式使用公文模块 + +### 第四期:前端模块接入 + +目标: + +- 当前平台页面可用 + +动作: + +- 新增侧边栏模块 +- 新增上传页、列表页、详情页 +- 详情页展示: + - 分数 + - findings + - entities + - 原文段落联动 + - HTML / DOCX 报告下载 +- 接入权限控制 + +产出: + +- 用户可在当前平台直接使用“内部公文处理” + +### 第五期:平台化收口 + +目标: + +- 进入长期可维护状态 + +动作: + +- 规则版本化 +- 报告产物统一 OSS +- 文件转换 / 解析隔离执行 +- LLM / OCR 白名单与审计 +- 指标监控 +- 错误告警 +- 运行留痕 + +产出: + +- 模块正式生产可用 + +--- + +## 8. 推荐目录结构 + +```text +fastapi_modules/fastapi_leaudit/ + controllers/ + govdocController.py + + services/ + govdocService.py + impl/ + govdocServiceImpl.py + + govdoc_bridge/ + __init__.py + tasks.py + runner.py + input_resolver.py + rules_resolver.py + storage_adapter.py + result_adapter.py + report_adapter.py + security_guard.py + + govdoc_engine/ + __init__.py + models.py + pipeline.py + parser/ + dsl/ + engine/ + reporter/ + llm/ + + models/ + govdocRun.py + govdocRuleResult.py + govdocEntity.py + govdocReportArtifact.py + govdocRuleSet.py + govdocRuleVersion.py +``` + +文档与 SQL: + +```text +docs/内部公文模块/ + 内部公文模块迁移方案.md + 内部公文模块数据表复用与新增建议.md + +scripts/创建sql/ + schema_add_govdoc_module.sql + seed_govdoc_permissions.sql + seed_govdoc_entry_module.sql +``` + +--- + +## 9. 
def _CurrentUserId(payload: dict[str, Any]) -> int:
    """Return the caller's user id from the verified access-token payload."""
    return int(payload["user_id"])


class GovdocController(BaseController):
    """HTTP controller for the govdoc (internal official document) module.

    Every handler authenticates through ``verify_access_token``, forwards its
    arguments to ``self.GovdocService`` and wraps the outcome in
    ``Result.success``.  Handler docstrings are kept verbatim because FastAPI
    publishes them as the OpenAPI operation descriptions.
    """

    def __init__(self):
        super().__init__(prefix="/govdoc", tags=["内部公文"])
        self.GovdocService: IGovdocService = GovdocServiceImpl()
        router = self.router

        # ── Documents ─────────────────────────────────────

        @router.post("/documents")
        async def UploadDocument(
            file: UploadFile = File(...),
            typeId: int | None = Form(default=None),
            region: str = Form(default="default"),
            autoRun: bool = Form(default=False),
            speed: str = Form(default="normal"),
            ruleVersionId: int | None = Form(default=None),
            payload: dict[str, Any] = Depends(verify_access_token),
        ):
            """上传公文文档。

            创建文档主档记录,engine_type 标记为 govdoc,可选自动触发审查。
            """
            data = await self.GovdocService.UploadDocument(
                file=file,
                typeId=typeId,
                region=region,
                autoRun=autoRun,
                speed=speed,
                ruleVersionId=ruleVersionId,
                createdBy=_CurrentUserId(payload),
            )
            return Result.success(data=data)

        @router.get("/documents")
        async def ListDocuments(
            page: int = Query(default=1, ge=1),
            pageSize: int = Query(default=20, ge=1, le=100),
            keyword: str | None = Query(default=None),
            region: str | None = Query(default=None),
            status: str | None = Query(default=None),
            resultStatus: str | None = Query(default=None),
            createdBy: int | None = Query(default=None),
            dateFrom: str | None = Query(default=None),
            dateTo: str | None = Query(default=None),
            payload: dict[str, Any] = Depends(verify_access_token),
        ):
            """获取公文模块文档列表。

            后端自动附加 engine_type='govdoc' 过滤条件。
            """
            data = await self.GovdocService.ListDocuments(
                page=page,
                pageSize=pageSize,
                keyword=keyword,
                region=region,
                status=status,
                resultStatus=resultStatus,
                createdBy=createdBy,
                dateFrom=dateFrom,
                dateTo=dateTo,
                userId=_CurrentUserId(payload),
            )
            return Result.success(data=data)

        @router.get("/documents/{documentId}")
        async def GetDocumentDetail(
            documentId: int,
            payload: dict[str, Any] = Depends(verify_access_token),
        ):
            """获取公文详情:文档基础信息 + 最新 run 摘要 + 报告引用。"""
            data = await self.GovdocService.GetDocumentDetail(
                documentId=documentId,
                userId=_CurrentUserId(payload),
            )
            return Result.success(data=data)

        @router.patch("/documents/{documentId}")
        async def UpdateDocument(
            documentId: int,
            body: dict[str, Any],
            payload: dict[str, Any] = Depends(verify_access_token),
        ):
            """修改公文标题、文号、备注等基础信息。"""
            data = await self.GovdocService.UpdateDocument(
                documentId=documentId,
                body=body,
                userId=_CurrentUserId(payload),
            )
            return Result.success(data=data)

        @router.delete("/documents/{documentId}")
        async def DeleteDocument(
            documentId: int,
            payload: dict[str, Any] = Depends(verify_access_token),
        ):
            """软删除公文文档。"""
            data = await self.GovdocService.DeleteDocument(
                documentId=documentId,
                userId=_CurrentUserId(payload),
            )
            return Result.success(data=data)

        # ── Audit runs ────────────────────────────────────

        @router.post("/runs")
        async def CreateRun(
            documentId: int = Form(...),
            ruleVersionId: int | None = Form(default=None),
            speed: str = Form(default="normal"),
            force: bool = Form(default=False),
            payload: dict[str, Any] = Depends(verify_access_token),
        ):
            """对已存在文档发起一次公文审查 run。

            创建 govdoc_runs 记录,投递 Celery worker。
            """
            data = await self.GovdocService.CreateRun(
                documentId=documentId,
                ruleVersionId=ruleVersionId,
                speed=speed,
                force=force,
                triggerUserId=_CurrentUserId(payload),
            )
            return Result.success(data=data)

        # NOTE(review): every handler below authenticates the caller but does
        # not forward the user id to the service, so any per-user or per-region
        # data scoping cannot be applied from the arguments passed here.
        # Confirm whether the service enforces it by other means.

        @router.get("/runs/{runId}")
        async def GetRunStatus(
            runId: int,
            payload: dict[str, Any] = Depends(verify_access_token),
        ):
            """查询 run 状态、阶段、耗时、错误摘要。"""
            data = await self.GovdocService.GetRunStatus(runId=runId)
            return Result.success(data=data)

        # ── Results and reports ───────────────────────────

        @router.get("/runs/{runId}/result")
        async def GetRunResult(
            runId: int,
            payload: dict[str, Any] = Depends(verify_access_token),
        ):
            """获取审查结果摘要:summary + checked rules + findings 统计 + entities 摘要。"""
            data = await self.GovdocService.GetRunResult(runId=runId)
            return Result.success(data=data)

        @router.get("/runs/{runId}/findings")
        async def GetRunFindings(
            runId: int,
            payload: dict[str, Any] = Depends(verify_access_token),
        ):
            """获取段落级 findings 明细列表。"""
            data = await self.GovdocService.GetRunFindings(runId=runId)
            return Result.success(data=data)

        @router.get("/runs/{runId}/entities")
        async def GetRunEntities(
            runId: int,
            payload: dict[str, Any] = Depends(verify_access_token),
        ):
            """获取识别出的标题、文号、署名等实体。"""
            data = await self.GovdocService.GetRunEntities(runId=runId)
            return Result.success(data=data)

        @router.get("/runs/{runId}/paragraphs")
        async def GetRunParagraphs(
            runId: int,
            payload: dict[str, Any] = Depends(verify_access_token),
        ):
            """获取前端文档联动视图所需的段落 HTML。"""
            data = await self.GovdocService.GetRunParagraphs(runId=runId)
            return Result.success(data=data)

        @router.get("/runs/{runId}/report/html")
        async def GetReportHtml(
            runId: int,
            payload: dict[str, Any] = Depends(verify_access_token),
        ):
            """获取 HTML 报告内容或下载地址。"""
            data = await self.GovdocService.GetReportHtml(runId=runId)
            return Result.success(data=data)

        @router.get("/runs/{runId}/report/docx")
        async def GetReportDocx(
            runId: int,
            payload: dict[str, Any] = Depends(verify_access_token),
        ):
            """获取批注 DOCX 下载地址。"""
            data = await self.GovdocService.GetReportDocx(runId=runId)
            return Result.success(data=data)

        @router.get("/documents/{documentId}/original")
        async def DownloadOriginal(
            documentId: int,
            payload: dict[str, Any] = Depends(verify_access_token),
        ):
            """获取原始上传文档下载地址。"""
            data = await self.GovdocService.DownloadOriginal(documentId=documentId)
            return Result.success(data=data)

        # ── Rules ─────────────────────────────────────────

        @router.get("/rules")
        async def ListRules(
            payload: dict[str, Any] = Depends(verify_access_token),
        ):
            """获取当前生效规则集摘要。"""
            data = await self.GovdocService.ListRules()
            return Result.success(data=data)
b/fastapi_modules/fastapi_leaudit/govdoc_bridge/input_resolver.py new file mode 100644 index 0000000..fedd419 --- /dev/null +++ b/fastapi_modules/fastapi_leaudit/govdoc_bridge/input_resolver.py @@ -0,0 +1,133 @@ +"""Govdoc Bridge — 输入文件解析器。 + +从 leaudit_document_files 中定位输入文件,从 OSS 下载到本地临时路径。 +""" + +from __future__ import annotations + +import hashlib +import os +import tempfile +from dataclasses import dataclass +from pathlib import Path + +from fastapi_common.fastapi_common_logger import logger +from fastapi_common.fastapi_common_sqlalchemy.database import GetAsyncSession +from fastapi_common.fastapi_common_storage.oss_client import OssClient +from sqlalchemy import select + +from fastapi_modules.fastapi_leaudit.models.leauditDocumentFile import LeauditDocumentFile + +log = logger + + +@dataclass(frozen=True) +class InputPayload: + """Govdoc 引擎执行所需的输入载荷。""" + + fileName: str + fileExt: str + localPath: str + sha256: str | None = None + fileSize: int | None = None + documentFileId: int | None = None + tempDir: str | None = None # 需调用方在任务结束时清理 + + +class InputResolver: + """解析 govdoc 引擎输入文件。 + + 从 leaudit_document_files 中定位输入文件 (file_role='original'), + 优先使用本地缓存路径,否则从 OSS 下载到临时目录。 + """ + + def __init__(self, Oss: OssClient | None = None) -> None: + self.Oss = Oss or OssClient() + + async def ResolveForDocument(self, documentId: int) -> InputPayload: + """为指定文档解析输入文件载荷。 + + 查找该文档最近一次激活的 original 文件记录。 + """ + async with GetAsyncSession() as session: + result = await session.execute( + select(LeauditDocumentFile) + .where( + LeauditDocumentFile.documentId == documentId, + LeauditDocumentFile.fileRole == "original", + LeauditDocumentFile.isActive.is_(True), + ) + .order_by(LeauditDocumentFile.Id.desc()) + .limit(1) + ) + fileRow = result.scalar_one_or_none() + + if fileRow is None: + raise ValueError(f"未找到文档 {documentId} 的原始文件记录") + + return await self.ResolveFromRow(fileRow) + + async def ResolveFromRow(self, FileRow: LeauditDocumentFile) -> InputPayload: + 
"""从文件记录解析输入载荷。""" + # 优先本地路径 + if FileRow.localPath: + LocalPath = Path(FileRow.localPath) + if LocalPath.is_file(): + return InputPayload( + fileName=FileRow.fileName, + fileExt=FileRow.fileExt or _ext_from_name(FileRow.fileName), + localPath=str(LocalPath), + sha256=FileRow.sha256, + fileSize=FileRow.fileSize, + documentFileId=FileRow.Id, + ) + + # 否则从 OSS 下载 + if FileRow.ossUrl: + return await self._DownloadFromOss(FileRow) + + raise ValueError( + f"文件 {FileRow.Id} ({FileRow.fileName}) 既无可用 localPath 也无 ossUrl" + ) + + async def _DownloadFromOss(self, FileRow: LeauditDocumentFile) -> InputPayload: + """从 OSS 下载文件到临时目录。""" + try: + content = self.Oss.DownloadBytes(FileRow.ossUrl) + except Exception as e: + log.error(f"从 OSS 下载文件失败: url={FileRow.ossUrl}, error={e}") + raise + + tempDir = tempfile.mkdtemp(prefix="govdoc_input_") + ext = FileRow.fileExt or _ext_from_name(FileRow.fileName) + safeName = f"input_{FileRow.Id}{ext}" + localPath = os.path.join(tempDir, safeName) + + with open(localPath, "wb") as f: + f.write(content) + + computedSha = hashlib.sha256(content).hexdigest() + if FileRow.sha256 and computedSha != FileRow.sha256: + log.warning( + f"文件 SHA256 不匹配: expected={FileRow.sha256}, computed={computedSha}" + ) + + log.info( + f"从 OSS 下载文件: {FileRow.fileName} → {localPath} ({len(content)} bytes)" + ) + + return InputPayload( + fileName=FileRow.fileName, + fileExt=ext, + localPath=localPath, + sha256=computedSha, + fileSize=len(content), + documentFileId=FileRow.Id, + tempDir=tempDir, + ) + + +def _ext_from_name(fileName: str) -> str: + """从文件名提取扩展名。""" + _, ext = os.path.splitext(fileName) + return ext if ext else ".docx" diff --git a/fastapi_modules/fastapi_leaudit/govdoc_bridge/result_adapter.py b/fastapi_modules/fastapi_leaudit/govdoc_bridge/result_adapter.py new file mode 100644 index 0000000..2ae6d13 --- /dev/null +++ b/fastapi_modules/fastapi_leaudit/govdoc_bridge/result_adapter.py @@ -0,0 +1,112 @@ +"""Govdoc Bridge — 结果适配器。 + +将 govdoc_engine 
class ResultAdapter:
    """Convert raw govdoc engine results into platform-side dictionaries.

    The methods only read attributes of the engine result and produce plain
    ``dict`` / ``list`` values for the run summary, rule results, checked
    rules, entities, report artifacts and the front-end detail view.
    """

    def AdaptRunSummary(self, EngineResult: AuditResult) -> dict[str, Any]:
        """Extract the run-level summary fields from ``EngineResult.summary``.

        ``resultStatus`` is ``"pass"`` when nothing failed, ``"fail"`` when
        something failed and nothing passed, otherwise ``"partial"``.
        """
        summary = EngineResult.summary
        if summary.failed_count == 0:
            # NOTE(review): a run with zero passed and zero failed rules
            # also lands here and is reported as "pass".
            resultStatus = "pass"
        elif summary.passed_count == 0:
            resultStatus = "fail"
        else:
            resultStatus = "partial"
        return {
            "totalScore": summary.score,
            "passedCount": summary.passed_count,
            "failedCount": summary.failed_count,
            "skippedCount": summary.skipped_count,
            "resultStatus": resultStatus,
            "resultSummaryJson": None,  # reserved for later extension
        }

    def AdaptRuleResults(self, EngineResult: AuditResult) -> list[dict[str, Any]]:
        """Turn each entry of ``EngineResult.findings`` into a rule-result row.

        Every finding is emitted with ``result == "fail"`` and no score;
        location fields are ``None`` when the finding has no location.
        """

        def toRow(finding: Any) -> dict[str, Any]:
            location = finding.location or None
            return {
                "ruleId": finding.rule_id,
                "ruleName": finding.rule_name,
                "severity": finding.severity,
                "category": finding.category,
                "message": finding.message,
                "suggestion": finding.suggestion,
                "actual": finding.actual,
                "expected": finding.expected,
                "evidence": finding.evidence,
                "paragraphIndex": location.paragraph_index if location else None,
                "paragraphText": location.context if location else None,
                "locationPath": location.role if location else None,
                "result": "fail",
                "score": None,
            }

        return [toRow(finding) for finding in EngineResult.findings]

    def AdaptCheckedRules(self, EngineResult: AuditResult) -> list[dict[str, Any]]:
        """Turn ``EngineResult.checked_rules`` into per-rule status rows."""
        return [
            {
                "ruleId": checked.rule_id,
                "ruleName": checked.name,
                "severity": checked.severity,
                "category": checked.category,
                "result": checked.status,  # pass / fail / skipped
                "skipReason": checked.skip_reason,
                "score": None,
            }
            for checked in EngineResult.checked_rules
        ]

    def AdaptEntities(self, EngineResult: AuditResult) -> list[dict[str, Any]]:
        """Turn the values of ``EngineResult.entities`` into dictionaries.

        Entries whose value is ``None`` are skipped; the mapping keys are not
        used because each entity carries its own ``name``.
        """
        return [
            {
                "name": entity.name,
                "text": entity.text,
                "paragraphIndices": entity.paragraph_indices,
                "primaryRole": entity.primary_role,
                "source": entity.source,
                "confidence": entity.confidence,
            }
            for entity in EngineResult.entities.values()
            if entity is not None
        ]

    def AdaptArtifacts(self, _EngineResult: AuditResult, _RunId: int) -> list[dict[str, Any]]:
        """Return the report-artifact list for a run.

        Currently always an empty list; both arguments are unused until
        report upload is implemented.
        """
        return []

    def BuildDetailVO(
        self,
        Document: Any,
        Run: Any,
        RuleResults: list[dict[str, Any]],
        Entities: list[dict[str, Any]],
        Artifacts: list[dict[str, Any]],
    ) -> dict[str, Any]:
        """Assemble the detail-page view object from its already-adapted parts."""
        return {
            "document": Document,
            "run": Run,
            "findings": RuleResults,
            "entities": Entities,
            "reports": Artifacts,
        }
@dataclass
class GovdocRunner:
    """Orchestrates one complete govdoc audit execution.

    Steps: mark the run/document as processing, resolve the input file, call
    the engine pipeline, adapt the result, persist it, then mark completion.
    """

    # The field names shadow the collaborator classes; each default factory
    # is looked up before its own field name is bound in the class body.
    InputResolver: InputResolver = field(default_factory=InputResolver)
    Storage: StorageAdapter = field(default_factory=StorageAdapter)
    ResultAdapter: ResultAdapter = field(default_factory=ResultAdapter)

    async def Execute(
        self,
        DocumentId: int,
        RunId: int,
        RulesPath: str,
        TriggerUserId: int | None = None,
        Speed: str = "normal",
    ) -> dict[str, Any]:
        """Run one full govdoc audit and persist its outcome.

        Args:
            DocumentId: id of the document to audit.
            RunId: id of the already-created ``govdoc_runs`` row.
            RulesPath: path of the rules file passed to the engine as
                ``rules_path``.
            TriggerUserId: user who triggered the run (not used here).
            Speed: requested priority (not used here).

        Returns:
            A summary dict with run id, document id, status and counts.

        Any exception propagates to the caller; a temporary input directory
        created by the resolver is removed in all cases.
        """
        import shutil  # only used for temp-directory cleanup

        log.info(f"[Govdoc] Starting execution: runId={RunId}, documentId={DocumentId}")

        # 1. Mark run and document as processing.  The keyword must be
        #    ``Phase``: a lowercase ``phase`` is swallowed by ``**Extra``.
        await self.Storage.UpdateRunStatus(RunId, "processing", Phase="parsing")
        await self.Storage.UpdateDocumentStatus(DocumentId, "processing", RunId)

        # 2. Resolve the input file.
        inputPayload = await self.InputResolver.ResolveForDocument(DocumentId)
        try:
            log.info(f"[Govdoc] Input resolved: {inputPayload.fileName} → {inputPayload.localPath}")

            # 3. Run the engine (imported lazily, as in the original code).
            from fastapi_modules.fastapi_leaudit.govdoc_engine.pipeline import run as engine_run

            engineResult = await engine_run(
                file_path=inputPayload.localPath,
                rules_path=RulesPath,
                llm_client=None,  # let the pipeline choose its default client
            )

            # 4. Adapt the engine result.
            runSummary = self.ResultAdapter.AdaptRunSummary(engineResult)
            ruleResults = self.ResultAdapter.AdaptRuleResults(engineResult)
            entities = self.ResultAdapter.AdaptEntities(engineResult)  # noqa: F841 (not persisted yet)
            artifacts = self.ResultAdapter.AdaptArtifacts(engineResult, RunId)

            # 5. Persist.
            await self.Storage.UpdateRunResult(RunId, runSummary)
            await self.Storage.SaveRuleResults(RunId, ruleResults)
            await self.Storage.SaveArtifacts(RunId, artifacts)

            # 6. Final state.
            await self.Storage.UpdateRunStatus(RunId, "completed", Phase="reporting")
            await self.Storage.UpdateDocumentStatus(DocumentId, "completed", RunId)
        finally:
            # The resolver hands ownership of its temp directory to the
            # caller; remove it whether the run succeeded or failed.
            if inputPayload.tempDir:
                shutil.rmtree(inputPayload.tempDir, ignore_errors=True)

        log.info(f"[Govdoc] Execution completed: runId={RunId}")

        return {
            "runId": RunId,
            "documentId": DocumentId,
            "status": "completed",
            "ruleResultsCount": len(ruleResults),
            "artifactCount": len(artifacts),
        }
class StorageAdapter:
    """Persistence adapter for govdoc runs.

    Writes to ``govdoc_runs``, ``govdoc_rule_results`` and
    ``govdoc_report_artifacts`` and updates ``leaudit_documents`` using raw
    SQL through ``text()``.  Each method opens its own session and commits.
    """

    # ── Run state ────────────────────────────────────────

    async def CreateRun(self, RunData: dict[str, Any]) -> int:
        """Insert a ``govdoc_runs`` row and return its id.

        ``RunData`` must contain ``documentId``; ``documentFileId``,
        ``runNo`` (default 1), ``triggerSource`` (default ``"manual"``) and
        ``triggerUserId`` are optional.

        Raises:
            RuntimeError: the INSERT returned no row, so no id is available.
        """
        async with GetAsyncSession() as session:
            result = await session.execute(
                text(
                    """INSERT INTO govdoc_runs
                       (document_id, document_file_id, run_no, trigger_source,
                        trigger_user_id, status, phase, created_at, updated_at)
                       VALUES (:document_id, :document_file_id, :run_no, :trigger_source,
                               :trigger_user_id, 'pending', 'parsing', now(), now())
                       RETURNING id"""
                ),
                {
                    "document_id": RunData["documentId"],
                    "document_file_id": RunData.get("documentFileId"),
                    "run_no": RunData.get("runNo", 1),
                    "trigger_source": RunData.get("triggerSource", "manual"),
                    "trigger_user_id": RunData.get("triggerUserId"),
                },
            )
            row = result.fetchone()
            if row is None:
                # Fail loudly instead of handing back a fake id of 0.
                raise RuntimeError(
                    f"govdoc_runs insert returned no id for documentId={RunData['documentId']}"
                )
            run_id = row[0]
            await session.commit()
        log.info(f"[Govdoc] Run created: runId={run_id}, documentId={RunData['documentId']}")
        return run_id

    async def UpdateRunStatus(self, RunId: int, Status: str, Phase: str | None = None, **Extra: Any) -> None:
        """Update a run's status and, optionally, its phase.

        ``finished_at`` is stamped when ``Status`` is ``"completed"`` or
        ``"failed"``.  A lowercase ``phase=`` keyword is accepted as an alias
        for ``Phase`` so it is no longer silently dropped via ``**Extra``;
        any other extra keyword is ignored with a warning.
        """
        if Phase is None:
            Phase = Extra.pop("phase", None)
        else:
            Extra.pop("phase", None)
        if Extra:
            log.warning(f"[Govdoc] UpdateRunStatus ignored unknown arguments: {sorted(Extra)}")

        set_clauses = ["status = :status", "updated_at = now()"]
        params: dict[str, Any] = {"rid": RunId, "status": Status}

        if Phase is not None:
            set_clauses.append("phase = :phase")
            params["phase"] = Phase

        if Status in {"completed", "failed"}:
            # Use the database clock, consistent with the other timestamps
            # this class writes (see UpdateRunError).
            set_clauses.append("finished_at = now()")

        # Only fixed, code-defined fragments are interpolated; all values
        # travel as bound parameters.
        async with GetAsyncSession() as session:
            await session.execute(
                text(f"UPDATE govdoc_runs SET {', '.join(set_clauses)} WHERE id = :rid"),
                params,
            )
            await session.commit()
        log.info(f"[Govdoc] Run status updated: runId={RunId}, status={Status}")

    async def UpdateRunResult(self, RunId: int, Summary: dict[str, Any]) -> None:
        """Write the run-level summary fields (score, counts, result status)."""
        async with GetAsyncSession() as session:
            await session.execute(
                text(
                    """UPDATE govdoc_runs SET
                       total_score = :total_score,
                       passed_count = :passed_count,
                       failed_count = :failed_count,
                       skipped_count = :skipped_count,
                       result_status = :result_status,
                       result_summary_json = :result_summary_json,
                       updated_at = now()
                       WHERE id = :rid"""
                ),
                {
                    "rid": RunId,
                    "total_score": Summary.get("totalScore"),
                    "passed_count": Summary.get("passedCount", 0),
                    "failed_count": Summary.get("failedCount", 0),
                    "skipped_count": Summary.get("skippedCount", 0),
                    "result_status": Summary.get("resultStatus"),
                    "result_summary_json": Summary.get("resultSummaryJson"),
                },
            )
            await session.commit()
        log.info(f"[Govdoc] Run result saved: runId={RunId}")

    async def UpdateRunError(self, RunId: int, ErrorMessage: str) -> None:
        """Mark a run as failed and store its error message."""
        async with GetAsyncSession() as session:
            await session.execute(
                text(
                    """UPDATE govdoc_runs SET
                       status = 'failed',
                       error_message = :error_message,
                       finished_at = now(),
                       updated_at = now()
                       WHERE id = :rid"""
                ),
                {"rid": RunId, "error_message": ErrorMessage},
            )
            await session.commit()
        log.error(f"[Govdoc] Run failed: runId={RunId}, error={ErrorMessage[:200]}")

    # ── Rule results ─────────────────────────────────────

    async def SaveRuleResults(self, RunId: int, Results: list[dict[str, Any]]) -> None:
        """Bulk-insert rule results for a run; a no-op for an empty list."""
        if not Results:
            return

        statement = text(
            """INSERT INTO govdoc_rule_results
               (run_id, rule_id, rule_name, severity, category,
                message, suggestion, actual, expected, evidence,
                paragraph_index, paragraph_text, location_path,
                result, score, created_at, updated_at)
               VALUES (:run_id, :rule_id, :rule_name, :severity, :category,
                       :message, :suggestion, :actual, :expected, :evidence,
                       :paragraph_index, :paragraph_text, :location_path,
                       :result, :score, now(), now())"""
        )
        rows = [
            {
                "run_id": RunId,
                "rule_id": row.get("ruleId"),
                "rule_name": row.get("ruleName"),
                "severity": row.get("severity"),
                "category": row.get("category"),
                "message": row.get("message"),
                "suggestion": row.get("suggestion"),
                "actual": row.get("actual"),
                "expected": row.get("expected"),
                "evidence": row.get("evidence"),
                "paragraph_index": row.get("paragraphIndex"),
                "paragraph_text": row.get("paragraphText"),
                "location_path": row.get("locationPath"),
                "result": row.get("result", "pass"),
                "score": row.get("score"),
            }
            for row in Results
        ]

        async with GetAsyncSession() as session:
            # A list of parameter dicts makes this a single executemany call.
            await session.execute(statement, rows)
            await session.commit()
        log.info(f"[Govdoc] Rule results saved: runId={RunId}, count={len(Results)}")

    # ── Report artifacts ─────────────────────────────────

    async def SaveArtifacts(self, RunId: int, Artifacts: list[dict[str, Any]]) -> None:
        """Bulk-insert report artifacts for a run; a no-op for an empty list."""
        if not Artifacts:
            return

        statement = text(
            """INSERT INTO govdoc_report_artifacts
               (run_id, artifact_type, file_name, file_ext, mime_type,
                file_size, sha256, oss_url, storage_provider, description,
                created_at, updated_at)
               VALUES (:run_id, :artifact_type, :file_name, :file_ext, :mime_type,
                       :file_size, :sha256, :oss_url, :storage_provider, :description,
                       now(), now())"""
        )
        rows = [
            {
                "run_id": RunId,
                "artifact_type": row.get("artifactType"),
                "file_name": row.get("fileName"),
                "file_ext": row.get("fileExt"),
                "mime_type": row.get("mimeType"),
                "file_size": row.get("fileSize"),
                "sha256": row.get("sha256"),
                "oss_url": row.get("ossUrl"),
                "storage_provider": row.get("storageProvider"),
                "description": row.get("description"),
            }
            for row in Artifacts
        ]

        async with GetAsyncSession() as session:
            await session.execute(statement, rows)
            await session.commit()
        log.info(f"[Govdoc] Artifacts saved: runId={RunId}, count={len(Artifacts)}")

    # ── Document state ───────────────────────────────────

    async def UpdateDocumentStatus(self, DocumentId: int, ProcessingStatus: str, RunId: int | None = None) -> None:
        """Update a document's processing status.

        ``current_run_id`` is only touched when ``RunId`` is given.
        """
        set_clauses = ["processing_status = :s"]
        params: dict[str, Any] = {"s": ProcessingStatus, "did": DocumentId}
        if RunId is not None:
            set_clauses.append("current_run_id = :rid")
            params["rid"] = RunId
        set_clauses.append("updated_at = now()")

        async with GetAsyncSession() as session:
            await session.execute(
                text(f"UPDATE leaudit_documents SET {', '.join(set_clauses)} WHERE id = :did"),
                params,
            )
            await session.commit()
        log.info(f"[Govdoc] Document status updated: documentId={DocumentId}, status={ProcessingStatus}")
GOVDOC_WORKER_QUEUE = "govdoc"
GOVDOC_WORKER_QUEUE_URGENT = "govdoc_urgent"

# Speed values (after trimming and lower-casing) routed to the urgent queue.
_URGENT_SPEED_ALIASES = frozenset({"urgent", "high", "fast", "紧急"})


def resolve_govdoc_queue(speed: str = "normal") -> str:
    """Map a speed/priority label to the worker queue name.

    Empty or ``None`` input, and any label not in the urgent alias set, goes
    to the normal queue.
    """
    normalized = (speed or "").strip().lower()
    return GOVDOC_WORKER_QUEUE_URGENT if normalized in _URGENT_SPEED_ALIASES else GOVDOC_WORKER_QUEUE


def dispatch_govdoc_task(
    documentId: int,
    runId: int,
    triggerUserId: int | None = None,
    speed: str = "normal",
) -> Any:
    """Enqueue one govdoc audit on the queue chosen from ``speed``.

    Args:
        documentId: id of the document to audit.
        runId: id of the already-created ``govdoc_runs`` row.
        triggerUserId: user who triggered the run.
        speed: priority label, see ``resolve_govdoc_queue``.

    Returns:
        Whatever ``govdoc_execute_task.apply_async`` returns.
    """
    targetQueue = resolve_govdoc_queue(speed)
    log.info(
        f"[Govdoc] Dispatching task: runId={runId}, documentId={documentId}, queue={targetQueue}"
    )
    taskKwargs = {
        "documentId": documentId,
        "runId": runId,
        "triggerUserId": triggerUserId,
        "speed": speed,
    }
    return govdoc_execute_task.apply_async(kwargs=taskKwargs, queue=targetQueue)
@celery_app.task(
    bind=True,
    name="govdoc_execute_task",
    max_retries=0,
    default_retry_delay=60,
    acks_late=True,
    reject_on_worker_lost=True,
    # Per-task options are ``time_limit`` / ``soft_time_limit``; the
    # ``task_``-prefixed names are app-level settings and have no effect here.
    time_limit=30 * 60,  # hard limit: 30 minutes per execution
    soft_time_limit=25 * 60,
)
def govdoc_execute_task(
    self,
    documentId: int,
    runId: int,
    triggerUserId: int | None = None,
    speed: str = "normal",
    rulesPath: str | None = None,
) -> dict[str, Any]:
    """Celery task: run one govdoc format audit end to end.

    Args:
        documentId: id of the document to audit.
        runId: id of the already-created ``govdoc_runs`` row.
        triggerUserId: user who triggered the run.
        speed: priority label, forwarded to the runner.
        rulesPath: rules file for the engine.  When omitted, the
            ``GOVDOC_RULES_PATH`` environment variable is used.
            NOTE(review): the runner requires a rules path but nothing in
            this module supplied one; the env-var fallback is a stop-gap —
            confirm the intended source (e.g. rule-version resolution).

    Returns:
        The summary dict produced by ``GovdocRunner.Execute``.

    On any failure the run and the document are marked failed (best effort)
    and the original exception is re-raised.
    """
    import os  # only needed for the rules-path fallback

    taskId = self.request.id or "unknown"
    log.info(f"[Govdoc] Task started: taskId={taskId}, runId={runId}, documentId={documentId}")

    storage = StorageAdapter()
    # A dedicated loop is used for every coroutine of this task invocation.
    loop = asyncio.new_event_loop()
    asyncio.set_event_loop(loop)

    try:
        resolvedRulesPath = rulesPath or os.environ.get("GOVDOC_RULES_PATH")
        if not resolvedRulesPath:
            raise RuntimeError(
                "govdoc rules path is not configured: pass rulesPath or set GOVDOC_RULES_PATH"
            )

        # Mark the run as processing.  The keyword must be ``Phase``: a
        # lowercase ``phase`` is swallowed by the adapter's ``**Extra``.
        loop.run_until_complete(storage.UpdateRunStatus(runId, "processing", Phase="parsing"))

        # Run the full audit chain.
        runner = GovdocRunner()
        result = loop.run_until_complete(
            runner.Execute(
                DocumentId=documentId,
                RunId=runId,
                RulesPath=resolvedRulesPath,
                TriggerUserId=triggerUserId,
                Speed=speed,
            )
        )
        log.info(f"[Govdoc] Task completed: taskId={taskId}, runId={runId}")
        return result

    except Exception as exc:
        errorMessage = str(exc)[:2000]
        log.exception(f"[Govdoc] Task failed: taskId={taskId}, runId={runId}, error={errorMessage[:200]}")
        try:
            loop.run_until_complete(storage.UpdateRunError(runId, errorMessage))
            loop.run_until_complete(storage.UpdateDocumentStatus(documentId, "failed", runId))
        except Exception:
            # Recording the failure must not hide the original error.
            log.exception(f"[Govdoc] Failed to record task failure: taskId={taskId}, runId={runId}")
        raise

    finally:
        loop.close()
        # Do not leave a closed loop installed as the current event loop.
        asyncio.set_event_loop(None)
def load_rules(path: str | Path) -> RuleSet:
    """Return the validated rule set stored at *path*.

    The parse is cached on the pair (resolved path, modification time), so a
    file that is edited on disk is re-read on the next call.

    Raises:
        OSError: If the file cannot be stat'ed (for example, it is missing).
    """
    rules_file = Path(path)
    cache_key = str(rules_file.resolve())
    modified_at = rules_file.stat().st_mtime
    return _load_cached(cache_key, modified_at)
class Rule(BaseModel):
    """One audit rule as declared in the rules file.

    A rule selects what it applies to (via ``target`` or ``applies_to``) and
    lists the ``stages`` to run against that selection.
    """

    rule_id: str
    name: str
    severity: Literal["error", "warning", "info"] = "warning"
    category: str
    score: int | None = None
    # Pick one: the `target` channel (recommended, binds to a semantic entity)
    # or the `applies_to` channel (legacy, selects paragraphs by role).
    # NOTE: the validator below only rejects the case where both are missing;
    # declaring both is not rejected here.
    applies_to: AppliesTo | None = None
    target: str | None = None
    # What to do when the target entity is not found
    # (interpreted by RuleRunner._handle_missing).
    on_missing: Literal["pass", "warn", "fail", "skip"] = "skip"
    activate_if: str | None = None
    stages: list[RuleStage]
    messages: Messages

    @model_validator(mode="after")
    def _check_at_least_one_target(self) -> "Rule":
        """Reject a rule that declares neither ``target`` nor ``applies_to``."""
        if self.applies_to is None and self.target is None:
            raise ValueError(
                f"Rule {self.rule_id}: 必须声明 target 或 applies_to 之一"
            )
        return self
class RuleSet(BaseModel):
    """A complete rules file: metadata, entity declarations and rule groups."""

    metadata: RuleSetMetadata
    extract: ExtractSpec = Field(default_factory=ExtractSpec)
    rules: list[RuleGroup]

    @model_validator(mode="after")
    def _check_unique_ids(self) -> "RuleSet":
        """Fail validation on the first rule_id that appears twice across all groups."""
        known_ids: set[str] = set()
        ids_in_order = (rule.rule_id for group in self.rules for rule in group.rules)
        for rule_id in ids_in_order:
            if rule_id in known_ids:
                raise ValueError(f"duplicate rule_id: {rule_id}")
            known_ids.add(rule_id)
        return self

    def all_rules(self) -> list[Rule]:
        """Return every rule of every group as one flat list, in declaration order."""
        flattened: list[Rule] = []
        for group in self.rules:
            flattened.extend(group.rules)
        return flattened
def deco(cls): + cls.name = name + _REGISTRY[name] = cls + return cls + return deco + + +def get_check(name: str) -> type[CheckBase]: + if name not in _REGISTRY: + raise KeyError(f"unknown check: {name}; known: {list(_REGISTRY)}") + return _REGISTRY[name] + + +def all_checks() -> list[str]: + return list(_REGISTRY.keys()) diff --git a/fastapi_modules/fastapi_leaudit/govdoc_engine/engine/checks/ai_check.py b/fastapi_modules/fastapi_leaudit/govdoc_engine/engine/checks/ai_check.py new file mode 100644 index 0000000..bf7e0a8 --- /dev/null +++ b/fastapi_modules/fastapi_leaudit/govdoc_engine/engine/checks/ai_check.py @@ -0,0 +1,151 @@ +"""LLM 语义检查。三级输出:pass / warn / fail。""" + +import logging +import re +from typing import Any +from pydantic import BaseModel +from fastapi_modules.fastapi_leaudit.govdoc_engine.engine.checks import register +from fastapi_modules.fastapi_leaudit.govdoc_engine.engine.checks.base import ( + CheckBase, CheckContext, CheckHit, CheckResult, +) +from fastapi_modules.fastapi_leaudit.govdoc_engine.llm.client import LlmClient, LlmJsonError, _format_exc + +_log = logging.getLogger(__name__) + + +_OUT_FORMAT = """ +请以 JSON 输出: +{"result": "pass|warn|fail", "reason": "<简短理由>", "suggestion": "<改进建议;pass 时填空>"} +""" + +_VAR_RE = re.compile(r"\{\{\s*([^}]+?)\s*\}\}") + + +def _resolve_dot_path(root: Any, path: str) -> str: + """点语法属性访问:title.style.font_eastasia → entities['title'].style.font_eastasia。""" + cur: Any = root + for seg in path.split("."): + if cur is None: + return "" + if isinstance(cur, dict): + cur = cur.get(seg) + elif isinstance(cur, BaseModel): + cur = getattr(cur, seg, None) + else: + cur = getattr(cur, seg, None) + if cur is None: + return "" + if isinstance(cur, (dict, list)): + return str(cur) + return str(cur) + + +def _interpolate( + template: str, + paragraphs: list, + entities: dict | None = None, + target: Any = None, +) -> str: + """渲染顺序:① paragraphs[N] ② entities 点语法 + ③ target 隐式(无前缀时视为 target.)。""" + entities = entities or 
@register("ai")
class AiCheck(CheckBase):
    """LLM-backed semantic check with three verdicts: pass / warn / fail.

    Any failure to obtain a usable JSON object from the LLM turns the check
    into a skipped result (``passed=True, skipped=True``) instead of raising,
    so one bad LLM reply does not abort the whole audit.
    """

    def __init__(self, llm_client: LlmClient | None = None):
        self.client = llm_client or LlmClient()

    def _build_prompt(self, ctx: CheckContext) -> str:
        """Render the stage's prompt template and append the JSON output contract."""
        prompt = _interpolate(
            ctx.stage.prompt or "",
            ctx.paragraphs,
            ctx.entities,
            ctx.target,
        )
        return prompt + "\n\n" + _OUT_FORMAT

    @staticmethod
    def _skipped(reason: str) -> CheckResult:
        """Build the result used whenever the LLM verdict is unavailable."""
        return CheckResult(passed=True, hits=[], skipped=True, skip_reason=reason)

    def _skip_for_error(self, exc: Exception) -> CheckResult:
        """Log *exc* and convert it into a skipped result (shared by run / run_async)."""
        if isinstance(exc, LlmJsonError):
            _log.warning("AI check skipped (LLM JSON error): %s", _format_exc(exc))
            return self._skipped(f"LLM 返回内容无法解析为 JSON:{exc}")
        _log.warning("AI check skipped (LLM error): %s", _format_exc(exc))
        return self._skipped(f"LLM 调用失败:{exc}")

    def _interpret(self, ctx: CheckContext, resp: dict) -> CheckResult:
        """Translate the LLM's JSON reply into a CheckResult.

        ``pass`` yields a passing result; ``fail`` a hit with confidence 0.95;
        any other verdict is treated as a warning-level hit with confidence
        0.7.  The hit is anchored to the first selected paragraph, if any.
        """
        # Valid JSON is not necessarily an object; a list or string reply used
        # to crash on `.get` outside the caller's try block.
        if not isinstance(resp, dict):
            _log.warning(
                "AI check skipped (LLM JSON is not an object): %s",
                type(resp).__name__,
            )
            return self._skipped(f"LLM 返回的 JSON 不是对象:{type(resp).__name__}")

        # Models often answer "Pass" or " pass "; compare case-insensitively
        # so such replies are not reported as failures.
        verdict = str(resp.get("result", "fail")).strip().lower()
        reason = resp.get("reason", "")
        suggestion = resp.get("suggestion", "")
        if verdict == "pass":
            return CheckResult(passed=True, hits=[])

        target_p = ctx.paragraphs[0] if ctx.paragraphs else None
        confidence = 0.95 if verdict == "fail" else 0.7
        return CheckResult(passed=False, hits=[CheckHit(
            paragraph=target_p,
            char_start=0,
            char_end=len(target_p.text) if target_p else 0,
            actual={"llm_reason": reason, "llm_suggestion": suggestion},
            expected={},
            message=reason or "LLM 判定不通过",
            confidence=confidence,
        )])

    def run(self, ctx: CheckContext) -> CheckResult:
        """Synchronous entry point: ask the LLM and interpret its verdict."""
        label = f"ai_{ctx.rule_id or 'unknown'}"
        try:
            resp = self.client.chat_json(
                [{"role": "user", "content": self._build_prompt(ctx)}],
                label=label,
            )
        except Exception as exc:
            return self._skip_for_error(exc)
        return self._interpret(ctx, resp)

    async def run_async(self, ctx: CheckContext) -> CheckResult:
        """Asynchronous counterpart of :meth:`run`."""
        label = f"ai_{ctx.rule_id or 'unknown'}"
        try:
            resp = await self.client.chat_json_async(
                [{"role": "user", "content": self._build_prompt(ctx)}],
                label=label,
            )
        except Exception as exc:
            return self._skip_for_error(exc)
        return self._interpret(ctx, resp)
@register("confused_pair")
class ConfusedPairCheck(CheckBase):
    """Flag commonly confused words, given either literally or as a regex.

    For each pair the literal ``wrong`` text takes precedence: if it occurs in
    the paragraph, only its first occurrence is reported.  Otherwise every
    match of ``wrong_pattern`` (when configured) is reported.
    """

    def run(self, ctx: CheckContext) -> CheckResult:
        found: list[CheckHit] = []
        configured = ctx.stage.pairs or []
        for para in ctx.paragraphs:
            for entry in configured:
                found.extend(self._hits_for(para, entry))
        return CheckResult(passed=not found, hits=found)

    @staticmethod
    def _hits_for(para, entry):
        """Return the hits one configured pair produces in one paragraph."""
        literal = entry.get("wrong")
        pattern = entry.get("wrong_pattern")
        replacement = entry.get("correct") or entry.get("suggest", "")
        why = entry.get("reason", "")

        offset = para.text.find(literal) if literal else -1
        if offset >= 0:
            return [CheckHit(
                paragraph=para, char_start=offset, char_end=offset + len(literal),
                actual={"text": literal}, expected={"text": replacement},
                message=f'"{literal}" 应为 "{replacement}"。{why}',
            )]
        if not pattern:
            return []
        return [
            CheckHit(
                paragraph=para, char_start=hit.start(), char_end=hit.end(),
                actual={"text": hit.group(0)},
                expected={"text": replacement},
                message=f'"{hit.group(0)}" 应为 "{replacement}"。{why}',
            )
            for hit in re.finditer(pattern, para.text)
        ]
@register("cross_role")
class CrossRoleCheck(CheckBase):
    """Checks that depend on relations between paragraphs of the whole document.

    Supported ``stage.rules`` entry types:

    - ``h2_no_period_then_break``: a ``heading_2`` paragraph ends with a
      period and is followed by a non-empty paragraph.
    - ``attachment_item_no_trailing_punct``: delegated to
      ``_attachment_item_hits``.

    Unknown types are ignored.
    """

    def run(self, ctx: CheckContext) -> CheckResult:
        rules = ctx.stage.rules or []
        paras = ctx.document.paragraphs
        hits: list[CheckHit] = []
        for r in rules:
            t = r.get("type")
            if t == "h2_no_period_then_break":
                hits.extend(self._h2_period_hits(paras))
            elif t == "attachment_item_no_trailing_punct":
                hits.extend(_attachment_item_hits(paras))
        return CheckResult(passed=not hits, hits=hits)

    @staticmethod
    def _h2_period_hits(paras):
        """Return one hit per level-2 heading that ends with a period before a new paragraph."""
        found = []
        for i, p in enumerate(paras):
            if p.role != "heading_2":
                continue
            trimmed = p.text.rstrip()
            if not trimmed.endswith(("。", ".")):
                continue
            if i + 1 < len(paras) and paras[i + 1].text.strip():
                # The period is the last character of the *trimmed* text; using
                # len(p.text) would highlight trailing whitespace instead.
                period_end = len(trimmed)
                found.append(CheckHit(
                    paragraph=p,
                    char_start=period_end - 1, char_end=period_end,
                    actual={"text": p.text},
                    message="二级标题在换行分段时不应使用句号;如使用句号则应紧接正文",
                ))
        return found
message=f'附件项末尾不应有标点:"{m.group(0)}"', + )) + continue + + if p.role in ("signature", "date", "heading_1"): + in_attachment = False + continue + + if in_attachment and _ATTACH_ITEM_LINE.match(text): + hits.append(CheckHit( + paragraph=p, + char_start=len(p.text) - 1, char_end=len(p.text), + actual={"text": p.text}, + message=f'附件项末尾不应有标点:"{text}"', + )) + return hits diff --git a/fastapi_modules/fastapi_leaudit/govdoc_engine/engine/checks/font.py b/fastapi_modules/fastapi_leaudit/govdoc_engine/engine/checks/font.py new file mode 100644 index 0000000..66dbec6 --- /dev/null +++ b/fastapi_modules/fastapi_leaudit/govdoc_engine/engine/checks/font.py @@ -0,0 +1,162 @@ +"""字体/字号/复合样式/行距 check。""" + +import re + +from fastapi_modules.fastapi_leaudit.govdoc_engine.engine.checks import register +from fastapi_modules.fastapi_leaudit.govdoc_engine.engine.checks.base import CheckBase, CheckContext, CheckHit, CheckResult +from fastapi_modules.fastapi_leaudit.govdoc_engine.models import Paragraph, ParagraphStyle + + +def _font_match(actual: str | None, expect: str) -> bool: + if not actual: + return False + return expect in actual or actual in expect + + +def _size_match(actual: float | None, expect: float, tol: float = 0.5) -> bool: + if actual is None: + return False + return abs(actual - expect) <= tol + + +def _style_matches(style: ParagraphStyle, expect: dict) -> bool: + if "eastasia" in expect and not _font_match(style.font_eastasia, expect["eastasia"]): + return False + if "size_pt" in expect and not _size_match( + style.font_size_pt, float(expect["size_pt"]) + ): + return False + if "bold" in expect and bool(style.bold) != bool(expect["bold"]): + return False + return True + + +@register("font") +class FontCheck(CheckBase): + def run(self, ctx: CheckContext) -> CheckResult: + expect = ctx.stage.expect or {} + hits: list[CheckHit] = [] + for p in ctx.paragraphs: + ok = True + actual = { + "font": p.style.font_eastasia, + "size": p.style.font_size_pt, + } + if "eastasia" in 
@register("style_match")
class StyleMatchCheck(CheckBase):
    """Compare each paragraph's style with the expected font, size, bold and alignment.

    Only the keys present in ``stage.expect`` are compared; a paragraph that
    violates any of them yields one hit spanning the whole paragraph.
    """

    def run(self, ctx: CheckContext) -> CheckResult:
        wanted = ctx.stage.expect or {}
        mismatches: list[CheckHit] = []
        for para in ctx.paragraphs:
            style = para.style
            observed = {
                "font": style.font_eastasia,
                "size": style.font_size_pt,
                "bold": style.bold,
                "italic": style.italic,
                "alignment": style.alignment,
            }
            # Every configured criterion is evaluated; one failure is enough.
            failures = []
            if "eastasia" in wanted:
                failures.append(not _font_match(style.font_eastasia, wanted["eastasia"]))
            if "size_pt" in wanted:
                failures.append(not _size_match(style.font_size_pt, float(wanted["size_pt"])))
            if "bold" in wanted:
                failures.append(bool(style.bold) != bool(wanted["bold"]))
            if "alignment" in wanted:
                failures.append(style.alignment != wanted["alignment"])
            if not any(failures):
                continue
            mismatches.append(CheckHit(
                paragraph=para, char_start=0, char_end=len(para.text),
                actual=observed, expected=wanted, message="样式不符合",
            ))
        return CheckResult(passed=not mismatches, hits=mismatches)
marker_styles: + marker_styles = [p.style] + bad_style = next( + (style for style in marker_styles if not _style_matches(style, expect)), + None, + ) + if bad_style is not None: + hits.append(CheckHit( + paragraph=p, + char_start=match.start(1), + char_end=marker_end, + actual={ + "font": bad_style.font_eastasia, + "size": bad_style.font_size_pt, + "bold": bad_style.bold, + }, + expected=expect, + message="附件标记样式不符合", + )) + return CheckResult(passed=not hits, hits=hits) + + +def _marker_run_styles(p: Paragraph, marker_end: int) -> list[ParagraphStyle]: + styles: list[ParagraphStyle] = [] + cursor = 0 + for run in p.runs: + run_start = cursor + run_end = cursor + len(run.text) + cursor = run_end + if run_end <= 0 or run_start >= marker_end: + continue + if run.text.strip(): + styles.append(run.style) + return styles + + +@register("line_spacing") +class LineSpacingCheck(CheckBase): + def run(self, ctx: CheckContext) -> CheckResult: + expect = ctx.stage.expect or {} + target = float(expect.get("multiple", 1.5)) + tol = float(expect.get("tol", 0.05)) + hits: list[CheckHit] = [] + for p in ctx.paragraphs: + actual = p.style.line_spacing + if actual is None or abs(actual - target) > tol: + hits.append(CheckHit( + paragraph=p, char_start=0, char_end=len(p.text), + actual={"line_spacing": actual}, + expected={"line_spacing": target}, + message=f"行距应为 {target},实际 {actual}", + )) + return CheckResult(passed=not hits, hits=hits) diff --git a/fastapi_modules/fastapi_leaudit/govdoc_engine/engine/checks/forbid.py b/fastapi_modules/fastapi_leaudit/govdoc_engine/engine/checks/forbid.py new file mode 100644 index 0000000..e39b897 --- /dev/null +++ b/fastapi_modules/fastapi_leaudit/govdoc_engine/engine/checks/forbid.py @@ -0,0 +1,42 @@ +"""短语/字符黑名单。""" + +from fastapi_modules.fastapi_leaudit.govdoc_engine.engine.checks import register +from fastapi_modules.fastapi_leaudit.govdoc_engine.engine.checks.base import CheckBase, CheckContext, CheckHit, CheckResult + + 
@register("forbid_chars")
class ForbidCharsCheck(CheckBase):
    """Report every occurrence of each forbidden character (or substring).

    Empty entries in ``stage.chars`` are ignored.
    """

    def run(self, ctx: CheckContext) -> CheckResult:
        # An empty string is "found" at every offset without advancing the
        # cursor, which made the scan below loop forever; drop such entries.
        chars = [c for c in (ctx.stage.chars or []) if c]
        hits: list[CheckHit] = []
        for p in ctx.paragraphs:
            for c in chars:
                start = 0
                while True:
                    idx = p.text.find(c, start)
                    if idx < 0:
                        break
                    hits.append(CheckHit(
                        paragraph=p, char_start=idx, char_end=idx + len(c),
                        actual={"char": c}, expected={"forbid": c},
                        message=f"禁用字符 \"{c}\" 出现在 idx {idx}",
                    ))
                    # Resume after this occurrence (non-overlapping matches).
                    start = idx + len(c)
        return CheckResult(passed=not hits, hits=hits)
+ if m: + hits.append(CheckHit( + paragraph=p, char_start=m.start(), char_end=m.end(), + actual={"text": m.group(0)}, + expected={"forbid_pattern": f.pattern}, + message=f"层级序号格式错误:命中禁用模式 {f.pattern}", + )) + return CheckResult(passed=not hits, hits=hits) diff --git a/fastapi_modules/fastapi_leaudit/govdoc_engine/engine/checks/punctuation.py b/fastapi_modules/fastapi_leaudit/govdoc_engine/engine/checks/punctuation.py new file mode 100644 index 0000000..d639aea --- /dev/null +++ b/fastapi_modules/fastapi_leaudit/govdoc_engine/engine/checks/punctuation.py @@ -0,0 +1,46 @@ +"""标点符号专项规则。""" + +import re +from fastapi_modules.fastapi_leaudit.govdoc_engine.engine.checks import register +from fastapi_modules.fastapi_leaudit.govdoc_engine.engine.checks.base import CheckBase, CheckContext, CheckHit, CheckResult + + +# 多书名号或引号并列时不应用顿号分隔(中文/中文标点) +_QUOTE_DUNHAO_RE = re.compile(r"([”》])、([“《])") +# 句内括号末尾(除问号/叹号/省略号外)不应有标点 +_PAREN_PUNCT_RE = re.compile(r"[((][^))]*?[,。;:、][))]") +# 引号嵌套:双引号内含单引号包裹的强调短语(如 "卓'粤'创一流") +_NESTED_QUOTE_RE = re.compile(r"“[^“”]*?‘[^‘’]+’[^“”]*?”") + + +@register("punctuation") +class PunctuationCheck(CheckBase): + def run(self, ctx: CheckContext) -> CheckResult: + rules = ctx.stage.rules or [] + hits: list[CheckHit] = [] + for p in ctx.paragraphs: + for r in rules: + t = r.get("type") + if t == "no_dunhao_between_quotes": + for m in _QUOTE_DUNHAO_RE.finditer(p.text): + hits.append(CheckHit( + paragraph=p, char_start=m.start(), char_end=m.end(), + actual={"text": m.group(0)}, + expected={"text": m.group(0).replace("、", "")}, + message="多个引号/书名号并列不应用顿号分隔", + )) + elif t == "no_punct_inside_inline_paren": + for m in _PAREN_PUNCT_RE.finditer(p.text): + hits.append(CheckHit( + paragraph=p, char_start=m.start(), char_end=m.end(), + actual={"text": m.group(0)}, + message="句内括号末尾通常不应含标点", + )) + elif t == "no_outer_quote_when_inner_quote": + for m in _NESTED_QUOTE_RE.finditer(p.text): + hits.append(CheckHit( + paragraph=p, char_start=m.start(), 
@register("regex_forbid")
class RegexForbidCheck(CheckBase):
    """Report every match of the stage's forbidden regex in the selected paragraphs.

    A stage without a pattern is reported as skipped.
    """

    def run(self, ctx: CheckContext) -> CheckResult:
        pattern = ctx.stage.pattern
        if not pattern:
            # An empty regex matches at every position, so a missing pattern
            # used to produce one bogus empty hit per character.
            return CheckResult(
                passed=True, hits=[], skipped=True,
                skip_reason="regex_forbid 未配置 pattern",
            )
        pat = re.compile(pattern)
        hits: list[CheckHit] = []
        for p in ctx.paragraphs:
            for m in pat.finditer(p.text):
                hits.append(CheckHit(
                    paragraph=p, char_start=m.start(), char_end=m.end(),
                    actual={"text": m.group(0)},
                    expected={"forbid_pattern": ctx.stage.pattern},
                    message=f"出现禁止模式 {ctx.stage.pattern}(命中 \"{m.group(0)}\")",
                ))
        return CheckResult(passed=not hits, hits=hits)
@register("required")
class RequiredCheck(CheckBase):
    """Require non-blank text on the target entity or on every selected paragraph."""

    def run(self, ctx: CheckContext) -> CheckResult:
        entity = ctx.target

        # applies_to channel: every selected paragraph must contain text.
        if entity is None:
            blank_hits = [
                CheckHit(paragraph=para, message="段落为空")
                for para in ctx.paragraphs
                if not para.text.strip()
            ]
            return CheckResult(passed=not blank_hits, hits=blank_hits)

        # target channel: the entity's own text must be non-blank.
        if (entity.text or "").strip():
            return CheckResult(passed=True, hits=[])
        first_para = ctx.paragraphs[0] if ctx.paragraphs else None
        missing = CheckHit(paragraph=first_para, message=f"实体 {entity.name} 缺失或为空")
        return CheckResult(passed=False, hits=[missing])
class StructureItem(BaseModel):
    """Statistics for one paragraph role within the document structure."""

    role: str
    label: str
    count: int
    expected: bool
    # NOTE(review): the fields below look like per-role aggregates filled in by
    # the structure builder, which is not visible here; confirm their exact
    # meaning there.
    paragraph_indices: list[int] = Field(default_factory=list)
    samples: list[str] = Field(default_factory=list)
    char_total: int = 0
    dominant_font: str | None = None
    dominant_size_pt: float | None = None
    style_uniform: bool = True
class AuditResult(BaseModel):
    """Full outcome of one audit: findings, per-rule status and document structure."""

    audit_id: str
    document: dict = Field(default_factory=dict)
    summary: AuditSummary = Field(default_factory=AuditSummary)
    findings: list[Finding] = Field(default_factory=list)
    checked_rules: list[CheckedRule] = Field(default_factory=list)
    structure: list[StructureItem] = Field(default_factory=list)
    outline: list[OutlineNode] = Field(default_factory=list)
    entities: dict[str, SemanticEntity | None] = Field(default_factory=dict)

    def compute_summary(self) -> None:
        """Recompute ``summary`` from the current findings and checked rules.

        The score starts at 100, loses 10 per "error" finding and 3 per
        "warning" finding, and is floored at 0.
        """
        severity_tally = Counter(item.severity for item in self.findings)
        category_tally = Counter(item.category for item in self.findings)
        status_tally = Counter(rule.status for rule in self.checked_rules)

        # Reading a missing key from a Counter yields 0 and does not insert it.
        penalty = 10 * severity_tally["error"] + 3 * severity_tally["warning"]

        self.summary = AuditSummary(
            score=max(0, 100 - penalty),
            total_findings=len(self.findings),
            by_severity=dict(severity_tally),
            by_category=dict(category_tally),
            passed_count=status_tally["pass"],
            failed_count=status_tally["fail"],
            skipped_count=status_tally["skipped"],
        )
fastapi_modules.fastapi_leaudit.govdoc_engine.dsl.schema import Rule +from fastapi_modules.fastapi_leaudit.govdoc_engine.engine.checks import get_check # noqa: F401 (确保注册) +from fastapi_modules.fastapi_leaudit.govdoc_engine.engine.checks.base import CheckContext, CheckResult, CheckHit +from fastapi_modules.fastapi_leaudit.govdoc_engine.engine.checks.ai_check import AiCheck +from fastapi_modules.fastapi_leaudit.govdoc_engine.engine.selector import select_paragraphs +from fastapi_modules.fastapi_leaudit.govdoc_engine.parser.entities import SemanticEntity +from fastapi_modules.fastapi_leaudit.govdoc_engine.llm.client import LlmClient + +# 触发所有 check 类的 @register +from fastapi_modules.fastapi_leaudit.govdoc_engine.engine.checks import required as _r # noqa: F401 +from fastapi_modules.fastapi_leaudit.govdoc_engine.engine.checks import font as _f # noqa: F401 +from fastapi_modules.fastapi_leaudit.govdoc_engine.engine.checks import regex_check as _rc # noqa: F401 +from fastapi_modules.fastapi_leaudit.govdoc_engine.engine.checks import confused_pair as _cp # noqa: F401 +from fastapi_modules.fastapi_leaudit.govdoc_engine.engine.checks import forbid as _fb # noqa: F401 +from fastapi_modules.fastapi_leaudit.govdoc_engine.engine.checks import wenzhong as _wz # noqa: F401 +from fastapi_modules.fastapi_leaudit.govdoc_engine.engine.checks import hierarchy as _h # noqa: F401 +from fastapi_modules.fastapi_leaudit.govdoc_engine.engine.checks import punctuation as _p # noqa: F401 +from fastapi_modules.fastapi_leaudit.govdoc_engine.engine.checks import cross_role as _cr # noqa: F401 +from fastapi_modules.fastapi_leaudit.govdoc_engine.engine.checks import ai_check as _ai # noqa: F401 + + +@dataclass +class RuleOutcome: + """单条规则的执行结果(含 skipped 状态)。""" + + rule: Rule + findings: list[Finding] = field(default_factory=list) + skipped: bool = False + skip_reason: str = "" + + +class RuleRunner: + def __init__(self, llm_client: LlmClient | None = None): + self.llm = llm_client + + # -- 
上下文装配 ----------------------------------------------------- + def _resolve_target( + self, + rule: Rule, + doc: Document, + entities: dict[str, SemanticEntity | None], + ) -> tuple[list, SemanticEntity | None, RuleOutcome | None]: + """根据 rule.target 或 rule.applies_to 选段落。 + + 返回 (paragraphs, target_entity, early_outcome); + 若 early_outcome 非 None,调用方应直接返回(命中 on_missing 提前结束)。 + """ + if rule.target: + target_entity = entities.get(rule.target) + if target_entity is None: + return [], None, self._handle_missing(rule) + paragraphs = [ + doc.paragraphs[i] + for i in target_entity.paragraph_indices + if 0 <= i < len(doc.paragraphs) + ] + return paragraphs, target_entity, None + # applies_to 通道(多段扫描) + return select_paragraphs(doc, rule.applies_to), None, None + + def _handle_missing(self, rule: Rule) -> RuleOutcome: + mode = rule.on_missing + if mode == "pass": + return RuleOutcome(rule=rule) + reason = f"目标实体「{rule.target}」未识别到" + if mode == "skip": + return RuleOutcome(rule=rule, skipped=True, skip_reason=reason) + severity = "error" if mode == "fail" else "warning" + finding = Finding( + finding_id=f"F-{uuid.uuid4().hex[:8]}", + rule_id=rule.rule_id, + rule_name=rule.name, + severity=severity, + category=rule.category, + location=Location(paragraph_index=-1), + message=reason, + suggestion=rule.messages.fail or "", + evidence="", confidence=0.9, + ) + return RuleOutcome(rule=rule, findings=[finding]) + + @staticmethod + def _merge_skip(outcome: RuleOutcome, result: CheckResult) -> None: + if not outcome.skip_reason: + outcome.skip_reason = result.skip_reason or "stage skipped" + outcome.skipped = True + + # -- 同步路径 ------------------------------------------------------- + def run_rule( + self, + rule: Rule, + doc: Document, + entities: dict[str, SemanticEntity | None] | None = None, + ) -> RuleOutcome: + entities = entities or {} + paragraphs, target, early = self._resolve_target(rule, doc, entities) + if early is not None: + return early + + outcome = 
RuleOutcome(rule=rule) + for stage in rule.stages: + if stage.check == "ai": + check = AiCheck(llm_client=self.llm) + else: + check_cls = get_check(stage.check) + check = check_cls() + + ctx = CheckContext( + document=doc, + paragraphs=paragraphs, + stage=stage, + entities=entities, + target=target, + rule_id=rule.rule_id, + ) + result: CheckResult = check.run(ctx) + if result.skipped: + self._merge_skip(outcome, result) + continue + if not result.passed: + outcome.findings = [self._hit_to_finding(rule, h) for h in result.hits] + outcome.skipped = False + outcome.skip_reason = "" + return outcome + return outcome + + def run_all( + self, + rules: list[Rule], + doc: Document, + entities: dict[str, SemanticEntity | None] | None = None, + ) -> list[Finding]: + flat, _ = self.evaluate(rules, doc, entities) + return flat + + def evaluate( + self, + rules: list[Rule], + doc: Document, + entities: dict[str, SemanticEntity | None] | None = None, + ) -> tuple[list[Finding], list[RuleOutcome]]: + flat: list[Finding] = [] + outcomes: list[RuleOutcome] = [] + for r in rules: + o = self.run_rule(r, doc, entities) + flat.extend(o.findings) + outcomes.append(o) + return flat, outcomes + + # -- 异步路径 ------------------------------------------------------- + async def run_rule_async( + self, + rule: Rule, + doc: Document, + entities: dict[str, SemanticEntity | None] | None = None, + ) -> RuleOutcome: + entities = entities or {} + paragraphs, target, early = self._resolve_target(rule, doc, entities) + if early is not None: + return early + + outcome = RuleOutcome(rule=rule) + for stage in rule.stages: + ctx = CheckContext( + document=doc, + paragraphs=paragraphs, + stage=stage, + entities=entities, + target=target, + rule_id=rule.rule_id, + ) + if stage.check == "ai": + result = await AiCheck(llm_client=self.llm).run_async(ctx) + else: + check_cls = get_check(stage.check) + result = check_cls().run(ctx) + if result.skipped: + self._merge_skip(outcome, result) + continue + if not 
result.passed: + outcome.findings = [self._hit_to_finding(rule, h) for h in result.hits] + outcome.skipped = False + outcome.skip_reason = "" + return outcome + return outcome + + async def run_all_async( + self, + rules: list[Rule], + doc: Document, + entities: dict[str, SemanticEntity | None] | None = None, + ) -> list[Finding]: + flat, _ = await self.evaluate_async(rules, doc, entities) + return flat + + async def evaluate_async( + self, + rules: list[Rule], + doc: Document, + entities: dict[str, SemanticEntity | None] | None = None, + ) -> tuple[list[Finding], list[RuleOutcome]]: + outcomes_list = await asyncio.gather( + *(self.run_rule_async(r, doc, entities) for r in rules) + ) + flat: list[Finding] = [] + outcomes: list[RuleOutcome] = [] + for o in outcomes_list: + flat.extend(o.findings) + outcomes.append(o) + return flat, outcomes + + def _hit_to_finding(self, rule: Rule, hit: CheckHit) -> Finding: + para = hit.paragraph + loc = Location( + paragraph_index=para.index if para else -1, + role=para.role if para else None, + char_start=hit.char_start, + char_end=hit.char_end, + context=para.text if para else "", + ) + msg = hit.message or rule.messages.fail + return Finding( + finding_id=f"F-{uuid.uuid4().hex[:8]}", + rule_id=rule.rule_id, + rule_name=rule.name, + severity=rule.severity, + category=rule.category, + location=loc, + actual=hit.actual or {}, + expected=hit.expected or {}, + message=msg, + suggestion=rule.messages.fail or "", + evidence=rule.messages.fail or "", + confidence=hit.confidence, + ) diff --git a/fastapi_modules/fastapi_leaudit/govdoc_engine/engine/selector.py b/fastapi_modules/fastapi_leaudit/govdoc_engine/engine/selector.py new file mode 100644 index 0000000..9d5ea7b --- /dev/null +++ b/fastapi_modules/fastapi_leaudit/govdoc_engine/engine/selector.py @@ -0,0 +1,27 @@ +"""applies_to → 段落集合。""" + +from __future__ import annotations +from fastapi_modules.fastapi_leaudit.govdoc_engine.models import Document, Paragraph +from 
fastapi_modules.fastapi_leaudit.govdoc_engine.dsl.schema import AppliesTo + + +def select_paragraphs(doc: Document, applies_to: AppliesTo) -> list[Paragraph]: + if applies_to.paragraph_index is not None: + idx = applies_to.paragraph_index + if 0 <= idx < len(doc.paragraphs): + return [doc.paragraphs[idx]] + return [] + + if applies_to.role == "any": + return list(doc.paragraphs) + + targets: set[str] = set() + if applies_to.role: + targets.add(applies_to.role) + if applies_to.roles: + targets.update(applies_to.roles) + + if not targets: + return list(doc.paragraphs) + + return [p for p in doc.paragraphs if p.role in targets] diff --git a/fastapi_modules/fastapi_leaudit/govdoc_engine/engine/structure.py b/fastapi_modules/fastapi_leaudit/govdoc_engine/engine/structure.py new file mode 100644 index 0000000..efeb802 --- /dev/null +++ b/fastapi_modules/fastapi_leaudit/govdoc_engine/engine/structure.py @@ -0,0 +1,93 @@ +"""从 Document 派生出 structure(按 role 分类统计)+ outline(heading 层级树)。""" + +from __future__ import annotations +from collections import Counter +from fastapi_modules.fastapi_leaudit.govdoc_engine.models import Document, Paragraph +from fastapi_modules.fastapi_leaudit.govdoc_engine.engine.result import OutlineNode, StructureItem + + +_ROLE_LABELS: list[tuple[str, str, bool]] = [ + # (role, 中文标签, 是否常规公文必备) + ("title", "标题", True), + ("doc_number", "发文字号", True), + ("recipient", "主送机关", True), + ("heading_1", "一级标题", False), + ("heading_2", "二级标题", False), + ("heading_3", "三级标题", False), + ("heading_4", "四级标题", False), + ("body", "正文", True), + ("attachment_marker", "附件标记", False), + ("attachment_title", "附件标题", False), + ("signature", "署名", True), + ("date", "成文日期", True), + ("no_text_marker", "(此页无正文)", False), + ("unknown", "未识别", False), +] + +_HEADING_LEVELS = { + "heading_1": 1, + "heading_2": 2, + "heading_3": 3, + "heading_4": 4, +} + + +def _dominant_style(paragraphs: list[Paragraph]) -> tuple[str | None, float | None, bool]: + """返回 (字体众数, 字号众数, 
是否所有段落样式一致)。""" + if not paragraphs: + return None, None, True + fonts = Counter(p.style.font_eastasia for p in paragraphs if p.style.font_eastasia) + sizes = Counter(p.style.font_size_pt for p in paragraphs if p.style.font_size_pt is not None) + dom_font = fonts.most_common(1)[0][0] if fonts else None + dom_size = sizes.most_common(1)[0][0] if sizes else None + uniform = len(fonts) <= 1 and len(sizes) <= 1 + return dom_font, dom_size, uniform + + +def build_structure(doc: Document) -> list[StructureItem]: + items: list[StructureItem] = [] + for role, label, expected in _ROLE_LABELS: + paragraphs = [p for p in doc.paragraphs if p.role == role] + if not paragraphs and not expected: + # 非必备 role 没出现就不展示,保持面板紧凑 + continue + samples = [p.text[:60] for p in paragraphs[:3]] + font, size, uniform = _dominant_style(paragraphs) + items.append(StructureItem( + role=role, + label=label, + count=len(paragraphs), + expected=expected, + paragraph_indices=[p.index for p in paragraphs], + samples=samples, + char_total=sum(len(p.text) for p in paragraphs), + dominant_font=font, + dominant_size_pt=size, + style_uniform=uniform, + )) + return items + + +def build_outline(doc: Document) -> list[OutlineNode]: + """按段落顺序 + heading 层级生成树。""" + headings = [ + (p.index, _HEADING_LEVELS[p.role], p.text) + for p in doc.paragraphs + if p.role in _HEADING_LEVELS + ] + if not headings: + return [] + + roots: list[OutlineNode] = [] + stack: list[OutlineNode] = [] + for idx, level, text in headings: + node = OutlineNode(paragraph_index=idx, level=level, text=text) + # 弹出比当前 level 更深的祖先 + while stack and stack[-1].level >= level: + stack.pop() + if stack: + stack[-1].children.append(node) + else: + roots.append(node) + stack.append(node) + return roots diff --git a/fastapi_modules/fastapi_leaudit/govdoc_engine/llm/__init__.py b/fastapi_modules/fastapi_leaudit/govdoc_engine/llm/__init__.py new file mode 100644 index 0000000..e69de29 diff --git 
a/fastapi_modules/fastapi_leaudit/govdoc_engine/llm/cache.py b/fastapi_modules/fastapi_leaudit/govdoc_engine/llm/cache.py new file mode 100644 index 0000000..4e1285a --- /dev/null +++ b/fastapi_modules/fastapi_leaudit/govdoc_engine/llm/cache.py @@ -0,0 +1,101 @@ +"""LLM 响应缓存(SQLite)。 + +缓存键 = sha256(model + canonical_json(messages, temperature, top_p, max_tokens))。 +仅缓存成功返回的文本;JSON 解析失败、API 错误、超时一律不入库。 +""" + +from __future__ import annotations +import hashlib +import json +import logging +import sqlite3 +import time +from pathlib import Path +from threading import Lock +from typing import Any + +_log = logging.getLogger(__name__) + +_SCHEMA = """ +CREATE TABLE IF NOT EXISTS llm_cache ( + cache_key TEXT PRIMARY KEY, + model TEXT NOT NULL, + response_text TEXT NOT NULL, + created_at REAL NOT NULL, + hit_count INTEGER NOT NULL DEFAULT 0, + last_hit_at REAL +); +CREATE INDEX IF NOT EXISTS idx_llm_cache_created ON llm_cache(created_at); +""" + +# 影响响应的关键参数。其他 kwargs 不入 hash(如 stream/timeout)。 +_KEY_PARAMS = ("temperature", "top_p", "max_tokens", "response_format") + + +def _canonical(messages: list[dict[str, str]], **kwargs: Any) -> str: + payload = { + "messages": messages, + "params": {k: kwargs.get(k) for k in _KEY_PARAMS}, + } + return json.dumps(payload, sort_keys=True, ensure_ascii=False) + + +def make_key(model: str, messages: list[dict[str, str]], **kwargs: Any) -> str: + raw = model + "\x00" + _canonical(messages, **kwargs) + return hashlib.sha256(raw.encode("utf-8")).hexdigest() + + +class LlmCache: + def __init__(self, path: str | Path): + self.path = Path(path) + self.path.parent.mkdir(parents=True, exist_ok=True) + self._lock = Lock() + self._conn = sqlite3.connect(str(self.path), check_same_thread=False) + self._conn.executescript(_SCHEMA) + self._conn.commit() + + def get(self, key: str) -> str | None: + with self._lock: + row = self._conn.execute( + "SELECT response_text FROM llm_cache WHERE cache_key = ?", + (key,), + ).fetchone() + if row is None: + 
return None + self._conn.execute( + "UPDATE llm_cache " + "SET hit_count = hit_count + 1, last_hit_at = ? " + "WHERE cache_key = ?", + (time.time(), key), + ) + self._conn.commit() + return row[0] + + def put(self, key: str, model: str, response_text: str) -> None: + if not response_text: + return + with self._lock: + self._conn.execute( + "INSERT OR IGNORE INTO llm_cache " + "(cache_key, model, response_text, created_at) " + "VALUES (?, ?, ?, ?)", + (key, model, response_text, time.time()), + ) + self._conn.commit() + + def stats(self) -> dict[str, int]: + with self._lock: + row = self._conn.execute( + "SELECT COUNT(*), COALESCE(SUM(hit_count), 0) FROM llm_cache" + ).fetchone() + return {"entries": int(row[0] or 0), "total_hits": int(row[1] or 0)} + + def clear(self) -> int: + with self._lock: + cur = self._conn.execute("DELETE FROM llm_cache") + self._conn.commit() + return cur.rowcount + + def close(self) -> None: + with self._lock: + self._conn.close() diff --git a/fastapi_modules/fastapi_leaudit/govdoc_engine/llm/client.py b/fastapi_modules/fastapi_leaudit/govdoc_engine/llm/client.py new file mode 100644 index 0000000..0dd379d --- /dev/null +++ b/fastapi_modules/fastapi_leaudit/govdoc_engine/llm/client.py @@ -0,0 +1,258 @@ +"""Qwen LLM 客户端(OpenAI 兼容协议)。 + +包含:超时(asyncio.wait_for)、重试(指数退避)、并发上限(Semaphore)。 +""" + +from __future__ import annotations +import asyncio +import json +import logging +import re +import time +from typing import Any + +from openai import AsyncOpenAI, OpenAI, APIError, APIConnectionError, RateLimitError + +from fastapi_admin.config import ( + LLM_API_KEY, + LLM_BASE_URL, + LLM_MODEL, + LEAUDIT_LLM_MAX_CONCURRENCY, + LEAUDIT_LLM_REQUEST_TIMEOUT, + LEAUDIT_LLM_RETRY_MAX_ATTEMPTS, + LEAUDIT_LLM_RETRY_BACKOFF_BASE_SECONDS, +) +from fastapi_modules.fastapi_leaudit.govdoc_engine.llm.cache import LlmCache, make_key + +_log = logging.getLogger(__name__) + + +_FENCE_RE = re.compile(r"```(?:json)?\s*([\s\S]+?)\s*```", re.MULTILINE) + +# 
这些异常会触发重试;JSON 解析错误等业务错误不重试 +_RETRYABLE = ( + asyncio.TimeoutError, + TimeoutError, + APIConnectionError, + RateLimitError, +) + + +class LlmJsonError(Exception): + """LLM 返回内容无法解析为 JSON。""" + + +class LlmConfigError(Exception): + """LLM 客户端缺少必要配置。""" + + +def _parse_json_text(text: str) -> dict[str, Any]: + text = text.strip() + m = _FENCE_RE.search(text) + if m: + text = m.group(1) + try: + return json.loads(text) + except json.JSONDecodeError: + start = text.find("{") + end = text.rfind("}") + if start >= 0 and end > start: + try: + return json.loads(text[start : end + 1]) + except json.JSONDecodeError as e: + raise LlmJsonError(f"failed to parse LLM JSON: {text!r}") from e + raise LlmJsonError(f"LLM returned non-JSON content: {text!r}") + + +def _is_retryable_status(exc: Exception) -> bool: + """APIError 中只重试 5xx 与 429。""" + if isinstance(exc, RateLimitError): + return True + if isinstance(exc, APIError): + status = getattr(exc, "status_code", None) + return status is not None and (status >= 500 or status == 429) + return False + + +def _clip_text(value: Any, limit: int = 400) -> str: + text = str(value).strip() + if len(text) <= limit: + return text + return text[: limit - 3] + "..." 
+ + +def _format_exc(exc: Exception) -> str: + text = str(exc).strip() + parts = [exc.__class__.__name__] + if text: + parts.append(text) + + status = getattr(exc, "status_code", None) + if status is not None: + parts.append(f"status={status}") + + body = getattr(exc, "body", None) + if body not in (None, "", b""): + parts.append(f"body={_clip_text(body)}") + + response = getattr(exc, "response", None) + if response is not None: + try: + request = getattr(response, "request", None) + if request is not None and getattr(request, "url", None): + parts.append(f"url={request.url}") + except Exception: + pass + + request = getattr(exc, "request", None) + if request is not None and getattr(request, "url", None): + parts.append(f"url={request.url}") + + return ": ".join(parts[:2]) + ("" if len(parts) <= 2 else " | " + " | ".join(parts[2:])) + + +class LlmClient: + def __init__( + self, + api_key: str | None = None, + base_url: str | None = None, + model: str | None = None, + max_concurrency: int | None = None, + timeout_seconds: float | None = None, + max_retries: int | None = None, + cache: LlmCache | None = None, + cache_enabled: bool | None = None, + ): + key = api_key or LLM_API_KEY + self._misconfigured_error: LlmConfigError | None = None + if not key: + self._client = None + self._aclient = None + self._misconfigured_error = LlmConfigError( + "LLM_API_KEY is not configured. Set LLM_API_KEY in platform config." 
+ ) + else: + self._client = OpenAI(api_key=key, base_url=base_url or LLM_BASE_URL) + self._aclient = AsyncOpenAI(api_key=key, base_url=base_url or LLM_BASE_URL) + self.model = model or LLM_MODEL + self.timeout = timeout_seconds if timeout_seconds is not None else LEAUDIT_LLM_REQUEST_TIMEOUT + self.max_retries = max_retries if max_retries is not None else LEAUDIT_LLM_RETRY_MAX_ATTEMPTS + conc = max_concurrency if max_concurrency is not None else LEAUDIT_LLM_MAX_CONCURRENCY + self._sem = asyncio.Semaphore(conc) + # 缓存:cache 显式传入则用之;否则默认关闭。 + if cache is not None: + self.cache: LlmCache | None = cache + elif cache_enabled is not False: + self.cache = None + else: + self.cache = None + + def _ensure_ready(self) -> None: + if self._misconfigured_error is not None: + raise self._misconfigured_error + + @staticmethod + def _prompt_text(messages: list[dict[str, str]]) -> str: + return "\n\n".join( + f"[{m.get('role', 'user')}]\n{m.get('content', '')}" + for m in messages + ) + + # -- 同步路径 ------------------------------------------------------- + def chat(self, messages: list[dict[str, str]], **kwargs) -> str: + self._ensure_ready() + use_cache = kwargs.pop("use_cache", True) + label = kwargs.pop("label", "llm_call") + cache_kwargs = {k: kwargs.get(k) for k in ("temperature", "top_p", "max_tokens", "response_format")} + cache_key: str | None = None + prompt_text = self._prompt_text(messages) + t0 = time.monotonic() + + if use_cache and self.cache is not None: + cache_key = make_key(self.model, messages, **cache_kwargs) + hit = self.cache.get(cache_key) + if hit is not None: + _log.debug("LLM cache HIT key=%s", cache_key[:12]) + return hit + + kwargs.setdefault("timeout", self.timeout) + last_exc: Exception | None = None + for attempt in range(self.max_retries + 1): + try: + resp = self._client.chat.completions.create( + model=self.model, messages=messages, **kwargs + ) + content = resp.choices[0].message.content or "" + if cache_key is not None and content: + 
self.cache.put(cache_key, self.model, content) + return content + except _RETRYABLE as e: + last_exc = e + except APIError as e: + if not _is_retryable_status(e): + raise + last_exc = e + if attempt < self.max_retries: + wait = min(8.0, 2 ** attempt) + _log.warning( + "LLM call failed (%s); retry %d/%d after %.1fs", + _format_exc(last_exc), attempt + 1, self.max_retries, wait, + ) + time.sleep(wait) + assert last_exc is not None + raise last_exc + + def chat_json(self, messages: list[dict[str, str]], **kwargs) -> dict[str, Any]: + return _parse_json_text(self.chat(messages, **kwargs)) + + # -- 异步路径 ------------------------------------------------------- + async def chat_async(self, messages: list[dict[str, str]], **kwargs) -> str: + self._ensure_ready() + use_cache = kwargs.pop("use_cache", True) + label = kwargs.pop("label", "llm_call") + cache_kwargs = {k: kwargs.get(k) for k in ("temperature", "top_p", "max_tokens", "response_format")} + cache_key: str | None = None + prompt_text = self._prompt_text(messages) + t0 = time.monotonic() + + if use_cache and self.cache is not None: + cache_key = make_key(self.model, messages, **cache_kwargs) + hit = self.cache.get(cache_key) + if hit is not None: + _log.debug("LLM cache HIT key=%s", cache_key[:12]) + return hit + + last_exc: Exception | None = None + for attempt in range(self.max_retries + 1): + try: + async with self._sem: + resp = await asyncio.wait_for( + self._aclient.chat.completions.create( + model=self.model, messages=messages, **kwargs, + ), + timeout=self.timeout, + ) + content = resp.choices[0].message.content or "" + if cache_key is not None and content: + self.cache.put(cache_key, self.model, content) + return content + except _RETRYABLE as e: + last_exc = e + except APIError as e: + if not _is_retryable_status(e): + raise + last_exc = e + if attempt < self.max_retries: + wait = min(8.0, 2 ** attempt) + _log.warning( + "LLM async call failed (%s); retry %d/%d after %.1fs", + _format_exc(last_exc), attempt 
+ 1, self.max_retries, wait, + ) + await asyncio.sleep(wait) + assert last_exc is not None + raise last_exc + + async def chat_json_async( + self, messages: list[dict[str, str]], **kwargs + ) -> dict[str, Any]: + return _parse_json_text(await self.chat_async(messages, **kwargs)) diff --git a/fastapi_modules/fastapi_leaudit/govdoc_engine/models.py b/fastapi_modules/fastapi_leaudit/govdoc_engine/models.py new file mode 100644 index 0000000..ec56a77 --- /dev/null +++ b/fastapi_modules/fastapi_leaudit/govdoc_engine/models.py @@ -0,0 +1,77 @@ +"""公文审查的核心数据模型。""" + +from __future__ import annotations +from typing import Any, Literal +from pydantic import BaseModel, Field + + +Role = Literal[ + "title", "doc_number", "recipient", + "heading_1", "heading_2", "heading_3", "heading_4", + "body", "attachment_marker", "attachment_title", "signature", "date", + "no_text_marker", "unknown", "any", +] +Severity = Literal["error", "warning", "info"] + + +class ParagraphStyle(BaseModel): + font_eastasia: str | None = None + font_ascii: str | None = None + font_size_pt: float | None = None + bold: bool = False + italic: bool = False + line_spacing: float | None = None + line_spacing_rule: str | None = None + alignment: str = "left" + first_line_indent_pt: float = 0.0 + + +class Run(BaseModel): + text: str + style: ParagraphStyle + + +class Paragraph(BaseModel): + index: int + text: str + runs: list[Run] + style: ParagraphStyle + role: Role | None = None + role_confidence: float = 1.0 + in_table: bool = False + in_header: bool = False + in_footer: bool = False + + +class Table(BaseModel): + index: int + rows: list[list[str]] + + +class Document(BaseModel): + meta: dict[str, Any] = Field(default_factory=dict) + paragraphs: list[Paragraph] + tables: list[Table] = Field(default_factory=list) + + +class Location(BaseModel): + paragraph_index: int + role: Role | None = None + char_start: int = 0 + char_end: int = 0 + context: str = "" + + +class Finding(BaseModel): + finding_id: str + 
rule_id: str + rule_name: str + severity: Severity + category: str + location: Location + actual: dict[str, Any] = Field(default_factory=dict) + expected: dict[str, Any] = Field(default_factory=dict) + message: str + suggestion: str = "" + evidence: str = "" + confidence: float = 1.0 diff --git a/fastapi_modules/fastapi_leaudit/govdoc_engine/parser/__init__.py b/fastapi_modules/fastapi_leaudit/govdoc_engine/parser/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/fastapi_modules/fastapi_leaudit/govdoc_engine/parser/docx_parser.py b/fastapi_modules/fastapi_leaudit/govdoc_engine/parser/docx_parser.py new file mode 100644 index 0000000..9bd15f0 --- /dev/null +++ b/fastapi_modules/fastapi_leaudit/govdoc_engine/parser/docx_parser.py @@ -0,0 +1,152 @@ +"""解析 .docx → Document 对象。 + +文档顺序遍历 body:顶级段落 + 表格内段落都纳入 paragraphs, +后续 role tagging 与规则评估都能扫到表格内的内容。 +""" + +from __future__ import annotations +from pathlib import Path +from docx import Document as DocxDocument +from docx.oxml.ns import qn +from docx.text.paragraph import Paragraph as DocxParagraph +from lxml import etree + +from fastapi_modules.fastapi_leaudit.govdoc_engine.models import Document, Paragraph, ParagraphStyle, Run, Table +from fastapi_modules.fastapi_leaudit.govdoc_engine.parser.style_resolver import StyleResolver + + +_ALIGN_MAP = {0: "left", 1: "center", 2: "right", 3: "justify"} + + +def _read_run_style(run, p_elem, resolver: StyleResolver) -> ParagraphStyle: + rs = resolver.resolve_run(p_elem, run._element) + return ParagraphStyle( + font_eastasia=rs.font_eastasia, + font_ascii=rs.font_ascii, + font_size_pt=rs.size_pt, + bold=bool(rs.bold) if rs.bold is not None else False, + italic=bool(rs.italic) if rs.italic is not None else False, + ) + + +def _read_paragraph_style(p, resolver: StyleResolver) -> ParagraphStyle: + pf = p.paragraph_format + alignment = ( + _ALIGN_MAP.get(pf.alignment, "left") if pf.alignment is not None else "left" + ) + spacing_pt = float(pf.line_spacing) if 
pf.line_spacing is not None else None + indent = pf.first_line_indent + indent_pt = float(indent.pt) if indent is not None else 0.0 + + if p.runs: + base = _read_run_style(p.runs[0], p._element, resolver) + else: + rs = resolver.resolve_paragraph(p._element) + base = ParagraphStyle( + font_eastasia=rs.font_eastasia, + font_ascii=rs.font_ascii, + font_size_pt=rs.size_pt, + bold=bool(rs.bold) if rs.bold is not None else False, + italic=bool(rs.italic) if rs.italic is not None else False, + ) + base.alignment = alignment + base.line_spacing = spacing_pt + base.first_line_indent_pt = indent_pt + return base + + +def _is_in_table(p_elem) -> bool: + parent = p_elem.getparent() + while parent is not None: + if etree.QName(parent).localname == "tbl": + return True + parent = parent.getparent() + return False + + +def _iter_body_paragraphs(docx): + """文档顺序遍历 body 下所有 w:p(含表格内)。""" + for p_elem in docx.element.body.iter(qn("w:p")): + yield p_elem + + +def _iter_header_footer_paragraphs(docx): + """yield (DocxParagraph, p_elem, in_header, in_footer),跨 section 去重。""" + seen: set[int] = set() + for section in docx.sections: + targets = [ + ("header", section.header), + ("first_header", section.first_page_header), + ("even_header", section.even_page_header), + ("footer", section.footer), + ("first_footer", section.first_page_footer), + ("even_footer", section.even_page_footer), + ] + for kind, hf in targets: + if hf is None: + continue + try: + if hf.is_linked_to_previous: + continue + except Exception: + pass + in_header = "header" in kind + for p in hf.paragraphs: + pid = id(p._element) + if pid in seen: + continue + seen.add(pid) + yield p, p._element, in_header, not in_header + + +def parse_docx(path: str | Path) -> Document: + path = Path(path) + docx = DocxDocument(path) + resolver = StyleResolver(docx) + + paragraphs: list[Paragraph] = [] + idx = 0 + # 1) body:含表格内段落 + for p_elem in _iter_body_paragraphs(docx): + p = DocxParagraph(p_elem, docx.part) + runs = [ + 
Run(text=r.text, style=_read_run_style(r, p_elem, resolver)) + for r in p.runs + ] + style = _read_paragraph_style(p, resolver) + paragraphs.append(Paragraph( + index=idx, + text=p.text, + runs=runs, + style=style, + in_table=_is_in_table(p_elem), + )) + idx += 1 + # 2) headers / footers:附在末尾,role tagger 也能扫到 + for p, p_elem, in_header, in_footer in _iter_header_footer_paragraphs(docx): + runs = [ + Run(text=r.text, style=_read_run_style(r, p_elem, resolver)) + for r in p.runs + ] + style = _read_paragraph_style(p, resolver) + paragraphs.append(Paragraph( + index=idx, + text=p.text, + runs=runs, + style=style, + in_table=_is_in_table(p_elem), + in_header=in_header, + in_footer=in_footer, + )) + idx += 1 + + tables = [] + for tidx, t in enumerate(docx.tables): + rows = [[cell.text for cell in row.cells] for row in t.rows] + tables.append(Table(index=tidx, rows=rows)) + + return Document( + meta={"path": str(path), "page_count": len(docx.sections)}, + paragraphs=paragraphs, + tables=tables, + ) diff --git a/fastapi_modules/fastapi_leaudit/govdoc_engine/parser/entities.py b/fastapi_modules/fastapi_leaudit/govdoc_engine/parser/entities.py new file mode 100644 index 0000000..b8d08af --- /dev/null +++ b/fastapi_modules/fastapi_leaudit/govdoc_engine/parser/entities.py @@ -0,0 +1,27 @@ +"""语义实体:把段落 + 字段值 + 样式合在一起。""" + +from __future__ import annotations +from typing import Any, Literal +from pydantic import BaseModel, Field +from fastapi_modules.fastapi_leaudit.govdoc_engine.models import ParagraphStyle + + +EntitySource = Literal["structural", "llm", "derived"] + + +class SemanticEntity(BaseModel): + """公文中的一个语义单元(标题 / 发文字号 / 主送机关 / ...)。 + + - structural:name 与某个 role 一一对应,paragraph_indices 非空,style 可用。 + - derived:从其他实体推导(如 wenzhong 从 title 末尾),paragraph_indices 借用源段落。 + - llm:仅当结构 / 派生路径都失败时启用,paragraph_indices 可能为空。 + """ + + name: str + text: str = "" + paragraph_indices: list[int] = Field(default_factory=list) + primary_role: str | None = None + style: 
# The eight built-in entity names (also used by schema validation to detect name clashes).
BUILTIN_ENTITY_NAMES: frozenset[str] = frozenset({
    "title", "doc_number", "recipient", "date",
    "signature", "attachments", "wenzhong", "issuer",
})


# Prompt descriptions used when a built-in entity falls back to LLM extraction (Phase B).
BUILTIN_LLM_DESCRIPTION: dict[str, str] = {
    "title": "公文主标题(不含发文字号)",
    "doc_number": "X发〔YYYY〕N号 形式的发文字号",
    "recipient": "公文抬头的接收机关名称",
    "date": "末尾的成文日期原文",
    "signature": "末尾的发文机关署名",
    "attachments": "附件清单(数组,每项含 序号 与 名称)",
    "wenzhong": "公文文种(决议/决定/通知/通报/请示/批复 等 15 种之一)",
    "issuer": "发文机关全称",
}


# One-to-one mapping from paragraph role to entity name.
_ROLE_ENTITY_MAP = {
    "title": "title",
    "doc_number": "doc_number",
    "recipient": "recipient",
    "date": "date",
    "signature": "signature",
}


# Both the ASCII and the full-width form of each punctuation mark are accepted;
# the previous classes listed the ASCII form twice, so full-width input never matched.
_ATTACHMENT_HEAD_RE = re.compile(r"^附件\d*[:：]\s*")
_ATTACHMENT_ITEM_RE = re.compile(r"^\s*(\d+)[.．、)）]\s*(.+)$")

# Roles a paragraph may carry and still be read as a continuation line of the
# attachment list.  "heading_3" is included because the rule tagger's numbered
# heading pattern (digits followed by a dot) is exactly what a "2. name" list
# line looks like; the item regex below still has to match for the line to count.
_ATTACHMENT_FOLLOW_ROLES = frozenset({
    "body", "unknown", "attachment_marker", "heading_3",
})

# The 15 statutory document types (wenzhong).
_WENZHONG_LIST = (
    "决议", "决定", "命令", "公报", "公告", "通告",
    "意见", "通知", "通报", "报告", "请示", "批复",
    "议案", "函", "纪要",
)
_WENZHONG_RE = re.compile("(" + "|".join(_WENZHONG_LIST) + ")$")

# "<issuer>关于...的<wenzhong>"  ->  issuer is the text before the first "关于".
_ISSUER_PREFIX_RE = re.compile(r"^(.+?)关于")


class EntityBuilder:
    """Extract the eight built-in semantic entities from a role-tagged Document."""

    def build(self, doc: Document) -> dict[str, SemanticEntity | None]:
        """Return a mapping of every built-in entity name to its entity or None.

        Structural entities come straight from paragraph roles, attachments are
        assembled from the attachment marker and the lines that follow it, and
        wenzhong / issuer are derived from the title (issuer falls back to the
        signature).
        """
        entities: dict[str, SemanticEntity | None] = dict.fromkeys(
            BUILTIN_ENTITY_NAMES
        )

        # 1) one-to-one role -> entity
        for role, name in _ROLE_ENTITY_MAP.items():
            tagged = [p for p in doc.paragraphs if p.role == role]
            if not tagged:
                continue
            # The signature is taken from the last matching paragraph,
            # every other entity from the first one.
            target = tagged[-1] if name == "signature" else tagged[0]
            entities[name] = SemanticEntity(
                name=name,
                text=target.text.strip(),
                paragraph_indices=[target.index],
                primary_role=role,
                style=target.style,
                source="structural",
                confidence=target.role_confidence,
            )

        # 2) attachments: marker paragraph plus the numbered lines after it
        entities["attachments"] = self._build_attachments(doc)

        # 3) derived entities: wenzhong / issuer
        title_e = entities.get("title")
        if title_e:
            entities["wenzhong"] = self._derive_wenzhong(title_e)
            entities["issuer"] = self._derive_issuer(
                title_e, entities.get("signature")
            )
        elif entities.get("signature"):
            entities["issuer"] = self._derive_issuer(
                None, entities["signature"]
            )

        return entities

    # ---------- attachments ----------
    def _build_attachments(self, doc: Document) -> SemanticEntity | None:
        """Build the ``attachments`` entity from the first attachment marker.

        Returns None when the document has no ``attachment_marker`` paragraph
        or when no attachment item could be read.
        """
        marker_pos = next(
            (
                i for i, p in enumerate(doc.paragraphs)
                if p.role == "attachment_marker"
            ),
            None,
        )
        if marker_pos is None:
            return None

        marker = doc.paragraphs[marker_pos]
        items: list[dict] = []
        # Record the paragraph's own index, as done for the follow-up lines.
        para_idxs: list[int] = [marker.index]

        # Text that shares the marker line, e.g. "附件：1. name" or "附件：name".
        head = _ATTACHMENT_HEAD_RE.sub("", marker.text.strip())
        if head:
            mt = _ATTACHMENT_ITEM_RE.match(head)
            if mt:
                items.append(
                    {"序号": int(mt.group(1)), "名称": mt.group(2).strip()}
                )
            else:
                items.append({"序号": 1, "名称": head})

        # Follow-up lines: stop at the first paragraph whose role cannot belong
        # to the list, or at the first non-empty line that is not a numbered item.
        for p in doc.paragraphs[marker_pos + 1:]:
            if p.role and p.role not in _ATTACHMENT_FOLLOW_ROLES:
                break
            line = p.text.strip()
            if not line:
                continue
            mt = _ATTACHMENT_ITEM_RE.match(line)
            if not mt:
                break
            items.append(
                {"序号": int(mt.group(1)), "名称": mt.group(2).strip()}
            )
            para_idxs.append(p.index)

        if not items:
            return None

        text = "; ".join(f"{it['序号']}. {it['名称']}" for it in items)
        return SemanticEntity(
            name="attachments",
            text=text,
            paragraph_indices=para_idxs,
            primary_role="attachment_marker",
            style=marker.style,
            extra={"items": items},
            source="structural",
            confidence=0.9,
        )

    # ---------- derived entities ----------
    def _derive_wenzhong(
        self, title: SemanticEntity
    ) -> SemanticEntity | None:
        """Derive the document type from the suffix of the title, if it is one of the 15 types."""
        m = _WENZHONG_RE.search(title.text)
        if not m:
            return None
        return SemanticEntity(
            name="wenzhong",
            text=m.group(1),
            paragraph_indices=list(title.paragraph_indices),
            primary_role="title",
            extra={"derived_from": "title.suffix"},
            source="derived",
            confidence=0.95,
        )

    def _derive_issuer(
        self,
        title: SemanticEntity | None,
        signature: SemanticEntity | None,
    ) -> SemanticEntity | None:
        """Derive the issuer from the title prefix, falling back to the signature text."""
        if title:
            m = _ISSUER_PREFIX_RE.match(title.text)
            if m:
                return SemanticEntity(
                    name="issuer",
                    text=m.group(1),
                    paragraph_indices=list(title.paragraph_indices),
                    primary_role="title",
                    extra={"derived_from": "title.prefix"},
                    source="derived",
                    confidence=0.9,
                )
        if signature:
            return SemanticEntity(
                name="issuer",
                text=signature.text,
                paragraph_indices=list(signature.paragraph_indices),
                primary_role="signature",
                style=signature.style,
                extra={"derived_from": "signature"},
                source="derived",
                confidence=0.8,
            )
        return None
# Prompt template for differential extraction; the doubled braces around
# {example} render a literal JSON object in the final prompt.
_PROMPT_HEAD = """从下面的公文中抽取以下指定字段,仅以 JSON 输出。

【公文内容(顺序段落)】
{text}

【需要抽取的字段】
{spec_block}

【输出格式】
仅 JSON:{{{example}}}
未识别的字段填 ""(list 类型填 [])。
"""


def _build_doc_text(doc: Document) -> str:
    """Render the document as one "[index] text" line per paragraph."""
    return "\n".join(f"[{p.index}] {p.text}" for p in doc.paragraphs)


def _example_for(spec: dict[str, dict]) -> str:
    """Build the example JSON members shown to the model: [] for list fields, "" otherwise."""
    return ", ".join(
        f'"{name}": []' if meta.get("type", "string") == "list" else f'"{name}": ""'
        for name, meta in spec.items()
    )


class FieldExtractor:
    """Differential LLM field extraction.

    ``extract_missing(doc, spec)``: ``spec`` names the fields to extract; an
    empty spec (or a missing document) never calls the LLM.
    """

    def __init__(self, llm_client: LlmClient):
        self.client = llm_client

    @staticmethod
    def _label(spec: dict[str, dict]) -> str:
        """Label passed to the LLM client for this request, derived from the field names."""
        return "extract_missing__" + ",".join(spec.keys())

    def _build_messages_for_spec(
        self, doc: Document, spec: dict[str, dict]
    ) -> list[dict[str, str]]:
        """Build the single user message asking for exactly the fields in ``spec``."""
        spec_lines = [
            f"- {name}: {meta.get('description', name)}"
            f"({meta.get('type', 'string')})"
            for name, meta in spec.items()
        ]
        prompt = _PROMPT_HEAD.format(
            text=_build_doc_text(doc),
            spec_block="\n".join(spec_lines) or "(无)",
            example=_example_for(spec),
        )
        return [{"role": "user", "content": prompt}]

    def _shape_missing(
        self, spec: dict[str, dict], resp: dict
    ) -> dict[str, Any]:
        """Project the LLM reply onto ``spec``, filling absent fields with "" or [].

        A reply that is not a JSON object (a list, a bare string, None) is
        treated like an empty reply instead of raising, so one malformed model
        answer cannot abort the extraction.
        """
        if not isinstance(resp, dict):
            resp = {}
        out: dict[str, Any] = {}
        for name, meta in spec.items():
            empty: Any = [] if meta.get("type") == "list" else ""
            out[name] = resp.get(name) or empty
        return out

    def extract_missing(
        self, doc: Document | None, spec: dict[str, dict]
    ) -> dict[str, Any]:
        """Synchronously extract the fields in ``spec``; returns {} when there is nothing to do.

        LLM failures are logged and degrade to empty values for every field.
        """
        if not spec or doc is None:
            return {}
        try:
            resp = self.client.chat_json(
                self._build_messages_for_spec(doc, spec),
                label=self._label(spec),
            )
        except Exception as e:
            # Best effort by design: extraction must not fail the whole audit.
            _log.warning("Differential extraction failed: %s", _format_exc(e))
            resp = {}
        return self._shape_missing(spec, resp)

    async def extract_missing_async(
        self, doc: Document | None, spec: dict[str, dict]
    ) -> dict[str, Any]:
        """Async counterpart of :meth:`extract_missing` with the same contract."""
        if not spec or doc is None:
            return {}
        try:
            resp = await self.client.chat_json_async(
                self._build_messages_for_spec(doc, spec),
                label=self._label(spec),
            )
        except Exception as e:
            # Best effort by design: extraction must not fail the whole audit.
            _log.warning("Differential extraction failed: %s", _format_exc(e))
            resp = {}
        return self._shape_missing(spec, resp)
class UnsupportedFormat(Exception):
    """Raised when the file extension is neither .docx nor convertible to it."""


class ConversionError(Exception):
    """Raised when soffice is unavailable or the conversion to .docx fails."""


_SUPPORTED_DIRECT = {".docx"}
_SUPPORTED_CONVERT = {".doc", ".wps"}
# Locations tried after the configured path when looking for the soffice binary.
_SOFFICE_FALLBACK_PATHS = (
    "/opt/homebrew/bin/soffice",
    "/usr/local/bin/soffice",
    "/Applications/LibreOffice.app/Contents/MacOS/soffice",
    "/usr/bin/soffice",
)


def load_to_docx(src: Path) -> Path:
    """Return a .docx path for ``src``; .doc / .wps files are converted via soffice.

    Raises:
        UnsupportedFormat: the extension is not .docx, .doc or .wps.
        ConversionError: soffice is missing or the conversion failed.
    """
    ext = src.suffix.lower()
    if ext in _SUPPORTED_DIRECT:
        return src
    if ext in _SUPPORTED_CONVERT:
        return _convert_via_soffice(src)
    raise UnsupportedFormat(f"unsupported file type: {ext}")


def _convert_via_soffice(src: Path) -> Path:
    """Convert ``src`` to .docx next to the source file using headless soffice.

    Raises:
        ConversionError: soffice cannot be found or started, times out (60 s),
            exits non-zero, or does not produce the expected output file.
    """
    soffice = _resolve_soffice_path(get_settings().soffice_path)

    out_dir = src.parent
    cmd = [
        soffice, "--headless", "--convert-to", "docx",
        "--outdir", str(out_dir), str(src),
    ]
    try:
        result = subprocess.run(
            cmd, capture_output=True, timeout=60,
        )
    except subprocess.TimeoutExpired as e:
        raise ConversionError("soffice timeout") from e
    except OSError as e:
        # e.g. the resolved binary is not executable; keep the module's
        # single error type so callers only have to handle ConversionError.
        raise ConversionError(f"soffice could not be started: {e}") from e

    if result.returncode != 0:
        raise ConversionError(
            f"soffice exit {result.returncode}: {result.stderr.decode(errors='ignore')}"
        )

    out = out_dir / (src.stem + ".docx")
    if not out.exists():
        raise ConversionError(f"expected output not found: {out}")
    return out


def _resolve_soffice_path(configured: str) -> str:
    """Return the first usable soffice executable.

    The configured value is tried first, then the fallback locations.  Each
    candidate is looked up on PATH via ``shutil.which`` and, failing that,
    accepted if it names an existing regular file.

    Empty candidates are skipped: ``Path("")`` is the current directory, which
    always "exists", so an unset setting used to resolve to the empty string.
    Directories are rejected for the same reason.

    Raises:
        ConversionError: no candidate resolves to an executable or file.
    """
    checked: list[str] = []
    for candidate in (configured, *_SOFFICE_FALLBACK_PATHS):
        if not candidate or candidate in checked:
            continue
        checked.append(candidate)

        resolved = shutil.which(candidate)
        if resolved:
            return resolved
        if Path(candidate).is_file():
            return candidate

    raise ConversionError(
        f"soffice not found; checked: {', '.join(checked)}. "
        "Install LibreOffice or set SOFFICE_PATH."
    )
_log = logging.getLogger(__name__)


# Roles the LLM may answer with.  "attachment_title" is listed because the
# rule-based tagger in this package emits it; without it a correct LLM answer
# would be downgraded to "unknown".
# NOTE(review): assumes the Role type in models also admits "attachment_title" - confirm.
VALID_ROLES = [
    "title", "doc_number", "recipient",
    "heading_1", "heading_2", "heading_3", "heading_4",
    "body", "attachment_marker", "attachment_title", "signature", "date",
    "no_text_marker", "unknown",
]

# Confidence used when the model gives none or gives something non-numeric.
_DEFAULT_CONFIDENCE = 0.5


_PROMPT = """你是公文格式专家。下面是一份公文的段落列表,请为指定的"待定段落"判断其角色。

【全文段落(带索引和当前规则推测)】
{context}

【待定段落 idx={idx}】
文本: {text}
当前推测角色: {current_role}(置信度 {conf:.2f})

【角色取值范围】
{roles}

请综合公文结构判断该段落最可能的角色。

仅以 JSON 输出:
{{"role": "<角色>", "confidence": <0-1 浮点数>, "reason": "<简短理由>"}}
"""


class LlmTagger:
    """LLM fallback that re-examines the role of a single low-confidence paragraph."""

    def __init__(self, client: LlmClient):
        self.client = client

    def _build_prompt(self, doc: Document, target_idx: int) -> tuple[str, "object"]:
        """Return the prompt for paragraph ``target_idx`` together with that paragraph.

        Every paragraph is listed with its current role and the first 60
        characters of its text; the target paragraph is flagged in the listing.
        """
        ctx_lines = []
        for p in doc.paragraphs:
            tag = "← 待定" if p.index == target_idx else ""
            ctx_lines.append(f"[{p.index}] role={p.role} text={p.text[:60]} {tag}")
        ctx = "\n".join(ctx_lines)
        target = doc.paragraphs[target_idx]
        prompt = _PROMPT.format(
            context=ctx,
            idx=target_idx,
            text=target.text,
            current_role=target.role or "unknown",
            conf=target.role_confidence,
            roles=", ".join(VALID_ROLES),
        )
        return prompt, target

    @staticmethod
    def _keep_current(target) -> tuple[Role, float]:
        """Fallback result: the paragraph's existing role and confidence."""
        return target.role or "unknown", target.role_confidence  # type: ignore[return-value]

    def _interpret(self, resp: dict, target) -> tuple[Role, float]:
        """Turn the model's JSON reply into a ``(role, confidence)`` pair.

        - A reply that is not a JSON object keeps the paragraph's current role.
        - A role outside ``VALID_ROLES`` becomes "unknown".
        - A missing, non-numeric or NaN confidence becomes 0.5; the value is
          clamped to [0, 1] because it is later compared against a threshold.
        """
        if not isinstance(resp, dict):
            return self._keep_current(target)

        role = resp.get("role", "unknown")
        if role not in VALID_ROLES:
            role = "unknown"

        try:
            conf = float(resp.get("confidence", _DEFAULT_CONFIDENCE))
        except (TypeError, ValueError):
            conf = _DEFAULT_CONFIDENCE
        if conf != conf:  # NaN is the only value that differs from itself
            conf = _DEFAULT_CONFIDENCE
        conf = min(1.0, max(0.0, conf))
        return role, conf  # type: ignore[return-value]

    def disambiguate(self, doc: Document, target_idx: int) -> tuple[Role, float]:
        """Ask the LLM for the role of paragraph ``target_idx`` (synchronous).

        On any LLM error the paragraph's current role and confidence are returned.
        """
        prompt, target = self._build_prompt(doc, target_idx)
        label = f"role_tag_p{target_idx}"
        try:
            resp = self.client.chat_json(
                [{"role": "user", "content": prompt}], label=label,
            )
        except Exception as e:
            # Best effort by design: tagging falls back to the rule result.
            _log.warning("Role disambiguation skipped (LLM error): %s", _format_exc(e))
            return self._keep_current(target)
        return self._interpret(resp, target)

    async def disambiguate_async(
        self, doc: Document, target_idx: int
    ) -> tuple[Role, float]:
        """Async counterpart of :meth:`disambiguate` with the same contract."""
        prompt, target = self._build_prompt(doc, target_idx)
        label = f"role_tag_p{target_idx}"
        try:
            resp = await self.client.chat_json_async(
                [{"role": "user", "content": prompt}], label=label,
            )
        except Exception as e:
            # Best effort by design: tagging falls back to the rule result.
            _log.warning("Role disambiguation skipped (LLM error): %s", _format_exc(e))
            return self._keep_current(target)
        return self._interpret(resp, target)
# Heading / marker patterns.  Every punctuation class accepts both the ASCII
# and the full-width form.  The parenthesised heading patterns must match a
# literal bracket: written as a bare regex group they match ANY line starting
# with a numeral, which mislabels ordinary body text as a heading.
HEADING_1_RE = re.compile(r"^[一二三四五六七八九十百]+、")
HEADING_2_RE = re.compile(r"^[(（][一二三四五六七八九十]+[)）]")
HEADING_3_RE = re.compile(r"^\d+[.．]")
HEADING_4_RE = re.compile(r"^[(（]\d+[)）]")
DOC_NUMBER_RE = re.compile(r"[一-龥]+[〔\[]\d{4}[〕\]]第?\d+号")
DATE_RE = re.compile(
    r"^\d{4}年\d{1,2}月\d{1,2}日$"
    r"|^[一二三四五六七八九十○〇零]+年[一二三四五六七八九十○〇零]+月[一二三四五六七八九十○〇零]+日$"
)
ATTACHMENT_RE = re.compile(r"^附件[:：1-9]")
NO_TEXT_RE = re.compile(r"^[(（]\s*此页无正文\s*[)）]")
RECIPIENT_TAIL_RE = re.compile(r"[:：]\s*$")
RECIPIENT_HINTS = (
    "局", "委", "府", "厅", "办", "公司", "各", "处室",
    "委员会", "署", "院", "部", "司", "处",
)
RECIPIENT_BLOCKLIST = (
    "现将", "兹", "经研究", "为做好", "为深入", "为进一步",
    "根据", "如下", "汇报", "通知如下", "请示如下",
)

# Keywords that make a short paragraph near the end look like an issuer signature.
_SIGNATURE_HINTS = ("局", "公司", "委员会", "人民政府", "办公厅", "办公室")

# Used to strip "附件N：" and a leading "N." from an attachment marker line.
_ATTACHMENT_LABEL_RE = re.compile(r"^附件\d*[:：]\s*")
_ATTACHMENT_NUMBER_RE = re.compile(r"^\d+[.．、)）]\s*")


class RuleBasedTagger:
    """Assign a role and a confidence to every paragraph from position, text pattern and font."""

    def tag(self, doc: Document) -> None:
        """Set ``role`` and ``role_confidence`` on every paragraph of ``doc`` in place."""
        n = len(doc.paragraphs)
        for i, p in enumerate(doc.paragraphs):
            role, conf = self._classify(p, i, n, doc)
            p.role = role
            p.role_confidence = conf

    def _classify(
        self, p: Paragraph, idx: int, total: int, doc: Document
    ) -> tuple[Role, float]:
        """Classify one paragraph; the checks run in priority order and the first hit wins.

        Args:
            p: paragraph to classify.
            idx: position of ``p`` in ``doc.paragraphs``.
            total: number of paragraphs in the document.
            doc: the whole document (needed to look back for attachment markers).
        """
        text = p.text.strip()

        if not text:
            return ("unknown", 0.5)

        if NO_TEXT_RE.match(text):
            return ("no_text_marker", 1.0)

        if ATTACHMENT_RE.match(text):
            return ("attachment_marker", 0.95)

        if DATE_RE.match(text):
            return ("date", 0.9)

        # A document number is only expected near the top of the document.
        if DOC_NUMBER_RE.search(text) and idx <= 5:
            return ("doc_number", 0.95)

        # The first paragraph, or a large centred paragraph among the first three.
        if idx == 0 or (
            idx <= 2
            and p.style.alignment == "center"
            and (p.style.font_size_pt or 0) >= 18
        ):
            return ("title", 0.95)

        font = (p.style.font_eastasia or "").strip()
        size = p.style.font_size_pt or 0

        if self._is_attachment_title(p, idx, doc):
            return ("attachment_title", 0.9)

        # Headings: the expected typeface raises the confidence.
        if HEADING_1_RE.match(text):
            conf = 0.95 if "黑体" in font else 0.7
            return ("heading_1", conf)

        if HEADING_2_RE.match(text):
            conf = 0.95 if "楷体" in font else 0.7
            return ("heading_2", conf)

        if HEADING_3_RE.match(text):
            conf = 0.9 if "仿宋" in font else 0.65
            return ("heading_3", conf)

        if HEADING_4_RE.match(text):
            return ("heading_4", 0.85)

        # Recipient line: short, near the top, ends with a colon, names an
        # organisation and does not read like the opening of the body text.
        if (
            idx <= 6
            and 3 <= len(text) <= 50
            and RECIPIENT_TAIL_RE.search(text)
            and any(kw in text for kw in RECIPIENT_HINTS)
            and not any(kw in text for kw in RECIPIENT_BLOCKLIST)
        ):
            return ("recipient", 0.9)

        # Signature: short organisation-like line among the last three paragraphs.
        if total - idx <= 3 and 5 <= len(text) <= 30 and any(
            kw in text for kw in _SIGNATURE_HINTS
        ):
            return ("signature", 0.7)

        if size >= 14 or font:
            return ("body", 0.85)

        return ("unknown", 0.4)

    @staticmethod
    def _is_attachment_title(p: Paragraph, idx: int, doc: Document) -> bool:
        """Detect the title on the first page of an attachment body.

        Such a paragraph must not be treated as ordinary body text (which would
        subject it to rule GW-F-004).  It has to be centred, at least 18 pt, set
        in a "小标宋" typeface, and lie within 12 paragraphs after an attachment
        marker whose name (if it states one) overlaps with the paragraph text.
        """
        if idx <= 0:
            return False
        text = p.text.strip()
        font = (p.style.font_eastasia or "").strip()
        if (
            p.style.alignment != "center"
            or (p.style.font_size_pt or 0) < 18
            or "小标宋" not in font
        ):
            return False

        # Nearest preceding attachment marker (by role, or by text for
        # paragraphs whose role does not say so).
        marker_index = None
        marker_text = ""
        for prev in reversed(doc.paragraphs[:idx]):
            if prev.role == "attachment_marker" or ATTACHMENT_RE.match(prev.text.strip()):
                marker_index = prev.index
                marker_text = prev.text.strip()
                break
        if marker_index is None or idx - marker_index > 12:
            return False

        attachment_name = _ATTACHMENT_LABEL_RE.sub("", marker_text).strip()
        attachment_name = _ATTACHMENT_NUMBER_RE.sub("", attachment_name).strip()
        return (
            not attachment_name
            or text == attachment_name
            or text in attachment_name
            or attachment_name in text
        )
# Namespace of theme1.xml (DrawingML).
_DML_NS = "http://schemas.openxmlformats.org/drawingml/2006/main"

# Lower-cased on/off spellings that switch a toggle property such as w:b or w:i off.
_OFF_VALUES = frozenset({"0", "false", "off"})


@dataclass
class ResolvedRunStyle:
    """Effective run formatting; a field is None while no level has defined it."""

    font_eastasia: str | None = None
    font_ascii: str | None = None
    size_pt: float | None = None
    bold: bool | None = None
    italic: bool | None = None


def _empty_to_none(s: str | None) -> str | None:
    """Return ``s`` stripped, or None when it is None or blank."""
    if s is None:
        return None
    s = s.strip()
    return s or None


class StyleResolver:
    """Resolve effective fonts through the style inheritance chain and the theme.

    The style table and theme are parsed once in the constructor; each
    ``resolve_run()`` call afterwards only walks the style chain.
    """

    def __init__(self, docx):
        self._theme = self._load_theme(docx)
        self._styles, self._doc_defaults = self._load_styles(docx)

    # ---- theme --------------------------------------------------------
    def _load_theme(self, docx) -> dict[tuple[str, str], str | None]:
        """Return ``{(axis, scheme_attr): font_name}`` read from theme1.xml.

        ``axis`` is an rFonts axis ("ascii", "hAnsi", "eastAsia", "cs") and
        ``scheme_attr`` a theme reference such as "majorEastAsia" or
        "minorAscii".  An empty dict is returned when the package has no theme
        part or the part is not well-formed XML.
        """
        out: dict[tuple[str, str], str | None] = {}
        try:
            theme_part = next(
                p for p in docx.part.package.parts
                if p.partname.endswith("/theme/theme1.xml")
            )
        except StopIteration:
            return out
        try:
            root = etree.fromstring(theme_part.blob)
        except etree.XMLSyntaxError:
            return out

        ns = {"a": _DML_NS}
        for kind, font_tag in (("major", "majorFont"), ("minor", "minorFont")):
            font_elem = root.find(f".//a:fontScheme/a:{font_tag}", ns)
            if font_elem is None:
                continue
            latin = font_elem.find("a:latin", ns)
            ea = font_elem.find("a:ea", ns)
            cs = font_elem.find("a:cs", ns)
            # When <a:ea> is empty, fall back to the Simplified Chinese ("Hans") entry.
            ea_val = _empty_to_none(ea.get("typeface")) if ea is not None else None
            if ea_val is None:
                hans = font_elem.find('a:font[@script="Hans"]', ns)
                if hans is not None:
                    ea_val = _empty_to_none(hans.get("typeface"))
            latin_val = _empty_to_none(latin.get("typeface")) if latin is not None else None
            cs_val = _empty_to_none(cs.get("typeface")) if cs is not None else None

            # The ascii and hAnsi axes may each reference either Latin scheme name.
            out[("ascii", f"{kind}Ascii")] = latin_val
            out[("ascii", f"{kind}HAnsi")] = latin_val
            out[("hAnsi", f"{kind}HAnsi")] = latin_val
            out[("hAnsi", f"{kind}Ascii")] = latin_val
            out[("eastAsia", f"{kind}EastAsia")] = ea_val
            out[("cs", f"{kind}Bidi")] = cs_val
        return out

    # ---- style table --------------------------------------------------
    def _load_styles(
        self, docx
    ) -> tuple[dict[str, dict], ResolvedRunStyle | None]:
        """Return ``(styles, doc_defaults)`` read from styles.xml.

        ``styles`` maps a style id to its raw ``rpr`` / ``ppr_rpr`` elements and
        its ``based_on`` / ``link`` style ids; ``doc_defaults`` is the run
        formatting found under docDefaults, or None.
        """
        out: dict[str, dict] = {}
        defaults: ResolvedRunStyle | None = None
        try:
            styles_root = docx.part._styles_part.element
        except (AttributeError, KeyError):
            return out, defaults
        if styles_root is None:
            return out, defaults

        # docDefaults
        ddef = styles_root.find(qn("w:docDefaults"))
        if ddef is not None:
            rdef = ddef.find(qn("w:rPrDefault"))
            if rdef is not None:
                defaults = self._read_rpr(rdef.find(qn("w:rPr")))

        # individual styles
        for style in styles_root.findall(qn("w:style")):
            sid = style.get(qn("w:styleId"))
            if not sid:
                continue
            rpr = style.find(qn("w:rPr"))
            ppr = style.find(qn("w:pPr"))
            ppr_rpr = ppr.find(qn("w:rPr")) if ppr is not None else None
            bo = style.find(qn("w:basedOn"))
            based_on = bo.get(qn("w:val")) if bo is not None else None
            link = style.find(qn("w:link"))
            link_id = link.get(qn("w:val")) if link is not None else None
            out[sid] = {
                "rpr": rpr,
                "ppr_rpr": ppr_rpr,
                "based_on": based_on,
                "link": link_id,
            }
        return out, defaults

    # ---- reading an rPr -----------------------------------------------
    @staticmethod
    def _toggle_value(raw: str | None) -> bool:
        """Interpret the ``w:val`` of a toggle element; a missing value means "on"."""
        return (raw or "").strip().lower() not in _OFF_VALUES

    def _read_toggle(self, rpr, tag: str) -> bool | None:
        """Return the state of toggle child ``tag`` of ``rpr``, or None when absent."""
        elem = rpr.find(qn(tag))
        if elem is None:
            return None
        return self._toggle_value(elem.get(qn("w:val")))

    def _read_rpr(self, rpr) -> ResolvedRunStyle | None:
        """Read fonts, size, bold and italic from one ``w:rPr`` element (None in, None out)."""
        if rpr is None:
            return None
        rs = ResolvedRunStyle()
        rfonts = rpr.find(qn("w:rFonts"))
        if rfonts is not None:
            rs.font_eastasia = self._resolve_font_axis(rfonts, "eastAsia")
            rs.font_ascii = self._resolve_font_axis(rfonts, "ascii")
        sz = rpr.find(qn("w:sz"))
        if sz is not None and sz.get(qn("w:val")):
            try:
                # w:sz is expressed in half-points.
                rs.size_pt = float(sz.get(qn("w:val"))) / 2.0
            except ValueError:
                pass
        # <w:b w:val="0"/> explicitly switches bold OFF; the mere presence of
        # the element is therefore not enough to conclude "bold".
        rs.bold = self._read_toggle(rpr, "w:b")
        rs.italic = self._read_toggle(rpr, "w:i")
        return rs

    def _resolve_font_axis(self, rfonts, axis: str) -> str | None:
        """On a single rFonts element an explicit font wins over a theme reference."""
        explicit = _empty_to_none(rfonts.get(qn(f"w:{axis}")))
        if explicit:
            return explicit
        theme_attr = "cstheme" if axis == "cs" else f"{axis}Theme"
        theme = _empty_to_none(rfonts.get(qn(f"w:{theme_attr}")))
        if theme:
            return self._theme.get((axis, theme))
        return None

    # ---- merging ------------------------------------------------------
    @staticmethod
    def _fill(target: ResolvedRunStyle, source: ResolvedRunStyle | None) -> None:
        """Copy into ``target`` every field it has not set yet (is None) from ``source``."""
        if source is None:
            return
        if target.font_eastasia is None:
            target.font_eastasia = source.font_eastasia
        if target.font_ascii is None:
            target.font_ascii = source.font_ascii
        if target.size_pt is None:
            target.size_pt = source.size_pt
        if target.bold is None:
            target.bold = source.bold
        if target.italic is None:
            target.italic = source.italic

    def _resolve_style_chain(
        self, sid: str | None, _seen: set[str] | None = None
    ) -> ResolvedRunStyle | None:
        """Accumulate run formatting for style ``sid`` along link and basedOn.

        Nearer levels win: the style's own rPr, then its pPr/rPr, then the
        linked style, then the parent chain.  ``_seen`` guards against cycles.
        Returns None for an unknown id or a style that was already visited.
        """
        if sid is None:
            return None
        seen = _seen or set()
        if sid in seen:
            return None
        seen = seen | {sid}
        info = self._styles.get(sid)
        if info is None:
            return None
        # the two rPr elements of the style itself
        rs = ResolvedRunStyle()
        self._fill(rs, self._read_rpr(info.get("rpr")))
        self._fill(rs, self._read_rpr(info.get("ppr_rpr")))
        # linked character style, if any
        if info.get("link"):
            self._fill(rs, self._resolve_style_chain(info["link"], seen))
        # parent style
        if info.get("based_on"):
            self._fill(rs, self._resolve_style_chain(info["based_on"], seen))
        return rs

    def _fill_paragraph_and_defaults(self, rs: ResolvedRunStyle, p_elem) -> None:
        """Fill ``rs`` from the paragraph level downwards.

        Order: paragraph-mark rPr, the pStyle chain, the style with id
        "Normal" (if defined), then docDefaults.
        """
        if p_elem is not None:
            ppr = p_elem.find(qn("w:pPr"))
            if ppr is not None:
                self._fill(rs, self._read_rpr(ppr.find(qn("w:rPr"))))
                pstyle = ppr.find(qn("w:pStyle"))
                if pstyle is not None and pstyle.get(qn("w:val")):
                    self._fill(rs, self._resolve_style_chain(pstyle.get(qn("w:val"))))
        if "Normal" in self._styles:
            self._fill(rs, self._resolve_style_chain("Normal"))
        self._fill(rs, self._doc_defaults)

    # ---- entry points -------------------------------------------------
    def resolve_run(self, p_elem, run_elem) -> ResolvedRunStyle:
        """Resolve the effective style of one run; ``p_elem`` and ``run_elem`` may be None.

        Direct run formatting wins over everything contributed by the paragraph,
        its styles and the document defaults.
        """
        rs = ResolvedRunStyle()
        if run_elem is not None:
            self._fill(rs, self._read_rpr(run_elem.find(qn("w:rPr"))))
        self._fill_paragraph_and_defaults(rs, p_elem)
        return rs

    def resolve_paragraph(self, p_elem) -> ResolvedRunStyle:
        """Resolve the paragraph-level style (no run is read; pPr, styles and defaults only)."""
        rs = ResolvedRunStyle()
        self._fill_paragraph_and_defaults(rs, p_elem)
        return rs
import parse_docx +from fastapi_modules.fastapi_leaudit.govdoc_engine.parser.role_tagger import RoleTagger +from fastapi_modules.fastapi_leaudit.govdoc_engine.parser.extractor import FieldExtractor +from fastapi_modules.fastapi_leaudit.govdoc_engine.parser.entity_builder import ( + EntityBuilder, + BUILTIN_LLM_DESCRIPTION, +) +from fastapi_modules.fastapi_leaudit.govdoc_engine.parser.entities import SemanticEntity +from fastapi_modules.fastapi_leaudit.govdoc_engine.dsl.loader import load_rules +from fastapi_modules.fastapi_leaudit.govdoc_engine.dsl.schema import EntitySpec, RuleSet +from fastapi_modules.fastapi_leaudit.govdoc_engine.engine.runner import RuleRunner +from fastapi_modules.fastapi_leaudit.govdoc_engine.engine.result import AuditResult, CheckedRule +from fastapi_modules.fastapi_leaudit.govdoc_engine.engine.structure import build_outline, build_structure +from fastapi_modules.fastapi_leaudit.govdoc_engine.llm.client import LlmClient + +_log = logging.getLogger(__name__) + + +# ── 辅助函数 ──────────────────────────────────────────── + +def _outcomes_to_checked(outcomes) -> list[CheckedRule]: + """将规则执行结果汇总为 CheckedRule 列表。""" + rows: list[CheckedRule] = [] + for o in outcomes: + if o.skipped: + status = "skipped" + elif o.findings: + status = "fail" + else: + status = "pass" + rows.append( + CheckedRule( + rule_id=o.rule.rule_id, + name=o.rule.name, + severity=o.rule.severity, + category=o.rule.category, + status=status, + skip_reason=o.skip_reason, + ) + ) + return rows + + +def _build_result( + docx_path: Path, doc, findings, entities, outcomes, +) -> AuditResult: + """从审查产物构建 AuditResult。""" + document_meta = { + "filename": docx_path.name, + "path": str(docx_path), + "page_count": doc.meta.get("page_count", 1), + "paragraph_count": len(doc.paragraphs), + } + result = AuditResult( + audit_id=f"A-{uuid.uuid4().hex[:8]}", + document=document_meta, + findings=findings, + entities=entities, + checked_rules=_outcomes_to_checked(outcomes), + 
structure=build_structure(doc), + outline=build_outline(doc), + ) + result.compute_summary() + return result + + +def _compute_missing_spec( + entities: dict[str, SemanticEntity | None], + custom_entities: list[EntitySpec], +) -> dict[str, dict]: + """计算哪些实体需要送 LLM 抽取。""" + spec: dict[str, dict] = {} + for name, desc in BUILTIN_LLM_DESCRIPTION.items(): + if entities.get(name) is None: + spec[name] = { + "description": desc, + "type": "list" if name == "attachments" else "string", + } + for s in custom_entities: + spec[s.name] = {"description": s.description or s.name, "type": s.type} + return spec + + +def _merge_llm_into_entities( + entities: dict[str, SemanticEntity | None], + llm_values: dict[str, Any], +) -> None: + """将 LLM 抽取结果合并进 entities。""" + for name, val in llm_values.items(): + if val in (None, "", []): + continue + if isinstance(val, list): + text = "; ".join( + f"{it.get('序号', i + 1)}. {it.get('名称', '')}" + if isinstance(it, dict) else str(it) + for i, it in enumerate(val) + ) + extra = {"items": val} + else: + text = str(val) + extra = {} + entities[name] = SemanticEntity( + name=name, + text=text, + paragraph_indices=[], + primary_role=None, + source="llm", + confidence=0.7, + extra=extra, + ) + + +# ── 实体构建 (同步,供 sync 入口使用) ────────────────── + +def _build_entities( + doc, ruleset: RuleSet, llm: LlmClient, +) -> dict[str, SemanticEntity | None]: + """构建实体 + 差量 LLM 抽取(同步)。""" + entities = EntityBuilder().build(doc) + spec = _compute_missing_spec(entities, ruleset.extract.entities) + if spec: + llm_vals = FieldExtractor(llm).extract_missing(doc, spec) + _merge_llm_into_entities(entities, llm_vals) + return entities + + +# ── 实体构建 (异步,供 async 入口使用) ────────────────── + +async def _build_entities_async( + doc, ruleset: RuleSet, llm: LlmClient, +) -> dict[str, SemanticEntity | None]: + """构建实体 + 差量 LLM 抽取(异步)。""" + entities = EntityBuilder().build(doc) + spec = _compute_missing_spec(entities, ruleset.extract.entities) + if spec: + llm_vals = await 
FieldExtractor(llm).extract_missing_async(doc, spec) + _merge_llm_into_entities(entities, llm_vals) + return entities + + +# ── 同步入口 (保留兼容) ───────────────────────────────── + +def audit_file( + docx_path: str | Path, + rules_path: str | Path, + llm_client: LlmClient | None = None, +) -> AuditResult: + """同步审查单个公文文件。 + + Args: + docx_path: DOCX 文件路径。 + rules_path: YAML 规则文件路径。 + llm_client: 可选 LLM 客户端实例。 + + Returns: + AuditResult 包含 findings, entities, checked_rules, summary 等。 + """ + docx_path = Path(docx_path) + rules_path = Path(rules_path) + llm = llm_client or LlmClient() + + doc = parse_docx(docx_path) + RoleTagger(llm_client=llm).tag(doc) + + ruleset = load_rules(rules_path) + entities = _build_entities(doc, ruleset, llm) + + findings, outcomes = RuleRunner(llm_client=llm).evaluate( + ruleset.all_rules(), doc, entities + ) + + return _build_result(docx_path, doc, findings, entities, outcomes) + + +# ── 异步入口 (推荐,供 bridge 调用) ────────────────────── + +async def run( + file_path: str | Path, + rules_path: str | Path, + llm_client: LlmClient | None = None, +) -> AuditResult: + """异步审查单个公文文件。 + + 这是 govdoc_bridge 的主要调用入口。 + + Args: + file_path: 文档文件路径 (DOCX 或 PDF)。 + rules_path: YAML 规则文件路径。 + llm_client: 可选 LLM 客户端实例。 + + Returns: + AuditResult 包含 findings, entities, checked_rules, summary 等。 + """ + file_path = Path(file_path) + rules_path = Path(rules_path) + llm = llm_client or LlmClient() + + _log.info("Govdoc pipeline start: %s", file_path.name) + + # 1. 解析文档 + doc = parse_docx(file_path) + _log.info(" parsed: %d paragraphs", len(doc.paragraphs)) + + # 2. 段落角色标注 + RoleTagger(llm_client=llm).tag(doc) + + # 3. 加载规则 + ruleset = load_rules(rules_path) + _log.info(" rules: %d groups, %d rules", len(ruleset.groups), len(ruleset.all_rules())) + + # 4. 实体抽取 (含差量 LLM) + entities = await _build_entities_async(doc, ruleset, llm) + _log.info(" entities: %d/%d resolved", sum(1 for v in entities.values() if v), len(entities)) + + # 5. 
规则评估 + findings, outcomes = RuleRunner(llm_client=llm).evaluate( + ruleset.all_rules(), doc, entities + ) + _log.info(" evaluated: %d findings from %d rules", len(findings), len(outcomes)) + + # 6. 构建结果 + result = _build_result(file_path, doc, findings, entities, outcomes) + _log.info( + "Govdoc pipeline complete: score=%d, pass=%d, fail=%d, skip=%d", + result.summary.score, + result.summary.passed_count, + result.summary.failed_count, + result.summary.skipped_count, + ) + + return result diff --git a/fastapi_modules/fastapi_leaudit/govdoc_engine/reporter/__init__.py b/fastapi_modules/fastapi_leaudit/govdoc_engine/reporter/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/fastapi_modules/fastapi_leaudit/govdoc_engine/reporter/docx_annotator.py b/fastapi_modules/fastapi_leaudit/govdoc_engine/reporter/docx_annotator.py new file mode 100644 index 0000000..89605bc --- /dev/null +++ b/fastapi_modules/fastapi_leaudit/govdoc_engine/reporter/docx_annotator.py @@ -0,0 +1,105 @@ +"""docx 标注:在原文加高亮 + 文末追加审核报告附页。""" + +from __future__ import annotations + +from pathlib import Path + +from docx import Document as DocxDocument +from docx.enum.text import WD_BREAK +from docx.shared import Pt +from docx.oxml.ns import qn +from lxml import etree + +from fastapi_modules.fastapi_leaudit.govdoc_engine.engine.result import AuditResult + + +_HIGHLIGHT_NAME = { + "error": "red", + "warning": "yellow", + "info": "cyan", +} + + +def _highlight_run(run, color_name: str) -> None: + rpr = run._element.get_or_add_rPr() + hl = rpr.find(qn("w:highlight")) + if hl is None: + hl = etree.SubElement(rpr, qn("w:highlight")) + hl.set(qn("w:val"), color_name) + + +def _highlight_paragraph_range(paragraph, start: int, end: int, color_name: str) -> None: + """简化策略:高亮整段(精准 char range 留 v0.2 实现)。""" + for run in paragraph.runs: + _highlight_run(run, color_name) + + +def _add_heading_with_fallback(doc, text: str, level: int = 1): + try: + return doc.add_heading(text, level=level) + except 
KeyError: + # Some uploaded documents don't include Word's built-in heading styles. + p = doc.add_paragraph() + run = p.add_run(text) + run.bold = True + if level == 1: + run.font.size = Pt(16) + elif level == 2: + run.font.size = Pt(13) + else: + run.font.size = Pt(12) + return p + + +def _append_appendix(doc, result: AuditResult) -> None: + p = doc.add_paragraph() + p.add_run().add_break(WD_BREAK.PAGE) + _add_heading_with_fallback(doc, "审核报告附页", level=1) + + s = result.summary + doc.add_paragraph( + f"得分: {s.score}/100 错误: {s.by_severity.get('error', 0)} " + f"警告: {s.by_severity.get('warning', 0)} 提示: {s.by_severity.get('info', 0)}" + ) + + if not result.findings: + doc.add_paragraph("未发现问题。") + return + + table = doc.add_table(rows=1, cols=5) + try: + table.style = "Light Grid" + except KeyError: + # Some source documents don't ship with the built-in table style set. + pass + hdr = table.rows[0].cells + for i, h in enumerate(["编号", "规则", "严重度", "类别", "位置 / 说明"]): + hdr[i].text = h + + for f in result.findings: + row = table.add_row().cells + row[0].text = f.finding_id + row[1].text = f.rule_id + row[2].text = f.severity + row[3].text = f.category + loc = f.location + ctx = (loc.context or "")[:30] + row[4].text = f"P{loc.paragraph_index} ({loc.role}): {f.message}\n 原文: {ctx}" + + +def annotate_docx(src: str | Path, dst: str | Path, result: AuditResult) -> None: + src = Path(src) + dst = Path(dst) + doc = DocxDocument(src) + + for f in result.findings: + idx = f.location.paragraph_index + if 0 <= idx < len(doc.paragraphs): + color = _HIGHLIGHT_NAME.get(f.severity, "yellow") + _highlight_paragraph_range( + doc.paragraphs[idx], f.location.char_start, f.location.char_end, color + ) + + _append_appendix(doc, result) + dst.parent.mkdir(parents=True, exist_ok=True) + doc.save(dst) diff --git a/fastapi_modules/fastapi_leaudit/govdoc_engine/reporter/html_paragraph.py b/fastapi_modules/fastapi_leaudit/govdoc_engine/reporter/html_paragraph.py new file mode 100644 index 
0000000..2328e8d --- /dev/null +++ b/fastapi_modules/fastapi_leaudit/govdoc_engine/reporter/html_paragraph.py @@ -0,0 +1,42 @@ +"""把 Document 渲染为带 inline style 的 HTML 段落,给前端用。""" + +from __future__ import annotations +from html import escape +from fastapi_modules.fastapi_leaudit.govdoc_engine.models import Document + + +def _style(p) -> str: + s = p.style + parts = [] + if s.font_size_pt: + sz = s.font_size_pt + sz_str = str(int(sz)) if sz == int(sz) else str(sz) + parts.append(f"font-size:{sz_str}pt") + if s.font_eastasia: + parts.append(f"font-family:'{s.font_eastasia}',serif") + if s.alignment and s.alignment != "left": + parts.append(f"text-align:{s.alignment}") + if s.bold: + parts.append("font-weight:700") + if s.first_line_indent_pt: + parts.append(f"text-indent:{s.first_line_indent_pt}pt") + return ";".join(parts) + + +def paragraphs_to_html(doc: Document, finding_map: dict[int, list[str]]) -> str: + """把 doc 每个段落渲染成

带 data-pi / data-role / data-finding-ids。""" + out = ['

'] + for p in doc.paragraphs: + style = _style(p) + finding_ids = finding_map.get(p.index, []) + attrs = [ + f'data-pi="{p.index}"', + f'data-role="{escape(p.role or "")}"', + ] + if finding_ids: + attrs.append(f'data-finding-ids="{escape(",".join(finding_ids))}"') + if style: + attrs.append(f'style="{escape(style)}"') + out.append(f"

{escape(p.text)}

") + out.append("
") + return "\n".join(out) diff --git a/fastapi_modules/fastapi_leaudit/govdoc_engine/reporter/html_renderer.py b/fastapi_modules/fastapi_leaudit/govdoc_engine/reporter/html_renderer.py new file mode 100644 index 0000000..809442b --- /dev/null +++ b/fastapi_modules/fastapi_leaudit/govdoc_engine/reporter/html_renderer.py @@ -0,0 +1,76 @@ +"""把 AuditResult 渲染成单文件 HTML 报告。""" + +from __future__ import annotations +from html import escape +from fastapi_modules.fastapi_leaudit.govdoc_engine.engine.result import AuditResult + + +_CSS = """ +body { font-family: -apple-system, "PingFang SC", sans-serif; margin: 0; padding: 24px; + background: #f7f7f9; color: #1a1a1a; } +.header { display: flex; align-items: center; gap: 16px; margin-bottom: 24px; } +.score { width: 96px; height: 96px; border-radius: 50%; + background: conic-gradient(#22c55e var(--p), #e5e7eb var(--p)); + display: grid; place-items: center; font-weight: 700; font-size: 22px; color: #111; } +.score-inner { background: white; width: 76px; height: 76px; border-radius: 50%; + display: grid; place-items: center; } +.tag { padding: 2px 8px; border-radius: 999px; font-size: 12px; } +.error { background: #fee2e2; color: #b91c1c; } +.warning { background: #fef9c3; color: #a16207; } +.info { background: #dbeafe; color: #1d4ed8; } +table { width: 100%; border-collapse: collapse; background: white; border-radius: 8px; + overflow: hidden; box-shadow: 0 1px 3px rgba(0,0,0,0.06); } +th, td { padding: 10px 12px; text-align: left; border-bottom: 1px solid #f1f5f9; vertical-align: top; } +th { background: #f8fafc; font-size: 13px; } +td.msg { max-width: 480px; } +.context { color: #64748b; font-size: 12px; margin-top: 4px; } +""" + + +def render_html(result: AuditResult) -> str: + s = result.summary + score = s.score + pct = f"{score}%" + rows = [] + for f in result.findings: + loc = f.location + suggest = ( + f'
建议: {escape(f.suggestion)}
' + if f.suggestion else "" + ) + rows.append(f""" + + {escape(f.finding_id)} + {escape(f.rule_id)}
{escape(f.rule_name)} + {f.severity} + {escape(f.category)} + P{loc.paragraph_index} ({escape(loc.role or '')}) + {escape(f.message)} +
原文: {escape((loc.context or '')[:80])}
+ {suggest} + +""") + + body = f""" +公文审核报告 + +
+
{score}
+
+

公文格式审核报告

+
{escape(result.document.get('filename', ''))} · 共 {s.total_findings} 项
+
+ 错误 {s.by_severity.get('error', 0)} + 警告 {s.by_severity.get('warning', 0)} + 提示 {s.by_severity.get('info', 0)} +
+
+
+ + + + + {''.join(rows) or ''} +
编号规则严重度类别位置说明
未发现问题
+""" + return body diff --git a/fastapi_modules/fastapi_leaudit/govdoc_engine/reporter/json_report.py b/fastapi_modules/fastapi_leaudit/govdoc_engine/reporter/json_report.py new file mode 100644 index 0000000..f7a5954 --- /dev/null +++ b/fastapi_modules/fastapi_leaudit/govdoc_engine/reporter/json_report.py @@ -0,0 +1,12 @@ +"""把 AuditResult 序列化为 JSON 字符串。""" + +import json +from fastapi_modules.fastapi_leaudit.govdoc_engine.engine.result import AuditResult + + +def to_json(result: AuditResult, indent: int = 2) -> str: + return json.dumps( + result.model_dump(mode="json"), + ensure_ascii=False, + indent=indent, + ) diff --git a/fastapi_modules/fastapi_leaudit/models/__init__.py b/fastapi_modules/fastapi_leaudit/models/__init__.py index f0ff2c7..800adcb 100644 --- a/fastapi_modules/fastapi_leaudit/models/__init__.py +++ b/fastapi_modules/fastapi_leaudit/models/__init__.py @@ -14,6 +14,9 @@ from fastapi_modules.fastapi_leaudit.models.leauditRagChatApp import LeauditRagC from fastapi_modules.fastapi_leaudit.models.leauditRagConversation import LeauditRagConversation from fastapi_modules.fastapi_leaudit.models.leauditRagMessage import LeauditRagMessage from fastapi_modules.fastapi_leaudit.models.usageLoginEvent import UsageLoginEvent +from fastapi_modules.fastapi_leaudit.models.govdocRun import GovdocRun +from fastapi_modules.fastapi_leaudit.models.govdocRuleResult import GovdocRuleResult +from fastapi_modules.fastapi_leaudit.models.govdocReportArtifact import GovdocReportArtifact __all__ = [ "LeauditDocument", @@ -30,4 +33,7 @@ __all__ = [ "LeauditRagConversation", "LeauditRagMessage", "UsageLoginEvent", + "GovdocRun", + "GovdocRuleResult", + "GovdocReportArtifact", ] diff --git a/fastapi_modules/fastapi_leaudit/models/govdocReportArtifact.py b/fastapi_modules/fastapi_leaudit/models/govdocReportArtifact.py new file mode 100644 index 0000000..eafedd3 --- /dev/null +++ b/fastapi_modules/fastapi_leaudit/models/govdocReportArtifact.py @@ -0,0 +1,27 @@ +"""Govdoc 报告产物模型 —— 
govdoc_report_artifacts 表。""" + +from __future__ import annotations + +from sqlalchemy import BigInteger, String +from sqlalchemy.orm import Mapped, mapped_column + +from fastapi_common.fastapi_common_web.models import BaseModel + + +class GovdocReportArtifact(BaseModel): + """公文审查报告产物索引表。""" + + __tablename__ = "govdoc_report_artifacts" + + Id: Mapped[int] = mapped_column("id", BigInteger, primary_key=True, autoincrement=True) + runId: Mapped[int] = mapped_column("run_id", BigInteger, comment="关联 govdoc_runs.id") + + artifactType: Mapped[str] = mapped_column("artifact_type", String(64), comment="产物类型:html_report/annotated_docx/paragraph_html/json_report/original") + fileName: Mapped[str] = mapped_column("file_name", String(512), comment="文件名") + fileExt: Mapped[str | None] = mapped_column("file_ext", String(32), comment="扩展名") + mimeType: Mapped[str | None] = mapped_column("mime_type", String(128), comment="MIME 类型") + fileSize: Mapped[int | None] = mapped_column("file_size", BigInteger, comment="文件大小(字节)") + sha256: Mapped[str | None] = mapped_column("sha256", String(64), comment="文件 SHA256") + ossUrl: Mapped[str | None] = mapped_column("oss_url", String(2048), comment="OSS 访问地址") + storageProvider: Mapped[str | None] = mapped_column("storage_provider", String(32), comment="存储提供商:oss/minio/local") + description: Mapped[str | None] = mapped_column("description", String(512), comment="产物说明") diff --git a/fastapi_modules/fastapi_leaudit/models/govdocRuleResult.py b/fastapi_modules/fastapi_leaudit/models/govdocRuleResult.py new file mode 100644 index 0000000..1468df2 --- /dev/null +++ b/fastapi_modules/fastapi_leaudit/models/govdocRuleResult.py @@ -0,0 +1,39 @@ +"""Govdoc 规则结果模型 —— govdoc_rule_results 表。""" + +from __future__ import annotations + +from sqlalchemy import BigInteger, Integer, Numeric, String, Text +from sqlalchemy.orm import Mapped, mapped_column + +from fastapi_common.fastapi_common_web.models import BaseModel + + +class GovdocRuleResult(BaseModel): + 
"""公文规则执行结果明细表。""" + + __tablename__ = "govdoc_rule_results" + + Id: Mapped[int] = mapped_column("id", BigInteger, primary_key=True, autoincrement=True) + runId: Mapped[int] = mapped_column("run_id", BigInteger, comment="关联 govdoc_runs.id") + + # 规则标识 + ruleId: Mapped[str] = mapped_column("rule_id", String(128), comment="规则标识") + ruleName: Mapped[str | None] = mapped_column("rule_name", String(256), comment="规则名称") + severity: Mapped[str | None] = mapped_column("severity", String(32), comment="严重等级:error/warning/info") + category: Mapped[str | None] = mapped_column("category", String(128), comment="规则分类") + + # 结果内容 + message: Mapped[str | None] = mapped_column("message", Text, comment="结果描述") + suggestion: Mapped[str | None] = mapped_column("suggestion", Text, comment="修改建议") + actual: Mapped[str | None] = mapped_column("actual", Text, comment="实际值") + expected: Mapped[str | None] = mapped_column("expected", Text, comment="期望值") + evidence: Mapped[str | None] = mapped_column("evidence", Text, comment="证据文本") + + # 文档定位 + paragraphIndex: Mapped[int | None] = mapped_column("paragraph_index", Integer, comment="段落索引") + paragraphText: Mapped[str | None] = mapped_column("paragraph_text", Text, comment="段落原文") + locationPath: Mapped[str | None] = mapped_column("location_path", String(512), comment="文档结构位置路径") + + # 判定 + result: Mapped[str] = mapped_column("result", String(32), default="pass", comment="执行结果:pass/fail/skipped/error") + score: Mapped[float | None] = mapped_column("score", Numeric(10, 2), comment="本条得分") diff --git a/fastapi_modules/fastapi_leaudit/models/govdocRun.py b/fastapi_modules/fastapi_leaudit/models/govdocRun.py new file mode 100644 index 0000000..59ca654 --- /dev/null +++ b/fastapi_modules/fastapi_leaudit/models/govdocRun.py @@ -0,0 +1,46 @@ +"""Govdoc 审查运行模型 —— govdoc_runs 表。""" + +from __future__ import annotations + +from datetime import datetime + +from sqlalchemy import BigInteger, DateTime, Integer, Numeric, String, Text +from sqlalchemy.orm 
import Mapped, mapped_column + +from fastapi_common.fastapi_common_web.models import BaseModel + + +class GovdocRun(BaseModel): + """公文审查运行主表。""" + + __tablename__ = "govdoc_runs" + + Id: Mapped[int] = mapped_column("id", BigInteger, primary_key=True, autoincrement=True) + documentId: Mapped[int] = mapped_column("document_id", BigInteger, comment="关联 leaudit_documents.id") + documentFileId: Mapped[int | None] = mapped_column("document_file_id", BigInteger, comment="输入文件 ID,关联 leaudit_document_files.id") + runNo: Mapped[int] = mapped_column("run_no", Integer, default=1, comment="同一文档第几次执行") + triggerSource: Mapped[str] = mapped_column("trigger_source", String(64), default="upload", comment="触发来源:upload/manual/retry/migration") + triggerUserId: Mapped[int | None] = mapped_column("trigger_user_id", BigInteger, comment="触发人 user_id") + taskId: Mapped[str | None] = mapped_column("task_id", String(128), comment="Celery 任务 ID") + + # 运行状态 + status: Mapped[str] = mapped_column("status", String(64), default="pending", comment="pending/processing/completed/failed/cancelled") + phase: Mapped[str | None] = mapped_column("phase", String(32), comment="当前阶段:parsing/executing/reporting") + + # 引擎快照 + engineVersion: Mapped[str | None] = mapped_column("engine_version", String(64), comment="引擎版本号") + llmProvider: Mapped[str | None] = mapped_column("llm_provider", String(64), comment="LLM 提供商") + llmModel: Mapped[str | None] = mapped_column("llm_model", String(128), comment="LLM 模型名") + + # 结果汇总 + totalScore: Mapped[float | None] = mapped_column("total_score", Numeric(10, 2), comment="总分") + passedCount: Mapped[int | None] = mapped_column("passed_count", Integer, comment="通过规则数") + failedCount: Mapped[int | None] = mapped_column("failed_count", Integer, comment="未通过规则数") + skippedCount: Mapped[int | None] = mapped_column("skipped_count", Integer, comment="跳过规则数") + resultStatus: Mapped[str | None] = mapped_column("result_status", String(32), comment="综合结果:pass/fail/partial/error") + 
resultSummaryJson: Mapped[str | None] = mapped_column("result_summary_json", Text, comment="结构化结果摘要 JSON") + errorMessage: Mapped[str | None] = mapped_column("error_message", Text, comment="运行失败时错误描述") + + # 时间 + startedAt: Mapped[datetime | None] = mapped_column("started_at", DateTime(timezone=True), comment="开始执行时间") + finishedAt: Mapped[datetime | None] = mapped_column("finished_at", DateTime(timezone=True), comment="结束执行时间") diff --git a/fastapi_modules/fastapi_leaudit/services/__init__.py b/fastapi_modules/fastapi_leaudit/services/__init__.py index 4b3e28c..2aaa39a 100644 --- a/fastapi_modules/fastapi_leaudit/services/__init__.py +++ b/fastapi_modules/fastapi_leaudit/services/__init__.py @@ -16,6 +16,7 @@ from fastapi_modules.fastapi_leaudit.services.ruleConfigService import IRuleConf from fastapi_modules.fastapi_leaudit.services.ruleService import IRuleService from fastapi_modules.fastapi_leaudit.services.ragDatasetService import IRagDatasetService from fastapi_modules.fastapi_leaudit.services.ragChatService import IRagChatService +from fastapi_modules.fastapi_leaudit.services.govdocService import IGovdocService from fastapi_modules.fastapi_leaudit.services.usageStatsService import IUsageStatsService __all__ = [ @@ -36,4 +37,5 @@ __all__ = [ "IRagDatasetService", "IRagChatService", "IUsageStatsService", + "IGovdocService", ] diff --git a/fastapi_modules/fastapi_leaudit/services/govdocService.py b/fastapi_modules/fastapi_leaudit/services/govdocService.py new file mode 100644 index 0000000..61419dd --- /dev/null +++ b/fastapi_modules/fastapi_leaudit/services/govdocService.py @@ -0,0 +1,121 @@ +"""Govdoc 公文模块服务接口。""" + +from abc import ABC, abstractmethod +from typing import Any + +from fastapi import UploadFile + + +class IGovdocService(ABC): + """公文处理与格式审查服务抽象接口。""" + + # ── 文档 ────────────────────────────────────────────── + + @abstractmethod + async def UploadDocument( + self, + file: UploadFile, + typeId: int | None = None, + region: str = "default", + 
autoRun: bool = False, + speed: str = "normal", + ruleVersionId: int | None = None, + createdBy: int | None = None, + ) -> dict[str, Any]: + """上传公文文档,创建主档记录,可选自动触发审查。""" + ... + + @abstractmethod + async def ListDocuments( + self, + page: int = 1, + pageSize: int = 20, + keyword: str | None = None, + region: str | None = None, + status: str | None = None, + resultStatus: str | None = None, + createdBy: int | None = None, + dateFrom: str | None = None, + dateTo: str | None = None, + userId: int | None = None, + ) -> dict[str, Any]: + """获取公文模块文档列表,自动限制 engine_type='govdoc'。""" + ... + + @abstractmethod + async def GetDocumentDetail(self, documentId: int, userId: int | None = None) -> dict[str, Any]: + """获取公文详情:文档基础信息 + 最新 run 摘要 + 报告引用。""" + ... + + @abstractmethod + async def UpdateDocument(self, documentId: int, body: dict[str, Any], userId: int | None = None) -> dict[str, Any]: + """修改公文标题、文号、备注等基础信息。""" + ... + + @abstractmethod + async def DeleteDocument(self, documentId: int, userId: int | None = None) -> dict[str, Any]: + """软删除文档。""" + ... + + # ── 审查运行 ────────────────────────────────────────── + + @abstractmethod + async def CreateRun( + self, + documentId: int, + ruleVersionId: int | None = None, + speed: str = "normal", + force: bool = False, + triggerUserId: int | None = None, + ) -> dict[str, Any]: + """对已存在文档发起一次公文审查 run。""" + ... + + @abstractmethod + async def GetRunStatus(self, runId: int) -> dict[str, Any]: + """查询 run 状态、阶段、耗时、错误摘要。""" + ... + + # ── 结果与报告 ──────────────────────────────────────── + + @abstractmethod + async def GetRunResult(self, runId: int) -> dict[str, Any]: + """获取审查结果摘要:summary + checked rules + findings 统计 + entities 摘要。""" + ... + + @abstractmethod + async def GetRunFindings(self, runId: int) -> dict[str, Any]: + """获取段落级 findings 明细列表。""" + ... + + @abstractmethod + async def GetRunEntities(self, runId: int) -> dict[str, Any]: + """获取识别出的标题、文号、署名等实体。""" + ... 
+ + @abstractmethod + async def GetRunParagraphs(self, runId: int) -> dict[str, Any]: + """获取前端文档联动视图所需的段落 HTML。""" + ... + + @abstractmethod + async def GetReportHtml(self, runId: int) -> dict[str, Any]: + """获取 HTML 报告内容或下载地址。""" + ... + + @abstractmethod + async def GetReportDocx(self, runId: int) -> dict[str, Any]: + """获取批注 DOCX 下载地址。""" + ... + + @abstractmethod + async def DownloadOriginal(self, documentId: int) -> dict[str, Any]: + """获取原始上传文档下载地址。""" + ... + + # ── 规则 ────────────────────────────────────────────── + + @abstractmethod + async def ListRules(self) -> dict[str, Any]: + """获取当前生效规则集摘要。""" + ... diff --git a/fastapi_modules/fastapi_leaudit/services/impl/govdocServiceImpl.py b/fastapi_modules/fastapi_leaudit/services/impl/govdocServiceImpl.py new file mode 100644 index 0000000..04aa60c --- /dev/null +++ b/fastapi_modules/fastapi_leaudit/services/impl/govdocServiceImpl.py @@ -0,0 +1,130 @@ +"""Govdoc 公文模块服务实现(阶段骨架)。 + +本文件为 Phase 1 骨架实现,所有方法暂返回占位结果。 +后续步骤将逐步接入: + - govdoc_bridge 执行桥接 + - govdoc_engine 引擎内核 + - 文档主档复用 + - OSS / Celery 集成 +""" + +from __future__ import annotations + +from typing import Any + +from fastapi import UploadFile + +from fastapi_common.fastapi_common_logger import logger +from fastapi_modules.fastapi_leaudit.services import IGovdocService + + +class GovdocServiceImpl(IGovdocService): + """公文处理与格式审查服务实现。""" + + # ── 文档 ────────────────────────────────────────────── + + async def UploadDocument( + self, + file: UploadFile, + typeId: int | None = None, + region: str = "default", + autoRun: bool = False, + speed: str = "normal", + ruleVersionId: int | None = None, + createdBy: int | None = None, + ) -> dict[str, Any]: + logger.info("[Govdoc] UploadDocument placeholder — file=%s region=%s", file.filename, region) + return { + "documentId": 0, + "fileId": 0, + "fileName": file.filename, + "region": region, + "engineType": "govdoc", + "autoRunTriggered": autoRun, + } + + async def ListDocuments( + self, + page: int = 1, + 
pageSize: int = 20, + keyword: str | None = None, + region: str | None = None, + status: str | None = None, + resultStatus: str | None = None, + createdBy: int | None = None, + dateFrom: str | None = None, + dateTo: str | None = None, + userId: int | None = None, + ) -> dict[str, Any]: + logger.info("[Govdoc] ListDocuments placeholder — page=%s pageSize=%s", page, pageSize) + return {"items": [], "total": 0, "page": page, "pageSize": pageSize} + + async def GetDocumentDetail(self, documentId: int, userId: int | None = None) -> dict[str, Any]: + logger.info("[Govdoc] GetDocumentDetail placeholder — id=%s", documentId) + return {"documentId": documentId} + + async def UpdateDocument(self, documentId: int, body: dict[str, Any], userId: int | None = None) -> dict[str, Any]: + logger.info("[Govdoc] UpdateDocument placeholder — id=%s", documentId) + return {"documentId": documentId, **body} + + async def DeleteDocument(self, documentId: int, userId: int | None = None) -> dict[str, Any]: + logger.info("[Govdoc] DeleteDocument placeholder — id=%s", documentId) + return {"documentId": documentId, "deleted": True} + + # ── 审查运行 ────────────────────────────────────────── + + async def CreateRun( + self, + documentId: int, + ruleVersionId: int | None = None, + speed: str = "normal", + force: bool = False, + triggerUserId: int | None = None, + ) -> dict[str, Any]: + logger.info("[Govdoc] CreateRun placeholder — documentId=%s", documentId) + return { + "runId": 0, + "documentId": documentId, + "status": "queued", + "phase": "dispatch", + } + + async def GetRunStatus(self, runId: int) -> dict[str, Any]: + logger.info("[Govdoc] GetRunStatus placeholder — runId=%s", runId) + return {"runId": runId, "status": "pending"} + + # ── 结果与报告 ──────────────────────────────────────── + + async def GetRunResult(self, runId: int) -> dict[str, Any]: + logger.info("[Govdoc] GetRunResult placeholder — runId=%s", runId) + return {"runId": runId, "summary": {}} + + async def GetRunFindings(self, 
runId: int) -> dict[str, Any]: + logger.info("[Govdoc] GetRunFindings placeholder — runId=%s", runId) + return {"runId": runId, "findings": []} + + async def GetRunEntities(self, runId: int) -> dict[str, Any]: + logger.info("[Govdoc] GetRunEntities placeholder — runId=%s", runId) + return {"runId": runId, "entities": []} + + async def GetRunParagraphs(self, runId: int) -> dict[str, Any]: + logger.info("[Govdoc] GetRunParagraphs placeholder — runId=%s", runId) + return {"runId": runId, "paragraphs": []} + + async def GetReportHtml(self, runId: int) -> dict[str, Any]: + logger.info("[Govdoc] GetReportHtml placeholder — runId=%s", runId) + return {"runId": runId, "htmlUrl": ""} + + async def GetReportDocx(self, runId: int) -> dict[str, Any]: + logger.info("[Govdoc] GetReportDocx placeholder — runId=%s", runId) + return {"runId": runId, "docxUrl": ""} + + async def DownloadOriginal(self, documentId: int) -> dict[str, Any]: + logger.info("[Govdoc] DownloadOriginal placeholder — documentId=%s", documentId) + return {"documentId": documentId, "downloadUrl": ""} + + # ── 规则 ────────────────────────────────────────────── + + async def ListRules(self) -> dict[str, Any]: + logger.info("[Govdoc] ListRules placeholder") + return {"rules": []} diff --git a/scripts/创建sql/schema_add_govdoc_module.sql b/scripts/创建sql/schema_add_govdoc_module.sql new file mode 100644 index 0000000..2f31778 --- /dev/null +++ b/scripts/创建sql/schema_add_govdoc_module.sql @@ -0,0 +1,198 @@ +-- ============================================================================ +-- govdoc 模块建表 DDL +-- 用途: +-- 1. 创建 govdoc 模块专用结果域表(run / rule_result / report_artifact) +-- 2. 给 leaudit_documents 补充 engine_type 字段 +-- 3. 
幂等执行,重复跑不报错 +-- +-- 设计原则: +-- - 复用 leaudit_documents / leaudit_document_files 作为文档主档 +-- - 新建 govdoc 结果域表,不与 leaudit 引擎结果表混用 +-- - 后续规则平台化时再补 govdoc_rule_sets / govdoc_rule_versions +-- ============================================================================ + +BEGIN; + +-- --------------------------------------------------------------------------- +-- 1. govdoc_runs —— 公文审查运行主表 +-- --------------------------------------------------------------------------- +CREATE TABLE IF NOT EXISTS public.govdoc_runs ( + id BIGINT GENERATED ALWAYS AS IDENTITY PRIMARY KEY, + document_id BIGINT NOT NULL , + document_file_id BIGINT, + run_no INTEGER NOT NULL DEFAULT 1, + trigger_source VARCHAR(64) NOT NULL DEFAULT 'upload', + trigger_user_id BIGINT, + task_id VARCHAR(128), + + -- 运行状态 + status VARCHAR(64) NOT NULL DEFAULT 'pending', + phase VARCHAR(32), + + -- 引擎快照 + engine_version VARCHAR(64), + llm_provider VARCHAR(64), + llm_model VARCHAR(128), + + -- 结果汇总 + total_score NUMERIC(10, 2), + passed_count INTEGER, + failed_count INTEGER, + skipped_count INTEGER, + result_status VARCHAR(32), + result_summary_json TEXT, + error_message TEXT, + + -- 时间 + started_at TIMESTAMPTZ, + finished_at TIMESTAMPTZ, + + created_at TIMESTAMPTZ NOT NULL DEFAULT NOW(), + updated_at TIMESTAMPTZ NOT NULL DEFAULT NOW(), + deleted_at TIMESTAMPTZ DEFAULT NULL +); + +COMMENT ON TABLE public.govdoc_runs IS '公文审查运行主表'; +COMMENT ON COLUMN public.govdoc_runs.id IS '自增主键'; +COMMENT ON COLUMN public.govdoc_runs.document_id IS '关联 leaudit_documents.id'; +COMMENT ON COLUMN public.govdoc_runs.document_file_id IS '输入文件 ID,关联 leaudit_document_files.id'; +COMMENT ON COLUMN public.govdoc_runs.run_no IS '同一文档第几次执行'; +COMMENT ON COLUMN public.govdoc_runs.trigger_source IS '触发来源:upload/manual/retry/migration'; +COMMENT ON COLUMN public.govdoc_runs.trigger_user_id IS '触发人 user_id'; +COMMENT ON COLUMN public.govdoc_runs.task_id IS 'Celery 任务 ID'; +COMMENT ON COLUMN public.govdoc_runs.status IS 
'pending/processing/completed/failed/cancelled'; +COMMENT ON COLUMN public.govdoc_runs.phase IS '当前阶段:parsing/executing/reporting'; +COMMENT ON COLUMN public.govdoc_runs.engine_version IS '引擎版本号'; +COMMENT ON COLUMN public.govdoc_runs.llm_provider IS 'LLM 提供商'; +COMMENT ON COLUMN public.govdoc_runs.llm_model IS 'LLM 模型名'; +COMMENT ON COLUMN public.govdoc_runs.total_score IS '总分'; +COMMENT ON COLUMN public.govdoc_runs.passed_count IS '通过规则数'; +COMMENT ON COLUMN public.govdoc_runs.failed_count IS '未通过规则数'; +COMMENT ON COLUMN public.govdoc_runs.skipped_count IS '跳过规则数'; +COMMENT ON COLUMN public.govdoc_runs.result_status IS '综合结果:pass/fail/partial/error'; +COMMENT ON COLUMN public.govdoc_runs.result_summary_json IS '结构化结果摘要 JSON'; +COMMENT ON COLUMN public.govdoc_runs.error_message IS '运行失败时错误描述'; +COMMENT ON COLUMN public.govdoc_runs.started_at IS '开始执行时间'; +COMMENT ON COLUMN public.govdoc_runs.finished_at IS '结束执行时间'; + +CREATE INDEX IF NOT EXISTS idx_govdoc_runs_document_id ON public.govdoc_runs(document_id) WHERE deleted_at IS NULL; +CREATE INDEX IF NOT EXISTS idx_govdoc_runs_status ON public.govdoc_runs(status) WHERE deleted_at IS NULL; +CREATE INDEX IF NOT EXISTS idx_govdoc_runs_trigger_user_id ON public.govdoc_runs(trigger_user_id); + +-- --------------------------------------------------------------------------- +-- 2. 
govdoc_rule_results —— 单条规则执行结果 +-- --------------------------------------------------------------------------- +CREATE TABLE IF NOT EXISTS public.govdoc_rule_results ( + id BIGINT GENERATED ALWAYS AS IDENTITY PRIMARY KEY, + run_id BIGINT NOT NULL, + + -- 规则标识 + rule_id VARCHAR(128) NOT NULL, + rule_name VARCHAR(256), + severity VARCHAR(32), + category VARCHAR(128), + + -- 结果内容 + message TEXT, + suggestion TEXT, + actual TEXT, + expected TEXT, + evidence TEXT, + + -- 文档定位 + paragraph_index INTEGER, + paragraph_text TEXT, + location_path VARCHAR(512), + + -- 判定 + result VARCHAR(32) NOT NULL DEFAULT 'pass', + score NUMERIC(10, 2), + + created_at TIMESTAMPTZ NOT NULL DEFAULT NOW(), + updated_at TIMESTAMPTZ NOT NULL DEFAULT NOW(), + deleted_at TIMESTAMPTZ DEFAULT NULL +); + +COMMENT ON TABLE public.govdoc_rule_results IS '公文规则执行结果明细表'; +COMMENT ON COLUMN public.govdoc_rule_results.id IS '自增主键'; +COMMENT ON COLUMN public.govdoc_rule_results.run_id IS '关联 govdoc_runs.id'; +COMMENT ON COLUMN public.govdoc_rule_results.rule_id IS '规则标识'; +COMMENT ON COLUMN public.govdoc_rule_results.rule_name IS '规则名称'; +COMMENT ON COLUMN public.govdoc_rule_results.severity IS '严重等级:error/warning/info'; +COMMENT ON COLUMN public.govdoc_rule_results.category IS '规则分类'; +COMMENT ON COLUMN public.govdoc_rule_results.message IS '结果描述'; +COMMENT ON COLUMN public.govdoc_rule_results.suggestion IS '修改建议'; +COMMENT ON COLUMN public.govdoc_rule_results.actual IS '实际值'; +COMMENT ON COLUMN public.govdoc_rule_results.expected IS '期望值'; +COMMENT ON COLUMN public.govdoc_rule_results.evidence IS '证据文本'; +COMMENT ON COLUMN public.govdoc_rule_results.paragraph_index IS '段落索引'; +COMMENT ON COLUMN public.govdoc_rule_results.paragraph_text IS '段落原文'; +COMMENT ON COLUMN public.govdoc_rule_results.location_path IS '文档结构位置路径'; +COMMENT ON COLUMN public.govdoc_rule_results.result IS '执行结果:pass/fail/skipped/error'; +COMMENT ON COLUMN public.govdoc_rule_results.score IS '本条得分'; + +CREATE INDEX IF NOT EXISTS 
idx_govdoc_rule_results_run_id ON public.govdoc_rule_results(run_id) WHERE deleted_at IS NULL; +CREATE INDEX IF NOT EXISTS idx_govdoc_rule_results_rule_id ON public.govdoc_rule_results(rule_id) WHERE deleted_at IS NULL; +CREATE INDEX IF NOT EXISTS idx_govdoc_rule_results_result ON public.govdoc_rule_results(result) WHERE deleted_at IS NULL; +CREATE INDEX IF NOT EXISTS idx_govdoc_rule_results_paragraph ON public.govdoc_rule_results(run_id, paragraph_index) WHERE deleted_at IS NULL; + +-- --------------------------------------------------------------------------- +-- 3. govdoc_report_artifacts —— 报告产物索引 +-- --------------------------------------------------------------------------- +CREATE TABLE IF NOT EXISTS public.govdoc_report_artifacts ( + id BIGINT GENERATED ALWAYS AS IDENTITY PRIMARY KEY, + run_id BIGINT NOT NULL, + + artifact_type VARCHAR(64) NOT NULL, + file_name VARCHAR(512) NOT NULL, + file_ext VARCHAR(32), + mime_type VARCHAR(128), + file_size BIGINT, + sha256 VARCHAR(64), + oss_url VARCHAR(2048), + storage_provider VARCHAR(32), + description VARCHAR(512), + + created_at TIMESTAMPTZ NOT NULL DEFAULT NOW(), + updated_at TIMESTAMPTZ NOT NULL DEFAULT NOW(), + deleted_at TIMESTAMPTZ DEFAULT NULL +); + +COMMENT ON TABLE public.govdoc_report_artifacts IS '公文审查报告产物索引表'; +COMMENT ON COLUMN public.govdoc_report_artifacts.id IS '自增主键'; +COMMENT ON COLUMN public.govdoc_report_artifacts.run_id IS '关联 govdoc_runs.id'; +COMMENT ON COLUMN public.govdoc_report_artifacts.artifact_type IS '产物类型:html_report/annotated_docx/paragraph_html/json_report/original'; +COMMENT ON COLUMN public.govdoc_report_artifacts.file_name IS '文件名'; +COMMENT ON COLUMN public.govdoc_report_artifacts.file_ext IS '扩展名'; +COMMENT ON COLUMN public.govdoc_report_artifacts.mime_type IS 'MIME 类型'; +COMMENT ON COLUMN public.govdoc_report_artifacts.file_size IS '文件大小(字节)'; +COMMENT ON COLUMN public.govdoc_report_artifacts.sha256 IS '文件 SHA256'; +COMMENT ON COLUMN public.govdoc_report_artifacts.oss_url IS 
'OSS 访问地址'; +COMMENT ON COLUMN public.govdoc_report_artifacts.storage_provider IS '存储提供商:oss/minio/local'; +COMMENT ON COLUMN public.govdoc_report_artifacts.description IS '产物说明'; + +CREATE INDEX IF NOT EXISTS idx_govdoc_report_artifacts_run_id ON public.govdoc_report_artifacts(run_id) WHERE deleted_at IS NULL; +CREATE INDEX IF NOT EXISTS idx_govdoc_report_artifacts_type ON public.govdoc_report_artifacts(run_id, artifact_type) WHERE deleted_at IS NULL; + +-- --------------------------------------------------------------------------- +-- 4. leaudit_documents —— 补充 engine_type 字段 +-- --------------------------------------------------------------------------- +DO $$ +BEGIN + IF NOT EXISTS ( + SELECT 1 + FROM information_schema.columns + WHERE table_schema = 'public' + AND table_name = 'leaudit_documents' + AND column_name = 'engine_type' + ) THEN + ALTER TABLE public.leaudit_documents + ADD COLUMN engine_type VARCHAR(32) NOT NULL DEFAULT 'leaudit'; + COMMENT ON COLUMN public.leaudit_documents.engine_type IS '引擎类型:leaudit/govdoc/rag'; + END IF; +END $$; + +-- 为 engine_type 加索引,方便按模块过滤文档列表 +CREATE INDEX IF NOT EXISTS idx_leaudit_documents_engine_type ON public.leaudit_documents(engine_type) WHERE deleted_at IS NULL; + +COMMIT; \ No newline at end of file diff --git a/scripts/创建sql/seed_govdoc_entry_module.sql b/scripts/创建sql/seed_govdoc_entry_module.sql new file mode 100644 index 0000000..47f036f --- /dev/null +++ b/scripts/创建sql/seed_govdoc_entry_module.sql @@ -0,0 +1,140 @@ +-- ============================================================================ +-- govdoc 模块入口初始化脚本 +-- 用途: +-- 1. 为“内部公文 / govdoc”模块补齐首页入口配置 +-- 2. 使用幂等写法,重复执行不会产生重复数据 +-- 3. 
兼容历史库中 entry_modules 时间字段命名差异 +-- +-- 说明: +-- - 当前模块入口统一收口到 /govdoc/list,与模块路由 seed 保持一致。 +-- - 若后续前端改成其他入口页,请同步更新本脚本中的 v_target_path。 +-- - 页面访问权限与左侧菜单路由仍依赖 sys_routes / role_route, +-- 这部分建议由配套脚本 seed_govdoc_routes.sql 单独维护。 +-- ============================================================================ + +BEGIN; + +DO $$ +DECLARE + v_name text := '内部公文'; + v_description text := '内部公文处理与格式审查入口'; + v_target_path text := '/govdoc/list'; + v_icon_path text := 'documents/mz/static/img/entry_module_3.png'; + v_sort_order integer := 30; + v_areas jsonb := '[ + {"area":"梅州","enabled":true,"sort_order":1}, + {"area":"云浮","enabled":true,"sort_order":2}, + {"area":"揭阳","enabled":true,"sort_order":3}, + {"area":"潮州","enabled":true,"sort_order":4}, + {"area":"省局","enabled":true,"sort_order":5} + ]'::jsonb; + has_created_at boolean; + has_updated_at boolean; + has_create_time boolean; + has_update_time boolean; + has_deleted_at boolean; + sql_stmt text; +BEGIN + SELECT EXISTS ( + SELECT 1 + FROM information_schema.columns + WHERE table_schema = 'public' + AND table_name = 'leaudit_entry_modules' + AND column_name = 'created_at' + ), + EXISTS ( + SELECT 1 + FROM information_schema.columns + WHERE table_schema = 'public' + AND table_name = 'leaudit_entry_modules' + AND column_name = 'updated_at' + ), + EXISTS ( + SELECT 1 + FROM information_schema.columns + WHERE table_schema = 'public' + AND table_name = 'leaudit_entry_modules' + AND column_name = 'create_time' + ), + EXISTS ( + SELECT 1 + FROM information_schema.columns + WHERE table_schema = 'public' + AND table_name = 'leaudit_entry_modules' + AND column_name = 'update_time' + ), + EXISTS ( + SELECT 1 + FROM information_schema.columns + WHERE table_schema = 'public' + AND table_name = 'leaudit_entry_modules' + AND column_name = 'deleted_at' + ) + INTO has_created_at, has_updated_at, has_create_time, has_update_time, has_deleted_at; + + IF NOT EXISTS ( + SELECT 1 + FROM information_schema.tables + WHERE table_schema = 
'public' + AND table_name = 'leaudit_entry_modules' + ) THEN + RAISE EXCEPTION '表 public.leaudit_entry_modules 不存在,请先执行入口模块建表脚本'; + END IF; + + sql_stmt := 'INSERT INTO public.leaudit_entry_modules (' || + 'name, description, path, icon_path, areas, sort_order, is_enabled'; + + IF has_created_at THEN + sql_stmt := sql_stmt || ', created_at'; + ELSIF has_create_time THEN + sql_stmt := sql_stmt || ', create_time'; + END IF; + + IF has_updated_at THEN + sql_stmt := sql_stmt || ', updated_at'; + ELSIF has_update_time THEN + sql_stmt := sql_stmt || ', update_time'; + END IF; + + IF has_deleted_at THEN + sql_stmt := sql_stmt || ', deleted_at'; + END IF; + + sql_stmt := sql_stmt || ') VALUES (' || + '$1, $2, $3, $4, $5, $6, TRUE'; + + IF has_created_at OR has_create_time THEN + sql_stmt := sql_stmt || ', NOW()'; + END IF; + + IF has_updated_at OR has_update_time THEN + sql_stmt := sql_stmt || ', NOW()'; + END IF; + + IF has_deleted_at THEN + sql_stmt := sql_stmt || ', NULL'; + END IF; + + sql_stmt := sql_stmt || ') ON CONFLICT (name) DO UPDATE SET ' || + 'description = EXCLUDED.description, ' || + 'path = EXCLUDED.path, ' || + 'icon_path = EXCLUDED.icon_path, ' || + 'areas = EXCLUDED.areas, ' || + 'sort_order = EXCLUDED.sort_order, ' || + 'is_enabled = EXCLUDED.is_enabled'; + + IF has_updated_at THEN + sql_stmt := sql_stmt || ', updated_at = NOW()'; + ELSIF has_update_time THEN + sql_stmt := sql_stmt || ', update_time = NOW()'; + END IF; + + IF has_deleted_at THEN + sql_stmt := sql_stmt || ', deleted_at = NULL'; + END IF; + + EXECUTE sql_stmt + USING v_name, v_description, v_target_path, v_icon_path, v_areas, v_sort_order; +END $$; + +COMMIT; diff --git a/scripts/创建sql/seed_govdoc_permissions.sql b/scripts/创建sql/seed_govdoc_permissions.sql new file mode 100644 index 0000000..77a0341 --- /dev/null +++ b/scripts/创建sql/seed_govdoc_permissions.sql @@ -0,0 +1,138 @@ +-- ============================================================================ +-- govdoc 模块权限种子 +-- 用途: +-- 
1. 为 govdoc 模块插入权限点到 permissions 表 +-- 2. 为默认角色分发角色-权限映射到 role_permissions 表 +-- 3. 幂等执行,重复跑会更新 description / display_name 等可刷新字段 +-- +-- 权限键格式:govdoc:{resource}:{action} +-- 角色分发遵循《内部公文模块接口与权限设计》§5 +-- ============================================================================ + +BEGIN; + +-- --------------------------------------------------------------------------- +-- 1. 权限点定义 +-- --------------------------------------------------------------------------- +INSERT INTO permissions ( + permission_key, module, resource, action, description, display_name, + permission_type, is_system, metadata, created_at, updated_at, + sort_order, route_id, api_path, api_method +) +VALUES + -- 模块权限 + ('govdoc:module:read', 'govdoc', 'module', 'read', '查看内部公文处理模块菜单', '查看公文模块', 'MENU', TRUE, '{"group":"govdoc"}'::jsonb, NOW(), NOW(), 10, NULL, '/govdoc', 'GET'), + + -- 文档权限 + ('govdoc:document:create', 'govdoc', 'document', 'create', '上传公文文档', '上传公文', 'API', TRUE, '{"group":"govdoc"}'::jsonb, NOW(), NOW(), 20, NULL, '/api/govdoc/documents', 'POST'), + ('govdoc:document:read', 'govdoc', 'document', 'read', '查看公文文档列表与详情', '查看公文', 'API', TRUE, '{"group":"govdoc"}'::jsonb, NOW(), NOW(), 21, NULL, '/api/govdoc/documents', 'GET'), + ('govdoc:document:update', 'govdoc', 'document', 'update', '更新公文文档基础信息', '编辑公文', 'API', TRUE, '{"group":"govdoc"}'::jsonb, NOW(), NOW(), 22, NULL, '/api/govdoc/documents/{DocumentId}', 'PATCH'), + ('govdoc:document:delete', 'govdoc', 'document', 'delete', '删除公文文档', '删除公文', 'API', TRUE, '{"group":"govdoc"}'::jsonb, NOW(), NOW(), 23, NULL, '/api/govdoc/documents/{DocumentId}', 'DELETE'), + + -- 审查运行权限 + ('govdoc:run:create', 'govdoc', 'run', 'create', '发起公文格式审查', '发起审查', 'API', TRUE, '{"group":"govdoc"}'::jsonb, NOW(), NOW(), 30, NULL, '/api/govdoc/runs', 'POST'), + ('govdoc:run:read', 'govdoc', 'run', 'read', '查看审查运行状态', '查看审查状态', 'API', TRUE, '{"group":"govdoc"}'::jsonb, NOW(), NOW(), 31, NULL, '/api/govdoc/runs/{RunId}', 'GET'), + ('govdoc:run:retry', 
'govdoc', 'run', 'retry', '失败后重试审查', '重试审查', 'API', TRUE, '{"group":"govdoc"}'::jsonb, NOW(), NOW(), 32, NULL, '/api/govdoc/runs/{RunId}/retry', 'POST'), + + -- 报告与结果权限 + ('govdoc:report:read', 'govdoc', 'report', 'read', '下载审查报告(HTML/DOCX/原文)', '下载报告', 'API', TRUE, '{"group":"govdoc"}'::jsonb, NOW(), NOW(), 40, NULL, '/api/govdoc/runs/{RunId}/report', 'GET'), + ('govdoc:result:read', 'govdoc', 'result', 'read', '查看审查结果(findings/entities/summary)', '查看审查结果', 'API', TRUE, '{"group":"govdoc"}'::jsonb, NOW(), NOW(), 41, NULL, '/api/govdoc/runs/{RunId}/result', 'GET'), + + -- 规则权限 + ('govdoc:rule:read', 'govdoc', 'rule', 'read', '查看公文规则清单与详情', '查看规则', 'API', TRUE, '{"group":"govdoc"}'::jsonb, NOW(), NOW(), 50, NULL, '/api/govdoc/rules', 'GET'), + ('govdoc:rule:manage', 'govdoc', 'rule', 'manage', '发布、更新、切换规则版本', '管理规则', 'API', TRUE, '{"group":"govdoc"}'::jsonb, NOW(), NOW(), 51, NULL, '/api/govdoc/rule-versions', 'POST'), + + -- 配置权限(可选) + ('govdoc:settings:read', 'govdoc', 'settings', 'read', '查看公文模块配置', '查看设置', 'API', TRUE, '{"group":"govdoc"}'::jsonb, NOW(), NOW(), 60, NULL, '/api/govdoc/settings', 'GET'), + ('govdoc:settings:update', 'govdoc', 'settings', 'update', '修改公文模块配置', '修改设置', 'API', TRUE, '{"group":"govdoc"}'::jsonb, NOW(), NOW(), 61, NULL, '/api/govdoc/settings', 'PATCH') +ON CONFLICT (permission_key) DO UPDATE SET + module = EXCLUDED.module, + resource = EXCLUDED.resource, + action = EXCLUDED.action, + description = EXCLUDED.description, + display_name = EXCLUDED.display_name, + api_path = EXCLUDED.api_path, + api_method = EXCLUDED.api_method, + updated_at = NOW(); + +-- --------------------------------------------------------------------------- +-- 2. 
角色权限分发 +-- --------------------------------------------------------------------------- +WITH role_map AS ( + SELECT id, role_key + FROM roles + WHERE role_key IN ('super_admin', 'provincial_admin', 'admin', 'common') +), +perm_map AS ( + SELECT id, permission_key + FROM permissions + WHERE permission_key LIKE 'govdoc:%' +), +seed(role_key, permission_key, grant_type, data_scope) AS ( + VALUES + -- super_admin: 全部权限 + ('super_admin', 'govdoc:module:read', 'GRANT', 'ALL'), + ('super_admin', 'govdoc:document:create', 'GRANT', 'ALL'), + ('super_admin', 'govdoc:document:read', 'GRANT', 'ALL'), + ('super_admin', 'govdoc:document:update', 'GRANT', 'ALL'), + ('super_admin', 'govdoc:document:delete', 'GRANT', 'ALL'), + ('super_admin', 'govdoc:run:create', 'GRANT', 'ALL'), + ('super_admin', 'govdoc:run:read', 'GRANT', 'ALL'), + ('super_admin', 'govdoc:run:retry', 'GRANT', 'ALL'), + ('super_admin', 'govdoc:report:read', 'GRANT', 'ALL'), + ('super_admin', 'govdoc:result:read', 'GRANT', 'ALL'), + ('super_admin', 'govdoc:rule:read', 'GRANT', 'ALL'), + ('super_admin', 'govdoc:rule:manage', 'GRANT', 'ALL'), + ('super_admin', 'govdoc:settings:read', 'GRANT', 'ALL'), + ('super_admin', 'govdoc:settings:update', 'GRANT', 'ALL'), + + -- provincial_admin: 全部业务权限 + ('provincial_admin', 'govdoc:module:read', 'GRANT', 'ALL'), + ('provincial_admin', 'govdoc:document:create', 'GRANT', 'ALL'), + ('provincial_admin', 'govdoc:document:read', 'GRANT', 'ALL'), + ('provincial_admin', 'govdoc:document:update', 'GRANT', 'ALL'), + ('provincial_admin', 'govdoc:document:delete', 'GRANT', 'ALL'), + ('provincial_admin', 'govdoc:run:create', 'GRANT', 'ALL'), + ('provincial_admin', 'govdoc:run:read', 'GRANT', 'ALL'), + ('provincial_admin', 'govdoc:run:retry', 'GRANT', 'ALL'), + ('provincial_admin', 'govdoc:report:read', 'GRANT', 'ALL'), + ('provincial_admin', 'govdoc:result:read', 'GRANT', 'ALL'), + ('provincial_admin', 'govdoc:rule:read', 'GRANT', 'ALL'), + ('provincial_admin', 'govdoc:rule:manage', 
'GRANT', 'ALL'), + ('provincial_admin', 'govdoc:settings:read', 'GRANT', 'ALL'), + ('provincial_admin', 'govdoc:settings:update', 'GRANT', 'ALL'), + + -- admin: 模块读写 + 规则查看,不含规则管理与配置修改 + ('admin', 'govdoc:module:read', 'GRANT', 'REGION'), + ('admin', 'govdoc:document:create', 'GRANT', 'REGION'), + ('admin', 'govdoc:document:read', 'GRANT', 'REGION'), + ('admin', 'govdoc:document:update', 'GRANT', 'REGION'), + ('admin', 'govdoc:document:delete', 'GRANT', 'REGION'), + ('admin', 'govdoc:run:create', 'GRANT', 'REGION'), + ('admin', 'govdoc:run:read', 'GRANT', 'REGION'), + ('admin', 'govdoc:run:retry', 'GRANT', 'REGION'), + ('admin', 'govdoc:report:read', 'GRANT', 'REGION'), + ('admin', 'govdoc:result:read', 'GRANT', 'REGION'), + ('admin', 'govdoc:rule:read', 'GRANT', 'REGION'), + + -- common: 模块查看 + 文档上传/查看 + 审查发起/查看 + 报告/结果查看 + 规则查看 + ('common', 'govdoc:module:read', 'GRANT', 'OWN'), + ('common', 'govdoc:document:create', 'GRANT', 'OWN'), + ('common', 'govdoc:document:read', 'GRANT', 'OWN'), + ('common', 'govdoc:run:create', 'GRANT', 'OWN'), + ('common', 'govdoc:run:read', 'GRANT', 'OWN'), + ('common', 'govdoc:report:read', 'GRANT', 'OWN'), + ('common', 'govdoc:result:read', 'GRANT', 'OWN'), + ('common', 'govdoc:rule:read', 'GRANT', 'OWN') +) +INSERT INTO role_permissions (role_id, permission_id, grant_type, data_scope, created_at, updated_at) +SELECT rm.id, pm.id, seed.grant_type, seed.data_scope, NOW(), NOW() +FROM seed +JOIN role_map rm ON rm.role_key = seed.role_key +JOIN perm_map pm ON pm.permission_key = seed.permission_key +ON CONFLICT (role_id, permission_id) DO UPDATE SET + grant_type = EXCLUDED.grant_type, + data_scope = EXCLUDED.data_scope, + updated_at = NOW(); + +COMMIT; \ No newline at end of file diff --git a/scripts/创建sql/seed_govdoc_routes.sql b/scripts/创建sql/seed_govdoc_routes.sql new file mode 100644 index 0000000..4727efd --- /dev/null +++ b/scripts/创建sql/seed_govdoc_routes.sql @@ -0,0 +1,276 @@ +BEGIN; + +-- 
============================================================================ +-- govdoc 模块前端路由 / 菜单初始化草案 +-- 目标: +-- 1. 注册 /govdoc 模块根路由与常用子路由 +-- 2. 尽量兼容当前 sys_routes 的父子层级展示方式 +-- 3. 为默认角色补齐 role_route 菜单可见性 +-- 说明: +-- - 采用 route_path / route_name / route_title 这一套当前仓库主流字段风格 +-- - 明细页 / 详情页默认 hidden,避免菜单树出现无意义叶子节点 +-- - 幂等执行:重复跑会更新标题、父子关系、显示状态 +-- ============================================================================ + +WITH upsert_root AS ( + INSERT INTO sys_routes ( + route_path, + route_name, + component, + parent_id, + route_title, + icon, + sort_order, + is_hidden, + is_cache, + meta, + status, + created_at, + updated_at, + deleted_at + ) + VALUES ( + '/govdoc', + 'govdoc', + 'govdoc', + NULL, + '内部公文处理', + 'ri-file-paper-2-line', + 80, + FALSE, + TRUE, + '{"group":"govdoc","module":"govdoc"}'::jsonb, + 0, + NOW(), + NOW(), + NULL + ) + ON CONFLICT (route_path) WHERE deleted_at IS NULL + DO UPDATE SET + route_name = EXCLUDED.route_name, + component = EXCLUDED.component, + route_title = EXCLUDED.route_title, + icon = EXCLUDED.icon, + sort_order = EXCLUDED.sort_order, + is_hidden = EXCLUDED.is_hidden, + is_cache = EXCLUDED.is_cache, + meta = EXCLUDED.meta, + status = 0, + updated_at = NOW(), + deleted_at = NULL + RETURNING id +), +root_route AS ( + SELECT id FROM upsert_root + UNION ALL + SELECT id + FROM sys_routes + WHERE route_path = '/govdoc' + AND deleted_at IS NULL + LIMIT 1 +) +INSERT INTO sys_routes ( + route_path, + route_name, + component, + parent_id, + route_title, + icon, + sort_order, + is_hidden, + is_cache, + meta, + status, + created_at, + updated_at, + deleted_at +) +VALUES + ( + '/govdoc/upload', + 'govdoc.upload', + 'govdoc.upload', + (SELECT id FROM root_route), + '上传公文', + 'ri-upload-cloud-2-line', + 1, + FALSE, + TRUE, + '{"group":"govdoc","module":"govdoc","page":"upload"}'::jsonb, + 0, + NOW(), + NOW(), + NULL + ), + ( + '/govdoc/list', + 'govdoc.list', + 'govdoc.list', + (SELECT id FROM root_route), + '公文列表', + 
'ri-file-list-3-line', + 2, + FALSE, + TRUE, + '{"group":"govdoc","module":"govdoc","page":"list"}'::jsonb, + 0, + NOW(), + NOW(), + NULL + ), + ( + '/govdoc/detail', + 'govdoc.detail', + 'govdoc.detail', + (SELECT id FROM root_route), + '公文详情', + 'ri-file-search-line', + 3, + TRUE, + TRUE, + '{"group":"govdoc","module":"govdoc","page":"detail"}'::jsonb, + 0, + NOW(), + NOW(), + NULL + ), + ( + '/govdoc/rules', + 'govdoc.rules', + 'govdoc.rules', + (SELECT id FROM root_route), + '规则配置', + 'ri-scales-3-line', + 4, + FALSE, + TRUE, + '{"group":"govdoc","module":"govdoc","page":"rules"}'::jsonb, + 0, + NOW(), + NOW(), + NULL + ), + ( + '/govdoc/settings', + 'govdoc.settings', + 'govdoc.settings', + (SELECT id FROM root_route), + '模块配置', + 'ri-settings-3-line', + 5, + FALSE, + TRUE, + '{"group":"govdoc","module":"govdoc","page":"settings"}'::jsonb, + 0, + NOW(), + NOW(), + NULL + ) +ON CONFLICT (route_path) WHERE deleted_at IS NULL +DO UPDATE SET + route_name = EXCLUDED.route_name, + component = EXCLUDED.component, + parent_id = EXCLUDED.parent_id, + route_title = EXCLUDED.route_title, + icon = EXCLUDED.icon, + sort_order = EXCLUDED.sort_order, + is_hidden = EXCLUDED.is_hidden, + is_cache = EXCLUDED.is_cache, + meta = EXCLUDED.meta, + status = 0, + updated_at = NOW(), + deleted_at = NULL; + +-- 修正旧环境中可能已存在但未正确挂到 /govdoc 下的子路由。 +UPDATE sys_routes AS child +SET + parent_id = root.id, + route_title = CASE + WHEN child.route_path = '/govdoc/upload' THEN '上传公文' + WHEN child.route_path = '/govdoc/list' THEN '公文列表' + WHEN child.route_path = '/govdoc/detail' THEN '公文详情' + WHEN child.route_path = '/govdoc/rules' THEN '规则配置' + WHEN child.route_path = '/govdoc/settings' THEN '模块配置' + ELSE child.route_title + END, + is_hidden = CASE + WHEN child.route_path = '/govdoc/detail' THEN TRUE + ELSE child.is_hidden + END, + updated_at = NOW() +FROM sys_routes root +WHERE child.deleted_at IS NULL + AND root.deleted_at IS NULL + AND root.route_path = '/govdoc' + AND child.route_path IN ( + 
'/govdoc/upload', + '/govdoc/list', + '/govdoc/detail', + '/govdoc/rules', + '/govdoc/settings' + ); + +-- 根路由标题兜底,避免历史环境残留旧文案。 +UPDATE sys_routes +SET route_title = '内部公文处理', + updated_at = NOW() +WHERE deleted_at IS NULL + AND route_path = '/govdoc'; + +WITH role_map AS ( + SELECT id, role_key + FROM roles + WHERE role_key IN ('super_admin', 'provincial_admin', 'admin', 'common') +), +route_map AS ( + SELECT id, route_path + FROM sys_routes + WHERE deleted_at IS NULL + AND route_path IN ( + '/govdoc', + '/govdoc/upload', + '/govdoc/list', + '/govdoc/detail', + '/govdoc/rules', + '/govdoc/settings' + ) +), +seed(role_key, route_path, permission, status) AS ( + VALUES + ('super_admin', '/govdoc', 'RW', 1), + ('super_admin', '/govdoc/upload', 'RW', 1), + ('super_admin', '/govdoc/list', 'RW', 1), + ('super_admin', '/govdoc/detail', 'RW', 1), + ('super_admin', '/govdoc/rules', 'RW', 1), + ('super_admin', '/govdoc/settings', 'RW', 1), + + ('provincial_admin', '/govdoc', 'RW', 1), + ('provincial_admin', '/govdoc/upload', 'RW', 1), + ('provincial_admin', '/govdoc/list', 'RW', 1), + ('provincial_admin', '/govdoc/detail', 'RW', 1), + ('provincial_admin', '/govdoc/rules', 'RW', 1), + ('provincial_admin', '/govdoc/settings', 'RW', 1), + + ('admin', '/govdoc', 'RW', 1), + ('admin', '/govdoc/upload', 'RW', 1), + ('admin', '/govdoc/list', 'RW', 1), + ('admin', '/govdoc/detail', 'RW', 1), + ('admin', '/govdoc/rules', 'R', 1), + + ('common', '/govdoc', 'R', 1), + ('common', '/govdoc/upload', 'R', 1), + ('common', '/govdoc/list', 'R', 1), + ('common', '/govdoc/detail', 'R', 1), + ('common', '/govdoc/rules', 'R', 1) +) +INSERT INTO role_route (role_id, route_id, permission, status, created_at, updated_at) +SELECT rm.id, tm.id, s.permission, s.status, NOW(), NOW() +FROM seed s +JOIN role_map rm ON rm.role_key = s.role_key +JOIN route_map tm ON tm.route_path = s.route_path +ON CONFLICT (role_id, route_id) DO UPDATE SET + permission = EXCLUDED.permission, + status = EXCLUDED.status, + 
updated_at = NOW(); + +COMMIT;