From 201e3adc1850f5d7074d088e852404a03ad556c2 Mon Sep 17 00:00:00 2001 From: wren <“porlong@qq.com”> Date: Wed, 6 May 2026 11:36:02 +0800 Subject: [PATCH] fix: synthesize docx field location chunks --- .../leaudit_bridge/pipeline.py | 44 +++++++++++++++++-- .../leaudit_bridge/storage_adapter.py | 29 ++++++++++-- 2 files changed, 67 insertions(+), 6 deletions(-) diff --git a/fastapi_modules/fastapi_leaudit/leaudit_bridge/pipeline.py b/fastapi_modules/fastapi_leaudit/leaudit_bridge/pipeline.py index 8802d66..aa82ce7 100644 --- a/fastapi_modules/fastapi_leaudit/leaudit_bridge/pipeline.py +++ b/fastapi_modules/fastapi_leaudit/leaudit_bridge/pipeline.py @@ -28,6 +28,43 @@ from fastapi_modules.fastapi_leaudit.leaudit_bridge.storage_adapter import Stora log = logging.getLogger(__name__) +def _ensure_text_page_chunks(ocr_result: OcrResult) -> None: + """Backfill pseudo chunks for text-native pages that have no OCR chunks. + + DOCX/legacy-doc normalization currently produces page text but often no + geometric chunks, which causes ``resolve_bundle_positions`` to return zero + positions for every extracted field. We synthesize coarse text chunks so at + least page-level定位 can be recovered on the review page. + """ + for page in ocr_result.pages: + if page.chunks: + continue + + raw_text = page.text or "" + normalized_text = ( + raw_text + .replace("\r\n", "\n") + .replace("\r", "\n") + ) + blocks = [ + block.strip() + for block in normalized_text.split("\n\n") + if block.strip() and not block.strip().startswith("