diff --git a/fastapi_modules/fastapi_leaudit/leaudit_bridge/pipeline.py b/fastapi_modules/fastapi_leaudit/leaudit_bridge/pipeline.py index 8802d66..aa82ce7 100644 --- a/fastapi_modules/fastapi_leaudit/leaudit_bridge/pipeline.py +++ b/fastapi_modules/fastapi_leaudit/leaudit_bridge/pipeline.py @@ -28,6 +28,43 @@ from fastapi_modules.fastapi_leaudit.leaudit_bridge.storage_adapter import Stora log = logging.getLogger(__name__) +def _ensure_text_page_chunks(ocr_result: OcrResult) -> None: + """Backfill pseudo chunks for text-native pages that have no OCR chunks. + + DOCX/legacy-doc normalization currently produces page text but often no + geometric chunks, which causes ``resolve_bundle_positions`` to return zero + positions for every extracted field. We synthesize coarse text chunks so at + least page-level定位 can be recovered on the review page. + """ + for page in ocr_result.pages: + if page.chunks: + continue + + raw_text = page.text or "" + normalized_text = ( + raw_text + .replace("\r\n", "\n") + .replace("\r", "\n") + ) + blocks = [ + block.strip() + for block in normalized_text.split("\n\n") + if block.strip() and not block.strip().startswith("