fix: synthesize docx field location chunks

2026-05-06 11:36:02 +08:00
parent 6ff1fb1cf6
commit 201e3adc18
2 changed files with 67 additions and 6 deletions
@@ -28,6 +28,43 @@ from fastapi_modules.fastapi_leaudit.leaudit_bridge.storage_adapter import Stora
 log = logging.getLogger(__name__)


+def _ensure_text_page_chunks(ocr_result: OcrResult) -> None:
+    """Backfill pseudo chunks for text-native pages that have no OCR chunks.
+
+    DOCX/legacy-doc normalization currently produces page text but often no
+    geometric chunks, so ``resolve_bundle_positions`` returns zero positions
+    for every extracted field. Coarse text chunks are synthesized in place so
+    at least page-level positioning can be recovered on the review page.
+    """
+    for page in ocr_result.pages:
+        if page.chunks:  # page already has chunks; leave it untouched
+            continue
+
+        raw_text = page.text or ""  # falsy text (e.g. None) becomes ""
+        normalized_text = (  # unify CRLF and lone CR line endings to LF
+            raw_text
+            .replace("\r\n", "\n")
+            .replace("\r", "\n")
+        )
+        blocks = [
+            block.strip()
+            for block in normalized_text.split("\n\n")  # blank-line separated
+            if block.strip() and not block.strip().startswith("<!-- PAGE ")
+        ]  # drops empty blocks and blocks that start with "<!-- PAGE "
+        if not blocks:
+            continue  # nothing usable on this page; chunks stay as they were
+
+        page.chunks = [
+            {
+                "content": block,
+                "bbox": [0, 0, 0, 0],  # placeholder; no real geometry known
+                "label": "text",
+            }
+            for block in blocks
+        ]
+        page.bboxes = [chunk["bbox"] for chunk in page.chunks]  # same lists
+
+
+
@dataclass
 class PipelineResult:
    """Complete result from the leaudit pipeline."""
@@ -89,6 +126,7 @@ class LauditPipeline:
        t0 = time.time()
        log.info("[%d] OCR starting: %s", document_id, file_path.name)
        ocr_result = await self._run_ocr(file_path)
+        _ensure_text_page_chunks(ocr_result)
        timing["ocr"] = round(time.time() - t0, 2)
        log.info(
            "[%d] OCR done: %d pages, %.1fs",
@@ -140,9 +178,6 @@ class LauditPipeline:
            timing["extraction"],
        )

-        # --- Save extraction result ---
-        await self.storage.save_extraction_result(document_id, extraction_bundle)
-
        # --- Resolve field positions from OCR chunks ---
        from leaudit.extraction.coordinate_resolver import resolve_bundle_positions
        resolve_bundle_positions(extraction_bundle, ocr_result)
@@ -156,6 +191,9 @@ class LauditPipeline:
            len(extraction_bundle.fields),
        )

+        # --- Save extraction result ---
+        await self.storage.save_extraction_result(document_id, extraction_bundle)
+
        # --- Phase 4: Phase detection ---
        visual_manifest = extraction_bundle.visual_manifest or ocr_result.visual_manifest
        detected_phase = await determine_phase(