From 201e3adc1850f5d7074d088e852404a03ad556c2 Mon Sep 17 00:00:00 2001
From: wren <porlong@qq.com>
Date: Wed, 6 May 2026 11:36:02 +0800
Subject: [PATCH] fix: synthesize docx field location chunks

---
 .../leaudit_bridge/pipeline.py                | 44 +++++++++++++++++--
 .../leaudit_bridge/storage_adapter.py         | 29 ++++++++++--
 2 files changed, 67 insertions(+), 6 deletions(-)

diff --git a/fastapi_modules/fastapi_leaudit/leaudit_bridge/pipeline.py b/fastapi_modules/fastapi_leaudit/leaudit_bridge/pipeline.py
index 8802d66..aa82ce7 100644
--- a/fastapi_modules/fastapi_leaudit/leaudit_bridge/pipeline.py
+++ b/fastapi_modules/fastapi_leaudit/leaudit_bridge/pipeline.py
@@ -28,6 +28,43 @@ from fastapi_modules.fastapi_leaudit.leaudit_bridge.storage_adapter import Stora
 log = logging.getLogger(__name__)
 
 
+def _ensure_text_page_chunks(ocr_result: OcrResult) -> None:
+    """Backfill pseudo chunks for text-native pages that have no OCR chunks.
+
+    DOCX/legacy-doc normalization currently produces page text but often no
+    geometric chunks, which causes ``resolve_bundle_positions`` to return zero
+    positions for every extracted field. We synthesize coarse text chunks so at
+    least page-level positioning can be recovered on the review page.
+    """
+    for page in ocr_result.pages:
+        if page.chunks:  # page already has chunks; nothing to backfill
+            continue
+
+        raw_text = page.text or ""
+        normalized_text = (
+            raw_text
+            .replace("\r\n", "\n")  # normalize CRLF first, then bare CR, to LF
+            .replace("\r", "\n")
+        )
+        blocks = [
+            block.strip()
+            for block in normalized_text.split("\n\n")  # split on blank lines
+            if block.strip() and not block.strip().startswith("<!-- PAGE ")  # drop empty / "<!-- PAGE " blocks
+        ]
+        if not blocks:  # no usable text; leave the page's chunks untouched
+            continue
+
+        page.chunks = [
+            {
+                "content": block,
+                "bbox": [0, 0, 0, 0],  # all-zero box; presumably a placeholder as no geometry exists (confirm)
+                "label": "text",
+            }
+            for block in blocks
+        ]
+        page.bboxes = [chunk["bbox"] for chunk in page.chunks]  # reuses each chunk's own bbox list object
+
+
+
 @dataclass
 class PipelineResult:
     """Complete result from the leaudit pipeline."""
@@ -89,6 +126,7 @@ class LauditPipeline:
         t0 = time.time()
         log.info("[%d] OCR starting: %s", document_id, file_path.name)
         ocr_result = await self._run_ocr(file_path)
+        _ensure_text_page_chunks(ocr_result)
         timing["ocr"] = round(time.time() - t0, 2)
         log.info(
             "[%d] OCR done: %d pages, %.1fs",
@@ -140,9 +178,6 @@ class LauditPipeline:
             timing["extraction"],
         )
 
-        # --- Save extraction result ---
-        await self.storage.save_extraction_result(document_id, extraction_bundle)
-
         # --- Resolve field positions from OCR chunks ---
         from leaudit.extraction.coordinate_resolver import resolve_bundle_positions
         resolve_bundle_positions(extraction_bundle, ocr_result)
@@ -156,6 +191,9 @@ class LauditPipeline:
             len(extraction_bundle.fields),
         )
 
+        # --- Save extraction result ---
+        await self.storage.save_extraction_result(document_id, extraction_bundle)
+
         # --- Phase 4: Phase detection ---
         visual_manifest = extraction_bundle.visual_manifest or ocr_result.visual_manifest
         detected_phase = await determine_phase(
diff --git a/fastapi_modules/fastapi_leaudit/leaudit_bridge/storage_adapter.py b/fastapi_modules/fastapi_leaudit/leaudit_bridge/storage_adapter.py
index 85aa061..499b3e9 100644
--- a/fastapi_modules/fastapi_leaudit/leaudit_bridge/storage_adapter.py
+++ b/fastapi_modules/fastapi_leaudit/leaudit_bridge/storage_adapter.py
@@ -109,14 +109,37 @@ class StorageAdapter:
         async with GetAsyncSession() as session:
             for name, fv in bundle.fields.items():
                 field_data = extracted.get("fields", {}).get(name, {})
+                raw_value = fv.raw_value if isinstance(fv, FieldValue) else None
+                meta_json = {
+                    "position": _field_value_position_payload(fv),
+                    "reasons": list(fv.reasons or []),
+                    "type_name": fv.type_name,
+                } if isinstance(fv, FieldValue) else None
                 await session.execute(
-                    text("""INSERT INTO leaudit_field_results (run_id, document_id, field_name, value_text,
-                        confidence) VALUES (:rid, :did, :fn, :vt, :cf)
+                    text("""INSERT INTO leaudit_field_results (
+                        run_id, document_id, field_name, field_type, value_text,
+                        raw_value_json, confidence, logprob_score, grounding_score,
+                        grounding_method, rule_score, hard_failed, fallback_value, meta_json
+                    ) VALUES (
+                        :rid, :did, :fn, :ft, :vt,
+                        CAST(:rv AS JSONB), :cf, :lp, :gs,
+                        :gm, :rs, :hf, :fv, CAST(:mj AS JSONB)
+                    )
                         ON CONFLICT DO NOTHING"""),
                     {
                         "rid": resolved_run_id, "did": document_id,
-                        "fn": name, "vt": str(field_data.get("value", "")),
+                        "fn": name,
+                        "ft": fv.type_name if isinstance(fv, FieldValue) else None,
+                        "vt": str(field_data.get("value", "")),
+                        "rv": json.dumps(raw_value, ensure_ascii=False) if raw_value is not None else None,
                         "cf": float(field_data.get("confidence", 0)),
+                        "lp": float(fv.logprob or 0) if isinstance(fv, FieldValue) else None,
+                        "gs": float(fv.grounding or 0) if isinstance(fv, FieldValue) else None,
+                        "gm": fv.grounding_method if isinstance(fv, FieldValue) else None,
+                        "rs": float(fv.rule_score or 0) if isinstance(fv, FieldValue) else None,
+                        "hf": bool(fv.hard_failed) if isinstance(fv, FieldValue) else None,
+                        "fv": fv.fallback_value if isinstance(fv, FieldValue) else None,
+                        "mj": json.dumps(meta_json, ensure_ascii=False) if meta_json else None,
                     },
                 )
             await session.commit()