fix: synthesize docx field location chunks
This commit is contained in:
@@ -28,6 +28,43 @@ from fastapi_modules.fastapi_leaudit.leaudit_bridge.storage_adapter import Stora
|
||||
log = logging.getLogger(__name__)
|
||||
|
||||
|
||||
def _ensure_text_page_chunks(ocr_result: OcrResult) -> None:
|
||||
"""Backfill pseudo chunks for text-native pages that have no OCR chunks.
|
||||
|
||||
DOCX/legacy-doc normalization currently produces page text but often no
|
||||
geometric chunks, which causes ``resolve_bundle_positions`` to return zero
|
||||
positions for every extracted field. We synthesize coarse text chunks so at
|
||||
least page-level定位 can be recovered on the review page.
|
||||
"""
|
||||
for page in ocr_result.pages:
|
||||
if page.chunks:
|
||||
continue
|
||||
|
||||
raw_text = page.text or ""
|
||||
normalized_text = (
|
||||
raw_text
|
||||
.replace("\r\n", "\n")
|
||||
.replace("\r", "\n")
|
||||
)
|
||||
blocks = [
|
||||
block.strip()
|
||||
for block in normalized_text.split("\n\n")
|
||||
if block.strip() and not block.strip().startswith("<!-- PAGE ")
|
||||
]
|
||||
if not blocks:
|
||||
continue
|
||||
|
||||
page.chunks = [
|
||||
{
|
||||
"content": block,
|
||||
"bbox": [0, 0, 0, 0],
|
||||
"label": "text",
|
||||
}
|
||||
for block in blocks
|
||||
]
|
||||
page.bboxes = [chunk["bbox"] for chunk in page.chunks]
|
||||
|
||||
|
||||
@dataclass
|
||||
class PipelineResult:
|
||||
"""Complete result from the leaudit pipeline."""
|
||||
@@ -89,6 +126,7 @@ class LauditPipeline:
|
||||
t0 = time.time()
|
||||
log.info("[%d] OCR starting: %s", document_id, file_path.name)
|
||||
ocr_result = await self._run_ocr(file_path)
|
||||
_ensure_text_page_chunks(ocr_result)
|
||||
timing["ocr"] = round(time.time() - t0, 2)
|
||||
log.info(
|
||||
"[%d] OCR done: %d pages, %.1fs",
|
||||
@@ -140,9 +178,6 @@ class LauditPipeline:
|
||||
timing["extraction"],
|
||||
)
|
||||
|
||||
# --- Save extraction result ---
|
||||
await self.storage.save_extraction_result(document_id, extraction_bundle)
|
||||
|
||||
# --- Resolve field positions from OCR chunks ---
|
||||
from leaudit.extraction.coordinate_resolver import resolve_bundle_positions
|
||||
resolve_bundle_positions(extraction_bundle, ocr_result)
|
||||
@@ -156,6 +191,9 @@ class LauditPipeline:
|
||||
len(extraction_bundle.fields),
|
||||
)
|
||||
|
||||
# --- Save extraction result ---
|
||||
await self.storage.save_extraction_result(document_id, extraction_bundle)
|
||||
|
||||
# --- Phase 4: Phase detection ---
|
||||
visual_manifest = extraction_bundle.visual_manifest or ocr_result.visual_manifest
|
||||
detected_phase = await determine_phase(
|
||||
|
||||
@@ -109,14 +109,37 @@ class StorageAdapter:
|
||||
async with GetAsyncSession() as session:
|
||||
for name, fv in bundle.fields.items():
|
||||
field_data = extracted.get("fields", {}).get(name, {})
|
||||
raw_value = fv.raw_value if isinstance(fv, FieldValue) else None
|
||||
meta_json = {
|
||||
"position": _field_value_position_payload(fv),
|
||||
"reasons": list(fv.reasons or []),
|
||||
"type_name": fv.type_name,
|
||||
} if isinstance(fv, FieldValue) else None
|
||||
await session.execute(
|
||||
text("""INSERT INTO leaudit_field_results (run_id, document_id, field_name, value_text,
|
||||
confidence) VALUES (:rid, :did, :fn, :vt, :cf)
|
||||
text("""INSERT INTO leaudit_field_results (
|
||||
run_id, document_id, field_name, field_type, value_text,
|
||||
raw_value_json, confidence, logprob_score, grounding_score,
|
||||
grounding_method, rule_score, hard_failed, fallback_value, meta_json
|
||||
) VALUES (
|
||||
:rid, :did, :fn, :ft, :vt,
|
||||
CAST(:rv AS JSONB), :cf, :lp, :gs,
|
||||
:gm, :rs, :hf, :fv, CAST(:mj AS JSONB)
|
||||
)
|
||||
ON CONFLICT DO NOTHING"""),
|
||||
{
|
||||
"rid": resolved_run_id, "did": document_id,
|
||||
"fn": name, "vt": str(field_data.get("value", "")),
|
||||
"fn": name,
|
||||
"ft": fv.type_name if isinstance(fv, FieldValue) else None,
|
||||
"vt": str(field_data.get("value", "")),
|
||||
"rv": json.dumps(raw_value, ensure_ascii=False) if raw_value is not None else None,
|
||||
"cf": float(field_data.get("confidence", 0)),
|
||||
"lp": float(fv.logprob or 0) if isinstance(fv, FieldValue) else None,
|
||||
"gs": float(fv.grounding or 0) if isinstance(fv, FieldValue) else None,
|
||||
"gm": fv.grounding_method if isinstance(fv, FieldValue) else None,
|
||||
"rs": float(fv.rule_score or 0) if isinstance(fv, FieldValue) else None,
|
||||
"hf": bool(fv.hard_failed) if isinstance(fv, FieldValue) else None,
|
||||
"fv": fv.fallback_value if isinstance(fv, FieldValue) else None,
|
||||
"mj": json.dumps(meta_json, ensure_ascii=False) if meta_json else None,
|
||||
},
|
||||
)
|
||||
await session.commit()
|
||||
|
||||
Reference in New Issue
Block a user