"""Main leaudit pipeline orchestrator: OCR → Extract → Evaluate. Uses leaudit's own pipeline directly (no conversion), stores results into docauditai's database via StorageAdapter. """ from __future__ import annotations import logging import time from dataclasses import dataclass, field from pathlib import Path from typing import Any from leaudit.dsl.schema import RulesFile from leaudit.engine.case_file_evaluator import evaluate_extraction from leaudit.engine.models import EvaluationResult from leaudit.extraction.bundle import ExtractionBundle from leaudit.extraction.dispatcher import dispatch_extract from leaudit.extraction.phase_detection import determine_phase from leaudit.llm.base import BaseLLMClient from leaudit.ocr.base import BaseOCRClient from leaudit.ocr.models import OcrResult from fastapi_modules.fastapi_leaudit.leaudit_bridge.storage_adapter import StorageAdapter log = logging.getLogger(__name__) def _ensure_text_page_chunks(ocr_result: OcrResult) -> None: """Backfill pseudo chunks for text-native pages that have no OCR chunks. DOCX/legacy-doc normalization currently produces page text but often no geometric chunks, which causes ``resolve_bundle_positions`` to return zero positions for every extracted field. We synthesize coarse text chunks so at least page-level定位 can be recovered on the review page. """ for page in ocr_result.pages: if page.chunks: continue raw_text = page.text or "" normalized_text = ( raw_text .replace("\r\n", "\n") .replace("\r", "\n") ) blocks = [ block.strip() for block in normalized_text.split("\n\n") if block.strip() and not block.strip().startswith("