Files
leaudit-platform-backend/fastapi_modules/fastapi_leaudit/leaudit_bridge/ctx_builder.py
T
wren 535d97a70c chore: initial commit — leaudit-platform project skeleton
17-table PostgreSQL schema with full Chinese column comments,
FastAPI project structure (admin/common/modules),
DSL rule files, and schema migration scripts.
2026-04-27 16:48:22 +08:00

133 lines
4.1 KiB
Python

"""Build leaudit execution context from docauditai document data.
Currently leaudit's pipeline in docauditai bypasses leaudit's own
``AuditCtx`` / ``AuditService`` and calls engine modules directly.
This module encapsulates the pre-execution setup that currently lives
inlined in ``pipeline.py`` and ``tasks.py``:

- Resolve local file path (download from OSS to temp if needed)
- Determine RulesFile (from document metadata, type binding, or
  content classification)
- Prepare OCR/LLM/VLM client references
"""
from __future__ import annotations
import logging
import os
import tempfile
from dataclasses import dataclass, field
from pathlib import Path
from typing import TYPE_CHECKING
from leaudit.dsl.schema import RulesFile
from leaudit.llm.base import BaseLLMClient
from leaudit.ocr.base import BaseOCRClient
if TYPE_CHECKING:
from leaudit.llm.vlm_base import BaseVLMClient
log = logging.getLogger(__name__)
@dataclass
class ExecutionContext:
    """Everything leaudit needs to run for one document.

    Instances own at most one temporary file (``tmp_path``); call
    :meth:`cleanup` once the run is finished to remove it.
    """

    # docauditai document ID.
    document_id: int
    # Local path of the document file to process.
    file_path: Path
    # Loaded rules for this document.
    rules_file: RulesFile
    # OCR client reference.
    ocr_client: BaseOCRClient
    # Optional LLM client reference.
    llm_client: BaseLLMClient | None = None
    # Optional VLM client reference (typed loosely as ``object``).
    vlm_client: object | None = None
    # Instance port.
    source_port: int = 8000
    # Temporary file to delete in cleanup(); None when nothing was created.
    tmp_path: Path | None = None
    # Free-form extra data; each instance gets its own empty dict.
    metadata: dict = field(default_factory=dict)

    def cleanup(self) -> None:
        """Remove the temporary file if one was created.

        Best-effort: this method never raises. A file that is already
        gone is ignored silently (so calling it twice is harmless); any
        other ``OSError`` is logged as a warning so that leaked temp
        files stay diagnosable instead of disappearing without a trace.
        """
        if self.tmp_path is None:
            return
        try:
            os.remove(self.tmp_path)
        except FileNotFoundError:
            # Already removed — nothing left to clean up.
            pass
        except OSError as exc:
            # Looked up by module name, so the class does not depend on
            # any module-level global being present.
            logging.getLogger(__name__).warning(
                "Failed to remove temp file %s: %s", self.tmp_path, exc
            )
class CtxBuilder:
    """Build :class:`ExecutionContext` from docauditai document data.

    Handles the glue between docauditai's document model and leaudit's
    execution expectations. In this class that means resolving the
    document to a local file path (writing raw bytes to a temp file when
    needed) and passing the rules file and client references through.
    """

    def __init__(
        self,
        ocr_client: BaseOCRClient | None = None,
        llm_client: BaseLLMClient | None = None,
        vlm_client: object | None = None,
    ) -> None:
        """Store the client references copied into every built context.

        Args:
            ocr_client: OCR client reference, or None.
            llm_client: LLM client reference, or None.
            vlm_client: VLM client reference, or None.
        """
        self.ocr_client = ocr_client
        self.llm_client = llm_client
        self.vlm_client = vlm_client

    async def build(
        self,
        document_id: int,
        file_path: str | Path | None = None,
        file_content: bytes | None = None,
        filename: str | None = None,
        rules_file: RulesFile | None = None,
        *,
        source_port: int = 8000,
    ) -> ExecutionContext:
        """Build a ready-to-use execution context.

        At least one of *file_path* or (*file_content* + *filename*)
        must be provided. When both are given, *file_path* wins and no
        temp file is created.

        Args:
            document_id: docauditai document ID.
            file_path: Existing local path to the document file.
            file_content: Raw bytes (from DB or OSS) — a temp file is
                created and recorded as ``tmp_path`` on the result.
            filename: Required when *file_content* is given; only its
                extension is used (as the temp file suffix).
            rules_file: Pre-loaded RulesFile. When None, the caller
                must resolve after OCR classification.
            source_port: Instance port.

        Returns:
            ExecutionContext ready for pipeline.run().

        Raises:
            ValueError: If neither *file_path* nor the
                (*file_content*, *filename*) pair is supplied.
        """
        tmp_path: Path | None = None
        if file_path is not None:
            resolved = Path(file_path)
        elif file_content is not None and filename is not None:
            # NOTE: the write below is synchronous file I/O.
            tmp_path = self._write_temp_file(
                file_content, self._suffix(filename)
            )
            resolved = tmp_path
        else:
            raise ValueError(
                "Either file_path or (file_content + filename) is required"
            )
        return ExecutionContext(
            document_id=document_id,
            file_path=resolved,
            rules_file=rules_file,  # type: ignore[arg-type]
            ocr_client=self.ocr_client,  # type: ignore[arg-type]
            llm_client=self.llm_client,
            vlm_client=self.vlm_client,
            source_port=source_port,
            tmp_path=tmp_path,
        )

    @staticmethod
    def _write_temp_file(content: bytes, suffix: str) -> Path:
        """Write *content* to a new persistent temp file and return its path.

        The file is created with ``delete=False`` so it outlives the
        handle. If the write fails, the handle is closed and the
        half-written file is removed before the error propagates, so a
        failed build leaves nothing behind on disk.
        """
        handle = tempfile.NamedTemporaryFile(suffix=suffix, delete=False)
        target = Path(handle.name)
        try:
            with handle:
                handle.write(content)
        except BaseException:
            # The handle is already closed here (the ``with`` exited),
            # so removal also works on platforms that lock open files.
            try:
                os.remove(target)
            except OSError:
                pass  # best-effort; the original error matters more
            raise
        return target

    @staticmethod
    def _suffix(filename: str) -> str:
        """Return the extension of *filename*, or ``.pdf`` if it has none."""
        _, ext = os.path.splitext(filename)
        return ext if ext else ".pdf"