fix: preserve review field page positions in platform
This commit is contained in:
@@ -107,6 +107,7 @@ class NativeRunner:
|
|||||||
await self.storage.save_extraction_result(
|
await self.storage.save_extraction_result(
|
||||||
document_id,
|
document_id,
|
||||||
ctx.extraction,
|
ctx.extraction,
|
||||||
|
ocr_result=ctx.normalized_doc,
|
||||||
run_id=run_id,
|
run_id=run_id,
|
||||||
)
|
)
|
||||||
if ctx.evaluation is not None and ctx.rules_file is not None and ctx.extraction is not None:
|
if ctx.evaluation is not None and ctx.rules_file is not None and ctx.extraction is not None:
|
||||||
@@ -115,6 +116,7 @@ class NativeRunner:
|
|||||||
ctx.rules_file,
|
ctx.rules_file,
|
||||||
ctx.evaluation,
|
ctx.evaluation,
|
||||||
ctx.extraction,
|
ctx.extraction,
|
||||||
|
ocr_result=ctx.normalized_doc,
|
||||||
run_id=run_id,
|
run_id=run_id,
|
||||||
rule_version_id=result.metadata.rule_version_id,
|
rule_version_id=result.metadata.rule_version_id,
|
||||||
)
|
)
|
||||||
|
|||||||
@@ -99,11 +99,13 @@ class StorageAdapter:
|
|||||||
self,
|
self,
|
||||||
document_id: int,
|
document_id: int,
|
||||||
bundle: ExtractionBundle,
|
bundle: ExtractionBundle,
|
||||||
|
ocr_result: OcrResult | None = None,
|
||||||
*,
|
*,
|
||||||
run_id: int | None = None,
|
run_id: int | None = None,
|
||||||
) -> None:
|
) -> None:
|
||||||
"""Save extraction result to leaudit_field_results table."""
|
"""Save extraction result to leaudit_field_results table."""
|
||||||
extracted = _bundle_to_extracted(bundle)
|
inferred_positions = _build_inferred_field_positions(bundle, ocr_result)
|
||||||
|
extracted = _bundle_to_extracted(bundle, inferred_positions=inferred_positions)
|
||||||
resolved_run_id = await self._ensure_run_id(document_id, run_id)
|
resolved_run_id = await self._ensure_run_id(document_id, run_id)
|
||||||
|
|
||||||
async with GetAsyncSession() as session:
|
async with GetAsyncSession() as session:
|
||||||
@@ -111,7 +113,8 @@ class StorageAdapter:
|
|||||||
field_data = extracted.get("fields", {}).get(name, {})
|
field_data = extracted.get("fields", {}).get(name, {})
|
||||||
raw_value = fv.raw_value if isinstance(fv, FieldValue) else None
|
raw_value = fv.raw_value if isinstance(fv, FieldValue) else None
|
||||||
meta_json = {
|
meta_json = {
|
||||||
"position": _field_value_position_payload(fv),
|
"position": (_field_value_position_payload(fv) if isinstance(fv, FieldValue) else None)
|
||||||
|
or inferred_positions.get(name),
|
||||||
"reasons": list(fv.reasons or []),
|
"reasons": list(fv.reasons or []),
|
||||||
"type_name": fv.type_name,
|
"type_name": fv.type_name,
|
||||||
} if isinstance(fv, FieldValue) else None
|
} if isinstance(fv, FieldValue) else None
|
||||||
@@ -153,6 +156,7 @@ class StorageAdapter:
|
|||||||
rules_file: RulesFile,
|
rules_file: RulesFile,
|
||||||
evaluation: EvaluationResult,
|
evaluation: EvaluationResult,
|
||||||
bundle: ExtractionBundle,
|
bundle: ExtractionBundle,
|
||||||
|
ocr_result: OcrResult | None = None,
|
||||||
*,
|
*,
|
||||||
run_id: int | None = None,
|
run_id: int | None = None,
|
||||||
rule_version_id: int | None = None,
|
rule_version_id: int | None = None,
|
||||||
@@ -163,6 +167,7 @@ class StorageAdapter:
|
|||||||
then inserts fresh rows.
|
then inserts fresh rows.
|
||||||
"""
|
"""
|
||||||
resolved_run_id = await self._ensure_run_id(document_id, run_id)
|
resolved_run_id = await self._ensure_run_id(document_id, run_id)
|
||||||
|
inferred_positions = _build_inferred_field_positions(bundle, ocr_result)
|
||||||
async with GetAsyncSession() as session:
|
async with GetAsyncSession() as session:
|
||||||
# Delete existing results for this document+run
|
# Delete existing results for this document+run
|
||||||
await session.execute(
|
await session.execute(
|
||||||
@@ -178,7 +183,14 @@ class StorageAdapter:
|
|||||||
# Insert one row per rule result
|
# Insert one row per rule result
|
||||||
for rule_result in evaluation.rules:
|
for rule_result in evaluation.rules:
|
||||||
rule = rule_meta.get(rule_result.rule_id)
|
rule = rule_meta.get(rule_result.rule_id)
|
||||||
row = _rule_result_to_row(document_id, resolved_run_id, rule_result, rule, bundle)
|
row = _rule_result_to_row(
|
||||||
|
document_id,
|
||||||
|
resolved_run_id,
|
||||||
|
rule_result,
|
||||||
|
rule,
|
||||||
|
bundle,
|
||||||
|
inferred_positions=inferred_positions,
|
||||||
|
)
|
||||||
if rule_version_id is not None:
|
if rule_version_id is not None:
|
||||||
row["rule_version_id"] = rule_version_id
|
row["rule_version_id"] = rule_version_id
|
||||||
json_columns = {"stages", "extracted_fields", "field_positions", "remediation", "rule_meta"}
|
json_columns = {"stages", "extracted_fields", "field_positions", "remediation", "rule_meta"}
|
||||||
@@ -550,8 +562,13 @@ def _ocr_to_dict(ocr: OcrResult) -> dict[str, Any]:
|
|||||||
return result
|
return result
|
||||||
|
|
||||||
|
|
||||||
def _bundle_to_extracted(bundle: ExtractionBundle) -> dict[str, Any]:
|
def _bundle_to_extracted(
|
||||||
|
bundle: ExtractionBundle,
|
||||||
|
*,
|
||||||
|
inferred_positions: dict[str, dict[str, Any]] | None = None,
|
||||||
|
) -> dict[str, Any]:
|
||||||
"""Convert ExtractionBundle to docauditai's extracted_results format."""
|
"""Convert ExtractionBundle to docauditai's extracted_results format."""
|
||||||
|
inferred_positions = inferred_positions or {}
|
||||||
fields: dict[str, Any] = {}
|
fields: dict[str, Any] = {}
|
||||||
for name, fv in bundle.fields.items():
|
for name, fv in bundle.fields.items():
|
||||||
if isinstance(fv, FieldValue):
|
if isinstance(fv, FieldValue):
|
||||||
@@ -559,7 +576,7 @@ def _bundle_to_extracted(bundle: ExtractionBundle) -> dict[str, Any]:
|
|||||||
"value": fv.value,
|
"value": fv.value,
|
||||||
"confidence": float(fv.confidence) if fv.confidence else 0.0,
|
"confidence": float(fv.confidence) if fv.confidence else 0.0,
|
||||||
}
|
}
|
||||||
position_payload = _field_value_position_payload(fv)
|
position_payload = _field_value_position_payload(fv) or inferred_positions.get(name)
|
||||||
if position_payload is not None:
|
if position_payload is not None:
|
||||||
field_data["position"] = position_payload
|
field_data["position"] = position_payload
|
||||||
fields[name] = field_data
|
fields[name] = field_data
|
||||||
@@ -633,8 +650,11 @@ def _extract_relevant_fields(rule: Any, bundle: ExtractionBundle) -> dict[str, A
|
|||||||
def _extract_relevant_field_positions(
|
def _extract_relevant_field_positions(
|
||||||
rule: Any,
|
rule: Any,
|
||||||
bundle: ExtractionBundle,
|
bundle: ExtractionBundle,
|
||||||
|
*,
|
||||||
|
inferred_positions: dict[str, dict[str, Any]] | None = None,
|
||||||
) -> dict[str, Any]:
|
) -> dict[str, Any]:
|
||||||
"""Extract position data for fields referenced by a rule's stages."""
|
"""Extract position data for fields referenced by a rule's stages."""
|
||||||
|
inferred_positions = inferred_positions or {}
|
||||||
positions: dict[str, Any] = {}
|
positions: dict[str, Any] = {}
|
||||||
if not rule or not hasattr(rule, "stages") or not rule.stages:
|
if not rule or not hasattr(rule, "stages") or not rule.stages:
|
||||||
return positions
|
return positions
|
||||||
@@ -671,7 +691,7 @@ def _extract_relevant_field_positions(
|
|||||||
continue
|
continue
|
||||||
fv = bundle.fields.get(f)
|
fv = bundle.fields.get(f)
|
||||||
if fv is not None and isinstance(fv, FieldValue):
|
if fv is not None and isinstance(fv, FieldValue):
|
||||||
position_payload = _field_value_position_payload(fv)
|
position_payload = _field_value_position_payload(fv) or inferred_positions.get(f)
|
||||||
if position_payload is not None:
|
if position_payload is not None:
|
||||||
positions[f] = position_payload
|
positions[f] = position_payload
|
||||||
return positions
|
return positions
|
||||||
@@ -703,12 +723,183 @@ def _field_value_position_payload(fv: FieldValue) -> dict[str, Any] | None:
|
|||||||
return payload or None
|
return payload or None
|
||||||
|
|
||||||
|
|
||||||
|
_POSITION_PUNCT_TABLE = str.maketrans({
|
||||||
|
",": ",", "。": ".", ";": ";", ":": ":",
|
||||||
|
"!": "!", "?": "?", "(": "(", ")": ")",
|
||||||
|
"【": "[", "】": "]", "「": '"', "」": '"',
|
||||||
|
"‘": "'", "’": "'", "“": '"', "”": '"',
|
||||||
|
})
|
||||||
|
|
||||||
|
|
||||||
|
def _build_inferred_field_positions(
|
||||||
|
bundle: ExtractionBundle,
|
||||||
|
ocr_result: OcrResult | None,
|
||||||
|
) -> dict[str, dict[str, Any]]:
|
||||||
|
"""平台侧兜底补齐字段页码,避免 bridge 落库后丢失定位页。"""
|
||||||
|
if ocr_result is None or not ocr_result.pages:
|
||||||
|
return {}
|
||||||
|
|
||||||
|
inferred: dict[str, dict[str, Any]] = {}
|
||||||
|
page_texts = [
|
||||||
|
(int(page.page_num) + 1, str(page.text or ""))
|
||||||
|
for page in ocr_result.pages
|
||||||
|
if str(page.text or "").strip()
|
||||||
|
]
|
||||||
|
if not page_texts:
|
||||||
|
return {}
|
||||||
|
|
||||||
|
for field_name, field_value in bundle.fields.items():
|
||||||
|
if not isinstance(field_value, FieldValue):
|
||||||
|
continue
|
||||||
|
if _field_value_position_payload(field_value) is not None:
|
||||||
|
continue
|
||||||
|
|
||||||
|
value_text = _stringify_position_value(field_value.value)
|
||||||
|
if not value_text:
|
||||||
|
continue
|
||||||
|
|
||||||
|
chunk_match = _infer_position_from_chunks(value_text, ocr_result)
|
||||||
|
if chunk_match is not None:
|
||||||
|
inferred[field_name] = chunk_match
|
||||||
|
continue
|
||||||
|
|
||||||
|
page_num = _infer_page_num_from_page_texts(value_text, page_texts)
|
||||||
|
if page_num is not None:
|
||||||
|
inferred[field_name] = {
|
||||||
|
"pageNum": page_num,
|
||||||
|
"matchMethod": "platform_bridge_page_fallback",
|
||||||
|
}
|
||||||
|
|
||||||
|
return inferred
|
||||||
|
|
||||||
|
|
||||||
|
def _infer_position_from_chunks(
|
||||||
|
value_text: str,
|
||||||
|
ocr_result: OcrResult,
|
||||||
|
) -> dict[str, Any] | None:
|
||||||
|
normalized_value = _normalize_position_text(value_text)
|
||||||
|
if not normalized_value:
|
||||||
|
return None
|
||||||
|
|
||||||
|
for page in ocr_result.pages:
|
||||||
|
for chunk in page.chunks or []:
|
||||||
|
content = ""
|
||||||
|
bbox = None
|
||||||
|
if isinstance(chunk, dict):
|
||||||
|
content = str(chunk.get("content") or "")
|
||||||
|
bbox = chunk.get("bbox")
|
||||||
|
else:
|
||||||
|
content = str(getattr(chunk, "content", "") or "")
|
||||||
|
bbox = getattr(chunk, "bbox", None)
|
||||||
|
|
||||||
|
if not content:
|
||||||
|
continue
|
||||||
|
if normalized_value not in _normalize_position_text(content):
|
||||||
|
continue
|
||||||
|
|
||||||
|
payload: dict[str, Any] = {
|
||||||
|
"pageNum": int(page.page_num) + 1,
|
||||||
|
"matchMethod": "platform_bridge_chunk_fallback",
|
||||||
|
}
|
||||||
|
if bbox:
|
||||||
|
payload["bbox"] = bbox
|
||||||
|
return payload
|
||||||
|
return None
|
||||||
|
|
||||||
|
|
||||||
|
def _infer_page_num_from_page_texts(
|
||||||
|
value_text: str,
|
||||||
|
page_texts: list[tuple[int, str]],
|
||||||
|
) -> int | None:
|
||||||
|
normalized_value = _normalize_position_text(value_text)
|
||||||
|
if not normalized_value:
|
||||||
|
return None
|
||||||
|
|
||||||
|
best_page: int | None = None
|
||||||
|
best_score = 0.0
|
||||||
|
min_length_for_fuzzy = 8
|
||||||
|
|
||||||
|
for page_num, page_text in page_texts:
|
||||||
|
normalized_page = _normalize_position_text(page_text)
|
||||||
|
if not normalized_page:
|
||||||
|
continue
|
||||||
|
if normalized_value in normalized_page:
|
||||||
|
return page_num
|
||||||
|
|
||||||
|
if len(normalized_value) < min_length_for_fuzzy:
|
||||||
|
continue
|
||||||
|
|
||||||
|
score = _partial_similarity(normalized_value, normalized_page)
|
||||||
|
if score > best_score:
|
||||||
|
best_score = score
|
||||||
|
best_page = page_num
|
||||||
|
|
||||||
|
if best_score >= 0.92:
|
||||||
|
return best_page
|
||||||
|
return None
|
||||||
|
|
||||||
|
|
||||||
|
def _normalize_position_text(text: str) -> str:
|
||||||
|
if not text:
|
||||||
|
return ""
|
||||||
|
normalized = re.sub(r"<[^>]+>", "", str(text))
|
||||||
|
normalized = re.sub(r"\s+", "", normalized)
|
||||||
|
return normalized.translate(_POSITION_PUNCT_TABLE)
|
||||||
|
|
||||||
|
|
||||||
|
def _partial_similarity(needle: str, haystack: str) -> float:
|
||||||
|
if not needle or not haystack:
|
||||||
|
return 0.0
|
||||||
|
if len(needle) > len(haystack):
|
||||||
|
needle, haystack = haystack, needle
|
||||||
|
|
||||||
|
window = len(needle)
|
||||||
|
if window <= 0:
|
||||||
|
return 0.0
|
||||||
|
if needle == haystack:
|
||||||
|
return 1.0
|
||||||
|
|
||||||
|
best = 0.0
|
||||||
|
step = max(1, window // 6)
|
||||||
|
stop = max(len(haystack) - window + 1, 1)
|
||||||
|
for start in range(0, stop, step):
|
||||||
|
chunk = haystack[start : start + window]
|
||||||
|
if not chunk:
|
||||||
|
continue
|
||||||
|
matches = sum(1 for left, right in zip(needle, chunk) if left == right)
|
||||||
|
best = max(best, matches / window)
|
||||||
|
if best >= 0.999:
|
||||||
|
return 1.0
|
||||||
|
return best
|
||||||
|
|
||||||
|
|
||||||
|
def _stringify_position_value(raw_value: Any) -> str:
|
||||||
|
if raw_value is None:
|
||||||
|
return ""
|
||||||
|
if isinstance(raw_value, str):
|
||||||
|
return raw_value.strip()
|
||||||
|
if isinstance(raw_value, (int, float, bool)):
|
||||||
|
return str(raw_value)
|
||||||
|
if isinstance(raw_value, dict):
|
||||||
|
for key in ("value", "text", "value_text"):
|
||||||
|
value = raw_value.get(key)
|
||||||
|
if isinstance(value, str) and value.strip():
|
||||||
|
return value.strip()
|
||||||
|
return ""
|
||||||
|
if isinstance(raw_value, list):
|
||||||
|
parts = [_stringify_position_value(item) for item in raw_value]
|
||||||
|
return " ".join(part for part in parts if part).strip()
|
||||||
|
return str(raw_value).strip()
|
||||||
|
|
||||||
|
|
||||||
def _rule_result_to_row(
|
def _rule_result_to_row(
|
||||||
document_id: int,
|
document_id: int,
|
||||||
run_id: int | None,
|
run_id: int | None,
|
||||||
rule_result: RuleResult,
|
rule_result: RuleResult,
|
||||||
rule: Any | None,
|
rule: Any | None,
|
||||||
bundle: ExtractionBundle,
|
bundle: ExtractionBundle,
|
||||||
|
*,
|
||||||
|
inferred_positions: dict[str, dict[str, Any]] | None = None,
|
||||||
) -> dict[str, Any]:
|
) -> dict[str, Any]:
|
||||||
"""Convert a RuleResult to a leaudit_rule_results row."""
|
"""Convert a RuleResult to a leaudit_rule_results row."""
|
||||||
passed = rule_result.passed
|
passed = rule_result.passed
|
||||||
@@ -760,7 +951,11 @@ def _rule_result_to_row(
|
|||||||
"fail_message": fail_msg,
|
"fail_message": fail_msg,
|
||||||
"stages": [s.model_dump(mode="json") for s in (rule_result.stages or [])],
|
"stages": [s.model_dump(mode="json") for s in (rule_result.stages or [])],
|
||||||
"extracted_fields": relevant_fields,
|
"extracted_fields": relevant_fields,
|
||||||
"field_positions": _extract_relevant_field_positions(rule, bundle),
|
"field_positions": _extract_relevant_field_positions(
|
||||||
|
rule,
|
||||||
|
bundle,
|
||||||
|
inferred_positions=inferred_positions,
|
||||||
|
),
|
||||||
"remediation": remediation,
|
"remediation": remediation,
|
||||||
"rule_meta": rule_meta_data,
|
"rule_meta": rule_meta_data,
|
||||||
}
|
}
|
||||||
|
|||||||
@@ -2,6 +2,7 @@
|
|||||||
|
|
||||||
from __future__ import annotations
|
from __future__ import annotations
|
||||||
|
|
||||||
|
import json
|
||||||
from typing import Any
|
from typing import Any
|
||||||
from datetime import date as date_type, datetime
|
from datetime import date as date_type, datetime
|
||||||
import hashlib
|
import hashlib
|
||||||
@@ -143,6 +144,7 @@ class DocumentServiceImpl(IDocumentService):
|
|||||||
resolvedTypeId = int(typeRow["id"])
|
resolvedTypeId = int(typeRow["id"])
|
||||||
resolvedTypeCode = str(typeRow["code"])
|
resolvedTypeCode = str(typeRow["code"])
|
||||||
resolvedGroupId = await self._resolveDocumentGroupId(Session, resolvedTypeId, GroupId)
|
resolvedGroupId = await self._resolveDocumentGroupId(Session, resolvedTypeId, GroupId)
|
||||||
|
resolvedRootGroupId = await self._resolveDocumentRootGroupId(Session, resolvedTypeId, resolvedGroupId)
|
||||||
duplicateUpload = False
|
duplicateUpload = False
|
||||||
previousVersionId: int | None = None
|
previousVersionId: int | None = None
|
||||||
rootVersionId: int | None = None
|
rootVersionId: int | None = None
|
||||||
@@ -154,6 +156,7 @@ class DocumentServiceImpl(IDocumentService):
|
|||||||
latestCandidate = await _find_latest_version_candidate(
|
latestCandidate = await _find_latest_version_candidate(
|
||||||
Session,
|
Session,
|
||||||
type_id=resolvedTypeId,
|
type_id=resolvedTypeId,
|
||||||
|
root_group_id=resolvedRootGroupId,
|
||||||
region=normalizedRegion,
|
region=normalizedRegion,
|
||||||
normalized_name=normalizedName,
|
normalized_name=normalizedName,
|
||||||
)
|
)
|
||||||
@@ -1547,6 +1550,50 @@ class DocumentServiceImpl(IDocumentService):
|
|||||||
raise LeauditException(StatusCodeEnum.HTTP_400_BAD_REQUEST, "当前子类型不属于所选文档类型,无法上传")
|
raise LeauditException(StatusCodeEnum.HTTP_400_BAD_REQUEST, "当前子类型不属于所选文档类型,无法上传")
|
||||||
return int(row["id"])
|
return int(row["id"])
|
||||||
|
|
||||||
|
async def _resolveDocumentRootGroupId(self, Session, TypeId: int, GroupId: int | None) -> int | None:
|
||||||
|
"""解析上传命中的一级分组,用于跨二级类型做版本归档。"""
|
||||||
|
if GroupId is not None:
|
||||||
|
row = (
|
||||||
|
await Session.execute(
|
||||||
|
text(
|
||||||
|
"""
|
||||||
|
SELECT
|
||||||
|
CASE
|
||||||
|
WHEN COALESCE(pid, 0) = 0 THEN id
|
||||||
|
ELSE pid
|
||||||
|
END AS root_group_id
|
||||||
|
FROM leaudit_evaluation_point_groups
|
||||||
|
WHERE id = :group_id
|
||||||
|
AND deleted_at IS NULL
|
||||||
|
LIMIT 1
|
||||||
|
"""
|
||||||
|
),
|
||||||
|
{"group_id": GroupId},
|
||||||
|
)
|
||||||
|
).mappings().first()
|
||||||
|
if row and row["root_group_id"] is not None:
|
||||||
|
return int(row["root_group_id"])
|
||||||
|
|
||||||
|
row = (
|
||||||
|
await Session.execute(
|
||||||
|
text(
|
||||||
|
"""
|
||||||
|
SELECT
|
||||||
|
CASE WHEN COUNT(DISTINCT pid) = 1 THEN MIN(pid) END AS root_group_id
|
||||||
|
FROM leaudit_evaluation_point_groups
|
||||||
|
WHERE document_type_id = :doc_type_id
|
||||||
|
AND COALESCE(pid, 0) <> 0
|
||||||
|
AND deleted_at IS NULL
|
||||||
|
AND is_enabled = true
|
||||||
|
"""
|
||||||
|
),
|
||||||
|
{"doc_type_id": TypeId},
|
||||||
|
)
|
||||||
|
).mappings().first()
|
||||||
|
if row and row["root_group_id"] is not None:
|
||||||
|
return int(row["root_group_id"])
|
||||||
|
return None
|
||||||
|
|
||||||
async def _getDocumentDetail(
|
async def _getDocumentDetail(
|
||||||
self,
|
self,
|
||||||
Session,
|
Session,
|
||||||
@@ -2220,6 +2267,12 @@ class DocumentServiceImpl(IDocumentService):
|
|||||||
{"run_id": RunId, "document_id": Detail.documentId},
|
{"run_id": RunId, "document_id": Detail.documentId},
|
||||||
)
|
)
|
||||||
).mappings().all()
|
).mappings().all()
|
||||||
|
rows = await self._backfillMissingReviewFieldPositions(
|
||||||
|
Session,
|
||||||
|
Detail.documentId,
|
||||||
|
RunId,
|
||||||
|
rows,
|
||||||
|
)
|
||||||
|
|
||||||
result: list[ReviewPointResultVO] = []
|
result: list[ReviewPointResultVO] = []
|
||||||
for row in rows:
|
for row in rows:
|
||||||
@@ -2273,6 +2326,197 @@ class DocumentServiceImpl(IDocumentService):
|
|||||||
)
|
)
|
||||||
return result
|
return result
|
||||||
|
|
||||||
|
async def _backfillMissingReviewFieldPositions(
|
||||||
|
self,
|
||||||
|
Session,
|
||||||
|
DocumentId: int,
|
||||||
|
RunId: int,
|
||||||
|
rows: list[dict[str, Any]],
|
||||||
|
) -> list[dict[str, Any]]:
|
||||||
|
"""详情页兜底补齐缺失页码,避免长文本字段只能按文本定位。"""
|
||||||
|
missingFields: dict[str, str] = {}
|
||||||
|
touchedRuleRows: dict[int, dict[str, Any]] = {}
|
||||||
|
|
||||||
|
for rawRow in rows:
|
||||||
|
row = dict(rawRow)
|
||||||
|
extractedFields = row["extracted_fields"] if isinstance(row.get("extracted_fields"), dict) else {}
|
||||||
|
fieldPositions = dict(row["field_positions"]) if isinstance(row.get("field_positions"), dict) else {}
|
||||||
|
row["field_positions"] = fieldPositions
|
||||||
|
|
||||||
|
rowTouched = False
|
||||||
|
for fieldName, rawValue in extractedFields.items():
|
||||||
|
if _extract_review_page(fieldPositions.get(fieldName)) is not None:
|
||||||
|
continue
|
||||||
|
valueText = _stringify_char_position_value(_normalize_review_value(rawValue))
|
||||||
|
if not valueText.strip():
|
||||||
|
continue
|
||||||
|
existing = missingFields.get(str(fieldName))
|
||||||
|
if existing is None or len(valueText) > len(existing):
|
||||||
|
missingFields[str(fieldName)] = valueText
|
||||||
|
rowTouched = True
|
||||||
|
|
||||||
|
if rowTouched and row.get("id") is not None:
|
||||||
|
touchedRuleRows[int(row["id"])] = row
|
||||||
|
|
||||||
|
if not missingFields:
|
||||||
|
return rows
|
||||||
|
|
||||||
|
inferredPositions = await self._inferFieldPositionsFromDocumentSource(
|
||||||
|
Session,
|
||||||
|
DocumentId,
|
||||||
|
missingFields,
|
||||||
|
)
|
||||||
|
if not inferredPositions:
|
||||||
|
return rows
|
||||||
|
|
||||||
|
for ruleRowId, row in touchedRuleRows.items():
|
||||||
|
fieldPositions = row["field_positions"] if isinstance(row.get("field_positions"), dict) else {}
|
||||||
|
changed = False
|
||||||
|
extractedFields = row["extracted_fields"] if isinstance(row.get("extracted_fields"), dict) else {}
|
||||||
|
for fieldName in extractedFields.keys():
|
||||||
|
key = str(fieldName)
|
||||||
|
if _extract_review_page(fieldPositions.get(key)) is not None:
|
||||||
|
continue
|
||||||
|
inferred = inferredPositions.get(key)
|
||||||
|
if inferred is None:
|
||||||
|
continue
|
||||||
|
fieldPositions[key] = inferred
|
||||||
|
changed = True
|
||||||
|
|
||||||
|
if not changed:
|
||||||
|
continue
|
||||||
|
|
||||||
|
await Session.execute(
|
||||||
|
text(
|
||||||
|
"""
|
||||||
|
UPDATE leaudit_rule_results
|
||||||
|
SET field_positions = CAST(:field_positions AS JSONB),
|
||||||
|
updated_at = NOW()
|
||||||
|
WHERE id = :rule_result_id
|
||||||
|
"""
|
||||||
|
),
|
||||||
|
{
|
||||||
|
"rule_result_id": ruleRowId,
|
||||||
|
"field_positions": json.dumps(fieldPositions, ensure_ascii=False),
|
||||||
|
},
|
||||||
|
)
|
||||||
|
|
||||||
|
for fieldName, position in inferredPositions.items():
|
||||||
|
await Session.execute(
|
||||||
|
text(
|
||||||
|
"""
|
||||||
|
UPDATE leaudit_field_results
|
||||||
|
SET meta_json = jsonb_set(
|
||||||
|
COALESCE(meta_json, '{}'::jsonb),
|
||||||
|
'{position}',
|
||||||
|
CAST(:position AS JSONB),
|
||||||
|
true
|
||||||
|
),
|
||||||
|
updated_at = NOW()
|
||||||
|
WHERE run_id = :run_id
|
||||||
|
AND document_id = :document_id
|
||||||
|
AND field_name = :field_name
|
||||||
|
AND (
|
||||||
|
meta_json IS NULL
|
||||||
|
OR meta_json->'position' IS NULL
|
||||||
|
OR meta_json->'position' = 'null'::jsonb
|
||||||
|
)
|
||||||
|
"""
|
||||||
|
),
|
||||||
|
{
|
||||||
|
"run_id": RunId,
|
||||||
|
"document_id": DocumentId,
|
||||||
|
"field_name": fieldName,
|
||||||
|
"position": json.dumps(position, ensure_ascii=False),
|
||||||
|
},
|
||||||
|
)
|
||||||
|
|
||||||
|
await Session.commit()
|
||||||
|
return rows
|
||||||
|
|
||||||
|
async def _inferFieldPositionsFromDocumentSource(
|
||||||
|
self,
|
||||||
|
Session,
|
||||||
|
DocumentId: int,
|
||||||
|
FieldTexts: dict[str, str],
|
||||||
|
) -> dict[str, dict[str, Any]]:
|
||||||
|
"""按当前文档源文件逐页匹配字段文本,推导最小可用 pageNum。"""
|
||||||
|
fileRow = (
|
||||||
|
await Session.execute(
|
||||||
|
text(
|
||||||
|
"""
|
||||||
|
SELECT
|
||||||
|
id,
|
||||||
|
file_name,
|
||||||
|
file_ext,
|
||||||
|
local_path,
|
||||||
|
oss_url,
|
||||||
|
file_role
|
||||||
|
FROM leaudit_document_files
|
||||||
|
WHERE document_id = :document_id
|
||||||
|
AND is_active = true
|
||||||
|
ORDER BY
|
||||||
|
CASE file_role
|
||||||
|
WHEN 'converted_pdf' THEN 0
|
||||||
|
WHEN 'merged_pdf' THEN 1
|
||||||
|
WHEN 'primary' THEN 2
|
||||||
|
ELSE 9
|
||||||
|
END,
|
||||||
|
id DESC
|
||||||
|
LIMIT 1
|
||||||
|
"""
|
||||||
|
),
|
||||||
|
{"document_id": DocumentId},
|
||||||
|
)
|
||||||
|
).mappings().first()
|
||||||
|
if not fileRow:
|
||||||
|
return {}
|
||||||
|
|
||||||
|
fileName = str(fileRow["file_name"] or "")
|
||||||
|
fileExt = f".{str(fileRow['file_ext']).lstrip('.')}" if fileRow.get("file_ext") else Path(fileName).suffix
|
||||||
|
tempPath: str | None = None
|
||||||
|
localPath = str(fileRow["local_path"] or "").strip()
|
||||||
|
try:
|
||||||
|
if localPath and Path(localPath).is_file():
|
||||||
|
sourcePath = Path(localPath)
|
||||||
|
else:
|
||||||
|
ossUrl = str(fileRow["oss_url"] or "").strip()
|
||||||
|
if not ossUrl:
|
||||||
|
return {}
|
||||||
|
downloaded = await self.OssService.DownloadToTempFile(
|
||||||
|
ossUrl,
|
||||||
|
Suffix=fileExt or Path(fileName).suffix or "",
|
||||||
|
Prefix=f"review-page-{DocumentId}-",
|
||||||
|
)
|
||||||
|
tempPath = downloaded
|
||||||
|
sourcePath = Path(downloaded)
|
||||||
|
|
||||||
|
suffix = sourcePath.suffix.lower()
|
||||||
|
if suffix == ".pdf":
|
||||||
|
pageTexts = _extract_page_texts_from_pdf(sourcePath)
|
||||||
|
elif suffix == ".docx":
|
||||||
|
pageTexts = _extract_page_texts_from_docx(sourcePath)
|
||||||
|
else:
|
||||||
|
return {}
|
||||||
|
if not pageTexts:
|
||||||
|
return {}
|
||||||
|
|
||||||
|
inferred: dict[str, dict[str, Any]] = {}
|
||||||
|
for fieldName, fieldText in FieldTexts.items():
|
||||||
|
pageNum = _infer_page_num_from_page_texts(fieldText, pageTexts)
|
||||||
|
if pageNum is None:
|
||||||
|
continue
|
||||||
|
inferred[fieldName] = {"pageNum": pageNum, "matchMethod": "detail_page_fallback"}
|
||||||
|
return inferred
|
||||||
|
except Exception:
|
||||||
|
return {}
|
||||||
|
finally:
|
||||||
|
if tempPath:
|
||||||
|
try:
|
||||||
|
Path(tempPath).unlink(missing_ok=True)
|
||||||
|
except OSError:
|
||||||
|
pass
|
||||||
|
|
||||||
def _buildReviewPointStats(self, ReviewPoints: list[ReviewPointResultVO]) -> ReviewPointStatsVO:
|
def _buildReviewPointStats(self, ReviewPoints: list[ReviewPointResultVO]) -> ReviewPointStatsVO:
|
||||||
"""按旧详情页口径汇总统计。"""
|
"""按旧详情页口径汇总统计。"""
|
||||||
stats = ReviewPointStatsVO(total=len(ReviewPoints), success=0, warning=0, error=0, score=0)
|
stats = ReviewPointStatsVO(total=len(ReviewPoints), success=0, warning=0, error=0, score=0)
|
||||||
@@ -2341,10 +2585,71 @@ async def _find_latest_version_candidate(
|
|||||||
session,
|
session,
|
||||||
*,
|
*,
|
||||||
type_id: int,
|
type_id: int,
|
||||||
|
root_group_id: int | None,
|
||||||
region: str,
|
region: str,
|
||||||
normalized_name: str,
|
normalized_name: str,
|
||||||
) -> dict | None:
|
) -> dict | None:
|
||||||
"""Find the latest primary document version candidate by normalized name."""
|
"""Find the latest primary document version candidate by normalized name.
|
||||||
|
|
||||||
|
Preferred rule: same region + same root group + same normalized name.
|
||||||
|
Fallback rule: when a root group cannot be resolved, keep the old same-type behavior.
|
||||||
|
"""
|
||||||
|
if root_group_id is not None:
|
||||||
|
result = await session.execute(
|
||||||
|
text(
|
||||||
|
"""
|
||||||
|
SELECT
|
||||||
|
d.id AS document_id,
|
||||||
|
d.version_group_key,
|
||||||
|
d.version_no,
|
||||||
|
d.root_version_id,
|
||||||
|
f.id AS file_id,
|
||||||
|
f.sha256
|
||||||
|
FROM leaudit_documents d
|
||||||
|
JOIN leaudit_document_files f
|
||||||
|
ON f.document_id = d.id
|
||||||
|
AND f.is_active = true
|
||||||
|
AND f.file_role = 'primary'
|
||||||
|
LEFT JOIN leaudit_evaluation_point_groups eg
|
||||||
|
ON eg.id = d.group_id
|
||||||
|
LEFT JOIN (
|
||||||
|
SELECT
|
||||||
|
document_type_id,
|
||||||
|
CASE WHEN COUNT(DISTINCT pid) = 1 THEN MIN(pid) END AS inferred_root_group_id
|
||||||
|
FROM leaudit_evaluation_point_groups
|
||||||
|
WHERE COALESCE(pid, 0) <> 0
|
||||||
|
AND deleted_at IS NULL
|
||||||
|
AND is_enabled = true
|
||||||
|
AND document_type_id IS NOT NULL
|
||||||
|
GROUP BY document_type_id
|
||||||
|
) dg
|
||||||
|
ON dg.document_type_id = d.type_id
|
||||||
|
WHERE d.region = :region
|
||||||
|
AND d.normalized_name = :normalized_name
|
||||||
|
AND d.is_latest_version = true
|
||||||
|
AND d.deleted_at IS NULL
|
||||||
|
AND COALESCE(
|
||||||
|
CASE
|
||||||
|
WHEN eg.id IS NULL THEN NULL
|
||||||
|
WHEN COALESCE(eg.pid, 0) = 0 THEN eg.id
|
||||||
|
ELSE eg.pid
|
||||||
|
END,
|
||||||
|
dg.inferred_root_group_id
|
||||||
|
) = :root_group_id
|
||||||
|
ORDER BY d.version_no DESC, d.id DESC
|
||||||
|
LIMIT 1
|
||||||
|
"""
|
||||||
|
),
|
||||||
|
{
|
||||||
|
"root_group_id": root_group_id,
|
||||||
|
"region": region,
|
||||||
|
"normalized_name": normalized_name,
|
||||||
|
},
|
||||||
|
)
|
||||||
|
row = result.mappings().first()
|
||||||
|
if row:
|
||||||
|
return dict(row)
|
||||||
|
|
||||||
result = await session.execute(
|
result = await session.execute(
|
||||||
text(
|
text(
|
||||||
"""
|
"""
|
||||||
@@ -2595,6 +2900,119 @@ def _stringify_char_position_value(raw_value: Any) -> str:
|
|||||||
return ""
|
return ""
|
||||||
|
|
||||||
|
|
||||||
|
_REVIEW_TEXT_PUNCT_TABLE = str.maketrans({
|
||||||
|
",": ",", "。": ".", ";": ";", ":": ":",
|
||||||
|
"!": "!", "?": "?", "(": "(", ")": ")",
|
||||||
|
"【": "[", "】": "]", "「": '"', "」": '"',
|
||||||
|
"‘": "'", "’": "'", "“": '"', "”": '"',
|
||||||
|
})
|
||||||
|
|
||||||
|
|
||||||
|
def _normalize_review_match_text(text: str) -> str:
|
||||||
|
"""统一比较文本,尽量消除 OCR/文档解析带来的空白和标点差异。"""
|
||||||
|
if not text:
|
||||||
|
return ""
|
||||||
|
normalized = re.sub(r"<[^>]+>", "", str(text))
|
||||||
|
normalized = re.sub(r"\s+", "", normalized)
|
||||||
|
return normalized.translate(_REVIEW_TEXT_PUNCT_TABLE)
|
||||||
|
|
||||||
|
|
||||||
|
def _infer_page_num_from_page_texts(field_text: str, page_texts: list[tuple[int, str]]) -> int | None:
|
||||||
|
"""根据字段文本在逐页正文中的命中情况推导 1-based 页码。"""
|
||||||
|
normalizedField = _normalize_review_match_text(field_text)
|
||||||
|
if not normalizedField:
|
||||||
|
return None
|
||||||
|
|
||||||
|
bestPage: int | None = None
|
||||||
|
bestScore = 0.0
|
||||||
|
fieldLength = len(normalizedField)
|
||||||
|
|
||||||
|
for pageNum, pageText in page_texts:
|
||||||
|
normalizedPage = _normalize_review_match_text(pageText)
|
||||||
|
if not normalizedPage:
|
||||||
|
continue
|
||||||
|
if normalizedField in normalizedPage:
|
||||||
|
return pageNum
|
||||||
|
|
||||||
|
if fieldLength < 8:
|
||||||
|
continue
|
||||||
|
|
||||||
|
ratio = _partial_similarity(normalizedField, normalizedPage)
|
||||||
|
if ratio > bestScore:
|
||||||
|
bestScore = ratio
|
||||||
|
bestPage = pageNum
|
||||||
|
|
||||||
|
if bestScore >= 0.92:
|
||||||
|
return bestPage
|
||||||
|
return None
|
||||||
|
|
||||||
|
|
||||||
|
def _partial_similarity(needle: str, haystack: str) -> float:
|
||||||
|
"""简化版 partial similarity,避免详情页兜底依赖额外包。"""
|
||||||
|
if not needle or not haystack:
|
||||||
|
return 0.0
|
||||||
|
if len(needle) > len(haystack):
|
||||||
|
needle, haystack = haystack, needle
|
||||||
|
|
||||||
|
window = len(needle)
|
||||||
|
if window <= 0:
|
||||||
|
return 0.0
|
||||||
|
if needle == haystack:
|
||||||
|
return 1.0
|
||||||
|
|
||||||
|
best = 0.0
|
||||||
|
step = max(1, window // 6)
|
||||||
|
stop = max(len(haystack) - window + 1, 1)
|
||||||
|
for start in range(0, stop, step):
|
||||||
|
chunk = haystack[start : start + window]
|
||||||
|
if not chunk:
|
||||||
|
continue
|
||||||
|
matches = sum(1 for left, right in zip(needle, chunk) if left == right)
|
||||||
|
best = max(best, matches / window)
|
||||||
|
if best >= 0.999:
|
||||||
|
return 1.0
|
||||||
|
|
||||||
|
return best
|
||||||
|
|
||||||
|
|
||||||
|
def _extract_page_texts_from_pdf(path: Path) -> list[tuple[int, str]]:
|
||||||
|
"""平台侧直接从 PDF 提取逐页文本,避免依赖 leaudit 内核。"""
|
||||||
|
import fitz
|
||||||
|
|
||||||
|
doc = fitz.open(path)
|
||||||
|
try:
|
||||||
|
page_texts: list[tuple[int, str]] = []
|
||||||
|
for index in range(len(doc)):
|
||||||
|
text = doc[index].get_text("text") or ""
|
||||||
|
if text.strip():
|
||||||
|
page_texts.append((index + 1, text))
|
||||||
|
return page_texts
|
||||||
|
finally:
|
||||||
|
doc.close()
|
||||||
|
|
||||||
|
|
||||||
|
def _extract_page_texts_from_docx(path: Path) -> list[tuple[int, str]]:
|
||||||
|
"""DOCX 无稳定真实分页时,平台侧仅退化为整文第 1 页定位。"""
|
||||||
|
from docx import Document as DocxDocument
|
||||||
|
|
||||||
|
doc = DocxDocument(str(path))
|
||||||
|
parts: list[str] = []
|
||||||
|
|
||||||
|
for para in doc.paragraphs:
|
||||||
|
text = (para.text or "").strip()
|
||||||
|
if text:
|
||||||
|
parts.append(text)
|
||||||
|
|
||||||
|
for table in doc.tables:
|
||||||
|
for row in table.rows:
|
||||||
|
cells = [cell.text.strip() for cell in row.cells if cell.text and cell.text.strip()]
|
||||||
|
if cells:
|
||||||
|
parts.append(" | ".join(cells))
|
||||||
|
|
||||||
|
text = "\n".join(parts).strip()
|
||||||
|
return [(1, text)] if text else []
|
||||||
|
|
||||||
|
|
||||||
def _format_display_datetime(value: Any) -> str:
|
def _format_display_datetime(value: Any) -> str:
|
||||||
"""格式化详情页展示时间。"""
|
"""格式化详情页展示时间。"""
|
||||||
if isinstance(value, datetime):
|
if isinstance(value, datetime):
|
||||||
|
|||||||
Reference in New Issue
Block a user