fix: improve page quality vlm detection

This commit is contained in:
wren
2026-05-22 14:41:42 +08:00
parent 842b362150
commit 9434f2b22b
3 changed files with 159 additions and 7 deletions
@@ -239,7 +239,7 @@ class ResilientQwenVLMClient(QwenVLMClient):
body = response.json() body = response.json()
text = (body.get("choices") or [{}])[0].get("message", {}).get("content", "") text = (body.get("choices") or [{}])[0].get("message", {}).get("content", "")
parsed = _parse_json_loose(text) parsed = _parse_json_loose(text)
return parsed if isinstance(parsed, dict) else {} return parsed if isinstance(parsed, dict) else {"result": text, "reason": text}
class ResilientChandraOCRClient(ChandraOCRClient): class ResilientChandraOCRClient(ChandraOCRClient):
@@ -6,6 +6,7 @@ from typing import Any
from pathlib import Path from pathlib import Path
import tempfile import tempfile
import logging import logging
import json
import fitz import fitz
from leaudit.converters import doc2pdf from leaudit.converters import doc2pdf
@@ -30,9 +31,11 @@ _PAGE_QUALITY_VLM_PROMPT = """
你是文档扫描图片质量检测员。请判断这 1 页文档图片是否适合继续做 OCR 与合同/公文评查。 你是文档扫描图片质量检测员。请判断这 1 页文档图片是否适合继续做 OCR 与合同/公文评查。
判定标准: 判定标准:
1. pass:文字主体清晰、方向正常、没有明显截断,能稳定阅读 1. 必须同时检查整页扫描质量,以及页面内所有内嵌照片、证据照片、现场照片、截图、印章和签名图片的清晰度
2. review:存在轻微模糊、倾斜、阴影、低对比度、局部遮挡、轻微截断,建议人工确认但仍可能可读 2. pass:文字主体清晰、方向正常、没有明显截断;页面内嵌照片/证据照片也能辨认关键视觉信息
3. reject:严重模糊、重影、过曝/过暗、页面大面积缺失、关键文字不可辨认、方向严重错误、空白页或非文档页,建议重拍 3. review:存在轻微模糊、倾斜、阴影、低对比度、局部遮挡、轻微截断;或内嵌照片/证据照片主体明显发虚、牌匾/场所/人物/关键物证不易辨认,建议人工确认但仍可能可用
4. reject:严重模糊、重影、过曝/过暗、页面大面积缺失、关键文字不可辨认、方向严重错误、空白页或非文档页;或内嵌证据照片主体无法辨认、关键证据信息不可用,建议重拍。
5. 即使页面周边文字清楚,只要内嵌证据照片明显模糊,也不能判 pass,至少判 review,严重时判 reject。
只输出 JSON,不要输出 Markdown,不要解释额外文本: 只输出 JSON,不要输出 Markdown,不要解释额外文本:
{"status":"pass|review|reject","score":0.0到1.0,"reason":"20字以内中文原因"} {"status":"pass|review|reject","score":0.0到1.0,"reason":"20字以内中文原因"}
@@ -495,12 +498,28 @@ class PageQualityServiceImpl(IPageQualityService):
logger.warning("VLM page quality detection failed: %s", exc) logger.warning("VLM page quality detection failed: %s", exc)
return "review", 0.5, "VLM图片质量检测失败,需人工确认" return "review", 0.5, "VLM图片质量检测失败,需人工确认"
status = str((result or {}).get("status") or "").strip().lower() result_dict = self._coerce_vlm_result(result)
status = self._normalize_quality_status(
self._first_non_empty(
result_dict,
("status", "quality_status", "qualityStatus", "result", "label", "decision", "conclusion"),
)
)
reason = self._normalize_quality_reason(
self._first_non_empty(
result_dict,
("reason", "quality_reason", "qualityReason", "message", "msg", "detail", "explanation", "description"),
)
)
if status is None and reason:
status = self._normalize_quality_status(reason)
if status not in {"pass", "review", "reject"}: if status not in {"pass", "review", "reject"}:
return "review", 0.5, "VLM返回结果不可用,需人工确认" return "review", 0.5, "VLM返回结果不可用,需人工确认"
score = self._normalize_quality_score((result or {}).get("score"), status) score = self._normalize_quality_score(
reason = str((result or {}).get("reason") or "").strip() or None self._first_non_empty(result_dict, ("score", "quality_score", "qualityScore", "confidence")),
status,
)
if status != "pass" and not reason: if status != "pass" and not reason:
reason = "页面图片质量需人工确认" reason = "页面图片质量需人工确认"
return status, score, reason return status, score, reason
@@ -526,6 +545,56 @@ class PageQualityServiceImpl(IPageQualityService):
return defaults[status] return defaults[status]
return max(0.0, min(1.0, score)) return max(0.0, min(1.0, score))
def _coerce_vlm_result(self, result: Any) -> dict[str, Any]:
if isinstance(result, dict):
return result
if isinstance(result, str):
text_result = result.strip()
if not text_result:
return {}
try:
parsed = json.loads(text_result)
except json.JSONDecodeError:
return {"result": text_result, "reason": text_result}
return parsed if isinstance(parsed, dict) else {"result": text_result}
return {}
def _first_non_empty(self, payload: dict[str, Any], keys: tuple[str, ...]) -> Any:
for key in keys:
value = payload.get(key)
if value is not None and str(value).strip():
return value
return None
def _normalize_quality_status(self, raw_status: Any) -> str | None:
text_status = str(raw_status or "").strip().lower()
if not text_status:
return None
compact_status = text_status.replace(" ", "").replace("_", "").replace("-", "")
if compact_status in {"pass", "passed", "ok", "good", "clear", "readable"}:
return "pass"
if compact_status in {"review", "warn", "warning", "manual", "uncertain", "suspect", "suspicious"}:
return "review"
if compact_status in {"reject", "rejected", "fail", "failed", "bad", "unreadable", "retake"}:
return "reject"
reject_keywords = ("不通过", "拒绝", "重拍", "不可读", "无法辨认", "无法识别", "严重", "大面积缺失", "空白页")
review_keywords = ("复核", "人工", "疑似", "轻微", "建议确认", "建议人工", "模糊", "不清晰", "低对比", "发虚")
pass_keywords = ("通过", "合格", "清晰", "可读")
if any(keyword in text_status for keyword in reject_keywords):
return "reject"
if any(keyword in text_status for keyword in review_keywords):
return "review"
if any(keyword in text_status for keyword in pass_keywords):
return "pass"
return None
def _normalize_quality_reason(self, raw_reason: Any) -> str | None:
reason = str(raw_reason or "").strip()
if not reason:
return None
return reason[:80]
def _document_service(self): def _document_service(self):
if self.DocumentService is None: if self.DocumentService is None:
from fastapi_modules.fastapi_leaudit.services.impl.documentServiceImpl import DocumentServiceImpl from fastapi_modules.fastapi_leaudit.services.impl.documentServiceImpl import DocumentServiceImpl
+83
View File
@@ -1,5 +1,7 @@
import pytest import pytest
import httpx
from fastapi_modules.fastapi_leaudit.leaudit_bridge.resilient_clients import ResilientQwenVLMClient
from fastapi_modules.fastapi_leaudit.services.impl.pageQualityServiceImpl import PageQualityServiceImpl from fastapi_modules.fastapi_leaudit.services.impl.pageQualityServiceImpl import PageQualityServiceImpl
@@ -32,6 +34,58 @@ async def test_vlm_page_quality_reject_result_is_used():
assert score == 0.18 assert score == 0.18
assert "严重模糊" in reason assert "严重模糊" in reason
assert "只输出 JSON" in service.VlmClient.prompts[0][0] assert "只输出 JSON" in service.VlmClient.prompts[0][0]
assert "内嵌照片" in service.VlmClient.prompts[0][0]
assert "即使页面周边文字清楚" in service.VlmClient.prompts[0][0]
@pytest.mark.asyncio
async def test_vlm_page_quality_embedded_evidence_blur_cannot_pass():
    """A "suspected blur" answer using aliased keys must come back as review.

    The fake VLM reply uses ``quality_status`` / ``quality_score`` /
    ``message`` instead of the canonical keys and reports the score as a
    string.
    """
    vlm_reply = {
        "quality_status": "疑似模糊",
        "quality_score": "0.42",
        "message": "内嵌证据照片主体发虚,门头文字不易辨认",
    }
    service = PageQualityServiceImpl()
    service.VlmClient = _FakeVlmClient(vlm_reply)

    verdict = await service._classify_page_image_by_vlm(b"image-bytes")
    status, score, reason = verdict

    assert status == "review"
    assert score == 0.42
    assert "内嵌证据照片" in reason
@pytest.mark.asyncio
async def test_vlm_page_quality_chinese_reject_status_is_supported():
    """A Chinese "不通过" verdict under the ``result`` key must map to reject.

    The score is supplied under ``confidence`` and the reason under
    ``detail``.
    """
    vlm_reply = {
        "result": "不通过",
        "confidence": 0.1,
        "detail": "证据照片严重模糊,关键场所无法辨认",
    }
    service = PageQualityServiceImpl()
    service.VlmClient = _FakeVlmClient(vlm_reply)

    verdict = await service._classify_page_image_by_vlm(b"image-bytes")
    status, score, reason = verdict

    assert status == "reject"
    assert score == 0.1
    assert "严重模糊" in reason
@pytest.mark.asyncio
async def test_vlm_page_quality_json_string_result_is_supported():
    """A VLM reply delivered as a raw JSON string must be decoded and used."""
    raw_json_reply = '{"status":"review","score":0.33,"reason":"页面内照片模糊"}'
    service = PageQualityServiceImpl()
    service.VlmClient = _FakeVlmClient(raw_json_reply)

    verdict = await service._classify_page_image_by_vlm(b"image-bytes")
    status, score, reason = verdict

    assert status == "review"
    assert score == 0.33
    assert reason == "页面内照片模糊"
@pytest.mark.asyncio @pytest.mark.asyncio
@@ -56,3 +110,32 @@ async def test_vlm_page_quality_error_falls_back_to_review_not_pass():
assert status == "review" assert status == "review"
assert score == 0.5 assert score == 0.5
assert "VLM图片质量检测失败" in reason assert "VLM图片质量检测失败" in reason
@pytest.mark.asyncio
async def test_resilient_vlm_extract_multifield_keeps_raw_text_when_json_parse_fails(monkeypatch):
    """Non-JSON model output must surface as raw text in ``result`` and ``reason``."""
    plain_text_answer = "疑似模糊:内嵌证据照片主体发虚,建议人工复核"
    canned_body = {
        "choices": [
            {
                "message": {
                    "content": plain_text_answer,
                }
            }
        ]
    }

    async def stub_post(payload):
        # Stand-in for the HTTP call: always answers 200 with the canned body.
        return httpx.Response(200, json=canned_body)

    client = ResilientQwenVLMClient(base_url="http://example.test", api_key="x", model="vlm-test")
    monkeypatch.setattr(client, "_post_with_retry", stub_post)

    result = await client.extract_multifield(
        prompt="图片质量检测",
        images_data_urls=["data:image/png;base64,xxx"],
    )

    assert result["result"].startswith("疑似模糊")
    assert "内嵌证据照片" in result["reason"]