Merge pull request 'fix: 稳定后端服务与结构比对链路' (#8) from wren-dev into main
Reviewed-on: #8
This commit was merged in pull request #8.
This commit is contained in:
@@ -74,6 +74,9 @@ SIGNATURE_PROBE_RETRY_BACKOFF_BASE_SECONDS = 0.5
|
|||||||
WORKER_QUEUE_URGENT = "leaudit.urgent"
|
WORKER_QUEUE_URGENT = "leaudit.urgent"
|
||||||
WORKER_QUEUE_NORMAL = "leaudit.normal"
|
WORKER_QUEUE_NORMAL = "leaudit.normal"
|
||||||
WORKER_CONCURRENCY = 2
|
WORKER_CONCURRENCY = 2
|
||||||
|
PAGE_QUALITY_ENABLED = true
|
||||||
|
PAGE_QUALITY_QUEUE_URGENT = "leaudit.page_quality.urgent"
|
||||||
|
PAGE_QUALITY_QUEUE_NORMAL = "leaudit.page_quality.normal"
|
||||||
RUN_LOCK_SECONDS = 1800
|
RUN_LOCK_SECONDS = 1800
|
||||||
TASK_SOFT_TIME_LIMIT = 3300
|
TASK_SOFT_TIME_LIMIT = 3300
|
||||||
TASK_TIME_LIMIT = 3600
|
TASK_TIME_LIMIT = 3600
|
||||||
|
|||||||
@@ -508,6 +508,7 @@ class ContractTemplateServiceImpl(IContractTemplateService):
|
|||||||
ORDER BY c.name ASC
|
ORDER BY c.name ASC
|
||||||
"""
|
"""
|
||||||
)
|
)
|
||||||
|
(sql,) = self._bind_expanding(sql, params)
|
||||||
rows = (await session.execute(sql, params)).mappings().all()
|
rows = (await session.execute(sql, params)).mappings().all()
|
||||||
|
|
||||||
return [
|
return [
|
||||||
|
|||||||
@@ -26,6 +26,18 @@ from fastapi_modules.fastapi_leaudit.services.impl.ossServiceImpl import OssServ
|
|||||||
|
|
||||||
logger = logging.getLogger(__name__)
|
logger = logging.getLogger(__name__)
|
||||||
|
|
||||||
|
_PAGE_QUALITY_VLM_PROMPT = """
|
||||||
|
你是文档扫描图片质量检测员。请判断这 1 页文档图片是否适合继续做 OCR 与合同/公文评查。
|
||||||
|
|
||||||
|
判定标准:
|
||||||
|
1. pass:文字主体清晰、方向正常、没有明显截断,能稳定阅读。
|
||||||
|
2. review:存在轻微模糊、倾斜、阴影、低对比度、局部遮挡、轻微截断,建议人工确认但仍可能可读。
|
||||||
|
3. reject:严重模糊、重影、过曝/过暗、页面大面积缺失、关键文字不可辨认、方向严重错误、空白页或非文档页,建议重拍。
|
||||||
|
|
||||||
|
只输出 JSON,不要输出 Markdown,不要解释额外文本:
|
||||||
|
{"status":"pass|review|reject","score":0.0到1.0,"reason":"20字以内中文原因"}
|
||||||
|
""".strip()
|
||||||
|
|
||||||
|
|
||||||
class PageQualityServiceImpl(IPageQualityService):
|
class PageQualityServiceImpl(IPageQualityService):
|
||||||
"""页级图片质量服务实现。"""
|
"""页级图片质量服务实现。"""
|
||||||
@@ -33,6 +45,7 @@ class PageQualityServiceImpl(IPageQualityService):
|
|||||||
def __init__(self) -> None:
|
def __init__(self) -> None:
|
||||||
self.OssService = OssServiceImpl()
|
self.OssService = OssServiceImpl()
|
||||||
self.DocumentService = None
|
self.DocumentService = None
|
||||||
|
self.VlmClient = None
|
||||||
|
|
||||||
async def DispatchForDocument(
|
async def DispatchForDocument(
|
||||||
self,
|
self,
|
||||||
@@ -282,7 +295,7 @@ class PageQualityServiceImpl(IPageQualityService):
|
|||||||
reject_pages = 0
|
reject_pages = 0
|
||||||
async with GetAsyncSession() as session:
|
async with GetAsyncSession() as session:
|
||||||
for page_num, page_image in page_images:
|
for page_num, page_image in page_images:
|
||||||
status, score, reason = self._classify_page_image(page_image)
|
status, score, reason = await self._classify_page_image_by_vlm(page_image)
|
||||||
if status == "review":
|
if status == "review":
|
||||||
review_pages += 1
|
review_pages += 1
|
||||||
elif status == "reject":
|
elif status == "reject":
|
||||||
@@ -466,13 +479,52 @@ class PageQualityServiceImpl(IPageQualityService):
|
|||||||
finally:
|
finally:
|
||||||
doc.close()
|
doc.close()
|
||||||
|
|
||||||
async def _classify_page_image_by_vlm(self, image_bytes: bytes) -> tuple[str, float, str | None]:
    """Judge the quality of a single page image with the VLM.

    A VLM problem must never be reported as a pass: a missing client, a
    failed call, a result that is not a mapping, or an unrecognised status
    all come back as ``"review"`` with a score of 0.5.

    Args:
        image_bytes: Encoded image of one page.

    Returns:
        ``(status, score, reason)``. ``status`` is ``"pass"``, ``"review"``
        or ``"reject"``; ``score`` is the VLM score as normalised by
        ``_normalize_quality_score``; ``reason`` may be ``None`` only when
        ``status`` is ``"pass"``.
    """
    from collections.abc import Mapping

    client = self._vlm_client()
    if client is None:
        return "review", 0.5, "VLM未配置,需人工确认图片质量"

    try:
        result = await client.extract_multifield(
            prompt=_PAGE_QUALITY_VLM_PROMPT,
            images_data_urls=[self._image_data_url(image_bytes)],
            max_tokens=300,
        )
    except Exception as exc:
        # Deliberately broad: any client failure is downgraded to manual review.
        logger.warning("VLM page quality detection failed: %s", exc)
        return "review", 0.5, "VLM图片质量检测失败,需人工确认"

    # A list/str/None answer has no usable fields; calling .get() on it
    # would raise outside the try block above, so reject the shape up front.
    if not isinstance(result, Mapping):
        return "review", 0.5, "VLM返回结果不可用,需人工确认"

    status = str(result.get("status") or "").strip().lower()
    if status not in {"pass", "review", "reject"}:
        return "review", 0.5, "VLM返回结果不可用,需人工确认"

    score = self._normalize_quality_score(result.get("score"), status)
    reason = str(result.get("reason") or "").strip() or None
    if status != "pass" and not reason:
        # Non-pass verdicts always carry an explanation.
        reason = "页面图片质量需人工确认"
    return status, score, reason
|
||||||
|
|
||||||
|
def _vlm_client(self):
|
||||||
|
if self.VlmClient is None:
|
||||||
|
from fastapi_modules.fastapi_leaudit.leaudit_bridge.client_factory import create_vlm_client
|
||||||
|
|
||||||
|
self.VlmClient = create_vlm_client()
|
||||||
|
return self.VlmClient
|
||||||
|
|
||||||
|
def _image_data_url(self, image_bytes: bytes) -> str:
|
||||||
|
import base64
|
||||||
|
|
||||||
|
encoded = base64.b64encode(image_bytes).decode()
|
||||||
|
return f"data:image/png;base64,{encoded}"
|
||||||
|
|
||||||
|
def _normalize_quality_score(self, raw_score: Any, status: str) -> float:
|
||||||
|
defaults = {"pass": 0.9, "review": 0.5, "reject": 0.2}
|
||||||
|
try:
|
||||||
|
score = float(raw_score)
|
||||||
|
except (TypeError, ValueError):
|
||||||
|
return defaults[status]
|
||||||
|
return max(0.0, min(1.0, score))
|
||||||
|
|
||||||
def _document_service(self):
|
def _document_service(self):
|
||||||
if self.DocumentService is None:
|
if self.DocumentService is None:
|
||||||
|
|||||||
@@ -54,6 +54,8 @@ class RagChatServiceImpl(IRagChatService):
|
|||||||
_task_done: dict[str, bool] = {}
|
_task_done: dict[str, bool] = {}
|
||||||
_task_locks: dict[str, asyncio.Lock] = {}
|
_task_locks: dict[str, asyncio.Lock] = {}
|
||||||
_title_tasks: dict[str, asyncio.Task] = {}
|
_title_tasks: dict[str, asyncio.Task] = {}
|
||||||
|
_chat_schema_checked = False
|
||||||
|
_chat_schema_lock = asyncio.Lock()
|
||||||
|
|
||||||
def __init__(self) -> None:
|
def __init__(self) -> None:
|
||||||
self.TenantResolver = TenantResolver()
|
self.TenantResolver = TenantResolver()
|
||||||
@@ -731,8 +733,34 @@ class RagChatServiceImpl(IRagChatService):
|
|||||||
)
|
)
|
||||||
|
|
||||||
async def _ensure_rag_chat_schema(self, session) -> None:
    """Make sure ``rag_chat_app.tenant_code`` and its index exist.

    The check runs at most once per process: a class-level flag is tested
    before and after taking a class-level ``asyncio.Lock``, so concurrent
    coroutines do not repeat the work. When the column is already present
    only a catalog query is issued and no DDL runs.

    Args:
        session: Async database session used for the catalog query and DDL.
    """
    cls = self.__class__
    if cls._chat_schema_checked:
        return

    async with cls._chat_schema_lock:
        # Another coroutine may have finished the check while we waited.
        if cls._chat_schema_checked:
            return

        exists = (
            await session.execute(
                text(
                    """
                    SELECT 1
                    FROM information_schema.columns
                    WHERE table_schema = current_schema()
                      AND table_name = 'rag_chat_app'
                      AND column_name = 'tenant_code'
                    """
                )
            )
        ).scalar_one_or_none()
        if exists:
            cls._chat_schema_checked = True
            return

        # Bound the wait for the table lock so a busy table fails fast
        # instead of stalling the request.
        await session.execute(text("SET LOCAL lock_timeout = '1000ms'"))
        # The asyncio lock only serialises coroutines in this process.
        # IF NOT EXISTS keeps the DDL safe when another worker process adds
        # the column between the catalog check above and this statement.
        await session.execute(text("ALTER TABLE rag_chat_app ADD COLUMN IF NOT EXISTS tenant_code VARCHAR(64) NULL"))
        await session.execute(text("CREATE INDEX IF NOT EXISTS idx_rag_chat_app_tenant_code ON rag_chat_app(tenant_code) WHERE deleted_at IS NULL"))
        cls._chat_schema_checked = True
|
||||||
|
|
||||||
@staticmethod
|
@staticmethod
|
||||||
def _tenant_context_is_global(tenant_context: dict[str, str | None]) -> bool:
|
def _tenant_context_is_global(tenant_context: dict[str, str | None]) -> bool:
|
||||||
|
|||||||
@@ -57,6 +57,8 @@ class RagDatasetServiceImpl(IRagDatasetService):
|
|||||||
ORDER BY dataset_id, is_default DESC, sort_order ASC, id ASC
|
ORDER BY dataset_id, is_default DESC, sort_order ASC, id ASC
|
||||||
) a ON a.dataset_id = d.id
|
) a ON a.dataset_id = d.id
|
||||||
"""
|
"""
|
||||||
|
_tenant_schema_checked = False
|
||||||
|
_tenant_schema_lock = asyncio.Lock()
|
||||||
|
|
||||||
def __init__(self) -> None:
|
def __init__(self) -> None:
|
||||||
self.TenantResolver = TenantResolver()
|
self.TenantResolver = TenantResolver()
|
||||||
@@ -1038,10 +1040,39 @@ class RagDatasetServiceImpl(IRagDatasetService):
|
|||||||
raise LeauditException(StatusCodeEnum.HTTP_403_FORBIDDEN, "当前用户只能管理本地区知识库")
|
raise LeauditException(StatusCodeEnum.HTTP_403_FORBIDDEN, "当前用户只能管理本地区知识库")
|
||||||
|
|
||||||
async def _ensure_rag_tenant_schema(self, session) -> None:
    """Make sure ``tenant_code`` exists on ``rag_dataset`` and ``rag_chat_app``.

    The check runs at most once per process: a class-level flag is tested
    before and after taking a class-level ``asyncio.Lock``. When both
    columns are already present only a catalog query is issued. Otherwise
    the missing columns are added and both partial indexes are created if
    absent.

    Args:
        session: Async database session used for the catalog query and DDL.
    """
    cls = self.__class__
    if cls._tenant_schema_checked:
        return

    async with cls._tenant_schema_lock:
        # Another coroutine may have finished the check while we waited.
        if cls._tenant_schema_checked:
            return

        columns = (
            await session.execute(
                text(
                    """
                    SELECT table_name
                    FROM information_schema.columns
                    WHERE table_schema = current_schema()
                      AND table_name IN ('rag_dataset', 'rag_chat_app')
                      AND column_name = 'tenant_code'
                    """
                )
            )
        ).scalars().all()
        existing = set(columns)
        if existing == {"rag_dataset", "rag_chat_app"}:
            cls._tenant_schema_checked = True
            return

        # Bound the wait for the table locks so a busy table fails fast
        # instead of stalling the request.
        await session.execute(text("SET LOCAL lock_timeout = '1000ms'"))
        # The asyncio lock only serialises coroutines in this process.
        # IF NOT EXISTS keeps the DDL safe when another worker process adds
        # a column between the catalog check above and these statements.
        if "rag_dataset" not in existing:
            await session.execute(text("ALTER TABLE rag_dataset ADD COLUMN IF NOT EXISTS tenant_code VARCHAR(64) NULL"))
        if "rag_chat_app" not in existing:
            await session.execute(text("ALTER TABLE rag_chat_app ADD COLUMN IF NOT EXISTS tenant_code VARCHAR(64) NULL"))
        await session.execute(text("CREATE INDEX IF NOT EXISTS idx_rag_dataset_tenant_code ON rag_dataset(tenant_code) WHERE deleted_at IS NULL"))
        await session.execute(text("CREATE INDEX IF NOT EXISTS idx_rag_chat_app_tenant_code ON rag_chat_app(tenant_code) WHERE deleted_at IS NULL"))
        cls._tenant_schema_checked = True
|
||||||
|
|
||||||
def _dataset_tenant_filter_sql(
|
def _dataset_tenant_filter_sql(
|
||||||
self,
|
self,
|
||||||
|
|||||||
+1
-1
Submodule legal-platform-frontend updated: f219811a6e...df04238bbb
@@ -9,13 +9,21 @@ source .venv/bin/activate
|
|||||||
eval "$(
|
eval "$(
|
||||||
.venv/bin/python - <<'PY'
|
.venv/bin/python - <<'PY'
|
||||||
from fastapi_admin.config import (
|
from fastapi_admin.config import (
|
||||||
|
LEAUDIT_PAGE_QUALITY_QUEUE_NORMAL,
|
||||||
|
LEAUDIT_PAGE_QUALITY_QUEUE_URGENT,
|
||||||
LEAUDIT_WORKER_CONCURRENCY,
|
LEAUDIT_WORKER_CONCURRENCY,
|
||||||
LEAUDIT_WORKER_QUEUE_NORMAL,
|
LEAUDIT_WORKER_QUEUE_NORMAL,
|
||||||
LEAUDIT_WORKER_QUEUE_URGENT,
|
LEAUDIT_WORKER_QUEUE_URGENT,
|
||||||
)
|
)
|
||||||
|
|
||||||
print(f'WORKER_CONCURRENCY={LEAUDIT_WORKER_CONCURRENCY}')
|
print(f'WORKER_CONCURRENCY={LEAUDIT_WORKER_CONCURRENCY}')
|
||||||
print(f'WORKER_QUEUES={LEAUDIT_WORKER_QUEUE_URGENT},{LEAUDIT_WORKER_QUEUE_NORMAL}')
|
print(
|
||||||
|
'WORKER_QUEUES='
|
||||||
|
f'{LEAUDIT_WORKER_QUEUE_URGENT},'
|
||||||
|
f'{LEAUDIT_WORKER_QUEUE_NORMAL},'
|
||||||
|
f'{LEAUDIT_PAGE_QUALITY_QUEUE_URGENT},'
|
||||||
|
f'{LEAUDIT_PAGE_QUALITY_QUEUE_NORMAL}'
|
||||||
|
)
|
||||||
PY
|
PY
|
||||||
)"
|
)"
|
||||||
|
|
||||||
|
|||||||
@@ -0,0 +1,68 @@
|
|||||||
|
import asyncio
|
||||||
|
from unittest.mock import patch
|
||||||
|
|
||||||
|
from fastapi_modules.fastapi_leaudit.services.impl.contractTemplateServiceImpl import ContractTemplateServiceImpl
|
||||||
|
|
||||||
|
|
||||||
|
class _EmptyMappingResult:
|
||||||
|
def mappings(self):
|
||||||
|
return self
|
||||||
|
|
||||||
|
def all(self):
|
||||||
|
return []
|
||||||
|
|
||||||
|
|
||||||
|
class _FakeSession:
    """Session double that remembers the most recent ``execute`` call."""

    def __init__(self):
        # Both stay None until execute() is awaited.
        self.executed_sql = self.executed_params = None

    async def execute(self, sql, params=None):
        """Record the statement and its parameters; return an empty result."""
        self.executed_sql, self.executed_params = sql, params
        return _EmptyMappingResult()
|
||||||
|
|
||||||
|
|
||||||
|
class _FakeSessionContext:
|
||||||
|
def __init__(self, session):
|
||||||
|
self.session = session
|
||||||
|
|
||||||
|
async def __aenter__(self):
|
||||||
|
return self.session
|
||||||
|
|
||||||
|
async def __aexit__(self, exc_type, exc, tb):
|
||||||
|
return None
|
||||||
|
|
||||||
|
|
||||||
|
def test_contract_template_search_category_stats_binds_expanding_scope_params():
    """Category stats query binds the scope lists as expanding parameters.

    The service runs against a recording session as a non-global area
    admin; the test then inspects the statement and parameters it sent.
    """
    service = ContractTemplateServiceImpl()
    recorder = _FakeSession()

    async def _skip_schema(session):
        return None

    async def _meizhou_admin(current_user_id, session):
        return dict(
            id=current_user_id,
            area="梅州",
            tenant_code="MZ",
            tenant_name="梅州",
            tenant_scope_value="梅州",
            is_global=False,
            can_manage=True,
            is_area_admin=True,
        )

    # Replace the database-touching helpers on this instance only.
    service._ensureContractTemplateSchema = _skip_schema
    service._getCurrentUserContext = _meizhou_admin

    session_factory = "fastapi_modules.fastapi_leaudit.services.impl.contractTemplateServiceImpl.GetAsyncSession"
    with patch(session_factory, return_value=_FakeSessionContext(recorder)):
        asyncio.run(service._load_search_category_stats("买卖", None, None, 5))

    bound = recorder.executed_sql._bindparams
    sent = recorder.executed_params
    assert bound["visible_tenant_codes"].expanding is True
    assert bound["visible_regions"].expanding is True
    assert sent["visible_tenant_codes"] == ["PROVINCIAL", "PUBLIC", "MZ"]
    assert sent["visible_regions"] == ["省级", "公共", "梅州"]
|
||||||
@@ -0,0 +1,58 @@
|
|||||||
|
import pytest
|
||||||
|
|
||||||
|
from fastapi_modules.fastapi_leaudit.services.impl.pageQualityServiceImpl import PageQualityServiceImpl
|
||||||
|
|
||||||
|
|
||||||
|
class _FakeVlmClient:
|
||||||
|
def __init__(self, response):
|
||||||
|
self.response = response
|
||||||
|
self.prompts = []
|
||||||
|
|
||||||
|
async def extract_multifield(self, *, prompt, images_data_urls, max_tokens=800):
|
||||||
|
self.prompts.append((prompt, images_data_urls, max_tokens))
|
||||||
|
if isinstance(self.response, Exception):
|
||||||
|
raise self.response
|
||||||
|
return self.response
|
||||||
|
|
||||||
|
|
||||||
|
@pytest.mark.asyncio
async def test_vlm_page_quality_reject_result_is_used():
    """A well-formed reject verdict from the VLM is returned as-is."""
    service = PageQualityServiceImpl()
    vlm_answer = {
        "status": "reject",
        "score": 0.18,
        "reason": "页面文字严重模糊,无法稳定辨认关键内容",
    }
    service.VlmClient = _FakeVlmClient(vlm_answer)

    verdict = await service._classify_page_image_by_vlm(b"image-bytes")
    status, score, reason = verdict

    assert status == "reject"
    assert score == 0.18
    assert "严重模糊" in reason
    # The quality prompt (with its JSON-only instruction) must reach the client.
    sent_prompt = service.VlmClient.prompts[0][0]
    assert "只输出 JSON" in sent_prompt
|
||||||
|
|
||||||
|
|
||||||
|
@pytest.mark.asyncio
async def test_vlm_page_quality_invalid_result_falls_back_to_review_not_pass():
    """An unrecognised status from the VLM is downgraded to review."""
    service = PageQualityServiceImpl()
    unusable_answer = {"status": "unknown", "reason": ""}
    service.VlmClient = _FakeVlmClient(unusable_answer)

    verdict = await service._classify_page_image_by_vlm(b"image-bytes")
    status, score, reason = verdict

    assert status == "review"
    assert score == 0.5
    assert "VLM返回结果不可用" in reason
|
||||||
|
|
||||||
|
|
||||||
|
@pytest.mark.asyncio
async def test_vlm_page_quality_error_falls_back_to_review_not_pass():
    """A VLM call that raises is downgraded to review instead of passing."""
    service = PageQualityServiceImpl()
    failure = RuntimeError("vlm down")
    service.VlmClient = _FakeVlmClient(failure)

    verdict = await service._classify_page_image_by_vlm(b"image-bytes")
    status, score, reason = verdict

    assert status == "review"
    assert score == 0.5
    assert "VLM图片质量检测失败" in reason
|
||||||
Reference in New Issue
Block a user