fix: harden cross-review task state handling
This commit is contained in:
@@ -359,7 +359,9 @@ class StorageAdapter:
|
||||
"level": level,
|
||||
"error_code": error_code,
|
||||
"message": message,
|
||||
"detail_json": detail_json,
|
||||
"detail_json": json.dumps(detail_json, ensure_ascii=False)
|
||||
if detail_json is not None
|
||||
else None,
|
||||
},
|
||||
)
|
||||
await session.commit()
|
||||
|
||||
@@ -17,6 +17,7 @@ from sqlalchemy import select
|
||||
from fastapi_admin.celery_app import celery_app
|
||||
from fastapi_admin.config import (
|
||||
LEAUDIT_RULES_DIR,
|
||||
LEAUDIT_STUCK_TIMEOUT_MINUTES,
|
||||
LEAUDIT_WORKER_QUEUE_NORMAL,
|
||||
LEAUDIT_WORKER_QUEUE_URGENT,
|
||||
)
|
||||
@@ -256,6 +257,22 @@ def leaudit_process_document_task(self, run_id: int, rules_path: str | None = No
|
||||
)
|
||||
|
||||
|
||||
@celery_app.task(bind=True, name="leaudit.scan_stuck_documents")
def leaudit_scan_stuck_documents_task(self) -> dict[str, Any]:
    """Periodically scan for review documents that stopped progressing and fail them.

    Runs ``_scan_and_fail_stuck_documents`` to completion on a dedicated event
    loop created for this invocation. The timeout is taken from
    ``LEAUDIT_STUCK_TIMEOUT_MINUTES`` and clamped to at least one minute.

    Returns:
        The summary dict produced by ``_scan_and_fail_stuck_documents``.
    """
    event_loop = asyncio.new_event_loop()
    asyncio.set_event_loop(event_loop)
    try:
        # Clamp inside the try so the loop is closed even if the config value
        # cannot be converted to an int.
        stuck_timeout = max(1, int(LEAUDIT_STUCK_TIMEOUT_MINUTES))
        scan = _scan_and_fail_stuck_documents(timeout_minutes=stuck_timeout)
        return event_loop.run_until_complete(scan)
    finally:
        event_loop.close()
|
||||
|
||||
|
||||
# type_id → rules directory mapping (only fixed-mapping types)
|
||||
# 行政许可 (type_id=2) has 9 sub-types, NOT mapped here —
|
||||
# must come from document metadata (rules_file_path) or content classification.
|
||||
@@ -633,3 +650,146 @@ def _queue_label(queue_name: str | None) -> str:
|
||||
if queue_name == LEAUDIT_WORKER_QUEUE_URGENT:
|
||||
return "urgent"
|
||||
return "normal"
|
||||
|
||||
|
||||
async def _scan_and_fail_stuck_documents(*, timeout_minutes: int) -> dict[str, Any]:
    """Find documents whose current run has had no update for too long and fail them.

    The candidate query runs under a PostgreSQL session-level advisory lock;
    when the lock is already held the scan is skipped. Each matched document
    is then marked ``failed`` and its run is failed with a ``stuck_timeout``
    detail payload. A failure while handling one document is logged and does
    not stop the remaining documents from being processed.

    Args:
        timeout_minutes: Idle threshold, in minutes, bound into the query as
            ``:timeout_minutes``.

    Returns:
        A summary dict. When the lock could not be acquired:
        ``status="skipped"``, ``reason="lock_not_acquired"``,
        ``timeout_minutes``, ``matched=0``, ``failed=0``. Otherwise:
        ``status="success"``, ``timeout_minutes``, ``matched`` (rows
        selected), ``failed`` (rows successfully auto-failed) and ``items``
        (one entry per auto-failed document).

    Raises:
        Exception: Whatever the database layer raises while acquiring the
            lock, running the scan query, or releasing the lock.
    """
    from fastapi_common.fastapi_common_sqlalchemy.database import GetAsyncSession
    from sqlalchemy import text as sa_text

    lock_key = 20260512
    timed_out_rows: list[dict[str, Any]] = []

    async with GetAsyncSession() as session:
        lock_row = (
            await session.execute(
                sa_text("SELECT pg_try_advisory_lock(:lock_key)"),
                {"lock_key": lock_key},
            )
        ).fetchone()
        has_lock = bool(lock_row[0]) if lock_row else False
        if not has_lock:
            log.info("stuck-scan skipped: advisory lock already held")
            return {
                "status": "skipped",
                "reason": "lock_not_acquired",
                "timeout_minutes": timeout_minutes,
                "matched": 0,
                "failed": 0,
            }

        try:
            # Run the scan inside a SAVEPOINT. If the query fails, only the
            # savepoint is rolled back, so the outer transaction (and the
            # connection that owns the session-level advisory lock) stays
            # usable for the unlock below. Without this, PostgreSQL rejects
            # the unlock in the aborted transaction, masking the original
            # error and leaving the lock held on the connection.
            # NOTE(review): assumes GetAsyncSession yields a SQLAlchemy
            # AsyncSession (begin_nested support) - confirm.
            async with session.begin_nested():
                rows = (
                    await session.execute(
                        sa_text(
                            """
                            SELECT
                            d.id AS document_id,
                            d.processing_status,
                            d.current_run_id,
                            d.updated_at AS document_updated_at,
                            d.region,
                            d.normalized_name,
                            ar.id AS run_id,
                            ar.status AS run_status,
                            ar.phase,
                            ar.task_id,
                            ar.updated_at AS run_updated_at,
                            EXTRACT(EPOCH FROM (
                            NOW() - CASE
                            WHEN LOWER(COALESCE(ar.status, '')) IN ('pending', 'queued', 'running', 'retrying')
                            THEN COALESCE(ar.updated_at, d.updated_at)
                            ELSE COALESCE(d.updated_at, ar.updated_at)
                            END
                            ))::bigint AS idle_seconds
                            FROM leaudit_documents d
                            JOIN leaudit_audit_runs ar
                            ON ar.id = d.current_run_id
                            WHERE d.deleted_at IS NULL
                            AND d.is_latest_version = true
                            AND (
                            LOWER(COALESCE(ar.status, '')) IN ('pending', 'queued', 'running', 'retrying')
                            OR LOWER(COALESCE(d.processing_status, '')) IN ('waiting', 'queued', 'running', 'processing')
                            )
                            AND CASE
                            WHEN LOWER(COALESCE(ar.status, '')) IN ('pending', 'queued', 'running', 'retrying')
                            THEN COALESCE(ar.updated_at, d.updated_at)
                            ELSE COALESCE(d.updated_at, ar.updated_at)
                            END < NOW() - make_interval(mins => :timeout_minutes)
                            ORDER BY CASE
                            WHEN LOWER(COALESCE(ar.status, '')) IN ('pending', 'queued', 'running', 'retrying')
                            THEN COALESCE(ar.updated_at, d.updated_at)
                            ELSE COALESCE(d.updated_at, ar.updated_at)
                            END ASC,
                            d.id ASC
                            """
                        ),
                        {"timeout_minutes": timeout_minutes},
                    )
                ).mappings().all()

            # Detach the rows from the result so they outlive the session.
            timed_out_rows = [dict(row) for row in rows]
        finally:
            # Session-level advisory locks are not released by COMMIT or
            # ROLLBACK, so the unlock must always be issued explicitly.
            await session.execute(
                sa_text("SELECT pg_advisory_unlock(:lock_key)"),
                {"lock_key": lock_key},
            )
            await session.commit()

    # NOTE(review): the advisory lock only covers the SELECT above; the
    # per-document updates below run after it has been released.
    storage = StorageAdapter()
    failed_items: list[dict[str, Any]] = []

    for row in timed_out_rows:
        document_id = int(row["document_id"])
        run_id = int(row["run_id"])
        idle_seconds = int(row.get("idle_seconds") or 0)
        run_phase = row.get("phase")
        run_status = row.get("run_status")
        processing_status = row.get("processing_status")
        file_name = row.get("normalized_name") or f"document-{document_id}"

        message = (
            "文档处理长时间无进展,已自动终止:"
            f"document_id={document_id}, run_id={run_id}, "
            f"processing_status={processing_status}, run_status={run_status}, "
            f"phase={run_phase}, idle_seconds={idle_seconds}"
        )

        try:
            await _update_status_safe(document_id, "failed")
            await storage.fail_run(
                document_id,
                run_id=run_id,
                # Runs with no recorded phase are reported under "dispatch".
                phase=run_phase or "dispatch",
                message=message,
                detail_json={
                    "reason": "stuck_timeout",
                    "timeoutMinutes": timeout_minutes,
                    "idleSeconds": idle_seconds,
                    "taskId": row.get("task_id"),
                    "runStatus": run_status,
                    "processingStatus": processing_status,
                    "phase": run_phase,
                    "fileName": file_name,
                    "region": row.get("region"),
                },
            )
            failed_items.append(
                {
                    "document_id": document_id,
                    "run_id": run_id,
                    "phase": run_phase,
                    "idle_seconds": idle_seconds,
                }
            )
            log.warning("stuck document auto-failed: %s", message)
        except Exception:
            # Best effort per document: log and continue with the next row.
            log.exception("stuck document auto-fail failed: document_id=%s run_id=%s", document_id, run_id)

    return {
        "status": "success",
        "timeout_minutes": timeout_minutes,
        "matched": len(timed_out_rows),
        "failed": len(failed_items),
        "items": failed_items,
    }
|
||||
|
||||
Reference in New Issue
Block a user