diff --git a/app/api/evaluation_points/reviews.ts b/app/api/evaluation_points/reviews.ts index bbe9e9d..c95f1a2 100644 --- a/app/api/evaluation_points/reviews.ts +++ b/app/api/evaluation_points/reviews.ts @@ -461,22 +461,22 @@ export async function getReviewPoints(fileId: string, request: Request) { evaluatedPointResultsLog: evaluatedPointResultsLog || {} // evaluatedPointResultsLog: { // rules:[ - // { - // "id": "0", - // "type": "consistency", - // "res": true, - // "config": { - // "logic": "all", - // "pairs": [ - // { - // "sourceField": {"证据先行登记保存批准书-负责人意见并签名-时间": {page: 1,value: ''}}, - // "targetField": {"证据先行登记保存批准书-负责人意见并签名-签名": {page: 2,value: '有无判断类型'}}, - // "compareMethod": "exact", - // "res": true - // } - // ] - // } - // }, + // { + // "id": "0", + // "type": "consistency", + // "res": true, + // "config": { + // "logic": "all", + // "pairs": [ + // { + // "sourceField": {"证据先行登记保存批准书-负责人意见并签名-时间": {page: 1,value: ''}}, + // "targetField": {"证据先行登记保存批准书-负责人意见并签名-签名": {page: 2,value: '有无判断类型'}}, + // "compareMethod": "exact", + // "res": true + // } + // ] + // } + // }, // { // "id": "1", // "type": "consistency", diff --git a/app/routes/monaco-demo.tsx b/app/routes/monaco-demo.tsx index 3dc6ea8..b3228cc 100644 --- a/app/routes/monaco-demo.tsx +++ b/app/routes/monaco-demo.tsx @@ -13,6 +13,7 @@ import { useState, useRef, useEffect } from "react"; import { DiffEditor } from "@monaco-editor/react"; import type { editor } from "monaco-editor"; import { pdfjs } from 'react-pdf'; +import mammoth from 'mammoth'; import { toastService } from '~/components/ui/Toast'; // 设置 PDF.js worker(与 pdf-demo.tsx 相同) @@ -25,14 +26,26 @@ export const meta: MetaFunction = () => { ]; }; +// 文档类型枚举 +type DocumentType = 'pdf' | 'docx' | 'unknown'; + // PDF 类型枚举 type PdfType = 'text' | 'scanned' | 'unknown'; -// PDF 信息接口 +// PDF 信息接口(内部使用) interface PdfInfo { type: PdfType; numPages: number; textLength: number; + confidence: number; +} + +// 文档信息接口 +interface DocumentInfo { + fileType: DocumentType; + pdfType?: PdfType; // 只有 PDF 才有 + numPages?: number; // PDF 页数 + textLength: number; confidence: number; // 文本提取置信度 (0-1) } @@ -120,15 +133,23 @@ export default function MonacoDemoPage() { const [diffCount, setDiffCount] = useState(0); const [currentDiff, setCurrentDiff] = useState(0); - // PDF相关状态 - const [pdf1Url, setPdf1Url] = useState(''); - const [pdf2Url, setPdf2Url] = useState(''); - const [pdf1Info, setPdf1Info] = useState(null); - const [pdf2Info, setPdf2Info] = useState(null); - const [isLoadingPdf1, setIsLoadingPdf1] = useState(false); - const [isLoadingPdf2, setIsLoadingPdf2] = useState(false); + // 文档相关状态 + const [doc1Url, setDoc1Url] = useState(''); + const [doc2Url, setDoc2Url] = useState(''); + const [doc1Info, setDoc1Info] = useState(null); + const [doc2Info, setDoc2Info] = useState(null); + const [isLoadingDoc1, setIsLoadingDoc1] = useState(false); + const [isLoadingDoc2, setIsLoadingDoc2] = useState(false); const [useExample, setUseExample] = useState(true); + // 检测文件类型(根据文件路径) + const detectFileType = (filePath: string): DocumentType => { + const lowerPath = filePath.toLowerCase(); + if (lowerPath.endsWith('.pdf')) return 'pdf'; + if (lowerPath.endsWith('.docx') || lowerPath.endsWith('.doc')) return 'docx'; + return 'unknown'; + }; + // PDF类型检测函数 const detectPdfType = async (pdfUrl: string): Promise => { const loadingTask = pdfjs.getDocument(pdfUrl); @@ -189,32 +210,81 @@ export default function MonacoDemoPage() { return fullText; }; - // 加载PDF并提取文本 - const loadPdfAndExtractText = async (pdfUrl: string, setPdfInfo: (info: PdfInfo | null) => void, setLoading: (loading: boolean) => void, setTextContent: (text: string) => void) => { + // Word文档文本提取函数 + const extractTextFromWord = async (docUrl: string): Promise => { + // 通过 fetch 获取文件 + const response = await fetch(docUrl); + if (!response.ok) { + throw new Error(`无法加载文档: ${response.statusText}`); + } + + // 获取 ArrayBuffer + const arrayBuffer = await response.arrayBuffer(); + + // 使用 mammoth 提取纯文本 + const result = await mammoth.extractRawText({ arrayBuffer }); + + return result.value; + }; + + // 加载文档并提取文本(支持 PDF 和 Word) + const loadDocumentAndExtractText = async ( + docUrl: string, + filePath: string, + setDocInfo: (info: DocumentInfo | null) => void, + setLoading: (loading: boolean) => void, + setTextContent: (text: string) => void + ) => { try { setLoading(true); - // 1. 检测PDF类型 - const pdfInfo = await detectPdfType(pdfUrl); - setPdfInfo(pdfInfo); + // 1. 检测文件类型 + const fileType = detectFileType(filePath); - // 2. 提取文本 - if (pdfInfo.type === 'text') { - const text = await extractTextFromPdf(pdfUrl); + if (fileType === 'pdf') { + // PDF 处理 + const pdfInfo = await detectPdfType(docUrl); + const text = await extractTextFromPdf(docUrl); + + const docInfo: DocumentInfo = { + fileType: 'pdf', + pdfType: pdfInfo.type, + numPages: pdfInfo.numPages, + textLength: pdfInfo.textLength, + confidence: pdfInfo.confidence + }; + + setDocInfo(docInfo); setTextContent(text); - toastService.success(`PDF加载成功!共 ${pdfInfo.numPages} 页,提取了 ${pdfInfo.textLength} 个字符`); - } else if (pdfInfo.type === 'scanned') { - toastService.warning('检测到扫描版PDF,文本提取质量可能较低'); - const text = await extractTextFromPdf(pdfUrl); + + if (pdfInfo.type === 'text') { + toastService.success(`PDF加载成功!共 ${pdfInfo.numPages} 页,提取了 ${pdfInfo.textLength} 个字符`); + } else if (pdfInfo.type === 'scanned') { + toastService.warning('检测到扫描版PDF,文本提取质量可能较低'); + } else { + toastService.error('无法识别PDF类型,可能是图片PDF'); + } + } else if (fileType === 'docx') { + // Word 处理 + const text = await extractTextFromWord(docUrl); + + const docInfo: DocumentInfo = { + fileType: 'docx', + textLength: text.length, + confidence: 1.0 // Word 文档文本提取置信度为 100% + }; + + setDocInfo(docInfo); setTextContent(text); + toastService.success(`Word文档加载成功!提取了 ${text.length} 个字符`); } else { - toastService.error('无法识别PDF类型,可能是图片PDF'); + toastService.error('不支持的文件类型'); setTextContent(''); } } catch (error) { - console.error('PDF加载失败:', error); - toastService.error('PDF加载失败,请检查文件路径'); - setPdfInfo(null); + console.error('文档加载失败:', error); + toastService.error(`文档加载失败: ${error instanceof Error ? error.message : '未知错误'}`); + setDocInfo(null); setTextContent(''); } finally { setLoading(false); @@ -281,8 +351,8 @@ export default function MonacoDemoPage() { setModifiedText(CONTRACT_B); setCurrentDiff(0); setUseExample(true); - setPdf1Info(null); - setPdf2Info(null); + setDoc1Info(null); + setDoc2Info(null); // 重新计算差异数量 setTimeout(() => { @@ -295,34 +365,50 @@ export default function MonacoDemoPage() { }, 100); }; - // 从URL参数加载PDF - const loadPdfsFromUrl = () => { + // 构建文件访问 URL + const buildFileUrl = (filePath: string): string => { + // 如果路径以 public/ 开头或者以已知的 public 子目录开头(如 testWork/) + // 则直接使用静态资源路径 + if (filePath.startsWith('public/')) { + // 去掉 public/ 前缀,直接访问静态资源 + return '/' + filePath.substring(7); + } else if (filePath.startsWith('testWork/') || filePath.startsWith('testPDF/')) { + // testWork 和 testPDF 目录在 public 下,直接作为静态资源访问 + return '/' + filePath; + } else { + // 其他路径通过 api/pdf-proxy 代理访问(从 MinIO 获取) + return `/api/pdf-proxy?path=${encodeURIComponent(filePath)}`; + } + }; + + // 从URL参数加载文档(支持 PDF 和 Word) + const loadDocumentsFromUrl = () => { if (typeof window === 'undefined') return; const searchParams = new URLSearchParams(window.location.search); - const pdf1Path = searchParams.get('pdf1'); - const pdf2Path = searchParams.get('pdf2'); + const doc1Path = searchParams.get('doc1') || searchParams.get('pdf1'); // 兼容旧参数名 + const doc2Path = searchParams.get('doc2') || searchParams.get('pdf2'); // 兼容旧参数名 - if (pdf1Path || pdf2Path) { + if (doc1Path || doc2Path) { setUseExample(false); - if (pdf1Path) { - const fullUrl = `/api/pdf-proxy?path=${encodeURIComponent(pdf1Path)}`; - setPdf1Url(fullUrl); - loadPdfAndExtractText(fullUrl, setPdf1Info, setIsLoadingPdf1, setOriginalText); + if (doc1Path) { + const fullUrl = buildFileUrl(doc1Path); + setDoc1Url(fullUrl); + loadDocumentAndExtractText(fullUrl, doc1Path, setDoc1Info, setIsLoadingDoc1, setOriginalText); } - if (pdf2Path) { - const fullUrl = `/api/pdf-proxy?path=${encodeURIComponent(pdf2Path)}`; - setPdf2Url(fullUrl); - loadPdfAndExtractText(fullUrl, setPdf2Info, setIsLoadingPdf2, setModifiedText); + if (doc2Path) { + const fullUrl = buildFileUrl(doc2Path); + setDoc2Url(fullUrl); + loadDocumentAndExtractText(fullUrl, doc2Path, setDoc2Info, setIsLoadingDoc2, setModifiedText); } } }; // 组件挂载时读取URL参数 useEffect(() => { - loadPdfsFromUrl(); + loadDocumentsFromUrl(); // eslint-disable-next-line react-hooks/exhaustive-deps }, []); @@ -452,8 +538,8 @@ export default function MonacoDemoPage() { - {/* PDF加载信息 */} - {!useExample && (pdf1Info || pdf2Info || isLoadingPdf1 || isLoadingPdf2) && ( + {/* 文档加载信息 */} + {!useExample && (doc1Info || doc2Info || isLoadingDoc1 || isLoadingDoc2) && (
- +
- PDF文档信息: + 文档信息:
- {/* PDF 1 信息 */} + {/* 文档 1 信息 */}
📄 文档1(左侧/原始)
- {isLoadingPdf1 ? ( + {isLoadingDoc1 ? (
⏳ 加载中...
- ) : pdf1Info ? ( + ) : doc1Info ? (
类型: - {pdf1Info.type === 'text' ? '✅ 文本PDF' : pdf1Info.type === 'scanned' ? '⚠️ 扫描PDF' : '❌ 未知类型'} + {doc1Info.fileType === 'pdf' ? '📕 PDF文档' : doc1Info.fileType === 'docx' ? '📘 Word文档' : '❌ 未知类型'}
-
页数: {pdf1Info.numPages} 页
-
字符数: {pdf1Info.textLength} 个
-
置信度: {(pdf1Info.confidence * 100).toFixed(0)}%
+ {doc1Info.fileType === 'pdf' && doc1Info.numPages && ( +
页数: {doc1Info.numPages} 页
+ )} + {doc1Info.fileType === 'pdf' && doc1Info.pdfType && ( +
PDF类型: {doc1Info.pdfType === 'text' ? '✅ 文本' : doc1Info.pdfType === 'scanned' ? '⚠️ 扫描' : '❌ 未知'}
+ )} +
字符数: {doc1Info.textLength} 个
+
置信度: {(doc1Info.confidence * 100).toFixed(0)}%
) : (
未加载
)}
- {/* PDF 2 信息 */} + {/* 文档 2 信息 */}
📄 文档2(右侧/修改)
- {isLoadingPdf2 ? ( + {isLoadingDoc2 ? (
⏳ 加载中...
- ) : pdf2Info ? ( + ) : doc2Info ? (
类型: - {pdf2Info.type === 'text' ? '✅ 文本PDF' : pdf2Info.type === 'scanned' ? '⚠️ 扫描PDF' : '❌ 未知类型'} + {doc2Info.fileType === 'pdf' ? '📕 PDF文档' : doc2Info.fileType === 'docx' ? '📘 Word文档' : '❌ 未知类型'}
-
页数: {pdf2Info.numPages} 页
-
字符数: {pdf2Info.textLength} 个
-
置信度: {(pdf2Info.confidence * 100).toFixed(0)}%
+ {doc2Info.fileType === 'pdf' && doc2Info.numPages && ( +
页数: {doc2Info.numPages} 页
+ )} + {doc2Info.fileType === 'pdf' && doc2Info.pdfType && ( +
PDF类型: {doc2Info.pdfType === 'text' ? '✅ 文本' : doc2Info.pdfType === 'scanned' ? '⚠️ 扫描' : '❌ 未知'}
+ )} +
字符数: {doc2Info.textLength} 个
+
置信度: {(doc2Info.confidence * 100).toFixed(0)}%
) : (
未加载
@@ -537,7 +633,7 @@ export default function MonacoDemoPage() {
💡 使用提示:
- 您可以通过URL参数加载PDF文档进行对比: + 您可以通过URL参数加载文档进行对比(支持 PDF 和 Word): - /monaco-demo?pdf1=路径1&pdf2=路径2 + /monaco-demo?doc1=路径1&doc2=路径2
- 示例: /monaco-demo?pdf1=documents/contract_v1.pdf&pdf2=documents/contract_v2.pdf +
PDF示例: /monaco-demo?doc1=documents/contract_v1.pdf&doc2=documents/contract_v2.pdf
+
Word示例: /monaco-demo?doc1=testWork/(最终版)智慧法务平台建设采购项目合同(1).docx&doc2=testWork/(最终版)智慧法务平台建设采购项目合同(2).docx
@@ -591,8 +688,8 @@ export default function MonacoDemoPage() { }} /> - {/* PDF加载中的遮罩层 */} - {(isLoadingPdf1 || isLoadingPdf2) && ( + {/* 文档加载中的遮罩层 */} + {(isLoadingDoc1 || isLoadingDoc2) && (
- 正在加载PDF文档并提取文本... + 正在加载文档并提取文本...
- {isLoadingPdf1 &&
📄 加载文档1
} - {isLoadingPdf2 &&
📄 加载文档2
} + {isLoadingDoc1 &&
📄 加载文档1
} + {isLoadingDoc2 &&
📄 加载文档2
}
)} diff --git a/app/routes/pdf-demo.tsx b/app/routes/pdf-demo.tsx index 51a8bc8..22a9540 100644 --- a/app/routes/pdf-demo.tsx +++ b/app/routes/pdf-demo.tsx @@ -39,7 +39,7 @@ interface HighlightArea { // 基于坐标的字符数据 interface CharacterBox { - box: [number, number][]; // 4个点:左上、右上、右下、左下 + box: [number, number][]; char: string; page: number; } @@ -70,7 +70,7 @@ export default function PdfDemo() { // PDF文件URL(使用示例PDF) // const [pdfUrl] = useState('/testPDF/sample.pdf'); // 使用包含真实文本层的PDF // const [pdfUrl] = useState('/api/pdf-proxy?path=documents/mz/行政处罚决定书/2025/11月13日/第71号--未在当地烟草专卖批发企业进货_02时58分36秒/第71号--未在当地烟草专卖批发企业进货.pdf'); // 使用项目中的示例PDF - const [pdfUrl] = useState('/api/pdf-proxy?path=documents/mz/行政处罚决定书/2025/11月22日/第35号--无烟草专卖品准运证运输烟草专卖品_15时15分24秒/第35号--无烟草专卖品准运证运输烟草专卖品.pdf') + const [pdfUrl] = useState('/api/pdf-proxy?path=documents/mz/测试示范类型/2025/11月24日/第37号--涉嫌生产、销售伪劣产品罪_12时19分10秒/第37号--涉嫌生产、销售伪劣产品罪.pdf') // PDF状态 const [numPages, setNumPages] = useState(null); @@ -227,87 +227,28 @@ export default function PdfDemo() { // 获取Page容器(SVG实际渲染的坐标空间) const pageContainer = canvas?.closest('.react-pdf__Page') as HTMLElement; - if (canvas && pageContainer && pdfOriginalWidthPt) { - // Canvas 内部绘制尺寸(考虑了 devicePixelRatio) - const canvasInternalWidth = canvas.width; - const canvasInternalHeight = canvas.height; - + if (canvas && pdfOriginalWidthPt) { // Canvas 显示尺寸(浏览器中实际占用的像素) const canvasDisplayWidth = canvas.offsetWidth; const canvasDisplayHeight = canvas.offsetHeight; - // Page容器尺寸(SVG高亮渲染的实际坐标空间) - const pageContainerWidth = pageContainer.offsetWidth; - const pageContainerHeight = pageContainer.offsetHeight; + // 计算坐标缩放比例:Canvas显示尺寸 / PDF原始尺寸 + const autoScale = canvasDisplayWidth / pdfOriginalWidthPt; - // 尝试多种计算方式 - const scale1_canvasDisplay = canvasDisplayWidth / pdfOriginalWidthPt; - const scale2_canvasInternal = canvasInternalWidth / pdfOriginalWidthPt; - const scale3_pageContainer = pageContainerWidth / pdfOriginalWidthPt; - - // 尝试反向计算:如果OCR尺寸比渲染尺寸大(需要缩小) - const scale4_inverseCanvasInternal = canvasDisplayWidth / canvasInternalWidth; - const scale5_inversePage = canvasDisplayWidth / pageContainerWidth; - - // 计算如果要达到 0.83 的缩放比例,OCR原始尺寸应该是多少 - const expectedOcrWidth = canvasDisplayWidth / 0.83; - - console.log('📏 尺寸信息汇总:'); - console.log(' 1️⃣ PDF原始尺寸 (page.view):', pdfOriginalWidthPt, 'x', pdfOriginalHeightPt, 'pt'); - console.log(' 2️⃣ Page容器尺寸:', pageContainerWidth, 'x', pageContainerHeight, 'px'); - console.log(' 3️⃣ Canvas显示尺寸:', canvasDisplayWidth, 'x', canvasDisplayHeight, 'px'); - console.log(' 4️⃣ Canvas内部尺寸:', canvasInternalWidth, 'x', canvasInternalHeight, 'px'); - console.log(' 5️⃣ 用户缩放 (scale):', scale); - console.log(' 6️⃣ devicePixelRatio:', window.devicePixelRatio || 1); - console.log(''); - console.log('🎯 各种计算方式:'); - console.log(' 方案1️⃣: Canvas显示 / PDF原始 =', scale1_canvasDisplay.toFixed(3), 'x'); - console.log(' 方案2️⃣: Canvas内部 / PDF原始 =', scale2_canvasInternal.toFixed(3), 'x'); - console.log(' 方案3️⃣: Page容器 / PDF原始 =', scale3_pageContainer.toFixed(3), 'x'); - console.log(' 方案4️⃣: Canvas显示 / Canvas内部 =', scale4_inverseCanvasInternal.toFixed(3), 'x ⬅ 可能是这个!'); - console.log(' 方案5️⃣: Canvas显示 / Page容器 =', scale5_inversePage.toFixed(3), 'x'); - console.log(''); - console.log('🔍 目标值分析:'); - console.log(' - 手动校准的正确值: 0.83'); - console.log(' - 反推OCR图像尺寸:', expectedOcrWidth.toFixed(0), 'x', (canvasDisplayHeight / 0.83).toFixed(0), 'px'); - console.log(' - 比较: ', expectedOcrWidth.toFixed(0), 'vs Canvas内部', canvasInternalWidth); - - // 使用最接近0.83的方案 - let autoScale = scale1_canvasDisplay; - let scaleMethod = '方案1 (Canvas显示/PDF原始)'; - - // 检查哪个方案最接近0.83 - const diff1 = Math.abs(scale1_canvasDisplay - 0.83); - const diff2 = Math.abs(scale2_canvasInternal - 0.83); - const diff3 = Math.abs(scale3_pageContainer - 0.83); - const diff4 = Math.abs(scale4_inverseCanvasInternal - 0.83); - const diff5 = Math.abs(scale5_inversePage - 0.83); - - const minDiff = Math.min(diff1, diff2, diff3, diff4, diff5); - - if (minDiff === diff4) { - autoScale = scale4_inverseCanvasInternal; - scaleMethod = '方案4 (Canvas显示/Canvas内部)'; - } else if (minDiff === diff5) { - autoScale = scale5_inversePage; - scaleMethod = '方案5 (Canvas显示/Page容器)'; - } else if (minDiff === diff2) { - autoScale = scale2_canvasInternal; - scaleMethod = '方案2 (Canvas内部/PDF原始)'; - } else if (minDiff === diff3) { - autoScale = scale3_pageContainer; - scaleMethod = '方案3 (Page容器/PDF原始)'; - } - - console.log(''); - console.log('✅ 自动选择:', scaleMethod, '=', autoScale.toFixed(3), 'x (最接近0.83)'); + console.log('📏 PDF尺寸信息:'); + console.log(' - PDF原始尺寸 (page.view):', pdfOriginalWidthPt, 'x', pdfOriginalHeightPt, 'pt'); + console.log(' - Canvas显示尺寸 (offsetWidth):', canvasDisplayWidth, 'x', canvasDisplayHeight, 'px'); + console.log(' - 用户缩放 (scale):', scale); + console.log(' - devicePixelRatio:', window.devicePixelRatio || 1); + console.log('🎯 自动计算坐标缩放:', autoScale.toFixed(3), 'x'); + console.log(' 公式: Canvas显示宽度 / PDF原始宽度 =', canvasDisplayWidth, '/', pdfOriginalWidthPt); // 保存原始宽度和自动计算的缩放比例 setPdfOriginalWidth(pdfOriginalWidthPt); setCoordinateScale(autoScale); setIsScaleAutoCalculated(true); - toastService.success(`自动校准完成: ${autoScale.toFixed(3)}x (${scaleMethod})`); + toastService.success(`自动校准完成: ${autoScale.toFixed(3)}x`); } else { console.warn('⚠️ 无法获取Canvas元素、Page容器或原始尺寸'); console.log('调试信息:', { diff --git a/public/testWork/(最终版)智慧法务平台建设采购项目合同(1).docx b/public/testWork/(最终版)智慧法务平台建设采购项目合同(1).docx new file mode 100644 index 0000000..2aa055c Binary files /dev/null and b/public/testWork/(最终版)智慧法务平台建设采购项目合同(1).docx differ diff --git a/public/testWork/(最终版)智慧法务平台建设采购项目合同(2).docx b/public/testWork/(最终版)智慧法务平台建设采购项目合同(2).docx new file mode 100644 index 0000000..2aa055c Binary files /dev/null and b/public/testWork/(最终版)智慧法务平台建设采购项目合同(2).docx differ