备份
This commit is contained in:
+162
-65
@@ -13,6 +13,7 @@ import { useState, useRef, useEffect } from "react";
|
|||||||
import { DiffEditor } from "@monaco-editor/react";
|
import { DiffEditor } from "@monaco-editor/react";
|
||||||
import type { editor } from "monaco-editor";
|
import type { editor } from "monaco-editor";
|
||||||
import { pdfjs } from 'react-pdf';
|
import { pdfjs } from 'react-pdf';
|
||||||
|
import mammoth from 'mammoth';
|
||||||
import { toastService } from '~/components/ui/Toast';
|
import { toastService } from '~/components/ui/Toast';
|
||||||
|
|
||||||
// 设置 PDF.js worker(与 pdf-demo.tsx 相同)
|
// 设置 PDF.js worker(与 pdf-demo.tsx 相同)
|
||||||
@@ -25,14 +26,26 @@ export const meta: MetaFunction = () => {
|
|||||||
];
|
];
|
||||||
};
|
};
|
||||||
|
|
||||||
|
// 文档类型枚举
|
||||||
|
type DocumentType = 'pdf' | 'docx' | 'unknown';
|
||||||
|
|
||||||
// PDF 类型枚举
|
// PDF 类型枚举
|
||||||
type PdfType = 'text' | 'scanned' | 'unknown';
|
type PdfType = 'text' | 'scanned' | 'unknown';
|
||||||
|
|
||||||
// PDF 信息接口
|
// PDF 信息接口(内部使用)
|
||||||
interface PdfInfo {
|
interface PdfInfo {
|
||||||
type: PdfType;
|
type: PdfType;
|
||||||
numPages: number;
|
numPages: number;
|
||||||
textLength: number;
|
textLength: number;
|
||||||
|
confidence: number;
|
||||||
|
}
|
||||||
|
|
||||||
|
// 文档信息接口
|
||||||
|
interface DocumentInfo {
|
||||||
|
fileType: DocumentType;
|
||||||
|
pdfType?: PdfType; // 只有 PDF 才有
|
||||||
|
numPages?: number; // PDF 页数
|
||||||
|
textLength: number;
|
||||||
confidence: number; // 文本提取置信度 (0-1)
|
confidence: number; // 文本提取置信度 (0-1)
|
||||||
}
|
}
|
||||||
|
|
||||||
@@ -120,15 +133,23 @@ export default function MonacoDemoPage() {
|
|||||||
const [diffCount, setDiffCount] = useState<number>(0);
|
const [diffCount, setDiffCount] = useState<number>(0);
|
||||||
const [currentDiff, setCurrentDiff] = useState<number>(0);
|
const [currentDiff, setCurrentDiff] = useState<number>(0);
|
||||||
|
|
||||||
// PDF相关状态
|
// 文档相关状态
|
||||||
const [pdf1Url, setPdf1Url] = useState<string>('');
|
const [doc1Url, setDoc1Url] = useState<string>('');
|
||||||
const [pdf2Url, setPdf2Url] = useState<string>('');
|
const [doc2Url, setDoc2Url] = useState<string>('');
|
||||||
const [pdf1Info, setPdf1Info] = useState<PdfInfo | null>(null);
|
const [doc1Info, setDoc1Info] = useState<DocumentInfo | null>(null);
|
||||||
const [pdf2Info, setPdf2Info] = useState<PdfInfo | null>(null);
|
const [doc2Info, setDoc2Info] = useState<DocumentInfo | null>(null);
|
||||||
const [isLoadingPdf1, setIsLoadingPdf1] = useState(false);
|
const [isLoadingDoc1, setIsLoadingDoc1] = useState(false);
|
||||||
const [isLoadingPdf2, setIsLoadingPdf2] = useState(false);
|
const [isLoadingDoc2, setIsLoadingDoc2] = useState(false);
|
||||||
const [useExample, setUseExample] = useState(true);
|
const [useExample, setUseExample] = useState(true);
|
||||||
|
|
||||||
|
/**
 * Classify a document by the extension of its file path (case-insensitive).
 *
 * Only `.docx` is reported as Word: the text extraction used for Word files
 * (mammoth) reads the zip-based OOXML format and cannot parse legacy binary
 * `.doc` files. Those are reported as `'unknown'` so callers take their
 * "unsupported file type" path instead of failing later with an opaque
 * zip-parsing error.
 *
 * @param filePath - Path or file name of the document.
 * @returns `'pdf'`, `'docx'`, or `'unknown'` when the extension is not supported.
 */
const detectFileType = (filePath: string): DocumentType => {
  const lowerPath = filePath.toLowerCase();
  if (lowerPath.endsWith('.pdf')) return 'pdf';
  if (lowerPath.endsWith('.docx')) return 'docx';
  return 'unknown';
};
|
||||||
|
|
||||||
// PDF类型检测函数
|
// PDF类型检测函数
|
||||||
const detectPdfType = async (pdfUrl: string): Promise<PdfInfo> => {
|
const detectPdfType = async (pdfUrl: string): Promise<PdfInfo> => {
|
||||||
const loadingTask = pdfjs.getDocument(pdfUrl);
|
const loadingTask = pdfjs.getDocument(pdfUrl);
|
||||||
@@ -189,32 +210,81 @@ export default function MonacoDemoPage() {
|
|||||||
return fullText;
|
return fullText;
|
||||||
};
|
};
|
||||||
|
|
||||||
// 加载PDF并提取文本
|
// Word文档文本提取函数
|
||||||
const loadPdfAndExtractText = async (pdfUrl: string, setPdfInfo: (info: PdfInfo | null) => void, setLoading: (loading: boolean) => void, setTextContent: (text: string) => void) => {
|
/**
 * Download a Word (.docx) document and return its raw text content.
 *
 * @param docUrl - URL the document is fetched from.
 * @returns The plain text extracted by mammoth.
 * @throws Error when the HTTP response is not OK; the message always contains
 *   the numeric status code.
 */
const extractTextFromWord = async (docUrl: string): Promise<string> => {
  const response = await fetch(docUrl);
  if (!response.ok) {
    // statusText is empty over HTTP/2 and HTTP/3, so always include the
    // numeric status to keep the message meaningful.
    const reason = response.statusText
      ? `${response.status} ${response.statusText}`
      : `HTTP ${response.status}`;
    throw new Error(`无法加载文档: ${reason}`);
  }

  // mammoth works on the raw bytes of the document.
  const arrayBuffer = await response.arrayBuffer();

  // Extract plain text only (formatting is discarded).
  const result = await mammoth.extractRawText({ arrayBuffer });

  return result.value;
};
|
||||||
|
|
||||||
|
// 加载文档并提取文本(支持 PDF 和 Word)
|
||||||
|
const loadDocumentAndExtractText = async (
|
||||||
|
docUrl: string,
|
||||||
|
filePath: string,
|
||||||
|
setDocInfo: (info: DocumentInfo | null) => void,
|
||||||
|
setLoading: (loading: boolean) => void,
|
||||||
|
setTextContent: (text: string) => void
|
||||||
|
) => {
|
||||||
try {
|
try {
|
||||||
setLoading(true);
|
setLoading(true);
|
||||||
|
|
||||||
// 1. 检测PDF类型
|
// 1. 检测文件类型
|
||||||
const pdfInfo = await detectPdfType(pdfUrl);
|
const fileType = detectFileType(filePath);
|
||||||
setPdfInfo(pdfInfo);
|
|
||||||
|
|
||||||
// 2. 提取文本
|
if (fileType === 'pdf') {
|
||||||
if (pdfInfo.type === 'text') {
|
// PDF 处理
|
||||||
const text = await extractTextFromPdf(pdfUrl);
|
const pdfInfo = await detectPdfType(docUrl);
|
||||||
|
const text = await extractTextFromPdf(docUrl);
|
||||||
|
|
||||||
|
const docInfo: DocumentInfo = {
|
||||||
|
fileType: 'pdf',
|
||||||
|
pdfType: pdfInfo.type,
|
||||||
|
numPages: pdfInfo.numPages,
|
||||||
|
textLength: pdfInfo.textLength,
|
||||||
|
confidence: pdfInfo.confidence
|
||||||
|
};
|
||||||
|
|
||||||
|
setDocInfo(docInfo);
|
||||||
setTextContent(text);
|
setTextContent(text);
|
||||||
|
|
||||||
|
if (pdfInfo.type === 'text') {
|
||||||
toastService.success(`PDF加载成功!共 ${pdfInfo.numPages} 页,提取了 ${pdfInfo.textLength} 个字符`);
|
toastService.success(`PDF加载成功!共 ${pdfInfo.numPages} 页,提取了 ${pdfInfo.textLength} 个字符`);
|
||||||
} else if (pdfInfo.type === 'scanned') {
|
} else if (pdfInfo.type === 'scanned') {
|
||||||
toastService.warning('检测到扫描版PDF,文本提取质量可能较低');
|
toastService.warning('检测到扫描版PDF,文本提取质量可能较低');
|
||||||
const text = await extractTextFromPdf(pdfUrl);
|
|
||||||
setTextContent(text);
|
|
||||||
} else {
|
} else {
|
||||||
toastService.error('无法识别PDF类型,可能是图片PDF');
|
toastService.error('无法识别PDF类型,可能是图片PDF');
|
||||||
|
}
|
||||||
|
} else if (fileType === 'docx') {
|
||||||
|
// Word 处理
|
||||||
|
const text = await extractTextFromWord(docUrl);
|
||||||
|
|
||||||
|
const docInfo: DocumentInfo = {
|
||||||
|
fileType: 'docx',
|
||||||
|
textLength: text.length,
|
||||||
|
confidence: 1.0 // Word 文档文本提取置信度为 100%
|
||||||
|
};
|
||||||
|
|
||||||
|
setDocInfo(docInfo);
|
||||||
|
setTextContent(text);
|
||||||
|
toastService.success(`Word文档加载成功!提取了 ${text.length} 个字符`);
|
||||||
|
} else {
|
||||||
|
toastService.error('不支持的文件类型');
|
||||||
setTextContent('');
|
setTextContent('');
|
||||||
}
|
}
|
||||||
} catch (error) {
|
} catch (error) {
|
||||||
console.error('PDF加载失败:', error);
|
console.error('文档加载失败:', error);
|
||||||
toastService.error('PDF加载失败,请检查文件路径');
|
toastService.error(`文档加载失败: ${error instanceof Error ? error.message : '未知错误'}`);
|
||||||
setPdfInfo(null);
|
setDocInfo(null);
|
||||||
setTextContent('');
|
setTextContent('');
|
||||||
} finally {
|
} finally {
|
||||||
setLoading(false);
|
setLoading(false);
|
||||||
@@ -281,8 +351,8 @@ export default function MonacoDemoPage() {
|
|||||||
setModifiedText(CONTRACT_B);
|
setModifiedText(CONTRACT_B);
|
||||||
setCurrentDiff(0);
|
setCurrentDiff(0);
|
||||||
setUseExample(true);
|
setUseExample(true);
|
||||||
setPdf1Info(null);
|
setDoc1Info(null);
|
||||||
setPdf2Info(null);
|
setDoc2Info(null);
|
||||||
|
|
||||||
// 重新计算差异数量
|
// 重新计算差异数量
|
||||||
setTimeout(() => {
|
setTimeout(() => {
|
||||||
@@ -295,34 +365,50 @@ export default function MonacoDemoPage() {
|
|||||||
}, 100);
|
}, 100);
|
||||||
};
|
};
|
||||||
|
|
||||||
// 从URL参数加载PDF
|
// 构建文件访问 URL
|
||||||
const loadPdfsFromUrl = () => {
|
/**
 * Build the URL a document is fetched from, based on its storage path.
 *
 * - Paths starting with `public/` are served as static resources with the
 *   `public/` prefix removed.
 * - Paths starting with `testWork/` or `testPDF/` are served as static
 *   resources as-is.
 * - Any other path is requested through `/api/pdf-proxy`, with the path passed
 *   as an encoded `path` query parameter.
 *
 * Each segment of a static path is percent-encoded so that file names
 * containing `#`, `?`, `%` or spaces are not misread as a fragment, query
 * string or escape sequence. The `/` separators are preserved.
 *
 * @param filePath - Storage path of the document (not URL-encoded).
 * @returns A same-origin URL that can be passed to `fetch` or pdf.js.
 */
const buildFileUrl = (filePath: string): string => {
  const toStaticUrl = (relativePath: string): string =>
    '/' +
    relativePath
      .split('/')
      .map((segment) => encodeURIComponent(segment))
      .join('/');

  if (filePath.startsWith('public/')) {
    // Drop the "public/" prefix: static files are served from the site root.
    return toStaticUrl(filePath.substring('public/'.length));
  }
  if (filePath.startsWith('testWork/') || filePath.startsWith('testPDF/')) {
    return toStaticUrl(filePath);
  }
  // Everything else goes through the proxy endpoint.
  return `/api/pdf-proxy?path=${encodeURIComponent(filePath)}`;
};
|
||||||
|
|
||||||
|
// 从URL参数加载文档(支持 PDF 和 Word)
|
||||||
|
const loadDocumentsFromUrl = () => {
|
||||||
if (typeof window === 'undefined') return;
|
if (typeof window === 'undefined') return;
|
||||||
|
|
||||||
const searchParams = new URLSearchParams(window.location.search);
|
const searchParams = new URLSearchParams(window.location.search);
|
||||||
const pdf1Path = searchParams.get('pdf1');
|
const doc1Path = searchParams.get('doc1') || searchParams.get('pdf1'); // 兼容旧参数名
|
||||||
const pdf2Path = searchParams.get('pdf2');
|
const doc2Path = searchParams.get('doc2') || searchParams.get('pdf2'); // 兼容旧参数名
|
||||||
|
|
||||||
if (pdf1Path || pdf2Path) {
|
if (doc1Path || doc2Path) {
|
||||||
setUseExample(false);
|
setUseExample(false);
|
||||||
|
|
||||||
if (pdf1Path) {
|
if (doc1Path) {
|
||||||
const fullUrl = `/api/pdf-proxy?path=${encodeURIComponent(pdf1Path)}`;
|
const fullUrl = buildFileUrl(doc1Path);
|
||||||
setPdf1Url(fullUrl);
|
setDoc1Url(fullUrl);
|
||||||
loadPdfAndExtractText(fullUrl, setPdf1Info, setIsLoadingPdf1, setOriginalText);
|
loadDocumentAndExtractText(fullUrl, doc1Path, setDoc1Info, setIsLoadingDoc1, setOriginalText);
|
||||||
}
|
}
|
||||||
|
|
||||||
if (pdf2Path) {
|
if (doc2Path) {
|
||||||
const fullUrl = `/api/pdf-proxy?path=${encodeURIComponent(pdf2Path)}`;
|
const fullUrl = buildFileUrl(doc2Path);
|
||||||
setPdf2Url(fullUrl);
|
setDoc2Url(fullUrl);
|
||||||
loadPdfAndExtractText(fullUrl, setPdf2Info, setIsLoadingPdf2, setModifiedText);
|
loadDocumentAndExtractText(fullUrl, doc2Path, setDoc2Info, setIsLoadingDoc2, setModifiedText);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
};
|
};
|
||||||
|
|
||||||
// 组件挂载时读取URL参数
|
// 组件挂载时读取URL参数
|
||||||
useEffect(() => {
|
useEffect(() => {
|
||||||
loadPdfsFromUrl();
|
loadDocumentsFromUrl();
|
||||||
// eslint-disable-next-line react-hooks/exhaustive-deps
|
// eslint-disable-next-line react-hooks/exhaustive-deps
|
||||||
}, []);
|
}, []);
|
||||||
|
|
||||||
@@ -452,8 +538,8 @@ export default function MonacoDemoPage() {
|
|||||||
</div>
|
</div>
|
||||||
</div>
|
</div>
|
||||||
|
|
||||||
{/* PDF加载信息 */}
|
{/* 文档加载信息 */}
|
||||||
{!useExample && (pdf1Info || pdf2Info || isLoadingPdf1 || isLoadingPdf2) && (
|
{!useExample && (doc1Info || doc2Info || isLoadingDoc1 || isLoadingDoc2) && (
|
||||||
<div style={{
|
<div style={{
|
||||||
padding: '12px 24px',
|
padding: '12px 24px',
|
||||||
backgroundColor: '#fff3cd',
|
backgroundColor: '#fff3cd',
|
||||||
@@ -462,48 +548,58 @@ export default function MonacoDemoPage() {
|
|||||||
color: '#856404'
|
color: '#856404'
|
||||||
}}>
|
}}>
|
||||||
<div style={{ display: 'flex', alignItems: 'flex-start', gap: '24px' }}>
|
<div style={{ display: 'flex', alignItems: 'flex-start', gap: '24px' }}>
|
||||||
<i className="ri-file-pdf-line" style={{ fontSize: '18px', marginTop: '2px' }}></i>
|
<i className="ri-file-text-line" style={{ fontSize: '18px', marginTop: '2px' }}></i>
|
||||||
<div style={{ flex: 1 }}>
|
<div style={{ flex: 1 }}>
|
||||||
<strong>PDF文档信息:</strong>
|
<strong>文档信息:</strong>
|
||||||
<div style={{ display: 'flex', gap: '24px', marginTop: '8px' }}>
|
<div style={{ display: 'flex', gap: '24px', marginTop: '8px' }}>
|
||||||
{/* PDF 1 信息 */}
|
{/* 文档 1 信息 */}
|
||||||
<div style={{ flex: 1 }}>
|
<div style={{ flex: 1 }}>
|
||||||
<div style={{ fontWeight: 'bold', marginBottom: '4px' }}>📄 文档1(左侧/原始)</div>
|
<div style={{ fontWeight: 'bold', marginBottom: '4px' }}>📄 文档1(左侧/原始)</div>
|
||||||
{isLoadingPdf1 ? (
|
{isLoadingDoc1 ? (
|
||||||
<div style={{ color: '#666' }}>⏳ 加载中...</div>
|
<div style={{ color: '#666' }}>⏳ 加载中...</div>
|
||||||
) : pdf1Info ? (
|
) : doc1Info ? (
|
||||||
<div>
|
<div>
|
||||||
<div>类型: <span style={{
|
<div>类型: <span style={{
|
||||||
color: pdf1Info.type === 'text' ? '#28a745' : pdf1Info.type === 'scanned' ? '#ffc107' : '#dc3545',
|
color: doc1Info.fileType === 'pdf' ? '#007bff' : doc1Info.fileType === 'docx' ? '#28a745' : '#dc3545',
|
||||||
fontWeight: 'bold'
|
fontWeight: 'bold'
|
||||||
}}>
|
}}>
|
||||||
{pdf1Info.type === 'text' ? '✅ 文本PDF' : pdf1Info.type === 'scanned' ? '⚠️ 扫描PDF' : '❌ 未知类型'}
|
{doc1Info.fileType === 'pdf' ? '📕 PDF文档' : doc1Info.fileType === 'docx' ? '📘 Word文档' : '❌ 未知类型'}
|
||||||
</span></div>
|
</span></div>
|
||||||
<div>页数: {pdf1Info.numPages} 页</div>
|
{doc1Info.fileType === 'pdf' && doc1Info.numPages && (
|
||||||
<div>字符数: {pdf1Info.textLength} 个</div>
|
<div>页数: {doc1Info.numPages} 页</div>
|
||||||
<div>置信度: {(pdf1Info.confidence * 100).toFixed(0)}%</div>
|
)}
|
||||||
|
{doc1Info.fileType === 'pdf' && doc1Info.pdfType && (
|
||||||
|
<div>PDF类型: {doc1Info.pdfType === 'text' ? '✅ 文本' : doc1Info.pdfType === 'scanned' ? '⚠️ 扫描' : '❌ 未知'}</div>
|
||||||
|
)}
|
||||||
|
<div>字符数: {doc1Info.textLength} 个</div>
|
||||||
|
<div>置信度: {(doc1Info.confidence * 100).toFixed(0)}%</div>
|
||||||
</div>
|
</div>
|
||||||
) : (
|
) : (
|
||||||
<div style={{ color: '#999' }}>未加载</div>
|
<div style={{ color: '#999' }}>未加载</div>
|
||||||
)}
|
)}
|
||||||
</div>
|
</div>
|
||||||
|
|
||||||
{/* PDF 2 信息 */}
|
{/* 文档 2 信息 */}
|
||||||
<div style={{ flex: 1 }}>
|
<div style={{ flex: 1 }}>
|
||||||
<div style={{ fontWeight: 'bold', marginBottom: '4px' }}>📄 文档2(右侧/修改)</div>
|
<div style={{ fontWeight: 'bold', marginBottom: '4px' }}>📄 文档2(右侧/修改)</div>
|
||||||
{isLoadingPdf2 ? (
|
{isLoadingDoc2 ? (
|
||||||
<div style={{ color: '#666' }}>⏳ 加载中...</div>
|
<div style={{ color: '#666' }}>⏳ 加载中...</div>
|
||||||
) : pdf2Info ? (
|
) : doc2Info ? (
|
||||||
<div>
|
<div>
|
||||||
<div>类型: <span style={{
|
<div>类型: <span style={{
|
||||||
color: pdf2Info.type === 'text' ? '#28a745' : pdf2Info.type === 'scanned' ? '#ffc107' : '#dc3545',
|
color: doc2Info.fileType === 'pdf' ? '#007bff' : doc2Info.fileType === 'docx' ? '#28a745' : '#dc3545',
|
||||||
fontWeight: 'bold'
|
fontWeight: 'bold'
|
||||||
}}>
|
}}>
|
||||||
{pdf2Info.type === 'text' ? '✅ 文本PDF' : pdf2Info.type === 'scanned' ? '⚠️ 扫描PDF' : '❌ 未知类型'}
|
{doc2Info.fileType === 'pdf' ? '📕 PDF文档' : doc2Info.fileType === 'docx' ? '📘 Word文档' : '❌ 未知类型'}
|
||||||
</span></div>
|
</span></div>
|
||||||
<div>页数: {pdf2Info.numPages} 页</div>
|
{doc2Info.fileType === 'pdf' && doc2Info.numPages && (
|
||||||
<div>字符数: {pdf2Info.textLength} 个</div>
|
<div>页数: {doc2Info.numPages} 页</div>
|
||||||
<div>置信度: {(pdf2Info.confidence * 100).toFixed(0)}%</div>
|
)}
|
||||||
|
{doc2Info.fileType === 'pdf' && doc2Info.pdfType && (
|
||||||
|
<div>PDF类型: {doc2Info.pdfType === 'text' ? '✅ 文本' : doc2Info.pdfType === 'scanned' ? '⚠️ 扫描' : '❌ 未知'}</div>
|
||||||
|
)}
|
||||||
|
<div>字符数: {doc2Info.textLength} 个</div>
|
||||||
|
<div>置信度: {(doc2Info.confidence * 100).toFixed(0)}%</div>
|
||||||
</div>
|
</div>
|
||||||
) : (
|
) : (
|
||||||
<div style={{ color: '#999' }}>未加载</div>
|
<div style={{ color: '#999' }}>未加载</div>
|
||||||
@@ -537,7 +633,7 @@ export default function MonacoDemoPage() {
|
|||||||
<div style={{ marginTop: '12px', paddingTop: '12px', borderTop: '1px solid #b3d9ff' }}>
|
<div style={{ marginTop: '12px', paddingTop: '12px', borderTop: '1px solid #b3d9ff' }}>
|
||||||
<strong>💡 使用提示:</strong>
|
<strong>💡 使用提示:</strong>
|
||||||
<div style={{ marginTop: '4px' }}>
|
<div style={{ marginTop: '4px' }}>
|
||||||
您可以通过URL参数加载PDF文档进行对比:
|
您可以通过URL参数加载文档进行对比(支持 PDF 和 Word):
|
||||||
<code style={{
|
<code style={{
|
||||||
display: 'block',
|
display: 'block',
|
||||||
marginTop: '4px',
|
marginTop: '4px',
|
||||||
@@ -547,10 +643,11 @@ export default function MonacoDemoPage() {
|
|||||||
fontSize: '12px',
|
fontSize: '12px',
|
||||||
wordBreak: 'break-all'
|
wordBreak: 'break-all'
|
||||||
}}>
|
}}>
|
||||||
/monaco-demo?pdf1=路径1&pdf2=路径2
|
/monaco-demo?doc1=路径1&doc2=路径2
|
||||||
</code>
|
</code>
|
||||||
<div style={{ marginTop: '4px', fontSize: '12px' }}>
|
<div style={{ marginTop: '4px', fontSize: '12px' }}>
|
||||||
示例: <code>/monaco-demo?pdf1=documents/contract_v1.pdf&pdf2=documents/contract_v2.pdf</code>
|
<div>PDF示例: <code>/monaco-demo?doc1=documents/contract_v1.pdf&doc2=documents/contract_v2.pdf</code></div>
|
||||||
|
<div style={{ marginTop: '2px' }}>Word示例: <code>/monaco-demo?doc1=testWork/(最终版)智慧法务平台建设采购项目合同(1).docx&doc2=testWork/(最终版)智慧法务平台建设采购项目合同(2).docx</code></div>
|
||||||
</div>
|
</div>
|
||||||
</div>
|
</div>
|
||||||
</div>
|
</div>
|
||||||
@@ -591,8 +688,8 @@ export default function MonacoDemoPage() {
|
|||||||
}}
|
}}
|
||||||
/>
|
/>
|
||||||
|
|
||||||
{/* PDF加载中的遮罩层 */}
|
{/* 文档加载中的遮罩层 */}
|
||||||
{(isLoadingPdf1 || isLoadingPdf2) && (
|
{(isLoadingDoc1 || isLoadingDoc2) && (
|
||||||
<div style={{
|
<div style={{
|
||||||
position: 'absolute',
|
position: 'absolute',
|
||||||
top: 0,
|
top: 0,
|
||||||
@@ -616,10 +713,10 @@ export default function MonacoDemoPage() {
|
|||||||
margin: '0 auto 16px'
|
margin: '0 auto 16px'
|
||||||
}}></div>
|
}}></div>
|
||||||
<div style={{ fontSize: '16px', color: '#333' }}>
|
<div style={{ fontSize: '16px', color: '#333' }}>
|
||||||
正在加载PDF文档并提取文本...
|
正在加载文档并提取文本...
|
||||||
</div>
|
</div>
|
||||||
{isLoadingPdf1 && <div style={{ fontSize: '14px', color: '#666', marginTop: '8px' }}>📄 加载文档1</div>}
|
{isLoadingDoc1 && <div style={{ fontSize: '14px', color: '#666', marginTop: '8px' }}>📄 加载文档1</div>}
|
||||||
{isLoadingPdf2 && <div style={{ fontSize: '14px', color: '#666', marginTop: '8px' }}>📄 加载文档2</div>}
|
{isLoadingDoc2 && <div style={{ fontSize: '14px', color: '#666', marginTop: '8px' }}>📄 加载文档2</div>}
|
||||||
</div>
|
</div>
|
||||||
</div>
|
</div>
|
||||||
)}
|
)}
|
||||||
|
|||||||
+13
-72
@@ -39,7 +39,7 @@ interface HighlightArea {
|
|||||||
|
|
||||||
// 基于坐标的字符数据
|
// 基于坐标的字符数据
|
||||||
interface CharacterBox {
|
interface CharacterBox {
|
||||||
box: [number, number][]; // 4个点:左上、右上、右下、左下
|
box: [number, number][];
|
||||||
char: string;
|
char: string;
|
||||||
page: number;
|
page: number;
|
||||||
}
|
}
|
||||||
@@ -70,7 +70,7 @@ export default function PdfDemo() {
|
|||||||
// PDF文件URL(使用示例PDF)
|
// PDF文件URL(使用示例PDF)
|
||||||
// const [pdfUrl] = useState('/testPDF/sample.pdf'); // 使用包含真实文本层的PDF
|
// const [pdfUrl] = useState('/testPDF/sample.pdf'); // 使用包含真实文本层的PDF
|
||||||
// const [pdfUrl] = useState('/api/pdf-proxy?path=documents/mz/行政处罚决定书/2025/11月13日/第71号--未在当地烟草专卖批发企业进货_02时58分36秒/第71号--未在当地烟草专卖批发企业进货.pdf'); // 使用项目中的示例PDF
|
// const [pdfUrl] = useState('/api/pdf-proxy?path=documents/mz/行政处罚决定书/2025/11月13日/第71号--未在当地烟草专卖批发企业进货_02时58分36秒/第71号--未在当地烟草专卖批发企业进货.pdf'); // 使用项目中的示例PDF
|
||||||
const [pdfUrl] = useState('/api/pdf-proxy?path=documents/mz/行政处罚决定书/2025/11月22日/第35号--无烟草专卖品准运证运输烟草专卖品_15时15分24秒/第35号--无烟草专卖品准运证运输烟草专卖品.pdf')
|
const [pdfUrl] = useState('/api/pdf-proxy?path=documents/mz/测试示范类型/2025/11月24日/第37号--涉嫌生产、销售伪劣产品罪_12时19分10秒/第37号--涉嫌生产、销售伪劣产品罪.pdf')
|
||||||
|
|
||||||
// PDF状态
|
// PDF状态
|
||||||
const [numPages, setNumPages] = useState<number | null>(null);
|
const [numPages, setNumPages] = useState<number | null>(null);
|
||||||
@@ -227,87 +227,28 @@ export default function PdfDemo() {
|
|||||||
// 获取Page容器(SVG实际渲染的坐标空间)
|
// 获取Page容器(SVG实际渲染的坐标空间)
|
||||||
const pageContainer = canvas?.closest('.react-pdf__Page') as HTMLElement;
|
const pageContainer = canvas?.closest('.react-pdf__Page') as HTMLElement;
|
||||||
|
|
||||||
if (canvas && pageContainer && pdfOriginalWidthPt) {
|
if (canvas && pdfOriginalWidthPt) {
|
||||||
// Canvas 内部绘制尺寸(考虑了 devicePixelRatio)
|
|
||||||
const canvasInternalWidth = canvas.width;
|
|
||||||
const canvasInternalHeight = canvas.height;
|
|
||||||
|
|
||||||
// Canvas 显示尺寸(浏览器中实际占用的像素)
|
// Canvas 显示尺寸(浏览器中实际占用的像素)
|
||||||
const canvasDisplayWidth = canvas.offsetWidth;
|
const canvasDisplayWidth = canvas.offsetWidth;
|
||||||
const canvasDisplayHeight = canvas.offsetHeight;
|
const canvasDisplayHeight = canvas.offsetHeight;
|
||||||
|
|
||||||
// Page容器尺寸(SVG高亮渲染的实际坐标空间)
|
// 计算坐标缩放比例:Canvas显示尺寸 / PDF原始尺寸
|
||||||
const pageContainerWidth = pageContainer.offsetWidth;
|
const autoScale = canvasDisplayWidth / pdfOriginalWidthPt;
|
||||||
const pageContainerHeight = pageContainer.offsetHeight;
|
|
||||||
|
|
||||||
// 尝试多种计算方式
|
console.log('📏 PDF尺寸信息:');
|
||||||
const scale1_canvasDisplay = canvasDisplayWidth / pdfOriginalWidthPt;
|
console.log(' - PDF原始尺寸 (page.view):', pdfOriginalWidthPt, 'x', pdfOriginalHeightPt, 'pt');
|
||||||
const scale2_canvasInternal = canvasInternalWidth / pdfOriginalWidthPt;
|
console.log(' - Canvas显示尺寸 (offsetWidth):', canvasDisplayWidth, 'x', canvasDisplayHeight, 'px');
|
||||||
const scale3_pageContainer = pageContainerWidth / pdfOriginalWidthPt;
|
console.log(' - 用户缩放 (scale):', scale);
|
||||||
|
console.log(' - devicePixelRatio:', window.devicePixelRatio || 1);
|
||||||
// 尝试反向计算:如果OCR尺寸比渲染尺寸大(需要缩小)
|
console.log('🎯 自动计算坐标缩放:', autoScale.toFixed(3), 'x');
|
||||||
const scale4_inverseCanvasInternal = canvasDisplayWidth / canvasInternalWidth;
|
console.log(' 公式: Canvas显示宽度 / PDF原始宽度 =', canvasDisplayWidth, '/', pdfOriginalWidthPt);
|
||||||
const scale5_inversePage = canvasDisplayWidth / pageContainerWidth;
|
|
||||||
|
|
||||||
// 计算如果要达到 0.83 的缩放比例,OCR原始尺寸应该是多少
|
|
||||||
const expectedOcrWidth = canvasDisplayWidth / 0.83;
|
|
||||||
|
|
||||||
console.log('📏 尺寸信息汇总:');
|
|
||||||
console.log(' 1️⃣ PDF原始尺寸 (page.view):', pdfOriginalWidthPt, 'x', pdfOriginalHeightPt, 'pt');
|
|
||||||
console.log(' 2️⃣ Page容器尺寸:', pageContainerWidth, 'x', pageContainerHeight, 'px');
|
|
||||||
console.log(' 3️⃣ Canvas显示尺寸:', canvasDisplayWidth, 'x', canvasDisplayHeight, 'px');
|
|
||||||
console.log(' 4️⃣ Canvas内部尺寸:', canvasInternalWidth, 'x', canvasInternalHeight, 'px');
|
|
||||||
console.log(' 5️⃣ 用户缩放 (scale):', scale);
|
|
||||||
console.log(' 6️⃣ devicePixelRatio:', window.devicePixelRatio || 1);
|
|
||||||
console.log('');
|
|
||||||
console.log('🎯 各种计算方式:');
|
|
||||||
console.log(' 方案1️⃣: Canvas显示 / PDF原始 =', scale1_canvasDisplay.toFixed(3), 'x');
|
|
||||||
console.log(' 方案2️⃣: Canvas内部 / PDF原始 =', scale2_canvasInternal.toFixed(3), 'x');
|
|
||||||
console.log(' 方案3️⃣: Page容器 / PDF原始 =', scale3_pageContainer.toFixed(3), 'x');
|
|
||||||
console.log(' 方案4️⃣: Canvas显示 / Canvas内部 =', scale4_inverseCanvasInternal.toFixed(3), 'x ⬅ 可能是这个!');
|
|
||||||
console.log(' 方案5️⃣: Canvas显示 / Page容器 =', scale5_inversePage.toFixed(3), 'x');
|
|
||||||
console.log('');
|
|
||||||
console.log('🔍 目标值分析:');
|
|
||||||
console.log(' - 手动校准的正确值: 0.83');
|
|
||||||
console.log(' - 反推OCR图像尺寸:', expectedOcrWidth.toFixed(0), 'x', (canvasDisplayHeight / 0.83).toFixed(0), 'px');
|
|
||||||
console.log(' - 比较: ', expectedOcrWidth.toFixed(0), 'vs Canvas内部', canvasInternalWidth);
|
|
||||||
|
|
||||||
// 使用最接近0.83的方案
|
|
||||||
let autoScale = scale1_canvasDisplay;
|
|
||||||
let scaleMethod = '方案1 (Canvas显示/PDF原始)';
|
|
||||||
|
|
||||||
// 检查哪个方案最接近0.83
|
|
||||||
const diff1 = Math.abs(scale1_canvasDisplay - 0.83);
|
|
||||||
const diff2 = Math.abs(scale2_canvasInternal - 0.83);
|
|
||||||
const diff3 = Math.abs(scale3_pageContainer - 0.83);
|
|
||||||
const diff4 = Math.abs(scale4_inverseCanvasInternal - 0.83);
|
|
||||||
const diff5 = Math.abs(scale5_inversePage - 0.83);
|
|
||||||
|
|
||||||
const minDiff = Math.min(diff1, diff2, diff3, diff4, diff5);
|
|
||||||
|
|
||||||
if (minDiff === diff4) {
|
|
||||||
autoScale = scale4_inverseCanvasInternal;
|
|
||||||
scaleMethod = '方案4 (Canvas显示/Canvas内部)';
|
|
||||||
} else if (minDiff === diff5) {
|
|
||||||
autoScale = scale5_inversePage;
|
|
||||||
scaleMethod = '方案5 (Canvas显示/Page容器)';
|
|
||||||
} else if (minDiff === diff2) {
|
|
||||||
autoScale = scale2_canvasInternal;
|
|
||||||
scaleMethod = '方案2 (Canvas内部/PDF原始)';
|
|
||||||
} else if (minDiff === diff3) {
|
|
||||||
autoScale = scale3_pageContainer;
|
|
||||||
scaleMethod = '方案3 (Page容器/PDF原始)';
|
|
||||||
}
|
|
||||||
|
|
||||||
console.log('');
|
|
||||||
console.log('✅ 自动选择:', scaleMethod, '=', autoScale.toFixed(3), 'x (最接近0.83)');
|
|
||||||
|
|
||||||
// 保存原始宽度和自动计算的缩放比例
|
// 保存原始宽度和自动计算的缩放比例
|
||||||
setPdfOriginalWidth(pdfOriginalWidthPt);
|
setPdfOriginalWidth(pdfOriginalWidthPt);
|
||||||
setCoordinateScale(autoScale);
|
setCoordinateScale(autoScale);
|
||||||
setIsScaleAutoCalculated(true);
|
setIsScaleAutoCalculated(true);
|
||||||
|
|
||||||
toastService.success(`自动校准完成: ${autoScale.toFixed(3)}x (${scaleMethod})`);
|
toastService.success(`自动校准完成: ${autoScale.toFixed(3)}x`);
|
||||||
} else {
|
} else {
|
||||||
console.warn('⚠️ 无法获取Canvas元素、Page容器或原始尺寸');
|
console.warn('⚠️ 无法获取Canvas元素、Page容器或原始尺寸');
|
||||||
console.log('调试信息:', {
|
console.log('调试信息:', {
|
||||||
|
|||||||
Binary file not shown.
Binary file not shown.
Reference in New Issue
Block a user