This commit is contained in:
2025-11-25 11:02:40 +08:00
parent 93bae2de17
commit 0ed6f0aaf4
5 changed files with 194 additions and 156 deletions
+165 -68
View File
@@ -13,6 +13,7 @@ import { useState, useRef, useEffect } from "react";
import { DiffEditor } from "@monaco-editor/react";
import type { editor } from "monaco-editor";
import { pdfjs } from 'react-pdf';
import mammoth from 'mammoth';
import { toastService } from '~/components/ui/Toast';
// 设置 PDF.js worker(与 pdf-demo.tsx 相同)
@@ -25,14 +26,26 @@ export const meta: MetaFunction = () => {
];
};
// Document type: the kinds of file this page distinguishes when loading a
// document. 'unknown' covers every path that is recognised as neither PDF nor Word.
type DocumentType = 'pdf' | 'docx' | 'unknown';
// PDF type: classification of a PDF's content. The page reports 'scanned' to
// the user as a scanned PDF whose extracted text may be of lower quality, and
// 'unknown' as an unrecognised (possibly image-only) PDF.
type PdfType = 'text' | 'scanned' | 'unknown';
// PDF 信息接口
// PDF information (internal use): the value detectPdfType resolves with.
interface PdfInfo {
// Content classification of the PDF.
type: PdfType;
// Page count (shown to the user as the number of pages).
numPages: number;
// Number of extracted characters, as reported in the success message.
textLength: number;
// Detection confidence; presumably in the range 0-1 (rendered as a percentage) — TODO confirm against detectPdfType.
confidence: number;
}
// Document information: metadata about a loaded document (PDF or Word)
// that the page keeps in state and renders in the info panel.
interface DocumentInfo {
// Which kind of document was loaded.
fileType: DocumentType;
pdfType?: PdfType; // only present for PDFs
numPages?: number; // PDF page count
// Length of the extracted text in characters.
textLength: number;
confidence: number; // text-extraction confidence (0-1)
}
@@ -120,15 +133,23 @@ export default function MonacoDemoPage() {
const [diffCount, setDiffCount] = useState<number>(0);
const [currentDiff, setCurrentDiff] = useState<number>(0);
// PDF相关状态
const [pdf1Url, setPdf1Url] = useState<string>('');
const [pdf2Url, setPdf2Url] = useState<string>('');
const [pdf1Info, setPdf1Info] = useState<PdfInfo | null>(null);
const [pdf2Info, setPdf2Info] = useState<PdfInfo | null>(null);
const [isLoadingPdf1, setIsLoadingPdf1] = useState(false);
const [isLoadingPdf2, setIsLoadingPdf2] = useState(false);
// Document-related state, one set per side of the comparison (1 and 2):
//  - docNUrl:       URL the document is fetched from.
//  - docNInfo:      metadata of the loaded document; null until a document has
//                   been loaded, and reset to null when loading fails.
//  - isLoadingDocN: true while the document is being fetched and its text
//                   extracted; drives the loading overlay and the info panel.
const [doc1Url, setDoc1Url] = useState<string>('');
const [doc2Url, setDoc2Url] = useState<string>('');
const [doc1Info, setDoc1Info] = useState<DocumentInfo | null>(null);
const [doc2Info, setDoc2Info] = useState<DocumentInfo | null>(null);
const [isLoadingDoc1, setIsLoadingDoc1] = useState(false);
const [isLoadingDoc2, setIsLoadingDoc2] = useState(false);
const [useExample, setUseExample] = useState(true);
// Detect the document type from the file path's extension (case-insensitive).
//
// Only `.docx` is classified as 'docx'. Word text extraction on this page goes
// through mammoth, which reads the ZIP-based .docx format only; a legacy binary
// `.doc` file would fail inside the parser with an opaque error. Such files are
// therefore reported as 'unknown' so the caller can show its
// "unsupported file type" message instead.
//
// @param filePath  Path (or file name) of the document; only its suffix is inspected.
// @returns 'pdf' for `.pdf`, 'docx' for `.docx`, otherwise 'unknown'.
const detectFileType = (filePath: string): DocumentType => {
  const lowerPath = filePath.toLowerCase();
  if (lowerPath.endsWith('.pdf')) return 'pdf';
  if (lowerPath.endsWith('.docx')) return 'docx';
  return 'unknown';
};
// PDF类型检测函数
const detectPdfType = async (pdfUrl: string): Promise<PdfInfo> => {
const loadingTask = pdfjs.getDocument(pdfUrl);
@@ -189,32 +210,81 @@ export default function MonacoDemoPage() {
return fullText;
};
// 加载PDF并提取文本
const loadPdfAndExtractText = async (pdfUrl: string, setPdfInfo: (info: PdfInfo | null) => void, setLoading: (loading: boolean) => void, setTextContent: (text: string) => void) => {
// Extract the plain text of a Word document.
//
// Fetches the file at `docUrl`, reads it into an ArrayBuffer and hands it to
// mammoth's raw-text extraction.
//
// @param docUrl  URL the document is fetched from.
// @returns The raw text mammoth extracted from the document.
// @throws Error when the HTTP response is not OK; errors from fetch or mammoth
//         propagate to the caller unchanged.
const extractTextFromWord = async (docUrl: string): Promise<string> => {
  // Fetch the file.
  const response = await fetch(docUrl);
  if (!response.ok) {
    // statusText is an empty string over HTTP/2, so always include the numeric
    // status code; otherwise the message would end with nothing after the colon.
    const reason = `${response.status} ${response.statusText}`.trim();
    throw new Error(`无法加载文档: ${reason}`);
  }

  // Read the body as an ArrayBuffer, the input form mammoth is given here.
  const arrayBuffer = await response.arrayBuffer();

  // Extract plain text with mammoth.
  const result = await mammoth.extractRawText({ arrayBuffer });
  return result.value;
};
// 加载文档并提取文本(支持 PDF 和 Word)
const loadDocumentAndExtractText = async (
docUrl: string,
filePath: string,
setDocInfo: (info: DocumentInfo | null) => void,
setLoading: (loading: boolean) => void,
setTextContent: (text: string) => void
) => {
try {
setLoading(true);
// 1. 检测PDF类型
const pdfInfo = await detectPdfType(pdfUrl);
setPdfInfo(pdfInfo);
// 1. 检测文件类型
const fileType = detectFileType(filePath);
// 2. 提取文本
if (pdfInfo.type === 'text') {
const text = await extractTextFromPdf(pdfUrl);
if (fileType === 'pdf') {
// PDF 处理
const pdfInfo = await detectPdfType(docUrl);
const text = await extractTextFromPdf(docUrl);
const docInfo: DocumentInfo = {
fileType: 'pdf',
pdfType: pdfInfo.type,
numPages: pdfInfo.numPages,
textLength: pdfInfo.textLength,
confidence: pdfInfo.confidence
};
setDocInfo(docInfo);
setTextContent(text);
toastService.success(`PDF加载成功!共 ${pdfInfo.numPages} 页,提取了 ${pdfInfo.textLength} 个字符`);
} else if (pdfInfo.type === 'scanned') {
toastService.warning('检测到扫描版PDF,文本提取质量可能较低');
const text = await extractTextFromPdf(pdfUrl);
if (pdfInfo.type === 'text') {
toastService.success(`PDF加载成功!共 ${pdfInfo.numPages} 页,提取了 ${pdfInfo.textLength} 个字符`);
} else if (pdfInfo.type === 'scanned') {
toastService.warning('检测到扫描版PDF,文本提取质量可能较低');
} else {
toastService.error('无法识别PDF类型,可能是图片PDF');
}
} else if (fileType === 'docx') {
// Word 处理
const text = await extractTextFromWord(docUrl);
const docInfo: DocumentInfo = {
fileType: 'docx',
textLength: text.length,
confidence: 1.0 // Word 文档文本提取置信度为 100%
};
setDocInfo(docInfo);
setTextContent(text);
toastService.success(`Word文档加载成功!提取了 ${text.length} 个字符`);
} else {
toastService.error('无法识别PDF类型,可能是图片PDF');
toastService.error('不支持的文件类型');
setTextContent('');
}
} catch (error) {
console.error('PDF加载失败:', error);
toastService.error('PDF加载失败,请检查文件路径');
setPdfInfo(null);
console.error('文档加载失败:', error);
toastService.error(`文档加载失败: ${error instanceof Error ? error.message : '未知错误'}`);
setDocInfo(null);
setTextContent('');
} finally {
setLoading(false);
@@ -281,8 +351,8 @@ export default function MonacoDemoPage() {
setModifiedText(CONTRACT_B);
setCurrentDiff(0);
setUseExample(true);
setPdf1Info(null);
setPdf2Info(null);
setDoc1Info(null);
setDoc2Info(null);
// 重新计算差异数量
setTimeout(() => {
@@ -295,34 +365,50 @@ export default function MonacoDemoPage() {
}, 100);
};
// 从URL参数加载PDF
const loadPdfsFromUrl = () => {
// Build the URL a document is fetched from.
//
//  - Paths starting with `public/` are static resources: the `public/` prefix is
//    dropped and the remainder is served from the site root.
//  - Paths starting with `testWork/` or `testPDF/` (directories under public)
//    are also served directly as static resources.
//  - Every other path goes through the `/api/pdf-proxy` endpoint (fetched from MinIO).
//
// Static paths are percent-encoded segment by segment, just as the proxy branch
// encodes its `path` parameter. Without this, a file name containing `#`, `?`
// or `%` would be parsed as a fragment, query or escape sequence and the wrong
// resource would be requested.
//
// @param filePath  Document path, already URL-decoded (e.g. from URLSearchParams).
// @returns A root-relative URL for the document.
const buildFileUrl = (filePath: string): string => {
  const publicPrefix = 'public/';

  // Encode each segment on its own so the '/' separators stay intact.
  const toStaticUrl = (relativePath: string): string =>
    '/' + relativePath.split('/').map((segment) => encodeURIComponent(segment)).join('/');

  if (filePath.startsWith(publicPrefix)) {
    return toStaticUrl(filePath.slice(publicPrefix.length));
  }
  if (filePath.startsWith('testWork/') || filePath.startsWith('testPDF/')) {
    return toStaticUrl(filePath);
  }
  return `/api/pdf-proxy?path=${encodeURIComponent(filePath)}`;
};
// 从URL参数加载文档(支持 PDF 和 Word)
const loadDocumentsFromUrl = () => {
if (typeof window === 'undefined') return;
const searchParams = new URLSearchParams(window.location.search);
const pdf1Path = searchParams.get('pdf1');
const pdf2Path = searchParams.get('pdf2');
const doc1Path = searchParams.get('doc1') || searchParams.get('pdf1'); // 兼容旧参数名
const doc2Path = searchParams.get('doc2') || searchParams.get('pdf2'); // 兼容旧参数名
if (pdf1Path || pdf2Path) {
if (doc1Path || doc2Path) {
setUseExample(false);
if (pdf1Path) {
const fullUrl = `/api/pdf-proxy?path=${encodeURIComponent(pdf1Path)}`;
setPdf1Url(fullUrl);
loadPdfAndExtractText(fullUrl, setPdf1Info, setIsLoadingPdf1, setOriginalText);
if (doc1Path) {
const fullUrl = buildFileUrl(doc1Path);
setDoc1Url(fullUrl);
loadDocumentAndExtractText(fullUrl, doc1Path, setDoc1Info, setIsLoadingDoc1, setOriginalText);
}
if (pdf2Path) {
const fullUrl = `/api/pdf-proxy?path=${encodeURIComponent(pdf2Path)}`;
setPdf2Url(fullUrl);
loadPdfAndExtractText(fullUrl, setPdf2Info, setIsLoadingPdf2, setModifiedText);
if (doc2Path) {
const fullUrl = buildFileUrl(doc2Path);
setDoc2Url(fullUrl);
loadDocumentAndExtractText(fullUrl, doc2Path, setDoc2Info, setIsLoadingDoc2, setModifiedText);
}
}
};
// 组件挂载时读取URL参数
useEffect(() => {
loadPdfsFromUrl();
loadDocumentsFromUrl();
// eslint-disable-next-line react-hooks/exhaustive-deps
}, []);
@@ -452,8 +538,8 @@ export default function MonacoDemoPage() {
</div>
</div>
{/* PDF加载信息 */}
{!useExample && (pdf1Info || pdf2Info || isLoadingPdf1 || isLoadingPdf2) && (
{/* 文档加载信息 */}
{!useExample && (doc1Info || doc2Info || isLoadingDoc1 || isLoadingDoc2) && (
<div style={{
padding: '12px 24px',
backgroundColor: '#fff3cd',
@@ -462,48 +548,58 @@ export default function MonacoDemoPage() {
color: '#856404'
}}>
<div style={{ display: 'flex', alignItems: 'flex-start', gap: '24px' }}>
<i className="ri-file-pdf-line" style={{ fontSize: '18px', marginTop: '2px' }}></i>
<i className="ri-file-text-line" style={{ fontSize: '18px', marginTop: '2px' }}></i>
<div style={{ flex: 1 }}>
<strong>PDF文档信息</strong>
<strong></strong>
<div style={{ display: 'flex', gap: '24px', marginTop: '8px' }}>
{/* PDF 1 信息 */}
{/* 文档 1 信息 */}
<div style={{ flex: 1 }}>
<div style={{ fontWeight: 'bold', marginBottom: '4px' }}>📄 1/</div>
{isLoadingPdf1 ? (
{isLoadingDoc1 ? (
<div style={{ color: '#666' }}> ...</div>
) : pdf1Info ? (
) : doc1Info ? (
<div>
<div>: <span style={{
color: pdf1Info.type === 'text' ? '#28a745' : pdf1Info.type === 'scanned' ? '#ffc107' : '#dc3545',
color: doc1Info.fileType === 'pdf' ? '#007bff' : doc1Info.fileType === 'docx' ? '#28a745' : '#dc3545',
fontWeight: 'bold'
}}>
{pdf1Info.type === 'text' ? '✅ 文本PDF' : pdf1Info.type === 'scanned' ? '⚠️ 扫描PDF' : '❌ 未知类型'}
{doc1Info.fileType === 'pdf' ? '📕 PDF文档' : doc1Info.fileType === 'docx' ? '📘 Word文档' : '❌ 未知类型'}
</span></div>
<div>: {pdf1Info.numPages} </div>
<div>: {pdf1Info.textLength} </div>
<div>: {(pdf1Info.confidence * 100).toFixed(0)}%</div>
{doc1Info.fileType === 'pdf' && doc1Info.numPages && (
<div>: {doc1Info.numPages} </div>
)}
{doc1Info.fileType === 'pdf' && doc1Info.pdfType && (
<div>PDF类型: {doc1Info.pdfType === 'text' ? '✅ 文本' : doc1Info.pdfType === 'scanned' ? '⚠️ 扫描' : '❌ 未知'}</div>
)}
<div>: {doc1Info.textLength} </div>
<div>: {(doc1Info.confidence * 100).toFixed(0)}%</div>
</div>
) : (
<div style={{ color: '#999' }}></div>
)}
</div>
{/* PDF 2 信息 */}
{/* 文档 2 信息 */}
<div style={{ flex: 1 }}>
<div style={{ fontWeight: 'bold', marginBottom: '4px' }}>📄 2/</div>
{isLoadingPdf2 ? (
{isLoadingDoc2 ? (
<div style={{ color: '#666' }}> ...</div>
) : pdf2Info ? (
) : doc2Info ? (
<div>
<div>: <span style={{
color: pdf2Info.type === 'text' ? '#28a745' : pdf2Info.type === 'scanned' ? '#ffc107' : '#dc3545',
color: doc2Info.fileType === 'pdf' ? '#007bff' : doc2Info.fileType === 'docx' ? '#28a745' : '#dc3545',
fontWeight: 'bold'
}}>
{pdf2Info.type === 'text' ? '✅ 文本PDF' : pdf2Info.type === 'scanned' ? '⚠️ 扫描PDF' : '❌ 未知类型'}
{doc2Info.fileType === 'pdf' ? '📕 PDF文档' : doc2Info.fileType === 'docx' ? '📘 Word文档' : '❌ 未知类型'}
</span></div>
<div>: {pdf2Info.numPages} </div>
<div>: {pdf2Info.textLength} </div>
<div>: {(pdf2Info.confidence * 100).toFixed(0)}%</div>
{doc2Info.fileType === 'pdf' && doc2Info.numPages && (
<div>: {doc2Info.numPages} </div>
)}
{doc2Info.fileType === 'pdf' && doc2Info.pdfType && (
<div>PDF类型: {doc2Info.pdfType === 'text' ? '✅ 文本' : doc2Info.pdfType === 'scanned' ? '⚠️ 扫描' : '❌ 未知'}</div>
)}
<div>: {doc2Info.textLength} </div>
<div>: {(doc2Info.confidence * 100).toFixed(0)}%</div>
</div>
) : (
<div style={{ color: '#999' }}></div>
@@ -537,7 +633,7 @@ export default function MonacoDemoPage() {
<div style={{ marginTop: '12px', paddingTop: '12px', borderTop: '1px solid #b3d9ff' }}>
<strong>💡 使</strong>
<div style={{ marginTop: '4px' }}>
URL参数加载PDF文档进行对比
URL参数加载文档进行对比 PDF Word
<code style={{
display: 'block',
marginTop: '4px',
@@ -547,10 +643,11 @@ export default function MonacoDemoPage() {
fontSize: '12px',
wordBreak: 'break-all'
}}>
/monaco-demo?pdf1=1&pdf2=2
/monaco-demo?doc1=1&doc2=2
</code>
<div style={{ marginTop: '4px', fontSize: '12px' }}>
: <code>/monaco-demo?pdf1=documents/contract_v1.pdf&pdf2=documents/contract_v2.pdf</code>
<div>PDF示例: <code>/monaco-demo?doc1=documents/contract_v1.pdf&doc2=documents/contract_v2.pdf</code></div>
<div style={{ marginTop: '2px' }}>Word示例: <code>/monaco-demo?doc1=testWork/()(1).docx&doc2=testWork/()(2).docx</code></div>
</div>
</div>
</div>
@@ -591,8 +688,8 @@ export default function MonacoDemoPage() {
}}
/>
{/* PDF加载中的遮罩层 */}
{(isLoadingPdf1 || isLoadingPdf2) && (
{/* 文档加载中的遮罩层 */}
{(isLoadingDoc1 || isLoadingDoc2) && (
<div style={{
position: 'absolute',
top: 0,
@@ -616,10 +713,10 @@ export default function MonacoDemoPage() {
margin: '0 auto 16px'
}}></div>
<div style={{ fontSize: '16px', color: '#333' }}>
PDF文档并提取文本...
...
</div>
{isLoadingPdf1 && <div style={{ fontSize: '14px', color: '#666', marginTop: '8px' }}>📄 1</div>}
{isLoadingPdf2 && <div style={{ fontSize: '14px', color: '#666', marginTop: '8px' }}>📄 2</div>}
{isLoadingDoc1 && <div style={{ fontSize: '14px', color: '#666', marginTop: '8px' }}>📄 1</div>}
{isLoadingDoc2 && <div style={{ fontSize: '14px', color: '#666', marginTop: '8px' }}>📄 2</div>}
</div>
</div>
)}