备份

2025-11-25 11:02:40 +08:00
parent 93bae2de17
commit 0ed6f0aaf4
5 changed files with 194 additions and 156 deletions
@@ -13,6 +13,7 @@ import { useState, useRef, useEffect } from "react";
 import { DiffEditor } from "@monaco-editor/react";
 import type { editor } from "monaco-editor";
 import { pdfjs } from 'react-pdf';
+import mammoth from 'mammoth';
 import { toastService } from '~/components/ui/Toast';

 // 设置 PDF.js worker（与 pdf-demo.tsx 相同）
@@ -25,14 +26,26 @@ export const meta: MetaFunction = () => {
  ];
 };

+// Document kinds this page distinguishes (a string-literal union, not a TypeScript enum).
+type DocumentType = 'pdf' | 'docx' | 'unknown';
+
 // PDF classification (string-literal union): 'text', 'scanned', or 'unknown'.
 type PdfType = 'text' | 'scanned' | 'unknown';

-// PDF 信息接口
+// PDF inspection result (internal use); its fields are copied into DocumentInfo when a PDF is loaded.
 interface PdfInfo {
  type: PdfType; // PDF classification
  numPages: number; // page count
  textLength: number; // character count reported to the user after loading
+  confidence: number; // confidence score, displayed as a percentage
+}
+
+// Summary of a loaded document (PDF or Word).
+interface DocumentInfo {
+  fileType: DocumentType;
+  pdfType?: PdfType; // present only for PDF documents
+  numPages?: number; // PDF page count
+  textLength: number;
  confidence: number; // text-extraction confidence (0-1)
 }

@@ -120,15 +133,23 @@ export default function MonacoDemoPage() {
  const [diffCount, setDiffCount] = useState<number>(0);
  const [currentDiff, setCurrentDiff] = useState<number>(0);

-  // PDF相关状态
-  const [pdf1Url, setPdf1Url] = useState<string>('');
-  const [pdf2Url, setPdf2Url] = useState<string>('');
-  const [pdf1Info, setPdf1Info] = useState<PdfInfo | null>(null);
-  const [pdf2Info, setPdf2Info] = useState<PdfInfo | null>(null);
-  const [isLoadingPdf1, setIsLoadingPdf1] = useState(false);
-  const [isLoadingPdf2, setIsLoadingPdf2] = useState(false);
+  // Document state: for document 1 and document 2, the fetch URL, the loaded DocumentInfo (null until loaded), and a loading flag.
+  const [doc1Url, setDoc1Url] = useState<string>('');
+  const [doc2Url, setDoc2Url] = useState<string>('');
+  const [doc1Info, setDoc1Info] = useState<DocumentInfo | null>(null);
+  const [doc2Info, setDoc2Info] = useState<DocumentInfo | null>(null);
+  const [isLoadingDoc1, setIsLoadingDoc1] = useState(false);
+  const [isLoadingDoc2, setIsLoadingDoc2] = useState(false);
  const [useExample, setUseExample] = useState(true);

+  // Classify a document by the extension of its file path (case-insensitive): '.pdf' -> 'pdf', '.docx' or '.doc' -> 'docx', anything else -> 'unknown'.
+  const detectFileType = (filePath: string): DocumentType => {
+    const lowerPath = filePath.toLowerCase();
+    if (lowerPath.endsWith('.pdf')) return 'pdf';
+    if (lowerPath.endsWith('.docx') || lowerPath.endsWith('.doc')) return 'docx'; // NOTE(review): legacy '.doc' is routed to the 'docx' path — confirm the Word extractor can actually read it
+    return 'unknown';
+  };
+
  // PDF类型检测函数
  const detectPdfType = async (pdfUrl: string): Promise<PdfInfo> => {
    const loadingTask = pdfjs.getDocument(pdfUrl);
@@ -189,32 +210,81 @@ export default function MonacoDemoPage() {
    return fullText;
  };

-  // 加载PDF并提取文本
-  const loadPdfAndExtractText = async (pdfUrl: string, setPdfInfo: (info: PdfInfo | null) => void, setLoading: (loading: boolean) => void, setTextContent: (text: string) => void) => {
+  // Download the Word document at docUrl and return its raw text as extracted by mammoth; throws an Error when the HTTP response is not ok.
+  const extractTextFromWord = async (docUrl: string): Promise<string> => {
+    // Download the file with fetch
+    const response = await fetch(docUrl);
+    if (!response.ok) {
+      throw new Error(`无法加载文档: ${response.statusText}`);
+    }
+
+    // Read the response body as an ArrayBuffer
+    const arrayBuffer = await response.arrayBuffer();
+
+    // Extract plain text with mammoth
+    const result = await mammoth.extractRawText({ arrayBuffer });
+
+    return result.value;
+  };
+
+  // 加载文档并提取文本（支持 PDF 和 Word）
+  const loadDocumentAndExtractText = async (
+    docUrl: string,
+    filePath: string,
+    setDocInfo: (info: DocumentInfo | null) => void,
+    setLoading: (loading: boolean) => void,
+    setTextContent: (text: string) => void
+  ) => {
    try {
      setLoading(true);

-      // 1. 检测PDF类型
-      const pdfInfo = await detectPdfType(pdfUrl);
-      setPdfInfo(pdfInfo);
+      // 1. 检测文件类型
+      const fileType = detectFileType(filePath);

-      // 2. 提取文本
-      if (pdfInfo.type === 'text') {
-        const text = await extractTextFromPdf(pdfUrl);
+      if (fileType === 'pdf') {
+        // PDF 处理
+        const pdfInfo = await detectPdfType(docUrl);
+        const text = await extractTextFromPdf(docUrl);
+
+        const docInfo: DocumentInfo = {
+          fileType: 'pdf',
+          pdfType: pdfInfo.type,
+          numPages: pdfInfo.numPages,
+          textLength: pdfInfo.textLength,
+          confidence: pdfInfo.confidence
+        };
+
+        setDocInfo(docInfo);
        setTextContent(text);
-        toastService.success(`PDF加载成功！共 ${pdfInfo.numPages} 页，提取了 ${pdfInfo.textLength} 个字符`);
-      } else if (pdfInfo.type === 'scanned') {
-        toastService.warning('检测到扫描版PDF，文本提取质量可能较低');
-        const text = await extractTextFromPdf(pdfUrl);
+
+        if (pdfInfo.type === 'text') {
+          toastService.success(`PDF加载成功！共 ${pdfInfo.numPages} 页，提取了 ${pdfInfo.textLength} 个字符`);
+        } else if (pdfInfo.type === 'scanned') {
+          toastService.warning('检测到扫描版PDF，文本提取质量可能较低');
+        } else {
+          toastService.error('无法识别PDF类型，可能是图片PDF');
+        }
+      } else if (fileType === 'docx') {
+        // Word 处理
+        const text = await extractTextFromWord(docUrl);
+
+        const docInfo: DocumentInfo = {
+          fileType: 'docx',
+          textLength: text.length,
+          confidence: 1.0 // Word 文档文本提取置信度为 100%
+        };
+
+        setDocInfo(docInfo);
        setTextContent(text);
+        toastService.success(`Word文档加载成功！提取了 ${text.length} 个字符`);
      } else {
-        toastService.error('无法识别PDF类型，可能是图片PDF');
+        toastService.error('不支持的文件类型');
        setTextContent('');
      }
    } catch (error) {
-      console.error('PDF加载失败:', error);
-      toastService.error('PDF加载失败，请检查文件路径');
-      setPdfInfo(null);
+      console.error('文档加载失败:', error);
+      toastService.error(`文档加载失败: ${error instanceof Error ? error.message : '未知错误'}`);
+      setDocInfo(null);
      setTextContent('');
    } finally {
      setLoading(false);
@@ -281,8 +351,8 @@ export default function MonacoDemoPage() {
    setModifiedText(CONTRACT_B);
    setCurrentDiff(0);
    setUseExample(true);
-    setPdf1Info(null);
-    setPdf2Info(null);
+    setDoc1Info(null);
+    setDoc2Info(null);

    // 重新计算差异数量
    setTimeout(() => {
@@ -295,34 +365,50 @@ export default function MonacoDemoPage() {
    }, 100);
  };

-  // 从URL参数加载PDF
-  const loadPdfsFromUrl = () => {
+  // Build the URL a document is fetched from: a static-asset path for recognised prefixes, otherwise the /api/pdf-proxy endpoint.
+  const buildFileUrl = (filePath: string): string => {
+    // Paths that start with 'public/', or with one of the known public subdirectories ('testWork/', 'testPDF/'),
+    // are addressed directly as static assets
+    if (filePath.startsWith('public/')) {
+      // Strip the 7-character 'public/' prefix and prepend a slash
+      return '/' + filePath.substring(7);
+    } else if (filePath.startsWith('testWork/') || filePath.startsWith('testPDF/')) {
+      // Per the original author's note these directories live under public/, so the path is used as-is behind a leading slash
+      return '/' + filePath;
+    } else {
+      // Any other path goes through api/pdf-proxy with the path URL-encoded (original note: served from MinIO — TODO confirm)
+      return `/api/pdf-proxy?path=${encodeURIComponent(filePath)}`;
+    }
+  };
+
+  // Read the doc1/doc2 query parameters and start loading each document that was supplied (PDF or Word); does nothing when window is undefined.
+  const loadDocumentsFromUrl = () => {
    if (typeof window === 'undefined') return;

    const searchParams = new URLSearchParams(window.location.search);
-    const pdf1Path = searchParams.get('pdf1');
-    const pdf2Path = searchParams.get('pdf2');
+    const doc1Path = searchParams.get('doc1') || searchParams.get('pdf1'); // falls back to the legacy parameter name
+    const doc2Path = searchParams.get('doc2') || searchParams.get('pdf2'); // falls back to the legacy parameter name

-    if (pdf1Path || pdf2Path) {
+    if (doc1Path || doc2Path) {
      setUseExample(false);

-      if (pdf1Path) {
-        const fullUrl = `/api/pdf-proxy?path=${encodeURIComponent(pdf1Path)}`;
-        setPdf1Url(fullUrl);
-        loadPdfAndExtractText(fullUrl, setPdf1Info, setIsLoadingPdf1, setOriginalText);
+      if (doc1Path) {
+        const fullUrl = buildFileUrl(doc1Path);
+        setDoc1Url(fullUrl);
+        loadDocumentAndExtractText(fullUrl, doc1Path, setDoc1Info, setIsLoadingDoc1, setOriginalText); // not awaited; document 1 feeds the original text
      }

-      if (pdf2Path) {
-        const fullUrl = `/api/pdf-proxy?path=${encodeURIComponent(pdf2Path)}`;
-        setPdf2Url(fullUrl);
-        loadPdfAndExtractText(fullUrl, setPdf2Info, setIsLoadingPdf2, setModifiedText);
+      if (doc2Path) {
+        const fullUrl = buildFileUrl(doc2Path);
+        setDoc2Url(fullUrl);
+        loadDocumentAndExtractText(fullUrl, doc2Path, setDoc2Info, setIsLoadingDoc2, setModifiedText); // not awaited; document 2 feeds the modified text
      }
    }
  };

  // Read the URL parameters once, when the component mounts (empty dependency array)
  useEffect(() => {
-    loadPdfsFromUrl();
+    loadDocumentsFromUrl();
    // eslint-disable-next-line react-hooks/exhaustive-deps
  }, []);

@@ -452,8 +538,8 @@ export default function MonacoDemoPage() {
        </div>
      </div>

-      {/* PDF加载信息 */}
-      {!useExample && (pdf1Info || pdf2Info || isLoadingPdf1 || isLoadingPdf2) && (
+      {/* 文档加载信息 */}
+      {!useExample && (doc1Info || doc2Info || isLoadingDoc1 || isLoadingDoc2) && (
        <div style={{
          padding: '12px 24px',
          backgroundColor: '#fff3cd',
@@ -462,48 +548,58 @@ export default function MonacoDemoPage() {
          color: '#856404'
        }}>
          <div style={{ display: 'flex', alignItems: 'flex-start', gap: '24px' }}>
-            <i className="ri-file-pdf-line" style={{ fontSize: '18px', marginTop: '2px' }}></i>
+            <i className="ri-file-text-line" style={{ fontSize: '18px', marginTop: '2px' }}></i>
            <div style={{ flex: 1 }}>
-              <strong>PDF文档信息：</strong>
+              <strong>文档信息：</strong>
              <div style={{ display: 'flex', gap: '24px', marginTop: '8px' }}>
-                {/* PDF 1 信息 */}
+                {/* 文档 1 信息 */}
                <div style={{ flex: 1 }}>
                  <div style={{ fontWeight: 'bold', marginBottom: '4px' }}>📄 文档1（左侧/原始）</div>
-                  {isLoadingPdf1 ? (
+                  {isLoadingDoc1 ? (
                    <div style={{ color: '#666' }}>⏳ 加载中...</div>
-                  ) : pdf1Info ? (
+                  ) : doc1Info ? (
                    <div>
                      <div>类型: <span style={{
-                        color: pdf1Info.type === 'text' ? '#28a745' : pdf1Info.type === 'scanned' ? '#ffc107' : '#dc3545',
+                        color: doc1Info.fileType === 'pdf' ? '#007bff' : doc1Info.fileType === 'docx' ? '#28a745' : '#dc3545',
                        fontWeight: 'bold'
                      }}>
-                        {pdf1Info.type === 'text' ? '✅ 文本PDF' : pdf1Info.type === 'scanned' ? '⚠️ 扫描PDF' : '❌ 未知类型'}
+                        {doc1Info.fileType === 'pdf' ? '📕 PDF文档' : doc1Info.fileType === 'docx' ? '📘 Word文档' : '❌ 未知类型'}
                      </span></div>
-                      <div>页数: {pdf1Info.numPages} 页</div>
-                      <div>字符数: {pdf1Info.textLength} 个</div>
-                      <div>置信度: {(pdf1Info.confidence * 100).toFixed(0)}%</div>
+                      {doc1Info.fileType === 'pdf' && doc1Info.numPages && (
+                        <div>页数: {doc1Info.numPages} 页</div>
+                      )}
+                      {doc1Info.fileType === 'pdf' && doc1Info.pdfType && (
+                        <div>PDF类型: {doc1Info.pdfType === 'text' ? '✅ 文本' : doc1Info.pdfType === 'scanned' ? '⚠️ 扫描' : '❌ 未知'}</div>
+                      )}
+                      <div>字符数: {doc1Info.textLength} 个</div>
+                      <div>置信度: {(doc1Info.confidence * 100).toFixed(0)}%</div>
                    </div>
                  ) : (
                    <div style={{ color: '#999' }}>未加载</div>
                  )}
                </div>

-                {/* PDF 2 信息 */}
+                {/* 文档 2 信息 */}
                <div style={{ flex: 1 }}>
                  <div style={{ fontWeight: 'bold', marginBottom: '4px' }}>📄 文档2（右侧/修改）</div>
-                  {isLoadingPdf2 ? (
+                  {isLoadingDoc2 ? (
                    <div style={{ color: '#666' }}>⏳ 加载中...</div>
-                  ) : pdf2Info ? (
+                  ) : doc2Info ? (
                    <div>
                      <div>类型: <span style={{
-                        color: pdf2Info.type === 'text' ? '#28a745' : pdf2Info.type === 'scanned' ? '#ffc107' : '#dc3545',
+                        color: doc2Info.fileType === 'pdf' ? '#007bff' : doc2Info.fileType === 'docx' ? '#28a745' : '#dc3545',
                        fontWeight: 'bold'
                      }}>
-                        {pdf2Info.type === 'text' ? '✅ 文本PDF' : pdf2Info.type === 'scanned' ? '⚠️ 扫描PDF' : '❌ 未知类型'}
+                        {doc2Info.fileType === 'pdf' ? '📕 PDF文档' : doc2Info.fileType === 'docx' ? '📘 Word文档' : '❌ 未知类型'}
                      </span></div>
-                      <div>页数: {pdf2Info.numPages} 页</div>
-                      <div>字符数: {pdf2Info.textLength} 个</div>
-                      <div>置信度: {(pdf2Info.confidence * 100).toFixed(0)}%</div>
+                      {doc2Info.fileType === 'pdf' && doc2Info.numPages && (
+                        <div>页数: {doc2Info.numPages} 页</div>
+                      )}
+                      {doc2Info.fileType === 'pdf' && doc2Info.pdfType && (
+                        <div>PDF类型: {doc2Info.pdfType === 'text' ? '✅ 文本' : doc2Info.pdfType === 'scanned' ? '⚠️ 扫描' : '❌ 未知'}</div>
+                      )}
+                      <div>字符数: {doc2Info.textLength} 个</div>
+                      <div>置信度: {(doc2Info.confidence * 100).toFixed(0)}%</div>
                    </div>
                  ) : (
                    <div style={{ color: '#999' }}>未加载</div>
@@ -537,7 +633,7 @@ export default function MonacoDemoPage() {
              <div style={{ marginTop: '12px', paddingTop: '12px', borderTop: '1px solid #b3d9ff' }}>
                <strong>💡 使用提示：</strong>
                <div style={{ marginTop: '4px' }}>
-                  您可以通过URL参数加载PDF文档进行对比：
+                  您可以通过URL参数加载文档进行对比（支持 PDF 和 Word）：
                  <code style={{
                    display: 'block',
                    marginTop: '4px',
@@ -547,10 +643,11 @@ export default function MonacoDemoPage() {
                    fontSize: '12px',
                    wordBreak: 'break-all'
                  }}>
-                    /monaco-demo?pdf1=路径1&pdf2=路径2
+                    /monaco-demo?doc1=路径1&doc2=路径2
                  </code>
                  <div style={{ marginTop: '4px', fontSize: '12px' }}>
-                    示例: <code>/monaco-demo?pdf1=documents/contract_v1.pdf&pdf2=documents/contract_v2.pdf</code>
+                    <div>PDF示例: <code>/monaco-demo?doc1=documents/contract_v1.pdf&doc2=documents/contract_v2.pdf</code></div>
+                    <div style={{ marginTop: '2px' }}>Word示例: <code>/monaco-demo?doc1=testWork/(最终版)智慧法务平台建设采购项目合同(1).docx&doc2=testWork/(最终版)智慧法务平台建设采购项目合同(2).docx</code></div>
                  </div>
                </div>
              </div>
@@ -591,8 +688,8 @@ export default function MonacoDemoPage() {
          }}
        />

-        {/* PDF加载中的遮罩层 */}
-        {(isLoadingPdf1 || isLoadingPdf2) && (
+        {/* 文档加载中的遮罩层 */}
+        {(isLoadingDoc1 || isLoadingDoc2) && (
          <div style={{
            position: 'absolute',
            top: 0,
@@ -616,10 +713,10 @@ export default function MonacoDemoPage() {
                margin: '0 auto 16px'
              }}></div>
              <div style={{ fontSize: '16px', color: '#333' }}>
-                正在加载PDF文档并提取文本...
+                正在加载文档并提取文本...
              </div>
-              {isLoadingPdf1 && <div style={{ fontSize: '14px', color: '#666', marginTop: '8px' }}>📄 加载文档1</div>}
-              {isLoadingPdf2 && <div style={{ fontSize: '14px', color: '#666', marginTop: '8px' }}>📄 加载文档2</div>}
+              {isLoadingDoc1 && <div style={{ fontSize: '14px', color: '#666', marginTop: '8px' }}>📄 加载文档1</div>}
+              {isLoadingDoc2 && <div style={{ fontSize: '14px', color: '#666', marginTop: '8px' }}>📄 加载文档2</div>}
            </div>
          </div>
        )}