// leaudit-platform-frontend/app/routes/rules.new1.tsx

/**
 * 文档预览与内容抽取模块
 *
 * 依赖包说明:
 * 1. react-pdf - PDF文档预览
 *    安装命令: npm install react-pdf
 *    或: yarn add react-pdf
 *
 * 2. mammoth - Word文档转HTML预览
 *    安装命令: npm install mammoth
 *    或: yarn add mammoth
 *
 * 3. @remix-run/react, @remix-run/node - Remix框架组件
 *    安装命令: npm install @remix-run/react @remix-run/node
 *    或: yarn add @remix-run/react @remix-run/node
 *
 * 注意事项:
 * - react-pdf需要pdfjs-dist作为依赖，安装react-pdf时会自动安装
 * - 需要引入PDF.js worker文件，本代码通过CDN方式引入
 * - 如需本地加载PDF.js worker文件，请安装pdfjs-dist并修改worker配置
 */

import { useState, useEffect, useRef } from "react";
import { useLoaderData } from "@remix-run/react";
import { Document, Page, pdfjs } from "react-pdf";
import type { LoaderFunctionArgs } from "@remix-run/node";
import mammoth from "mammoth";

/**
 * Configure the pdf.js worker.
 * The worker script is loaded from the cdnjs CDN, using the path segment that
 * matches the pdf.js version reported by `pdfjs.version`.
 */
pdfjs.GlobalWorkerOptions.workerSrc =
  "//cdnjs.cloudflare.com/ajax/libs/pdf.js/" + pdfjs.version + "/pdf.worker.js";

/**
 * Mock of the extracted-content payload a backend would return.
 * A real application should fetch this from an API instead.
 */
const mockExtractedContent = [
  {
    id: 1,
    text: "合同条款",
    page: 2,
    position: { start: 50, end: 60 },
  },
  {
    id: 2,
    text: "签署日期",
    page: 5,
    position: { start: 120, end: 130 },
  },
  {
    id: 3,
    text: "责任划分",
    page: 3,
    position: { start: 80, end: 90 },
  },
];

/**
 * Shape of one piece of content extracted from a document.
 */
interface ExtractedContent {
  /** Unique identifier of the extracted item. */
  id: number;
  /** The extracted text. */
  text: string;
  /** Page number the text is located on. */
  page: number;
  /** Position information within the page. */
  position: { start: number; end: number };
}

/**
 * Shape of the data returned by the route loader.
 */
interface LoaderData {
  /** URL of the current document. */
  fileUrl: string;
  /** Initial page number. */
  initialPage: number;
  /** Extracted content items. */
  extractedContent: ExtractedContent[];
  /** Document type. */
  fileType: "pdf" | "docx";
  /** Available document URLs, keyed by name. */
  urls: Record<string, string>;
}

/**
 * Argument passed to the PDF document load-success callback.
 */
interface DocumentLoadSuccess {
  /** Total number of pages in the document. */
  numPages: number;
}

/**
 * Infers the document type from a URL's file extension.
 *
 * The query string and fragment are ignored, so signed or cache-busted URLs
 * such as `https://host/a.docx?Expires=...` are classified by their path.
 * Matching is case-insensitive.
 *
 * @param url Document URL (absolute or relative).
 * @returns `"docx"` when the path ends with `.docx` or `.doc`; `"pdf"` when it
 *          ends with `.pdf` and also as the fallback for any other extension.
 */
function getFileTypeFromUrl(url: string): "pdf" | "docx" {
  // Plain string splitting (rather than `new URL`) keeps relative URLs such as
  // "/uploads/sample.docx" working without needing a base URL.
  const path = (url.split(/[?#]/, 1)[0] ?? "").toLowerCase();

  if (path.endsWith(".docx") || path.endsWith(".doc")) {
    return "docx";
  }
  // ".pdf" and every unrecognised extension are treated as PDF.
  return "pdf";
}

/**
 * Remix loader: reads the request's query parameters and returns the data the
 * document preview needs.
 *
 * The optional `page` query parameter selects the initial page. Anything that
 * is not a positive integer (missing, empty, non-numeric, zero, negative)
 * falls back to page 1, so the client never receives `NaN` or an impossible
 * page number.
 *
 * @returns The document URL, initial page, extracted content, detected file
 *          type and the map of selectable document URLs.
 */
export const loader = async ({ request }: LoaderFunctionArgs) => {
  // Read the requested page from the query string and sanitise it.
  const url = new URL(request.url);
  const requestedPage = Number.parseInt(url.searchParams.get("page") ?? "", 10);
  const initialPage =
    Number.isInteger(requestedPage) && requestedPage >= 1 ? requestedPage : 1;

  // Sample document URLs.
  const urls = {
    // 1. Original document URL - may be subject to CORS restrictions.
    original: "https://dev-xc-enroll.oss-cn-guangzhou.aliyuncs.com/uploads/7840-230620112939.docx",
    // 2. Public sample document - may still be subject to CORS restrictions.
    public: "https://dev-xc-enroll.oss-cn-guangzhou.aliyuncs.com/uploads/7840-230620112939.docx",
    // 3. Via a CORS proxy (example; currently the same URL as the entries above).
    proxy: "https://dev-xc-enroll.oss-cn-guangzhou.aliyuncs.com/uploads/7840-230620112939.docx",
    // 4. Document on the local server (assumed to be deployed).
    local: "/uploads/sample.docx",
    // 5. PDF sample.
    pdf: "http://nas.7bm.co:9000/docauditai/documents/%E5%90%88%E5%90%8C%E6%96%87%E6%A1%A3/2025/04%E6%9C%8816%E6%97%A5/%E7%AC%AC16%E5%8F%B7--%E9%94%80%E5%94%AE%E6%97%A0%E6%A0%87%E5%BF%97%E5%A4%96%E5%9B%BD%E5%8D%B7%E7%83%9F_10%E6%97%B626%E5%88%8632%E7%A7%92/%E7%AC%AC16%E5%8F%B7--%E9%94%80%E5%94%AE%E6%97%A0%E6%A0%87%E5%BF%97%E5%A4%96%E5%9B%BD%E5%8D%B7%E7%83%9F.pdf"
  };

  // The PDF sample is the default document.
  const fileUrl = urls.pdf;

  // Detect the renderer to use from the URL.
  const fileType = getFileTypeFromUrl(fileUrl);

  return {
    fileUrl,
    initialPage,
    extractedContent: mockExtractedContent,
    fileType,
    urls
  };
};

/**
 * Converts an unknown thrown value into a displayable message.
 */
function describeError(error: unknown): string {
  return error instanceof Error ? error.message : String(error);
}

/**
 * Custom mammoth style mappings, one mapping per entry.
 */
const DOCX_STYLE_MAP = [
  "p[style-name='Heading 1'] => h1:fresh",
  "p[style-name='Heading 2'] => h2:fresh",
  "p[style-name='Title'] => h1.title:fresh",
  "p[style-name='Subtitle'] => h2.subtitle:fresh",
  "table => table.docx-table",
];

/**
 * Downloads a Word document and converts it to an HTML string with mammoth.
 *
 * @param url    URL of the document to download.
 * @param signal Abort signal used to cancel the download.
 * @param log    Receives a progress/debug line for every step.
 * @returns The converted HTML wrapped in a `.document-container` element.
 * @throws Error when the request fails, the response status is not OK, the
 *         body cannot be read, or the conversion fails.
 */
async function fetchDocxAsHtml(
  url: string,
  signal: AbortSignal,
  log: (info: string) => void,
): Promise<string> {
  // 1. Download the file.
  log(`开始获取文件...`);
  let response: Response;
  try {
    // No custom request headers: `Access-Control-Allow-Origin` is a RESPONSE
    // header, and sending it on the request only forces a CORS preflight.
    response = await fetch(url, {
      mode: "cors",
      credentials: "omit",
      signal,
    });
    log(`fetch请求状态: ${response.status} ${response.statusText}`);
  } catch (fetchError) {
    log(`fetch请求失败: ${describeError(fetchError)}`);
    throw new Error(`网络请求失败: ${describeError(fetchError)}`);
  }

  if (!response.ok) {
    throw new Error(`文档无法访问，状态码: ${response.status}`);
  }
  log(`文档下载成功，状态码: ${response.status}`);

  // 2. Read the body as an ArrayBuffer.
  log(`开始读取响应内容为ArrayBuffer...`);
  let buffer: ArrayBuffer;
  try {
    buffer = await response.arrayBuffer();
    log(`获取到文档数据，大小: ${buffer.byteLength} 字节`);
  } catch (bufferError) {
    log(`读取为ArrayBuffer失败: ${describeError(bufferError)}`);
    throw new Error(`转换文档内容失败: ${describeError(bufferError)}`);
  }

  // 3. Convert Word to HTML.
  log("使用mammoth开始转换文档为HTML...");
  try {
    // The document goes in the first argument; conversion options (style map
    // etc.) must go in the second one, otherwise they are ignored.
    const result = await mammoth.convertToHtml(
      { arrayBuffer: buffer },
      { styleMap: DOCX_STYLE_MAP, includeDefaultStyleMap: true },
    );

    // Surface conversion messages in the debug log.
    result.messages.forEach((message) => {
      log(`转换警告: [${message.type}] ${message.message}`);
    });

    log("文档转换成功，获取到HTML内容");

    // 4. Wrap the generated HTML in a styled container plus a format note.
    return `
              <div class="document-container">
                ${result.value}
                <div class="format-note">
                  <p>注意：部分复杂格式(如页眉页脚、复杂表格样式)可能无法完全显示。</p>
                </div>
              </div>
            `;
  } catch (mammothError) {
    log(`Mammoth转换失败: ${describeError(mammothError)}`);
    throw new Error(`Word转HTML失败: ${describeError(mammothError)}`);
  }
}

/**
 * Document preview route component.
 *
 * Renders the current document (PDF via react-pdf, Word via mammoth-generated
 * HTML) next to the list of extracted content. Clicking an extracted item
 * scrolls the PDF to the item's page; the list is disabled for Word documents.
 * When loading fails, an error panel with the debug log and buttons to switch
 * to another document URL is shown instead.
 */
export default function Documents() {
  // Data from the loader.
  const { fileUrl, extractedContent, fileType, urls } = useLoaderData<LoaderData>();

  // State.
  const [numPages, setNumPages] = useState<number | null>(null);          // total PDF pages
  const [scrollToPage, setScrollToPage] = useState<number | null>(null);  // pending scroll target
  const [docxLoading, setDocxLoading] = useState(false);                  // Word document loading flag
  const [loadError, setLoadError] = useState<string | null>(null);        // load error message
  const [debugInfo, setDebugInfo] = useState<string[]>([]);               // debug log lines
  const [docxHtml, setDocxHtml] = useState<string>("");                   // converted HTML
  const [currentUrl, setCurrentUrl] = useState<string>(fileUrl);          // URL being displayed
  const [loadAttempt, setLoadAttempt] = useState(0);                      // bumped on every switch/retry

  // Ref to the Word document container.
  const docxContainerRef = useRef<HTMLDivElement>(null);

  // The loader's file type only describes the loader's URL. Once the user has
  // switched to another URL, the type has to be derived from that URL so the
  // matching renderer is used.
  const activeFileType = currentUrl === fileUrl ? fileType : getFileTypeFromUrl(currentUrl);

  /**
   * Handles a click on an extracted-content item (PDF only).
   * The actual scrolling is done by the `scrollToPage` effect.
   * @param item The clicked item.
   */
  const handleContentClick = (item: ExtractedContent) => {
    if (activeFileType === "pdf") {
      setScrollToPage(item.page);
    }
  };

  /**
   * Called when the PDF document has loaded.
   * @param param0 Object carrying the page count.
   */
  function onDocumentLoadSuccess({ numPages }: DocumentLoadSuccess) {
    setNumPages(numPages);
  }

  /**
   * Appends a line, prefixed with the current UTC time (HH:MM:SS), to the debug log.
   * @param info Text to append.
   */
  const addDebugInfo = (info: string) => {
    setDebugInfo(prev => [...prev, `${new Date().toISOString().split('T')[1].split('.')[0]}: ${info}`]);
  };

  /**
   * Switches to another document URL and resets the load state.
   * @param urlKey Key into the loader's `urls` map.
   */
  const switchDocumentUrl = (urlKey: keyof typeof urls) => {
    const nextUrl = urls[urlKey];
    if (nextUrl === undefined) {
      // Unknown key: keep showing the current document.
      return;
    }
    setCurrentUrl(nextUrl);
    setNumPages(null);
    setDebugInfo([]);
    setLoadError(null);
    setDocxLoading(false);
    // Forces a reload even when the selected URL equals the current one
    // (i.e. a retry after an error).
    setLoadAttempt(attempt => attempt + 1);
    addDebugInfo(`切换到新的文档URL: ${nextUrl}`);
  };

  /**
   * Loads and converts the Word document whenever the URL changes or a reload
   * is requested. A superseded run is aborted and its results are discarded.
   */
  useEffect(() => {
    if (activeFileType !== "docx") {
      return undefined;
    }

    const controller = new AbortController();
    let isCurrent = true;
    const log = (info: string) => {
      if (isCurrent) {
        addDebugInfo(info);
      }
    };

    setDocxLoading(true);
    setDebugInfo([]); // start each load with an empty log
    log(`准备加载Word文档: ${currentUrl}`);

    const loadDocx = async () => {
      try {
        const html = await fetchDocxAsHtml(currentUrl, controller.signal, log);
        if (!isCurrent) return;
        setDocxHtml(html);
        setDocxLoading(false);
      } catch (error: unknown) {
        if (!isCurrent) return; // aborted or superseded: nothing to report
        const errorMessage = describeError(error);
        addDebugInfo(`文档处理错误: ${errorMessage}`);
        setLoadError(`加载Word文档失败: ${errorMessage}`);
        setDocxLoading(false);
      }
    };

    void loadDocx();

    return () => {
      isCurrent = false;
      controller.abort();
    };
  }, [currentUrl, activeFileType, loadAttempt]);

  /**
   * Scrolls the requested PDF page into view, then clears the request.
   */
  useEffect(() => {
    if (scrollToPage === null || activeFileType !== "pdf") {
      return;
    }
    document.getElementById(`page-${scrollToPage}`)?.scrollIntoView({ behavior: "smooth" });
    setScrollToPage(null);
  }, [scrollToPage, activeFileType]);

  /**
   * Builds one element per PDF page.
   * @returns The page elements, or null while the page count is unknown.
   */
  const renderAllPages = () => {
    if (!numPages) return null;

    return Array.from({ length: numPages }, (_, index) => {
      const pageNumber = index + 1;
      return (
        <div key={pageNumber} id={`page-${pageNumber}`} className="mb-6">
          <div className="text-center text-gray-500 text-sm mb-2">第 {pageNumber} 页</div>
          <Page
            pageNumber={pageNumber}
            renderTextLayer={true}
            renderAnnotationLayer={true}
            className="border border-gray-300 shadow-md"
          />
        </div>
      );
    });
  };

  return (
    <div className="flex h-screen bg-gray-50">
      {/* Document display area */}
      <div className="flex-1 mr-6 p-4">
        <div className="bg-white p-4 rounded-lg shadow-md h-full flex flex-col">
          <h1 className="text-2xl font-bold mb-4">文档预览 ({activeFileType.toUpperCase()})</h1>

          {/* Document content */}
          <div className="w-full flex-1 overflow-auto bg-gray-100 rounded-lg p-4">
            {loadError ? (
              <div className="text-red-500 flex flex-col items-center justify-center h-full">
                <p className="mb-4">加载错误:</p>
                <p>{loadError}</p>
                <div className="mt-6 p-4 bg-gray-800 text-green-400 rounded text-xs max-w-xl overflow-auto max-h-96">
                  <p className="font-bold mb-2">调试信息:</p>
                  {debugInfo.map((info, index) => (
                    <div key={index} className="mb-1">{info}</div>
                  ))}
                </div>
                <div className="mt-4">
                  <p className="text-black mb-2">尝试其他方式:</p>
                  <div className="flex flex-wrap gap-2">
                    <button onClick={() => switchDocumentUrl('public')} className="px-3 py-1 bg-green-500 text-white rounded">
                      使用公共示例
                    </button>
                    <button onClick={() => switchDocumentUrl('proxy')} className="px-3 py-1 bg-blue-500 text-white rounded">
                      使用CORS代理
                    </button>
                    <button onClick={() => switchDocumentUrl('pdf')} className="px-3 py-1 bg-yellow-500 text-white rounded">
                      切换到PDF
                    </button>
                    <a href={currentUrl} className="px-3 py-1 bg-gray-500 text-white rounded" download target="_blank" rel="noreferrer">
                      下载文档
                    </a>
                  </div>
                </div>
              </div>
            ) : activeFileType === "pdf" ? (
              /* PDF rendering */
              <Document
                file={currentUrl}
                onLoadSuccess={onDocumentLoadSuccess}
                onLoadError={(error) => {
                  console.error("PDF加载错误:", error);
                  setLoadError("PDF文档加载失败：" + (error.message || "未知错误"));
                }}
                className="flex flex-col items-center"
                error={<div className="text-red-500">PDF文档加载失败，请检查链接或网络连接。</div>}
                noData={<div>无数据</div>}
                loading={<div className="text-center py-10">PDF加载中...</div>}
              >
                {renderAllPages()}
              </Document>
            ) : (
              /* Word rendering */
              <>
                {docxLoading ? (
                  /* Loading indicator */
                  <div className="flex flex-col items-center justify-center h-full">
                    <div className="mb-6">
                      <div className="animate-spin rounded-full h-12 w-12 border-t-2 border-b-2 border-blue-500"></div>
                    </div>
                    <p className="mb-4 text-lg">Word文档加载中...</p>
                    {debugInfo.length > 0 && (
                      <div className="mt-4 p-4 bg-gray-800 text-green-400 rounded text-xs max-w-xl overflow-auto max-h-72">
                        <p className="font-bold mb-2">加载过程:</p>
                        {debugInfo.map((info, index) => (
                          <div key={index} className="mb-1">{info}</div>
                        ))}
                      </div>
                    )}
                  </div>
                ) : (
                  /*
                   * Locally rendered Word document.
                   * SECURITY(review): the HTML comes from converting a remote
                   * document and is injected without sanitisation. It should be
                   * passed through an HTML sanitiser before untrusted documents
                   * are displayed.
                   */
                  <div
                    ref={docxContainerRef}
                    className="w-full h-full"
                    style={{
                      height: '100%',
                      overflowY: 'auto',
                      padding: '20px',
                      backgroundColor: 'white'
                    }}
                    dangerouslySetInnerHTML={{ __html: docxHtml }}
                  />
                )}
              </>
            )}
          </div>
        </div>
      </div>

      {/* Extracted content - always shown, but not interactive for Word documents */}
      <div className="w-80 bg-white p-4 rounded-lg shadow-md mr-4 my-4 overflow-auto">
        <h2 className="text-xl font-semibold mb-4">抽取内容</h2>
        <ul className="space-y-3">
          {extractedContent.map((item) => (
            <li key={item.id}>
              <button
                onClick={() => handleContentClick(item)}
                className={`w-full text-left p-3 ${activeFileType === "pdf" ? "bg-gray-50 hover:bg-gray-100 cursor-pointer" : "bg-gray-100"} rounded-lg transition`}
                disabled={activeFileType === "docx"}
                aria-label={`查看内容: ${item.text}`}
              >
                <p className="text-sm font-medium">{item.text}</p>
                <p className="text-xs text-gray-500">页面: {item.page}</p>
              </button>
            </li>
          ))}
        </ul>
      </div>

      {/* Custom styles for the converted Word document */}
      <style dangerouslySetInnerHTML={{
        __html: `
          /* 高亮显示样式 */
          .docx-highlight {
            background-color: #ffff00;
            outline: 2px solid orange;
            position: relative;
          }

          /* 找到的内容高亮样式 */
          .docx-content-found {
            background-color: rgba(255, 230, 0, 0.3);
            outline: 1px solid orange;
          }

          /* Mammoth.js生成的内容样式 */
          .document-container {
            font-family: "Microsoft YaHei", Arial, sans-serif;
            line-height: 1.5;
            color: #333;
            max-width: 800px;
            margin: 0 auto;
          }

          .document-container .format-note {
            margin-top: 30px;
            padding: 10px;
            background-color: #f5f5f5;
            border-left: 3px solid #ccc;
            font-size: 12px;
            color: #666;
          }

          .document-container h1 {
            font-size: 24px;
            margin-top: 24px;
            margin-bottom: 16px;
            font-weight: bold;
            color: #222;
          }

          .document-container h1.title {
            font-size: 28px;
            text-align: center;
            margin-bottom: 24px;
          }

          .document-container h2 {
            font-size: 20px;
            margin-top: 20px;
            margin-bottom: 14px;
            font-weight: bold;
            color: #333;
          }

          .document-container h2.subtitle {
            font-size: 18px;
            text-align: center;
            margin-bottom: 20px;
            color: #555;
          }

          .document-container p {
            margin-bottom: 16px;
            text-align: justify;
            overflow-wrap: break-word;
          }

          .document-container table {
            border-collapse: collapse;
            width: 100%;
            margin-bottom: 16px;
          }

          .document-container table.docx-table {
            border: 1px solid #ddd;
            margin: 16px 0;
          }

          .document-container table.docx-table th,
          .document-container table.docx-table td {
            border: 1px solid #ddd;
            padding: 8px;
            text-align: left;
          }

          .document-container table.docx-table th {
            background-color: #f2f2f2;
            font-weight: bold;
          }

          .document-container ul, .document-container ol {
            margin-left: 20px;
            margin-bottom: 16px;
          }

          .document-container li {
            margin-bottom: 5px;
          }

          .document-container img {
            max-width: 100%;
            height: auto;
            margin: 10px 0;
          }

          .document-container span.underline {
            text-decoration: underline;
          }

          .document-container span.strikethrough {
            text-decoration: line-through;
          }

          /* 段落缩进 */
          .document-container p:not(.no-indent) {
            text-indent: 2em;
          }
        `
      }} />
    </div>
  );
}