Files
leaudit-platform-frontend/app/routes/rules.new1.tsx
T

572 lines
20 KiB
TypeScript
Raw Blame History

This file contains ambiguous Unicode characters
This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.
/**
* 文档预览与内容抽取模块
*
* 依赖包说明:
* 1. react-pdf - PDF文档预览
* 安装命令: npm install react-pdf
* 或: yarn add react-pdf
*
* 2. mammoth - Word文档转HTML预览
* 安装命令: npm install mammoth
* 或: yarn add mammoth
*
* 3. @remix-run/react, @remix-run/node - Remix框架组件
* 安装命令: npm install @remix-run/react @remix-run/node
* 或: yarn add @remix-run/react @remix-run/node
*
* 注意事项:
* - react-pdf需要pdfjs-dist作为依赖,安装react-pdf时会自动安装
* - 需要引入PDF.js worker文件,本代码通过CDN方式引入
* - 如需本地加载PDF.js worker文件,请安装pdfjs-dist并修改worker配置
*/
import { useState, useEffect, useRef } from "react";
import { useLoaderData } from "@remix-run/react";
import { Document, Page, pdfjs } from "react-pdf";
import type { LoaderFunctionArgs } from "@remix-run/node";
import mammoth from "mammoth";
/**
* Configure the pdf.js worker used by react-pdf.
* The worker script is loaded from the cdnjs CDN, using the path that matches
* the `pdfjs.version` bundled with react-pdf.
* NOTE(review): the URL is protocol-relative and assumes cdnjs publishes a
* `pdf.worker.js` file for this exact version — confirm for the installed release.
*/
pdfjs.GlobalWorkerOptions.workerSrc = `//cdnjs.cloudflare.com/ajax/libs/pdf.js/${pdfjs.version}/pdf.worker.js`;
/**
* Mock of the extracted-content payload that the backend is expected to return.
* In a real deployment this data should be fetched from an API instead.
* Each entry carries an id, the extracted text, a page number, and a
* start/end position pair.
*/
const mockExtractedContent = [
{ id: 1, text: "合同条款", page: 2, position: { start: 50, end: 60 } },
{ id: 2, text: "签署日期", page: 5, position: { start: 120, end: 130 } },
{ id: 3, text: "责任划分", page: 3, position: { start: 80, end: 90 } },
];
/**
* Shape of a single piece of content extracted from a document.
*/
interface ExtractedContent {
id: number; // Unique identifier of the extracted item
text: string; // Extracted text
page: number; // Page number the text was found on
position: { // Position of the text within the page
start: number;
end: number;
};
}
/**
* Shape of the data returned by the route loader.
*/
interface LoaderData {
fileUrl: string; // URL of the document to display
initialPage: number; // Initial page number (from the `page` query parameter)
extractedContent: ExtractedContent[]; // List of extracted content items
fileType: "pdf" | "docx"; // Document type
urls: Record<string, string>; // Available document URLs, keyed by name
}
/**
* Argument passed to the PDF document load-success callback.
*/
interface DocumentLoadSuccess {
numPages: number; // Total number of pages in the document
}
/**
 * Infers the document type from the file extension of a URL.
 *
 * Only the path portion is inspected: the query string and fragment are
 * stripped first, so signed or cache-busted URLs such as
 * `https://host/a.docx?Expires=1` are classified by their real extension.
 * Matching is case-insensitive. Both `.docx` and `.doc` map to `"docx"`;
 * every other input (including an empty string) falls back to `"pdf"`.
 *
 * @param url Document URL (absolute or relative).
 * @returns The document type: `"pdf"` or `"docx"`.
 */
function getFileTypeFromUrl(url: string): "pdf" | "docx" {
  // Keep everything before the first "?" or "#"; the limit of 1 avoids
  // splitting the rest of the string needlessly.
  const path = (url.split(/[?#]/, 1)[0] ?? "").trim().toLowerCase();

  if (path.endsWith(".docx") || path.endsWith(".doc")) {
    return "docx";
  }

  // PDFs and unrecognized extensions are both handled as PDF by default.
  return "pdf";
}
/**
 * Remix loader: resolves the document to preview and the data the page needs.
 *
 * Reads the optional `page` query parameter and returns the document URL, its
 * inferred type, the (currently mocked) extracted content and the table of
 * alternative document URLs.
 *
 * @param args Remix loader arguments; only `request` is used.
 * @returns An object with `fileUrl`, `initialPage`, `extractedContent`,
 *          `fileType` and `urls`.
 */
export const loader = async ({ request }: LoaderFunctionArgs) => {
  const url = new URL(request.url);

  // `page` is user-controlled. A missing, non-numeric, fractional or
  // non-positive value falls back to page 1 rather than leaking NaN (which
  // JSON-serializes to null) or an out-of-range number to the client.
  const requestedPage = Number(url.searchParams.get("page"));
  const initialPage =
    Number.isInteger(requestedPage) && requestedPage >= 1 ? requestedPage : 1;

  // Sample document URLs.
  const urls = {
    // 1. Original document URL - may be subject to CORS restrictions
    original: "https://dev-xc-enroll.oss-cn-guangzhou.aliyuncs.com/uploads/7840-230620112939.docx",
    // 2. Public sample document - may still be subject to CORS restrictions
    //    (currently the same URL as `original`)
    public: "https://dev-xc-enroll.oss-cn-guangzhou.aliyuncs.com/uploads/7840-230620112939.docx",
    // 3. Via a CORS proxy (example; currently the same URL as `original`)
    proxy: "https://dev-xc-enroll.oss-cn-guangzhou.aliyuncs.com/uploads/7840-230620112939.docx",
    // 4. Document on the local server (assumed to be deployed)
    local: "/uploads/sample.docx",
    // 5. PDF sample
    pdf: "http://nas.7bm.co:9000/docauditai/documents/%E5%90%88%E5%90%8C%E6%96%87%E6%A1%A3/2025/04%E6%9C%8816%E6%97%A5/%E7%AC%AC16%E5%8F%B7--%E9%94%80%E5%94%AE%E6%97%A0%E6%A0%87%E5%BF%97%E5%A4%96%E5%9B%BD%E5%8D%B7%E7%83%9F_10%E6%97%B626%E5%88%8632%E7%A7%92/%E7%AC%AC16%E5%8F%B7--%E9%94%80%E5%94%AE%E6%97%A0%E6%A0%87%E5%BF%97%E5%A4%96%E5%9B%BD%E5%8D%B7%E7%83%9F.pdf"
  };

  // The PDF sample is the default document.
  const fileUrl = urls.pdf;
  const fileType = getFileTypeFromUrl(fileUrl);

  return {
    fileUrl,
    initialPage,
    extractedContent: mockExtractedContent,
    fileType,
    urls
  };
};
/**
 * Document preview page.
 *
 * Renders the current document on the left (PDF through react-pdf, Word
 * through mammoth's HTML conversion) and the list of extracted content items
 * on the right. Clicking an item scrolls the PDF to the item's page; the
 * items are disabled for Word documents.
 *
 * The renderer is chosen from the URL currently displayed, so switching to
 * another document with the fallback buttons also switches the renderer.
 */
export default function Documents() {
  // Data provided by the route loader.
  const { fileUrl, extractedContent, fileType, urls } = useLoaderData<LoaderData>();

  // State.
  const [numPages, setNumPages] = useState<number | null>(null); // Total PDF pages
  const [scrollToPage, setScrollToPage] = useState<number | null>(null); // Pending scroll target
  const [docxLoading, setDocxLoading] = useState(false); // Word document loading flag
  const [loadError, setLoadError] = useState<string | null>(null); // Load error message
  const [debugInfo, setDebugInfo] = useState<string[]>([]); // Debug log lines
  const [docxHtml, setDocxHtml] = useState<string>(""); // Converted Word HTML
  const [currentUrl, setCurrentUrl] = useState<string>(fileUrl); // URL being displayed
  // Bumped on every explicit switch so that re-selecting the same URL (several
  // entries of `urls` may be identical) still retries the Word download.
  const [reloadCount, setReloadCount] = useState(0);

  // Reference to the rendered Word document container.
  const docxContainerRef = useRef<HTMLDivElement>(null);

  // The loader's `fileType` only describes the initial `fileUrl`; once the
  // user switches documents the type has to follow `currentUrl`.
  const activeFileType: "pdf" | "docx" =
    currentUrl === fileUrl ? fileType : getFileTypeFromUrl(currentUrl);

  /**
   * Handles a click on an extracted content item. Only PDF documents react:
   * the target page is recorded and the scroll effect below performs the
   * actual scrolling.
   * @param item The clicked extracted content item.
   */
  const handleContentClick = (item: ExtractedContent) => {
    if (activeFileType === "pdf") {
      setScrollToPage(item.page);
    }
  };

  /**
   * Callback invoked by react-pdf once the PDF has loaded.
   * @param param0 Object carrying the total page count.
   */
  function onDocumentLoadSuccess({ numPages }: DocumentLoadSuccess) {
    setNumPages(numPages);
  }

  /**
   * Appends a line, prefixed with the current UTC time (HH:MM:SS), to the
   * debug log.
   * @param info Debug message text.
   */
  const addDebugInfo = (info: string) => {
    setDebugInfo(prev => [...prev, `${new Date().toISOString().split('T')[1].split('.')[0]}: ${info}`]);
  };

  /**
   * Switches the preview to another entry of `urls` and resets the
   * per-document state. Unknown keys are ignored.
   * @param urlKey Key of the URL to display.
   */
  const switchDocumentUrl = (urlKey: keyof typeof urls) => {
    const nextUrl = urls[urlKey];
    if (!nextUrl) {
      return;
    }
    setCurrentUrl(nextUrl);
    setReloadCount(count => count + 1);
    setNumPages(null); // The page count belongs to the previous document.
    setDebugInfo([]);
    setLoadError(null);
    setDocxLoading(false);
    addDebugInfo(`切换到新的文档URL: ${nextUrl}`);
  };

  /**
   * Word document pipeline: download the file, read it as an ArrayBuffer,
   * convert it to HTML with mammoth and store the result in state.
   * `addDebugInfo` is intentionally not a dependency: it only uses the
   * functional form of a state setter.
   */
  useEffect(() => {
    if (activeFileType !== "docx") {
      return;
    }

    // Cancellation: when the URL changes or the component unmounts, abort the
    // download and ignore any result that still arrives, so a slow response
    // cannot overwrite a newer one.
    const controller = new AbortController();
    let stale = false;
    const log = (info: string) => {
      if (!stale) {
        addDebugInfo(info);
      }
    };
    const errorText = (err: unknown) => (err instanceof Error ? err.message : String(err));

    setDocxLoading(true);
    setDebugInfo([]); // Start each load with an empty debug log.
    log(`准备加载Word文档: ${currentUrl}`);

    const loadDocx = async () => {
      try {
        // 1. Download the document.
        log(`开始获取文件...`);
        let response: Response;
        try {
          // No custom request headers: `Access-Control-Allow-Origin` is a
          // response header, and sending it would force a CORS preflight that
          // most storage servers reject.
          response = await fetch(currentUrl, {
            mode: 'cors',
            credentials: 'omit',
            signal: controller.signal
          });
          log(`fetch请求状态: ${response.status} ${response.statusText}`);
        } catch (fetchError) {
          log(`fetch请求失败: ${errorText(fetchError)}`);
          throw new Error(`网络请求失败: ${errorText(fetchError)}`);
        }

        if (!response.ok) {
          throw new Error(`文档无法访问,状态码: ${response.status}`);
        }
        log(`文档下载成功,状态码: ${response.status}`);

        // 2. Read the response body as an ArrayBuffer.
        log(`开始读取响应内容为ArrayBuffer...`);
        let buffer: ArrayBuffer;
        try {
          buffer = await response.arrayBuffer();
          log(`获取到文档数据,大小: ${buffer.byteLength} 字节`);
        } catch (bufferError) {
          log(`读取为ArrayBuffer失败: ${errorText(bufferError)}`);
          throw new Error(`转换文档内容失败: ${errorText(bufferError)}`);
        }

        // 3. Convert the Word document to HTML with mammoth.
        log("使用mammoth开始转换文档为HTML...");
        try {
          // Custom style mapping.
          const styleMap = `
p[style-name='Heading 1'] => h1:fresh
p[style-name='Heading 2'] => h2:fresh
p[style-name='Title'] => h1.title:fresh
p[style-name='Subtitle'] => h2.subtitle:fresh
table => table.docx-table
`;
          // mammoth takes the document as the first argument and conversion
          // options as the second; options placed on the input object are
          // ignored.
          const result = await mammoth.convertToHtml(
            { arrayBuffer: buffer },
            { styleMap: styleMap, includeDefaultStyleMap: true }
          );

          // Surface conversion warnings in the debug log.
          result.messages.forEach(message => {
            log(`转换警告: [${message.type}] ${message.message}`);
          });
          log("文档转换成功,获取到HTML内容");

          // 4. Wrap the generated HTML in a styled container.
          const enhancedHtml = `
<div class="document-container">
${result.value}
<div class="format-note">
<p>注意:部分复杂格式(如页眉页脚、复杂表格样式)可能无法完全显示。</p>
</div>
</div>
`;
          if (stale) {
            return;
          }
          setDocxHtml(enhancedHtml);
          setDocxLoading(false);
        } catch (mammothError) {
          log(`Mammoth转换失败: ${errorText(mammothError)}`);
          throw new Error(`Word转HTML失败: ${errorText(mammothError)}`);
        }
      } catch (error: unknown) {
        if (stale) {
          // Superseded or unmounted: the failure (typically an abort) is moot.
          return;
        }
        const errorMessage = errorText(error);
        addDebugInfo(`文档处理错误: ${errorMessage}`);
        setLoadError(`加载Word文档失败: ${errorMessage}`);
        setDocxLoading(false);
      }
    };
    void loadDocx();

    return () => {
      stale = true;
      controller.abort();
    };
  }, [currentUrl, activeFileType, reloadCount]);

  /**
   * Scrolls the requested PDF page into view, then clears the request so the
   * same page can be requested again.
   */
  useEffect(() => {
    if (scrollToPage !== null && activeFileType === "pdf") {
      const pageElement = document.getElementById(`page-${scrollToPage}`);
      if (pageElement) {
        pageElement.scrollIntoView({ behavior: 'smooth' });
      }
      setScrollToPage(null);
    }
  }, [scrollToPage, activeFileType]);

  /**
   * Builds one wrapper element per PDF page; the wrapper id `page-N` is the
   * scroll target used above.
   * @returns The page elements, or null while the page count is unknown.
   */
  const renderAllPages = () => {
    if (!numPages) return null;
    return Array.from({ length: numPages }, (_, index) => {
      const pageNumber = index + 1;
      return (
        <div key={pageNumber} id={`page-${pageNumber}`} className="mb-6">
          <div className="text-center text-gray-500 text-sm mb-2"> {pageNumber} </div>
          <Page
            pageNumber={pageNumber}
            renderTextLayer={true}
            renderAnnotationLayer={true}
            className="border border-gray-300 shadow-md"
          />
        </div>
      );
    });
  };

  return (
    <div className="flex h-screen bg-gray-50">
      {/* Document display area */}
      <div className="flex-1 mr-6 p-4">
        <div className="bg-white p-4 rounded-lg shadow-md h-full flex flex-col">
          <h1 className="text-2xl font-bold mb-4"> ({activeFileType.toUpperCase()})</h1>
          {/* Document content */}
          <div className="w-full flex-1 overflow-auto bg-gray-100 rounded-lg p-4">
            {loadError ? (
              <div className="text-red-500 flex flex-col items-center justify-center h-full">
                <p className="mb-4">:</p>
                <p>{loadError}</p>
                <div className="mt-6 p-4 bg-gray-800 text-green-400 rounded text-xs max-w-xl overflow-auto max-h-96">
                  <p className="font-bold mb-2">:</p>
                  {debugInfo.map((info, index) => (
                    <div key={index} className="mb-1">{info}</div>
                  ))}
                </div>
                <div className="mt-4">
                  <p className="text-black mb-2">:</p>
                  <div className="flex flex-wrap gap-2">
                    <button onClick={() => switchDocumentUrl('public')} className="px-3 py-1 bg-green-500 text-white rounded">
                      使
                    </button>
                    <button onClick={() => switchDocumentUrl('proxy')} className="px-3 py-1 bg-blue-500 text-white rounded">
                      使CORS代理
                    </button>
                    <button onClick={() => switchDocumentUrl('pdf')} className="px-3 py-1 bg-yellow-500 text-white rounded">
                      PDF
                    </button>
                    <a href={currentUrl} className="px-3 py-1 bg-gray-500 text-white rounded" download target="_blank" rel="noreferrer">
                    </a>
                  </div>
                </div>
              </div>
            ) : activeFileType === "pdf" ? (
              /* PDF rendering */
              <Document
                file={currentUrl}
                onLoadSuccess={onDocumentLoadSuccess}
                onLoadError={(error) => {
                  console.error("PDF加载错误:", error);
                  setLoadError("PDF文档加载失败:" + (error.message || "未知错误"));
                }}
                className="flex flex-col items-center"
                error={<div className="text-red-500">PDF文档加载失败</div>}
                noData={<div></div>}
                loading={<div className="text-center py-10">PDF加载中...</div>}
              >
                {renderAllPages()}
              </Document>
            ) : (
              /* Word rendering */
              <>
                {docxLoading ? (
                  /* Loading indicator */
                  <div className="flex flex-col items-center justify-center h-full">
                    <div className="mb-6">
                      <div className="animate-spin rounded-full h-12 w-12 border-t-2 border-b-2 border-blue-500"></div>
                    </div>
                    <p className="mb-4 text-lg">Word文档加载中...</p>
                    {debugInfo.length > 0 && (
                      <div className="mt-4 p-4 bg-gray-800 text-green-400 rounded text-xs max-w-xl overflow-auto max-h-72">
                        <p className="font-bold mb-2">:</p>
                        {debugInfo.map((info, index) => (
                          <div key={index} className="mb-1">{info}</div>
                        ))}
                      </div>
                    )}
                  </div>
                ) : (
                  /* Locally rendered Word document.
                     SECURITY(review): mammoth does not sanitise its output, so
                     documents from untrusted sources should be passed through
                     an HTML sanitizer before being injected here. */
                  <div
                    ref={docxContainerRef}
                    className="w-full h-full"
                    style={{
                      height: '100%',
                      overflowY: 'auto',
                      padding: '20px',
                      backgroundColor: 'white'
                    }}
                    dangerouslySetInnerHTML={{ __html: docxHtml }}
                  />
                )}
              </>
            )}
          </div>
        </div>
      </div>
      {/* Extracted content - always shown, but not interactive for DOCX */}
      <div className="w-80 bg-white p-4 rounded-lg shadow-md mr-4 my-4 overflow-auto">
        <h2 className="text-xl font-semibold mb-4"></h2>
        <ul className="space-y-3">
          {extractedContent.map((item) => (
            // Each button is wrapped in <li>: <ul> may only contain list items.
            <li key={item.id}>
              <button
                onClick={() => handleContentClick(item)}
                className={`w-full text-left p-3 ${activeFileType === "pdf" ? "bg-gray-50 hover:bg-gray-100 cursor-pointer" : "bg-gray-100"} rounded-lg transition`}
                disabled={activeFileType === "docx"}
                aria-label={`查看内容: ${item.text}`}
              >
                <p className="text-sm font-medium">{item.text}</p>
                <p className="text-xs text-gray-500">: {item.page}</p>
              </button>
            </li>
          ))}
        </ul>
      </div>
      {/* Custom styles for the converted Word document */}
      <style dangerouslySetInnerHTML={{
        __html: `
/* 高亮显示样式 */
.docx-highlight {
background-color: #ffff00;
outline: 2px solid orange;
position: relative;
}
/* 找到的内容高亮样式 */
.docx-content-found {
background-color: rgba(255, 230, 0, 0.3);
outline: 1px solid orange;
}
/* Mammoth.js生成的内容样式 */
.document-container {
font-family: "Microsoft YaHei", Arial, sans-serif;
line-height: 1.5;
color: #333;
max-width: 800px;
margin: 0 auto;
}
.document-container .format-note {
margin-top: 30px;
padding: 10px;
background-color: #f5f5f5;
border-left: 3px solid #ccc;
font-size: 12px;
color: #666;
}
.document-container h1 {
font-size: 24px;
margin-top: 24px;
margin-bottom: 16px;
font-weight: bold;
color: #222;
}
.document-container h1.title {
font-size: 28px;
text-align: center;
margin-bottom: 24px;
}
.document-container h2 {
font-size: 20px;
margin-top: 20px;
margin-bottom: 14px;
font-weight: bold;
color: #333;
}
.document-container h2.subtitle {
font-size: 18px;
text-align: center;
margin-bottom: 20px;
color: #555;
}
.document-container p {
margin-bottom: 16px;
text-align: justify;
overflow-wrap: break-word;
}
.document-container table {
border-collapse: collapse;
width: 100%;
margin-bottom: 16px;
}
.document-container table.docx-table {
border: 1px solid #ddd;
margin: 16px 0;
}
.document-container table.docx-table th,
.document-container table.docx-table td {
border: 1px solid #ddd;
padding: 8px;
text-align: left;
}
.document-container table.docx-table th {
background-color: #f2f2f2;
font-weight: bold;
}
.document-container ul, .document-container ol {
margin-left: 20px;
margin-bottom: 16px;
}
.document-container li {
margin-bottom: 5px;
}
.document-container img {
max-width: 100%;
height: auto;
margin: 10px 0;
}
.document-container span.underline {
text-decoration: underline;
}
.document-container span.strikethrough {
text-decoration: line-through;
}
/* 段落缩进 */
.document-container p:not(.no-indent) {
text-indent: 2em;
}
`
      }} />
    </div>
  );
}