/**
 * Document preview and content extraction module.
 *
 * Dependencies:
 * 1. react-pdf - PDF document preview
 *    Install: npm install react-pdf
 *    or:      yarn add react-pdf
 *
 * 2. mammoth - converts Word documents to HTML for preview
 *    Install: npm install mammoth
 *    or:      yarn add mammoth
 *
 * 3. @remix-run/react, @remix-run/node - Remix framework components
 *    Install: npm install @remix-run/react @remix-run/node
 *    or:      yarn add @remix-run/react @remix-run/node
 *
 * Notes:
 * - react-pdf depends on pdfjs-dist, which is installed automatically together with react-pdf.
 * - The PDF.js worker file must be loaded; this module loads it from a CDN.
 * - To load the PDF.js worker locally instead, install pdfjs-dist and change the worker configuration.
 */

import { useCallback, useEffect, useRef, useState } from "react";
import { useLoaderData } from "@remix-run/react";
import { Document, Page, pdfjs } from "react-pdf";
import type { LoaderFunctionArgs } from "@remix-run/node";
import mammoth from "mammoth";

/**
 * Configure the pdf.js worker used by react-pdf.
 *
 * The worker script is loaded from the cdnjs CDN, using the version string
 * reported by `pdfjs.version` so the worker matches the pdf.js build in use.
 * An explicit `https:` scheme is used instead of a protocol-relative URL so
 * the address does not depend on the scheme of the hosting page.
 *
 * NOTE(review): the file name `pdf.worker.js` matches older pdf.js releases;
 * newer releases may publish the worker under a different name - confirm
 * against the installed pdfjs-dist version.
 */
pdfjs.GlobalWorkerOptions.workerSrc = `https://cdnjs.cloudflare.com/ajax/libs/pdf.js/${pdfjs.version}/pdf.worker.js`;

/**
 * A single piece of content extracted from a document.
 */
interface ExtractedContent {
  /** Unique identifier of the extracted item. */
  id: number;
  /** The extracted text. */
  text: string;
  /** Page number the text was found on. */
  page: number;
  /** Position of the text within the page, as a start/end pair. */
  position: {
    start: number;
    end: number;
  };
}

/**
 * Mock of the extraction results a backend would return.
 * A real application should fetch this from an API.
 *
 * The array is annotated with `ExtractedContent[]` so that any drift between
 * the mock data and the interface is reported by the compiler.
 */
const mockExtractedContent: ExtractedContent[] = [
  { id: 1, text: "合同条款", page: 2, position: { start: 50, end: 60 } },
  { id: 2, text: "签署日期", page: 5, position: { start: 120, end: 130 } },
  { id: 3, text: "责任划分", page: 3, position: { start: 80, end: 90 } },
];

/**
 * Keys of the document URL table returned by the route loader.
 * Listing them explicitly lets the compiler reject unknown keys when a
 * document is selected by key.
 */
type DocumentUrlKey = "original" | "public" | "proxy" | "local" | "pdf";

/**
 * Shape of the data returned by the route loader.
 */
interface LoaderData {
  /** URL of the document shown initially. */
  fileUrl: string;
  /** Page number requested for the initial view. */
  initialPage: number;
  /** Extracted content items listed next to the document. */
  extractedContent: ExtractedContent[];
  /** Type of the document behind `fileUrl`. */
  fileType: "pdf" | "docx";
  /** Available document URLs, addressed by key. */
  urls: Record<DocumentUrlKey, string>;
}

/**
 * Argument passed to the PDF "load success" callback.
 */
interface DocumentLoadSuccess {
  /** Total number of pages in the loaded document. */
  numPages: number;
}

/**
 * Determine the document type from a URL by looking at its file extension.
 *
 * The extension of the path part (everything before the first `?` or `#`) is
 * checked first, so URLs carrying a query string or fragment - for example
 * signed download links - are classified correctly. When the path has no
 * recognised extension the whole URL is checked, which keeps the previous
 * behaviour for URLs whose extension only appears at the very end.
 *
 * @param url Document URL
 * @returns "docx" for `.docx` / `.doc`, "pdf" for `.pdf`; anything
 *          unrecognised is treated as "pdf"
 */
function getFileTypeFromUrl(url: string): "pdf" | "docx" {
  const classify = (candidate: string): "pdf" | "docx" | undefined => {
    if (candidate.endsWith(".pdf")) {
      return "pdf";
    }
    if (candidate.endsWith(".docx") || candidate.endsWith(".doc")) {
      return "docx";
    }
    return undefined;
  };

  const lowerCaseUrl = url.toLowerCase();
  // Drop the query string and fragment so they cannot hide the extension.
  const pathOnly = lowerCaseUrl.split(/[?#]/, 1)[0] ?? "";

  // Unknown extensions fall back to PDF.
  return classify(pathOnly) ?? classify(lowerCaseUrl) ?? "pdf";
}

/**
 * Parse the `page` query parameter into a usable page number.
 *
 * @param rawPage Raw query-string value, or null when the parameter is absent
 * @returns The value as a positive integer; 1 when it is missing, not a
 *          number, fractional, or smaller than 1
 */
function parseInitialPage(rawPage: string | null): number {
  const parsed = Number(rawPage);
  return Number.isInteger(parsed) && parsed >= 1 ? parsed : 1;
}

/**
 * Remix loader - reads the request and assembles the data for the page.
 *
 * Reads the optional `page` query parameter, selects the document URL to
 * show, derives its type from the URL, and returns these together with the
 * mock extraction results and the table of available URLs.
 */
export const loader = async ({ request }: LoaderFunctionArgs) => {
  // Read query parameters from the request URL.
  const url = new URL(request.url);
  const initialPage = parseInitialPage(url.searchParams.get("page"));

  // Sample document URLs. The "original", "public" and "proxy" entries
  // currently all contain the same address.
  const urls = {
    // 1. Original document URL - may be subject to CORS restrictions
    original: "https://dev-xc-enroll.oss-cn-guangzhou.aliyuncs.com/uploads/7840-230620112939.docx",
    // 2. Public sample document - may still be subject to CORS restrictions
    public: "https://dev-xc-enroll.oss-cn-guangzhou.aliyuncs.com/uploads/7840-230620112939.docx",
    // 3. Via a CORS proxy (example)
    proxy: "https://dev-xc-enroll.oss-cn-guangzhou.aliyuncs.com/uploads/7840-230620112939.docx",
    // 4. Document on the local server (assumed to be deployed)
    local: "/uploads/sample.docx",
    // 5. PDF sample
    pdf: "http://nas.7bm.co:9000/docauditai/documents/%E5%90%88%E5%90%8C%E6%96%87%E6%A1%A3/2025/04%E6%9C%8816%E6%97%A5/%E7%AC%AC16%E5%8F%B7--%E9%94%80%E5%94%AE%E6%97%A0%E6%A0%87%E5%BF%97%E5%A4%96%E5%9B%BD%E5%8D%B7%E7%83%9F_10%E6%97%B626%E5%88%8632%E7%A7%92/%E7%AC%AC16%E5%8F%B7--%E9%94%80%E5%94%AE%E6%97%A0%E6%A0%87%E5%BF%97%E5%A4%96%E5%9B%BD%E5%8D%B7%E7%83%9F.pdf",
  };

  // The PDF sample is the default document.
  const fileUrl = urls.pdf;

  // Derive the document type from the chosen URL.
  const fileType = getFileTypeFromUrl(fileUrl);

  return {
    fileUrl,
    initialPage,
    extractedContent: mockExtractedContent,
    fileType,
    urls,
  };
};

/**
 * Mammoth style-map rules: map Word heading/title styles to HTML headings and
 * tag every table with the `docx-table` class. One rule per line.
 */
const DOCX_STYLE_MAP = [
  "p[style-name='Heading 1'] => h1:fresh",
  "p[style-name='Heading 2'] => h2:fresh",
  "p[style-name='Title'] => h1.title:fresh",
  "p[style-name='Subtitle'] => h2.subtitle:fresh",
  "table => table.docx-table",
].join("\n");

/**
 * Static CSS for the converted Word content and the highlight classes.
 * Kept at module level so the string is created once, not on every render.
 */
const DOCUMENT_STYLES = `
  /* 高亮显示样式 */
  .docx-highlight {
    background-color: #ffff00;
    outline: 2px solid orange;
    position: relative;
  }

  /* 找到的内容高亮样式 */
  .docx-content-found {
    background-color: rgba(255, 230, 0, 0.3);
    outline: 1px solid orange;
  }

  /* Mammoth.js生成的内容样式 */
  .document-container {
    font-family: "Microsoft YaHei", Arial, sans-serif;
    line-height: 1.5;
    color: #333;
    max-width: 800px;
    margin: 0 auto;
  }

  .document-container .format-note {
    margin-top: 30px;
    padding: 10px;
    background-color: #f5f5f5;
    border-left: 3px solid #ccc;
    font-size: 12px;
    color: #666;
  }

  .document-container h1 {
    font-size: 24px;
    margin-top: 24px;
    margin-bottom: 16px;
    font-weight: bold;
    color: #222;
  }

  .document-container h1.title {
    font-size: 28px;
    text-align: center;
    margin-bottom: 24px;
  }

  .document-container h2 {
    font-size: 20px;
    margin-top: 20px;
    margin-bottom: 14px;
    font-weight: bold;
    color: #333;
  }

  .document-container h2.subtitle {
    font-size: 18px;
    text-align: center;
    margin-bottom: 20px;
    color: #555;
  }

  .document-container p {
    margin-bottom: 16px;
    text-align: justify;
    overflow-wrap: break-word;
  }

  .document-container table {
    border-collapse: collapse;
    width: 100%;
    margin-bottom: 16px;
  }

  .document-container table.docx-table {
    border: 1px solid #ddd;
    margin: 16px 0;
  }

  .document-container table.docx-table th,
  .document-container table.docx-table td {
    border: 1px solid #ddd;
    padding: 8px;
    text-align: left;
  }

  .document-container table.docx-table th {
    background-color: #f2f2f2;
    font-weight: bold;
  }

  .document-container ul, .document-container ol {
    margin-left: 20px;
    margin-bottom: 16px;
  }

  .document-container li {
    margin-bottom: 5px;
  }

  .document-container img {
    max-width: 100%;
    height: auto;
    margin: 10px 0;
  }

  .document-container span.underline {
    text-decoration: underline;
  }

  .document-container span.strikethrough {
    text-decoration: line-through;
  }

  /* 段落缩进 */
  .document-container p:not(.no-indent) {
    text-indent: 2em;
  }
`;

/** Turn any thrown value into a displayable message. */
function describeError(error: unknown): string {
  return error instanceof Error ? error.message : String(error);
}

/**
 * Document preview component.
 *
 * Renders the current document - every page of a PDF through react-pdf, or a
 * Word document converted to HTML with mammoth - next to a list of extracted
 * content items. Clicking an item scrolls the PDF to the item's page; for
 * Word documents the items are shown disabled.
 */
export default function Documents() {
  const {
    fileUrl,
    extractedContent,
    fileType: initialFileType,
    urls,
  } = useLoaderData<LoaderData>();

  const [numPages, setNumPages] = useState<number | null>(null); // total PDF pages
  const [scrollToPage, setScrollToPage] = useState<number | null>(null); // pending scroll target
  const [docxLoading, setDocxLoading] = useState(false); // Word load in progress
  const [loadError, setLoadError] = useState<string | null>(null); // load error text
  const [debugInfo, setDebugInfo] = useState<string[]>([]); // timestamped debug lines
  const [docxHtml, setDocxHtml] = useState<string>(""); // converted Word HTML
  const [currentUrl, setCurrentUrl] = useState<string>(fileUrl); // document being shown
  // Bumped on every switch so the Word loader re-runs even when the newly
  // selected key points at the same URL as the current one.
  const [loadAttempt, setLoadAttempt] = useState(0);

  // Container of the rendered Word document.
  const docxContainerRef = useRef<HTMLDivElement>(null);

  // The type must follow the document actually shown: after switching to
  // another URL the loader-provided type no longer applies.
  const fileType =
    currentUrl === fileUrl ? initialFileType : getFileTypeFromUrl(currentUrl);

  /**
   * Handle a click on an extracted item. Only PDFs react: the target page is
   * stored and the scroll effect below performs the actual scrolling.
   * @param item The clicked extracted item
   */
  const handleContentClick = (item: ExtractedContent) => {
    if (fileType === "pdf") {
      setScrollToPage(item.page);
    }
  };

  /**
   * Called by react-pdf once the PDF has loaded.
   * @param param0 Object carrying the page count
   */
  function onDocumentLoadSuccess({ numPages: loadedPages }: DocumentLoadSuccess) {
    setNumPages(loadedPages);
  }

  /**
   * Append a debug line prefixed with the current UTC time (HH:mm:ss).
   * Wrapped in useCallback so effects can list it as a stable dependency.
   * @param info Debug text
   */
  const addDebugInfo = useCallback((info: string) => {
    const timestamp = new Date().toISOString().slice(11, 19);
    setDebugInfo((prev) => [...prev, `${timestamp}: ${info}`]);
  }, []);

  /**
   * Switch to another document URL and reset all per-document state.
   * @param urlKey Key into the URL table
   */
  const switchDocumentUrl = (urlKey: keyof typeof urls) => {
    const nextUrl = urls[urlKey];
    if (nextUrl === undefined) {
      // Unknown key: keep showing the current document.
      return;
    }
    setCurrentUrl(nextUrl);
    setLoadAttempt((attempt) => attempt + 1);
    setNumPages(null);
    setDocxHtml("");
    setDebugInfo([]);
    setLoadError(null);
    setDocxLoading(false);
    addDebugInfo(`切换到新的文档URL: ${nextUrl}`);
  };

  /**
   * Word document pipeline: download the file, read it as an ArrayBuffer,
   * convert it to HTML with mammoth and store the result. A load that has
   * been superseded (URL change, unmount) is aborted and its results ignored.
   */
  useEffect(() => {
    if (fileType !== "docx") {
      return undefined;
    }

    const controller = new AbortController();
    const { signal } = controller;

    setDocxLoading(true);
    setDebugInfo([]); // start each load with a clean debug log
    addDebugInfo(`准备加载Word文档: ${currentUrl}`);

    const loadDocx = async () => {
      try {
        // 1. Download the document.
        addDebugInfo(`开始获取文件...`);
        let response: Response;
        try {
          // No custom request headers: Access-Control-Allow-Origin is a
          // response header, and sending it only forces a CORS preflight.
          response = await fetch(currentUrl, {
            mode: "cors",
            credentials: "omit",
            signal,
          });
        } catch (fetchError) {
          if (signal.aborted) return;
          addDebugInfo(`fetch请求失败: ${describeError(fetchError)}`);
          throw new Error(`网络请求失败: ${describeError(fetchError)}`);
        }
        addDebugInfo(`fetch请求状态: ${response.status} ${response.statusText}`);

        if (!response.ok) {
          throw new Error(`文档无法访问,状态码: ${response.status}`);
        }
        addDebugInfo(`文档下载成功,状态码: ${response.status}`);

        // 2. Read the body as an ArrayBuffer.
        addDebugInfo(`开始读取响应内容为ArrayBuffer...`);
        let buffer: ArrayBuffer;
        try {
          buffer = await response.arrayBuffer();
        } catch (bufferError) {
          if (signal.aborted) return;
          addDebugInfo(`读取为ArrayBuffer失败: ${describeError(bufferError)}`);
          throw new Error(`转换文档内容失败: ${describeError(bufferError)}`);
        }
        if (signal.aborted) return;
        addDebugInfo(`获取到文档数据,大小: ${buffer.byteLength} 字节`);

        // 3. Convert Word to HTML with mammoth.
        addDebugInfo("使用mammoth开始转换文档为HTML...");
        try {
          // Conversion options go in the SECOND argument; anything placed in
          // the input object besides the document source is ignored.
          const result = await mammoth.convertToHtml(
            { arrayBuffer: buffer },
            { styleMap: DOCX_STYLE_MAP, includeDefaultStyleMap: true },
          );
          if (signal.aborted) return;

          // Surface conversion warnings in the debug log.
          result.messages.forEach((message) => {
            addDebugInfo(`转换警告: [${message.type}] ${message.message}`);
          });
          addDebugInfo("文档转换成功,获取到HTML内容");

          // 4. Wrap the generated HTML in the styled container.
          const enhancedHtml = `
            <div class="document-container">
              ${result.value}
              <div class="format-note">
                <p>注意:部分复杂格式(如页眉页脚、复杂表格样式)可能无法完全显示。</p>
              </div>
            </div>
          `;

          setDocxHtml(enhancedHtml);
          setDocxLoading(false);
        } catch (mammothError) {
          if (signal.aborted) return;
          addDebugInfo(`Mammoth转换失败: ${describeError(mammothError)}`);
          throw new Error(`Word转HTML失败: ${describeError(mammothError)}`);
        }
      } catch (error: unknown) {
        if (signal.aborted) return;
        const errorMessage = describeError(error);
        addDebugInfo(`文档处理错误: ${errorMessage}`);
        setLoadError(`加载Word文档失败: ${errorMessage}`);
        setDocxLoading(false);
      }
    };

    void loadDocx();

    return () => {
      controller.abort();
    };
  }, [currentUrl, fileType, loadAttempt, addDebugInfo]);

  /**
   * Scroll the requested PDF page into view, then clear the request so the
   * same page can be requested again later.
   */
  useEffect(() => {
    if (scrollToPage && fileType === "pdf") {
      const pageElement = document.getElementById(`page-${scrollToPage}`);
      if (pageElement) {
        pageElement.scrollIntoView({ behavior: "smooth" });
      }
      setScrollToPage(null);
    }
  }, [scrollToPage, fileType]);

  /**
   * Build the list of rendered PDF pages.
   * @returns One element per page, or null while the page count is unknown
   */
  const renderAllPages = () => {
    if (!numPages) return null;

    return Array.from({ length: numPages }, (_, index) => {
      const pageNumber = index + 1;
      return (
        <div key={pageNumber} id={`page-${pageNumber}`} className="mb-6">
          <div className="text-center text-gray-500 text-sm mb-2">第 {pageNumber} 页</div>
          <Page
            pageNumber={pageNumber}
            renderTextLayer={true}
            renderAnnotationLayer={true}
            className="border border-gray-300 shadow-md"
          />
        </div>
      );
    });
  };

  /** Render the collected debug lines, one per row. */
  const renderDebugLines = () =>
    debugInfo.map((info, index) => (
      <div key={index} className="mb-1">{info}</div>
    ));

  return (
    <div className="flex h-screen bg-gray-50">
      {/* Document display area */}
      <div className="flex-1 mr-6 p-4">
        <div className="bg-white p-4 rounded-lg shadow-md h-full flex flex-col">
          <h1 className="text-2xl font-bold mb-4">文档预览 ({fileType.toUpperCase()})</h1>

          {/* Document content */}
          <div className="w-full flex-1 overflow-auto bg-gray-100 rounded-lg p-4">
            {loadError ? (
              <div className="text-red-500 flex flex-col items-center justify-center h-full">
                <p className="mb-4">加载错误:</p>
                <p>{loadError}</p>
                <div className="mt-6 p-4 bg-gray-800 text-green-400 rounded text-xs max-w-xl overflow-auto max-h-96">
                  <p className="font-bold mb-2">调试信息:</p>
                  {renderDebugLines()}
                </div>
                <div className="mt-4">
                  <p className="text-black mb-2">尝试其他方式:</p>
                  <div className="flex flex-wrap gap-2">
                    <button onClick={() => switchDocumentUrl('public')} className="px-3 py-1 bg-green-500 text-white rounded">
                      使用公共示例
                    </button>
                    <button onClick={() => switchDocumentUrl('proxy')} className="px-3 py-1 bg-blue-500 text-white rounded">
                      使用CORS代理
                    </button>
                    <button onClick={() => switchDocumentUrl('pdf')} className="px-3 py-1 bg-yellow-500 text-white rounded">
                      切换到PDF
                    </button>
                    <a href={currentUrl} className="px-3 py-1 bg-gray-500 text-white rounded" download target="_blank" rel="noreferrer">
                      下载文档
                    </a>
                  </div>
                </div>
              </div>
            ) : fileType === "pdf" ? (
              /* PDF rendering */
              <Document
                file={currentUrl}
                onLoadSuccess={onDocumentLoadSuccess}
                onLoadError={(error) => {
                  console.error("PDF加载错误:", error);
                  setLoadError("PDF文档加载失败:" + (error.message || "未知错误"));
                }}
                className="flex flex-col items-center"
                error={<div className="text-red-500">PDF文档加载失败,请检查链接或网络连接。</div>}
                noData={<div>无数据</div>}
                loading={<div className="text-center py-10">PDF加载中...</div>}
              >
                {renderAllPages()}
              </Document>
            ) : docxLoading ? (
              /* Word document: loading state */
              <div className="flex flex-col items-center justify-center h-full">
                <div className="mb-6">
                  <div className="animate-spin rounded-full h-12 w-12 border-t-2 border-b-2 border-blue-500"></div>
                </div>
                <p className="mb-4 text-lg">Word文档加载中...</p>
                {debugInfo.length > 0 && (
                  <div className="mt-4 p-4 bg-gray-800 text-green-400 rounded text-xs max-w-xl overflow-auto max-h-72">
                    <p className="font-bold mb-2">加载过程:</p>
                    {renderDebugLines()}
                  </div>
                )}
              </div>
            ) : (
              /* Word document rendered from the converted HTML.
                 SECURITY NOTE(review): mammoth does not sanitise its output,
                 so a document from an untrusted source can inject markup
                 here. Sanitise docxHtml before rendering if the document
                 source is not fully trusted. */
              <div
                ref={docxContainerRef}
                className="w-full h-full"
                style={{
                  height: '100%',
                  overflowY: 'auto',
                  padding: '20px',
                  backgroundColor: 'white'
                }}
                dangerouslySetInnerHTML={{ __html: docxHtml }}
              />
            )}
          </div>
        </div>
      </div>

      {/* Extracted content - always shown, not interactive for DOCX */}
      <div className="w-80 bg-white p-4 rounded-lg shadow-md mr-4 my-4 overflow-auto">
        <h2 className="text-xl font-semibold mb-4">抽取内容</h2>
        <ul className="space-y-3">
          {extractedContent.map((item) => (
            // A <ul> may only contain <li> children, so each button is wrapped.
            <li key={item.id}>
              <button
                type="button"
                onClick={() => handleContentClick(item)}
                className={`w-full text-left p-3 ${fileType === "pdf" ? "bg-gray-50 hover:bg-gray-100 cursor-pointer" : "bg-gray-100"} rounded-lg transition`}
                disabled={fileType === "docx"}
                aria-label={`查看内容: ${item.text}`}
              >
                <p className="text-sm font-medium">{item.text}</p>
                <p className="text-xs text-gray-500">页面: {item.page}</p>
              </button>
            </li>
          ))}
        </ul>
      </div>

      {/* Styles for the converted Word content */}
      <style dangerouslySetInnerHTML={{ __html: DOCUMENT_STYLES }} />
    </div>
  );
}