Files
2025-12-05 00:09:32 +08:00

149 lines
4.5 KiB
TypeScript
Raw Permalink Blame History

This file contains ambiguous Unicode characters
This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.
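/**
* DOCX document parsing utilities.
* Extracts placeholders from .docx files by unzipping them with PizZip and
* scanning word/document.xml directly (docxtemplater is not used here).
*/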
import PizZip from 'pizzip';
import type { PlaceholderField, PlaceholderSchema } from '~/types/contract-draft';
import { DOCUMENT_URL } from '../axios-client';
/**
 * Downloads a .docx file and lists the distinct `{{...}}` placeholder names
 * found in its main document part.
 *
 * @param filePath Relative file path appended to DOCUMENT_URL to form the
 *   download URL (e.g. contract-template/买卖/买卖合同范本.docx)
 * @returns Trimmed placeholder names, de-duplicated, in order of first appearance
 * @throws Error whose message is prefixed with "解析文档失败: " when the download,
 *   unzip or XML lookup fails
 */
export async function extractPlaceholdersFromDocx(
  filePath: string
): Promise<string[]> {
  try {
    // Fetch the raw .docx bytes from the document store.
    const downloaded = await fetch(`${DOCUMENT_URL}${filePath}`);
    if (!downloaded.ok) {
      throw new Error(`下载文件失败: ${downloaded.status} ${downloaded.statusText}`);
    }

    // A .docx is a zip archive; PizZip is handed the bytes as a Buffer.
    const archive = new PizZip(Buffer.from(await downloaded.arrayBuffer()));

    // The XML is read directly rather than through docxtemplater, so that
    // placeholders split across formatting tags can still be found.
    const bodyEntry = archive.file('word/document.xml');
    if (!bodyEntry) {
      throw new Error('无法找到 word/document.xml 文件');
    }

    // Drop every XML tag so only the text content remains, then collect the
    // inner text of each {{...}} occurrence.
    const plainText = bodyEntry.asText().replace(/<[^>]+>/g, '');
    const names = Array.from(
      plainText.matchAll(/\{\{([^}]+)\}\}/g),
      (hit) => hit[1].trim()
    ).filter((name) => name.length > 0);

    // A Set keeps first-seen order while removing duplicates.
    return Array.from(new Set(names));
  } catch (error) {
    console.error('[DOCX Parser] 解析文档失败:', error);
    const reason = error instanceof Error ? error.message : '未知错误';
    throw new Error(`解析文档失败: ${reason}`);
  }
}
/**
 * Builds a default PlaceholderSchema from a list of placeholder names by
 * guessing each field's group, input type and required flag from keywords
 * contained in the name.
 *
 * Chinese keywords are matched as-is. Lower-case English keywords are matched
 * case-insensitively so camelCase names such as `totalAmount` or `signDate`
 * are recognised. The same keyword lists drive both grouping and typing, so a
 * name typed as a number always lands in the contract-terms group (unless it
 * belongs to a party) and a name typed as a date lands in the date group.
 *
 * @param placeholders Placeholder names
 * @returns Schema with one field per placeholder, in input order; the
 *   placeholder itself is used as both key and label
 */
export function generateDefaultSchema(
  placeholders: string[]
): PlaceholderSchema {
  // Shared keyword tables (English entries must be lower-case).
  const numberKeywords = ['金额', '价格', '数量', 'amount', 'price', 'quantity'];
  const dateKeywords = ['日期', '时间', 'date', 'time'];
  const longTextKeywords = ['地址', '说明', '备注', 'address', 'description', 'remark'];

  const fields: PlaceholderField[] = placeholders.map((placeholder) => {
    const lowered = placeholder.toLowerCase();
    const mentions = (keywords: readonly string[]): boolean =>
      keywords.some((keyword) => lowered.includes(keyword));

    // Group: party markers win over content keywords. `partyA` / `partyB` are
    // deliberately matched case-sensitively; lower-casing them would widen the
    // match to unrelated names (e.g. anything containing "partyb...").
    let group = '基本信息';
    if (placeholder.includes('甲方') || placeholder.includes('partyA')) {
      group = '甲方信息';
    } else if (placeholder.includes('乙方') || placeholder.includes('partyB')) {
      group = '乙方信息';
    } else if (mentions(numberKeywords)) {
      group = '合同条款';
    } else if (mentions(dateKeywords)) {
      group = '日期信息';
    }

    // Input type: number takes precedence over date, date over textarea.
    let type: 'text' | 'number' | 'date' | 'textarea' = 'text';
    if (mentions(numberKeywords)) {
      type = 'number';
    } else if (mentions(dateKeywords)) {
      type = 'date';
    } else if (mentions(longTextKeywords)) {
      type = 'textarea';
    }

    // Fields are required unless the name marks them as optional.
    const required = !mentions(['可选', 'optional']);

    return {
      key: placeholder,
      label: placeholder, // the placeholder text doubles as the display label
      type,
      required,
      group
    };
  });

  return { fields };
}