This commit is contained in:
2025-12-05 00:09:32 +08:00
parent bb3d22eabf
commit 3d1dbb3f97
214 changed files with 113060 additions and 1232 deletions
+150
View File
@@ -0,0 +1,150 @@
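/**
 * Tests extracting placeholders from a DOCX file (local-file version).
 * Run: node scripts/test-docx-parser.cjs
 *
 * Note: this script tests a document on the local file system.
 * In the real application the file is downloaded from MinIO (using fetch).
 */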
const fs = require('fs');
const path = require('path');
const PizZip = require('pizzip');
// ---------------------------------------------------------------------------
// Placeholder-extraction smoke test for a local DOCX file.
//
// The pure text-processing steps live in small helpers so they can be checked
// without a DOCX on disk; main() does the file I/O and prints the report.
// ---------------------------------------------------------------------------

/** Horizontal rule printed between report sections. */
const SEPARATOR = '='.repeat(60);

/** The five entities predefined by the XML specification, keyed by name. */
const XML_NAMED_ENTITIES = Object.freeze({
  amp: '&',
  lt: '<',
  gt: '>',
  quot: '"',
  apos: "'",
});

/** Keyword -> field type rules, checked in order; the first matching rule wins. */
const FIELD_TYPE_RULES = Object.freeze([
  { value: 'number', keywords: ['金额', '数量', '价格'] },
  { value: 'date', keywords: ['日期', '时间'] },
  { value: 'textarea', keywords: ['地址', '说明', '备注'] },
]);

/** Keyword -> field group rules, checked in order; the first matching rule wins. */
const FIELD_GROUP_RULES = Object.freeze([
  { value: '甲方信息', keywords: ['甲方'] },
  { value: '乙方信息', keywords: ['乙方'] },
  { value: '合同条款', keywords: ['金额', '价格', '数量'] },
  { value: '日期信息', keywords: ['日期', '时间'] },
]);

/**
 * Decodes the predefined XML entities and numeric character references.
 *
 * Decoding is done in a single pass, so `&amp;lt;` becomes `&lt;` and is not
 * decoded a second time. Unknown entities and out-of-range code points are
 * left untouched.
 *
 * @param {string} text - Text that may contain XML entity references.
 * @returns {string} The text with recognised references replaced.
 */
function decodeXmlEntities(text) {
  return text.replace(/&(#x[0-9a-fA-F]+|#\d+|amp|lt|gt|quot|apos);/g, (whole, body) => {
    if (body[0] !== '#') {
      return XML_NAMED_ENTITIES[body];
    }
    const codePoint =
      body[1] === 'x' ? Number.parseInt(body.slice(2), 16) : Number.parseInt(body.slice(1), 10);
    // String.fromCodePoint throws a RangeError above U+10FFFF; keep such references verbatim.
    return codePoint <= 0x10ffff ? String.fromCodePoint(codePoint) : whole;
  });
}

/**
 * Reduces an XML document to its text content.
 *
 * Tags are removed first and entities decoded afterwards, so a decoded `<`
 * can never be mistaken for the start of a tag. Text from adjacent elements
 * is concatenated without a separator, which re-joins a placeholder whose
 * braces and name were stored in separate elements.
 *
 * @param {string} xmlContent - Raw XML markup.
 * @returns {string} The concatenated, entity-decoded text.
 */
function xmlToPlainText(xmlContent) {
  return decodeXmlEntities(xmlContent.replace(/<[^>]+>/g, ''));
}

/**
 * Collects the unique `{{name}}` placeholders found in a text.
 *
 * Names are trimmed, blank names are skipped, and the first-seen order is
 * kept. The name may not contain braces, so an unclosed `{{` cannot absorb
 * the text up to the next well-formed placeholder.
 *
 * @param {string} text - Plain text to scan.
 * @returns {string[]} Unique placeholder names in order of first appearance.
 */
function extractPlaceholders(text) {
  const names = new Set();
  for (const match of text.matchAll(/\{\{([^{}]+)\}\}/g)) {
    const name = match[1].trim();
    if (name) {
      names.add(name);
    }
  }
  return Array.from(names);
}

/**
 * Returns the value of the first rule with a keyword contained in the placeholder.
 *
 * @param {ReadonlyArray<{value: string, keywords: string[]}>} rules - Ordered rules.
 * @param {string} placeholder - Placeholder name to classify.
 * @param {string} fallback - Value returned when no rule matches.
 * @returns {string} The matched rule value or the fallback.
 */
function classifyByKeyword(rules, placeholder, fallback) {
  const rule = rules.find(({ keywords }) =>
    keywords.some((keyword) => placeholder.includes(keyword)),
  );
  return rule ? rule.value : fallback;
}

/**
 * Guesses a form field type from a placeholder name.
 *
 * @param {string} placeholder - Placeholder name.
 * @returns {string} 'number', 'date', 'textarea', or 'text' when nothing matches.
 */
function inferFieldType(placeholder) {
  return classifyByKeyword(FIELD_TYPE_RULES, placeholder, 'text');
}

/**
 * Guesses the form group a placeholder belongs to.
 *
 * @param {string} placeholder - Placeholder name.
 * @returns {string} The group label, or '基本信息' when nothing matches.
 */
function inferFieldGroup(placeholder) {
  return classifyByKeyword(FIELD_GROUP_RULES, placeholder, '基本信息');
}

/**
 * Guesses whether a field is required: it is, unless its name is marked optional.
 *
 * @param {string} placeholder - Placeholder name.
 * @returns {boolean} False when the name contains '可选' or 'optional'.
 */
function isRequiredField(placeholder) {
  return !placeholder.includes('可选') && !placeholder.includes('optional');
}

/**
 * Reads the sample DOCX, extracts its placeholders and prints a report.
 *
 * Exits the process with status 1 when the file or word/document.xml is
 * missing, or when any step throws.
 */
function main() {
  // Path of the sample document under test.
  const testDocPath = path.join(__dirname, '..', 'public', 'testWork', '买卖合同 (1).docx');

  console.log(SEPARATOR);
  console.log('测试从 DOCX 文件提取占位符功能(本地文件)');
  console.log(SEPARATOR);
  console.log('');

  try {
    // 1. Make sure the file exists.
    if (!fs.existsSync(testDocPath)) {
      console.error('❌ 文件不存在:', testDocPath);
      process.exit(1);
    }
    console.log('✅ 文件存在:', testDocPath);
    console.log('');

    // 2. Read the file (local files are read with fs.readFileSync).
    const content = fs.readFileSync(testDocPath);
    console.log('✅ 文件读取成功, 大小:', (content.length / 1024).toFixed(2), 'KB');
    console.log('');

    // 3. Open the archive with PizZip.
    const zip = new PizZip(content);
    console.log('✅ PizZip 解压成功');
    console.log('');

    // 4. Locate word/document.xml inside the archive.
    const documentXml = zip.file('word/document.xml');
    if (!documentXml) {
      console.error('❌ 无法找到 word/document.xml 文件');
      process.exit(1);
    }
    console.log('✅ document.xml 读取成功');
    console.log('');

    // 5. Get the XML content and reduce it to plain text.
    const xmlContent = documentXml.asText();
    console.log('✅ XML 内容长度:', xmlContent.length);
    const fullText = xmlToPlainText(xmlContent);
    console.log('✅ 提取纯文本成功');
    console.log('文本长度:', fullText.length);
    console.log('');

    // 6. Extract the placeholders.
    const placeholderList = extractPlaceholders(fullText);
    console.log('✅ 提取占位符成功');
    console.log('');
    console.log('找到', placeholderList.length, '个占位符:');
    console.log(SEPARATOR);
    placeholderList.forEach((name, index) => {
      console.log(`${index + 1}. {{${name}}}`);
    });
    console.log(SEPARATOR);
    console.log('');

    // 7. Guess each field's type, group and required flag.
    console.log('字段分析:');
    console.log(SEPARATOR);
    for (const placeholder of placeholderList) {
      console.log(`📝 ${placeholder}`);
      console.log(` 类型: ${inferFieldType(placeholder)}`);
      console.log(` 分组: ${inferFieldGroup(placeholder)}`);
      console.log(` 必填: ${isRequiredField(placeholder) ? '是' : '否'}`);
      console.log('');
    }
    console.log(SEPARATOR);
    console.log('');
    console.log('🎉 测试完成!所有功能正常工作。');
    console.log('');
  } catch (error) {
    console.error('');
    console.error('❌ 测试失败:');
    console.error(error);
    console.error('');
    process.exit(1);
  }
}

// Run only when executed directly (node scripts/test-docx-parser.cjs). Loading
// this file from another module, or in an environment without the CommonJS
// globals, defines the helpers without touching the file system.
if (typeof require !== 'undefined' && require.main === module) {
  main();
}