/**
 * Tests extracting placeholders from a DOCX file (local-file version).
 * Run: node scripts/test-docx-parser.cjs
 *
 * Note: this script tests a document on the local file system.
 * In the real application the file is downloaded from MinIO (using fetch).
 */

const fs = require('fs');
const path = require('path');
const PizZip = require('pizzip');

// 测试文件路径
|
||
const testDocPath = path.join(__dirname, '..', 'public', 'testWork', '买卖合同 (1).docx');
|
||
|
||
console.log('='.repeat(60));
|
||
console.log('测试从 DOCX 文件提取占位符功能(本地文件)');
|
||
console.log('='.repeat(60));
|
||
console.log('');
|
||
|
||
try {
|
||
// 1. 检查文件是否存在
|
||
if (!fs.existsSync(testDocPath)) {
|
||
console.error('❌ 文件不存在:', testDocPath);
|
||
process.exit(1);
|
||
}
|
||
|
||
console.log('✅ 文件存在:', testDocPath);
|
||
console.log('');
|
||
|
||
// 2. 读取文件(本地文件使用 fs.readFileSync)
|
||
const content = fs.readFileSync(testDocPath);
|
||
console.log('✅ 文件读取成功, 大小:', (content.length / 1024).toFixed(2), 'KB');
|
||
console.log('');
|
||
|
||
// 3. 使用 PizZip 解压
|
||
const zip = new PizZip(content);
|
||
console.log('✅ PizZip 解压成功');
|
||
console.log('');
|
||
|
||
// 4. 读取 document.xml 文件
|
||
const documentXml = zip.file('word/document.xml');
|
||
if (!documentXml) {
|
||
console.error('❌ 无法找到 word/document.xml 文件');
|
||
process.exit(1);
|
||
}
|
||
console.log('✅ document.xml 读取成功');
|
||
console.log('');
|
||
|
||
// 5. 获取 XML 内容并提取纯文本
|
||
const xmlContent = documentXml.asText();
|
||
console.log('✅ XML 内容长度:', xmlContent.length);
|
||
|
||
// 移除所有 XML 标签,只保留纯文本
|
||
const fullText = xmlContent.replace(/<[^>]+>/g, '');
|
||
console.log('✅ 提取纯文本成功');
|
||
console.log('文本长度:', fullText.length);
|
||
console.log('');
|
||
|
||
// 6. 提取占位符
|
||
const placeholderRegex = /\{\{([^}]+)\}\}/g;
|
||
const matches = fullText.matchAll(placeholderRegex);
|
||
|
||
const placeholders = new Set();
|
||
for (const match of matches) {
|
||
const placeholder = match[1].trim();
|
||
if (placeholder) {
|
||
placeholders.add(placeholder);
|
||
}
|
||
}
|
||
|
||
const placeholderList = Array.from(placeholders);
|
||
|
||
console.log('✅ 提取占位符成功');
|
||
console.log('');
|
||
console.log('找到', placeholderList.length, '个占位符:');
|
||
console.log('='.repeat(60));
|
||
placeholderList.forEach((p, index) => {
|
||
console.log(`${index + 1}. {{${p}}}`);
|
||
});
|
||
console.log('='.repeat(60));
|
||
console.log('');
|
||
|
||
// 7. 推测字段类型和分组
|
||
console.log('字段分析:');
|
||
console.log('='.repeat(60));
|
||
|
||
placeholderList.forEach(placeholder => {
|
||
// 推测类型
|
||
let type = 'text';
|
||
if (
|
||
placeholder.includes('金额') ||
|
||
placeholder.includes('数量') ||
|
||
placeholder.includes('价格')
|
||
) {
|
||
type = 'number';
|
||
} else if (
|
||
placeholder.includes('日期') ||
|
||
placeholder.includes('时间')
|
||
) {
|
||
type = 'date';
|
||
} else if (
|
||
placeholder.includes('地址') ||
|
||
placeholder.includes('说明') ||
|
||
placeholder.includes('备注')
|
||
) {
|
||
type = 'textarea';
|
||
}
|
||
|
||
// 推测分组
|
||
let group = '基本信息';
|
||
if (placeholder.includes('甲方')) {
|
||
group = '甲方信息';
|
||
} else if (placeholder.includes('乙方')) {
|
||
group = '乙方信息';
|
||
} else if (
|
||
placeholder.includes('金额') ||
|
||
placeholder.includes('价格') ||
|
||
placeholder.includes('数量')
|
||
) {
|
||
group = '合同条款';
|
||
} else if (
|
||
placeholder.includes('日期') ||
|
||
placeholder.includes('时间')
|
||
) {
|
||
group = '日期信息';
|
||
}
|
||
|
||
// 推测是否必填
|
||
const required = !placeholder.includes('可选') && !placeholder.includes('optional');
|
||
|
||
console.log(`📝 ${placeholder}`);
|
||
console.log(` 类型: ${type}`);
|
||
console.log(` 分组: ${group}`);
|
||
console.log(` 必填: ${required ? '是' : '否'}`);
|
||
console.log('');
|
||
});
|
||
|
||
console.log('='.repeat(60));
|
||
console.log('');
|
||
console.log('🎉 测试完成!所有功能正常工作。');
|
||
console.log('');
|
||
|
||
} catch (error) {
|
||
console.error('');
|
||
console.error('❌ 测试失败:');
|
||
console.error(error);
|
||
console.error('');
|
||
process.exit(1);
|
||
}
|