Files
2025-12-05 00:09:32 +08:00

151 lines
4.1 KiB
JavaScript
Raw Permalink Blame History

This file contains ambiguous Unicode characters
This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.
/**
* 测试从 DOCX 文件提取占位符功能(本地文件版本)
* 运行: node scripts/test-docx-parser.cjs
*
* 注意:此脚本测试本地文件系统中的文档
* 实际应用中,文件从 MinIO 下载(使用 fetch)
*/
const fs = require('fs');
const path = require('path');
const PizZip = require('pizzip');
// Path of the sample DOCX document this script inspects.
const testDocPath = path.join(__dirname, '..', 'public', 'testWork', '买卖合同 (1).docx');

// Horizontal rule printed between report sections.
const SEPARATOR = '='.repeat(60);

// The five entities predefined by XML; anything else named is left untouched.
const XML_NAMED_ENTITIES = new Map([
  ['amp', '&'],
  ['lt', '<'],
  ['gt', '>'],
  ['quot', '"'],
  ['apos', "'"],
]);

// Keyword rules for guessing a field type; the first rule that matches wins.
const FIELD_TYPE_RULES = [
  { keywords: ['金额', '数量', '价格'], value: 'number' },
  { keywords: ['日期', '时间'], value: 'date' },
  { keywords: ['地址', '说明', '备注'], value: 'textarea' },
];

// Keyword rules for guessing a field group; the first rule that matches wins,
// so a party keyword (甲方/乙方) takes precedence over amount/date keywords.
const FIELD_GROUP_RULES = [
  { keywords: ['甲方'], value: '甲方信息' },
  { keywords: ['乙方'], value: '乙方信息' },
  { keywords: ['金额', '价格', '数量'], value: '合同条款' },
  { keywords: ['日期', '时间'], value: '日期信息' },
];

/**
 * Decode XML entity and character references in tag-stripped text.
 *
 * Handles the five predefined entities plus decimal (`&#NNN;`) and
 * hexadecimal (`&#xHHH;`) character references. Everything is decoded in a
 * single pass so that text such as `&amp;lt;` becomes `&lt;` and is not
 * decoded a second time.
 *
 * @param {string} text - Text that may still contain XML references.
 * @returns {string} The text with recognised references replaced.
 */
function decodeXmlEntities(text) {
  return text.replace(/&(#x[0-9a-fA-F]+|#[0-9]+|[a-zA-Z]+);/g, (whole, body) => {
    if (body[0] === '#') {
      const isHex = body[1] === 'x';
      const codePoint = Number.parseInt(body.slice(isHex ? 2 : 1), isHex ? 16 : 10);
      // String.fromCodePoint throws above U+10FFFF; keep such references verbatim.
      return codePoint <= 0x10ffff ? String.fromCodePoint(codePoint) : whole;
    }
    return XML_NAMED_ENTITIES.has(body) ? XML_NAMED_ENTITIES.get(body) : whole;
  });
}

/**
 * Collect the unique `{{name}}` placeholders found in a piece of text.
 *
 * @param {string} text - Plain text to scan.
 * @returns {string[]} Trimmed, non-empty placeholder names in order of first
 *   appearance, without duplicates.
 */
function extractPlaceholders(text) {
  const unique = new Set();
  for (const match of text.matchAll(/\{\{([^}]+)\}\}/g)) {
    const name = match[1].trim();
    if (name) {
      unique.add(name);
    }
  }
  return Array.from(unique);
}

/**
 * Return the value of the first rule whose keywords occur in the placeholder.
 *
 * @param {string} placeholder - Placeholder name to classify.
 * @param {{keywords: string[], value: string}[]} rules - Ordered rule table.
 * @param {string} fallback - Value returned when no rule matches.
 * @returns {string} The matching rule's value, or `fallback`.
 */
function firstMatchingRule(placeholder, rules, fallback) {
  const rule = rules.find(({ keywords }) =>
    keywords.some((keyword) => placeholder.includes(keyword)),
  );
  return rule ? rule.value : fallback;
}

console.log(SEPARATOR);
console.log('测试从 DOCX 文件提取占位符功能(本地文件)');
console.log(SEPARATOR);
console.log('');

try {
  // 1. Make sure the sample document exists before doing anything else.
  if (!fs.existsSync(testDocPath)) {
    console.error('❌ 文件不存在:', testDocPath);
    process.exit(1);
  }
  console.log('✅ 文件存在:', testDocPath);
  console.log('');

  // 2. Read the file (local files are read with fs.readFileSync).
  const content = fs.readFileSync(testDocPath);
  console.log('✅ 文件读取成功, 大小:', (content.length / 1024).toFixed(2), 'KB');
  console.log('');

  // 3. Open the DOCX archive with PizZip.
  const zip = new PizZip(content);
  console.log('✅ PizZip 解压成功');
  console.log('');

  // 4. Locate the document.xml entry inside the archive.
  const documentXml = zip.file('word/document.xml');
  if (!documentXml) {
    console.error('❌ 无法找到 word/document.xml 文件');
    process.exit(1);
  }
  console.log('✅ document.xml 读取成功');
  console.log('');

  // 5. Get the XML content and reduce it to plain text.
  const xmlContent = documentXml.asText();
  console.log('✅ XML 内容长度:', xmlContent.length);
  // Strip every tag first, then decode entity references: characters such as
  // "&" are stored escaped ("&amp;") in the XML and would otherwise leak into
  // placeholder names.
  const fullText = decodeXmlEntities(xmlContent.replace(/<[^>]+>/g, ''));
  console.log('✅ 提取纯文本成功');
  console.log('文本长度:', fullText.length);
  console.log('');

  // 6. Extract the placeholders.
  const placeholderList = extractPlaceholders(fullText);
  console.log('✅ 提取占位符成功');
  console.log('');
  console.log('找到', placeholderList.length, '个占位符:');
  console.log(SEPARATOR);
  placeholderList.forEach((p, index) => {
    console.log(`${index + 1}. {{${p}}}`);
  });
  console.log(SEPARATOR);
  console.log('');

  // 7. Guess a field type, group and required flag for each placeholder.
  console.log('字段分析:');
  console.log(SEPARATOR);
  placeholderList.forEach((placeholder) => {
    const type = firstMatchingRule(placeholder, FIELD_TYPE_RULES, 'text');
    const group = firstMatchingRule(placeholder, FIELD_GROUP_RULES, '基本信息');
    // A field is treated as required unless its name marks it as optional.
    const required = !placeholder.includes('可选') && !placeholder.includes('optional');

    console.log(`📝 ${placeholder}`);
    console.log(` 类型: ${type}`);
    console.log(` 分组: ${group}`);
    console.log(` 必填: ${required ? '是' : '否'}`);
    console.log('');
  });
  console.log(SEPARATOR);
  console.log('');
  console.log('🎉 测试完成!所有功能正常工作。');
  console.log('');
} catch (error) {
  // Any failure (unreadable file, invalid archive, ...) is reported and
  // turned into a non-zero exit code.
  console.error('');
  console.error('❌ 测试失败:');
  console.error(error);
  console.error('');
  process.exit(1);
}