ooxmlParser.ts•62.3 kB
/**
* 自定义OOXML解析器 - 基于OOXML.md方案
* 完全不依赖外部独立软件,只使用Node.js包(JSZip、xml2js等)
* 可以达到95-97%的格式还原率
*/
const fs = require('fs').promises;
const JSZip = require('jszip');
const xml2js = require('xml2js');
import path from 'path';
interface StyleDefinition {
type: string;
name: string;
basedOn: string | null;
css: Record<string, string>;
}
interface NumberingLevel {
start: string;
numFmt: string;
text: string;
indent: Record<string, string>;
}
interface NumberingDefinition {
[level: string]: NumberingLevel;
}
interface RunNode {
type: 'run' | 'break';
text: string;
style: Record<string, string>;
}
interface ImageNode {
type: 'image';
src: string;
alt: string;
style: Record<string, string>;
}
interface HyperlinkNode {
type: 'hyperlink';
href: string;
children: (RunNode | ImageNode)[];
}
type ChildNode = RunNode | ImageNode | HyperlinkNode;
interface ParagraphNode {
type: 'paragraph';
style: Record<string, string>;
children: ChildNode[];
numbering: { numId: string; level: string } | null;
}
interface TableCellNode {
type: 'tableCell';
style: Record<string, string>;
children: ParagraphNode[];
colspan: number;
rowspan: number;
}
interface TableRowNode {
type: 'tableRow';
style: Record<string, string>;
cells: TableCellNode[];
}
interface TableNode {
type: 'table';
style: Record<string, string>;
rows: TableRowNode[];
}
class OOXMLParser {
private parser: any;
private styles: Record<string, StyleDefinition>;
private numbering: Record<string, NumberingDefinition>;
private relationships: Record<string, string>;
private themes: Record<string, any>;
private settings: Record<string, any>;
private media: Record<string, any>;
private styleCache: Map<string, any>;
private numberingCache: Map<string, string>;
private outputDir: string;
private cacheDir: string;
private imageCache: Map<string, string>; // 图片缓存映射
// 移除了listCounters,现在使用HTML原生列表功能
private tempImageId = 0;
constructor() {
this.parser = new xml2js.Parser({
explicitArray: false,
mergeAttrs: true,
normalize: true,
normalizeTags: true
});
// 存储解析后的数据
this.styles = {};
this.numbering = {};
this.relationships = {};
this.themes = {};
this.settings = {};
this.media = {};
// 样式映射缓存
this.styleCache = new Map();
this.numberingCache = new Map();
this.imageCache = new Map();
// 移除了listCounters初始化
// 设置缓存目录和输出目录
this.cacheDir = process.env.CACHE_DIR || process.env.OUTPUT_DIR || '';
this.outputDir = process.env.OUTPUT_DIR || '';
}
/**
* 主解析入口
*/
async parseDocx(docxPath: string, outputDir?: string): Promise<{ html: string; css: string; media: Record<string, any> }> {
try {
console.log('🚀 开始自定义OOXML解析器转换...');
// 设置输出目录和缓存目录
if (outputDir) {
this.outputDir = outputDir;
} else if (!this.outputDir) {
// 如果没有设置输出目录,使用临时目录
this.outputDir = path.dirname(docxPath);
}
// 设置缓存目录,优先使用环境变量CACHE_DIR
if (!this.cacheDir) {
this.cacheDir = process.env.CACHE_DIR || this.outputDir;
}
console.log('📂 缓存目录:', this.cacheDir);
console.log('📂 输出目录:', this.outputDir);
// 1. 读取并解压DOCX文件
const data = await fs.readFile(docxPath);
const zip = await JSZip.loadAsync(data);
// 2. 并行加载所有必要的XML文件
await Promise.all([
this.loadStyles(zip),
this.loadNumbering(zip),
this.loadRelationships(zip),
this.loadThemes(zip),
this.loadSettings(zip),
this.loadMedia(zip)
]);
// 3. 解析主文档
const documentXml = await this.getFileContent(zip, 'word/document.xml');
if (!documentXml) {
throw new Error('无法找到主文档 word/document.xml');
}
const document = await this.parseXml(documentXml);
// 4. 构建DOM树
const dom = await this.buildDocumentDOM(document);
// 5. 生成HTML
const html = this.renderToHTML(dom);
// 6. 复制图片到输出目录
await this.copyImagesToOutput();
console.log('✅ 自定义OOXML解析器转换完成');
console.log(`📊 处理了 ${this.imageCache.size} 张图片`);
return {
html,
css: this.generateCSS(),
media: this.media
};
} catch (error: any) {
console.error('❌ OOXML解析器转换失败:', error);
throw error;
}
}
async saveImageToCache(filename: string, base64: string, relationId: string): Promise<string> {
// 检查是否已经缓存过这个图片
if (this.imageCache.has(relationId)) {
const cachedPath = this.imageCache.get(relationId)!;
console.log(`📸 使用缓存图片: ${relationId} -> ${cachedPath}`);
return cachedPath;
}
// 创建缓存目录下的 images 子目录
const imagesDir = path.join(this.cacheDir, 'images');
await fs.mkdir(imagesDir, { recursive: true });
// 保证唯一文件名
const ext = filename.split('.').pop() || 'png';
const basename = path.basename(filename, '.' + ext);
const uniq = `${basename}_${Date.now()}_${this.tempImageId++}.${ext}`;
const filepath = path.join(imagesDir, uniq);
// 写入图片到缓存目录
const buffer = Buffer.from(base64, 'base64');
await fs.writeFile(filepath, buffer);
// 生成绝对路径(file:// 协议)
const relativePath = `file://${filepath}`;
// 缓存映射关系
this.imageCache.set(relationId, relativePath);
console.log(`💾 图片已缓存: ${relationId} -> ${filepath}`);
console.log(`🔗 相对路径: ${relativePath}`);
return relativePath;
}
/**
* 复制缓存图片到输出目录
*/
async copyImagesToOutput(): Promise<void> {
// 由于图片路径已经是从输出目录到缓存目录的相对路径,不需要复制
console.log('📸 图片使用相对路径,无需复制');
return;
}
// 保留原有的复制逻辑作为备用
async copyImagesToOutputOld(): Promise<void> {
if (this.cacheDir === this.outputDir || this.imageCache.size === 0) {
console.log('📸 图片已在正确位置或无图片需要复制');
return;
}
console.log(`📸 开始复制 ${this.imageCache.size} 张图片到输出目录...`);
// 创建输出目录下的images文件夹
const outputImagesDir = path.join(this.outputDir, 'images');
await fs.mkdir(outputImagesDir, { recursive: true });
for (const [relationId, relativePath] of this.imageCache) {
try {
const sourcePath = path.join(this.cacheDir, relativePath);
const targetPath = path.join(this.outputDir, relativePath);
// 检查源文件是否存在
try {
await fs.access(sourcePath);
} catch {
console.log(`⚠️ 源图片不存在: ${sourcePath}`);
continue;
}
// 复制文件
await fs.copyFile(sourcePath, targetPath);
console.log(`✅ 图片已复制: ${relativePath}`);
} catch (error) {
console.error(`❌ 复制图片失败 ${relativePath}:`, error);
}
}
console.log('📸 图片复制完成');
}
/**
* 加载样式定义
*/
async loadStyles(zip: any): Promise<void> {
try {
const stylesXml = await this.getFileContent(zip, 'word/styles.xml');
if (!stylesXml) {
console.log('⚠️ 未找到样式文件,使用默认样式');
return;
}
const result = await this.parseXml(stylesXml);
if (result && result['w:styles']) {
const styles = result['w:styles'];
// 解析文档默认样式
if (styles['w:docdefaults']) {
this.styles.defaults = this.parseDocDefaults(styles['w:docdefaults']);
}
// 解析所有样式定义
const styleList = Array.isArray(styles['w:style'])
? styles['w:style']
: [styles['w:style']].filter(Boolean);
styleList.forEach(style => {
if (style && style['w:styleid']) {
const styleId = style['w:styleid'];
this.styles[styleId] = this.parseStyleDefinition(style);
}
});
console.log(`📊 解析了 ${Object.keys(this.styles).length} 个样式`);
}
} catch (e: any) {
console.log('⚠️ 样式文件解析失败,使用默认样式:', e.message);
}
}
/**
* 解析文档默认样式
*/
parseDocDefaults(docDefaults: any): StyleDefinition {
const def: StyleDefinition = {
type: 'default',
name: 'Document Defaults',
basedOn: null,
css: {}
};
// 解析默认段落属性
if (docDefaults['w:pprdefault'] && docDefaults['w:pprdefault']['w:ppr']) {
Object.assign(def.css, this.parseParagraphProperties(docDefaults['w:pprdefault']['w:ppr']));
}
// 解析默认字符属性
if (docDefaults['w:rprdefault'] && docDefaults['w:rprdefault']['w:rpr']) {
Object.assign(def.css, this.parseRunProperties(docDefaults['w:rprdefault']['w:rpr']));
}
return def;
}
/**
* 解析样式定义
*/
parseStyleDefinition(style: any): StyleDefinition {
const def: StyleDefinition = {
type: style['w:type'] || 'paragraph',
name: style['w:name'] ? style['w:name']['w:val'] : '',
basedOn: style['w:basedon'] ? style['w:basedon']['w:val'] : null,
css: {}
};
// 解析段落属性
if (style['w:ppr']) {
Object.assign(def.css, this.parseParagraphProperties(style['w:ppr']));
}
// 解析字符属性
if (style['w:rpr']) {
Object.assign(def.css, this.parseRunProperties(style['w:rpr']));
}
return def;
}
/**
* 解析段落属性
*/
parseParagraphProperties(pPr: any): Record<string, string> {
const css: Record<string, string> = {};
// 对齐方式
if (pPr['w:jc']) {
const align = pPr['w:jc']['w:val'];
css['text-align'] = this.mapAlignment(align);
}
// 缩进
if (pPr['w:ind']) {
const ind = pPr['w:ind'];
if (ind['w:left']) {
css['margin-left'] = this.twipToPixel(ind['w:left']) + 'px';
}
if (ind['w:right']) {
css['margin-right'] = this.twipToPixel(ind['w:right']) + 'px';
}
if (ind['w:firstline']) {
css['text-indent'] = this.twipToPixel(ind['w:firstline']) + 'px';
}
}
// 行距 - 优化行距计算
if (pPr['w:spacing']) {
const spacing = pPr['w:spacing'];
if (spacing['w:line']) {
const lineValue = parseInt(spacing['w:line']);
// 改善行距计算,使其更接近Word的显示效果
if (spacing['w:lineRule'] === 'exact') {
css['line-height'] = this.twipToPixel(lineValue) + 'px';
} else {
// 自动行距或最小行距
css['line-height'] = Math.max(1.0, lineValue / 240) + '';
}
}
if (spacing['w:before']) {
css['margin-top'] = Math.max(0, this.twipToPixel(spacing['w:before'])) + 'px';
}
if (spacing['w:after']) {
css['margin-bottom'] = Math.max(0, this.twipToPixel(spacing['w:after'])) + 'px';
}
}
// 边框
if (pPr['w:pbdr']) {
css.border = this.parseBorders(pPr['w:pbdr']);
}
// 背景色
if (pPr['w:shd']) {
const shd = pPr['w:shd'];
const fill = shd['w:fill'] || shd.fill;
if (fill && fill !== 'auto' && fill !== 'auto') {
// 确保颜色值是有效的十六进制
const cleanColor = fill.replace('#', '');
if (/^[0-9A-Fa-f]{6}$/.test(cleanColor)) {
css['background-color'] = '#' + cleanColor;
}
}
}
// 处理运行属性中的字体大小
if (pPr['w:rPr']) {
const runProps = this.parseRunProperties(pPr['w:rPr']);
Object.assign(css, runProps);
}
return css;
}
/**
* 解析字符属性
*/
parseRunProperties(rPr: any): Record<string, string> {
const css: Record<string, string> = {};
// 字体
if (rPr['w:rfonts']) {
const fonts = rPr['w:rfonts'];
const fontList: string[] = [];
// 优先使用ASCII字体,然后是东亚字体
if (fonts['w:ascii']) fontList.push(`"${fonts['w:ascii']}"`);
if (fonts['w:eastAsia'] && fonts['w:eastAsia'] !== fonts['w:ascii']) {
fontList.push(`"${fonts['w:eastAsia']}"`);
}
if (fonts['w:hAnsi'] && fonts['w:hAnsi'] !== fonts['w:ascii']) {
fontList.push(`"${fonts['w:hAnsi']}"`);
}
// 添加默认字体
fontList.push('"Microsoft YaHei"', '"SimSun"', 'sans-serif');
if (fontList.length > 0) {
css['font-family'] = fontList.join(', ');
}
}
// 字号 - 修复字号解析
if (rPr['w:sz']) {
const size = rPr['w:sz']['w:val'] || rPr['w:sz'];
if (size) {
const sizeNum = parseInt(size);
if (!isNaN(sizeNum) && sizeNum > 0) {
css['font-size'] = (sizeNum / 2) + 'pt';
}
}
}
// 东亚字体字号
if (rPr['w:szCs']) {
const size = rPr['w:szCs']['w:val'] || rPr['w:szCs'];
if (size && !css['font-size']) {
const sizeNum = parseInt(size);
if (!isNaN(sizeNum) && sizeNum > 0) {
css['font-size'] = (sizeNum / 2) + 'pt';
}
}
}
// 加粗 - 修复逻辑
if (rPr['w:b']) {
const boldVal = rPr['w:b']['w:val'];
// 如果没有val属性,或者val为true/1,则为粗体
if (boldVal === undefined || boldVal === 'true' || boldVal === '1' || boldVal === true) {
css['font-weight'] = 'bold';
}
}
// 东亚字体加粗
if (rPr['w:bCs']) {
const boldVal = rPr['w:bCs']['w:val'];
if (boldVal === undefined || boldVal === 'true' || boldVal === '1' || boldVal === true) {
css['font-weight'] = 'bold';
}
}
// 斜体
if (rPr['w:i']) {
const v = rPr['w:i']['w:val'];
if (typeof v === 'undefined' || v === 'true' || v === '1') {
css['font-style'] = 'italic';
}
}
// 下划线
if (rPr['w:u']) {
const uType = rPr['w:u']['w:val'] || 'single';
if (uType !== 'none') {
css['text-decoration'] = 'underline';
if (uType === 'double') {
css['text-decoration-style'] = 'double';
}
}
}
// 删除线 - 修复逻辑,只有明确设置为true或存在且没有val属性时才添加删除线
if (rPr['w:strike']) {
const strikeVal = rPr['w:strike']['w:val'];
if (strikeVal === undefined || strikeVal === 'true' || strikeVal === '1') {
if (css['text-decoration']) {
css['text-decoration'] += ' line-through';
} else {
css['text-decoration'] = 'line-through';
}
}
}
// 颜色
if (rPr['w:color']) {
const color = rPr['w:color']['w:val'] || rPr['w:color'];
if (color && color !== 'auto' && color !== 'auto') {
// 确保颜色值是有效的十六进制
const cleanColor = color.replace('#', '');
if (/^[0-9A-Fa-f]{6}$/.test(cleanColor)) {
css['color'] = '#' + cleanColor;
}
}
}
// 高亮
if (rPr['w:highlight']) {
const highlight = rPr['w:highlight']['w:val'] || rPr['w:highlight'];
if (highlight) {
css['background-color'] = this.mapHighlightColor(highlight);
}
}
// 背景色(shading)
if (rPr['w:shd']) {
const shd = rPr['w:shd'];
const fill = shd['w:fill'] || shd.fill;
if (fill && fill !== 'auto') {
const cleanColor = fill.replace('#', '');
if (/^[0-9A-Fa-f]{6}$/.test(cleanColor)) {
css['background-color'] = '#' + cleanColor;
}
}
}
// 上下标
if (rPr['w:vertAlign']) {
const align = rPr['w:vertAlign']['w:val'];
if (align === 'superscript') {
css['vertical-align'] = 'super';
css['font-size'] = '0.8em';
} else if (align === 'subscript') {
css['vertical-align'] = 'sub';
css['font-size'] = '0.8em';
}
}
return css;
}
/**
* 解析边框
*/
parseBorders(borders: any): string {
const borderStyles: string[] = [];
['top', 'bottom', 'left', 'right'].forEach(side => {
const key = `w:${side}`;
if (borders[key]) {
const border = borders[key];
const width = this.eighthPointToPixel(border['w:sz']) || 1;
const color = border['w:color'] || '000000';
const style = this.mapBorderStyle(border['w:val']) || 'solid';
borderStyles.push(`border-${side}: ${width}px ${style} #${color}`);
}
});
return borderStyles.join('; ');
}
/**
* 加载编号/列表定义
*/
async loadNumbering(zip: any): Promise<void> {
try {
const numberingXml = await this.getFileContent(zip, 'word/numbering.xml');
if (!numberingXml) {
console.log('⚠️ 未找到编号文件');
return;
}
const result = await this.parseXml(numberingXml);
// 尝试不同的属性名格式
let numbering = null;
if (result && result['w:numbering']) {
numbering = result['w:numbering'];
} else if (result && result['numbering']) {
numbering = result['numbering'];
}
if (numbering) {
// 解析抽象编号定义
let abstractNums: any = numbering['w:abstractnum'] || numbering['abstractnum'];
if (abstractNums) {
abstractNums = Array.isArray(abstractNums) ? abstractNums : [abstractNums].filter(Boolean);
abstractNums.forEach(abstractNum => {
const id = abstractNum['w:abstractNumId'] || abstractNum['w:abstractnumid'] || abstractNum['abstractnumid'] || abstractNum['abstractNumId'];
if (id) {
this.numbering[id] = this.parseAbstractNumbering(abstractNum);
}
});
}
// 解析编号实例
let nums: any = numbering['w:num'] || numbering['num'];
if (nums) {
nums = Array.isArray(nums) ? nums : [nums].filter(Boolean);
nums.forEach(num => {
const numId = num['w:numId'] || num['w:numid'] || num['numid'] || num['numId'];
const abstractNumId = num['w:abstractNumId'] || num['w:abstractnumid'] || num['abstractnumid'] || num['abstractNumId'];
const abstractId = abstractNumId ? (abstractNumId['w:val'] || abstractNumId['val'] || abstractNumId) : null;
if (numId && abstractId) {
this.numberingCache.set(numId, abstractId);
}
});
}
console.log(`📊 解析了 ${Object.keys(this.numbering).length} 个编号定义`);
}
} catch (e: any) {
console.log('⚠️ 编号文件解析失败:', e.message);
}
}
/**
* 解析抽象编号定义
*/
parseAbstractNumbering(abstractNum: any): NumberingDefinition {
const levels: NumberingDefinition = {};
if (abstractNum['w:lvl']) {
const lvls = Array.isArray(abstractNum['w:lvl'])
? abstractNum['w:lvl']
: [abstractNum['w:lvl']].filter(Boolean);
lvls.forEach(lvl => {
const level = lvl['w:ilvl'] || '0';
levels[level] = {
start: lvl['w:start'] ? lvl['w:start']['w:val'] : '1',
numFmt: lvl['w:numfmt'] ? lvl['w:numfmt']['w:val'] : 'decimal',
text: lvl['w:lvltext'] ? lvl['w:lvltext']['w:val'] : '',
indent: lvl['w:ppr'] ? this.parseParagraphProperties(lvl['w:ppr']) : {}
};
});
}
return levels;
}
/**
* 加载关系定义
*/
async loadRelationships(zip: any): Promise<void> {
try {
const relsXml = await this.getFileContent(zip, 'word/_rels/document.xml.rels');
if (!relsXml) {
console.log('⚠️ 未找到关系文件');
return;
}
const result = await this.parseXml(relsXml);
// 尝试不同的属性名格式(大小写)
let relationshipArray = null;
if (result && result['Relationships'] && result['Relationships']['Relationship']) {
relationshipArray = result['Relationships']['Relationship'];
} else if (result && result['relationships'] && result['relationships']['relationship']) {
relationshipArray = result['relationships']['relationship'];
}
if (relationshipArray) {
const relationships = Array.isArray(relationshipArray)
? relationshipArray
: [relationshipArray].filter(Boolean);
relationships.forEach(rel => {
if (rel['Id'] && rel['Target']) {
this.relationships[rel['Id']] = rel['Target'];
}
});
console.log(`📊 解析了 ${Object.keys(this.relationships).length} 个关系`);
}
} catch (e: any) {
console.log('⚠️ 关系文件解析失败:', e.message);
}
}
/**
* 加载主题定义
*/
async loadThemes(zip: any): Promise<void> {
try {
const themeXml = await this.getFileContent(zip, 'word/theme/theme1.xml');
if (!themeXml) {
console.log('⚠️ 未找到主题文件');
return;
}
const result = await this.parseXml(themeXml);
this.themes = result || {};
console.log('📊 主题文件解析完成');
} catch (e: any) {
console.log('⚠️ 主题文件解析失败:', e.message);
}
}
/**
* 加载设置定义
*/
async loadSettings(zip: any): Promise<void> {
try {
const settingsXml = await this.getFileContent(zip, 'word/settings.xml');
if (!settingsXml) {
console.log('⚠️ 未找到设置文件');
return;
}
const result = await this.parseXml(settingsXml);
this.settings = result || {};
console.log('📊 设置文件解析完成');
} catch (e: any) {
console.log('⚠️ 设置文件解析失败:', e.message);
}
}
/**
* 加载媒体文件
*/
async loadMedia(zip: any): Promise<void> {
try {
const mediaFiles = Object.keys(zip.files).filter(filename =>
filename.startsWith('word/media/')
);
for (const filename of mediaFiles) {
const file = zip.files[filename];
if (!file.dir) {
const data = await file.async('base64');
this.media[filename] = data;
}
}
console.log(`📊 解析了 ${Object.keys(this.media).length} 个媒体文件`);
} catch (e: any) {
console.log('⚠️ 媒体文件解析失败:', e.message);
}
}
/**
* 构建文档DOM
*/
async buildDocumentDOM(document: any): Promise<any[]> {
const body = document['w:document']['w:body'];
const dom: any[] = [];
// 如果body是数组,按顺序处理所有元素
if (Array.isArray(body)) {
for (const element of body) {
await this.processBodyElement(element, dom);
}
} else {
// 如果body是对象,需要按照XML中的顺序处理元素
// 先收集所有元素并按照它们在XML中的顺序排序
const elements: Array<{type: string, data: any, order: number}> = [];
let order = 0;
// 处理段落
if (body['w:p']) {
const paragraphs = Array.isArray(body['w:p']) ? body['w:p'] : [body['w:p']].filter(Boolean);
for (const p of paragraphs) {
elements.push({type: 'paragraph', data: p, order: order++});
}
}
// 处理表格
if (body['w:tbl']) {
const tables = Array.isArray(body['w:tbl']) ? body['w:tbl'] : [body['w:tbl']].filter(Boolean);
for (const tbl of tables) {
elements.push({type: 'table', data: tbl, order: order++});
}
}
// 按顺序处理元素
elements.sort((a, b) => a.order - b.order);
for (const element of elements) {
if (element.type === 'paragraph') {
const paragraph = await this.buildParagraph(element.data);
if (paragraph) dom.push(paragraph);
} else if (element.type === 'table') {
const table = await this.buildTable(element.data);
if (table) dom.push(table);
}
}
}
return dom;
}
/**
* 处理body中的单个元素
*/
async processBodyElement(element: any, dom: any[]): Promise<void> {
if (element['w:p']) {
const paragraph = await this.buildParagraph(element['w:p']);
if (paragraph) dom.push(paragraph);
} else if (element['w:tbl']) {
const table = await this.buildTable(element['w:tbl']);
if (table) dom.push(table);
}
}
/**
* 构建段落DOM
*/
async buildParagraph(p: any): Promise<ParagraphNode | null> {
if (!p) return null;
const paragraph: ParagraphNode = {
type: 'paragraph',
style: {},
children: [],
numbering: null
};
// 解析段落属性
if (p['w:ppr']) {
paragraph.style = this.parseParagraphProperties(p['w:ppr']);
// 检查是否是列表项
if (p['w:ppr']['w:numpr']) {
paragraph.numbering = this.parseNumberingProperties(p['w:ppr']['w:numpr']);
}
// 检查段落样式引用
if (p['w:ppr']['w:pstyle']) {
const styleId = p['w:ppr']['w:pstyle']['w:val'];
if (this.styles[styleId]) {
paragraph.style = { ...this.styles[styleId].css, ...paragraph.style };
}
}
}
// 解析文本内容
if (p['w:r']) {
const runs = Array.isArray(p['w:r']) ? p['w:r'] : [p['w:r']].filter(Boolean);
for (const r of runs) {
const run = await this.buildRun(r);
if (run) paragraph.children.push(run);
}
}
// 处理特殊元素(如超链接)
if (p['w:hyperlink']) {
const hyperlinks = Array.isArray(p['w:hyperlink']) ? p['w:hyperlink'] : [p['w:hyperlink']].filter(Boolean);
for (const link of hyperlinks) {
const hyperlinkNode = await this.buildHyperlink(link);
if (hyperlinkNode) paragraph.children.push(hyperlinkNode);
}
}
return paragraph;
}
/**
* 解析编号属性
*/
parseNumberingProperties(numPr: any): { numId: string; level: string } | null {
if (!numPr) return null;
const numId = numPr['w:numid'] ? numPr['w:numid']['w:val'] : null;
const level = numPr['w:ilvl'] ? numPr['w:ilvl']['w:val'] : '0';
if (numId) {
return { numId, level };
}
return null;
}
/**
* 构建文本运行DOM
*/
async buildRun(r: any): Promise<RunNode | ImageNode | null> {
if (!r) return null;
const run: RunNode = {
type: 'run',
text: '',
style: {}
};
// 解析运行属性
if (r['w:rpr']) {
run.style = this.parseRunProperties(r['w:rpr']);
// 检查样式引用
if (r['w:rpr']['w:rstyle']) {
const styleId = r['w:rpr']['w:rstyle']['w:val'];
if (this.styles[styleId]) {
run.style = { ...this.styles[styleId].css, ...run.style };
}
}
}
// 提取文本
if (r['w:t']) {
if (typeof r['w:t'] === 'string') {
run.text = r['w:t'];
} else if (r['w:t']['_']) {
run.text = r['w:t']['_'];
} else if (r['w:t']['#text']) {
run.text = r['w:t']['#text'];
}
}
// 处理制表符
if (r['w:tab']) {
run.text += '\t';
}
// 处理换行
if (r['w:br']) {
run.type = 'break';
}
// 处理图片
if (r['w:drawing'] || r['w:pict']) {
return await this.buildImage(r);
}
return run;
}
/**
* 构建图片DOM
*/
async buildImage(r: any): Promise<ImageNode | null> {
const image: ImageNode = {
type: 'image',
src: '',
alt: '',
style: {}
};
console.log(`🖼️ 开始处理图片节点:`, JSON.stringify(r, null, 2));
// 处理 w:pict/w:imageData
if (r['w:pict'] && r['w:pict']['v:imagedata']) {
const imgData = r['w:pict']['v:imagedata'];
const relId = imgData['r:id'];
console.log(`📸 处理 w:pict 图片,关系ID: ${relId}`);
image.src = await this.getImageData(relId);
image.alt = imgData['o:title'] || imgData['alt'] || '';
}
// 从drawing中提取图片
if (r['w:drawing']) {
const drawing = r['w:drawing'];
const inline = drawing['wp:inline'] || drawing['wp:anchor'];
console.log(`🎨 处理 w:drawing 图片`);
if (inline) {
// 获取尺寸
const extent = inline['wp:extent'];
if (extent) {
const width = this.emuToPixel(extent['cx']);
const height = this.emuToPixel(extent['cy']);
image.style['width'] = width + 'px';
image.style['height'] = height + 'px';
console.log(`📏 图片尺寸: ${width}x${height}px`);
}
// 获取图片数据
const blip = this.findBlip(inline);
if (blip && blip['r:embed']) {
const relationId = blip['r:embed'];
console.log(`🔗 找到图片关系ID: ${relationId}`);
image.src = await this.getImageData(relationId);
} else {
console.log(`⚠️ 未找到图片blip或关系ID`);
}
}
}
console.log(`🖼️ 图片处理结果:`, {
src: image.src,
alt: image.alt,
style: image.style
});
// 如果没有找到图片源,返回null
if (!image.src) {
console.log(`❌ 图片源为空,跳过此图片`);
return null;
}
return image;
}
/**
* 查找图片blip元素
*/
findBlip(element: any): any {
if (!element) return null;
// 递归查找blip元素
if (element['a:blip']) {
return element['a:blip'];
}
for (const key in element) {
if (typeof element[key] === 'object') {
const result = this.findBlip(element[key]);
if (result) return result;
}
}
return null;
}
/**
* 获取图片数据 - 使用缓存机制
*/
async getImageData(relationId: string): Promise<string> {
const target = this.relationships[relationId];
if (!target) {
console.log(`⚠️ 未找到关系ID: ${relationId}`);
return '';
}
console.log(`🖼️ 处理图片关系: ${relationId} -> ${target}`);
// 多路径尝试
const possiblePaths = [
`word/${target}`,
target,
`word/media/${target.split('/').pop()}`,
`word/media/${target}`
];
console.log(`🔍 尝试路径:`, possiblePaths);
console.log(`📁 可用媒体文件:`, Object.keys(this.media));
for (const mediaPath of possiblePaths) {
const mediaData = this.media[mediaPath];
if (mediaData) {
console.log(`✅ 找到图片数据: ${mediaPath}`);
try {
// 获取图片文件名
const filename = path.basename(target);
// 保存图片到缓存并返回相对路径
const storedPath = await this.saveImageToCache(filename, mediaData, relationId);
return storedPath;
} catch (error) {
console.error(`❌ 保存图片失败:`, error);
return '';
}
}
}
console.log(`❌ 未找到图片数据: ${target}`);
return '';
}
/**
* 获取图片MIME类型
*/
getImageMimeType(filename: string): string {
const ext = filename.split('.').pop()?.toLowerCase();
const mimeTypes: Record<string, string> = {
'png': 'image/png',
'jpg': 'image/jpeg',
'jpeg': 'image/jpeg',
'gif': 'image/gif',
'bmp': 'image/bmp',
'svg': 'image/svg+xml'
};
return mimeTypes[ext || ''] || 'image/png';
}
/**
* 构建超链接DOM
*/
async buildHyperlink(link: any): Promise<HyperlinkNode> {
const hyperlink: HyperlinkNode = {
type: 'hyperlink',
href: '',
children: []
};
// 获取链接地址
if (link['r:id']) {
const relationId = link['r:id'];
hyperlink.href = this.relationships[relationId] || '';
}
// 处理链接内容
if (link['w:r']) {
const runs = Array.isArray(link['w:r']) ? link['w:r'] : [link['w:r']].filter(Boolean);
for (const r of runs) {
const run = await this.buildRun(r);
if (run) hyperlink.children.push(run);
}
}
return hyperlink;
}
/**
* 构建表格DOM
*/
async buildTable(tbl: any): Promise<TableNode | null> {
if (!tbl) return null;
const table: TableNode = {
type: 'table',
style: {},
rows: []
};
// 解析表格属性
if (tbl['w:tblpr']) {
table.style = this.parseTableProperties(tbl['w:tblpr']);
}
// 解析表格行
if (tbl['w:tr']) {
const rows = Array.isArray(tbl['w:tr']) ? tbl['w:tr'] : [tbl['w:tr']].filter(Boolean);
for (const tr of rows) {
const row = await this.buildTableRow(tr);
if (row) table.rows.push(row);
}
}
return table;
}
/**
* 解析表格属性
*/
parseTableProperties(tblPr: any): Record<string, string> {
const css: Record<string, string> = {
'border-collapse': 'collapse',
'width': '100%'
};
// 表格宽度
if (tblPr['w:tblw']) {
const width = tblPr['w:tblw'];
if (width['w:type'] === 'pct') {
css['width'] = (parseInt(width['w:w']) / 50) + '%';
} else if (width['w:type'] === 'dxa') {
css['width'] = this.twipToPixel(width['w:w']) + 'px';
}
}
// 表格边框
if (tblPr['w:tblborders']) {
const borders = tblPr['w:tblborders'];
css['border'] = this.parseTableBorders(borders);
}
// 表格对齐
if (tblPr['w:jc']) {
const align = tblPr['w:jc']['w:val'];
if (align === 'center') {
css['margin'] = '0 auto';
} else if (align === 'right') {
css['margin-left'] = 'auto';
}
}
return css;
}
/**
* 解析表格边框
*/
parseTableBorders(borders: any): string {
const borderStyles: string[] = [];
['top', 'bottom', 'left', 'right'].forEach(side => {
const key = `w:${side}`;
if (borders[key]) {
const border = borders[key];
const width = this.eighthPointToPixel(border['w:sz']) || 1;
const color = border['w:color'] || '000000';
const style = this.mapBorderStyle(border['w:val']) || 'solid';
borderStyles.push(`border-${side}: ${width}px ${style} #${color}`);
}
});
return borderStyles.join('; ');
}
/**
* 构建表格行DOM
*/
async buildTableRow(tr: any): Promise<TableRowNode | null> {
if (!tr) return null;
const row: TableRowNode = {
type: 'tableRow',
style: {},
cells: []
};
// 解析行属性
if (tr['w:trpr']) {
row.style = this.parseTableRowProperties(tr['w:trpr']);
}
// 解析单元格
if (tr['w:tc']) {
const cells = Array.isArray(tr['w:tc']) ? tr['w:tc'] : [tr['w:tc']].filter(Boolean);
for (const tc of cells) {
const cell = await this.buildTableCell(tc);
if (cell) row.cells.push(cell);
}
}
return row;
}
/**
* 解析表格行属性
*/
parseTableRowProperties(trPr: any): Record<string, string> {
const css: Record<string, string> = {};
// 行高
if (trPr['w:trheight']) {
const height = trPr['w:trheight'];
if (height['w:val']) {
css['height'] = this.twipToPixel(height['w:val']) + 'px';
}
}
return css;
}
/**
* 构建表格单元格DOM
*/
async buildTableCell(tc: any): Promise<TableCellNode | null> {
if (!tc) return null;
const cell: TableCellNode = {
type: 'tableCell',
style: {},
children: [],
colspan: 1,
rowspan: 1
};
// 解析单元格属性
if (tc['w:tcpr']) {
const tcPr = tc['w:tcpr'];
// 合并单元格
if (tcPr['w:gridspan']) {
cell.colspan = parseInt(tcPr['w:gridspan']['w:val']);
}
if (tcPr['w:vmerge']) {
if (!tcPr['w:vmerge']['w:val']) {
cell.rowspan = 1; // 标记为需要计算
} else {
cell.rowspan = 0; // 被合并的单元格
}
}
// 单元格宽度
if (tcPr['w:tcw']) {
const width = tcPr['w:tcw'];
if (width['w:type'] === 'pct') {
cell.style['width'] = (parseInt(width['w:w']) / 50) + '%';
} else if (width['w:type'] === 'dxa') {
cell.style['width'] = this.twipToPixel(width['w:w']) + 'px';
}
}
// 单元格边框
if (tcPr['w:tcborders']) {
cell.style['border'] = this.parseTableBorders(tcPr['w:tcborders']);
}
// 背景色
if (tcPr['w:shd']) {
const fill = tcPr['w:shd']['w:fill'];
if (fill && fill !== 'auto') {
cell.style['background-color'] = '#' + fill;
}
}
}
// 解析单元格内容
if (tc['w:p']) {
const paragraphs = Array.isArray(tc['w:p']) ? tc['w:p'] : [tc['w:p']].filter(Boolean);
for (const p of paragraphs) {
const paragraph = await this.buildParagraph(p);
if (paragraph) cell.children.push(paragraph);
}
}
return cell;
}
/**
* 渲染DOM为HTML
*/
renderToHTML(dom: any[]): string {
const htmlParts: string[] = [];
const listStack: string[] = []; // 用于跟踪列表嵌套
let lastWasListItem = false;
for (let i = 0; i < dom.length; i++) {
const node = dom[i];
const html = this.renderNode(node, listStack);
if (html) {
// 检测当前是否为列表项
const isCurrentListItem = html.includes('<li ');
// 如果上一个是列表项,当前不是列表项,需要关闭列表
if (lastWasListItem && !isCurrentListItem && listStack.length > 0) {
const listType = listStack.pop();
if (listType) {
htmlParts.push(`</${listType}>`);
}
}
htmlParts.push(html);
lastWasListItem = isCurrentListItem;
}
}
// 关闭所有未关闭的列表
while (listStack.length > 0) {
const listType = listStack.pop();
if (listType) {
htmlParts.push(`</${listType}>`);
}
}
return this.wrapHTML(htmlParts.join('\n'));
}
/**
* 渲染单个节点
*/
renderNode(node: any, listStack: string[] = []): string {
switch (node.type) {
case 'paragraph':
return this.renderParagraph(node, listStack);
case 'table':
return this.renderTable(node);
case 'image':
return this.renderImage(node);
case 'hyperlink':
return this.renderHyperlink(node);
default:
return '';
}
}
/**
* 渲染段落 - 优化版
*/
renderParagraph(paragraph: ParagraphNode, listStack: string[]): string {
const style = this.cssToString(paragraph.style);
let content = this.renderChildren(paragraph.children);
console.log(`📝 渲染段落: "${content.substring(0, 50)}...", 样式: ${style}`);
console.log(`🔍 段落编号信息:`, paragraph.numbering);
// 处理空段落 - 保留空行以改善排版
if (!content || content.trim() === '') {
return '<p style="margin: 12px 0; height: 1.2em;"> </p>';
}
// 1. 优先处理正式编号列表
if (paragraph.numbering) {
console.log(`📋 处理正式编号列表: numId=${paragraph.numbering.numId}, level=${paragraph.numbering.level}`);
return this.renderFormalList(paragraph, content, style, listStack);
}
// 2. 检测手动列表项
const manualListItem = this.detectManualListItem(content);
if (manualListItem) {
console.log(`📋 处理手动列表项: 类型=${manualListItem.listType}, 内容="${manualListItem.content}"`);
return this.renderManualListWithCounter(manualListItem, style, listStack);
}
// 3. 检测是否为标题
const headingLevel = this.detectHeading(paragraph);
if (headingLevel) {
console.log(`📰 渲染标题: 级别=${headingLevel}`);
return `<h${headingLevel} style="${style}">${content}</h${headingLevel}>`;
}
// 4. 检测是否为代码块
if (this.isCodeBlock(paragraph)) {
console.log(`💻 渲染代码块`);
return `<pre><code style="${style}">${this.escapeHtml(content)}</code></pre>`;
}
// 5. 普通段落
console.log(`📄 渲染普通段落`);
const defaultStyle = 'margin: 12px 0; line-height: 1.6; padding: 4px 0;';
const combinedStyle = style ? `${defaultStyle} ${style}` : defaultStyle;
return `<p style="${combinedStyle}">${content}</p>`;
}
/**
* 渲染正式编号列表
*/
private renderFormalList(paragraph: ParagraphNode, content: string, style: string, listStack: string[]): string {
const { numId, level } = paragraph.numbering!;
const listType = this.getListType(numId, level);
const levelNum = parseInt(level);
console.log(`🔢 正式列表: 类型=${listType}, 级别=${levelNum}, 当前栈=${listStack.join(',')}`);
let result = '';
// 关闭多余的嵌套级别
while (listStack.length > levelNum) {
const closingType = listStack.pop();
if (closingType) {
result += `</${closingType}>`;
console.log(`🔚 关闭列表: ${closingType}`);
}
}
// 检查是否需要开启新列表
if (listStack.length === levelNum) {
// 同级别,检查类型是否一致
if (listStack[levelNum - 1] !== listType) {
// 类型不同,关闭旧的,开启新的
const oldType = listStack.pop();
result += `</${oldType}>`;
// 获取起始序号
const startNum = this.getListStartNumber(numId, level);
result += `<${listType} start="${startNum}">`;
listStack.push(listType);
console.log(`🔄 切换列表类型: ${oldType} -> ${listType}`);
}
} else {
// 新的嵌套级别
const startNum = this.getListStartNumber(numId, level);
result += `<${listType} start="${startNum}">`;
listStack.push(listType);
console.log(`🆕 开启新列表: ${listType}`);
}
// 简单地添加列表项,让浏览器处理序号
result += `<li style="${style}">${content}</li>`;
return result;
}
/**
* 渲染手动列表项
*/
private renderManualList(manualListItem: { content: string; listType: 'ul' | 'ol'; level: number }, style: string, listStack: string[]): string {
const { content, listType, level } = manualListItem;
console.log(`✋ 手动列表: 类型=${listType}, 级别=${level}, 当前栈=${listStack.join(',')}`);
let result = '';
// 检查是否需要开启新列表
const needNewList = listStack.length === 0 ||
listStack[listStack.length - 1] !== listType ||
listStack.length <= level;
if (needNewList) {
// 关闭不匹配的列表
while (listStack.length > level) {
const closingType = listStack.pop();
if (closingType) {
result += `</${closingType}>`;
}
}
// 开启新列表
result += `<${listType}>`;
listStack.push(listType);
console.log(`🆕 开启手动列表: ${listType}`);
}
// 添加列表项
result += `<li style="${style}">${content}</li>`;
return result;
}
/**
* 渲染带计数器的手动列表项
*/
private renderManualListWithCounter(manualListItem: { content: string; listType: 'ul' | 'ol'; level: number }, style: string, listStack: string[]): string {
const { content, listType, level } = manualListItem;
console.log(`✋ 手动列表: 类型=${listType}, 级别=${level}, 当前栈=${listStack.join(',')}`);
let result = '';
// 关闭不匹配的列表
while (listStack.length > level) {
const closingType = listStack.pop();
if (closingType) {
result += `</${closingType}>`;
}
}
// 检查是否需要开启新列表
if (listStack.length === 0 || listStack[listStack.length - 1] !== listType || listStack.length <= level) {
result += `<${listType}>`;
listStack.push(listType);
console.log(`🆕 开启手动列表: ${listType}`);
}
// 简单地添加列表项,让浏览器处理序号
result += `<li style="${style}">${content}</li>`;
return result;
}
/**
* 渲染子元素
*/
renderChildren(children: ChildNode[]): string {
return children.map(child => {
if (child.type === 'run' || child.type === 'break') {
return this.renderRun(child);
} else if (child.type === 'hyperlink') {
return this.renderHyperlink(child);
} else if (child.type === 'image') {
return this.renderImage(child);
}
return '';
}).join('');
}
/**
* 渲染文本运行
*/
renderRun(run: RunNode): string {
if (!run.text && run.text !== '0') return '';
const style = this.cssToString(run.style);
let text = this.escapeHtml(run.text);
// 改善空格和换行处理
// 保留连续空格
text = text.replace(/ {2,}/g, match => ' '.repeat(match.length));
// 保留制表符
text = text.replace(/\t/g, ' ');
// 处理换行符
text = text.replace(/\n/g, '<br>');
// 保留行首空格
text = text.replace(/^\s+/gm, match => ' '.repeat(match.length));
if (style) {
return `<span style="${style}">${text}</span>`;
}
return text;
}
/**
* 渲染超链接
*/
renderHyperlink(hyperlink: HyperlinkNode): string {
const content = this.renderChildren(hyperlink.children);
if (hyperlink.href) {
return `<a href="${this.escapeHtml(hyperlink.href)}">${content}</a>`;
}
return content;
}
/**
* 渲染表格
*/
renderTable(table: TableNode): string {
const style = this.cssToString(table.style);
const rows = table.rows.map(row => this.renderTableRow(row)).join('\n');
return `<table style="${style}">\n${rows}\n</table>`;
}
/**
* 渲染表格行
*/
renderTableRow(row: TableRowNode): string {
const style = this.cssToString(row.style);
const cells = row.cells.map(cell => this.renderTableCell(cell)).join('');
return `<tr style="${style}">${cells}</tr>`;
}
/**
* 渲染表格单元格
*/
renderTableCell(cell: TableCellNode): string {
if (cell.rowspan === 0) return ''; // 被合并的单元格不渲染
const style = this.cssToString(cell.style);
const content = cell.children.map(child => this.renderNode(child)).join('');
let attrs = `style="${style}"`;
if (cell.colspan > 1) attrs += ` colspan="${cell.colspan}"`;
if (cell.rowspan > 1) attrs += ` rowspan="${cell.rowspan}"`;
return `<td ${attrs}>${content || ' '}</td>`;
}
/**
* 渲染图片 - 使用相对路径
*/
renderImage(image: ImageNode): string {
if (!image.src) {
console.log('⚠️ 图片源为空,跳过渲染');
return '';
}
const style = this.cssToString(image.style);
const alt = this.escapeHtml(image.alt || '图片');
// 直接使用相对路径,因为图片已经被复制到输出目录
const imageSrc = image.src;
console.log(`🖼️ 渲染图片: src=${imageSrc}, alt=${alt}, style=${style}`);
return `<img src="${imageSrc}" alt="${alt}" style="${style}" loading="lazy" />`;
}
/**
* 检测手动列表项(以项目符号开头的段落)- 增强版
*/
detectManualListItem(content: string): { content: string; listType: 'ul' | 'ol'; level: number } | null {
if (!content || content.trim() === '') return null;
const trimmedContent = content.trim();
console.log(`🔍 检测列表项: "${trimmedContent}"`);
// 检测各种项目符号 - 增强版
const bulletPatterns = [
// 无序列表符号
{ pattern: /^[•●○◆◇■□▪▫⁃‣⁌⁍]\s*(.+)$/, type: 'ul' as const, level: 0 },
{ pattern: /^[·]\s*(.+)$/, type: 'ul' as const, level: 0 },
{ pattern: /^[-*+]\s*(.+)$/, type: 'ul' as const, level: 0 },
// 有序列表符号
{ pattern: /^\d+[.)、]\s*(.+)$/, type: 'ol' as const, level: 0 },
{ pattern: /^[a-zA-Z][.)、]\s*(.+)$/, type: 'ol' as const, level: 0 },
{ pattern: /^[①②③④⑤⑥⑦⑧⑨⑩]\s*(.+)$/, type: 'ol' as const, level: 0 },
{ pattern: /^[一二三四五六七八九十]、\s*(.+)$/, type: 'ol' as const, level: 0 },
// Word 特有格式
{ pattern: /^\s*[•]\s+(.+)$/, type: 'ul' as const, level: 0 },
{ pattern: /^\s*\d+\.\s+(.+)$/, type: 'ol' as const, level: 0 },
// 带缩进的列表项
{ pattern: /^\s{2,}[•●○]\s*(.+)$/, type: 'ul' as const, level: 1 },
{ pattern: /^\s{2,}\d+[.)、]\s*(.+)$/, type: 'ol' as const, level: 1 },
// 考勤时间特有格式
{ pattern: /^工作时间[::]\s*(.+)$/, type: 'ul' as const, level: 0 },
{ pattern: /^午休时间[::]\s*(.+)$/, type: 'ul' as const, level: 0 },
{ pattern: /^因工作安排[,,]\s*(.+)$/, type: 'ul' as const, level: 0 }
];
for (const { pattern, type, level } of bulletPatterns) {
const match = trimmedContent.match(pattern);
if (match) {
console.log(`✅ 匹配到列表项: 类型=${type}, 级别=${level}, 内容="${match[1].trim()}"`);
return {
content: match[1].trim(),
listType: type,
level: level
};
}
}
console.log(`❌ 未匹配到列表项`);
return null;
}
/**
* 检测是否为代码块
*/
isCodeBlock(paragraph: ParagraphNode): boolean {
// 检查是否使用等宽字体
const fontFamily = paragraph.style['font-family'];
if (fontFamily && /Consolas|Courier|Monaco|monospace/i.test(fontFamily)) {
return true;
}
// 检查是否有代码块背景色
const bgColor = paragraph.style['background-color'];
if (bgColor && /^#[fF][0-9a-fA-F]{5}$/.test(bgColor)) {
return true;
}
return false;
}
/**
* 检测标题级别 - 增强版
*/
detectHeading(paragraph: ParagraphNode): number | null {
// 检查是否有标题样式
const hasHeadingStyle = this.hasHeadingStyle(paragraph);
if (hasHeadingStyle) {
return hasHeadingStyle;
}
// 根据字体大小和粗体判断
const fontSize = paragraph.style['font-size'];
const isBold = paragraph.style['font-weight'] === 'bold';
if (fontSize) {
const size = parseFloat(fontSize);
// 调整标题判断逻辑
if (size >= 22 && isBold) return 1;
if (size >= 18 && isBold) return 2;
if (size >= 16 && isBold) return 3;
if (size >= 14 && isBold) return 4;
if (size >= 12 && isBold) return 5;
}
// 检查子元素中是否有大字体粗体文本
for (const child of paragraph.children) {
if (child.type === 'run') {
const childFontSize = child.style['font-size'];
const childIsBold = child.style['font-weight'] === 'bold';
if (childFontSize && childIsBold) {
const size = parseFloat(childFontSize);
if (size >= 22) return 1;
if (size >= 18) return 2;
if (size >= 16) return 3;
if (size >= 14) return 4;
if (size >= 12) return 5;
}
}
}
return null;
}
/**
* 检查是否有标题样式
*/
hasHeadingStyle(paragraph: ParagraphNode): number | null {
// 这里可以根据样式名称判断是否为标题
// 例如检查样式ID是否包含"Heading"等
return null;
}
/**
* 生成CSS样式表
*/
generateCSS(): string {
const css: string[] = [];
// 基础样式 - 优化排版
css.push(`
body {
font-family: 'Microsoft YaHei', 'SimHei', 'Calibri', 'Arial', sans-serif;
line-height: 1.5;
margin: 20px;
padding: 0;
word-wrap: break-word;
word-break: break-all;
}
p {
margin: 12px 0;
line-height: 1.6;
text-align: justify;
word-spacing: 0.1em;
padding: 4px 0;
}
h1, h2, h3, h4, h5, h6 {
margin: 20px 0 12px 0;
line-height: 1.4;
font-weight: bold;
padding: 6px 0;
}
table {
border-collapse: collapse;
width: 100%;
margin: 10px 0;
}
td, th {
padding: 8px 12px;
border: 1px solid #ddd;
vertical-align: top;
word-wrap: break-word;
}
pre {
background-color: #f8f9fa;
padding: 12px;
border-radius: 4px;
overflow-x: auto;
margin: 10px 0;
border-left: 4px solid #007acc;
}
code {
font-family: 'Consolas', 'Monaco', 'Courier New', monospace;
background-color: #f1f3f4;
padding: 2px 4px;
border-radius: 3px;
}
ul, ol {
margin: 8px 0;
padding-left: 24px;
list-style-position: outside;
display: block;
}
ul {
list-style-type: disc;
}
ol {
list-style-type: decimal;
}
/* 确保嵌套列表正确显示 */
ol ol {
list-style-type: lower-alpha;
}
ol ol ol {
list-style-type: lower-roman;
}
li {
margin: 4px 0;
line-height: 1.6;
padding: 2px 0;
display: list-item;
word-wrap: break-word;
overflow-wrap: break-word;
}
li p {
margin: 0;
padding: 0;
display: inline;
}
blockquote {
margin: 16px 0;
padding: 12px 20px;
border-left: 4px solid #ddd;
background-color: #f9f9f9;
font-style: italic;
line-height: 1.6;
}
`);
// 添加文档中定义的样式
for (const [styleId, styleDef] of Object.entries(this.styles)) {
if (styleDef.css && Object.keys(styleDef.css).length > 0) {
const selector = `.${styleId}`;
const rules = this.cssToString(styleDef.css);
css.push(`${selector} { ${rules} }`);
}
}
return css.join('\n');
}
/**
* 包装HTML
*/
wrapHTML(content: string): string {
return `<!DOCTYPE html>
<html lang="zh-CN">
<head>
<meta charset="UTF-8">
<meta name="viewport" content="width=device-width, initial-scale=1.0">
<title>Converted Document</title>
<style>${this.generateCSS()}</style>
</head>
<body>
${content}
</body>
</html>`;
}
// === 工具方法 ===
/**
* 获取ZIP文件内容
*/
async getFileContent(zip: any, path: string): Promise<string | null> {
if (zip.files[path]) {
return zip.file(path).async('string');
}
return null;
}
/**
* 解析XML
*/
parseXml(xml: string): Promise<any> {
return new Promise((resolve, reject) => {
this.parser.parseString(xml, (err: any, result: any) => {
if (err) reject(err);
else resolve(result);
});
});
}
/**
* CSS对象转字符串
*/
cssToString(cssObj: Record<string, string>): string {
if (!cssObj || Object.keys(cssObj).length === 0) return '';
return Object.entries(cssObj)
.filter(([key, value]) => value != null && value !== '')
.map(([key, value]) => `${key}: ${value}`)
.join('; ');
}
/**
* Twip转像素
*/
twipToPixel(twip: string | number): number {
return Math.round(parseInt(String(twip)) / 20);
}
/**
* 八分之一点转像素
*/
eighthPointToPixel(eighthPoint: string | number): number {
return Math.round(parseInt(String(eighthPoint)) / 8);
}
/**
* EMU转像素
*/
emuToPixel(emu: string | number): number {
return Math.round(parseInt(String(emu)) / 9525);
}
/**
* 映射对齐方式
*/
mapAlignment(align: string): string {
const map: Record<string, string> = {
'left': 'left',
'center': 'center',
'right': 'right',
'both': 'justify',
'justify': 'justify'
};
return map[align] || 'left';
}
/**
* 映射边框样式
*/
mapBorderStyle(style: string): string {
const map: Record<string, string> = {
'single': 'solid',
'double': 'double',
'dotted': 'dotted',
'dashed': 'dashed',
'none': 'none'
};
return map[style] || 'solid';
}
/**
* 映射高亮颜色
*/
mapHighlightColor(color: string): string {
const map: Record<string, string> = {
'yellow': '#FFFF00',
'green': '#00FF00',
'cyan': '#00FFFF',
'magenta': '#FF00FF',
'blue': '#0000FF',
'red': '#FF0000',
'darkBlue': '#000080',
'darkCyan': '#008080',
'darkGreen': '#008000',
'darkMagenta': '#800080',
'darkRed': '#800000',
'darkYellow': '#808000',
'darkGray': '#808080',
'lightGray': '#C0C0C0',
'black': '#000000'
};
return map[color] || '#FFFF00';
}
/**
* 转义HTML特殊字符
*/
escapeHtml(text: string): string {
const map: Record<string, string> = {
'&': '&',
'<': '<',
'>': '>',
'"': '"',
"'": '''
};
return text.replace(/[&<>"']/g, char => map[char]);
}
/**
* 获取列表起始序号
*/
getListStartNumber(numId: string, level: string): number {
const abstractId = this.numberingCache.get(numId);
if (abstractId && this.numbering[abstractId]) {
const levelDef = this.numbering[abstractId][level];
if (levelDef && levelDef.start) {
const startNum = parseInt(levelDef.start);
return isNaN(startNum) ? 1 : startNum;
}
}
return 1;
}
// 移除了formatListNumber和toRoman方法,现在使用HTML原生列表功能
/**
* 获取列表类型 - 增强版
*/
getListType(numId: string, level: string): string {
console.log(`🔍 获取列表类型: numId=${numId}, level=${level}`);
const abstractId = this.numberingCache.get(numId);
console.log(`📋 抽象ID映射: ${numId} -> ${abstractId}`);
if (abstractId && this.numbering[abstractId]) {
const levelDef = this.numbering[abstractId][level];
console.log(`📊 级别定义:`, levelDef);
if (levelDef) {
const numFmt = levelDef.numFmt;
const text = levelDef.text;
console.log(`🎯 编号格式: numFmt=${numFmt}, text="${text}"`);
// 扩展项目符号类型检测
const isBullet = numFmt === 'bullet' ||
numFmt === 'disc' ||
numFmt === 'circle' ||
numFmt === 'square' ||
text === '·' ||
text === '•' ||
text === '○' ||
text === '■' ||
text === '' ||
text === '%1.' ||
text.includes('●') ||
text.includes('◆') ||
text.includes('▪') ||
text.includes('◇') ||
text.includes('□');
const listType = isBullet ? 'ul' : 'ol';
console.log(`✅ 确定列表类型: ${listType}`);
return listType;
}
} else {
console.log(`⚠️ 未找到编号定义,使用默认无序列表`);
return 'ul';
}
console.log(`📝 默认返回有序列表`);
return 'ol';
}
}
// 导出转换函数
export async function convertDocxToHtmlWithOOXML(inputPath: string, options: any = {}): Promise<any> {
try {
const parser = new OOXMLParser();
console.log('🚀 开始自定义OOXML解析器转换...');
console.log('📁 输入文件:', inputPath);
console.log('📂 输出目录:', options.outputDir || '未指定');
// 确保有输出目录
const outputDir = options.outputDir || path.dirname(inputPath);
console.log('📂 实际使用的输出目录:', outputDir);
const result = await parser.parseDocx(inputPath, outputDir);
console.log('✅ 自定义OOXML解析器转换完成');
console.log('📊 媒体文件数量:', Object.keys(result.media).length);
return {
success: true,
content: result.html,
css: result.css,
media: result.media,
metadata: {
format: 'html',
originalFormat: 'docx',
converter: 'custom-ooxml-parser',
stylesPreserved: true,
standalone: true,
conversionTime: Date.now(),
mediaCount: Object.keys(result.media).length
}
};
} catch (error: any) {
console.error('❌ 自定义OOXML解析器转换失败:', error);
return {
success: false,
error: error.message,
metadata: {
converter: 'custom-ooxml-parser',
failed: true
}
};
}
}
export { OOXMLParser };