diff --git a/src/js/gridRenderer.js b/src/js/gridRenderer.js index a592f4b..c10e0e7 100644 --- a/src/js/gridRenderer.js +++ b/src/js/gridRenderer.js @@ -115,26 +115,26 @@ export class GridRenderer { * 处理单个物理列 */ processPhysicalColumn(logicalCols) { - // 合并所有字符 - let allChars = []; - for (const lc of logicalCols) { - allChars = allChars.concat(lc.chars); - } - - // 分离大字和小字 - const bigChars = allChars.filter(c => !c.isSmall); - const smallChars = allChars.filter(c => c.isSmall); - // 处理双行小字配对 - let smallPairs = []; - if (smallChars.length > 0) { - smallPairs = this.pairSmallChars(smallChars); + const smallPairs = this.pairSmallChars(logicalCols); + + // 收集大字 + const bigChars = []; + for (const lc of logicalCols) { + if (!lc.isAllSmall) { + bigChars.push(...lc.chars); + } } // 按y坐标排序大字 bigChars.sort((a, b) => a.yCenter - b.yCenter); - // 计算物理列中心 + // 计算物理列中心(包括所有字符) + let allChars = [...bigChars]; + for (const pair of smallPairs) { + if (pair.right) allChars.push(pair.right); + if (pair.left) allChars.push(pair.left); + } const xCenter = allChars.reduce((sum, c) => sum + c.xCenter, 0) / allChars.length; return { @@ -148,33 +148,38 @@ export class GridRenderer { /** * 双行小字配对 + * 按line_id顺序配对:先出现的line_id是右列,后出现的line_id是左列 */ - pairSmallChars(smallChars) { - if (smallChars.length === 0) return []; - - // 按x坐标分左右 - const xValues = smallChars.map(c => c.xCenter); - const xThreshold = xValues.reduce((a, b) => a + b, 0) / xValues.length; - - // 右列(x >= 阈值)和左列(x < 阈值) - const rightChars = smallChars - .filter(c => c.xCenter >= xThreshold) - .sort((a, b) => a.yCenter - b.yCenter); - const leftChars = smallChars - .filter(c => c.xCenter < xThreshold) - .sort((a, b) => a.yCenter - b.yCenter); - - // 配对 + pairSmallChars(logicalCols) { const pairs = []; - const maxLen = Math.max(rightChars.length, leftChars.length); - for (let i = 0; i < maxLen; i++) { - pairs.push({ - right: rightChars[i] || null, - left: leftChars[i] || null, - // 使用右字符的y坐标作为配对的位置基准 - yCenter: rightChars[i]?.yCenter || leftChars[i]?.yCenter - }); + // 找出所有全是小字的逻辑列 + const smallLogicalCols = logicalCols.filter(lc => lc.isAllSmall); + + if (smallLogicalCols.length === 0) return pairs; + + // 按line_id排序(确保右列在前,左列在后) + smallLogicalCols.sort((a, b) => a.lineId - b.lineId); + + // 成对处理:每两个连续的小字逻辑列配对 + for (let i = 0; i < smallLogicalCols.length; i += 2) { + const rightCol = smallLogicalCols[i]; + const leftCol = smallLogicalCols[i + 1]; + + // 右列和左列各自按y排序 + const rightChars = [...rightCol.chars].sort((a, b) => a.yCenter - b.yCenter); + const leftChars = leftCol ? [...leftCol.chars].sort((a, b) => a.yCenter - b.yCenter) : []; + + // 配对:右列第i个 配 左列第i个 + const maxLen = Math.max(rightChars.length, leftChars.length); + for (let j = 0; j < maxLen; j++) { + pairs.push({ + right: rightChars[j] || null, + left: leftChars[j] || null, + // 使用右字符的y坐标作为配对的位置基准 + yCenter: rightChars[j]?.yCenter || leftChars[j]?.yCenter + }); + } } return pairs; @@ -193,21 +198,30 @@ export class GridRenderer { } } - // 计算行高 - const cellHeight = this.ocrData.Height / this.rowsPerColumn; + if (this.physicalColumns.length === 0) return; - // 填充物理列到网格 - // 从右到左填充(物理列已经从右到左排序) + // 计算单元格尺寸 + const cellHeight = this.ocrData.Height / this.rowsPerColumn; + const cellWidth = this.ocrData.Width / this.totalColumns; + + // === 问题2:检测空列,映射物理列到网格列 === + const columnMapping = this.detectEmptyColumns(cellWidth); + + // === 问题3:多列对齐基准(暂时简化,后续可优化)=== + // 找到所有物理列中第一个字的最小y坐标作为参考基准 + // const topMostY = this.findTopMostY(); + + // 填充每个物理列 for (let pi = 0; pi < this.physicalColumns.length; pi++) { const pc = this.physicalColumns[pi]; - const gridCol = pi; // 从第0列开始填充 + const gridCol = columnMapping[pi]; - if (gridCol >= this.totalColumns) { - console.warn(`物理列${pi}超出网格范围`); + if (gridCol >= this.totalColumns || gridCol < 0) { + console.warn(`物理列${pi}映射到网格列${gridCol}超出范围`); continue; } - // 合并大字和小字对,按y坐标排序 + // 合并大字和小字对 const items = []; // 添加大字 @@ -234,18 +248,92 @@ export class GridRenderer { // 按y坐标排序 items.sort((a, b) => a.yCenter - b.yCenter); - // 从第0行开始填充(规整化排版) - if (items.length === 0) continue; + // === 问题1:基于y坐标计算行号 === + for (const item of items) { + const row = Math.round(item.yCenter / cellHeight); - // 填充到网格,从第0行开始 - for (let i = 0; i < items.length; i++) { - if (i < this.rowsPerColumn) { - this.grid[gridCol][i] = items[i]; + if (row >= 0 && row < this.rowsPerColumn) { + this.grid[gridCol][row] = item; + } else { + console.warn(`字符 "${item.char || '[pair]'}" 行号${row}超出范围[0, ${this.rowsPerColumn})`); } } } } + /** + * 检测空列,返回物理列到网格列的映射 + * 问题2的解决方案 + */ + detectEmptyColumns(cellWidth) { + const mapping = []; + + if (this.physicalColumns.length === 0) return mapping; + + // 计算所有物理列之间的间距 + const gaps = []; + for (let i = 0; i < this.physicalColumns.length - 1; i++) { + const gap = this.physicalColumns[i].xCenter - this.physicalColumns[i + 1].xCenter; + gaps.push(gap); + } + + // 计算标准列间距(使用 cellWidth 或根据数据计算) + // 方法1:基于版式 + const standardGap = cellWidth; + + // 方法2(可选):从实际间距中过滤异常值,取平均 + // const normalGaps = gaps.filter(g => g < cellWidth * 1.5); + // const avgGap = normalGaps.length > 0 + // ? normalGaps.reduce((a, b) => a + b, 0) / normalGaps.length + // : cellWidth; + + // 映射物理列到网格列 + let currentGridCol = 0; + mapping[0] = currentGridCol; + + for (let i = 0; i < gaps.length; i++) { + const gap = gaps[i]; + + if (gap > standardGap * 1.5) { + // 大间距,中间有空列 + const emptyColumns = Math.round(gap / standardGap) - 1; + currentGridCol += 1 + emptyColumns; + } else { + // 正常间距 + currentGridCol += 1; + } + + mapping[i + 1] = currentGridCol; + } + + console.log('列映射:', mapping); + return mapping; + } + + /** + * 找到所有物理列中第一个字的最小y坐标(对齐基准) + * 问题3的辅助方法 + */ + findTopMostY() { + let topMostY = Infinity; + + for (const pc of this.physicalColumns) { + // 找到该物理列的第一个字符 + const allItems = [...pc.bigChars]; + for (const pair of pc.smallPairs) { + if (pair.right) allItems.push(pair.right); + } + + allItems.sort((a, b) => a.yCenter - b.yCenter); + + if (allItems.length > 0 && allItems[0].yCenter < topMostY) { + topMostY = allItems[0].yCenter; + } + } + + return topMostY === Infinity ? 0 : topMostY; + } + /** * 渲染网格到Canvas */