#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
Analyze OCR JSON data from a classical Chinese text: aggregate logical
columns into physical columns via gap detection.
"""

import json
from collections import defaultdict

import numpy as np


def analyze_physical_columns(json_path):
    with open(json_path, 'r', encoding='utf-8') as f:
        data = json.load(f)

    print("=" * 80)
    print(f"File: {data['FileName']}")
    print(f"Dimensions: {data['Width']} × {data['Height']}")
    print("Layout: 10 columns × 25 rows")
    print("=" * 80)

    chars = data['chars']
    char_marking = data['charMarking']
    coors = data['coors']
    line_ids = data['line_ids']
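
    # Assumed schema (inferred from the field accesses below): the four arrays
    # are parallel -- chars[i] is the glyph, coors[i] its [x1, y1, x2, y2] box,
    # line_ids[i] the id of its logical column, and charMarking[i] is non-empty
    # exactly for small (interlinear annotation) characters.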

    # Group characters by line_id (one logical column per line_id)
    logical_columns = defaultdict(list)
    for i, (char, coor, marking, line_id) in enumerate(
            zip(chars, coors, char_marking, line_ids)):
        logical_columns[line_id].append({
            'index': i,
            'char': char,
            'x1': coor[0],
            'y1': coor[1],
            'x2': coor[2],
            'y2': coor[3],
            'x_center': (coor[0] + coor[2]) / 2,
            'y_center': (coor[1] + coor[3]) / 2,
            'is_small': len(marking) > 0
        })
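
    # Example record (hypothetical values): {'index': 0, 'char': '天',
    # 'x1': 1800, 'y1': 120, 'x2': 1890, 'y2': 210, 'x_center': 1845.0,
    # 'y_center': 165.0, 'is_small': False}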

    # Summarize each logical column
    logical_col_info = []
    for line_id in sorted(logical_columns.keys()):
        col_chars = logical_columns[line_id]
        x_centers = [c['x_center'] for c in col_chars]
        x_avg = np.mean(x_centers)

        char_count = len(col_chars)
        small_count = sum(1 for c in col_chars if c['is_small'])

        logical_col_info.append({
            'line_id': line_id,
            'x_avg': x_avg,
            'chars': col_chars,
            'char_count': char_count,
            'small_count': small_count,
            'is_all_small': small_count == char_count
        })

    # Sort by x coordinate (right to left, the reading order of the page)
    logical_col_info.sort(key=lambda c: c['x_avg'], reverse=True)

    print("\nLogical column x coordinates (right to left):")
    for lc in logical_col_info:
        small_mark = "(small text)" if lc['is_all_small'] else ""
        print(f"  line_id={lc['line_id']}: x={lc['x_avg']:.0f} {small_mark}")

    # Aggregate logical columns into physical columns based on gap detection
    print("\n" + "=" * 80)
    print("Gap-based physical column aggregation:")
    print("-" * 80)

    # Strategy:
    # 1. Consider all logical columns together (full-size and small text alike).
    # 2. If the x gap between adjacent logical columns is below a threshold
    #    (e.g. 150 px), they belong to the same physical column.
    # 3. If the gap exceeds the threshold, a new physical column starts.
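    #
    # A small worked example (hypothetical x_avg values, not taken from this
    # file): centers [1850, 1738, 1430] give gaps [112, 308]; with a 150 px
    # threshold the first pair merges (the two halves of a double-line
    # annotation), while the 308 px gap starts a new physical column.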

    # Gaps between adjacent logical columns
    print("Gap analysis across all logical columns:")
    gaps = []
    for i in range(len(logical_col_info) - 1):
        gap = logical_col_info[i]['x_avg'] - logical_col_info[i + 1]['x_avg']
        gaps.append(gap)
        is_small_1 = "(small)" if logical_col_info[i]['is_all_small'] else ""
        is_small_2 = "(small)" if logical_col_info[i + 1]['is_all_small'] else ""
        print(f"  line_id={logical_col_info[i]['line_id']}{is_small_1} -> "
              f"line_id={logical_col_info[i + 1]['line_id']}{is_small_2}: {gap:.0f}px")

    # Gap distribution (guard against a page with a single logical column)
    if gaps:
        print(f"\nGap stats: min={min(gaps):.0f}, max={max(gaps):.0f}, "
              f"avg={np.mean(gaps):.0f}, std={np.std(gaps):.0f}")

    # Threshold: gaps under 150 px are merged into one physical column (the two
    # halves of a double-line small-text annotation sit about 112 px apart);
    # larger gaps start a new physical column.
    MERGE_THRESHOLD = 150  # merge adjacent logical columns below this gap

    physical_columns = []
    current_group = [logical_col_info[0]]

    for i in range(1, len(logical_col_info)):
        gap = current_group[-1]['x_avg'] - logical_col_info[i]['x_avg']
        if gap < MERGE_THRESHOLD:
            # Small gap: same physical column
            current_group.append(logical_col_info[i])
        else:
            # Large gap: start a new physical column
            physical_columns.append(current_group)
            current_group = [logical_col_info[i]]
    physical_columns.append(current_group)
print(f"\n聚合结果: {len(physical_columns)} 个物理列")
|
|||
|
|
print("-" * 80)
|
|||
|
|
|
|||
|
|

    # Actual center position of each physical column
    pc_centers = []
    for pc in physical_columns:
        x_values = [lc['x_avg'] for lc in pc]
        center = np.mean(x_values)
        pc_centers.append(center)

    # Spacing between physical columns
    print("\nPhysical column spacing:")
    for i in range(len(pc_centers) - 1):
        gap = pc_centers[i] - pc_centers[i + 1]
        print(f"  physical column {i + 1} -> physical column {i + 2}: {gap:.0f}px")

    avg_gap = np.mean([pc_centers[i] - pc_centers[i + 1]
                       for i in range(len(pc_centers) - 1)])
    print(f"  average spacing: {avg_gap:.0f}px")

    # Infer each physical column's slot in the 10-column grid
    canvas_width = data['Width']
    cell_width = canvas_width / 10

    print(f"\nMapping onto the 10-column grid (column width={cell_width:.0f}px):")
    for pi, (pc, center) in enumerate(zip(physical_columns, pc_centers)):
        # Grid columns are numbered 1..10 from the right edge of the page;
        # floor-based bucketing matches the row computation below
        grid_col = int((canvas_width - center) / cell_width) + 1
        grid_col = max(1, min(10, grid_col))
        print(f"  physical column {pi + 1} (center x={center:.0f}) -> grid column {grid_col}")
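
    # For example (hypothetical numbers): with canvas_width 2048 the cell width
    # is 204.8 px, so a column centered at x=1740 lies 308 px from the right
    # edge and int(308 / 204.8) + 1 = 2, i.e. grid column 2.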

    # Detailed information for each physical column
    print("\n" + "=" * 80)
    print("Physical column details:")
    print("-" * 80)

    cell_height = data['Height'] / 25

    for pi, pc in enumerate(physical_columns):
        line_ids_in_pc = [lc['line_id'] for lc in pc]
        total_chars = sum(lc['char_count'] for lc in pc)
        total_small = sum(lc['small_count'] for lc in pc)

        print(f"\nPhysical column {pi + 1}:")
        print(f"  contains line_ids: {line_ids_in_pc}")
        print(f"  total characters: {total_chars} (small: {total_small})")

        # Merge all characters of the column and sort by y (top to bottom)
        all_chars = []
        for lc in pc:
            all_chars.extend(lc['chars'])
        all_chars.sort(key=lambda c: c['y_center'])

        # Row assignment within the 25-row grid
        first_y = all_chars[0]['y_center']
        start_row = int(first_y / cell_height)
        print(f"  starting row: row {start_row + 1} (y={first_y:.0f})")

        # Content structure
        big_chars_content = ''.join(c['char'] for c in all_chars if not c['is_small'])
        small_chars_content = ''.join(c['char'] for c in all_chars if c['is_small'])
        print(f"  full-size text: {big_chars_content[:20]}"
              f"{'...' if len(big_chars_content) > 20 else ''}")
        if small_chars_content:
            print(f"  small text: {small_chars_content}")

        # If the column contains small text, split the double-line annotation
        # into its right and left halves around the mean x center
        small_chars = [c for c in all_chars if c['is_small']]
        if small_chars:
            x_centers = [c['x_center'] for c in small_chars]
            x_threshold = np.mean(x_centers)

            right_chars = sorted([c for c in small_chars if c['x_center'] >= x_threshold],
                                 key=lambda c: c['y_center'])
            left_chars = sorted([c for c in small_chars if c['x_center'] < x_threshold],
                                key=lambda c: c['y_center'])

            print("  double-line small-text pairing:")
            print(f"    right column ({len(right_chars)} chars): "
                  f"{''.join(c['char'] for c in right_chars)}")
            print(f"    left column ({len(left_chars)} chars): "
                  f"{''.join(c['char'] for c in left_chars)}")
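            # Reading-order note: double-line annotations in this layout are
            # conventionally read right column first (top to bottom), then left
            # column, so right_chars + left_chars restores the annotation text.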


if __name__ == '__main__':
    analyze_physical_columns('/home/yuuko/test/0011B.json')