#!/usr/bin/env python3
# -*- coding: utf-8 -*-
# NOTE(review): the lines below are commit-message residue that was pasted
# above the file; kept here as a comment so the file stays valid Python.
# - Canvas-based dual display (image + text)
# - Grid rendering system with layout support
# - Uniform font size rendering
# - Double-line small character handling
# - Comprehensive documentation of OCR rules and algorithms
# Generated with Claude Code (https://claude.com/claude-code)
# Co-Authored-By: Claude <noreply@anthropic.com>
"""
Analyze ancient-book OCR JSON data to understand the layout rules of
double-line small characters (two-column interlinear annotations).
"""

import json
from collections import defaultdict  # NOTE(review): unused below; kept in case other chunks of this file use it

import numpy as np
def analyze_ocr_json(json_path, column_gap=200):
    """Analyze an ancient-book OCR JSON file and print layout statistics.

    The printed report covers: character classification (regular "big" body
    text vs. double-line small annotation text), column detection by
    clustering big-character x-centers, grouping of consecutive small
    characters with a right/left sub-column split, character box size
    statistics, and suggested virtual-grid rendering parameters.

    Args:
        json_path: Path to the OCR JSON file. Expected keys: ``FileName``,
            ``Width``, ``Height``, ``CharNumber``, ``LineNumber``,
            ``chars``, ``charMarking``, ``coors`` (per-character
            ``[x1, y1, x2, y2]`` boxes) and ``line_ids``.
        column_gap: Minimum x-center distance (px) that separates two
            columns during clustering. Defaults to 200, the previously
            hard-coded threshold, so existing callers are unaffected.

    Returns:
        None. All results are printed to stdout.
    """
    with open(json_path, 'r', encoding='utf-8') as f:
        data = json.load(f)

    print("=" * 80)
    print(f"文件: {data['FileName']}")
    print(f"尺寸: {data['Width']} × {data['Height']}")
    print(f"字符数: {data['CharNumber']}, 行数: {data['LineNumber']}")
    print("=" * 80)

    chars = data['chars']
    char_marking = data['charMarking']
    coors = data['coors']
    line_ids = data['line_ids']  # currently unread; kept so a missing key fails loudly here

    # Classify characters: an empty marking entry means regular ("big")
    # body text; a non-empty one marks a double-line small annotation char.
    big_chars = []
    small_chars = []
    for i, marking in enumerate(char_marking):
        if len(marking) == 0:
            big_chars.append(i)
        else:
            small_chars.append(i)

    print(f"\n字符分类:")
    print(f" 大字(正文): {len(big_chars)} 个")
    print(f" 小字(双行注): {len(small_chars)} 个")

    # Column detection: cluster big-character x-centers; a jump larger
    # than column_gap between sorted neighbours starts a new column.
    print(f"\n列分布分析:")
    x_centers = [(coors[i][0] + coors[i][2]) / 2 for i in range(len(coors))]
    big_x_centers = [x_centers[i] for i in big_chars]
    big_x_sorted = sorted(set(big_x_centers))

    columns = []
    if big_x_sorted:  # guard: a page with no big characters must not crash
        current_cluster = [big_x_sorted[0]]
        for x in big_x_sorted[1:]:
            if x - current_cluster[-1] > column_gap:
                columns.append(np.mean(current_cluster))
                current_cluster = [x]
            else:
                current_cluster.append(x)
        columns.append(np.mean(current_cluster))

    # Ancient books read right-to-left, so report columns right-first.
    columns = sorted(columns, reverse=True)

    print(f" 检测到 {len(columns)} 列")
    print(f" 列中心x坐标(从右到左): {[f'{x:.1f}' for x in columns]}")

    # Group consecutive small-character indices into annotation runs.
    print(f"\n双行小字分组分析:")
    small_groups = []
    if small_chars:
        current_group = [small_chars[0]]
        for i in range(1, len(small_chars)):
            if small_chars[i] == small_chars[i-1] + 1:
                current_group.append(small_chars[i])
            else:
                small_groups.append(current_group)
                current_group = [small_chars[i]]
        small_groups.append(current_group)

    print(f" 发现 {len(small_groups)} 组双行小字:")

    for gi, group in enumerate(small_groups):
        print(f"\n 组 {gi+1}: 索引 {group[0]}-{group[-1]}, 共 {len(group)} 个字")
        group_chars = [chars[i] for i in group]
        group_text = ''.join(group_chars)
        print(f" 内容: {group_text}")

        # x-center spread of this annotation run.
        group_x = [(coors[i][0] + coors[i][2]) / 2 for i in group]
        group_x_min = min(group_x)
        group_x_max = max(group_x)
        group_x_mean = np.mean(group_x)

        print(f" x坐标范围: [{group_x_min:.1f}, {group_x_max:.1f}], 平均: {group_x_mean:.1f}")

        # Split the run into right/left sub-columns at the mean x-center
        # (right first, matching the right-to-left reading order above).
        threshold = group_x_mean
        right_col = [i for i, x in zip(group, group_x) if x >= threshold]
        left_col = [i for i, x in zip(group, group_x) if x < threshold]

        print(f" 右列(x >= {threshold:.1f}): {len(right_col)} 个字")
        if right_col:
            right_text = ''.join([chars[i] for i in right_col])
            print(f" 内容: {right_text}")
            print(f" 索引: {right_col[:5]}{'...' if len(right_col) > 5 else ''}")

        print(f" 左列(x < {threshold:.1f}): {len(left_col)} 个字")
        if left_col:
            left_text = ''.join([chars[i] for i in left_col])
            print(f" 内容: {left_text}")
            print(f" 索引: {left_col[:5]}{'...' if len(left_col) > 5 else ''}")

        # Dump raw boxes for the first three characters of the run.
        print(f" 垂直排列分析:")
        for idx in group[:3]:
            y1, y2 = coors[idx][1], coors[idx][3]
            x1, x2 = coors[idx][0], coors[idx][2]
            print(f" 索引{idx} '{chars[idx]}': x=[{x1},{x2}], y=[{y1},{y2}]")

    # Box-size statistics per character class.
    print(f"\n字符尺寸分析:")
    big_heights = [(coors[i][3] - coors[i][1]) for i in big_chars]
    big_widths = [(coors[i][2] - coors[i][0]) for i in big_chars]

    if big_chars:  # guard: np.mean([]) would warn and print nan
        print(f" 大字平均高度: {np.mean(big_heights):.1f}px (std: {np.std(big_heights):.1f})")
        print(f" 大字平均宽度: {np.mean(big_widths):.1f}px (std: {np.std(big_widths):.1f})")

    if small_chars:
        small_heights = [(coors[i][3] - coors[i][1]) for i in small_chars]
        small_widths = [(coors[i][2] - coors[i][0]) for i in small_chars]

        print(f" 小字平均高度: {np.mean(small_heights):.1f}px (std: {np.std(small_heights):.1f})")
        print(f" 小字平均宽度: {np.mean(small_widths):.1f}px (std: {np.std(small_widths):.1f})")
        if big_chars:
            print(f" 小字/大字高度比: {np.mean(small_heights)/np.mean(big_heights):.2f}")

    # Suggested virtual-grid parameters for a renderer; the 0.8 factor
    # presumably leaves padding between glyph and cell — TODO confirm.
    print(f"\n推荐的虚拟网格参数:")
    print(f" 列数: {len(columns)}")
    if big_chars:
        print(f" 标准列宽: {np.mean(big_widths):.0f}px")
        print(f" 标准行高: {np.mean(big_heights):.0f}px")
        print(f" 大字字号: {np.mean(big_heights) * 0.8:.0f}px")
    if small_chars:
        print(f" 小字字号: {np.mean(small_heights) * 0.8:.0f}px")
if __name__ == '__main__':
    # Analyze a page that contains double-line small characters.
    print("\n分析带双行小字的文件: 0011B.json")
    analyze_ocr_json('/home/yuuko/test/0011B.json')

    print("\n\n" + "=" * 80)

    # Analyze a regular page for comparison.
    print("\n分析普通文件: 0003A.json")
    analyze_ocr_json('/home/yuuko/test/ancient-ocr-viewer/data/0003A.json')