ancient-ocr-viewer/analysis.py

157 lines
5.8 KiB
Python
Raw Permalink Normal View History

#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
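Analyze the OCR JSON data of ancient Chinese books to understand how
double-row small-character annotations (双行小字) are arranged.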
"""
import json
import numpy as np
from collections import defaultdict
def _cluster_columns(x_values, gap):
    """Cluster x coordinates into columns and return each column's mean x.

    The values are de-duplicated and sorted ascending; a new cluster starts
    whenever the distance to the previous value is greater than ``gap``.
    Returns an empty list when ``x_values`` is empty.
    """
    ordered = sorted(set(x_values))
    if not ordered:
        return []

    centers = []
    cluster = [ordered[0]]
    for x in ordered[1:]:
        if x - cluster[-1] > gap:
            centers.append(np.mean(cluster))
            cluster = [x]
        else:
            cluster.append(x)
    centers.append(np.mean(cluster))
    return centers


def _consecutive_runs(indices):
    """Split a list of indices into runs where each index is the previous + 1."""
    runs = []
    for idx in indices:
        if runs and idx == runs[-1][-1] + 1:
            runs[-1].append(idx)
        else:
            runs.append([idx])
    return runs


def analyze_ocr_json(json_path, *, column_gap=200):
    """Print a layout analysis of one OCR result file.

    The JSON file must provide the keys ``FileName``, ``Width``, ``Height``,
    ``CharNumber``, ``LineNumber``, ``chars``, ``charMarking`` and ``coors``.
    A character whose ``charMarking`` entry is empty is counted as a big
    (main text) character; any other character is counted as a small
    (double-row annotation) character. Each ``coors`` entry is read as
    ``[x1, y1, x2, y2]``.

    The report covers: the big/small character counts, the text columns
    detected from the big characters' x centers, every run of consecutive
    small characters split into a right and a left half around the run's
    mean x, character size statistics, and suggested grid parameters.
    Statistics that need big characters are skipped when the page has none,
    so pages consisting only of annotations (or no characters at all) are
    reported without errors or ``nan`` values.

    Args:
        json_path: Path of the OCR JSON file to analyze.
        column_gap: Two neighbouring big-character x centers further apart
            than this value are treated as belonging to different columns.

    Returns:
        None. The analysis is written to standard output.
    """
    with open(json_path, 'r', encoding='utf-8') as fh:
        data = json.load(fh)

    print("=" * 80)
    print(f"文件: {data['FileName']}")
    print(f"尺寸: {data['Width']} × {data['Height']}")
    print(f"字符数: {data['CharNumber']}, 行数: {data['LineNumber']}")
    print("=" * 80)

    chars = data['chars']
    char_marking = data['charMarking']
    coors = data['coors']

    # Classify every character as big (no marking) or small (has marking).
    big_chars = []
    small_chars = []
    for i, marking in enumerate(char_marking):
        if len(marking) == 0:
            big_chars.append(i)
        else:
            small_chars.append(i)

    print("\n字符分类:")
    print(f" 大字(正文): {len(big_chars)}")
    print(f" 小字(双行注): {len(small_chars)}")

    # Column detection: cluster the x centers of the big characters only.
    print("\n列分布分析:")
    x_centers = [(box[0] + box[2]) / 2 for box in coors]
    columns = _cluster_columns((x_centers[i] for i in big_chars), column_gap)
    # Ancient books are read right to left, so list the columns that way.
    columns = sorted(columns, reverse=True)

    print(f" 检测到 {len(columns)}")
    print(f" 列中心x坐标从右到左: {[f'{x:.1f}' for x in columns]}")

    # Group the small characters into runs of consecutive indices.
    print("\n双行小字分组分析:")
    small_groups = _consecutive_runs(small_chars)
    print(f" 发现 {len(small_groups)} 组双行小字:")

    for gi, group in enumerate(small_groups):
        print(f"\n{gi+1}: 索引 {group[0]}-{group[-1]}, 共 {len(group)} 个字")
        group_text = ''.join(chars[i] for i in group)
        print(f" 内容: {group_text}")

        # x distribution of this run.
        group_x = [x_centers[i] for i in group]
        group_x_mean = np.mean(group_x)
        print(f" x坐标范围: [{min(group_x):.1f}, {max(group_x):.1f}], 平均: {group_x_mean:.1f}")

        # Split the run into a right and a left half around its mean x.
        threshold = group_x_mean
        right_col = [i for i, x in zip(group, group_x) if x >= threshold]
        left_col = [i for i, x in zip(group, group_x) if x < threshold]

        print(f" 右列x >= {threshold:.1f}: {len(right_col)} 个字")
        if right_col:
            right_text = ''.join(chars[i] for i in right_col)
            print(f" 内容: {right_text}")
            print(f" 索引: {right_col[:5]}{'...' if len(right_col) > 5 else ''}")

        print(f" 左列x < {threshold:.1f}: {len(left_col)} 个字")
        if left_col:
            left_text = ''.join(chars[i] for i in left_col)
            print(f" 内容: {left_text}")
            print(f" 索引: {left_col[:5]}{'...' if len(left_col) > 5 else ''}")

        # Show the raw boxes of the first three characters of the run.
        print(" 垂直排列分析:")
        for idx in group[:3]:
            y1, y2 = coors[idx][1], coors[idx][3]
            x1, x2 = coors[idx][0], coors[idx][2]
            print(f" 索引{idx} '{chars[idx]}': x=[{x1},{x2}], y=[{y1},{y2}]")

    # Character size statistics.
    print("\n字符尺寸分析:")
    big_heights = [coors[i][3] - coors[i][1] for i in big_chars]
    big_widths = [coors[i][2] - coors[i][0] for i in big_chars]
    small_heights = [coors[i][3] - coors[i][1] for i in small_chars]
    small_widths = [coors[i][2] - coors[i][0] for i in small_chars]

    # Averages over an empty list would be nan, so each section is printed
    # only when the corresponding character class is present.
    if big_chars:
        print(f" 大字平均高度: {np.mean(big_heights):.1f}px (std: {np.std(big_heights):.1f})")
        print(f" 大字平均宽度: {np.mean(big_widths):.1f}px (std: {np.std(big_widths):.1f})")

    if small_chars:
        print(f" 小字平均高度: {np.mean(small_heights):.1f}px (std: {np.std(small_heights):.1f})")
        print(f" 小字平均宽度: {np.mean(small_widths):.1f}px (std: {np.std(small_widths):.1f})")
        if big_chars:
            print(f" 小字/大字高度比: {np.mean(small_heights)/np.mean(big_heights):.2f}")

    # Suggested virtual grid parameters; font sizes are 0.8 × mean height.
    print("\n推荐的虚拟网格参数:")
    print(f" 列数: {len(columns)}")
    if big_chars:
        print(f" 标准列宽: {np.mean(big_widths):.0f}px")
        print(f" 标准行高: {np.mean(big_heights):.0f}px")
        print(f" 大字字号: {np.mean(big_heights) * 0.8:.0f}px")
    if small_chars:
        print(f" 小字字号: {np.mean(small_heights) * 0.8:.0f}px")
if __name__ == '__main__':
    # Script entry point: analyze two sample pages in order, one that
    # contains double-row small characters and one regular page. Each entry
    # pairs the banner printed before the run with the JSON file to analyze.
    sample_runs = (
        ("\n分析带双行小字的文件: 0011B.json",
         '/home/yuuko/test/0011B.json'),
        ("\n分析普通文件: 0003A.json",
         '/home/yuuko/test/ancient-ocr-viewer/data/0003A.json'),
    )
    for run_no, (banner, sample_path) in enumerate(sample_runs):
        if run_no > 0:
            # Separator line between consecutive reports.
            print("\n\n" + "=" * 80)
        print(banner)
        analyze_ocr_json(sample_path)