ancient-ocr-viewer/analysis.py
Yuuko 1018416a7a Initial commit: Ancient OCR Viewer
- Canvas-based dual display (image + text)
- Grid rendering system with layout support
- Uniform font size rendering
- Double-line small character handling
- Comprehensive documentation of OCR rules and algorithms

🤖 Generated with [Claude Code](https://claude.com/claude-code)

Co-Authored-By: Claude <noreply@anthropic.com>
2025-11-19 16:59:40 +08:00

157 lines
5.8 KiB
Python
Raw Blame History

This file contains ambiguous Unicode characters

This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.

#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
分析古籍OCR JSON数据理解双行小字的排列规律
"""
import json
import numpy as np
from collections import defaultdict
def _x_center(box):
    """Return the horizontal midpoint of a box stored as [x1, y1, x2, y2]."""
    return (box[0] + box[2]) / 2


def _cluster_columns(x_values, gap):
    """Cluster x coordinates, starting a new cluster when the step exceeds gap.

    Returns the mean of each cluster in ascending x order, or an empty list
    when no coordinates are given.
    """
    ordered = sorted(set(x_values))
    if not ordered:
        return []
    centers = []
    cluster = [ordered[0]]
    for x in ordered[1:]:
        if x - cluster[-1] > gap:
            centers.append(np.mean(cluster))
            cluster = [x]
        else:
            cluster.append(x)
    centers.append(np.mean(cluster))
    return centers


def _consecutive_runs(indices):
    """Split an ascending list of indices into runs of consecutive integers."""
    runs = []
    for idx in indices:
        if runs and idx == runs[-1][-1] + 1:
            runs[-1].append(idx)
        else:
            runs.append([idx])
    return runs


def analyze_ocr_json(json_path, column_gap=200):
    """Print a layout analysis of one OCR result file.

    The JSON document must provide the keys ``FileName``, ``Width``,
    ``Height``, ``CharNumber``, ``LineNumber``, ``chars``, ``charMarking``
    and ``coors``.  Each ``coors`` entry is read as ``[x1, y1, x2, y2]``.
    A character whose ``charMarking`` entry is empty is counted as a big
    (main text) character; every other character is counted as a small
    (double-line note) character.

    The report covers: the big/small split, column centres derived from the
    big characters, each run of consecutive small characters split into a
    right and left sub-column around the run's mean x, character size
    statistics, and suggested grid parameters.

    Args:
        json_path: Path of the UTF-8 encoded JSON file to analyse.
        column_gap: Minimum distance between neighbouring big-character
            x centres that starts a new column.  Defaults to 200, the
            value that used to be hard-coded.

    Returns:
        None.  All results are written to standard output.

    Raises:
        OSError: If the file cannot be opened.
        json.JSONDecodeError: If the file is not valid JSON.
        KeyError: If one of the required keys is missing.
        IndexError: If ``chars`` or ``coors`` is shorter than ``charMarking``.

    Pages without any big character are reported with zero columns; the
    big-character statistics and the grid values derived from them are
    omitted in that case instead of raising.
    """
    with open(json_path, 'r', encoding='utf-8') as f:
        data = json.load(f)

    print("=" * 80)
    print(f"文件: {data['FileName']}")
    print(f"尺寸: {data['Width']} × {data['Height']}")
    print(f"字符数: {data['CharNumber']}, 行数: {data['LineNumber']}")
    print("=" * 80)

    chars = data['chars']
    char_marking = data['charMarking']
    coors = data['coors']

    # Classify characters: an empty marking means main text, anything else
    # means a double-line note character.
    big_chars = []
    small_chars = []
    for i, marking in enumerate(char_marking):
        if len(marking) == 0:
            big_chars.append(i)
        else:
            small_chars.append(i)

    print("\n字符分类:")
    print(f" 大字(正文): {len(big_chars)}")
    print(f" 小字(双行注): {len(small_chars)}")

    # Column detection uses only the big characters' x centres.
    print("\n列分布分析:")
    big_x_centers = [_x_center(coors[i]) for i in big_chars]
    # Ancient books are read right to left, so list the rightmost column first.
    columns = sorted(_cluster_columns(big_x_centers, column_gap), reverse=True)
    print(f" 检测到 {len(columns)}")
    print(f" 列中心x坐标从右到左: {[f'{x:.1f}' for x in columns]}")

    # Group the small characters into runs of consecutive indices.
    print("\n双行小字分组分析:")
    small_groups = _consecutive_runs(small_chars)
    if small_groups:
        print(f" 发现 {len(small_groups)} 组双行小字:")
    for number, group in enumerate(small_groups, start=1):
        print(f"\n{number}: 索引 {group[0]}-{group[-1]}, 共 {len(group)} 个字")
        group_text = ''.join(chars[i] for i in group)
        print(f" 内容: {group_text}")

        group_x = [_x_center(coors[i]) for i in group]
        group_x_min = min(group_x)
        group_x_max = max(group_x)
        group_x_mean = np.mean(group_x)
        print(f" x坐标范围: [{group_x_min:.1f}, {group_x_max:.1f}], 平均: {group_x_mean:.1f}")

        # Split the run into two sub-columns around its mean x centre.
        threshold = group_x_mean
        right_col = [i for i, x in zip(group, group_x) if x >= threshold]
        left_col = [i for i, x in zip(group, group_x) if x < threshold]

        print(f" 右列x >= {threshold:.1f}: {len(right_col)} 个字")
        if right_col:
            right_text = ''.join(chars[i] for i in right_col)
            print(f" 内容: {right_text}")
            print(f" 索引: {right_col[:5]}{'...' if len(right_col) > 5 else ''}")

        print(f" 左列x < {threshold:.1f}: {len(left_col)} 个字")
        if left_col:
            left_text = ''.join(chars[i] for i in left_col)
            print(f" 内容: {left_text}")
            print(f" 索引: {left_col[:5]}{'...' if len(left_col) > 5 else ''}")

        # Show the raw boxes of the first three characters of the run.
        print(" 垂直排列分析:")
        for idx in group[:3]:
            y1, y2 = coors[idx][1], coors[idx][3]
            x1, x2 = coors[idx][0], coors[idx][2]
            print(f" 索引{idx} '{chars[idx]}': x=[{x1},{x2}], y=[{y1},{y2}]")

    # Character size statistics.
    print("\n字符尺寸分析:")
    big_heights = [coors[i][3] - coors[i][1] for i in big_chars]
    big_widths = [coors[i][2] - coors[i][0] for i in big_chars]
    small_heights = [coors[i][3] - coors[i][1] for i in small_chars]
    small_widths = [coors[i][2] - coors[i][0] for i in small_chars]

    # Means are only defined for non-empty groups; None marks "not available".
    big_height_mean = np.mean(big_heights) if big_chars else None
    big_width_mean = np.mean(big_widths) if big_chars else None
    small_height_mean = np.mean(small_heights) if small_chars else None

    if big_chars:
        print(f" 大字平均高度: {big_height_mean:.1f}px (std: {np.std(big_heights):.1f})")
        print(f" 大字平均宽度: {big_width_mean:.1f}px (std: {np.std(big_widths):.1f})")
    if small_chars:
        print(f" 小字平均高度: {small_height_mean:.1f}px (std: {np.std(small_heights):.1f})")
        print(f" 小字平均宽度: {np.mean(small_widths):.1f}px (std: {np.std(small_widths):.1f})")
        if big_chars:
            print(f" 小字/大字高度比: {small_height_mean/big_height_mean:.2f}")

    # Suggested virtual grid parameters; font sizes are 0.8 x mean height.
    print("\n推荐的虚拟网格参数:")
    print(f" 列数: {len(columns)}")
    if big_chars:
        print(f" 标准列宽: {big_width_mean:.0f}px")
        print(f" 标准行高: {big_height_mean:.0f}px")
        print(f" 大字字号: {big_height_mean * 0.8:.0f}px")
    if small_chars:
        print(f" 小字字号: {small_height_mean * 0.8:.0f}px")
if __name__ == '__main__':
    # Sample inputs: one page with double-line small characters, one without.
    double_line_sample = '/home/yuuko/test/0011B.json'
    plain_sample = '/home/yuuko/test/ancient-ocr-viewer/data/0003A.json'
    separator = "\n\n" + "=" * 80

    # First report: the page that contains double-line notes.
    print("\n分析带双行小字的文件: 0011B.json")
    analyze_ocr_json(double_line_sample)

    print(separator)

    # Second report: an ordinary page for comparison.
    print("\n分析普通文件: 0003A.json")
    analyze_ocr_json(plain_sample)