ancient-ocr-viewer/analysis_v2.py
Yuuko 1018416a7a Initial commit: Ancient OCR Viewer
- Canvas-based dual display (image + text)
- Grid rendering system with layout support
- Uniform font size rendering
- Double-line small character handling
- Comprehensive documentation of OCR rules and algorithms

🤖 Generated with [Claude Code](https://claude.com/claude-code)

Co-Authored-By: Claude <noreply@anthropic.com>
2025-11-19 16:59:40 +08:00

194 lines
6.8 KiB
Python
Raw Permalink Blame History

This file contains ambiguous Unicode characters

This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.

#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
分析古籍OCR JSON数据 - 理解物理列聚合逻辑
"""
import json
import numpy as np
from collections import defaultdict
def _group_chars_by_line(data):
    """Bucket every character record of *data* under its ``line_ids`` entry.

    Reads the parallel lists ``chars``, ``charMarking``, ``coors`` and
    ``line_ids``.  Each ``coors`` entry is indexed as ``[x1, y1, x2, y2]``.
    A character counts as "small" when its ``charMarking`` entry is non-empty.

    Returns a ``defaultdict(list)`` mapping line id -> list of per-character
    dicts (in input order).
    """
    chars = data['chars']
    char_marking = data['charMarking']
    coors = data['coors']
    line_ids = data['line_ids']

    grouped = defaultdict(list)
    for i, char in enumerate(chars):
        box = coors[i]
        x1, y1, x2, y2 = box[0], box[1], box[2], box[3]
        grouped[line_ids[i]].append({
            'index': i,
            'char': char,
            'x1': x1,
            'y1': y1,
            'x2': x2,
            'y2': y2,
            'x_center': (x1 + x2) / 2,
            'y_center': (y1 + y2) / 2,
            'is_small': bool(char_marking[i]),
        })
    return grouped


def _nearest_center(x, centers):
    """Return ``(index, distance)`` of the entry of *centers* closest to *x*.

    Ties resolve to the lowest index, because ``min`` keeps the first minimum.
    """
    best = min(range(len(centers)), key=lambda i: abs(x - centers[i]))
    return best, abs(x - centers[best])


def _print_physical_column(col_idx, members, cell_height):
    """Print the summary of one physical column (0-based *col_idx*)."""
    line_ids_in_pc = [lc['line_id'] for lc in members]
    x_avgs = [lc['x_avg'] for lc in members]
    total_chars = sum(lc['char_count'] for lc in members)
    total_small = sum(lc['small_count'] for lc in members)

    print(f"\n物理列 {col_idx+1}:")
    print(f" 包含line_ids: {line_ids_in_pc}")
    print(f" x坐标平均值: {[f'{x:.0f}' for x in x_avgs]}")
    print(f" 总字数: {total_chars} (小字: {total_small})")

    # Merge the characters of every logical column and order them top-down.
    all_chars = [c for lc in members for c in lc['chars']]
    all_chars.sort(key=lambda c: c['y_center'])

    # One symbol per character: "s" for small, "B" for big.
    sequence = ''.join('s' if c['is_small'] else 'B' for c in all_chars)
    print(f" 字符序列: {sequence}")

    # Grid row of each character; only the first five are listed.
    print(f" 行分配 (行高={cell_height:.0f}px):")
    for c in all_chars[:5]:
        row = int(c['y_center'] / cell_height)
        # Same labels as the logical-column summary; both branches used to be
        # empty strings, which made the parentheses in the output meaningless.
        char_type = "小字" if c['is_small'] else "大字"
        glyph = c['char']
        y_center = c['y_center']
        print(f" '{glyph}' ({char_type}): y={y_center:.0f} -> 第{row+1}")
    if len(all_chars) > 5:
        print(f" ... 还有 {len(all_chars)-5} 个字符")


def _print_small_char_pairing(col_idx, members):
    """Print the right/left pairing of small characters in one physical column."""
    small_chars = [c for lc in members for c in lc['chars'] if c['is_small']]
    if not small_chars:
        return

    print(f"\n物理列 {col_idx+1} 的小字分析:")

    # Split into a right and a left sub-column around the mean x centre.
    x_threshold = np.mean([c['x_center'] for c in small_chars])
    right_chars = sorted((c for c in small_chars if c['x_center'] >= x_threshold),
                         key=lambda c: c['y_center'])
    left_chars = sorted((c for c in small_chars if c['x_center'] < x_threshold),
                        key=lambda c: c['y_center'])

    print(f" 右列({len(right_chars)}字): {''.join([c['char'] for c in right_chars])}")
    print(f" 左列({len(left_chars)}字): {''.join([c['char'] for c in left_chars])}")

    # Pair the i-th right character with the i-th left character; the shorter
    # side is padded with an empty string.
    print(" 配对结果:")
    for i in range(max(len(right_chars), len(left_chars))):
        r_char = right_chars[i]['char'] if i < len(right_chars) else ''
        l_char = left_chars[i]['char'] if i < len(left_chars) else ''
        print(f"{i+1}: [{r_char}|{l_char}]")


def analyze_physical_columns(json_path, total_cols=10, total_rows=25):
    """Analyse an OCR JSON file and report how logical lines map to grid columns.

    The JSON document must provide ``FileName``, ``Width``, ``Height`` and the
    parallel lists ``chars``, ``charMarking``, ``coors`` and ``line_ids``.
    Characters are grouped by line id ("logical columns"); each logical column
    is then assigned to the physical column whose ideal centre is nearest to
    the column's mean x centre.  Physical columns are numbered from the right
    edge of the canvas (index 0) towards the left.  A report is printed to
    stdout, including a right/left pairing of the small characters found in
    each physical column.

    Args:
        json_path: Path of the UTF-8 encoded JSON file to analyse.
        total_cols: Number of physical columns in the page grid (default 10).
        total_rows: Number of rows in the page grid (default 25).

    Returns:
        dict mapping 0-based physical column index -> list of the line ids
        assigned to it, ordered by descending mean x.  Only columns that
        received at least one line are present.

    Raises:
        ValueError: If ``total_cols`` or ``total_rows`` is smaller than 1.
        OSError, json.JSONDecodeError, KeyError: Propagated when the file is
            unreadable, is not valid JSON, or lacks an expected key.
    """
    if total_cols < 1 or total_rows < 1:
        raise ValueError("total_cols and total_rows must both be >= 1")

    with open(json_path, 'r', encoding='utf-8') as f:
        data = json.load(f)

    print("=" * 80)
    print(f"文件: {data['FileName']}")
    print(f"尺寸: {data['Width']} × {data['Height']}")
    print(f"版式: {total_cols}列 × {total_rows}行")
    print("=" * 80)

    logical_columns = _group_chars_by_line(data)

    # Summarise every logical column (one per line id).
    print("\n逻辑列(line_id)分析:")
    print("-" * 80)
    logical_col_info = []
    for line_id in sorted(logical_columns):
        col_chars = logical_columns[line_id]
        x_centers = [c['x_center'] for c in col_chars]
        x_min, x_max = min(x_centers), max(x_centers)
        x_avg = np.mean(x_centers)
        char_count = len(col_chars)
        small_count = sum(1 for c in col_chars if c['is_small'])
        big_count = char_count - small_count
        text = ''.join(c['char'] for c in col_chars)
        text_preview = text[:15] + '...' if len(text) > 15 else text
        print(f"line_id={line_id}: x范围[{x_min:.0f}-{x_max:.0f}], 平均x={x_avg:.0f}")
        print(f" 字数: {char_count} (大字:{big_count}, 小字:{small_count})")
        print(f" 内容: {text_preview}")
        logical_col_info.append({
            'line_id': line_id,
            'x_avg': x_avg,
            'x_min': x_min,
            'x_max': x_max,
            'chars': col_chars,
            'char_count': char_count,
            'small_count': small_count,
        })

    # Aggregate logical columns into physical grid columns.
    print("\n" + "=" * 80)
    print("物理列聚合分析:")
    print("-" * 80)
    canvas_width = data['Width']
    cell_width = canvas_width / total_cols
    print(f"每个物理列宽度: {cell_width:.0f}px")

    # Right-to-left order: the logical column with the largest x comes first.
    logical_col_info.sort(key=lambda c: c['x_avg'], reverse=True)

    # Ideal centre of each physical column, counted from the right edge.
    physical_centers = [canvas_width - (i + 0.5) * cell_width
                        for i in range(total_cols)]
    print(f"物理列理想中心: {[f'{x:.0f}' for x in physical_centers]}")

    physical_columns = defaultdict(list)
    for lc in logical_col_info:
        best_col, min_dist = _nearest_center(lc['x_avg'], physical_centers)
        physical_columns[best_col].append(lc)
        print(f" line_id={lc['line_id']} (x={lc['x_avg']:.0f}) -> 物理列{best_col+1} (距离{min_dist:.0f}px)")

    physical_columns_with_index = [(i, physical_columns[i])
                                   for i in sorted(physical_columns)]
    print(f"\n聚合结果: {len(physical_columns_with_index)} 个有内容的物理列共{total_cols}列")
    print("-" * 80)

    # Row height does not depend on the column, so compute it once.
    cell_height = data['Height'] / total_rows
    for physical_col_idx, pc in physical_columns_with_index:
        _print_physical_column(physical_col_idx, pc, cell_height)

    # Pairing of double-line small characters.
    print("\n" + "=" * 80)
    print("双行小字配对分析:")
    print("-" * 80)
    for physical_col_idx, pc in physical_columns_with_index:
        _print_small_char_pairing(physical_col_idx, pc)

    return {idx: [lc['line_id'] for lc in pc]
            for idx, pc in physical_columns_with_index}
if __name__ == '__main__':
    import sys

    # Take the JSON path from the first command-line argument when one is
    # given; otherwise fall back to the path the script was originally
    # written against.
    target_path = sys.argv[1] if len(sys.argv) > 1 else '/home/yuuko/test/0011B.json'
    analyze_physical_columns(target_path)