ancient-ocr-viewer/analysis_v3.py
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
分析古籍OCR JSON数据 - 基于间隔检测的物理列聚合
"""
import json
import numpy as np
from collections import defaultdict


def analyze_physical_columns(json_path):
    with open(json_path, 'r', encoding='utf-8') as f:
        data = json.load(f)

    print("=" * 80)
    print(f"File: {data['FileName']}")
    print(f"Size: {data['Width']} × {data['Height']}")
    print("Layout: 10 columns × 25 rows")
    print("=" * 80)

    chars = data['chars']
    char_marking = data['charMarking']
    coors = data['coors']
    line_ids = data['line_ids']

    # Group characters by line_id (one group per OCR logical column)
    logical_columns = defaultdict(list)
    for i in range(len(chars)):
        logical_columns[line_ids[i]].append({
            'index': i,
            'char': chars[i],
            'x1': coors[i][0],
            'y1': coors[i][1],
            'x2': coors[i][2],
            'y2': coors[i][3],
            'x_center': (coors[i][0] + coors[i][2]) / 2,
            'y_center': (coors[i][1] + coors[i][3]) / 2,
            'is_small': len(char_marking[i]) > 0
        })
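
    # Expected JSON shape (field names from above; values here are
    # hypothetical, for illustration only):
    #   {"FileName": "0011B", "Width": 2000, "Height": 3000,
    #    "chars": ["天", "下", ...],
    #    "charMarking": [[], [], ["x"], ...],   # non-empty => small char
    #    "coors": [[x1, y1, x2, y2], ...],      # one bounding box per char
    #    "line_ids": [0, 0, 1, ...]}            # OCR logical column ids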
    # Compute summary info for each logical column
    logical_col_info = []
    for line_id in sorted(logical_columns.keys()):
        col_chars = logical_columns[line_id]
        x_centers = [c['x_center'] for c in col_chars]
        x_avg = np.mean(x_centers)
        char_count = len(col_chars)
        small_count = sum(1 for c in col_chars if c['is_small'])
        logical_col_info.append({
            'line_id': line_id,
            'x_avg': x_avg,
            'chars': col_chars,
            'char_count': char_count,
            'small_count': small_count,
            'is_all_small': small_count == char_count
        })
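
    # Example (hypothetical counts): a 14-character double-line annotation
    # column yields {'char_count': 14, 'small_count': 14, 'is_all_small': True};
    # a column mixing body text and annotations keeps is_all_small == False.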
    # Sort by x coordinate, right to left (traditional vertical layout)
    logical_col_info.sort(key=lambda c: c['x_avg'], reverse=True)
    print("\nLogical column x coordinates, right to left:")
    for lc in logical_col_info:
        small_mark = "(small)" if lc['is_all_small'] else ""
        print(f"  line_id={lc['line_id']}: x={lc['x_avg']:.0f} {small_mark}")

    # Aggregate physical columns based on gap detection
    print("\n" + "=" * 80)
    print("Gap-based physical column aggregation:")
    print("-" * 80)

    # Strategy:
    # 1. Consider all logical columns together (large and small characters).
    # 2. If adjacent logical columns differ in x by less than a threshold
    #    (e.g. 150px), they belong to the same physical column.
    # 3. If the gap exceeds the threshold, a new physical column starts.

    # Compute gaps between adjacent logical columns
    print("Gap analysis over all logical columns:")
    gaps = []
    for i in range(len(logical_col_info) - 1):
        gap = logical_col_info[i]['x_avg'] - logical_col_info[i+1]['x_avg']
        gaps.append(gap)
        is_small_1 = "(small)" if logical_col_info[i]['is_all_small'] else ""
        is_small_2 = "(small)" if logical_col_info[i+1]['is_all_small'] else ""
        print(f"  line_id={logical_col_info[i]['line_id']}{is_small_1} -> "
              f"line_id={logical_col_info[i+1]['line_id']}{is_small_2}: {gap:.0f}px")

    # Gap distribution statistics
    print(f"\nGap stats: min={min(gaps):.0f}, max={max(gaps):.0f}, "
          f"avg={np.mean(gaps):.0f}, std={np.std(gaps):.0f}")

    # Threshold: gaps under 150px are treated as the same physical column
    # (the two sub-columns of double-line small text sit about 112px apart);
    # gaps over 150px separate physical columns.
    MERGE_THRESHOLD = 150  # merge when the gap is below this value
    physical_columns = []
    current_group = [logical_col_info[0]]
    for i in range(1, len(logical_col_info)):
        gap = current_group[-1]['x_avg'] - logical_col_info[i]['x_avg']
        if gap < MERGE_THRESHOLD:
            # Small gap: same physical column
            current_group.append(logical_col_info[i])
        else:
            # Large gap: start a new physical column
            physical_columns.append(current_group)
            current_group = [logical_col_info[i]]
    physical_columns.append(current_group)
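
    # Worked example (hypothetical x-centers): logical columns at
    # x = 1800, 1688 and 1450 yield gaps of 112px and 238px. Since
    # 112 < MERGE_THRESHOLD, the first two merge (the two halves of a
    # double-line annotation); 238 >= MERGE_THRESHOLD starts a new
    # physical column.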
print(f"\n聚合结果: {len(physical_columns)} 个物理列")
print("-" * 80)
# 计算物理列的实际中心位置
pc_centers = []
for pc in physical_columns:
x_values = [lc['x_avg'] for lc in pc]
center = np.mean(x_values)
pc_centers.append(center)
# 计算列间距
print("\n物理列间距:")
for i in range(len(pc_centers) - 1):
gap = pc_centers[i] - pc_centers[i+1]
print(f" 物理列{i+1} -> 物理列{i+2}: {gap:.0f}px")
avg_gap = np.mean([pc_centers[i] - pc_centers[i+1] for i in range(len(pc_centers)-1)])
print(f" 平均间距: {avg_gap:.0f}px")
# 推断在10列网格中的位置
canvas_width = data['Width']
cell_width = canvas_width / 10
print(f"\n在10列网格中的映射 (列宽={cell_width:.0f}px):")
for pi, (pc, center) in enumerate(zip(physical_columns, pc_centers)):
grid_col = round((canvas_width - center) / cell_width)
if grid_col < 1: grid_col = 1
if grid_col > 10: grid_col = 10
print(f" 物理列{pi+1} (中心x={center:.0f}) -> 网格第{grid_col}")
    # Show detailed info for each physical column
    print("\n" + "=" * 80)
    print("Physical column details:")
    print("-" * 80)
    cell_height = data['Height'] / 25
    for pi, pc in enumerate(physical_columns):
        line_ids_in_pc = [lc['line_id'] for lc in pc]
        total_chars = sum(lc['char_count'] for lc in pc)
        total_small = sum(lc['small_count'] for lc in pc)
        print(f"\nPhysical column {pi+1}:")
        print(f"  contains line_ids: {line_ids_in_pc}")
        print(f"  total chars: {total_chars} (small: {total_small})")

        # Merge all characters in the group and sort top to bottom
        all_chars = []
        for lc in pc:
            all_chars.extend(lc['chars'])
        all_chars.sort(key=lambda c: c['y_center'])

        # Row assignment: derive the starting row from the topmost character
        first_y = all_chars[0]['y_center']
        start_row = int(first_y / cell_height)
        print(f"  starting row: row {start_row + 1} (y={first_y:.0f})")
        # Show the content structure
        big_chars_content = ''.join(c['char'] for c in all_chars if not c['is_small'])
        small_chars_content = ''.join(c['char'] for c in all_chars if c['is_small'])
        print(f"  large chars: {big_chars_content[:20]}"
              f"{'...' if len(big_chars_content) > 20 else ''}")
        if small_chars_content:
            print(f"  small chars: {small_chars_content}")
        # If there are small characters, analyze the double-line pairing
        small_chars = [c for c in all_chars if c['is_small']]
        if small_chars:
            x_centers = [c['x_center'] for c in small_chars]
            x_threshold = np.mean(x_centers)
            right_chars = sorted([c for c in small_chars if c['x_center'] >= x_threshold],
                                 key=lambda c: c['y_center'])
            left_chars = sorted([c for c in small_chars if c['x_center'] < x_threshold],
                                key=lambda c: c['y_center'])
            print("  double-line small char pairing:")
            print(f"    right sub-column ({len(right_chars)} chars): "
                  f"{''.join(c['char'] for c in right_chars)}")
            print(f"    left sub-column ({len(left_chars)} chars): "
                  f"{''.join(c['char'] for c in left_chars)}")


if __name__ == '__main__':
    analyze_physical_columns('/home/yuuko/test/0011B.json')
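
# Usage: run directly (the JSON path above is hard-coded; point it at your
# own OCR export to analyze a different page):
#   python3 analysis_v3.py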