#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
Analyze ancient-text OCR JSON data: understand the physical-column aggregation logic.
"""

import json
import numpy as np
from collections import defaultdict


def analyze_physical_columns(json_path):
    # Expected input: a JSON file with the fields used below (FileName, Width,
    # Height, chars, charMarking, coors, line_ids), where chars[i],
    # charMarking[i], coors[i] = (x1, y1, x2, y2) and line_ids[i] describe
    # the i-th OCR character box.
    with open(json_path, 'r', encoding='utf-8') as f:
        data = json.load(f)

    print("=" * 80)
    print(f"File: {data['FileName']}")
    print(f"Size: {data['Width']} × {data['Height']}")
    print("Layout: 10 columns × 25 rows")
    print("=" * 80)

    chars = data['chars']
    char_marking = data['charMarking']
    coors = data['coors']
    line_ids = data['line_ids']

    # Group characters by line_id (one logical column per line_id).
    logical_columns = defaultdict(list)
    for i in range(len(chars)):
        logical_columns[line_ids[i]].append({
            'index': i,
            'char': chars[i],
            'x1': coors[i][0],
            'y1': coors[i][1],
            'x2': coors[i][2],
            'y2': coors[i][3],
            'x_center': (coors[i][0] + coors[i][2]) / 2,
            'y_center': (coors[i][1] + coors[i][3]) / 2,
            # A non-empty charMarking entry marks a small (annotation) character.
            'is_small': len(char_marking[i]) > 0
        })

    # Analyze each logical column.
    print("\nLogical column (line_id) analysis:")
    print("-" * 80)

    logical_col_info = []
    for line_id in sorted(logical_columns.keys()):
        col_chars = logical_columns[line_id]
        x_centers = [c['x_center'] for c in col_chars]
        y_centers = [c['y_center'] for c in col_chars]

        x_min, x_max = min(x_centers), max(x_centers)
        x_avg = np.mean(x_centers)
        y_min, y_max = min(y_centers), max(y_centers)

        char_count = len(col_chars)
        small_count = sum(1 for c in col_chars if c['is_small'])
        big_count = char_count - small_count

        text = ''.join([c['char'] for c in col_chars])
        text_preview = text[:15] + '...' if len(text) > 15 else text

        print(f"line_id={line_id}: x range [{x_min:.0f}-{x_max:.0f}], mean x={x_avg:.0f}")
        print(f"  Characters: {char_count} (large: {big_count}, small: {small_count})")
        print(f"  Text: {text_preview}")

        logical_col_info.append({
            'line_id': line_id,
            'x_avg': x_avg,
            'x_min': x_min,
            'x_max': x_max,
            'chars': col_chars,
            'char_count': char_count,
            'small_count': small_count
        })

    # Try to aggregate logical columns into physical columns.
    print("\n" + "=" * 80)
    print("Physical column aggregation analysis:")
    print("-" * 80)

    # Width of each physical column.
    canvas_width = data['Width']
    total_physical_cols = 10
    cell_width = canvas_width / total_physical_cols
    print(f"Physical column width: {cell_width:.0f}px")

    # Sort logical columns by x coordinate (right to left, largest x first).
    logical_col_info.sort(key=lambda c: c['x_avg'], reverse=True)

    # Compute the ideal center of each of the 10 physical columns (right to left).
    physical_centers = []
    for i in range(total_physical_cols):
        center_x = canvas_width - (i + 0.5) * cell_width
        physical_centers.append(center_x)

    print(f"Ideal physical column centers: {[f'{x:.0f}' for x in physical_centers]}")

    # Map each logical column to the nearest physical column.
    physical_columns = defaultdict(list)
    for lc in logical_col_info:
        # Find the nearest physical column center.
        min_dist = float('inf')
        best_col = 0
        for i, center in enumerate(physical_centers):
            dist = abs(lc['x_avg'] - center)
            if dist < min_dist:
                min_dist = dist
                best_col = i
        physical_columns[best_col].append(lc)
        print(f"  line_id={lc['line_id']} (x={lc['x_avg']:.0f}) -> physical column {best_col+1} (distance {min_dist:.0f}px)")

    # Convert to a list of (index, logical columns) pairs.
    physical_columns_with_index = [(i, physical_columns[i]) for i in sorted(physical_columns.keys())]

    print(f"\nAggregation result: {len(physical_columns_with_index)} non-empty physical columns (out of 10)")
    print("-" * 80)

    for physical_col_idx, pc in physical_columns_with_index:
        line_ids_in_pc = [lc['line_id'] for lc in pc]
        x_avgs = [lc['x_avg'] for lc in pc]
        total_chars = sum(lc['char_count'] for lc in pc)
        total_small = sum(lc['small_count'] for lc in pc)

        print(f"\nPhysical column {physical_col_idx+1}:")
        print(f"  Contains line_ids: {line_ids_in_pc}")
        print(f"  Mean x coordinates: {[f'{x:.0f}' for x in x_avgs]}")
        print(f"  Total characters: {total_chars} (small: {total_small})")

        # Merge all characters and sort by y.
        all_chars = []
        for lc in pc:
            all_chars.extend(lc['chars'])
        all_chars.sort(key=lambda c: c['y_center'])

        # Show the character sequence ('B' for large characters, 's' for small ones).
        sequence = ""
        for c in all_chars:
            if c['is_small']:
                sequence += "s"  # small
            else:
sequence += "B" # Big print(f" 字符序列: {sequence}") # 计算每个字符对应的行号 cell_height = data['Height'] / 25 print(f" 行分配 (行高={cell_height:.0f}px):") for c in all_chars[:5]: # 只显示前5个 row = int(c['y_center'] / cell_height) char_type = "小" if c['is_small'] else "大" print(f" '{c['char']}' ({char_type}): y={c['y_center']:.0f} -> 第{row+1}行") if len(all_chars) > 5: print(f" ... 还有 {len(all_chars)-5} 个字符") # 分析双行小字的配对 print("\n" + "=" * 80) print("双行小字配对分析:") print("-" * 80) for physical_col_idx, pc in physical_columns_with_index: # 找这个物理列中的小字 all_chars = [] for lc in pc: all_chars.extend(lc['chars']) small_chars = [c for c in all_chars if c['is_small']] if not small_chars: continue print(f"\n物理列 {physical_col_idx+1} 的小字分析:") # 按x坐标分左右 x_centers = [c['x_center'] for c in small_chars] x_threshold = np.mean(x_centers) right_chars = sorted([c for c in small_chars if c['x_center'] >= x_threshold], key=lambda c: c['y_center']) left_chars = sorted([c for c in small_chars if c['x_center'] < x_threshold], key=lambda c: c['y_center']) print(f" 右列({len(right_chars)}字): {''.join([c['char'] for c in right_chars])}") print(f" 左列({len(left_chars)}字): {''.join([c['char'] for c in left_chars])}") # 配对 print(f" 配对结果:") max_len = max(len(right_chars), len(left_chars)) for i in range(max_len): r_char = right_chars[i]['char'] if i < len(right_chars) else '□' l_char = left_chars[i]['char'] if i < len(left_chars) else '□' print(f" 格{i+1}: [{r_char}|{l_char}]") if __name__ == '__main__': analyze_physical_columns('/home/yuuko/test/0011B.json')