#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
Analyze OCR JSON data from ancient books to understand how double-line
small-character annotations are arranged on the page.
"""
import json
import sys
from collections import defaultdict

import numpy as np

# Input files analyzed when no path is given on the command line, paired with
# the header printed before each analysis.
_DEFAULT_INPUTS = (
    ("\n分析带双行小字的文件: 0011B.json",
     '/home/yuuko/test/0011B.json'),
    ("\n分析普通文件: 0003A.json",
     '/home/yuuko/test/ancient-ocr-viewer/data/0003A.json'),
)


def _x_center(box):
    """Return the horizontal center of a box whose items 0 and 2 are x1, x2."""
    return (box[0] + box[2]) / 2


def _cluster_columns(x_values, gap):
    """Cluster x coordinates into columns and return the cluster means.

    Distinct values are sorted ascending; a new cluster starts whenever the
    distance to the previous value exceeds ``gap``. The means are returned
    sorted from largest to smallest x (right to left). An empty input yields
    an empty list.
    """
    ordered = sorted(set(x_values))
    if not ordered:
        return []
    clusters = [[ordered[0]]]
    for x in ordered[1:]:
        if x - clusters[-1][-1] > gap:
            clusters.append([x])
        else:
            clusters[-1].append(x)
    return sorted((np.mean(cluster) for cluster in clusters), reverse=True)


def _consecutive_runs(indices):
    """Split a list of indices into runs of consecutive integers."""
    runs = []
    for idx in indices:
        if runs and idx == runs[-1][-1] + 1:
            runs[-1].append(idx)
        else:
            runs.append([idx])
    return runs


def _describe_small_group(number, group, chars, coors):
    """Print the text, x range, left/right split and first boxes of one group."""
    print(f"\n 组 {number}: 索引 {group[0]}-{group[-1]}, 共 {len(group)} 个字")
    group_text = ''.join(chars[i] for i in group)
    print(f" 内容: {group_text}")

    # Distribution of the x centers inside this group.
    group_x = [_x_center(coors[i]) for i in group]
    group_x_mean = np.mean(group_x)
    print(f" x坐标范围: [{min(group_x):.1f}, {max(group_x):.1f}], "
          f"平均: {group_x_mean:.1f}")

    # Try to split the group into a right and a left sub-column, using the
    # mean x center as the dividing line.
    threshold = group_x_mean
    right_col = [i for i, x in zip(group, group_x) if x >= threshold]
    left_col = [i for i, x in zip(group, group_x) if x < threshold]

    print(f" 右列(x >= {threshold:.1f}): {len(right_col)} 个字")
    if right_col:
        right_text = ''.join(chars[i] for i in right_col)
        print(f" 内容: {right_text}")
        print(f" 索引: {right_col[:5]}{'...' if len(right_col) > 5 else ''}")

    print(f" 左列(x < {threshold:.1f}): {len(left_col)} 个字")
    if left_col:
        left_text = ''.join(chars[i] for i in left_col)
        print(f" 内容: {left_text}")
        print(f" 索引: {left_col[:5]}{'...' if len(left_col) > 5 else ''}")

    # Vertical layout: only the first three characters are listed.
    print(" 垂直排列分析:")
    for idx in group[:3]:
        x1, y1, x2, y2 = coors[idx][0], coors[idx][1], coors[idx][2], coors[idx][3]
        print(f" 索引{idx} '{chars[idx]}': x=[{x1},{x2}], y=[{y1},{y2}]")


def analyze_ocr_json(json_path, column_gap=200):
    """Print a layout analysis of one OCR JSON file.

    The file is read as UTF-8 JSON and must provide the keys ``FileName``,
    ``Width``, ``Height``, ``CharNumber``, ``LineNumber``, ``chars``,
    ``charMarking`` and ``coors``. A character whose ``charMarking`` entry is
    empty is counted as a big (main text) character; any other character is
    counted as a small (double-line annotation) character. Each ``coors``
    entry is indexed as ``[x1, y1, x2, y2]``.

    Args:
        json_path: Path of the OCR JSON file.
        column_gap: Minimum distance between neighbouring distinct x centers
            of big characters for them to be placed in different columns.

    Returns:
        None. All results are printed to standard output.

    Raises:
        OSError: If the file cannot be opened.
        json.JSONDecodeError: If the file is not valid JSON.
        KeyError: If one of the required keys is missing.
    """
    with open(json_path, 'r', encoding='utf-8') as f:
        data = json.load(f)

    print("=" * 80)
    print(f"文件: {data['FileName']}")
    print(f"尺寸: {data['Width']} × {data['Height']}")
    print(f"字符数: {data['CharNumber']}, 行数: {data['LineNumber']}")
    print("=" * 80)

    chars = data['chars']
    char_marking = data['charMarking']
    coors = data['coors']

    # Classify characters: an empty marking means main text.
    big_chars = []
    small_chars = []
    for i, marking in enumerate(char_marking):
        if len(marking) == 0:
            big_chars.append(i)
        else:
            small_chars.append(i)

    print("\n字符分类:")
    print(f" 大字(正文): {len(big_chars)} 个")
    print(f" 小字(双行注): {len(small_chars)} 个")

    # Column detection uses only the big characters; a page without any big
    # character simply has no detected column instead of crashing.
    print("\n列分布分析:")
    columns = _cluster_columns(
        [_x_center(coors[i]) for i in big_chars], column_gap
    )
    print(f" 检测到 {len(columns)} 列")
    print(f" 列中心x坐标(从右到左): {[f'{x:.1f}' for x in columns]}")

    # Small characters with consecutive indices form one annotation group.
    print("\n双行小字分组分析:")
    small_groups = _consecutive_runs(small_chars)
    print(f" 发现 {len(small_groups)} 组双行小字:")
    for gi, group in enumerate(small_groups):
        _describe_small_group(gi + 1, group, chars, coors)

    # Character size statistics. Means are only computed for non-empty
    # lists, because np.mean of an empty list is nan and emits a warning.
    print("\n字符尺寸分析:")
    big_height_mean = big_width_mean = None
    if big_chars:
        big_heights = [coors[i][3] - coors[i][1] for i in big_chars]
        big_widths = [coors[i][2] - coors[i][0] for i in big_chars]
        big_height_mean = np.mean(big_heights)
        big_width_mean = np.mean(big_widths)
        print(f" 大字平均高度: {big_height_mean:.1f}px "
              f"(std: {np.std(big_heights):.1f})")
        print(f" 大字平均宽度: {big_width_mean:.1f}px "
              f"(std: {np.std(big_widths):.1f})")
    else:
        print(" 未检测到大字, 跳过大字尺寸统计")

    small_height_mean = None
    if small_chars:
        small_heights = [coors[i][3] - coors[i][1] for i in small_chars]
        small_widths = [coors[i][2] - coors[i][0] for i in small_chars]
        small_height_mean = np.mean(small_heights)
        print(f" 小字平均高度: {small_height_mean:.1f}px "
              f"(std: {np.std(small_heights):.1f})")
        print(f" 小字平均宽度: {np.mean(small_widths):.1f}px "
              f"(std: {np.std(small_widths):.1f})")
        # The ratio is only meaningful with a positive big-character height.
        if big_height_mean is not None and big_height_mean > 0:
            print(f" 小字/大字高度比: {small_height_mean / big_height_mean:.2f}")

    # Recommended virtual grid parameters; font sizes are 0.8 x mean height.
    print("\n推荐的虚拟网格参数:")
    print(f" 列数: {len(columns)}")
    if big_height_mean is not None:
        print(f" 标准列宽: {big_width_mean:.0f}px")
        print(f" 标准行高: {big_height_mean:.0f}px")
        print(f" 大字字号: {big_height_mean * 0.8:.0f}px")
    else:
        print(" 未检测到大字, 无法推荐列宽和行高")
    if small_height_mean is not None:
        print(f" 小字字号: {small_height_mean * 0.8:.0f}px")


def main(argv=None):
    """Analyze the files named on the command line, or the default samples.

    Args:
        argv: Paths to analyze. When None, ``sys.argv[1:]`` is used. When no
            path is given, the two built-in sample files are analyzed.
    """
    paths = sys.argv[1:] if argv is None else list(argv)
    if paths:
        jobs = [(f"\n分析文件: {path}", path) for path in paths]
    else:
        jobs = list(_DEFAULT_INPUTS)

    for position, (header, path) in enumerate(jobs):
        if position:
            # Separator between two consecutive reports.
            print("\n\n" + "=" * 80)
        print(header)
        analyze_ocr_json(path)


if __name__ == '__main__':
    main()