SlideCombine/slide_combine.py

608 lines
22 KiB
Python
Raw Normal View History

2025-11-24 17:24:27 +08:00
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
PDF书签合并工具 - Python版本
用于从PDF文件夹中提取书签信息与TXT元数据文件合并
"""
import os
import re
import sys
import tkinter as tk
from tkinter import ttk, filedialog, messagebox, scrolledtext
from pathlib import Path
from typing import List, Dict, Optional, Tuple
from datetime import datetime
import logging
class BookmarkItem:
    """Data model for one bookmark: a title and the page label it points to."""

    def __init__(self, title: str = "", page: str = ""):
        # Both values are kept exactly as passed in; the page is stored as text.
        self.title, self.page = title, page

    def __str__(self):
        title, page = self.title, self.page
        return f"BookmarkItem(title='{title}', page='{page}')"
class DocumentMetadata:
    """Metadata record for one document together with its bookmark list.

    Every scalar field is a string that defaults to ``""``.
    ``table_of_contents`` holds bookmark objects; only their ``title`` and
    ``page`` attributes are read when the record is formatted.
    """

    def __init__(self):
        # Basic information
        self.title = ""
        self.other_titles = ""
        self.volume = ""
        self.isbn = ""
        # Creation and publication information
        self.creator = ""
        self.contributor = ""
        self.issued_date = ""
        self.publisher = ""
        self.place = ""
        # Classification and page information
        self.classification_number = ""
        self.page = ""
        # Bookmark table of contents
        self.table_of_contents: "List[BookmarkItem]" = []
        # Extended information
        self.subject = ""
        self.date = ""
        self.spatial = ""
        self.other_isbn = ""
        self.other_time = ""
        self.url = ""

    def to_formatted_string(self) -> str:
        """Render the record as ``key:value`` lines separated by newlines.

        "Other titles" is emitted only when it is non-empty; every other
        field is always emitted. Bookmarks with an empty title are skipped.

        Returns:
            The formatted text, one field or bookmark per line, without a
            trailing newline.
        """
        lines = [f"title:{self.title}"]
        if self.other_titles:
            lines.append(f"Other titles:{self.other_titles}")
        lines += [
            f"Volume:{self.volume}",
            f"ISBN:{self.isbn}",
            f"creator:{self.creator}",
            f"contributor:{self.contributor}",
            f"issuedDate:{self.issued_date}",
            f"publisher:{self.publisher}",
            f"place:{self.place}",
            f"Classification number:{self.classification_number}",
            f"page:{self.page}",
            "tableOfContents:",
        ]
        lines.extend(
            f"{bookmark.title}---------------{bookmark.page}<br/>"
            for bookmark in self.table_of_contents
            if bookmark.title
        )
        lines += [
            f"subject:{self.subject}",
            f"date:{self.date}",
            f"spatial:{self.spatial}",
            f"Other ISBN:{self.other_isbn}",
            f"Other time:{self.other_time}",
            f"url:{self.url}",
        ]
        # A real newline is required here; the escaped form "\\n" would glue
        # all fields together with a literal backslash-n.
        return "\n".join(lines)
class BookmarkExtractor:
    """Reads a FreePic2Pdf bookmark file and turns its lines into bookmarks."""

    # Encodings tried in order. "utf-8-sig" goes first so a leading BOM is
    # stripped instead of ending up inside the first bookmark title.
    _ENCODINGS = ("utf-8-sig", "utf-8", "gbk", "gb2312")
    # Fields on a bookmark line are separated by whitespace (tabs included)
    # and/or ASCII colons.
    _SEPARATOR_RE = re.compile(r"[\s:]+")
    _ARABIC_PAGE_RE = re.compile(r"^\d+$")
    _ROMAN_PAGE_RE = re.compile(r"^[IVXLCDMivxlcdm]+$")

    @staticmethod
    def _read_text(path: str) -> str:
        """Return the file's text using the first candidate encoding that decodes."""
        for encoding in BookmarkExtractor._ENCODINGS:
            try:
                with open(path, 'r', encoding=encoding) as f:
                    return f.read()
            except UnicodeDecodeError:
                continue
        # latin-1 maps every byte to a character, so this last resort cannot
        # raise a decode error.
        with open(path, 'r', encoding='latin-1') as f:
            return f.read()

    @staticmethod
    def extract_bookmarks_from_bkmk(bkmk_file_path: str) -> "List[BookmarkItem]":
        """Extract the list of bookmarks from a bkmk file.

        Args:
            bkmk_file_path: Path of the bookmark text file.

        Returns:
            One bookmark per line that ends in a page number; blank lines and
            lines that cannot be parsed are skipped.

        Raises:
            FileNotFoundError: If the file does not exist.
            Exception: If reading or parsing fails; the original error is
                chained as the cause.
        """
        if not os.path.exists(bkmk_file_path):
            raise FileNotFoundError(f"FreePic2Pdf_bkmk文件不存在: {bkmk_file_path}")
        bookmarks = []
        try:
            content = BookmarkExtractor._read_text(bkmk_file_path)
            for raw_line in content.split("\n"):
                line = raw_line.strip()
                if not line:
                    continue
                bookmark = BookmarkExtractor._parse_bookmark_line(line)
                if bookmark is not None:
                    bookmarks.append(bookmark)
        except Exception as ex:
            raise Exception(f"读取书签文件失败: {ex}") from ex
        return bookmarks

    @staticmethod
    def _parse_bookmark_line(line: str) -> "Optional[BookmarkItem]":
        """Parse one bookmark line; the last field is taken as the page.

        Returns:
            A bookmark whose title is the remaining fields joined by single
            spaces, or None when the line has fewer than two fields or its
            last field is not a page number.
        """
        parts = [p for p in BookmarkExtractor._SEPARATOR_RE.split(line) if p.strip()]
        if len(parts) < 2:
            return None
        page_part = parts[-1]
        # Validate before building the item so rejected lines cost nothing.
        if not BookmarkExtractor._is_page_number(page_part):
            return None
        bookmark = BookmarkItem()
        bookmark.page = page_part
        bookmark.title = " ".join(parts[:-1])
        return bookmark

    @staticmethod
    def _is_page_number(text: str) -> bool:
        """Return True when text is all digits or all Roman-numeral letters.

        The Roman check only looks at the letter set (I, V, X, L, C, D, M in
        either case); it does not verify that the numeral is well-formed.
        """
        if BookmarkExtractor._ARABIC_PAGE_RE.match(text):
            return True
        if BookmarkExtractor._ROMAN_PAGE_RE.match(text):
            return True
        return False
class FileProcessor:
    """Finds bookmark files, pairs them with TXT metadata and merges the result."""

    # Encodings tried in order when reading a TXT metadata file. The strict
    # UTF-8 codecs go first: GB-encoded Chinese text is normally rejected by
    # them, whereas UTF-8 bytes that happen to form valid GB sequences would
    # be silently accepted by the GB codecs and read as mojibake.
    # "utf-8-sig" also strips a BOM that would otherwise corrupt the first key.
    _ENCODINGS = ("utf-8-sig", "utf-8", "gbk", "gb2312")

    # TXT field name -> attribute name on the metadata object.
    _FIELD_ATTRIBUTES = {
        "title": "title",
        "Other titles": "other_titles",
        "Volume": "volume",
        "ISBN": "isbn",
        "creator": "creator",
        "contributor": "contributor",
        "issuedDate": "issued_date",
        "publisher": "publisher",
        "place": "place",
        "Classification number": "classification_number",
        "page": "page",
        "subject": "subject",
        "date": "date",
        "spatial": "spatial",
        "Other ISBN": "other_isbn",
        "Other time": "other_time",
        "url": "url",
    }

    @staticmethod
    def process_all_folders(pdf_root_path: str, txt_source_path: str) -> Dict[str, Dict]:
        """Process every folder under pdf_root_path that holds a bookmark file.

        Bookmark files (names starting with ``FreePic2Pdf_bkmk``) are grouped
        by the base name of their parent folder. For each file the TXT
        metadata named ``<folder name>.txt`` in txt_source_path is looked up,
        and all documents of a group are merged into one output text.

        Returns:
            A dict keyed by base name. Successful entries carry ``success``,
            ``base_filename``, ``source_files``, ``metadata_documents`` and
            ``output_content``; failed entries carry ``success``,
            ``base_filename`` and ``error_message``.

        Raises:
            Exception: If no bookmark file is found under pdf_root_path.
        """
        # Find every bkmk file.
        bkmk_files = []
        for root, _dirs, files in os.walk(pdf_root_path):
            for file in files:
                if file.startswith('FreePic2Pdf_bkmk'):
                    bkmk_files.append(os.path.join(root, file))
        if not bkmk_files:
            raise Exception(f"在路径 {pdf_root_path} 下未找到任何 FreePic2Pdf_bkmk 文件")

        # Group by base file name.
        file_groups = {}
        for bkmk_file in bkmk_files:
            folder_name = os.path.basename(os.path.dirname(bkmk_file))
            base_name = FileProcessor._get_base_filename(folder_name)
            file_groups.setdefault(base_name, []).append(bkmk_file)

        # Process each group; a failure in one group does not stop the others.
        results = {}
        for base_name, group_files in file_groups.items():
            try:
                group_files.sort()  # order the parts by path
                metadata_documents = []
                for bkmk_file in group_files:
                    folder_name = os.path.basename(os.path.dirname(bkmk_file))
                    txt_file = os.path.join(txt_source_path, f"{folder_name}.txt")
                    metadata = FileProcessor._create_metadata_from_files(txt_file, bkmk_file)
                    if metadata:
                        metadata_documents.append(metadata)
                combined_content = FileProcessor._combine_metadata_documents(metadata_documents)
                results[base_name] = {
                    'success': True,
                    'base_filename': base_name,
                    'source_files': group_files,
                    'metadata_documents': metadata_documents,
                    'output_content': combined_content
                }
            except Exception as ex:
                results[base_name] = {
                    'success': False,
                    'base_filename': base_name,
                    'error_message': str(ex)
                }
        return results

    @staticmethod
    def _get_base_filename(folder_name: str) -> str:
        """Return the part of folder_name before its first space.

        The whole name is returned when it has no space or starts with one.
        """
        space_index = folder_name.find(' ')
        return folder_name[:space_index] if space_index > 0 else folder_name

    @staticmethod
    def _create_metadata_from_files(txt_file: str, bkmk_file: str) -> "Optional[DocumentMetadata]":
        """Build a metadata record from a TXT file and a bookmark file.

        Either file may be missing; the matching part of the record then
        keeps its default (empty) value.
        """
        metadata = DocumentMetadata()
        if os.path.exists(txt_file):
            FileProcessor._read_metadata_from_txt(txt_file, metadata)
        if os.path.exists(bkmk_file):
            metadata.table_of_contents = BookmarkExtractor.extract_bookmarks_from_bkmk(bkmk_file)
        return metadata

    @staticmethod
    def _read_lines(path: str) -> List[str]:
        """Return the file's lines using the first candidate encoding that decodes."""
        for encoding in FileProcessor._ENCODINGS:
            try:
                with open(path, 'r', encoding=encoding) as f:
                    return f.readlines()
            except UnicodeDecodeError:
                continue
        # latin-1 maps every byte to a character, so this last resort cannot
        # raise a decode error.
        with open(path, 'r', encoding='latin-1') as f:
            return f.readlines()

    @staticmethod
    def _read_metadata_from_txt(txt_file: str, metadata: "DocumentMetadata"):
        """Fill metadata in place from the ``key:value`` lines of txt_file.

        Each non-blank line is split at its first ASCII colon. Lines without
        a colon and keys that are not known field names are ignored.

        Raises:
            Exception: If the file cannot be read; the original error is
                chained as the cause.
        """
        try:
            for raw_line in FileProcessor._read_lines(txt_file):
                line = raw_line.strip()
                if not line:
                    continue
                # Split once only, so values such as URLs keep their colons.
                key, separator, value = line.partition(':')
                if not separator:
                    continue
                attribute = FileProcessor._FIELD_ATTRIBUTES.get(key.strip())
                if attribute is not None:
                    setattr(metadata, attribute, value.strip())
        except Exception as ex:
            raise Exception(f"读取TXT文件 {txt_file} 失败: {ex}") from ex

    @staticmethod
    def _combine_metadata_documents(metadata_documents: "List[DocumentMetadata]") -> str:
        """Merge several metadata documents into one text.

        Returns:
            "" for an empty list, the single document's text for one
            document, otherwise the documents' texts with " <>" plus a
            newline between consecutive documents.
        """
        if not metadata_documents:
            return ""
        formatted_docs = [doc.to_formatted_string() for doc in metadata_documents]
        # With a single element join() returns it unchanged, so no special
        # case is needed. The separator must end in a real newline.
        return " <>\n".join(formatted_docs)
class SlideCombineGUI:
    """Tkinter front end of the PDF bookmark merge tool."""

    # Log level -> (line prefix, text colour). Unknown levels use "info".
    _LOG_STYLES = {
        "info": ("", "white"),
        "success": ("", "#00ff00"),
        "error": ("", "#ff6b6b"),
        "warning": ("⚠️", "#feca57"),
    }

    def __init__(self):
        self.root = tk.Tk()
        self.root.title("PDF书签合并工具 v2.0 - Python版")
        self.root.geometry("800x600")
        self.root.configure(bg='#f0f0f0')
        # Centre the window on the screen.
        self.center_window()
        # Build the widgets.
        self.create_widgets()
        # Configure logging.
        logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')

    def center_window(self):
        """Move the window so that it is centred on the screen."""
        self.root.update_idletasks()
        width = self.root.winfo_width()
        height = self.root.winfo_height()
        x = (self.root.winfo_screenwidth() // 2) - (width // 2)
        y = (self.root.winfo_screenheight() // 2) - (height // 2)
        self.root.geometry(f'{width}x{height}+{x}+{y}')

    def _add_path_row(self, parent, row, label_text, variable, browse_command):
        """Add one label + entry + browse-button group, using rows row and row + 1."""
        tk.Label(parent, text=label_text,
                 font=("微软雅黑", 10), bg='#f0f0f0').grid(row=row, column=0, columnspan=2,
                                                       sticky='w', padx=10, pady=5)
        tk.Entry(parent, textvariable=variable, width=60,
                 font=("微软雅黑", 9)).grid(row=row + 1, column=0, padx=10, sticky='ew')
        tk.Button(parent, text="浏览", command=browse_command,
                  bg='#4285f4', fg='white', font=("微软雅黑", 9),
                  relief='flat', padx=15).grid(row=row + 1, column=1, padx=10, pady=5)

    def create_widgets(self):
        """Create the title, the path selectors, the action buttons and the log pane."""
        # Main title
        title_frame = tk.Frame(self.root, bg='#f0f0f0')
        title_frame.pack(pady=10)
        tk.Label(title_frame, text="📄 PDF书签合并工具",
                 font=("微软雅黑", 16, "bold"), bg='#f0f0f0').pack()
        tk.Label(title_frame, text="v2.0 - Python版",
                 font=("微软雅黑", 10), bg='#f0f0f0', fg='#666').pack()

        # Path selection area
        path_frame = tk.LabelFrame(self.root, text="📁 路径选择",
                                   font=("微软雅黑", 12), bg='#f0f0f0')
        path_frame.pack(fill='x', padx=20, pady=10)
        self.pdf_path_var = tk.StringVar()
        self._add_path_row(path_frame, 0, "PDF文件夹路径含FreePic2Pdf_bkmk.txt文件",
                           self.pdf_path_var, self.browse_pdf_path)
        self.txt_path_var = tk.StringVar()
        self._add_path_row(path_frame, 2, "TXT源文件路径",
                           self.txt_path_var, self.browse_txt_path)
        self.output_path_var = tk.StringVar()
        self._add_path_row(path_frame, 4, "输出路径:",
                           self.output_path_var, self.browse_output_path)
        path_frame.columnconfigure(0, weight=1)

        # Action buttons
        button_frame = tk.Frame(self.root, bg='#f0f0f0')
        button_frame.pack(fill='x', padx=20, pady=10)
        tk.Button(button_frame, text="🚀 开始合并", command=self.start_merge,
                  bg='#34a853', fg='white', font=("微软雅黑", 11, "bold"),
                  relief='flat', padx=20, pady=10).pack(side='left', padx=5)
        tk.Button(button_frame, text="🔄 清空", command=self.clear_all,
                  bg='#fbbc04', fg='white', font=("微软雅黑", 10),
                  relief='flat', padx=15, pady=10).pack(side='left', padx=5)
        tk.Button(button_frame, text="❌ 退出", command=self.root.quit,
                  bg='#ea4335', fg='white', font=("微软雅黑", 10),
                  relief='flat', padx=15, pady=10).pack(side='left', padx=5)

        # Log display area
        log_frame = tk.LabelFrame(self.root, text="📊 处理日志",
                                  font=("微软雅黑", 12), bg='#f0f0f0')
        log_frame.pack(fill='both', expand=True, padx=20, pady=10)
        self.log_text = scrolledtext.ScrolledText(log_frame, height=15,
                                                  font=("Consolas", 9),
                                                  bg='#2d2d2d', fg='#00ff00')
        self.log_text.pack(fill='both', expand=True, padx=10, pady=10)
        self.log("PDF书签合并工具已启动", "info")

    def browse_pdf_path(self):
        """Ask for the PDF root folder and store the choice."""
        path = filedialog.askdirectory(title="选择包含PDF文件夹的路径")
        if path:
            self.pdf_path_var.set(path)
            self.log(f"已选择PDF路径: {path}", "info")

    def browse_txt_path(self):
        """Ask for the TXT source folder and store the choice."""
        path = filedialog.askdirectory(title="选择包含TXT源文件的路径")
        if path:
            self.txt_path_var.set(path)
            self.log(f"已选择TXT源路径: {path}", "info")

    def browse_output_path(self):
        """Ask for the output folder and store the choice."""
        path = filedialog.askdirectory(title="选择输出路径")
        if path:
            self.output_path_var.set(path)
            self.log(f"已选择输出路径: {path}", "info")

    def clear_all(self):
        """Clear the three path fields and the log pane."""
        self.pdf_path_var.set("")
        self.txt_path_var.set("")
        self.output_path_var.set("")
        self.log_text.delete("1.0", tk.END)
        self.log("界面已清空", "info")

    def log(self, message: str, level: str = "info"):
        """Append a timestamped line to the log pane and echo it to stdout.

        Args:
            message: Text to log.
            level: One of "info", "success", "error" or "warning"; any other
                value is styled like "info".
        """
        timestamp = datetime.now().strftime("%H:%M:%S")
        tag = level if level in self._LOG_STYLES else "info"
        prefix, color = self._LOG_STYLES[tag]
        # A real newline terminates the entry so each message gets its own line.
        log_message = f"[{timestamp}] {prefix} {message}\n"
        # The level colour is applied through a Text tag; untagged text would
        # fall back to the widget's default foreground.
        self.log_text.tag_configure(tag, foreground=color)
        self.log_text.insert(tk.END, log_message, tag)
        self.log_text.see(tk.END)
        self.root.update_idletasks()
        # Console output
        print(log_message.rstrip())

    def _set_buttons_state(self, state: str):
        """Set the state of every tk.Button that sits directly in a tk.Frame child of the root."""
        for widget in self.root.winfo_children():
            if isinstance(widget, tk.Frame):
                for child in widget.winfo_children():
                    if isinstance(child, tk.Button):
                        child.config(state=state)

    def start_merge(self):
        """Validate the three paths, run the merge and write one TXT per group.

        Progress and failures are written to the log pane; the overall
        outcome is reported in a message box. The action buttons are
        disabled while the merge runs and re-enabled afterwards.
        """
        # Validate the input.
        pdf_path = self.pdf_path_var.get().strip()
        txt_path = self.txt_path_var.get().strip()
        output_path = self.output_path_var.get().strip()
        if not pdf_path or not txt_path or not output_path:
            messagebox.showwarning("提示", "请选择所有三个路径PDF路径、TXT源路径和输出路径")
            return
        if not os.path.exists(pdf_path):
            messagebox.showerror("错误", f"指定的PDF文件夹路径不存在\n{pdf_path}")
            return
        if not os.path.exists(txt_path):
            messagebox.showerror("错误", f"指定的TXT源文件路径不存在\n{txt_path}")
            return

        self._set_buttons_state('disabled')
        self.log("开始处理PDF书签文件...", "info")
        try:
            results = FileProcessor.process_all_folders(pdf_path, txt_path)
            self.log(f"找到 {len(results)} 个文件组需要处理", "info")
            success_count = 0
            fail_count = 0
            for base_name, result in results.items():
                if not result['success']:
                    fail_count += 1
                    self.log(f"✗ 处理失败: {result['base_filename']} - {result['error_message']}", "error")
                    continue
                try:
                    # Kept inside the per-file try so a bad output folder is
                    # reported per file rather than aborting the whole run.
                    os.makedirs(output_path, exist_ok=True)
                    output_file = os.path.join(output_path, f"{base_name}.txt")
                    with open(output_file, 'w', encoding='utf-8-sig') as f:
                        f.write(result['output_content'])
                    success_count += 1
                    source_count = len(result['source_files'])
                    self.log(f"✓ 成功处理: {base_name} (合并了 {source_count} 个文件)", "success")
                except Exception as ex:
                    fail_count += 1
                    self.log(f"✗ 保存文件失败: {base_name} - {ex}", "error")
            self.log(f"处理完成! 成功: {success_count}, 失败: {fail_count}", "info")
            if success_count > 0:
                messagebox.showinfo("处理完成",
                                    f"书签合并完成!\n成功处理 {success_count} 个文件\n输出路径: {output_path}")
            else:
                messagebox.showwarning("处理失败", "没有成功处理任何文件,请检查输入路径和文件格式。")
        except Exception as ex:
            self.log(f"处理过程中发生错误: {ex}", "error")
            messagebox.showerror("错误", f"处理过程中发生错误:\n{ex}")
        finally:
            self._set_buttons_state('normal')

    def run(self):
        """Enter the Tk main loop."""
        self.root.mainloop()
def main():
    """Create the GUI and run it, reporting any start-up or runtime failure.

    The error is always printed; an error dialog is shown as well when Tk
    is able to display one.
    """
    try:
        app = SlideCombineGUI()
        app.run()
    except Exception as ex:
        print(f"程序启动失败: {ex}")
        try:
            messagebox.showerror("启动错误", f"程序启动失败:\n{ex}")
        except tk.TclError:
            # Tk itself may be what failed (for example no display is
            # available); the printed message above is then the only report.
            pass


# Run the GUI only when this file is executed as a script.
if __name__ == "__main__":
    main()