#!/usr/bin/env python3
# -*- coding: utf-8 -*-
|
||
|
||
"""
|
||
PDF书签合并工具 - Python版本
|
||
用于从PDF文件夹中提取书签信息,与TXT元数据文件合并
|
||
"""
|
||
|
||
import logging
import os
import re
import sys
import tkinter as tk
from collections import defaultdict
from datetime import datetime
from pathlib import Path
from tkinter import ttk, filedialog, messagebox, scrolledtext
from typing import List, Dict, Optional, Tuple
|
||
|
||
|
||
class BookmarkItem:
    """Data model for one bookmark entry: a title and its page label."""

    def __init__(self, title: str = "", page: str = ""):
        # Both values are kept as strings; the page may be a roman numeral.
        self.title, self.page = title, page

    def __str__(self):
        """Return a readable one-line representation of the bookmark."""
        text = f"BookmarkItem(title='{self.title}', page='{self.page}')"
        return text
|
||
|
||
|
||
class DocumentMetadata:
    """Metadata record for one document together with its bookmark list.

    Every scalar field is a string that defaults to ``""``.
    ``table_of_contents`` holds the bookmark entries in output order.
    """

    def __init__(self):
        # Basic information
        self.title = ""
        self.other_titles = ""
        self.volume = ""
        self.isbn = ""

        # Creation and publication information
        self.creator = ""
        self.contributor = ""
        self.issued_date = ""
        self.publisher = ""
        self.place = ""

        # Classification and page information
        self.classification_number = ""
        self.page = ""

        # Bookmark table of contents
        self.table_of_contents: List[BookmarkItem] = []

        # Extended information
        self.subject = ""
        self.date = ""
        self.spatial = ""
        self.other_isbn = ""
        self.other_time = ""
        self.url = ""

    def to_formatted_string(self) -> str:
        """Render the record as ``key:value`` lines separated by newlines.

        The ``Other titles`` line is emitted only when that field is
        non-empty; every other field is always written. Bookmarks follow the
        ``tableOfContents:`` line, one per line, and entries with an empty
        title are skipped.

        Returns:
            The formatted text, without a trailing newline.
        """
        result = [f"title:{self.title}"]
        if self.other_titles:
            result.append(f"Other titles:{self.other_titles}")
        result.extend([
            f"Volume:{self.volume}",
            f"ISBN:{self.isbn}",
            f"creator:{self.creator}",
            f"contributor:{self.contributor}",
            f"issuedDate:{self.issued_date}",
            f"publisher:{self.publisher}",
            f"place:{self.place}",
            f"Classification number:{self.classification_number}",
            f"page:{self.page}",
        ])

        # Bookmark table of contents
        result.append("tableOfContents:")
        result.extend(
            f"{bookmark.title}---------------{bookmark.page}<br/>"
            for bookmark in self.table_of_contents
            if bookmark.title
        )

        # Extended information
        result.extend([
            f"subject:{self.subject}",
            f"date:{self.date}",
            f"spatial:{self.spatial}",
            f"Other ISBN:{self.other_isbn}",
            f"Other time:{self.other_time}",
            f"url:{self.url}",
        ])

        # Join with a real newline: the fields are read back line by line, so
        # a literal backslash-n separator would collapse them into one line.
        return "\n".join(result)
|
||
|
||
|
||
class BookmarkExtractor:
    """Reads a ``FreePic2Pdf_bkmk`` text file and turns it into bookmarks."""

    # Encodings tried in order. ``utf-8-sig`` comes first so that a leading
    # byte-order mark is stripped instead of ending up in the first title.
    _ENCODINGS = ('utf-8-sig', 'utf-8', 'gbk', 'gb2312')

    @staticmethod
    def extract_bookmarks_from_bkmk(bkmk_file_path: str) -> List["BookmarkItem"]:
        """Extract the list of bookmarks from a bkmk file.

        Args:
            bkmk_file_path: Path of the bookmark text file.

        Returns:
            One ``BookmarkItem`` per line that ends in a page number, in file
            order. Blank lines and lines without a page number are skipped.

        Raises:
            FileNotFoundError: If ``bkmk_file_path`` does not exist.
            Exception: If reading or parsing fails; the original error is
                attached as ``__cause__``.
        """
        if not os.path.exists(bkmk_file_path):
            raise FileNotFoundError(f"FreePic2Pdf_bkmk文件不存在: {bkmk_file_path}")

        bookmarks = []
        try:
            content = BookmarkExtractor._read_text(bkmk_file_path)
            for raw_line in content.split('\n'):
                line = raw_line.strip()
                if not line:
                    continue
                bookmark = BookmarkExtractor._parse_bookmark_line(line)
                if bookmark is not None:
                    bookmarks.append(bookmark)
        except Exception as ex:
            raise Exception(f"读取书签文件失败: {ex}") from ex

        return bookmarks

    @staticmethod
    def _read_text(path: str) -> str:
        """Return the file's text, decoded with the first encoding that fits."""
        for encoding in BookmarkExtractor._ENCODINGS:
            try:
                with open(path, 'r', encoding=encoding) as f:
                    return f.read()
            except UnicodeDecodeError:
                continue
        # latin-1 maps every byte to a character, so this last resort cannot
        # raise UnicodeDecodeError.
        with open(path, 'r', encoding='latin-1') as f:
            return f.read()

    @staticmethod
    def _parse_bookmark_line(line: str) -> Optional["BookmarkItem"]:
        """Parse one bookmark line.

        The line is split on runs of whitespace and colons. The last piece is
        taken as the page number and the remaining pieces, joined by single
        spaces, form the title.

        Returns:
            The parsed bookmark, or ``None`` when the line has fewer than two
            pieces or its last piece is not a page number.
        """
        parts = [p for p in re.split(r'[\s:]+', line) if p.strip()]
        if len(parts) < 2:
            return None

        page_part = parts[-1]
        # Accept arabic and roman numerals as page labels.
        if not BookmarkExtractor._is_page_number(page_part):
            return None

        return BookmarkItem(title=" ".join(parts[:-1]), page=page_part)

    @staticmethod
    def _is_page_number(text: str) -> bool:
        """Return True when ``text`` is all digits or all roman-numeral letters."""
        return re.fullmatch(r'\d+|[IVXLCDMivxlcdm]+', text) is not None
|
||
|
||
|
||
class FileProcessor:
    """Finds bookmark files, pairs them with TXT metadata and merges them."""

    # Encodings tried in order when reading a TXT metadata file. UTF-8 goes
    # first because it is the stricter codec: GBK-encoded Chinese text is
    # almost never valid UTF-8, whereas UTF-8 bytes often decode under GBK
    # without an error and silently turn into mojibake.
    _ENCODINGS = ('utf-8-sig', 'utf-8', 'gbk', 'gb2312')

    # Field name in the TXT file -> attribute name on DocumentMetadata.
    _FIELD_ATTRIBUTES = {
        "title": "title",
        "Other titles": "other_titles",
        "Volume": "volume",
        "ISBN": "isbn",
        "creator": "creator",
        "contributor": "contributor",
        "issuedDate": "issued_date",
        "publisher": "publisher",
        "place": "place",
        "Classification number": "classification_number",
        "page": "page",
        "subject": "subject",
        "date": "date",
        "spatial": "spatial",
        "Other ISBN": "other_isbn",
        "Other time": "other_time",
        "url": "url",
    }

    @staticmethod
    def process_all_folders(pdf_root_path: str, txt_source_path: str) -> Dict[str, Dict]:
        """Process every bookmark file found below ``pdf_root_path``.

        Bookmark files are grouped by the base name of their parent folder
        (see ``_get_base_filename``). Each group is merged into one output
        text.

        Args:
            pdf_root_path: Directory tree searched for files whose name
                starts with ``FreePic2Pdf_bkmk``.
            txt_source_path: Directory holding ``<folder name>.txt`` metadata
                files.

        Returns:
            A dict keyed by base name. Successful entries carry ``success``,
            ``base_filename``, ``source_files``, ``metadata_documents`` and
            ``output_content``; failed entries carry ``success``,
            ``base_filename`` and ``error_message``.

        Raises:
            Exception: If no bookmark file is found at all.
        """
        # Find all bkmk files
        bkmk_files = []
        for root, _dirs, files in os.walk(pdf_root_path):
            for file in files:
                if file.startswith('FreePic2Pdf_bkmk'):
                    bkmk_files.append(os.path.join(root, file))

        if not bkmk_files:
            raise Exception(f"在路径 {pdf_root_path} 下未找到任何 FreePic2Pdf_bkmk 文件")

        # Group by base file name
        file_groups = defaultdict(list)
        for bkmk_file in bkmk_files:
            folder_name = os.path.basename(os.path.dirname(bkmk_file))
            file_groups[FileProcessor._get_base_filename(folder_name)].append(bkmk_file)

        # Process each group; a failure in one group does not stop the others.
        results = {}
        for base_name, group_files in file_groups.items():
            try:
                group_files.sort()  # order the parts by path
                metadata_documents = []

                for bkmk_file in group_files:
                    # The TXT file is named after the bookmark file's folder.
                    folder_name = os.path.basename(os.path.dirname(bkmk_file))
                    txt_file = os.path.join(txt_source_path, f"{folder_name}.txt")

                    metadata = FileProcessor._create_metadata_from_files(txt_file, bkmk_file)
                    if metadata is not None:
                        metadata_documents.append(metadata)

                combined_content = FileProcessor._combine_metadata_documents(metadata_documents)

                results[base_name] = {
                    'success': True,
                    'base_filename': base_name,
                    'source_files': group_files,
                    'metadata_documents': metadata_documents,
                    'output_content': combined_content
                }
            except Exception as ex:
                results[base_name] = {
                    'success': False,
                    'base_filename': base_name,
                    'error_message': str(ex)
                }

        return results

    @staticmethod
    def _get_base_filename(folder_name: str) -> str:
        """Return the part of ``folder_name`` before its first space.

        The name is returned unchanged when it has no space or starts with
        one.
        """
        space_index = folder_name.find(' ')
        return folder_name[:space_index] if space_index > 0 else folder_name

    @staticmethod
    def _create_metadata_from_files(txt_file: str, bkmk_file: str) -> Optional["DocumentMetadata"]:
        """Build a metadata record from a TXT file and a bookmark file.

        Either file may be missing; the corresponding part of the record then
        keeps its default value.
        """
        metadata = DocumentMetadata()

        # Read metadata from the TXT file
        if os.path.exists(txt_file):
            FileProcessor._read_metadata_from_txt(txt_file, metadata)

        # Extract bookmarks from the bkmk file
        if os.path.exists(bkmk_file):
            metadata.table_of_contents = BookmarkExtractor.extract_bookmarks_from_bkmk(bkmk_file)

        return metadata

    @staticmethod
    def _read_lines(path: str) -> List[str]:
        """Return the file's lines, decoded with the first encoding that fits."""
        for encoding in FileProcessor._ENCODINGS:
            try:
                with open(path, 'r', encoding=encoding) as f:
                    return f.readlines()
            except UnicodeDecodeError:
                continue
        # latin-1 maps every byte to a character, so this last resort cannot
        # raise UnicodeDecodeError.
        with open(path, 'r', encoding='latin-1') as f:
            return f.readlines()

    @staticmethod
    def _read_metadata_from_txt(txt_file: str, metadata: "DocumentMetadata"):
        """Fill ``metadata`` from the ``key:value`` lines of a TXT file.

        Each line is split at its first colon. Lines without a colon and
        lines whose key is not a known field name are ignored.

        Raises:
            Exception: If the file cannot be read; the original error is
                attached as ``__cause__``.
        """
        try:
            for raw_line in FileProcessor._read_lines(txt_file):
                line = raw_line.strip()
                if not line:
                    continue

                # Split at the first colon only, so values may contain colons.
                key, separator, value = line.partition(':')
                if not separator:
                    continue

                attribute = FileProcessor._FIELD_ATTRIBUTES.get(key.strip())
                if attribute is not None:
                    setattr(metadata, attribute, value.strip())
        except Exception as ex:
            raise Exception(f"读取TXT文件 {txt_file} 失败: {ex}") from ex

    @staticmethod
    def _combine_metadata_documents(metadata_documents: List["DocumentMetadata"]) -> str:
        """Merge several metadata records into one output text.

        Returns:
            ``""`` for an empty list, the single record's text for one
            record, otherwise the records' texts joined by `` <>`` followed
            by a newline.
        """
        if not metadata_documents:
            return ""

        formatted_docs = [doc.to_formatted_string() for doc in metadata_documents]
        # Joining a single element returns it unchanged, so no special case.
        return " <>\n".join(formatted_docs)
|
||
|
||
|
||
class SlideCombineGUI:
    """Tkinter front end of the PDF bookmark merge tool."""

    # Prefix icon and text colour per log level. Unknown levels use "info".
    _LOG_STYLES = {
        "info": ("ℹ️", "white"),
        "success": ("✅", "#00ff00"),
        "error": ("❌", "#ff6b6b"),
        "warning": ("⚠️", "#feca57"),
    }

    def __init__(self):
        """Create the main window, build the widgets and configure logging."""
        self.root = tk.Tk()
        self.root.title("PDF书签合并工具 v2.0 - Python版")
        self.root.geometry("800x600")
        self.root.configure(bg='#f0f0f0')

        # Centre the window on the screen
        self.center_window()

        # Build the interface
        self.create_widgets()

        # Configure logging
        logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')

    def center_window(self):
        """Move the window so that it is centred on the screen."""
        # Flush pending geometry requests so the reported size is current.
        self.root.update_idletasks()
        width = self.root.winfo_width()
        height = self.root.winfo_height()
        x = (self.root.winfo_screenwidth() // 2) - (width // 2)
        y = (self.root.winfo_screenheight() // 2) - (height // 2)
        self.root.geometry(f'{width}x{height}+{x}+{y}')

    def create_widgets(self):
        """Build the title, the path selectors, the buttons and the log area."""
        # Main title
        title_frame = tk.Frame(self.root, bg='#f0f0f0')
        title_frame.pack(pady=10)

        tk.Label(title_frame, text="📄 PDF书签合并工具",
                 font=("微软雅黑", 16, "bold"), bg='#f0f0f0').pack()
        tk.Label(title_frame, text="v2.0 - Python版",
                 font=("微软雅黑", 10), bg='#f0f0f0', fg='#666').pack()

        # Path selection area
        path_frame = tk.LabelFrame(self.root, text="📁 路径选择",
                                   font=("微软雅黑", 12), bg='#f0f0f0')
        path_frame.pack(fill='x', padx=20, pady=10)

        self.pdf_path_var = tk.StringVar()
        self._add_path_row(path_frame, 0, "PDF文件夹路径(含FreePic2Pdf_bkmk.txt文件):",
                           self.pdf_path_var, self.browse_pdf_path)

        self.txt_path_var = tk.StringVar()
        self._add_path_row(path_frame, 2, "TXT源文件路径:",
                           self.txt_path_var, self.browse_txt_path)

        self.output_path_var = tk.StringVar()
        self._add_path_row(path_frame, 4, "输出路径:",
                           self.output_path_var, self.browse_output_path)

        # Let the entry column absorb extra horizontal space.
        path_frame.columnconfigure(0, weight=1)

        # Action buttons
        button_frame = tk.Frame(self.root, bg='#f0f0f0')
        button_frame.pack(fill='x', padx=20, pady=10)

        tk.Button(button_frame, text="🚀 开始合并", command=self.start_merge,
                  bg='#34a853', fg='white', font=("微软雅黑", 11, "bold"),
                  relief='flat', padx=20, pady=10).pack(side='left', padx=5)

        tk.Button(button_frame, text="🔄 清空", command=self.clear_all,
                  bg='#fbbc04', fg='white', font=("微软雅黑", 10),
                  relief='flat', padx=15, pady=10).pack(side='left', padx=5)

        tk.Button(button_frame, text="❌ 退出", command=self.root.quit,
                  bg='#ea4335', fg='white', font=("微软雅黑", 10),
                  relief='flat', padx=15, pady=10).pack(side='left', padx=5)

        # Log display area
        log_frame = tk.LabelFrame(self.root, text="📊 处理日志",
                                  font=("微软雅黑", 12), bg='#f0f0f0')
        log_frame.pack(fill='both', expand=True, padx=20, pady=10)

        self.log_text = scrolledtext.ScrolledText(log_frame, height=15,
                                                  font=("Consolas", 9),
                                                  bg='#2d2d2d', fg='#00ff00')
        self.log_text.pack(fill='both', expand=True, padx=10, pady=10)

        self.log("PDF书签合并工具已启动", "info")

    def _add_path_row(self, parent, row, label_text, variable, command):
        """Add a caption on ``row`` and an entry plus browse button on ``row + 1``."""
        tk.Label(parent, text=label_text,
                 font=("微软雅黑", 10), bg='#f0f0f0').grid(row=row, column=0, columnspan=2,
                                                       sticky='w', padx=10, pady=5)
        tk.Entry(parent, textvariable=variable, width=60,
                 font=("微软雅黑", 9)).grid(row=row + 1, column=0, padx=10, sticky='ew')
        tk.Button(parent, text="浏览", command=command,
                  bg='#4285f4', fg='white', font=("微软雅黑", 9),
                  relief='flat', padx=15).grid(row=row + 1, column=1, padx=10, pady=5)

    def browse_pdf_path(self):
        """Ask for the PDF root directory and store the choice."""
        path = filedialog.askdirectory(title="选择包含PDF文件夹的路径")
        if path:
            self.pdf_path_var.set(path)
            self.log(f"已选择PDF路径: {path}", "info")

    def browse_txt_path(self):
        """Ask for the TXT source directory and store the choice."""
        path = filedialog.askdirectory(title="选择包含TXT源文件的路径")
        if path:
            self.txt_path_var.set(path)
            self.log(f"已选择TXT源路径: {path}", "info")

    def browse_output_path(self):
        """Ask for the output directory and store the choice."""
        path = filedialog.askdirectory(title="选择输出路径")
        if path:
            self.output_path_var.set(path)
            self.log(f"已选择输出路径: {path}", "info")

    def clear_all(self):
        """Reset the three path fields and empty the log area."""
        self.pdf_path_var.set("")
        self.txt_path_var.set("")
        self.output_path_var.set("")
        self.log_text.delete(1.0, tk.END)
        self.log("界面已清空", "info")

    def log(self, message: str, level: str = "info"):
        """Append a timestamped line to the log area and echo it to stdout.

        Args:
            message: Text to log.
            level: One of ``info``, ``success``, ``error`` or ``warning``;
                any other value is shown like ``info``.
        """
        timestamp = datetime.now().strftime("%H:%M:%S")
        style_key = level if level in self._LOG_STYLES else "info"
        prefix, color = self._LOG_STYLES[style_key]

        log_message = f"[{timestamp}] {prefix} {message}\n"

        # A text tag carries the level's colour; without it every line would
        # use the widget's default foreground.
        tag = f"log_{style_key}"
        self.log_text.tag_config(tag, foreground=color)
        self.log_text.insert(tk.END, log_message, tag)
        self.log_text.see(tk.END)
        self.root.update_idletasks()

        # Console output
        print(log_message.rstrip())

    def _set_buttons_state(self, state):
        """Set ``state`` on every button that sits directly in a ``tk.Frame``."""
        for widget in self.root.winfo_children():
            if isinstance(widget, tk.Frame):
                for child in widget.winfo_children():
                    if isinstance(child, tk.Button):
                        child.config(state=state)

    def start_merge(self):
        """Validate the paths, run the merge and write one file per group.

        Progress and errors go to the log area; a message box reports the
        overall outcome. The action buttons are disabled while the merge runs
        and are always re-enabled afterwards.
        """
        # Validate the input
        pdf_path = self.pdf_path_var.get().strip()
        txt_path = self.txt_path_var.get().strip()
        output_path = self.output_path_var.get().strip()

        if not pdf_path or not txt_path or not output_path:
            messagebox.showwarning("提示", "请选择所有三个路径:PDF路径、TXT源路径和输出路径")
            return

        if not os.path.exists(pdf_path):
            messagebox.showerror("错误", f"指定的PDF文件夹路径不存在\n{pdf_path}")
            return

        if not os.path.exists(txt_path):
            messagebox.showerror("错误", f"指定的TXT源文件路径不存在\n{txt_path}")
            return

        # Disable the buttons
        self._set_buttons_state('disabled')

        # Start processing
        self.log("开始处理PDF书签文件...", "info")

        try:
            results = FileProcessor.process_all_folders(pdf_path, txt_path)

            self.log(f"找到 {len(results)} 个文件组需要处理", "info")

            # Save the results
            success_count = 0
            fail_count = 0

            for base_name, result in results.items():
                if not result['success']:
                    fail_count += 1
                    self.log(f"✗ 处理失败: {result['base_filename']} - {result['error_message']}", "error")
                    continue

                try:
                    # Make sure the output directory exists
                    os.makedirs(output_path, exist_ok=True)

                    # Save the file
                    output_file = os.path.join(output_path, f"{base_name}.txt")
                    with open(output_file, 'w', encoding='utf-8-sig') as f:
                        f.write(result['output_content'])

                    success_count += 1
                    source_count = len(result['source_files'])
                    self.log(f"✓ 成功处理: {base_name} (合并了 {source_count} 个文件)", "success")
                except Exception as ex:
                    fail_count += 1
                    self.log(f"✗ 保存文件失败: {base_name} - {ex}", "error")

            self.log(f"处理完成! 成功: {success_count}, 失败: {fail_count}", "info")

            if success_count > 0:
                messagebox.showinfo("处理完成",
                                    f"书签合并完成!\n成功处理 {success_count} 个文件\n输出路径: {output_path}")
            else:
                messagebox.showwarning("处理失败", "没有成功处理任何文件,请检查输入路径和文件格式。")

        except Exception as ex:
            self.log(f"处理过程中发生错误: {ex}", "error")
            messagebox.showerror("错误", f"处理过程中发生错误:\n{ex}")

        finally:
            # Re-enable the buttons
            self._set_buttons_state('normal')

    def run(self):
        """Enter the Tk main loop; returns when the loop is quit."""
        self.root.mainloop()
|
||
|
||
|
||
def main():
    """Start the GUI and report a start-up failure to the user."""
    try:
        app = SlideCombineGUI()
        app.run()
    except Exception as ex:
        print(f"程序启动失败: {ex}")
        try:
            messagebox.showerror("启动错误", f"程序启动失败:\n{ex}")
        except tk.TclError:
            # Tk itself is unusable (for example no display), so no dialog
            # can be shown; the failure was already printed above.
            pass
|
||
|
||
|
||
# Run the GUI only when the file is executed as a script, not when imported.
if __name__ == "__main__":
    main()