2025-11-24 15:44:37 +08:00
|
|
|
|
using System;
|
|
|
|
|
|
using System.Collections.Generic;
|
|
|
|
|
|
using System.IO;
|
|
|
|
|
|
using System.Linq;
|
|
|
|
|
|
using System.Text;
|
2025-11-24 18:41:20 +08:00
|
|
|
|
using System.Text.RegularExpressions;
|
2025-11-24 15:44:37 +08:00
|
|
|
|
|
|
|
|
|
|
namespace SlideCombine
|
|
|
|
|
|
{
|
2025-11-24 18:41:20 +08:00
|
|
|
|
/// <summary>
/// Orders bookmark file paths by the first number found in the name of the folder
/// that directly contains each file (for example "CH-875 1-3", "CH-875 4-6",
/// "CH-875 10-12", "Volume 2", "Part 1").
/// </summary>
/// <remarks>
/// Ordering rules, in priority order:
/// null paths sort first; paths whose folder name yields a number sort before those
/// that do not; two numbered paths sort by numeric value; any remaining tie is broken
/// by a case-insensitive ordinal comparison of the full paths.
/// </remarks>
public class BkmkFileComparer : IComparer<string>
{
    // An optional "word + whitespace" prefix followed by the first run of digits.
    // Kept in a static field so the sort does not re-resolve the pattern on every comparison.
    private static readonly Regex FolderNumberPattern = new Regex(@"(?:[\w-]+\s+)?(\d+)");

    /// <summary>Compares two bookmark file paths using the rules described on the class.</summary>
    /// <param name="x">First file path; may be null.</param>
    /// <param name="y">Second file path; may be null.</param>
    /// <returns>A negative value, zero, or a positive value when <paramref name="x"/> sorts before, equal to, or after <paramref name="y"/>.</returns>
    public int Compare(string x, string y)
    {
        if (x == null && y == null)
        {
            return 0;
        }
        if (x == null)
        {
            return -1;
        }
        if (y == null)
        {
            return 1;
        }

        long? xNumber = ExtractNumberFromFolder(GetContainingFolderName(x));
        long? yNumber = ExtractNumberFromFolder(GetContainingFolderName(y));

        if (xNumber.HasValue && yNumber.HasValue)
        {
            int result = xNumber.Value.CompareTo(yNumber.Value);
            if (result != 0)
            {
                return result;
            }
            // Equal numbers fall through to the full-path comparison below.
        }
        else if (xNumber.HasValue)
        {
            // Only x carries a number: numbered folders come first.
            return -1;
        }
        else if (yNumber.HasValue)
        {
            return 1;
        }

        return string.Compare(x, y, StringComparison.OrdinalIgnoreCase);
    }

    /// <summary>
    /// Returns the name of the folder that directly contains <paramref name="path"/>,
    /// or null when the path has no containing folder (empty path or a root path).
    /// </summary>
    private static string GetContainingFolderName(string path)
    {
        if (string.IsNullOrEmpty(path))
        {
            return null;
        }

        // GetDirectoryName yields null for a root path; GetFileName passes null through.
        return Path.GetFileName(Path.GetDirectoryName(path));
    }

    /// <summary>
    /// Extracts the first number from a folder name.
    /// </summary>
    /// <param name="folderName">Folder name to inspect; may be null or empty.</param>
    /// <returns>The parsed number, or null when there is no folder name, no digits, or the digits do not fit in a 64-bit integer.</returns>
    private long? ExtractNumberFromFolder(string folderName)
    {
        if (string.IsNullOrEmpty(folderName))
        {
            return null;
        }

        var match = FolderNumberPattern.Match(folderName);
        // long rather than int so long numeric stamps in folder names still sort numerically.
        if (match.Success && long.TryParse(match.Groups[1].Value, out long number))
        {
            return number;
        }

        return null;
    }
}
|
2025-11-24 15:44:37 +08:00
|
|
|
|
/// <summary>
/// Outcome of processing one group of bookmark files, or a single failure entry
/// when processing could not start at all.
/// </summary>
public class ProcessResult
{
    /// <summary>
    /// Name shared by the group. <c>FileMerger.SaveResults</c> appends ".txt" to it to
    /// form the output file name. Null on a failure entry that is not tied to a group.
    /// </summary>
    public string BaseFileName { get; set; }

    /// <summary>
    /// Bookmark file paths that belong to the group. Defaults to an empty list so
    /// failure entries can be enumerated without a null check.
    /// </summary>
    public List<string> SourceFiles { get; set; } = new List<string>();

    /// <summary>Combined text produced for the group; only written to disk when non-empty.</summary>
    public string OutputContent { get; set; }

    /// <summary>True when the group was processed without an exception.</summary>
    public bool Success { get; set; }

    /// <summary>Human-readable failure description; set when <see cref="Success"/> is false.</summary>
    public string ErrorMessage { get; set; }

    /// <summary>
    /// Metadata documents built for the files in the group. Defaults to an empty list
    /// so failure entries can be enumerated without a null check.
    /// </summary>
    public List<DocumentMetadata> MetadataDocuments { get; set; } = new List<DocumentMetadata>();
}
|
|
|
|
|
|
|
|
|
|
|
|
/// <summary>
/// Finds FreePic2Pdf bookmark files under a root folder, pairs each with a TXT metadata
/// file, and produces one combined text document per group of folders.
/// </summary>
public class FileMerger
{
    // Bookmark file names searched for: the extensionless form first, then the ".txt" form.
    private static readonly string[] BookmarkFileNames = { "FreePic2Pdf_bkmk", "FreePic2Pdf_bkmk.txt" };

    /// <summary>
    /// Scans <paramref name="pdfRootPath"/> recursively for bookmark files, groups them by the
    /// part of their folder name that precedes the first space, and processes each group.
    /// </summary>
    /// <param name="pdfRootPath">Root folder searched recursively for bookmark files.</param>
    /// <param name="txtSourcePath">Folder holding the "&lt;folder name&gt;.txt" metadata files.</param>
    /// <param name="txtOutputPath">Not used by this method; kept so existing callers keep compiling.</param>
    /// <returns>
    /// One result per group, in order of each group's first bookmark file. Any failure while
    /// scanning or validating is reported as a single unsuccessful result rather than thrown.
    /// </returns>
    public static List<ProcessResult> ProcessAllFolders(string pdfRootPath, string txtSourcePath, string txtOutputPath)
    {
        var results = new List<ProcessResult>();

        try
        {
            // Collect every bookmark file (with or without the .txt extension).
            var bkmkFiles = new List<string>();
            foreach (var fileName in BookmarkFileNames)
            {
                bkmkFiles.AddRange(Directory.GetFiles(pdfRootPath, fileName, SearchOption.AllDirectories));
            }

            if (bkmkFiles.Count == 0)
            {
                throw new FileNotFoundException($"在路径 {pdfRootPath} 下未找到任何 FreePic2Pdf_bkmk 或 FreePic2Pdf_bkmk.txt 文件");
            }

            // The TXT source folder must exist before any group is processed.
            if (!Directory.Exists(txtSourcePath))
            {
                throw new DirectoryNotFoundException($"TXT源文件路径不存在: {txtSourcePath}");
            }

            // Group by base name (folder name up to the first space). GroupBy yields
            // groups in order of first occurrence, so the result order is deterministic.
            var fileGroups = bkmkFiles.GroupBy(
                file => GetBaseFileName(new DirectoryInfo(Path.GetDirectoryName(file)).Name));

            var comparer = new BkmkFileComparer();
            foreach (var group in fileGroups)
            {
                var orderedFiles = group.OrderBy(file => file, comparer).ToList();
                results.Add(ProcessFileGroup(group.Key, orderedFiles, txtSourcePath));
            }
        }
        catch (Exception ex)
        {
            // Report the failure as a result entry; results gathered so far are kept.
            results.Add(new ProcessResult
            {
                Success = false,
                ErrorMessage = ex.Message
            });
        }

        return results;
    }

    /// <summary>
    /// Returns the part of <paramref name="folderName"/> before its first space, or the
    /// whole name when it has no space or starts with one.
    /// </summary>
    private static string GetBaseFileName(string folderName)
    {
        var spaceIndex = folderName.IndexOf(' ');
        return spaceIndex > 0 ? folderName.Substring(0, spaceIndex) : folderName;
    }

    /// <summary>
    /// Builds a metadata document for every bookmark file in the group and combines them
    /// into <see cref="ProcessResult.OutputContent"/>.
    /// </summary>
    /// <param name="baseName">Group name, stored as the result's base file name.</param>
    /// <param name="bkmkFiles">Bookmark files of the group, already in processing order.</param>
    /// <param name="txtSourcePath">Folder holding the TXT metadata files.</param>
    /// <returns>A result that is unsuccessful, with an error message, if any file in the group fails.</returns>
    private static ProcessResult ProcessFileGroup(string baseName, List<string> bkmkFiles, string txtSourcePath)
    {
        var result = new ProcessResult
        {
            BaseFileName = baseName,
            SourceFiles = bkmkFiles,
            Success = true,
            MetadataDocuments = new List<DocumentMetadata>()
        };

        try
        {
            foreach (var bkmkFile in bkmkFiles)
            {
                // The TXT path is null when no matching metadata file exists.
                var txtFile = GetCorrespondingTxtFile(bkmkFile, txtSourcePath);

                var metadata = CreateMetadataFromFiles(txtFile, bkmkFile);
                if (metadata != null)
                {
                    result.MetadataDocuments.Add(metadata);
                }
            }

            // Merge all metadata documents of the group into the final text.
            result.OutputContent = ContentFormatter.CombineFormattedMetadataDocuments(result.MetadataDocuments);
        }
        catch (Exception ex)
        {
            result.Success = false;
            result.ErrorMessage = $"处理文件组 {baseName} 时出错: {ex.Message}";
        }

        return result;
    }

    /// <summary>
    /// Locates "&lt;folder name&gt;.txt" in <paramref name="txtSourcePath"/>, where the folder
    /// name is that of the folder containing <paramref name="bkmkFile"/>.
    /// </summary>
    /// <returns>The TXT file path, or null when that file does not exist.</returns>
    private static string GetCorrespondingTxtFile(string bkmkFile, string txtSourcePath)
    {
        var directory = Path.GetDirectoryName(bkmkFile);
        var folderName = new DirectoryInfo(directory).Name;

        var txtFile = Path.Combine(txtSourcePath, $"{folderName}.txt");
        return File.Exists(txtFile) ? txtFile : null;
    }

    /// <summary>
    /// Creates a metadata document, filling fields from the TXT file and the table of
    /// contents from the bookmark file. Either source is skipped when its file is missing
    /// (a null <paramref name="txtFile"/> counts as missing).
    /// </summary>
    private static DocumentMetadata CreateMetadataFromFiles(string txtFile, string bkmkFile)
    {
        var metadata = new DocumentMetadata();

        if (File.Exists(txtFile))
        {
            ReadMetadataFromTxt(txtFile, metadata);
        }

        if (File.Exists(bkmkFile))
        {
            metadata.TableOfContents = BookmarkExtractor.ExtractBookmarksFromBkmk(bkmkFile);
        }

        return metadata;
    }

    /// <summary>
    /// Reads "key: value" lines from <paramref name="txtFile"/> and copies recognised keys
    /// onto <paramref name="metadata"/>. Lines without a colon and unknown keys are ignored.
    /// </summary>
    /// <exception cref="Exception">
    /// The file could not be read or parsed; the original exception is attached as the inner exception.
    /// </exception>
    private static void ReadMetadataFromTxt(string txtFile, DocumentMetadata metadata)
    {
        try
        {
            var lines = File.ReadAllLines(txtFile, ResolveTxtEncoding());

            foreach (var line in lines)
            {
                // Split on the first colon only, so values such as URLs keep their own colons.
                var parts = line.Split(new[] { ':' }, 2);
                if (parts.Length != 2)
                {
                    continue;
                }

                ApplyMetadataField(metadata, parts[0].Trim(), parts[1].Trim());
            }
        }
        catch (Exception ex)
        {
            // Keep the original exception as InnerException so its type and stack trace survive.
            throw new Exception($"读取TXT文件 {txtFile} 失败: {ex.Message}", ex);
        }
    }

    /// <summary>
    /// Returns the GB2312 encoding used for the TXT metadata files, falling back to the
    /// system default encoding when GB2312 is not available on this runtime.
    /// </summary>
    private static Encoding ResolveTxtEncoding()
    {
        try
        {
            return Encoding.GetEncoding("GB2312");
        }
        catch (ArgumentException)
        {
            return Encoding.Default;
        }
        catch (NotSupportedException)
        {
            return Encoding.Default;
        }
    }

    /// <summary>
    /// Assigns <paramref name="value"/> to the metadata property matching
    /// <paramref name="key"/>. Keys are matched exactly (case-sensitive); unknown keys are ignored.
    /// </summary>
    private static void ApplyMetadataField(DocumentMetadata metadata, string key, string value)
    {
        switch (key)
        {
            case "title":
                metadata.Title = value;
                break;
            case "Other titles":
                metadata.OtherTitles = value;
                break;
            case "Volume":
                metadata.Volume = value;
                break;
            case "ISBN":
                metadata.ISBN = value;
                break;
            case "creator":
                metadata.Creator = value;
                break;
            case "contributor":
                metadata.Contributor = value;
                break;
            case "issuedDate":
                metadata.IssuedDate = value;
                break;
            case "publisher":
                metadata.Publisher = value;
                break;
            case "place":
                metadata.Place = value;
                break;
            case "Classification number":
                metadata.ClassificationNumber = value;
                break;
            case "page":
                metadata.Page = value;
                break;
            case "subject":
                metadata.Subject = value;
                break;
            case "date":
                metadata.Date = value;
                break;
            case "spatial":
                metadata.Spatial = value;
                break;
            case "Other ISBN":
                metadata.OtherISBN = value;
                break;
            case "Other time":
                metadata.OtherTime = value;
                break;
            case "url":
                metadata.Url = value;
                break;
        }
    }

    /// <summary>
    /// Writes each successful, non-empty result to "&lt;BaseFileName&gt;.txt" in
    /// <paramref name="outputPath"/>, creating the folder when needed.
    /// </summary>
    /// <param name="results">Results to save; unsuccessful or empty ones are skipped.</param>
    /// <param name="outputPath">Destination folder.</param>
    public static void SaveResults(List<ProcessResult> results, string outputPath)
    {
        // CreateDirectory does nothing when the folder already exists.
        Directory.CreateDirectory(outputPath);

        // UTF-8 with BOM, so the output can carry characters outside the source code page.
        var utf8WithBom = new UTF8Encoding(true);

        foreach (var result in results)
        {
            if (!result.Success || string.IsNullOrEmpty(result.OutputContent))
            {
                continue;
            }

            var outputFilePath = Path.Combine(outputPath, $"{result.BaseFileName}.txt");
            File.WriteAllText(outputFilePath, result.OutputContent, utf8WithBom);
        }
    }
}
|
|
|
|
|
|
}
|