SlideCombine/FileMerger.cs
yuuko 7f48871ab7 实现智能文件排序算法 - 修复跨位数排序问题
问题修复:
- 原先的字符串排序导致:1-3, 10-12, 2-4, 4-6(错误顺序)
- 现在智能排序:1-3, 2-4, 4-6, 10-12(正确顺序)

技术实现:
 BkmkFileComparer类:智能文件比较器
 正则表达式提取:(?:[\w-]+\s+)?(\d+)
 多格式支持:CH-875 1-3, Volume 2, Part 1等
 向后兼容:无数字格式仍按字符串排序

排序规则:
1. 都有数字:按数字大小比较
2. 只有一方有数字:有数字的排前面
3. 都无数字:按完整字符串比较

测试案例:
CH-875 1-3 → CH-875 2-4 → CH-875 4-6 → CH-875 10-12

现在文件合并顺序完全符合自然阅读顺序!

🤖 Generated with [Claude Code](https://claude.com/claude-code)

Co-Authored-By: Claude <noreply@anthropic.com>
2025-11-24 18:41:20 +08:00

325 lines
12 KiB
C#
Raw Permalink Blame History

This file contains ambiguous Unicode characters

This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.

using System;
using System.Collections.Generic;
using System.IO;
using System.Linq;
using System.Text;
using System.Text.RegularExpressions;
namespace SlideCombine
{
/// <summary>
/// Orders bookmark file paths by the first number found in the name of the
/// folder that directly contains each file, so that for example
/// "CH-875 2-4" sorts before "CH-875 10-12".
/// </summary>
/// <remarks>
/// Ordering rules:
/// 1. Both folder names carry a number: compare the numbers.
/// 2. Only one folder name carries a number: that path sorts first.
/// 3. Neither carries a number, or the numbers are equal: fall back to a
///    case-insensitive ordinal comparison of the full paths.
/// A null path sorts before any non-null path.
/// </remarks>
public class BkmkFileComparer : IComparer<string>
{
    // An optional "prefix + whitespace" followed by the captured number, e.g.
    // "CH-875 1-3" -> 1, "Volume 2" -> 2. Built once instead of per comparison.
    private static readonly Regex FolderNumberPattern =
        new Regex(@"(?:[\w-]+\s+)?(\d+)", RegexOptions.Compiled);

    /// <summary>Compares two bookmark file paths using the rules described on the class.</summary>
    /// <param name="x">First file path; may be null.</param>
    /// <param name="y">Second file path; may be null.</param>
    /// <returns>Negative when <paramref name="x"/> sorts first, positive when <paramref name="y"/> sorts first, zero when equal.</returns>
    public int Compare(string x, string y)
    {
        if (x == null && y == null) return 0;
        if (x == null) return -1;
        if (y == null) return 1;

        long? xNumber = ExtractNumberFromFolder(GetContainingFolderName(x));
        long? yNumber = ExtractNumberFromFolder(GetContainingFolderName(y));

        if (xNumber.HasValue && yNumber.HasValue)
        {
            int byNumber = xNumber.Value.CompareTo(yNumber.Value);
            if (byNumber != 0) return byNumber;
        }
        else if (xNumber.HasValue)
        {
            // Numbered folders come before folders without a number.
            return -1;
        }
        else if (yNumber.HasValue)
        {
            return 1;
        }

        // No numbers on either side, or the numbers tie: compare the full paths.
        return string.Compare(x, y, StringComparison.OrdinalIgnoreCase);
    }

    // Returns the name of the folder that directly contains the file.
    // Path.GetDirectoryName yields null for a root path and Path.GetFileName
    // passes null through, so the result may be null or empty.
    private static string GetContainingFolderName(string path)
    {
        return Path.GetFileName(Path.GetDirectoryName(path));
    }

    // Extracts the number matched by FolderNumberPattern, or null when the
    // folder name is missing, has no digits, or the digits do not fit in a long.
    private static long? ExtractNumberFromFolder(string folderName)
    {
        if (string.IsNullOrEmpty(folderName)) return null;

        Match match = FolderNumberPattern.Match(folderName);
        if (!match.Success) return null;

        // long rather than int so long digit runs (e.g. date stamps) still
        // count as numbers instead of silently falling into the "no number" group.
        long number;
        if (long.TryParse(match.Groups[1].Value, out number))
        {
            return number;
        }
        return null;
    }
}
/// <summary>
/// Outcome of processing one group of bookmark files that share a base folder name.
/// When file discovery itself fails, FileMerger.ProcessAllFolders emits an instance with
/// only <see cref="Success"/> and <see cref="ErrorMessage"/> set; the other members stay null.
/// </summary>
public class ProcessResult
{
/// <summary>Base name of the group; SaveResults appends ".txt" to it to form the output file name.</summary>
public string BaseFileName { get; set; }
/// <summary>The bookmark file paths that belong to this group, in the order they were processed.</summary>
public List<string> SourceFiles { get; set; }
/// <summary>The combined text produced from <see cref="MetadataDocuments"/>; SaveResults skips the result when this is null or empty.</summary>
public string OutputContent { get; set; }
/// <summary>True unless an exception was caught while processing the group.</summary>
public bool Success { get; set; }
/// <summary>Error description set when <see cref="Success"/> is false.</summary>
public string ErrorMessage { get; set; }
/// <summary>One metadata document per processed bookmark file.</summary>
public List<DocumentMetadata> MetadataDocuments { get; set; }
}
public class FileMerger
{
/// <summary>
/// Finds every "FreePic2Pdf_bkmk" / "FreePic2Pdf_bkmk.txt" file beneath
/// <paramref name="pdfRootPath"/>, groups the files by the base name of their
/// containing folder, and produces one <see cref="ProcessResult"/> per group.
/// </summary>
/// <param name="pdfRootPath">Root directory that is searched recursively for bookmark files.</param>
/// <param name="txtSourcePath">Directory holding the metadata TXT files; it must exist.</param>
/// <param name="txtOutputPath">Not used by this method.</param>
/// <returns>
/// The per-group results. If discovery, validation or grouping throws, a failed
/// result carrying the exception message is appended to whatever was collected so far.
/// </returns>
public static List<ProcessResult> ProcessAllFolders(string pdfRootPath, string txtSourcePath, string txtOutputPath)
{
    var results = new List<ProcessResult>();
    try
    {
        // Bookmark files exist both without an extension and with ".txt".
        List<string> discovered = Directory
            .GetFiles(pdfRootPath, "FreePic2Pdf_bkmk", SearchOption.AllDirectories)
            .Concat(Directory.GetFiles(pdfRootPath, "FreePic2Pdf_bkmk.txt", SearchOption.AllDirectories))
            .ToList();

        if (discovered.Count == 0)
        {
            throw new Exception($"在路径 {pdfRootPath} 下未找到任何 FreePic2Pdf_bkmk 或 FreePic2Pdf_bkmk.txt 文件");
        }

        if (!Directory.Exists(txtSourcePath))
        {
            throw new Exception($"TXT源文件路径不存在: {txtSourcePath}");
        }

        // Group by the part of the containing folder name that precedes the first space.
        var groupedByBaseName = new Dictionary<string, List<string>>();
        foreach (string path in discovered)
        {
            string containingFolder = new DirectoryInfo(Path.GetDirectoryName(path)).Name;
            string key = GetBaseFileName(containingFolder);

            List<string> members;
            if (!groupedByBaseName.TryGetValue(key, out members))
            {
                members = new List<string>();
                groupedByBaseName.Add(key, members);
            }
            members.Add(path);
        }

        // Each group is processed in natural (numeric) folder order.
        var ordering = new BkmkFileComparer();
        foreach (KeyValuePair<string, List<string>> entry in groupedByBaseName)
        {
            List<string> sorted = entry.Value.OrderBy(f => f, ordering).ToList();
            results.Add(ProcessFileGroup(entry.Key, sorted, txtSourcePath));
        }
    }
    catch (Exception ex)
    {
        results.Add(new ProcessResult { Success = false, ErrorMessage = ex.Message });
    }
    return results;
}
/// <summary>
/// Returns the part of <paramref name="folderName"/> before its first space,
/// or the whole name when there is no space or the name starts with one.
/// </summary>
private static string GetBaseFileName(string folderName)
{
    int cut = folderName.IndexOf(' ');
    if (cut <= 0)
    {
        return folderName;
    }
    return folderName.Substring(0, cut);
}
/// <summary>
/// Builds the metadata document for every bookmark file in a group and combines
/// them into a single output text.
/// </summary>
/// <param name="baseName">Base name shared by the group.</param>
/// <param name="bkmkFiles">Bookmark file paths of the group, already in processing order.</param>
/// <param name="txtSourcePath">Directory that holds the matching metadata TXT files.</param>
/// <returns>
/// A result marked successful with the combined content, or marked failed with
/// an error message when any step throws.
/// </returns>
private static ProcessResult ProcessFileGroup(string baseName, List<string> bkmkFiles, string txtSourcePath)
{
    var outcome = new ProcessResult();
    outcome.BaseFileName = baseName;
    outcome.SourceFiles = bkmkFiles;
    outcome.Success = true;
    outcome.MetadataDocuments = new List<DocumentMetadata>();

    try
    {
        foreach (string bookmarkPath in bkmkFiles)
        {
            // The TXT path is null when no matching metadata file exists.
            string metadataTxt = GetCorrespondingTxtFile(bookmarkPath, txtSourcePath);
            DocumentMetadata document = CreateMetadataFromFiles(metadataTxt, bookmarkPath);
            if (document == null)
            {
                continue;
            }
            outcome.MetadataDocuments.Add(document);
        }

        outcome.OutputContent = ContentFormatter.CombineFormattedMetadataDocuments(outcome.MetadataDocuments);
    }
    catch (Exception ex)
    {
        outcome.Success = false;
        outcome.ErrorMessage = $"处理文件组 {baseName} 时出错: {ex.Message}";
    }

    return outcome;
}
/// <summary>
/// Locates the metadata TXT file for a bookmark file: a file in
/// <paramref name="txtSourcePath"/> named after the bookmark file's containing folder.
/// </summary>
/// <returns>The TXT file path when that file exists; otherwise null.</returns>
private static string GetCorrespondingTxtFile(string bkmkFile, string txtSourcePath)
{
    string containingFolder = new DirectoryInfo(Path.GetDirectoryName(bkmkFile)).Name;
    string candidate = Path.Combine(txtSourcePath, containingFolder + ".txt");
    if (!File.Exists(candidate))
    {
        return null;
    }
    return candidate;
}
/// <summary>
/// Creates a metadata document, filling its fields from the TXT file and its
/// table of contents from the bookmark file. Either source is skipped when the
/// file does not exist (a null path counts as not existing).
/// </summary>
/// <returns>A new document; never null.</returns>
private static DocumentMetadata CreateMetadataFromFiles(string txtFile, string bkmkFile)
{
    var document = new DocumentMetadata();

    // Descriptive fields come from the TXT file.
    bool hasTxt = File.Exists(txtFile);
    if (hasTxt)
    {
        ReadMetadataFromTxt(txtFile, document);
    }

    // Bookmarks come from the bkmk file.
    bool hasBookmarks = File.Exists(bkmkFile);
    if (hasBookmarks)
    {
        document.TableOfContents = BookmarkExtractor.ExtractBookmarksFromBkmk(bkmkFile);
    }

    return document;
}
/// <summary>
/// Reads "key: value" lines from a metadata TXT file into <paramref name="metadata"/>.
/// </summary>
/// <remarks>
/// Each line is split at its first ':'; key and value are trimmed. Lines without
/// a ':' and lines whose key is not recognised are ignored. Keys are matched
/// case-sensitively. The file is decoded as GB2312 when that encoding is
/// available, otherwise with the system default encoding.
/// </remarks>
/// <param name="txtFile">Path of the TXT file to read.</param>
/// <param name="metadata">Document whose properties are assigned from the file.</param>
/// <exception cref="Exception">
/// Reading or parsing failed; the original exception is available as InnerException.
/// </exception>
private static void ReadMetadataFromTxt(string txtFile, DocumentMetadata metadata)
{
    try
    {
        string[] lines = File.ReadAllLines(txtFile, ResolveMetadataTxtEncoding());
        foreach (var line in lines)
        {
            // Split on the first colon only so values may themselves contain ':' (e.g. URLs).
            var parts = line.Split(new[] { ':' }, 2);
            if (parts.Length != 2)
            {
                continue;
            }

            var key = parts[0].Trim();
            var value = parts[1].Trim();
            switch (key)
            {
                case "title":
                    metadata.Title = value;
                    break;
                case "Other titles":
                    metadata.OtherTitles = value;
                    break;
                case "Volume":
                    metadata.Volume = value;
                    break;
                case "ISBN":
                    metadata.ISBN = value;
                    break;
                case "creator":
                    metadata.Creator = value;
                    break;
                case "contributor":
                    metadata.Contributor = value;
                    break;
                case "issuedDate":
                    metadata.IssuedDate = value;
                    break;
                case "publisher":
                    metadata.Publisher = value;
                    break;
                case "place":
                    metadata.Place = value;
                    break;
                case "Classification number":
                    metadata.ClassificationNumber = value;
                    break;
                case "page":
                    metadata.Page = value;
                    break;
                case "subject":
                    metadata.Subject = value;
                    break;
                case "date":
                    metadata.Date = value;
                    break;
                case "spatial":
                    metadata.Spatial = value;
                    break;
                case "Other ISBN":
                    metadata.OtherISBN = value;
                    break;
                case "Other time":
                    metadata.OtherTime = value;
                    break;
                case "url":
                    metadata.Url = value;
                    break;
            }
        }
    }
    catch (Exception ex)
    {
        // Keep the original exception as InnerException so its type and stack trace survive.
        throw new Exception($"读取TXT文件 {txtFile} 失败: {ex.Message}", ex);
    }
}

// Picks the encoding for metadata TXT files: GB2312 when the runtime provides it,
// otherwise the system default. Only an encoding-lookup failure triggers the
// fallback; I/O errors while reading are not retried with a different encoding.
private static Encoding ResolveMetadataTxtEncoding()
{
    try
    {
        return Encoding.GetEncoding("GB2312");
    }
    catch (ArgumentException)
    {
        return Encoding.Default;
    }
    catch (NotSupportedException)
    {
        return Encoding.Default;
    }
}
/// <summary>
/// Writes each successful, non-empty result to "&lt;BaseFileName&gt;.txt" inside
/// <paramref name="outputPath"/>, creating the directory when it does not exist.
/// </summary>
/// <param name="results">Results to save; failed or empty ones are skipped.</param>
/// <param name="outputPath">Directory that receives the output files.</param>
public static void SaveResults(List<ProcessResult> results, string outputPath)
{
    bool directoryMissing = !Directory.Exists(outputPath);
    if (directoryMissing)
    {
        Directory.CreateDirectory(outputPath);
    }

    // UTF-8 with a byte-order mark.
    var bomUtf8 = new UTF8Encoding(true);

    foreach (ProcessResult item in results)
    {
        bool writable = item.Success && !string.IsNullOrEmpty(item.OutputContent);
        if (!writable)
        {
            continue;
        }

        string target = Path.Combine(outputPath, item.BaseFileName + ".txt");
        File.WriteAllText(target, item.OutputContent, bomUtf8);
    }
}
}
}