SlideCombine/FileMerger.cs
yuuko fed8d14d73 针对Windows主机优化编码处理
- 使用GB2312编码读取TXT文件,确保中文字符正确显示
- 输出保持UTF-8 with BOM格式
- 解决Windows环境下乱码问题

🤖 Generated with [Claude Code](https://claude.com/claude-code)

Co-Authored-By: Claude <noreply@anthropic.com>
2025-11-24 16:03:56 +08:00

261 lines
9.6 KiB
C#
Raw Blame History

This file contains ambiguous Unicode characters

This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.

using System;
using System.Collections.Generic;
using System.IO;
using System.Linq;
using System.Text;
namespace SlideCombine
{
/// <summary>
/// Outcome of processing one group of bookmark files that share a base file name.
/// </summary>
public class ProcessResult
{
    /// <summary>
    /// Base name of the group. <c>FileMerger.SaveResults</c> appends ".txt" to it
    /// to build the output file name.
    /// </summary>
    public string BaseFileName { get; set; }

    /// <summary>
    /// Paths of the bookmark files that belong to this group.
    /// Starts as an empty list so that results created without it (for example
    /// error results) can be enumerated without a null check.
    /// </summary>
    public List<string> SourceFiles { get; set; } = new List<string>();

    /// <summary>Combined, formatted text produced for the group.</summary>
    public string OutputContent { get; set; }

    /// <summary>True when the group was processed without an exception.</summary>
    public bool Success { get; set; }

    /// <summary>Failure description; only set when <see cref="Success"/> is false.</summary>
    public string ErrorMessage { get; set; }

    /// <summary>
    /// Metadata documents built for the group, one per bookmark file.
    /// Starts as an empty list for the same reason as <see cref="SourceFiles"/>.
    /// </summary>
    public List<DocumentMetadata> MetadataDocuments { get; set; } = new List<DocumentMetadata>();
}
public class FileMerger
{
/// <summary>
/// Recursively searches <paramref name="pdfRootPath"/> for files named
/// "FreePic2Pdf_bkmk" or "FreePic2Pdf_bkmk.txt", groups them by the base name of
/// their containing folder (the part of the folder name before the first space)
/// and processes every group.
/// </summary>
/// <param name="pdfRootPath">Root directory that is searched, including all subdirectories.</param>
/// <param name="txtOutputPath">Not read by this method.</param>
/// <returns>
/// One result per group. When no bookmark file is found, or an exception is
/// raised while searching or grouping, a failed result carrying the message is
/// appended instead.
/// </returns>
public static List<ProcessResult> ProcessAllFolders(string pdfRootPath, string txtOutputPath)
{
    var results = new List<ProcessResult>();

    try
    {
        // Bookmark files may exist with or without the .txt extension.
        var bookmarkPaths = new List<string>();
        foreach (var pattern in new[] { "FreePic2Pdf_bkmk", "FreePic2Pdf_bkmk.txt" })
        {
            bookmarkPaths.AddRange(Directory.GetFiles(pdfRootPath, pattern, SearchOption.AllDirectories));
        }

        if (bookmarkPaths.Count == 0)
        {
            results.Add(new ProcessResult
            {
                Success = false,
                ErrorMessage = $"在路径 {pdfRootPath} 下未找到任何 FreePic2Pdf_bkmk 或 FreePic2Pdf_bkmk.txt 文件"
            });
            return results;
        }

        // Group by the base name derived from the name of the containing folder.
        var groupedByBaseName = bookmarkPaths.GroupBy(
            path => GetBaseFileName(new DirectoryInfo(Path.GetDirectoryName(path)).Name));

        foreach (var grouping in groupedByBaseName)
        {
            var sortedPaths = grouping.OrderBy(path => path).ToList();
            results.Add(ProcessFileGroup(grouping.Key, sortedPaths));
        }
    }
    catch (Exception ex)
    {
        results.Add(new ProcessResult
        {
            Success = false,
            ErrorMessage = ex.Message
        });
    }

    return results;
}
/// <summary>
/// Returns the part of <paramref name="folderName"/> that precedes its first
/// space. The name is returned unchanged when it contains no space or when the
/// first space is its first character.
/// </summary>
private static string GetBaseFileName(string folderName)
{
    int firstSpace = folderName.IndexOf(' ');
    if (firstSpace <= 0)
    {
        return folderName;
    }

    return folderName.Substring(0, firstSpace);
}
/// <summary>
/// Builds a metadata document for every bookmark file of one group and combines
/// them into a single formatted text.
/// </summary>
/// <param name="baseName">Base name shared by the group.</param>
/// <param name="bkmkFiles">Bookmark files of the group, in processing order.</param>
/// <returns>
/// The result for the group. Any exception raised while processing marks the
/// result as failed and stores a message that includes the exception text.
/// </returns>
private static ProcessResult ProcessFileGroup(string baseName, List<string> bkmkFiles)
{
    var documents = new List<DocumentMetadata>();
    var outcome = new ProcessResult
    {
        BaseFileName = baseName,
        SourceFiles = bkmkFiles,
        Success = true,
        MetadataDocuments = documents
    };

    try
    {
        foreach (var bookmarkPath in bkmkFiles)
        {
            // Locate the matching TXT file, then build the metadata document from both files.
            var metadataTxtPath = GetCorrespondingTxtFile(bookmarkPath);
            var document = CreateMetadataFromFiles(metadataTxtPath, bookmarkPath);
            if (document == null)
            {
                continue;
            }

            documents.Add(document);
        }

        // Merge all metadata documents of the group into one output text.
        outcome.OutputContent = ContentFormatter.CombineFormattedMetadataDocuments(documents);
    }
    catch (Exception ex)
    {
        outcome.ErrorMessage = $"处理文件组 {baseName} 时出错: {ex.Message}";
        outcome.Success = false;
    }

    return outcome;
}
/// <summary>
/// Finds the metadata TXT file that belongs to a bookmark file. The file is
/// expected at <c>&lt;grandparent&gt;/TXT/&lt;folder&gt;.txt</c>, where
/// <c>&lt;folder&gt;</c> is the name of the directory containing the bookmark
/// file and <c>&lt;grandparent&gt;</c> is that directory's parent's parent.
/// </summary>
/// <param name="bkmkFile">Path of the bookmark file.</param>
/// <returns>
/// The path of the TXT file, or null when it does not exist or when the
/// bookmark folder has no grandparent directory.
/// </returns>
private static string GetCorrespondingTxtFile(string bkmkFile)
{
    var directory = Path.GetDirectoryName(bkmkFile);
    if (string.IsNullOrEmpty(directory))
    {
        return null;
    }

    // A bookmark folder at, or directly below, the filesystem root has no
    // grandparent; report "no TXT file" instead of dereferencing null.
    var grandparent = Directory.GetParent(directory)?.Parent;
    if (grandparent == null)
    {
        return null;
    }

    var folderName = new DirectoryInfo(directory).Name;
    var txtFile = Path.Combine(grandparent.FullName, "TXT", $"{folderName}.txt");
    return File.Exists(txtFile) ? txtFile : null;
}
/// <summary>
/// Creates a metadata document from a TXT metadata file and a bookmark file.
/// Each file is only read when it exists; a missing file leaves the
/// corresponding part of the document at its default.
/// </summary>
/// <param name="txtFile">Path of the TXT metadata file; may be null.</param>
/// <param name="bkmkFile">Path of the bookmark file.</param>
/// <returns>The newly created document; never null.</returns>
private static DocumentMetadata CreateMetadataFromFiles(string txtFile, string bkmkFile)
{
    var document = new DocumentMetadata();

    // Descriptive fields come from the TXT file.
    bool hasMetadataFile = File.Exists(txtFile);
    if (hasMetadataFile)
    {
        ReadMetadataFromTxt(txtFile, document);
    }

    // The table of contents comes from the bookmark file.
    if (!File.Exists(bkmkFile))
    {
        return document;
    }

    document.TableOfContents = BookmarkExtractor.ExtractBookmarksFromBkmk(bkmkFile);
    return document;
}
/// <summary>
/// Reads "key: value" lines from a TXT file and copies the recognised keys into
/// <paramref name="metadata"/>. Lines without a colon and lines with an
/// unrecognised key are ignored.
/// </summary>
/// <param name="txtFile">Path of the TXT metadata file.</param>
/// <param name="metadata">Document that receives the values.</param>
/// <exception cref="Exception">
/// Thrown when the file cannot be read; the message names the file and the
/// original exception is available as <see cref="Exception.InnerException"/>.
/// </exception>
private static void ReadMetadataFromTxt(string txtFile, DocumentMetadata metadata)
{
    try
    {
        // The TXT files are read as GB2312 so that Chinese text decodes correctly.
        // Only a failed encoding lookup falls back to the system default encoding;
        // I/O failures are no longer swallowed and retried with another encoding.
        Encoding encoding;
        try
        {
            encoding = Encoding.GetEncoding("GB2312");
        }
        catch (ArgumentException)
        {
            encoding = Encoding.Default;
        }
        catch (NotSupportedException)
        {
            encoding = Encoding.Default;
        }

        var lines = File.ReadAllLines(txtFile, encoding);

        foreach (var line in lines)
        {
            // Split on the first colon only, so values that contain colons stay intact.
            var parts = line.Split(new[] { ':' }, 2);
            if (parts.Length != 2)
            {
                continue;
            }

            var key = parts[0].Trim();
            var value = parts[1].Trim();

            // Keys are matched exactly, including letter case.
            switch (key)
            {
                case "title":
                    metadata.Title = value;
                    break;
                case "Other titles":
                    metadata.OtherTitles = value;
                    break;
                case "Volume":
                    metadata.Volume = value;
                    break;
                case "ISBN":
                    metadata.ISBN = value;
                    break;
                case "creator":
                    metadata.Creator = value;
                    break;
                case "contributor":
                    metadata.Contributor = value;
                    break;
                case "issuedDate":
                    metadata.IssuedDate = value;
                    break;
                case "publisher":
                    metadata.Publisher = value;
                    break;
                case "place":
                    metadata.Place = value;
                    break;
                case "Classification number":
                    metadata.ClassificationNumber = value;
                    break;
                case "page":
                    metadata.Page = value;
                    break;
                case "subject":
                    metadata.Subject = value;
                    break;
                case "date":
                    metadata.Date = value;
                    break;
                case "spatial":
                    metadata.Spatial = value;
                    break;
                case "Other ISBN":
                    metadata.OtherISBN = value;
                    break;
                case "Other time":
                    metadata.OtherTime = value;
                    break;
                case "url":
                    metadata.Url = value;
                    break;
            }
        }
    }
    catch (Exception ex)
    {
        // Keep the original exception as InnerException so its stack trace is not lost.
        throw new Exception($"读取TXT文件 {txtFile} 失败: {ex.Message}", ex);
    }
}
/// <summary>
/// Writes the output text of every successful, non-empty result to
/// <c>&lt;outputPath&gt;/&lt;BaseFileName&gt;.txt</c>, creating the output
/// directory first when it does not exist.
/// </summary>
/// <param name="results">Results to save; failed or empty results are skipped.</param>
/// <param name="outputPath">Directory that receives the output files.</param>
public static void SaveResults(List<ProcessResult> results, string outputPath)
{
    bool outputDirectoryExists = Directory.Exists(outputPath);
    if (!outputDirectoryExists)
    {
        Directory.CreateDirectory(outputPath);
    }

    // Files are written as UTF-8 with a byte order mark.
    var utf8WithBom = new UTF8Encoding(true);

    var writable = results.Where(r => r.Success && !string.IsNullOrEmpty(r.OutputContent));
    foreach (var entry in writable)
    {
        var destination = Path.Combine(outputPath, $"{entry.BaseFileName}.txt");
        File.WriteAllText(destination, entry.OutputContent, utf8WithBom);
    }
}
}
}