// SlideCombine/BookmarkExtractor.cs

using System;
using System.Collections.Generic;
using System.IO;
using System.Text;
namespace SlideCombine
{
/// <summary>
/// Plain data holder for a single bookmark entry: a title plus the page it points to.
/// </summary>
public class BookmarkItem
{
/// <summary>Bookmark title text.</summary>
public string Title { get; set; }
/// <summary>
/// Page label, kept as a string rather than a number; the parser in this file
/// accepts either decimal digits or roman-numeral letters here.
/// </summary>
public string Page { get; set; }
/// <summary>
/// Not assigned anywhere in this file; presumably filled in by callers with a
/// rendered form of the bookmark — TODO confirm against the call sites.
/// </summary>
public string FormattedContent { get; set; }
}
/// <summary>
/// Reads a FreePic2Pdf bookmark ("bkmk") text file and converts every line that ends
/// in a page number into a <see cref="BookmarkItem"/>.
/// </summary>
public class BookmarkExtractor
{
    /// <summary>Characters that end a line in the bookmark file.</summary>
    private static readonly char[] LineBreaks = { '\r', '\n' };

    /// <summary>Characters that may separate a title from its page number.</summary>
    private static readonly char[] FieldSeparators = { ' ', '\t', ':' };

    /// <summary>
    /// Matches a token made up solely of digits, or solely of roman-numeral letters
    /// (either case). Built once instead of on every parsed line.
    /// </summary>
    private static readonly System.Text.RegularExpressions.Regex PageNumberPattern =
        new System.Text.RegularExpressions.Regex(
            @"^(?:\d+|[IVXLCDMivxlcdm]+)$",
            System.Text.RegularExpressions.RegexOptions.Compiled);

    /// <summary>
    /// Extracts all bookmarks from a FreePic2Pdf bookmark file.
    /// </summary>
    /// <param name="bkmkFilePath">Path of the bookmark text file to read.</param>
    /// <returns>
    /// The bookmarks in file order. Lines that do not end in a page number are skipped,
    /// so the list may be empty but is never null.
    /// </returns>
    /// <exception cref="FileNotFoundException">The file does not exist.</exception>
    /// <exception cref="Exception">
    /// Reading or parsing failed; the original error is available as
    /// <see cref="Exception.InnerException"/>.
    /// </exception>
    public static List<BookmarkItem> ExtractBookmarksFromBkmk(string bkmkFilePath)
    {
        if (!File.Exists(bkmkFilePath))
        {
            throw new FileNotFoundException($"FreePic2Pdf_bkmk文件不存在: {bkmkFilePath}");
        }

        var bookmarks = new List<BookmarkItem>();
        try
        {
            string content = ReadBookmarkText(bkmkFilePath);

            foreach (var rawLine in content.Split(LineBreaks, StringSplitOptions.RemoveEmptyEntries))
            {
                var trimmedLine = rawLine.Trim();
                if (string.IsNullOrEmpty(trimmedLine))
                {
                    continue;
                }

                // Expected shape is "title page" or "title:page"; anything else yields null.
                var bookmark = ParseBookmarkLine(trimmedLine);
                if (bookmark != null)
                {
                    bookmarks.Add(bookmark);
                }
            }
        }
        catch (Exception ex)
        {
            // Same exception type and message as before, but keep the cause attached so
            // the original stack trace is not lost.
            throw new Exception($"读取书签文件失败: {ex.Message}", ex);
        }

        return bookmarks;
    }

    /// <summary>
    /// Reads the whole file as UTF-8, falling back to GBK when the bytes are not valid UTF-8.
    /// </summary>
    /// <param name="path">Path of the file to read.</param>
    /// <returns>The decoded file content.</returns>
    private static string ReadBookmarkText(string path)
    {
        try
        {
            // Encoding.UTF8 silently replaces invalid bytes with U+FFFD and never throws,
            // which would make the GBK fallback unreachable. A strict decoder is needed
            // to actually detect a non-UTF-8 file.
            return File.ReadAllText(path, new UTF8Encoding(false, true));
        }
        catch (DecoderFallbackException)
        {
            Encoding fallback;
            try
            {
                fallback = Encoding.GetEncoding("GBK");
            }
            catch (ArgumentException)
            {
                // GBK is not available on this runtime; degrade to lenient UTF-8
                // rather than failing outright.
                fallback = Encoding.UTF8;
            }
            catch (NotSupportedException)
            {
                fallback = Encoding.UTF8;
            }

            return File.ReadAllText(path, fallback);
        }
    }

    /// <summary>
    /// Splits one non-empty, trimmed line into title and page number.
    /// </summary>
    /// <param name="line">A single line of the bookmark file.</param>
    /// <returns>
    /// The parsed bookmark, or null when the line has fewer than two tokens or its
    /// last token is not a page number.
    /// </returns>
    private static BookmarkItem ParseBookmarkLine(string line)
    {
        // Drop stray separators at both ends so the last token is never empty.
        string text = line.Trim(FieldSeparators);

        int lastSeparator = text.LastIndexOfAny(FieldSeparators);
        if (lastSeparator < 0)
        {
            // Zero or one token: there is no room for both a title and a page.
            return null;
        }

        // The page number is assumed to be the final token on the line.
        string pagePart = text.Substring(lastSeparator + 1);
        if (!IsPageNumber(pagePart))
        {
            return null;
        }

        return new BookmarkItem
        {
            Page = pagePart,
            // Take the title from the original text so punctuation and spacing inside
            // it (e.g. "Chapter 1: Intro") survive, instead of re-joining split tokens.
            Title = text.Substring(0, lastSeparator).TrimEnd(FieldSeparators)
        };
    }

    /// <summary>
    /// Tells whether a token looks like a page number: all digits, or all
    /// roman-numeral letters in either case.
    /// </summary>
    /// <param name="text">The token to test.</param>
    /// <returns>True when the token matches one of the two forms.</returns>
    private static bool IsPageNumber(string text)
    {
        return PageNumberPattern.IsMatch(text);
    }
}
}