// =============================================================================
// HTML parser and content extraction
//
// Reconstructed from a flattened `git format-patch` e-mail:
//   commit   2fedcebc25aa08774da45c3e45d7c2b044eb607f
//   author   m1ngsama
//   date     Fri, 5 Dec 2025 14:59:02 +0800
//   subject  [PATCH] feat: Add HTML parser and content extraction
//
// Commit message:
//   Implement HTML parser for extracting readable content:
//   - Parse HTML structure (headings, paragraphs, lists, links)
//   - Extract and decode HTML entities
//   - Smart content area detection (article, main, body)
//   - Relative URL to absolute URL conversion
//   - Support for both absolute and relative paths
//   - Filter out scripts, styles, and non-content elements
//   The parser is optimized for text-heavy websites and documentation.
//
// The patch created two files, src/html_parser.h and src/html_parser.cpp.
// Both are reproduced below as one listing, in that order. When splitting the
// listing back into two files, start the header with `#pragma once` and start
// the implementation file with `#include "html_parser.h"`.
// =============================================================================

// -----------------------------------------------------------------------------
// src/html_parser.h
// -----------------------------------------------------------------------------

#include <memory>
#include <string>
#include <vector>

/// Kind of a piece of extracted content.
enum class ElementType {
    TEXT,
    HEADING1,
    HEADING2,
    HEADING3,
    PARAGRAPH,
    LINK,
    LIST_ITEM,
    BLOCKQUOTE,
    CODE_BLOCK,
    HORIZONTAL_RULE,
    LINE_BREAK
};

/// A hyperlink found in the content area.
struct Link {
    std::string text;
    std::string url;
    int position = 0;  // position in the document (used for TAB navigation)
};

/// One extracted content element.
struct ContentElement {
    ElementType type = ElementType::TEXT;
    std::string text;
    std::string url;  // for link elements
    int level = 0;    // for heading elements (1-6); 0 for everything else
};

/// Result of HtmlParser::parse().
struct ParsedDocument {
    std::string title;
    std::string url;
    std::vector<ContentElement> elements;
    std::vector<Link> links;
};

class HtmlParser {
public:
    HtmlParser();
    ~HtmlParser();

    /// Parses HTML and extracts the readable content.
    /// @param html      Markup to parse.
    /// @param base_url  URL of the page; used to make relative links absolute
    ///                  and copied into ParsedDocument::url.
    ParsedDocument parse(const std::string& html, const std::string& base_url = "");

    /// Sets whether code blocks are kept.
    void set_keep_code_blocks(bool keep);

    /// Sets whether list items are kept.
    void set_keep_lists(bool keep);

private:
    class Impl;
    std::unique_ptr<Impl> pImpl;
};

// -----------------------------------------------------------------------------
// src/html_parser.cpp
// -----------------------------------------------------------------------------

#include <algorithm>
#include <cctype>
#include <cstddef>
#include <iterator>
#include <optional>
#include <sstream>
#include <string_view>
#include <utility>

namespace {

constexpr std::size_t kNpos = std::string::npos;

// <cctype> functions are undefined for negative values, and UTF-8 bytes are
// negative when `char` is signed, so always go through unsigned char.
bool is_space(char c) {
    return std::isspace(static_cast<unsigned char>(c)) != 0;
}

char to_lower(char c) {
    return static_cast<char>(std::tolower(static_cast<unsigned char>(c)));
}

std::string to_lower_copy(std::string text) {
    std::transform(text.begin(), text.end(), text.begin(), to_lower);
    return text;
}

// True when `text` contains `word` at `pos`, ignoring case.
// `word` must already be lower-case.
bool matches_at(const std::string& text, std::size_t pos, std::string_view word) {
    if (pos > text.size() || word.size() > text.size() - pos) {
        return false;
    }
    for (std::size_t i = 0; i < word.size(); ++i) {
        if (to_lower(text[pos + i]) != word[i]) {
            return false;
        }
    }
    return true;
}

// Case-insensitive search for lower-case `word` in `text`, starting at `from`.
std::size_t find_ci(const std::string& text, std::string_view word, std::size_t from) {
    for (std::size_t pos = from; pos + word.size() <= text.size(); ++pos) {
        if (matches_at(text, pos, word)) {
            return pos;
        }
    }
    return kNpos;
}

// Byte offsets of one `<tag ...>content</tag>` occurrence.
struct ElementSpan {
    std::size_t open_begin;     // '<' of the opening tag
    std::size_t content_begin;  // first byte after the opening tag's '>'
    std::size_t content_end;    // '<' of the closing tag
    std::size_t close_end;      // one past the closing tag's '>'
};

// Finds the next `</tag>` (optional whitespace before '>') at or after `from`.
// Returns {offset of '<', one past '>'}.
std::optional<std::pair<std::size_t, std::size_t>> find_closing_tag(
        const std::string& html, std::string_view tag, std::size_t from) {
    for (std::size_t pos = html.find("</", from); pos != kNpos;
         pos = html.find("</", pos + 2)) {
        if (!matches_at(html, pos + 2, tag)) {
            continue;
        }
        std::size_t cursor = pos + 2 + tag.size();
        while (cursor < html.size() && is_space(html[cursor])) {
            ++cursor;
        }
        if (cursor < html.size() && html[cursor] == '>') {
            return std::make_pair(pos, cursor + 1);
        }
    }
    return std::nullopt;
}

// Finds the next element named `tag` (lower-case) at or after `from`.
//
// The content runs from the opening tag to the FIRST matching closing tag, so
// nested elements of the same name are not balanced. The tag name must be
// followed by whitespace, '/' or '>', so that "p" does not match <pre> and
// "li" does not match <link>.
//
// This is a linear scan on purpose: matching "[\s\S]*?" with std::regex over a
// whole document can exhaust the stack in backtracking implementations.
std::optional<ElementSpan> find_element(const std::string& html, std::string_view tag,
                                        std::size_t from) {
    for (std::size_t pos = html.find('<', from); pos != kNpos;
         pos = html.find('<', pos + 1)) {
        const std::size_t name_end = pos + 1 + tag.size();
        if (name_end >= html.size() || !matches_at(html, pos + 1, tag)) {
            continue;
        }
        const char next = html[name_end];
        if (!(is_space(next) || next == '>' || next == '/')) {
            continue;
        }
        const std::size_t open_end = html.find('>', name_end);
        if (open_end == kNpos) {
            return std::nullopt;
        }
        const auto closing = find_closing_tag(html, tag, open_end + 1);
        if (!closing) {
            // No closing tag after this opening tag means none after any
            // later opening tag either.
            return std::nullopt;
        }
        return ElementSpan{pos, open_end + 1, closing->first, closing->second};
    }
    return std::nullopt;
}

std::string content_of(const std::string& html, const ElementSpan& span) {
    return html.substr(span.content_begin, span.content_end - span.content_begin);
}

// Returns `html` with every `<tag ...>...</tag>` element removed.
std::string remove_elements(const std::string& html, std::string_view tag) {
    std::string kept;
    kept.reserve(html.size());
    std::size_t pos = 0;
    for (auto span = find_element(html, tag, pos); span;
         span = find_element(html, tag, pos)) {
        kept.append(html, pos, span->open_begin - pos);
        pos = span->close_end;
    }
    kept.append(html, pos, kNpos);
    return kept;
}

// Extracts the raw value of the href attribute from an opening tag.
// Accepts double-quoted, single-quoted and unquoted values.
std::optional<std::string> extract_href(const std::string& open_tag) {
    const std::size_t size = open_tag.size();
    for (std::size_t hit = find_ci(open_tag, "href", 0); hit != kNpos;
         hit = find_ci(open_tag, "href", hit + 4)) {
        // Require whitespace before the name so "data-href" is not taken.
        if (hit == 0 || !is_space(open_tag[hit - 1])) {
            continue;
        }
        std::size_t cursor = hit + 4;
        while (cursor < size && is_space(open_tag[cursor])) {
            ++cursor;
        }
        if (cursor >= size || open_tag[cursor] != '=') {
            continue;
        }
        ++cursor;
        while (cursor < size && is_space(open_tag[cursor])) {
            ++cursor;
        }
        if (cursor >= size) {
            return std::nullopt;
        }
        const char quote = open_tag[cursor];
        if (quote == '"' || quote == '\'') {
            const std::size_t end = open_tag.find(quote, cursor + 1);
            if (end == kNpos) {
                return std::nullopt;
            }
            return open_tag.substr(cursor + 1, end - cursor - 1);
        }
        std::size_t end = cursor;
        while (end < size && !is_space(open_tag[end]) && open_tag[end] != '>') {
            ++end;
        }
        return open_tag.substr(cursor, end - cursor);
    }
    return std::nullopt;
}

// True when `url` starts with a URI scheme ("http:", "mailto:", ...):
// a letter followed by letters, digits, '+', '-' or '.', then ':'.
bool has_scheme(const std::string& url) {
    if (url.empty() || !std::isalpha(static_cast<unsigned char>(url[0]))) {
        return false;
    }
    for (std::size_t i = 1; i < url.size(); ++i) {
        const unsigned char c = static_cast<unsigned char>(url[i]);
        if (c == ':') {
            return true;
        }
        if (!(std::isalnum(c) || c == '+' || c == '-' || c == '.')) {
            return false;
        }
    }
    return false;
}

// Makes `href` absolute against an http(s) `base_url`.
//
// `href` is returned unchanged when it already has a scheme, or when
// `base_url` is not an http(s) URL with a host. Dot segments ("./", "../")
// are not collapsed.
std::string resolve_url(const std::string& href, const std::string& base_url) {
    if (href.empty() || has_scheme(href)) {
        return href;
    }

    std::size_t scheme_len = 0;
    if (matches_at(base_url, 0, "http://")) {
        scheme_len = 4;
    } else if (matches_at(base_url, 0, "https://")) {
        scheme_len = 5;
    } else {
        return href;
    }

    const std::size_t host_begin = scheme_len + 3;
    std::size_t host_end = base_url.find_first_of("/?#", host_begin);
    if (host_end == kNpos) {
        host_end = base_url.size();
    }
    if (host_end == host_begin) {
        return href;  // no host to resolve against
    }
    const std::string origin = base_url.substr(0, host_end);

    // Path of the base page without query or fragment; "/" when absent.
    std::string path = "/";
    if (host_end < base_url.size() && base_url[host_end] == '/') {
        const std::size_t path_end = base_url.find_first_of("?#", host_end);
        path = base_url.substr(host_end,
                               path_end == kNpos ? kNpos : path_end - host_end);
    }

    if (href.compare(0, 2, "//") == 0) {
        // Protocol-relative: reuse the base scheme.
        return base_url.substr(0, scheme_len) + ":" + href;
    }
    if (href[0] == '/') {
        // Root-relative path.
        return origin + href;
    }
    if (href[0] == '?') {
        // Query-only reference keeps the page path.
        return origin + path + href;
    }
    // Relative to the directory of the current page. `path` always starts
    // with '/', so rfind cannot fail.
    return origin + path.substr(0, path.rfind('/') + 1) + href;
}

struct Entity {
    std::string_view name;
    std::string_view text;
};

// Common HTML entities. Replacements are spelled as explicit UTF-8 bytes so
// the result does not depend on the compiler's execution character set.
constexpr Entity kEntities[] = {
    {"&nbsp;", " "},
    {"&amp;", "&"},
    {"&lt;", "<"},
    {"&gt;", ">"},
    {"&quot;", "\""},
    {"&apos;", "'"},
    {"&#39;", "'"},
    {"&mdash;", "\xE2\x80\x94"},
    {"&ndash;", "\xE2\x80\x93"},
    {"&hellip;", "..."},
    {"&ldquo;", "\xE2\x80\x9C"},
    {"&rdquo;", "\xE2\x80\x9D"},
    {"&lsquo;", "\xE2\x80\x98"},
    {"&rsquo;", "\xE2\x80\x99"},
};

}  // namespace

class HtmlParser::Impl {
public:
    // Stored by set_keep_code_blocks(); parse() does not consult it yet.
    bool keep_code_blocks = true;
    bool keep_lists = true;

    // Simple HTML tag stripping: drops everything between '<' and '>'.
    // A '>' that appears outside a tag is ordinary text and is kept.
    std::string remove_tags(const std::string& html) const {
        std::string result;
        result.reserve(html.size());
        bool in_tag = false;
        for (const char c : html) {
            if (in_tag) {
                in_tag = (c != '>');
            } else if (c == '<') {
                in_tag = true;
            } else {
                result += c;
            }
        }
        return result;
    }

    // Decodes the entities listed in kEntities.
    // Single pass: replacement text is never rescanned, so "&amp;lt;" yields
    // the literal text "&lt;" rather than "<".
    std::string decode_html_entities(const std::string& text) const {
        std::string result;
        result.reserve(text.size());
        std::size_t i = 0;
        while (i < text.size()) {
            const Entity* found = nullptr;
            if (text[i] == '&') {
                for (const Entity& entity : kEntities) {
                    if (text.compare(i, entity.name.size(), entity.name.data(),
                                     entity.name.size()) == 0) {
                        found = &entity;
                        break;
                    }
                }
            }
            if (found != nullptr) {
                result.append(found->text.data(), found->text.size());
                i += found->name.size();
            } else {
                result += text[i++];
            }
        }
        return result;
    }

    // Returns the content of the first `tag` element, or "" if there is none.
    std::string extract_tag_content(const std::string& html, const std::string& tag) const {
        const auto span = find_element(html, to_lower_copy(tag), 0);
        return span ? content_of(html, *span) : std::string();
    }

    // Returns the content of every `tag` element, in document order.
    std::vector<std::string> extract_all_tags(const std::string& html,
                                              const std::string& tag) const {
        const std::string name = to_lower_copy(tag);
        std::vector<std::string> results;
        for (auto span = find_element(html, name, 0); span;
             span = find_element(html, name, span->close_end)) {
            results.push_back(content_of(html, *span));
        }
        return results;
    }

    // Extracts links from <a href="..."> elements.
    //
    // `position` counts every anchor that has an href, including anchors that
    // are then dropped (empty href, fragment-only href, empty link text), so
    // positions of the returned links can have gaps.
    std::vector<Link> extract_links(const std::string& html,
                                    const std::string& base_url) const {
        std::vector<Link> links;
        int position = 0;
        for (auto span = find_element(html, "a", 0); span;
             span = find_element(html, "a", span->close_end)) {
            const auto href = extract_href(
                html.substr(span->open_begin, span->content_begin - span->open_begin));
            if (!href) {
                continue;
            }

            Link link;
            // Attribute values are entity-encoded ("?a=1&amp;b=2").
            link.url = trim(decode_html_entities(*href));
            link.text = decode_html_entities(remove_tags(content_of(html, *span)));
            link.position = position++;

            if (link.url.empty() || link.url[0] == '#') {
                continue;
            }
            link.url = resolve_url(link.url, base_url);

            // Filter out links without text.
            if (!link.text.empty()) {
                links.push_back(std::move(link));
            }
        }
        return links;
    }

    // Strips leading and trailing whitespace. Safe for empty and
    // all-whitespace input.
    std::string trim(const std::string& str) const {
        const auto not_space = [](char c) { return !is_space(c); };
        const auto first = std::find_if(str.begin(), str.end(), not_space);
        const auto last =
            std::find_if(str.rbegin(), std::make_reverse_iterator(first), not_space).base();
        return std::string(first, last);
    }

    // Removes <script> and <style> elements together with their content.
    std::string remove_scripts_and_styles(const std::string& html) const {
        return remove_elements(remove_elements(html, "script"), "style");
    }
};

HtmlParser::HtmlParser() : pImpl(std::make_unique<Impl>()) {}

HtmlParser::~HtmlParser() = default;

// Extracts title, links and readable elements from `html`.
//
// Elements are appended grouped by kind (headings by level, then list items,
// then paragraphs), not in document order. Texts of one character or less are
// skipped.
ParsedDocument HtmlParser::parse(const std::string& html, const std::string& base_url) {
    ParsedDocument doc;
    doc.url = base_url;

    // Strip tags, decode entities, then trim. Trimming comes last so that
    // whitespace produced by decoding (e.g. "&nbsp;") is removed as well.
    const auto plain_text = [this](const std::string& fragment) {
        return pImpl->trim(pImpl->decode_html_entities(pImpl->remove_tags(fragment)));
    };
    const auto add_element = [&doc](ElementType type, std::string text, int level = 0) {
        ContentElement elem;
        elem.type = type;
        elem.text = std::move(text);
        elem.level = level;
        doc.elements.push_back(std::move(elem));
    };

    // Clean the HTML.
    const std::string clean_html = pImpl->remove_scripts_and_styles(html);

    // Title: <title>, falling back to the first <h1>.
    doc.title = plain_text(pImpl->extract_tag_content(clean_html, "title"));
    if (doc.title.empty()) {
        doc.title = plain_text(pImpl->extract_tag_content(clean_html, "h1"));
    }

    // Main content area: <article>, then <main>, then <body>, then everything.
    std::string main_content;
    for (const char* container : {"article", "main", "body"}) {
        main_content = pImpl->extract_tag_content(clean_html, container);
        if (!main_content.empty()) {
            break;
        }
    }
    if (main_content.empty()) {
        main_content = clean_html;
    }

    // Links.
    doc.links = pImpl->extract_links(main_content, base_url);

    // Headings. ElementType only distinguishes three heading kinds, so levels
    // 3-6 all map to HEADING3; the exact level is kept in `level`.
    for (int level = 1; level <= 6; ++level) {
        const ElementType type = (level == 1)   ? ElementType::HEADING1
                                 : (level == 2) ? ElementType::HEADING2
                                                : ElementType::HEADING3;
        for (const auto& heading :
             pImpl->extract_all_tags(main_content, "h" + std::to_string(level))) {
            std::string text = plain_text(heading);
            if (!text.empty()) {
                add_element(type, std::move(text), level);
            }
        }
    }

    // List items.
    if (pImpl->keep_lists) {
        for (const auto& item : pImpl->extract_all_tags(main_content, "li")) {
            std::string text = plain_text(item);
            if (text.length() > 1) {
                add_element(ElementType::LIST_ITEM, std::move(text));
            }
        }
    }

    // Paragraphs.
    for (const auto& para : pImpl->extract_all_tags(main_content, "p")) {
        std::string text = plain_text(para);
        if (text.length() > 1) {
            add_element(ElementType::PARAGRAPH, std::move(text));
        }
    }

    // If there is very little content, try the text inside <div> elements.
    if (doc.elements.size() < 3) {
        for (const auto& div : pImpl->extract_all_tags(main_content, "div")) {
            std::string text = plain_text(div);
            if (text.length() > 20) {  // ignore divs that are too short
                add_element(ElementType::PARAGRAPH, std::move(text));
            }
        }
    }

    // If there is still nothing, fall back to the whole text, one paragraph
    // per line.
    if (doc.elements.empty()) {
        std::istringstream lines(plain_text(main_content));
        std::string line;
        while (std::getline(lines, line)) {
            line = pImpl->trim(line);
            if (line.length() > 1) {
                add_element(ElementType::PARAGRAPH, std::move(line));
            }
        }
    }

    return doc;
}

void HtmlParser::set_keep_code_blocks(bool keep) {
    pImpl->keep_code_blocks = keep;
}

void HtmlParser::set_keep_lists(bool keep) {
    pImpl->keep_lists = keep;
}