#include "html_parser.h"
#include "dom_tree.h"

#include <memory>
#include <string>
#include <utility>
#include <vector>

// ============================================================================
// HtmlParser::Impl implementation
// ============================================================================

class HtmlParser::Impl {
public:
    // Parser options. No code inside this class reads them.
    bool keep_code_blocks = true;
    bool keep_lists = true;

    DomTreeBuilder tree_builder;

    // Builds the document tree for `html` by delegating to
    // DomTreeBuilder::build; `base_url` is forwarded unchanged.
    DocumentTree parse_tree(const std::string& html, const std::string& base_url) {
        return tree_builder.build(html, base_url);
    }

    // Converts a DocumentTree into a ParsedDocument (kept for backward
    // compatibility). Title, URL and links are copied over; the element list
    // is filled by walking the DOM tree when a root node exists.
    ParsedDocument convert_to_parsed_document(const DocumentTree& tree) {
        ParsedDocument doc;
        doc.title = tree.title;
        doc.url = tree.url;
        doc.links = tree.links;

        // Recursively traverse the DOM tree and collect ContentElements.
        if (tree.root) {
            collect_content_elements(tree.root.get(), doc.elements);
        }

        return doc;
    }

private:
    // Appends one ContentElement per renderable ELEMENT node in the subtree
    // rooted at `node`, parent before children. A null node, or a node whose
    // should_render() is false, stops the walk for that whole subtree.
    void collect_content_elements(DomNode* node, std::vector<ContentElement>& elements) {
        if (!node || !node->should_render()) return;

        if (node->node_type == NodeType::ELEMENT) {
            ContentElement elem;
            elem.type = node->element_type;
            elem.url = node->href;
            elem.level = 0;  // TODO: compute the level when needed
            elem.list_number = 0;
            elem.nesting_level = 0;

            // Extract the text content.
            elem.text = node->get_all_text();

            // Collect inline links found in this node's subtree.
            collect_inline_links(node, elem.inline_links);

            // Only keep elements that carry text; horizontal rules are kept
            // even though they have none.
            if (!elem.text.empty() || node->element_type == ElementType::HORIZONTAL_RULE) {
                // elem is not used after this point, so move it instead of copying.
                elements.push_back(std::move(elem));
            }
        }

        // Recurse into the children.
        for (const auto& child : node->children) {
            collect_content_elements(child.get(), elements);
        }
    }

    // Appends an InlineLink for every LINK node with a non-negative
    // link_index in the subtree rooted at `node` (the node itself included).
    void collect_inline_links(DomNode* node, std::vector<InlineLink>& links) {
        if (!node) return;

        if (node->element_type == ElementType::LINK && node->link_index >= 0) {
            InlineLink link;
            link.text = node->get_all_text();
            link.url = node->href;
            link.link_index = node->link_index;
            // Simplification: no exact position inside the parent text is
            // computed; the range just spans the link's own text.
            link.start_pos = 0;
            link.end_pos = link.text.length();
            links.push_back(std::move(link));
        }

        for (const auto& child : node->children) {
            collect_inline_links(child.get(), links);
        }
    }
};

// ============================================================================
// HtmlParser
公共接口实现 // ============================================================================ HtmlParser::HtmlParser() : pImpl(std::make_unique()) {} HtmlParser::~HtmlParser() = default; DocumentTree HtmlParser::parse_tree(const std::string& html, const std::string& base_url) { return pImpl->parse_tree(html, base_url); } ParsedDocument HtmlParser::parse(const std::string& html, const std::string& base_url) { // 使用新的DOM树解析,然后转换为旧格式 DocumentTree tree = pImpl->parse_tree(html, base_url); return pImpl->convert_to_parsed_document(tree); } void HtmlParser::set_keep_code_blocks(bool keep) { pImpl->keep_code_blocks = keep; } void HtmlParser::set_keep_lists(bool keep) { pImpl->keep_lists = keep; }