TUT/src/html_parser.cpp
m1ngsama 0ecedb1aed feat: Add DOM tree implementation and fix compiler warnings
Major improvements:
- Add proper DOM tree structure (dom_tree.cpp/h) with hierarchical node representation
- Refactor HTML parser to use DOM tree instead of flat ContentElement structure
- Enhance text renderer with improved inline content handling and UTF-8 support
- Improve browser interactive element tracking with byte-accurate positioning
- Add comprehensive HTML entity decoding (80+ named entities + numeric)
- Enhance form handling with better field tracking and submission

Code quality improvements:
- Fix all compiler warnings (unused parameters/variables)
- Clean build with zero warnings
- Better separation of concerns between parsing and rendering

Testing:
- Add test_table.html for table rendering verification

This change enables better handling of complex HTML structures while
maintaining the Unix philosophy of simplicity and focus.
2025-12-25 13:18:08 +08:00

108 lines
3.4 KiB
C++
Raw Blame History

This file contains ambiguous Unicode characters

This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.

#include "html_parser.h"
#include "dom_tree.h"
#include <stdexcept>
// ============================================================================
// HtmlParser::Impl implementation
// ============================================================================
/// Private implementation behind HtmlParser: builds a DOM tree with
/// DomTreeBuilder and can flatten it into the legacy ParsedDocument form.
class HtmlParser::Impl {
public:
    // Option flags written by HtmlParser's setters.
    // NOTE(review): nothing inside this class reads these flags — confirm
    // whether another component is expected to consume them.
    bool keep_code_blocks = true;
    bool keep_lists = true;

    DomTreeBuilder tree_builder;

    /// Builds the DOM tree for `html` by delegating to the tree builder.
    /// `base_url` is passed through unchanged.
    DocumentTree parse_tree(const std::string& html, const std::string& base_url) {
        return tree_builder.build(html, base_url);
    }

    /// Converts a DocumentTree into a ParsedDocument for backward
    /// compatibility: copies title, url and links, then gathers content
    /// elements by walking the tree from its root (when a root exists).
    ParsedDocument convert_to_parsed_document(const DocumentTree& tree) {
        ParsedDocument result;
        result.links = tree.links;
        result.url = tree.url;
        result.title = tree.title;

        if (tree.root) {
            collect_content_elements(tree.root.get(), result.elements);
        }
        return result;
    }

private:
    /// Depth-first walk that appends one ContentElement per rendered ELEMENT
    /// node to `elements`. A null node, or one whose should_render() is false,
    /// is skipped together with its whole subtree.
    /// NOTE(review): every element node on the path emits its own entry; if
    /// get_all_text() includes descendant text, nested elements will repeat
    /// that text — confirm this is intended.
    void collect_content_elements(DomNode* node, std::vector<ContentElement>& elements) {
        if (node == nullptr) {
            return;
        }
        if (!node->should_render()) {
            return;
        }

        if (node->node_type == NodeType::ELEMENT) {
            ContentElement item;
            item.type = node->element_type;
            item.url = node->href;
            item.level = 0;  // TODO: compute the level when it is needed
            item.list_number = 0;
            item.nesting_level = 0;

            // Text content first, then the inline links found beneath this node.
            item.text = node->get_all_text();
            collect_inline_links(node, item.inline_links);

            // Keep only elements that carry text; horizontal rules are kept
            // even though they have none.
            const bool has_text = !item.text.empty();
            const bool is_rule = node->element_type == ElementType::HORIZONTAL_RULE;
            if (has_text || is_rule) {
                elements.push_back(item);
            }
        }

        // Recurse into the children regardless of this node's own type.
        for (const auto& kid : node->children) {
            collect_content_elements(kid.get(), elements);
        }
    }

    /// Depth-first walk that appends an InlineLink to `links` for `node` and
    /// for each descendant whose element type is LINK and whose link_index is
    /// non-negative.
    void collect_inline_links(DomNode* node, std::vector<InlineLink>& links) {
        if (node == nullptr) {
            return;
        }

        const bool is_indexed_link =
            node->element_type == ElementType::LINK && node->link_index >= 0;
        if (is_indexed_link) {
            InlineLink entry;
            entry.text = node->get_all_text();
            entry.url = node->href;
            entry.link_index = node->link_index;
            // Simplification: exact positions are not computed; the range
            // simply spans the link's own text, starting at 0.
            entry.start_pos = 0;
            entry.end_pos = entry.text.length();
            links.push_back(entry);
        }

        for (const auto& kid : node->children) {
            collect_inline_links(kid.get(), links);
        }
    }
};
// ============================================================================
// HtmlParser public interface implementation
// ============================================================================
// Constructs the parser by allocating its private Impl object with
// std::make_unique; the public methods in this file forward to it via pImpl.
HtmlParser::HtmlParser() : pImpl(std::make_unique<Impl>()) {}
// Defaulted destructor, defined in this file after the Impl class definition.
// NOTE(review): presumably placed here so that destroying pImpl sees a
// complete Impl type — confirm against the pImpl declaration in the header.
HtmlParser::~HtmlParser() = default;
// Parses `html` into a DocumentTree by forwarding both arguments, unchanged,
// to the private implementation.
DocumentTree HtmlParser::parse_tree(const std::string& html, const std::string& base_url) {
    Impl& impl = *pImpl;
    return impl.parse_tree(html, base_url);
}
// Legacy entry point: parses with the DOM-tree path, then converts the
// resulting tree into the older ParsedDocument format.
ParsedDocument HtmlParser::parse(const std::string& html, const std::string& base_url) {
    // The temporary tree lives until the end of this full expression, which
    // covers the whole conversion call.
    return pImpl->convert_to_parsed_document(pImpl->parse_tree(html, base_url));
}
void HtmlParser::set_keep_code_blocks(bool keep) {
pImpl->keep_code_blocks = keep;
}
void HtmlParser::set_keep_lists(bool keep) {
pImpl->keep_lists = keep;
}