feat: Add table, image, and nested list support to HTML parser

- Add Table, Image, and Form data structures
- Implement table extraction with proper row/column parsing
- Add image extraction with alt text and dimensions
- Implement recursive nested list parsing (ul/ol)
- Support ordered and unordered lists with nesting levels
- Extract list item numbers for ordered lists
- Add HEADING4-6, ORDERED_LIST_ITEM, TABLE, IMAGE element types

This enhancement allows TUT to properly extract and represent
structured content from HTML, enabling better rendering of
data-heavy websites.
This commit is contained in:
m1ngsama 2025-12-17 13:43:27 +08:00
parent ea71b0ca02
commit befe004553
2 changed files with 293 additions and 12 deletions

View file

@ -3,6 +3,7 @@
#include <algorithm>
#include <cctype>
#include <sstream>
#include <functional>
class HtmlParser::Impl {
public:
@ -234,6 +235,133 @@ public:
return result;
}
// Extract images
std::vector<Image> extract_images(const std::string& html) {
std::vector<Image> images;
std::regex img_regex(R"(<img[^>]*src\s*=\s*["']([^"']*)["'][^>]*>)", std::regex::icase);
auto begin = std::sregex_iterator(html.begin(), html.end(), img_regex);
auto end = std::sregex_iterator();
for (std::sregex_iterator i = begin; i != end; ++i) {
std::smatch match = *i;
Image img;
img.src = match[1].str();
img.width = -1;
img.height = -1;
// Extract alt text
std::string img_tag = match[0].str();
std::regex alt_regex(R"(alt\s*=\s*["']([^"']*)["'])", std::regex::icase);
std::smatch alt_match;
if (std::regex_search(img_tag, alt_match, alt_regex)) {
img.alt = decode_html_entities(alt_match[1].str());
}
// Extract width
std::regex width_regex(R"(width\s*=\s*["']?(\d+)["']?)", std::regex::icase);
std::smatch width_match;
if (std::regex_search(img_tag, width_match, width_regex)) {
try {
img.width = std::stoi(width_match[1].str());
} catch (...) {}
}
// Extract height
std::regex height_regex(R"(height\s*=\s*["']?(\d+)["']?)", std::regex::icase);
std::smatch height_match;
if (std::regex_search(img_tag, height_match, height_regex)) {
try {
img.height = std::stoi(height_match[1].str());
} catch (...) {}
}
images.push_back(img);
}
return images;
}
// Extract tables
std::vector<Table> extract_tables(const std::string& html, std::vector<Link>& all_links) {
std::vector<Table> tables;
auto table_contents = extract_all_tags(html, "table");
for (const auto& table_html : table_contents) {
Table table;
table.has_header = false;
// Extract rows
auto thead_html = extract_tag_content(table_html, "thead");
auto tbody_html = extract_tag_content(table_html, "tbody");
// If no thead/tbody, just get all rows
std::vector<std::string> row_htmls;
if (!thead_html.empty() || !tbody_html.empty()) {
if (!thead_html.empty()) {
auto header_rows = extract_all_tags(thead_html, "tr");
row_htmls.insert(row_htmls.end(), header_rows.begin(), header_rows.end());
table.has_header = !header_rows.empty();
}
if (!tbody_html.empty()) {
auto body_rows = extract_all_tags(tbody_html, "tr");
row_htmls.insert(row_htmls.end(), body_rows.begin(), body_rows.end());
}
} else {
row_htmls = extract_all_tags(table_html, "tr");
// Check if first row has <th> tags
if (!row_htmls.empty()) {
table.has_header = (row_htmls[0].find("<th") != std::string::npos);
}
}
bool is_first_row = true;
for (const auto& row_html : row_htmls) {
TableRow row;
// Extract cells (both th and td)
auto th_cells = extract_all_tags(row_html, "th");
auto td_cells = extract_all_tags(row_html, "td");
// Process th cells (headers)
for (const auto& cell_html : th_cells) {
TableCell cell;
std::vector<InlineLink> inline_links;
cell.text = extract_text_with_links(cell_html, all_links, inline_links);
cell.inline_links = inline_links;
cell.is_header = true;
cell.colspan = 1;
cell.rowspan = 1;
row.cells.push_back(cell);
}
// Process td cells (data)
for (const auto& cell_html : td_cells) {
TableCell cell;
std::vector<InlineLink> inline_links;
cell.text = extract_text_with_links(cell_html, all_links, inline_links);
cell.inline_links = inline_links;
cell.is_header = is_first_row && table.has_header && th_cells.empty();
cell.colspan = 1;
cell.rowspan = 1;
row.cells.push_back(cell);
}
if (!row.cells.empty()) {
table.rows.push_back(row);
}
is_first_row = false;
}
if (!table.rows.empty()) {
tables.push_back(table);
}
}
return tables;
}
};
HtmlParser::HtmlParser() : pImpl(std::make_unique<Impl>()) {}
@ -271,33 +399,117 @@ ParsedDocument HtmlParser::parse(const std::string& html, const std::string& bas
// 提取链接
doc.links = pImpl->extract_links(main_content, base_url);
// Extract and add images
auto images = pImpl->extract_images(main_content);
for (const auto& img : images) {
ContentElement elem;
elem.type = ElementType::IMAGE;
elem.image_data = img;
elem.level = 0;
elem.list_number = 0;
elem.nesting_level = 0;
doc.elements.push_back(elem);
}
// Extract and add tables
auto tables = pImpl->extract_tables(main_content, doc.links);
for (const auto& tbl : tables) {
ContentElement elem;
elem.type = ElementType::TABLE;
elem.table_data = tbl;
elem.level = 0;
elem.list_number = 0;
elem.nesting_level = 0;
doc.elements.push_back(elem);
}
// 解析标题
for (int level = 1; level <= 6; ++level) {
std::string tag = "h" + std::to_string(level);
auto headings = pImpl->extract_all_tags(main_content, tag);
for (const auto& heading : headings) {
ContentElement elem;
elem.type = (level == 1) ? ElementType::HEADING1 :
(level == 2) ? ElementType::HEADING2 : ElementType::HEADING3;
ElementType type;
if (level == 1) type = ElementType::HEADING1;
else if (level == 2) type = ElementType::HEADING2;
else if (level == 3) type = ElementType::HEADING3;
else if (level == 4) type = ElementType::HEADING4;
else if (level == 5) type = ElementType::HEADING5;
else type = ElementType::HEADING6;
elem.type = type;
elem.text = pImpl->decode_html_entities(pImpl->trim(pImpl->remove_tags(heading)));
elem.level = level;
elem.list_number = 0;
elem.nesting_level = 0;
if (!elem.text.empty()) {
doc.elements.push_back(elem);
}
}
}
// 解析列表项
// 解析列表项 - with nesting support
if (pImpl->keep_lists) {
auto list_items = pImpl->extract_all_tags(main_content, "li");
for (const auto& item : list_items) {
std::string text = pImpl->decode_html_entities(pImpl->trim(pImpl->remove_tags(item)));
if (!text.empty() && text.length() > 1) {
ContentElement elem;
elem.type = ElementType::LIST_ITEM;
elem.text = text;
doc.elements.push_back(elem);
// Extract both <ul> and <ol> lists
auto ul_lists = pImpl->extract_all_tags(main_content, "ul");
auto ol_lists = pImpl->extract_all_tags(main_content, "ol");
// Helper to parse a list recursively
std::function<void(const std::string&, bool, int)> parse_list;
parse_list = [&](const std::string& list_html, bool is_ordered, int nesting) {
auto list_items = pImpl->extract_all_tags(list_html, "li");
int item_number = 1;
for (const auto& item_html : list_items) {
// Check if this item contains nested lists
bool has_nested_ul = item_html.find("<ul") != std::string::npos;
bool has_nested_ol = item_html.find("<ol") != std::string::npos;
// Extract text without nested lists
std::string item_text = item_html;
if (has_nested_ul || has_nested_ol) {
// Remove nested lists from text
item_text = std::regex_replace(item_text,
std::regex("<ul[^>]*>[\\s\\S]*?</ul>", std::regex::icase), "");
item_text = std::regex_replace(item_text,
std::regex("<ol[^>]*>[\\s\\S]*?</ol>", std::regex::icase), "");
}
std::string text = pImpl->decode_html_entities(pImpl->trim(pImpl->remove_tags(item_text)));
if (!text.empty() && text.length() > 1) {
ContentElement elem;
elem.type = is_ordered ? ElementType::ORDERED_LIST_ITEM : ElementType::LIST_ITEM;
elem.text = text;
elem.level = 0;
elem.list_number = item_number++;
elem.nesting_level = nesting;
doc.elements.push_back(elem);
}
// Parse nested lists
if (has_nested_ul) {
auto nested_uls = pImpl->extract_all_tags(item_html, "ul");
for (const auto& nested_ul : nested_uls) {
parse_list(nested_ul, false, nesting + 1);
}
}
if (has_nested_ol) {
auto nested_ols = pImpl->extract_all_tags(item_html, "ol");
for (const auto& nested_ol : nested_ols) {
parse_list(nested_ol, true, nesting + 1);
}
}
}
};
// Parse unordered lists
for (const auto& ul : ul_lists) {
parse_list(ul, false, 0);
}
// Parse ordered lists
for (const auto& ol : ol_lists) {
parse_list(ol, true, 0);
}
}
@ -307,6 +519,9 @@ ParsedDocument HtmlParser::parse(const std::string& html, const std::string& bas
ContentElement elem;
elem.type = ElementType::PARAGRAPH;
elem.text = pImpl->extract_text_with_links(para, doc.links, elem.inline_links);
elem.level = 0;
elem.list_number = 0;
elem.nesting_level = 0;
if (!elem.text.empty() && elem.text.length() > 1) {
doc.elements.push_back(elem);
}
@ -321,6 +536,9 @@ ParsedDocument HtmlParser::parse(const std::string& html, const std::string& bas
ContentElement elem;
elem.type = ElementType::PARAGRAPH;
elem.text = text;
elem.level = 0;
elem.list_number = 0;
elem.nesting_level = 0;
doc.elements.push_back(elem);
}
}
@ -339,6 +557,9 @@ ParsedDocument HtmlParser::parse(const std::string& html, const std::string& bas
ContentElement elem;
elem.type = ElementType::PARAGRAPH;
elem.text = line;
elem.level = 0;
elem.list_number = 0;
elem.nesting_level = 0;
doc.elements.push_back(elem);
}
}

View file

@ -9,13 +9,28 @@ enum class ElementType {
HEADING1,
HEADING2,
HEADING3,
HEADING4,
HEADING5,
HEADING6,
PARAGRAPH,
LINK,
LIST_ITEM,
ORDERED_LIST_ITEM,
BLOCKQUOTE,
CODE_BLOCK,
HORIZONTAL_RULE,
LINE_BREAK
LINE_BREAK,
TABLE,
IMAGE,
FORM,
SECTION_START,
SECTION_END,
NAV_START,
NAV_END,
HEADER_START,
HEADER_END,
ASIDE_START,
ASIDE_END
};
struct Link {
@ -32,12 +47,57 @@ struct InlineLink {
int link_index; // Index in the document's links array
};
struct TableCell {
std::string text;
std::vector<InlineLink> inline_links;
bool is_header;
int colspan;
int rowspan;
};
struct TableRow {
std::vector<TableCell> cells;
};
struct Table {
std::vector<TableRow> rows;
bool has_header;
};
struct Image {
std::string src;
std::string alt;
int width; // -1 if not specified
int height; // -1 if not specified
};
struct FormField {
enum Type { TEXT, PASSWORD, CHECKBOX, RADIO, SUBMIT, BUTTON } type;
std::string name;
std::string value;
std::string placeholder;
bool checked;
};
struct Form {
std::string action;
std::string method;
std::vector<FormField> fields;
};
struct ContentElement {
ElementType type;
std::string text;
std::string url;
int level;
int list_number; // For ordered lists
int nesting_level; // For nested lists
std::vector<InlineLink> inline_links; // Links within this element's text
// Extended content types
Table table_data;
Image image_data;
Form form_data;
};
struct ParsedDocument {