diff --git a/src/html_parser.cpp b/src/html_parser.cpp
index adedcbf..59d49bc 100644
--- a/src/html_parser.cpp
+++ b/src/html_parser.cpp
@@ -3,6 +3,7 @@
#include
#include
#include
+#include
class HtmlParser::Impl {
public:
@@ -234,6 +235,133 @@ public:
return result;
}
+
+ // Extract images
+ std::vector extract_images(const std::string& html) {
+ std::vector images;
+ std::regex img_regex(R"(
]*src\s*=\s*["']([^"']*)["'][^>]*>)", std::regex::icase);
+
+ auto begin = std::sregex_iterator(html.begin(), html.end(), img_regex);
+ auto end = std::sregex_iterator();
+
+ for (std::sregex_iterator i = begin; i != end; ++i) {
+ std::smatch match = *i;
+ Image img;
+ img.src = match[1].str();
+ img.width = -1;
+ img.height = -1;
+
+ // Extract alt text
+ std::string img_tag = match[0].str();
+ std::regex alt_regex(R"(alt\s*=\s*["']([^"']*)["'])", std::regex::icase);
+ std::smatch alt_match;
+ if (std::regex_search(img_tag, alt_match, alt_regex)) {
+ img.alt = decode_html_entities(alt_match[1].str());
+ }
+
+ // Extract width
+ std::regex width_regex(R"(width\s*=\s*["']?(\d+)["']?)", std::regex::icase);
+ std::smatch width_match;
+ if (std::regex_search(img_tag, width_match, width_regex)) {
+ try {
+ img.width = std::stoi(width_match[1].str());
+ } catch (...) {}
+ }
+
+ // Extract height
+ std::regex height_regex(R"(height\s*=\s*["']?(\d+)["']?)", std::regex::icase);
+ std::smatch height_match;
+ if (std::regex_search(img_tag, height_match, height_regex)) {
+ try {
+ img.height = std::stoi(height_match[1].str());
+ } catch (...) {}
+ }
+
+ images.push_back(img);
+ }
+
+ return images;
+ }
+
+ // Extract tables
+ std::vector extract_tables(const std::string& html, std::vector& all_links) {
+ std::vector tables;
+ auto table_contents = extract_all_tags(html, "table");
+
+ for (const auto& table_html : table_contents) {
+ Table table;
+ table.has_header = false;
+
+ // Extract rows
+ auto thead_html = extract_tag_content(table_html, "thead");
+ auto tbody_html = extract_tag_content(table_html, "tbody");
+
+ // If no thead/tbody, just get all rows
+ std::vector row_htmls;
+ if (!thead_html.empty() || !tbody_html.empty()) {
+ if (!thead_html.empty()) {
+ auto header_rows = extract_all_tags(thead_html, "tr");
+ row_htmls.insert(row_htmls.end(), header_rows.begin(), header_rows.end());
+ table.has_header = !header_rows.empty();
+ }
+ if (!tbody_html.empty()) {
+ auto body_rows = extract_all_tags(tbody_html, "tr");
+ row_htmls.insert(row_htmls.end(), body_rows.begin(), body_rows.end());
+ }
+ } else {
+ row_htmls = extract_all_tags(table_html, "tr");
+ // Check if first row has | tags
+ if (!row_htmls.empty()) {
+ table.has_header = (row_htmls[0].find(" | inline_links;
+ cell.text = extract_text_with_links(cell_html, all_links, inline_links);
+ cell.inline_links = inline_links;
+ cell.is_header = true;
+ cell.colspan = 1;
+ cell.rowspan = 1;
+ row.cells.push_back(cell);
+ }
+
+ // Process td cells (data)
+ for (const auto& cell_html : td_cells) {
+ TableCell cell;
+ std::vector inline_links;
+ cell.text = extract_text_with_links(cell_html, all_links, inline_links);
+ cell.inline_links = inline_links;
+ cell.is_header = is_first_row && table.has_header && th_cells.empty();
+ cell.colspan = 1;
+ cell.rowspan = 1;
+ row.cells.push_back(cell);
+ }
+
+ if (!row.cells.empty()) {
+ table.rows.push_back(row);
+ }
+
+ is_first_row = false;
+ }
+
+ if (!table.rows.empty()) {
+ tables.push_back(table);
+ }
+ }
+
+ return tables;
+ }
};
HtmlParser::HtmlParser() : pImpl(std::make_unique()) {}
@@ -271,33 +399,117 @@ ParsedDocument HtmlParser::parse(const std::string& html, const std::string& bas
// 提取链接
doc.links = pImpl->extract_links(main_content, base_url);
+ // Extract and add images
+ auto images = pImpl->extract_images(main_content);
+ for (const auto& img : images) {
+ ContentElement elem;
+ elem.type = ElementType::IMAGE;
+ elem.image_data = img;
+ elem.level = 0;
+ elem.list_number = 0;
+ elem.nesting_level = 0;
+ doc.elements.push_back(elem);
+ }
+
+ // Extract and add tables
+ auto tables = pImpl->extract_tables(main_content, doc.links);
+ for (const auto& tbl : tables) {
+ ContentElement elem;
+ elem.type = ElementType::TABLE;
+ elem.table_data = tbl;
+ elem.level = 0;
+ elem.list_number = 0;
+ elem.nesting_level = 0;
+ doc.elements.push_back(elem);
+ }
+
// 解析标题
for (int level = 1; level <= 6; ++level) {
std::string tag = "h" + std::to_string(level);
auto headings = pImpl->extract_all_tags(main_content, tag);
for (const auto& heading : headings) {
ContentElement elem;
- elem.type = (level == 1) ? ElementType::HEADING1 :
- (level == 2) ? ElementType::HEADING2 : ElementType::HEADING3;
+ ElementType type;
+ if (level == 1) type = ElementType::HEADING1;
+ else if (level == 2) type = ElementType::HEADING2;
+ else if (level == 3) type = ElementType::HEADING3;
+ else if (level == 4) type = ElementType::HEADING4;
+ else if (level == 5) type = ElementType::HEADING5;
+ else type = ElementType::HEADING6;
+
+ elem.type = type;
elem.text = pImpl->decode_html_entities(pImpl->trim(pImpl->remove_tags(heading)));
elem.level = level;
+ elem.list_number = 0;
+ elem.nesting_level = 0;
if (!elem.text.empty()) {
doc.elements.push_back(elem);
}
}
}
- // 解析列表项
+ // 解析列表项 - with nesting support
if (pImpl->keep_lists) {
- auto list_items = pImpl->extract_all_tags(main_content, "li");
- for (const auto& item : list_items) {
- std::string text = pImpl->decode_html_entities(pImpl->trim(pImpl->remove_tags(item)));
- if (!text.empty() && text.length() > 1) {
- ContentElement elem;
- elem.type = ElementType::LIST_ITEM;
- elem.text = text;
- doc.elements.push_back(elem);
+ // Extract both and lists
+ auto ul_lists = pImpl->extract_all_tags(main_content, "ul");
+ auto ol_lists = pImpl->extract_all_tags(main_content, "ol");
+
+ // Helper to parse a list recursively
+ std::function parse_list;
+ parse_list = [&](const std::string& list_html, bool is_ordered, int nesting) {
+ auto list_items = pImpl->extract_all_tags(list_html, "li");
+ int item_number = 1;
+
+ for (const auto& item_html : list_items) {
+ // Check if this item contains nested lists
+ bool has_nested_ul = item_html.find("", std::regex::icase), "");
+ item_text = std::regex_replace(item_text,
+ std::regex("]*>[\\s\\S]*? ", std::regex::icase), "");
+ }
+
+ std::string text = pImpl->decode_html_entities(pImpl->trim(pImpl->remove_tags(item_text)));
+ if (!text.empty() && text.length() > 1) {
+ ContentElement elem;
+ elem.type = is_ordered ? ElementType::ORDERED_LIST_ITEM : ElementType::LIST_ITEM;
+ elem.text = text;
+ elem.level = 0;
+ elem.list_number = item_number++;
+ elem.nesting_level = nesting;
+ doc.elements.push_back(elem);
+ }
+
+ // Parse nested lists
+ if (has_nested_ul) {
+ auto nested_uls = pImpl->extract_all_tags(item_html, "ul");
+ for (const auto& nested_ul : nested_uls) {
+ parse_list(nested_ul, false, nesting + 1);
+ }
+ }
+ if (has_nested_ol) {
+ auto nested_ols = pImpl->extract_all_tags(item_html, "ol");
+ for (const auto& nested_ol : nested_ols) {
+ parse_list(nested_ol, true, nesting + 1);
+ }
+ }
}
+ };
+
+ // Parse unordered lists
+ for (const auto& ul : ul_lists) {
+ parse_list(ul, false, 0);
+ }
+
+ // Parse ordered lists
+ for (const auto& ol : ol_lists) {
+ parse_list(ol, true, 0);
}
}
@@ -307,6 +519,9 @@ ParsedDocument HtmlParser::parse(const std::string& html, const std::string& bas
ContentElement elem;
elem.type = ElementType::PARAGRAPH;
elem.text = pImpl->extract_text_with_links(para, doc.links, elem.inline_links);
+ elem.level = 0;
+ elem.list_number = 0;
+ elem.nesting_level = 0;
if (!elem.text.empty() && elem.text.length() > 1) {
doc.elements.push_back(elem);
}
@@ -321,6 +536,9 @@ ParsedDocument HtmlParser::parse(const std::string& html, const std::string& bas
ContentElement elem;
elem.type = ElementType::PARAGRAPH;
elem.text = text;
+ elem.level = 0;
+ elem.list_number = 0;
+ elem.nesting_level = 0;
doc.elements.push_back(elem);
}
}
@@ -339,6 +557,9 @@ ParsedDocument HtmlParser::parse(const std::string& html, const std::string& bas
ContentElement elem;
elem.type = ElementType::PARAGRAPH;
elem.text = line;
+ elem.level = 0;
+ elem.list_number = 0;
+ elem.nesting_level = 0;
doc.elements.push_back(elem);
}
}
diff --git a/src/html_parser.h b/src/html_parser.h
index 90c2872..ed6bac3 100644
--- a/src/html_parser.h
+++ b/src/html_parser.h
@@ -9,13 +9,28 @@ enum class ElementType {
HEADING1,
HEADING2,
HEADING3,
+ HEADING4,
+ HEADING5,
+ HEADING6,
PARAGRAPH,
LINK,
LIST_ITEM,
+ ORDERED_LIST_ITEM,
BLOCKQUOTE,
CODE_BLOCK,
HORIZONTAL_RULE,
- LINE_BREAK
+ LINE_BREAK,
+ TABLE,
+ IMAGE,
+ FORM,
+ SECTION_START,
+ SECTION_END,
+ NAV_START,
+ NAV_END,
+ HEADER_START,
+ HEADER_END,
+ ASIDE_START,
+ ASIDE_END
};
struct Link {
@@ -32,12 +47,57 @@ struct InlineLink {
int link_index; // Index in the document's links array
};
+struct TableCell {
+ std::string text;
+ std::vector inline_links;
+ bool is_header;
+ int colspan;
+ int rowspan;
+};
+
+struct TableRow {
+ std::vector cells;
+};
+
+struct Table {
+ std::vector rows;
+ bool has_header;
+};
+
+struct Image {
+ std::string src;
+ std::string alt;
+ int width; // -1 if not specified
+ int height; // -1 if not specified
+};
+
+struct FormField {
+ enum Type { TEXT, PASSWORD, CHECKBOX, RADIO, SUBMIT, BUTTON } type;
+ std::string name;
+ std::string value;
+ std::string placeholder;
+ bool checked;
+};
+
+struct Form {
+ std::string action;
+ std::string method;
+ std::vector fields;
+};
+
struct ContentElement {
ElementType type;
std::string text;
std::string url;
int level;
+ int list_number; // For ordered lists
+ int nesting_level; // For nested lists
std::vector inline_links; // Links within this element's text
+
+ // Extended content types
+ Table table_data;
+ Image image_data;
+ Form form_data;
};
struct ParsedDocument {
|