mirror of
https://github.com/m1ngsama/TUT.git
synced 2025-12-26 12:04:11 +00:00
Major improvements:
- Add proper DOM tree structure (dom_tree.cpp/h) with hierarchical node representation
- Refactor HTML parser to use DOM tree instead of flat ContentElement structure
- Enhance text renderer with improved inline content handling and UTF-8 support
- Improve browser interactive element tracking with byte-accurate positioning
- Add comprehensive HTML entity decoding (80+ named entities + numeric)
- Enhance form handling with better field tracking and submission

Code quality improvements:
- Fix all compiler warnings (unused parameters/variables)
- Clean build with zero warnings
- Better separation of concerns between parsing and rendering

Testing:
- Add test_table.html for table rendering verification

This change enables better handling of complex HTML structures while maintaining the Unix philosophy of simplicity and focus.
136 lines
2.6 KiB
C++
136 lines
2.6 KiB
C++
#pragma once
|
||
|
||
#include <string>
|
||
#include <vector>
|
||
#include <memory>
|
||
|
||
// Forward declaration: DocumentTree is only named here (as the return type of
// HtmlParser::parse_tree); its definition lives outside this header.
struct DocumentTree;
|
||
|
||
// Kinds of content element stored in the flat ContentElement list.
// Enumerator order (and therefore the underlying values) is unchanged.
enum class ElementType {
    // Plain text and headings
    TEXT,
    HEADING1,
    HEADING2,
    HEADING3,
    HEADING4,
    HEADING5,
    HEADING6,

    // Block-level and inline content
    PARAGRAPH,
    LINK,
    LIST_ITEM,
    ORDERED_LIST_ITEM,
    BLOCKQUOTE,
    CODE_BLOCK,
    HORIZONTAL_RULE,
    LINE_BREAK,

    // Extended content
    TABLE,
    IMAGE,

    // Forms and their controls
    FORM,
    INPUT,
    TEXTAREA,
    SELECT,
    OPTION,
    BUTTON,

    // Paired start/end markers for sectioning containers
    SECTION_START,
    SECTION_END,
    NAV_START,
    NAV_END,
    HEADER_START,
    HEADER_END,
    ASIDE_START,
    ASIDE_END
};
|
||
|
||
// A link collected for a document (see ParsedDocument::links).
struct Link {
    std::string text;   // Link text
    std::string url;    // Link target
    // NOTE(review): the meaning of `position` is not visible in this header —
    // confirm against the parser. Defaults to 0 so a default-constructed Link
    // never carries an indeterminate value.
    int position = 0;
};
|
||
|
||
// A link (or form field) embedded inside an element's text.
// Every member has a default so a default-constructed InlineLink is fully
// initialized; an index of -1 means "not assigned".
struct InlineLink {
    std::string text;
    std::string url;
    size_t start_pos = 0;   // Position in the text where link starts
    size_t end_pos = 0;     // Position in the text where link ends
    int link_index = -1;    // Index in the document's links array
    int field_index = -1;   // Index in the document's form_fields array
};
|
||
|
||
struct TableCell {
|
||
std::string text;
|
||
std::vector<InlineLink> inline_links;
|
||
bool is_header;
|
||
int colspan;
|
||
int rowspan;
|
||
};
|
||
|
||
struct TableRow {
|
||
std::vector<TableCell> cells;
|
||
};
|
||
|
||
struct Table {
|
||
std::vector<TableRow> rows;
|
||
bool has_header;
|
||
};
|
||
|
||
// An image reference.
// width/height use -1 as the "not specified" sentinel; they now default to
// that sentinel instead of being left indeterminate.
struct Image {
    std::string src;
    std::string alt;
    int width = -1;    // -1 if not specified
    int height = -1;   // -1 if not specified
};
|
||
|
||
// A single form control.
// Member order is unchanged; `type` and `checked` now have defaults so a
// default-constructed field is fully initialized.
struct FormField {
    enum Type { TEXT, PASSWORD, CHECKBOX, RADIO, SUBMIT, BUTTON };

    Type type = TEXT;          // Kind of control
    std::string name;
    std::string value;
    std::string placeholder;
    bool checked = false;      // presumably only meaningful for CHECKBOX/RADIO — confirm
};
|
||
|
||
struct Form {
|
||
std::string action;
|
||
std::string method;
|
||
std::vector<FormField> fields;
|
||
};
|
||
|
||
struct ContentElement {
|
||
ElementType type;
|
||
std::string text;
|
||
std::string url;
|
||
int level;
|
||
int list_number; // For ordered lists
|
||
int nesting_level; // For nested lists
|
||
std::vector<InlineLink> inline_links; // Links within this element's text
|
||
|
||
// Extended content types
|
||
Table table_data;
|
||
Image image_data;
|
||
Form form_data;
|
||
};
|
||
|
||
struct ParsedDocument {
|
||
std::string title;
|
||
std::string url;
|
||
std::vector<ContentElement> elements;
|
||
std::vector<Link> links;
|
||
};
|
||
|
||
class HtmlParser {
|
||
public:
|
||
HtmlParser();
|
||
~HtmlParser();
|
||
|
||
// 新接口:使用DOM树解析
|
||
DocumentTree parse_tree(const std::string& html, const std::string& base_url = "");
|
||
|
||
// 旧接口:保持向后兼容(已废弃,内部使用parse_tree)
|
||
ParsedDocument parse(const std::string& html, const std::string& base_url = "");
|
||
|
||
void set_keep_code_blocks(bool keep);
|
||
void set_keep_lists(bool keep);
|
||
|
||
private:
|
||
class Impl;
|
||
std::unique_ptr<Impl> pImpl;
|
||
};
|