From ceffe59234265d0cd5f264fa62e27dc37ae94610 Mon Sep 17 00:00:00 2001 From: m1ngsama Date: Wed, 27 May 2026 09:18:23 +0800 Subject: [PATCH] Harden message log replay parsing --- docs/CHANGELOG.md | 3 + docs/Development-Guide.md | 5 ++ docs/ROADMAP.md | 4 +- src/message.c | 161 +++++++++++++++++++++----------------- tests/unit/test_message.c | 89 +++++++++++++++++++++ 5 files changed, 187 insertions(+), 75 deletions(-) diff --git a/docs/CHANGELOG.md b/docs/CHANGELOG.md index 13a7045..1b0c4cc 100644 --- a/docs/CHANGELOG.md +++ b/docs/CHANGELOG.md @@ -51,6 +51,9 @@ - Session callback refs are now owned and released through `client.c`, so bootstrap and interactive cleanup no longer need to manually mirror the main-ref / callback-ref release sequence. +- Message-log replay and search now share one strict record parser and skip + malformed, invalid UTF-8, extra-separator, oversized, or unterminated + records instead of accepting partial replay data. - The two-user lifecycle test now covers opening `:inbox` before a private message arrives, matching the way users often leave an inbox page open. - Private-message inbox access now uses its own mutex instead of sharing the diff --git a/docs/Development-Guide.md b/docs/Development-Guide.md index 5541bfe..362c6ce 100644 --- a/docs/Development-Guide.md +++ b/docs/Development-Guide.md @@ -296,6 +296,11 @@ void room_broadcast(chat_room_t *room, const message_t *msg) { 2024-01-13T10:30:45Z|username|message content ``` +Log replay and search use the same strict parser. A record is accepted only +when it has exactly three fields, a strict UTC RFC3339 timestamp, valid UTF-8 +username/content, bounded field lengths, and a trailing newline. Unterminated +last lines are treated as partial writes and skipped. + **Optimized loading** (backward scan): ```c /* Scan backwards from file end */ diff --git a/docs/ROADMAP.md b/docs/ROADMAP.md index 6bd64f9..7b70a1e 100644 --- a/docs/ROADMAP.md +++ b/docs/ROADMAP.md @@ -56,10 +56,10 @@ Goal: make stored history durable, inspectable, and recoverable. - formalize the message log format and version it - keep timestamps in a timezone-safe format throughout write and replay -- validate persisted UTF-8 and record structure before replay +- āœ… validate persisted UTF-8 and record structure before replay/search - add log rotation and compaction tooling - provide an offline inspection/export command -- define recovery behavior for truncated or partially corrupted logs +- define broader recovery behavior for truncated or partially corrupted logs ## Stage 4: Interactive UX diff --git a/src/message.c b/src/message.c index e48b5cd..667795c 100644 --- a/src/message.c +++ b/src/message.c @@ -26,6 +26,82 @@ static time_t parse_rfc3339_utc(const char *timestamp_str) { return timegm(&tm); } +static void discard_line_remainder(FILE *fp) { + int c; + + while ((c = fgetc(fp)) != '\n' && c != EOF) { + } +} + +static bool parse_log_record(const char *line, message_t *out, + time_t now) { + char line_copy[2048]; + char *first_sep; + char *second_sep; + char *timestamp_str; + char *username; + char *content; + time_t msg_time; + size_t line_len; + + if (!line || !out) { + return false; + } + + line_len = strlen(line); + if (line_len == 0 || line[line_len - 1] != '\n') { + return false; + } + if (line_len >= sizeof(line_copy)) { + return false; + } + + memcpy(line_copy, line, line_len + 1); + line_copy[line_len - 1] = '\0'; + + first_sep = strchr(line_copy, '|'); + if (!first_sep) { + return false; + } + second_sep = strchr(first_sep + 1, '|'); + if (!second_sep || strchr(second_sep + 1, '|')) { + return false; + } + + *first_sep = '\0'; + *second_sep = '\0'; + timestamp_str = line_copy; + username = first_sep + 1; + content = second_sep + 1; + + if (timestamp_str[0] == '\0' || username[0] == '\0' || + content[0] == '\0') { + return false; + } + if (strlen(username) >= MAX_USERNAME_LEN || + strlen(content) >= MAX_MESSAGE_LEN) { + return false; + } + if (!utf8_is_valid_string(username) || !utf8_is_valid_string(content)) { + return false; + } + + msg_time = parse_rfc3339_utc(timestamp_str); + if (msg_time == (time_t)-1) { + return false; + } + if (msg_time > now + 86400 || msg_time < now - 31536000 * 10) { + return false; + } + + out->timestamp = msg_time; + strncpy(out->username, username, MAX_USERNAME_LEN - 1); + out->username[MAX_USERNAME_LEN - 1] = '\0'; + strncpy(out->content, content, MAX_MESSAGE_LEN - 1); + out->content[MAX_MESSAGE_LEN - 1] = '\0'; + return true; +} + /* Initialize message subsystem */ void message_init(void) { /* Nothing to initialize for now */ @@ -120,65 +196,23 @@ int message_load(message_t **messages, int max_messages) { read_messages:; char line[2048]; int count = 0; + time_t now = time(NULL); /* Now read forward */ while (fgets(line, sizeof(line), fp) && count < max_messages) { /* Check for oversized lines */ size_t line_len = strlen(line); - if (line_len >= sizeof(line) - 1) { - /* Skip remainder of line */ - int c; - while ((c = fgetc(fp)) != '\n' && c != EOF); + if (line_len >= sizeof(line) - 1 && line[line_len - 1] != '\n') { + discard_line_remainder(fp); continue; } - /* Format: RFC3339_timestamp|username|content */ - char line_copy[2048]; - strncpy(line_copy, line, sizeof(line_copy) - 1); - line_copy[sizeof(line_copy) - 1] = '\0'; - - char *timestamp_str = strtok(line_copy, "|"); - char *username = strtok(NULL, "|"); - char *content = strtok(NULL, "\n"); - - /* Validate all fields exist and are non-empty */ - if (!timestamp_str || !username || !content) { - continue; - } - if (username[0] == '\0') { + message_t parsed; + if (!parse_log_record(line, &parsed, now)) { continue; } - /* Validate field lengths */ - if (strlen(username) >= MAX_USERNAME_LEN) { - continue; - } - if (strlen(content) >= MAX_MESSAGE_LEN) { - continue; - } - - if (!utf8_is_valid_string(username) || !utf8_is_valid_string(content)) { - continue; - } - - /* Parse strict UTC RFC3339 timestamp */ - time_t msg_time = parse_rfc3339_utc(timestamp_str); - if (msg_time == (time_t)-1) { - continue; - } - - /* Validate timestamp is reasonable (not in far future or past) */ - time_t now = time(NULL); - if (msg_time > now + 86400 || msg_time < now - 31536000 * 10) { - continue; - } - - msg_array[count].timestamp = msg_time; - strncpy(msg_array[count].username, username, MAX_USERNAME_LEN - 1); - msg_array[count].username[MAX_USERNAME_LEN - 1] = '\0'; - strncpy(msg_array[count].content, content, MAX_MESSAGE_LEN - 1); - msg_array[count].content[MAX_MESSAGE_LEN - 1] = '\0'; - count++; + msg_array[count++] = parsed; } fclose(fp); @@ -276,38 +310,19 @@ int message_search(const char *query, message_t **results, int max_results) { char line[2048]; int count = 0; + time_t now = time(NULL); while (fgets(line, sizeof(line), fp)) { size_t line_len = strlen(line); - if (line_len >= sizeof(line) - 1) { - int c; - while ((c = fgetc(fp)) != '\n' && c != EOF); + if (line_len >= sizeof(line) - 1 && line[line_len - 1] != '\n') { + discard_line_remainder(fp); continue; } - char line_copy[2048]; - strncpy(line_copy, line, sizeof(line_copy) - 1); - line_copy[sizeof(line_copy) - 1] = '\0'; - - char *timestamp_str = strtok(line_copy, "|"); - char *username = strtok(NULL, "|"); - char *content = strtok(NULL, "\n"); - - if (!timestamp_str || !username || !content || username[0] == '\0') continue; - if (strlen(username) >= MAX_USERNAME_LEN || strlen(content) >= MAX_MESSAGE_LEN) continue; - if (!utf8_is_valid_string(username) || !utf8_is_valid_string(content)) continue; - - if (strcasestr(username, query) == NULL && strcasestr(content, query) == NULL) continue; - - time_t msg_time = parse_rfc3339_utc(timestamp_str); - if (msg_time == (time_t)-1) continue; - message_t m; - m.timestamp = msg_time; - strncpy(m.username, username, MAX_USERNAME_LEN - 1); - m.username[MAX_USERNAME_LEN - 1] = '\0'; - strncpy(m.content, content, MAX_MESSAGE_LEN - 1); - m.content[MAX_MESSAGE_LEN - 1] = '\0'; + if (!parse_log_record(line, &m, now)) continue; + if (strcasestr(m.username, query) == NULL && + strcasestr(m.content, query) == NULL) continue; if (count < max_results) { res[count++] = m; diff --git a/tests/unit/test_message.c b/tests/unit/test_message.c index 0eb9012..060fa0d 100644 --- a/tests/unit/test_message.c +++ b/tests/unit/test_message.c @@ -3,8 +3,10 @@ #include #include #include +#include #include #include +#include #define TEST(name) static void test_##name() #define RUN_TEST(name) do { \ @@ -16,12 +18,45 @@ static int tests_passed = 0; static const char *test_log = "test_messages.log"; +static char test_state_dir[PATH_MAX]; /* Helper: Clean up test log file */ static void cleanup_test_log(void) { unlink(test_log); } +static void cleanup_state_dir(void) { + if (test_state_dir[0] != '\0') { + char log_path[PATH_MAX]; + snprintf(log_path, sizeof(log_path), "%s/messages.log", test_state_dir); + unlink(log_path); + rmdir(test_state_dir); + test_state_dir[0] = '\0'; + } + unsetenv("TNT_STATE_DIR"); +} + +static void setup_state_dir(void) { + const char *tmp = getenv("TMPDIR"); + + cleanup_state_dir(); + if (!tmp || tmp[0] == '\0') { + tmp = "/tmp"; + } + snprintf(test_state_dir, sizeof(test_state_dir), + "%s/tnt-message-test.XXXXXX", tmp); + assert(mkdtemp(test_state_dir) != NULL); + assert(setenv("TNT_STATE_DIR", test_state_dir, 1) == 0); +} + +static void format_rfc3339_now(char *buffer, size_t buf_size) { + time_t now = time(NULL); + struct tm tm_info; + + gmtime_r(&now, &tm_info); + strftime(buffer, buf_size, "%Y-%m-%dT%H:%M:%SZ", &tm_info); +} + /* Test message initialization */ TEST(message_init) { message_init(); @@ -122,6 +157,57 @@ TEST(message_save_basic) { cleanup_test_log(); } +TEST(message_load_skips_malformed_records) { + char ts[64]; + char log_path[PATH_MAX]; + message_t *messages = NULL; + + setup_state_dir(); + format_rfc3339_now(ts, sizeof(ts)); + snprintf(log_path, sizeof(log_path), "%s/messages.log", test_state_dir); + + FILE *fp = fopen(log_path, "wb"); + assert(fp != NULL); + fprintf(fp, "%s|alice|valid one\n", ts); + fprintf(fp, "not-a-date|bob|bad date\n"); + fprintf(fp, "%s||empty user\n", ts); + fprintf(fp, "%s|mallory|extra|pipe\n", ts); + fprintf(fp, "%s|badutf|bad \xC3\x28\n", ts); + fprintf(fp, "%s|partial|truncated record", ts); + fclose(fp); + + int count = message_load(&messages, 10); + assert(count == 1); + assert(strcmp(messages[0].username, "alice") == 0); + assert(strcmp(messages[0].content, "valid one") == 0); + free(messages); + cleanup_state_dir(); +} + +TEST(message_search_skips_malformed_records) { + char ts[64]; + char log_path[PATH_MAX]; + message_t *results = NULL; + + setup_state_dir(); + format_rfc3339_now(ts, sizeof(ts)); + snprintf(log_path, sizeof(log_path), "%s/messages.log", test_state_dir); + + FILE *fp = fopen(log_path, "wb"); + assert(fp != NULL); + fprintf(fp, "%s|alice|needle valid\n", ts); + fprintf(fp, "%s|mallory|needle extra|pipe\n", ts); + fprintf(fp, "%s|partial|needle truncated", ts); + fclose(fp); + + int count = message_search("needle", &results, 10); + assert(count == 1); + assert(strcmp(results[0].username, "alice") == 0); + assert(strcmp(results[0].content, "needle valid") == 0); + free(results); + cleanup_state_dir(); +} + /* Test edge cases */ TEST(message_edge_cases) { message_t msg; @@ -215,12 +301,15 @@ int main(void) { RUN_TEST(message_format_unicode); RUN_TEST(message_format_width_limits); RUN_TEST(message_save_basic); + RUN_TEST(message_load_skips_malformed_records); + RUN_TEST(message_search_skips_malformed_records); RUN_TEST(message_edge_cases); RUN_TEST(message_special_characters); RUN_TEST(message_buffer_safety); RUN_TEST(message_timestamp_formats); cleanup_test_log(); + cleanup_state_dir(); printf("\nāœ“ All %d tests passed!\n", tests_passed); return 0;