use sqlite to save index, major thread pool refactor

simon987
2023-04-03 21:39:50 -04:00
parent ca973d63a4
commit fc36f33d52
62 changed files with 3630 additions and 4673 deletions
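The zstd/NDJSON writer shown (and removed) below is what this commit swaps out for an SQLite-backed index: `write_document()` now calls `database_write_document(ProcData.index_db, ...)`, whose implementation is not part of this file. As a rough sketch of the underlying technique only — one reusable prepared statement per connection, with a hypothetical `document(id, json)` table and made-up function names, not this project's actual schema or API:

#include <sqlite3.h>

// Hypothetical schema, for illustration only:
//   CREATE TABLE document (id TEXT PRIMARY KEY, json TEXT NOT NULL);
static sqlite3 *Db = NULL;
static sqlite3_stmt *InsertStmt = NULL;

int index_db_open(const char *path) {
    if (sqlite3_open(path, &Db) != SQLITE_OK) {
        return -1;
    }
    sqlite3_exec(Db,
                 "CREATE TABLE IF NOT EXISTS document (id TEXT PRIMARY KEY, json TEXT NOT NULL)",
                 NULL, NULL, NULL);

    // Compile the INSERT once; re-binding a prepared statement for each
    // document avoids parsing the SQL on every row
    return sqlite3_prepare_v2(Db, "INSERT OR REPLACE INTO document (id, json) VALUES (?, ?)",
                              -1, &InsertStmt, NULL) == SQLITE_OK ? 0 : -1;
}

int index_db_write_document(const char *doc_id, const char *json_str) {
    sqlite3_bind_text(InsertStmt, 1, doc_id, -1, SQLITE_STATIC);
    sqlite3_bind_text(InsertStmt, 2, json_str, -1, SQLITE_STATIC);

    int ret = sqlite3_step(InsertStmt);  // SQLITE_DONE on success

    sqlite3_reset(InsertStmt);
    sqlite3_clear_bindings(InsertStmt);
    return ret == SQLITE_DONE ? 0 : -1;
}

With a single connection owned by one thread, SQLite serializes the writes itself, which is presumably why the dedicated writer pool (`ScanCtx.writer_pool`) used by the old zstd stream also disappears in this refactor.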

serialize.c

@@ -1,9 +1,7 @@
#include "src/ctx.h"
#include "serialize.h"
#include "src/parsing/parse.h"
#include "src/parsing/mime.h"
#include <zstd.h>

char *get_meta_key_text(enum metakey meta_key) {
@@ -79,7 +77,7 @@ char *get_meta_key_text(enum metakey meta_key) {
        case MetaChecksum:
            return "checksum";
        default:
-            LOG_FATALF("serialize.c", "FIXME: Unknown meta key: %d", meta_key)
+            LOG_FATALF("serialize.c", "FIXME: Unknown meta key: %d", meta_key);
    }
}
@@ -175,7 +173,7 @@ char *build_json_string(document_t *doc) {
                break;
            }
            default:
-                LOG_FATALF("serialize.c", "Invalid meta key: %x %s", meta->key, get_meta_key_text(meta->key))
+                LOG_FATALF("serialize.c", "Invalid meta key: %x %s", meta->key, get_meta_key_text(meta->key));
        }

        meta_line_t *tmp = meta;
@@ -189,394 +187,10 @@ char *build_json_string(document_t *doc) {
    return json_str;
}

static struct {
    FILE *out_file;
    size_t buf_out_size;
    void *buf_out;
    ZSTD_CCtx *cctx;
} WriterCtx = {
        .out_file = NULL
};

#define ZSTD_COMPRESSION_LEVEL 10

void initialize_writer_ctx(const char *file_path) {
    WriterCtx.out_file = fopen(file_path, "wb");

    WriterCtx.buf_out_size = ZSTD_CStreamOutSize();
    WriterCtx.buf_out = malloc(WriterCtx.buf_out_size);

    WriterCtx.cctx = ZSTD_createCCtx();

    ZSTD_CCtx_setParameter(WriterCtx.cctx, ZSTD_c_compressionLevel, ZSTD_COMPRESSION_LEVEL);
    ZSTD_CCtx_setParameter(WriterCtx.cctx, ZSTD_c_checksumFlag, FALSE);

    LOG_DEBUGF("serialize.c", "Open index file for writing %s", file_path)
}

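// Feed `len` bytes to the streaming compressor and flush whatever compressed
// output it produces to the index file. With ZSTD_e_continue, zstd is free to
// buffer input internally, so a call may legitimately emit no output.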
void zstd_write_string(const char *string, const size_t len) {
    ZSTD_inBuffer input = {string, len, 0};

    do {
        ZSTD_outBuffer output = {WriterCtx.buf_out, WriterCtx.buf_out_size, 0};
        ZSTD_compressStream2(WriterCtx.cctx, &output, &input, ZSTD_e_continue);

        if (output.pos > 0) {
            ScanCtx.stat_index_size += fwrite(WriterCtx.buf_out, 1, output.pos, WriterCtx.out_file);
        }
    } while (input.pos != input.size);
}

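// Writer-pool callback: the output stream is opened lazily on the first
// document, so an index file is only created if at least one document is
// actually written.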
void write_document_func(tpool_work_arg_shm_t *arg) {
    const char *json_str = arg->arg;

    if (WriterCtx.out_file == NULL) {
        char dstfile[PATH_MAX];
        snprintf(dstfile, PATH_MAX, "%s_index_main.ndjson.zst", ScanCtx.index.path);
        initialize_writer_ctx(dstfile);
    }

    zstd_write_string(json_str, arg->arg_size);
}

void zstd_close() {
    if (WriterCtx.out_file == NULL) {
        LOG_DEBUG("serialize.c", "No zstd stream to close, skipping cleanup")
        return;
    }

    size_t remaining;
    do {
        ZSTD_outBuffer output = {WriterCtx.buf_out, WriterCtx.buf_out_size, 0};
        remaining = ZSTD_endStream(WriterCtx.cctx, &output);

        if (output.pos > 0) {
            ScanCtx.stat_index_size += fwrite(WriterCtx.buf_out, 1, output.pos, WriterCtx.out_file);
        }
    } while (remaining != 0);

    ZSTD_freeCCtx(WriterCtx.cctx);
    free(WriterCtx.buf_out);
    fclose(WriterCtx.out_file);

    LOG_DEBUG("serialize.c", "End zstd stream & close index file")
}

void writer_cleanup() {
    zstd_close();
    WriterCtx.out_file = NULL;
}

void write_index_descriptor(char *path, index_descriptor_t *desc) {
    cJSON *json = cJSON_CreateObject();
    cJSON_AddStringToObject(json, "id", desc->id);
    cJSON_AddStringToObject(json, "version", desc->version);
    cJSON_AddStringToObject(json, "root", desc->root);
    cJSON_AddStringToObject(json, "name", desc->name);
    cJSON_AddStringToObject(json, "type", desc->type);
    cJSON_AddStringToObject(json, "rewrite_url", desc->rewrite_url);
    cJSON_AddNumberToObject(json, "timestamp", (double) desc->timestamp);

    int fd = open(path, O_CREAT | O_WRONLY, S_IRUSR | S_IWUSR);
    if (fd < 0) {
        LOG_FATALF("serialize.c", "Could not open index descriptor: %s", strerror(errno));
    }
    char *str = cJSON_Print(json);
    // write(2) returns ssize_t, not size_t
    ssize_t ret = write(fd, str, strlen(str));
    if (ret == -1) {
        LOG_FATALF("serialize.c", "Could not write index descriptor: %s", strerror(errno));
    }

    free(str);
    close(fd);
    cJSON_Delete(json);
}

index_descriptor_t read_index_descriptor(char *path) {
    struct stat info;
    stat(path, &info);

    int fd = open(path, O_RDONLY);
    if (fd == -1) {
        LOG_FATALF("serialize.c", "Invalid/corrupt index (Could not find descriptor): %s: %s\n", path, strerror(errno))
    }

    char *buf = malloc(info.st_size + 1);
    // read(2) returns ssize_t, not size_t
    ssize_t ret = read(fd, buf, info.st_size);
    if (ret == -1) {
        LOG_FATALF("serialize.c", "Could not read index descriptor: %s", strerror(errno));
    }
    *(buf + info.st_size) = '\0';
    close(fd);

    cJSON *json = cJSON_Parse(buf);

    index_descriptor_t descriptor;
    descriptor.timestamp = (long) cJSON_GetObjectItem(json, "timestamp")->valuedouble;
    strcpy(descriptor.root, cJSON_GetObjectItem(json, "root")->valuestring);
    strcpy(descriptor.name, cJSON_GetObjectItem(json, "name")->valuestring);
    strcpy(descriptor.rewrite_url, cJSON_GetObjectItem(json, "rewrite_url")->valuestring);
    descriptor.root_len = (short) strlen(descriptor.root);
    strcpy(descriptor.version, cJSON_GetObjectItem(json, "version")->valuestring);
    strcpy(descriptor.id, cJSON_GetObjectItem(json, "id")->valuestring);

    if (cJSON_GetObjectItem(json, "type") == NULL) {
        strcpy(descriptor.type, INDEX_TYPE_NDJSON);
    } else {
        strcpy(descriptor.type, cJSON_GetObjectItem(json, "type")->valuestring);
    }

    cJSON_Delete(json);
    free(buf);

    return descriptor;
}

void write_document(document_t *doc) {
    char *json_str = build_json_string(doc);
-    const size_t json_str_len = strlen(json_str);
-    json_str = realloc(json_str, json_str_len + 1);
-    *(json_str + json_str_len) = '\n';
-    tpool_work_arg_t arg = {
-            .arg_size = json_str_len + 1,
-            .arg = json_str
-    };
-    tpool_add_work(ScanCtx.writer_pool, write_document_func, &arg);
+    database_write_document(ProcData.index_db, doc, json_str);
+    free(doc);
+    free(json_str);
}

void thread_cleanup() {
    cleanup_parse();
    cleanup_font();
}

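// Parse one NDJSON line into a cJSON document, merge in sidecar metadata and
// user tags looked up by the document's path MD5, then hand the result to
// `func`.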
void read_index_bin_handle_line(const char *line, const char *index_id, index_func func) {
    cJSON *document = cJSON_Parse(line);
    const char *path_md5_str = cJSON_GetObjectItem(document, "_id")->valuestring;

    cJSON_AddStringToObject(document, "index", index_id);

    // Load meta from sidecar files
    cJSON *meta_obj = NULL;
    if (IndexCtx.meta != NULL) {
        const char *meta_string = g_hash_table_lookup(IndexCtx.meta, path_md5_str);
        if (meta_string != NULL) {
            meta_obj = cJSON_Parse(meta_string);

            cJSON *child;
            for (child = meta_obj->child; child != NULL; child = child->next) {
                char meta_key[4096];
                strcpy(meta_key, child->string);
                cJSON_DeleteItemFromObject(document, meta_key);
                cJSON_AddItemReferenceToObject(document, meta_key, child);
            }
        }
    }

    // Load tags from tags DB
    if (IndexCtx.tags != NULL) {
        const char *tags_string = g_hash_table_lookup(IndexCtx.tags, path_md5_str);
        if (tags_string != NULL) {
            cJSON *tags_arr = cJSON_Parse(tags_string);
            cJSON_DeleteItemFromObject(document, "tag");
            cJSON_AddItemToObject(document, "tag", tags_arr);
        }
    }

    func(document, path_md5_str);

    cJSON_DeleteItemFromObject(document, "_id");
    cJSON_Delete(document);
    if (meta_obj) {
        cJSON_Delete(meta_obj);
    }
}

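// Stream-decompress the file at `path`, split the output on '\n', and invoke
// the processor once per NUL-terminated line.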
void read_lines(const char *path, const line_processor_t processor) {
    dyn_buffer_t buf = dyn_buffer_create();

    // Initialize zstd things
    FILE *file = fopen(path, "rb");

    size_t const buf_in_size = ZSTD_DStreamInSize();
    void *const buf_in = malloc(buf_in_size);

    size_t const buf_out_size = ZSTD_DStreamOutSize();
    void *const buf_out = malloc(buf_out_size);

    ZSTD_DCtx *const dctx = ZSTD_createDCtx();

    size_t read;
    size_t last_ret = 0;
    while ((read = fread(buf_in, 1, buf_in_size, file))) {
        ZSTD_inBuffer input = {buf_in, read, 0};

        while (input.pos < input.size) {
            ZSTD_outBuffer output = {buf_out, buf_out_size, 0};
            size_t const ret = ZSTD_decompressStream(dctx, &output, &input);

            for (size_t i = 0; i < output.pos; i++) {
                char c = ((char *) output.dst)[i];

                if (c == '\n') {
                    dyn_buffer_write_char(&buf, '\0');
                    processor.func(buf.buf, processor.data);
                    buf.cur = 0;
                } else {
                    dyn_buffer_write_char(&buf, c);
                }
            }
            last_ret = ret;
        }
    }

    if (last_ret != 0) {
        /* The last return value from ZSTD_decompressStream did not end on a
         * frame, but we reached the end of the file! We assume this is an
         * error, and the input was truncated.
         */
        LOG_FATALF("serialize.c", "EOF before end of stream: %zu", last_ret)
    }

    ZSTD_freeDCtx(dctx);
    free(buf_in);
    free(buf_out);
    dyn_buffer_destroy(&buf);
    fclose(file);
}

void read_index_ndjson(const char *line, void *_data) {
    void **data = _data;
    const char *index_id = data[0];
    index_func func = data[1];
    read_index_bin_handle_line(line, index_id, func);
}

void read_index(const char *path, const char index_id[SIST_INDEX_ID_LEN], const char *type, index_func func) {
    if (strcmp(type, INDEX_TYPE_NDJSON) == 0) {
        read_lines(path, (line_processor_t) {
                .data = (void *[2]) {(void *) index_id, func},
                .func = read_index_ndjson,
        });
    }
}

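// Incremental-scan bookkeeping: record each document's mtime keyed by its
// _id (path MD5). On a later pass, incremental_get() consults such a table to
// decide which documents are unchanged and can be copied over as-is.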
static __thread GHashTable *IncrementalReadTable = NULL;

void json_put_incremental(cJSON *document, UNUSED(const char doc_id[SIST_DOC_ID_LEN])) {
    const char *path_md5_str = cJSON_GetObjectItem(document, "_id")->valuestring;
    const int mtime = cJSON_GetObjectItem(document, "mtime")->valueint;

    incremental_put(IncrementalReadTable, path_md5_str, mtime);
}

void incremental_read(GHashTable *table, const char *filepath, index_descriptor_t *desc) {
    IncrementalReadTable = table;
    read_index(filepath, desc->id, desc->type, json_put_incremental);
}

static __thread GHashTable *IncrementalCopyTable = NULL;
static __thread GHashTable *IncrementalNewTable = NULL;
static __thread store_t *IncrementalCopySourceStore = NULL;
static __thread store_t *IncrementalCopyDestinationStore = NULL;

void incremental_copy_handle_doc(cJSON *document, UNUSED(const char id_str[SIST_DOC_ID_LEN])) {
    const char *doc_id = cJSON_GetObjectItem(document, "_id")->valuestring;

    if (cJSON_GetObjectItem(document, "parent") != NULL || incremental_get(IncrementalCopyTable, doc_id)) {
        // Copy index line
        cJSON_DeleteItemFromObject(document, "index");

        char *json_str = cJSON_PrintUnformatted(document);
        const size_t json_str_len = strlen(json_str);

        json_str = realloc(json_str, json_str_len + 1);
        *(json_str + json_str_len) = '\n';

        // Copy tn store contents
        size_t buf_len;
        char *buf = store_read(IncrementalCopySourceStore, (char *) doc_id, SIST_DOC_ID_LEN, &buf_len);
        if (buf_len != 0) {
            store_write(IncrementalCopyDestinationStore, (char *) doc_id, SIST_DOC_ID_LEN, buf, buf_len);
            free(buf);
        }

        // Also copy additional thumbnails
        if (cJSON_GetObjectItem(document, "thumbnail") != NULL) {
            const int thumbnail_count = cJSON_GetObjectItem(document, "thumbnail")->valueint;

            for (int i = 1; i < thumbnail_count; i++) {
                char tn_key[SIST_DOC_ID_LEN + sizeof(char) * 4];
                snprintf(tn_key, sizeof(tn_key), "%s%04d", doc_id, i);

                buf = store_read(IncrementalCopySourceStore, tn_key, sizeof(tn_key), &buf_len);
                if (buf_len != 0) {
                    store_write(IncrementalCopyDestinationStore, tn_key, sizeof(tn_key), buf, buf_len);
                    free(buf);
                }
            }
        }

        zstd_write_string(json_str, json_str_len + 1);
        free(json_str);
    }
}

/**
 * Copy items from an index that are in the copy_table. Also copies from
 * the store.
 */
void incremental_copy(store_t *store, store_t *dst_store, const char *filepath,
                      const char *dst_filepath, GHashTable *copy_table) {
    if (WriterCtx.out_file == NULL) {
        initialize_writer_ctx(dst_filepath);
    }

    IncrementalCopyTable = copy_table;
    IncrementalCopySourceStore = store;
    IncrementalCopyDestinationStore = dst_store;

    read_index(filepath, "", INDEX_TYPE_NDJSON, incremental_copy_handle_doc);
}

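// Each entry in the delete list is a fixed-width record written through the
// zstd stream: SIST_DOC_ID_LEN - 1 id characters, then '\n', then '\0'
// (sizeof(doc_id_n) bytes in total).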
void incremental_delete_handle_doc(cJSON *document, UNUSED(const char id_str[SIST_DOC_ID_LEN])) {
    char doc_id_n[SIST_DOC_ID_LEN + 1];
    doc_id_n[SIST_DOC_ID_LEN] = '\0';
    doc_id_n[SIST_DOC_ID_LEN - 1] = '\n';
    const char *doc_id = cJSON_GetObjectItem(document, "_id")->valuestring;

    // do not delete archive virtual entries
    if (cJSON_GetObjectItem(document, "parent") == NULL
        && !incremental_get(IncrementalCopyTable, doc_id)
        && !incremental_get(IncrementalNewTable, doc_id)) {
        memcpy(doc_id_n, doc_id, SIST_DOC_ID_LEN - 1);
        // Write the newline-terminated buffer (not the raw doc_id, which has
        // no trailing '\n')
        zstd_write_string(doc_id_n, sizeof(doc_id_n));
    }
}

void incremental_delete(const char *del_filepath, const char *index_filepath,
                        GHashTable *copy_table, GHashTable *new_table) {
    if (WriterCtx.out_file == NULL) {
        initialize_writer_ctx(del_filepath);
    }

    IncrementalCopyTable = copy_table;
    IncrementalNewTable = new_table;

    read_index(index_filepath, "", INDEX_TYPE_NDJSON, incremental_delete_handle_doc);
}