libgenbulk

Owner: IIIlllIIIllI URL: git@github.com:nyangkosense/libgenbulk.git
src/parser.zig

const std = @import("std");
const ascii = std.ascii;
const mem = std.mem;
const fs = std.fs;

/// Command-line entry point.
///
/// Usage: `<sql_file_path> [output_file_path] [--languages=a,b,...] [--debug=md5]`.
/// The first argument after the SQL path that does not start with `--` is
/// taken as the output file; every other argument is inspected for the
/// `--languages=` and `--debug=` options. Unknown options are ignored.
pub fn main() !void {
    var general_purpose_allocator = std.heap.GeneralPurposeAllocator(.{}){};
    const gpa = general_purpose_allocator.allocator();
    defer _ = general_purpose_allocator.deinit();

    const args = try std.process.argsAlloc(gpa);
    defer std.process.argsFree(gpa, args);

    if (args.len < 2) {
        std.debug.print("Usage: {s} <sql_file_path> [output_file_path] [--languages=lang1,lang2,...] [--debug=md5]\n", .{args[0]});
        std.debug.print("If output_file_path is not provided, results will be printed to stdout\n", .{});
        std.debug.print("Example: {s} libgen.sql books.txt --languages=english,german,russian\n", .{args[0]});
        return;
    }

    const file_path = args[1];

    var output_path: ?[]const u8 = null;
    var debug_md5: []const u8 = "";

    // Owns lower-cased copies of every requested language name.
    var languages = std.ArrayList([]const u8).init(gpa);
    defer {
        for (languages.items) |lang| {
            gpa.free(lang);
        }
        languages.deinit();
    }

    // Start at index 2 so that options work even when no output path is
    // given; slicing from 3 would be out of bounds for a two-element argv.
    for (args[2..]) |arg| {
        if (std.mem.startsWith(u8, arg, "--languages=")) {
            var lang_iter = std.mem.splitScalar(u8, arg["--languages=".len..], ',');
            while (lang_iter.next()) |lang| {
                if (lang.len == 0) continue;
                const normalized_lang = try std.ascii.allocLowerString(gpa, lang);
                // Do not leak the copy if growing the list fails.
                errdefer gpa.free(normalized_lang);
                try languages.append(normalized_lang);
            }
        } else if (std.mem.startsWith(u8, arg, "--debug=")) {
            debug_md5 = arg["--debug=".len..];
        } else if (output_path == null and !std.mem.startsWith(u8, arg, "--")) {
            output_path = arg;
        }
    }

    if (languages.items.len > 0) {
        std.debug.print("Filtering for languages: ", .{});
        for (languages.items) |lang| {
            std.debug.print("{s} ", .{lang});
        }
        std.debug.print("\n", .{});
    } else {
        std.debug.print("No language filters - including all languages\n", .{});
    }

    const output_file: ?fs.File = if (output_path) |path|
        try fs.cwd().createFile(path, .{})
    else
        null;
    // Close on every exit path, including an error from processing.
    defer if (output_file) |of| of.close();

    try processLargeSQLFile(file_path, output_file, debug_md5, &languages, gpa);

    std.debug.print("SQL processing completed.\n", .{});
}

/// Streams the SQL dump at `file_path` in 1 MiB chunks and hands every
/// parenthesised tuple found after the first "LOCK TABLES" marker to
/// `processEntryTupleFromBuffer`.
///
/// Scanner state (marker progress, quote/escape state, the partially read
/// tuple) is carried across chunk boundaries, so a marker, a tuple or an
/// escape sequence split between two reads is still recognised.
///
/// Errors from opening or reading the file and out-of-memory conditions are
/// returned; any other per-tuple error is logged and processing continues.
fn processLargeSQLFile(file_path: []const u8, output_file: ?fs.File, debug_md5: []const u8, languages: *std.ArrayList([]const u8), allocator: std.mem.Allocator) !void {
    std.debug.print("Opening file: {s}\n", .{file_path});

    const file = try std.fs.cwd().openFile(file_path, .{});
    defer file.close();

    const file_size = try file.getEndPos();
    std.debug.print("File size: {} bytes\n", .{file_size});

    // Using a 1MB buffer to read chunks
    const buffer_size = 1024 * 1024;
    const buffer = try allocator.alloc(u8, buffer_size);
    defer allocator.free(buffer);

    const values_marker = "LOCK TABLES";
    var marker_matched: usize = 0; // leading marker bytes matched so far
    var in_values_section = false;
    var in_tuple = false;
    var in_quotes = false;
    var escape_next = false; // previous byte was an unescaped backslash inside quotes
    var bytes_read: u64 = 0;
    var tuple_buffer = std.ArrayList(u8).init(allocator);
    defer tuple_buffer.deinit();

    var entry_count: usize = 0;
    var last_progress_pct: u64 = 0;

    while (true) {
        const read_amount = try file.read(buffer);
        if (read_amount == 0) break; // End of file

        bytes_read += read_amount;

        // A reported size of 0 with readable data would otherwise divide by zero.
        if (file_size > 0) {
            const progress_pct = bytes_read * 100 / file_size;
            if (progress_pct > last_progress_pct) {
                std.debug.print("Progress: {}% ({} of {} bytes)\n", .{ progress_pct, bytes_read, file_size });
                last_progress_pct = progress_pct;
            }
        }

        for (buffer[0..read_amount]) |c| {
            if (!in_values_section) {
                marker_matched = advanceMarkerMatch(values_marker, marker_matched, c);
                if (marker_matched == values_marker.len) {
                    in_values_section = true;
                    std.debug.print("Found start of values section\n", .{});
                }
                continue;
            }

            if (in_quotes) {
                // Inside a string literal a backslash escapes exactly the next
                // byte, so `\\'` ends the string while `\'` does not.
                if (escape_next) {
                    escape_next = false;
                } else if (c == '\\') {
                    escape_next = true;
                } else if (c == '\'') {
                    in_quotes = false;
                }
            } else if (c == '\'') {
                in_quotes = true;
            } else if (c == '(') {
                if (!in_tuple) {
                    in_tuple = true;
                    tuple_buffer.clearRetainingCapacity();
                }
            } else if (c == ')' and in_tuple) {
                try tuple_buffer.append(')'); // Add the closing parenthesis
                processEntryTupleFromBuffer(&tuple_buffer, output_file, debug_md5, languages, allocator, &entry_count) catch |err| switch (err) {
                    // Running out of memory is not a per-tuple problem.
                    error.OutOfMemory => return err,
                    // Log error but continue processing
                    else => std.debug.print("Error processing tuple: {}\n", .{err}),
                };
                in_tuple = false;
                continue;
            }

            if (in_tuple) {
                try tuple_buffer.append(c);
            }
        }
    }

    std.debug.print("Processed {} entries from {} bytes\n", .{ entry_count, bytes_read });
}

/// Advances an incremental search for `marker` by one input byte.
///
/// `matched` is the number of leading `marker` bytes the input consumed so far
/// ends with (must be less than `marker.len`). Returns that count after `c`
/// has been consumed; a result equal to `marker.len` means the marker was
/// just completed.
fn advanceMarkerMatch(marker: []const u8, matched: usize, c: u8) usize {
    // Try the longest possible new prefix first and fall back to shorter ones.
    var k: usize = @min(matched + 1, marker.len);
    while (k > 0) : (k -= 1) {
        if (marker[k - 1] != c) continue;
        // The k-1 bytes before `c` must also line up with the marker's start.
        if (mem.eql(u8, marker[0 .. k - 1], marker[matched - (k - 1) .. matched])) return k;
    }
    return 0;
}

/// Parses one `(field,field,...)` tuple and emits a download URL for it.
///
/// Fields read by index: 0 = ID, 1 = title, 5 = author, 6 = year,
/// 12 = language, 36 = extension, 37 = MD5, 40 = locator, 41 = local.
/// Tuples shorter than 10 bytes, with fewer than 41 fields, with an empty MD5,
/// or whose language is not in `languages` (when that list is non-empty) are
/// skipped without error.
///
/// The URL is written as one line to `output_file`, or to stdout when
/// `output_file` is null. `entry_count` is incremented only after the line was
/// written successfully. Allocation and write failures are returned.
fn processEntryTupleFromBuffer(tuple_buffer: *std.ArrayList(u8), output_file: ?fs.File, debug_md5: []const u8, languages: *std.ArrayList([]const u8), allocator: std.mem.Allocator, entry_count: *usize) !void {
    const tuple = tuple_buffer.items;

    if (tuple.len < 10) return; // Skip very small tuples

    var fields = std.ArrayList([]const u8).init(allocator);
    defer fields.deinit();

    // Skip the opening parenthesis if present.
    const start_idx: usize = if (tuple[0] == '(') 1 else 0;
    var field_start: usize = start_idx;
    var in_quotes = false;
    var escape_next = false; // previous byte was an unescaped backslash inside quotes

    for (tuple[start_idx..], start_idx..) |c, i| {
        if (in_quotes) {
            // A backslash escapes exactly one following byte, so a value
            // ending in `\\` is closed correctly by the quote after it.
            if (escape_next) {
                escape_next = false;
            } else if (c == '\\') {
                escape_next = true;
            } else if (c == '\'') {
                in_quotes = false;
            }
        } else if (c == '\'') {
            in_quotes = true;
        } else if (c == ',') {
            try fields.append(tuple[field_start..i]);
            field_start = i + 1;
        } else if (c == ')' and i == tuple.len - 1) {
            // End of tuple
            if (field_start < i) {
                try fields.append(tuple[field_start..i]);
            }
            break;
        }
    }

    const items = fields.items;
    if (items.len < 41) return;

    const md5 = try cleanSQLString(items[37], allocator);
    defer allocator.free(md5);

    if (md5.len == 0) return; // Skip if no MD5

    const is_debug_entry = debug_md5.len > 0 and std.mem.eql(u8, md5, debug_md5);

    // Apply the language filter before doing any further per-entry work.
    const language = try cleanSQLString(items[12], allocator);
    defer allocator.free(language);

    if (languages.items.len > 0) {
        const language_allowed = for (languages.items) |allowed_lang| {
            if (std.ascii.eqlIgnoreCase(language, allowed_lang)) break true;
        } else false;

        if (!language_allowed) {
            if (is_debug_entry) {
                std.debug.print("Debug entry skipped by language filter (language: {s})\n", .{language});
            }
            return;
        }
    }

    const id = try cleanSQLString(items[0], allocator);
    defer allocator.free(id);

    const title = try cleanSQLString(items[1], allocator);
    defer allocator.free(title);

    const author = try cleanSQLString(items[5], allocator);
    defer allocator.free(author);

    const year = try cleanSQLString(items[6], allocator);
    defer allocator.free(year);

    const extension = try cleanSQLString(items[36], allocator);
    defer allocator.free(extension);

    // I'm not so sure how this ID is defined, so i tried this and it does seem to work (for testing examples)
    // Thus, im using this logic here
    // A simple, direct approach to get folder ID: use record ID / 1000 * 1000
    var folder_id_buf: [20]u8 = undefined; // Stack buffer for folder ID
    const folder_id: []const u8 = blk: {
        const id_num = std.fmt.parseInt(u32, id, 10) catch 0;
        if (id_num == 0) {
            // Missing, unparsable or zero ID: fall back to folder 0.
            if (is_debug_entry) {
                std.debug.print("Couldn't parse ID, using default folder 0\n", .{});
            }
            break :blk "0";
        }

        const folder_num = (id_num / 1000) * 1000;
        const text = std.fmt.bufPrint(&folder_id_buf, "{d}", .{folder_num}) catch "0";
        if (is_debug_entry) {
            std.debug.print("Calculated folder ID: {s} from record ID: {s}\n", .{ text, id });
        }
        break :blk text;
    };

    var filename_buffer = std.ArrayList(u8).init(allocator);
    defer filename_buffer.deinit();

    // Format: Author - Title (Year).extension
    if (author.len > 0) {
        try filename_buffer.appendSlice(author);
        try filename_buffer.appendSlice(" - ");
    }

    try filename_buffer.appendSlice(title);

    if (year.len > 0) {
        try filename_buffer.appendSlice(" (");
        try filename_buffer.appendSlice(year);
        try filename_buffer.appendSlice(")");
    }

    if (extension.len > 0) {
        try filename_buffer.appendSlice(".");
        try filename_buffer.appendSlice(extension);
    }

    const filename = filename_buffer.items;

    // Worst case: each byte becomes %XX. One spare byte keeps a conservative
    // bounds check in the encoder from rejecting an all-escaped filename.
    const encoded_filename = try allocator.alloc(u8, filename.len * 3 + 1);
    defer allocator.free(encoded_filename);

    const encoded_size = try urlEncode(filename, encoded_filename);

    const url = try std.fmt.allocPrint(allocator, "https://download.books.ms/main/{s}/{s}/{s}", .{ folder_id, md5, encoded_filename[0..encoded_size] });
    defer allocator.free(url);

    if (is_debug_entry) {
        // Locator and local are only reported here, so decode them on demand.
        const locator = try cleanSQLString(items[40], allocator);
        defer allocator.free(locator);

        const local = try cleanSQLString(if (items.len > 41) items[41] else "", allocator);
        defer allocator.free(local);

        std.debug.print("\n=== DEBUG INFO FOR MD5: {s} ===\n", .{md5});
        std.debug.print("ID: {s}\n", .{id});
        std.debug.print("Title: {s}\n", .{title});
        std.debug.print("Extension: {s}\n", .{extension});
        std.debug.print("Author: {s}\n", .{author});
        std.debug.print("Year: {s}\n", .{year});
        std.debug.print("Locator: {s}\n", .{locator});
        std.debug.print("Local: {s}\n", .{local});
        std.debug.print("Folder ID: {s}\n", .{folder_id});
        std.debug.print("Generated URL: {s}\n", .{url});
        std.debug.print("=== END DEBUG INFO ===\n\n", .{});
    }

    if (output_file) |of| {
        try of.writeAll(url);
        try of.writeAll("\n");
    } else {
        // Results go to stdout; std.debug.print would send them to stderr,
        // mixed in with the progress messages.
        try std.io.getStdOut().writer().print("{s}\n", .{url});
    }

    entry_count.* += 1;
}

/// Decodes one raw SQL field into a newly allocated string.
///
/// A single pair of enclosing single quotes is removed if present, then
/// backslash escapes are resolved: `\'`, `\\` and `\"` yield the escaped
/// character, `\n`, `\t`, `\r`, `\b`, `\0` and `\Z` yield the matching control
/// byte, and any other `\x` yields `x`. A lone trailing backslash is kept.
///
/// Caller owns the returned slice and must free it with `allocator`.
/// Returns `error.OutOfMemory` if allocation fails.
fn cleanSQLString(s: []const u8, allocator: std.mem.Allocator) ![]u8 {
    if (s.len == 0) return allocator.dupe(u8, "");

    var result = s;
    if (result.len >= 2 and result[0] == '\'' and result[result.len - 1] == '\'') {
        result = result[1 .. result.len - 1];
    }

    var cleaned = std.ArrayList(u8).init(allocator);
    errdefer cleaned.deinit();

    // Decoding never grows the text, so one reservation is enough.
    try cleaned.ensureTotalCapacity(result.len);

    var i: usize = 0;
    while (i < result.len) {
        if (result[i] == '\\' and i + 1 < result.len) {
            const escaped = result[i + 1];
            const decoded: u8 = switch (escaped) {
                'n' => '\n',
                't' => '\t',
                'r' => '\r',
                'b' => 0x08,
                '0' => 0x00,
                'Z' => 0x1A,
                // Covers \' \\ \" and unknown escapes: drop the backslash.
                else => escaped,
            };
            try cleaned.append(decoded);
            i += 2;
        } else {
            try cleaned.append(result[i]);
            i += 1;
        }
    }

    return cleaned.toOwnedSlice();
}

/// Percent-encodes `input` into `buffer` and returns the number of bytes written.
///
/// ASCII letters, digits and `-`, `_`, `.`, `~` are copied unchanged; every
/// other byte (including space) is written as `%XX` with upper-case hex
/// digits. A buffer of `3 * input.len` bytes is always large enough.
///
/// Returns `error.BufferTooSmall` if `buffer` cannot hold the encoded output;
/// bytes encoded before the failing one have already been written.
fn urlEncode(input: []const u8, buffer: []u8) !usize {
    const hex_digits = "0123456789ABCDEF";

    var j: usize = 0;
    for (input) |c| {
        const unreserved = ascii.isAlphanumeric(c) or c == '-' or c == '_' or c == '.' or c == '~';
        const needed: usize = if (unreserved) 1 else 3;

        // Compare against the space actually required for this byte, so a
        // buffer that is exactly large enough is accepted.
        if (needed > buffer.len - j) {
            return error.BufferTooSmall;
        }

        if (unreserved) {
            buffer[j] = c;
        } else {
            buffer[j] = '%';
            buffer[j + 1] = hex_digits[c >> 4];
            buffer[j + 2] = hex_digits[c & 0x0F];
        }
        j += needed;
    }

    return j; // Return the number of bytes written
}