libgenbulk
Owner: IIIlllIIIllI URL: git@github.com:nyangkosense/libgenbulk.git
parser
Commit 7a132cb4e6bf9138d3e565d030376789565bf449 by SM <seb.michalk@gmail.com> on 2025-05-20 16:46:58 +0200
diff --git a/README.md b/README.md
new file mode 100644
index 0000000..6e58f4e
--- /dev/null
+++ b/README.md
@@ -0,0 +1,67 @@
+# Libgen URL Parser & Downloader
+
+This repo contains two tools that work together for bulk-downloading books from Library Genesis (http://libgen.is/):
+
+1. `parser` - a Zig tool that extracts download URLs from Libgen SQL dumps
+2. `downloader` - a small Go tool that downloads files from those URLs
+
+The prebuilt binaries in this repo were compiled on Debian against glibc.
+It is recommended to build them yourself from source; see below.
+
+## Building the Parser
+
+```bash
+# Install Zig (if you don't have it)
+wget https://ziglang.org/download/0.12.0/zig-linux-x86_64-0.12.0.tar.xz
+tar -xf zig-linux-x86_64-0.12.0.tar.xz
+export PATH=$PATH:$PWD/zig-linux-x86_64-0.12.0
+
+# Build it
+zig build-exe src/parser.zig
+```
+
+## Using the Parser
+
+The parser reads a Libgen SQL dump and outputs download URLs:
+
+```bash
+./parser libgen_compact.sql urls.txt
+```
+
+It shows progress as it works through the file and writes URLs like:
+`https://download.books.ms/main/11000/29c764a1af51f8f9ebfd721a6eac4a7b/Filaseta%20M.%20-%20Algebraic%20number%20theory%20%28Math%20784%29%20%281996%29.pdf`
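+
+The parser also accepts an optional third argument, an MD5 hash; when one is given it prints the parsed fields and the generated URL for that single record, which is handy for debugging:
+
+```bash
+./parser libgen_compact.sql urls.txt 29c764a1af51f8f9ebfd721a6eac4a7b
+```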
+
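+If you only want to try a subset first, `misc/generate_sample_urls.sh` picks N random lines from the URL list (it just wraps `shuf`):
+
+```bash
+./misc/generate_sample_urls.sh 100 urls.txt sample.txt
+```
+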
+## Building the Downloader
+
+```bash
+# Install Go (if you don't have it)
+# Then build the downloader
+go build -o downloader src/downloader.go
+```
+
+## Using the Downloader
+
+```bash
+./downloader -file=urls.txt -dir=books -n=4 -c=8
+```
+
+Where:
+- `-file` points to the URL list from the parser
+- `-dir` is where books should be saved (will be created if needed)
+- `-n` is how many books to download at once (4 is good for most connections)
+- `-c` is how many chunks to split each download into (try 8 for faster downloads)
+
+If your download gets interrupted, just run the command again - it skips files that already exist and picks up where it left off.
+
+## Notice
+
+- If a download gets stuck, press `Ctrl+C` to stop it, then rerun with `-start=123` to resume from line 123 of the URL list (see the example below)
+- For slower connections, reduce `-n` to 2 and `-c` to 4
+- For fast connections, try `-n=8 -c=16`
+- Partially downloaded chunks are reused automatically when a download is retried
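+
+For example, to resume a stalled run from line 123 of the URL list:
+
+```bash
+./downloader -file=urls.txt -dir=books -n=4 -c=8 -start=123
+```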
+
+## Issues
+
+- "segmentation fault" from parser? You might be low on RAM, close other apps
+- "error downloading" from downloader? Check your internet or try with more `-retries`
+- Weird filenames? The downloader handles URL-encoding/special chars automatically
diff --git a/downloader b/downloader
new file mode 100755
index 0000000..b5a7ccb
Binary files /dev/null and b/downloader differ
diff --git a/misc/generate_sample_urls.sh b/misc/generate_sample_urls.sh
new file mode 100755
index 0000000..0063d13
--- /dev/null
+++ b/misc/generate_sample_urls.sh
@@ -0,0 +1,13 @@
+#!/bin/bash
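+# Randomly sample N lines from an input URL list into an output file (uses shuf)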
+
+N="$1"
+INPUT="$2"
+OUTPUT="$3"
+
+if [[ -z "$N" || -z "$INPUT" || -z "$OUTPUT" ]]; then
+ echo "Usage: $0 <number_of_lines> <input_file> <output_file>"
+ exit 1
+fi
+
+shuf -n "$N" "$INPUT" > "$OUTPUT"
+
diff --git a/parser b/parser
new file mode 100755
index 0000000..9d24345
Binary files /dev/null and b/parser differ
diff --git a/src/downloader.go b/src/downloader.go
new file mode 100644
index 0000000..0e64dba
--- /dev/null
+++ b/src/downloader.go
@@ -0,0 +1,325 @@
+package main
+
+import (
+ "bufio"
+ "crypto/md5"
+ "flag"
+ "fmt"
+ "io"
+ "net/http"
+ "net/url"
+ "os"
+ "path/filepath"
+ "strconv"
+ "strings"
+ "sync"
+ "time"
+)
+
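+// ProgressWriter wraps an io.Writer and prints a running progress line for one chunk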
+type ProgressWriter struct {
+ io.Writer
+ Total int64
+ Downloaded int64
+ Filename string
+}
+
+func (pw *ProgressWriter) Write(p []byte) (int, error) {
+ n, err := pw.Writer.Write(p)
+ pw.Downloaded += int64(n)
+ percentage := float64(pw.Downloaded) / float64(pw.Total) * 100
+ fmt.Printf("\r%s: %.2f%% (%d/%d bytes) ",
+ shortenFilename(pw.Filename), percentage, pw.Downloaded, pw.Total)
+ return n, err
+}
+
+func shortenFilename(filename string) string {
+ if len(filename) > 40 {
+ return filename[:18] + "..." + filename[len(filename)-18:]
+ }
+ return filename
+}
+
+func getTempDirName(url string) string {
+ hash := md5.Sum([]byte(url))
+ return fmt.Sprintf("download_%x", hash)
+}
+
+func getSafeFilename(urlStr string) string {
+ parsedURL, err := url.Parse(urlStr)
+ if err != nil {
+ hash := md5.Sum([]byte(urlStr))
+ return fmt.Sprintf("download_%x", hash)
+ }
+
+ filename := filepath.Base(parsedURL.Path)
+
+ decodedFilename, err := url.QueryUnescape(filename)
+ if err != nil {
+ decodedFilename = strings.ReplaceAll(filename, "%20", " ")
+ }
+
+ safeFilename := strings.Map(func(r rune) rune {
+ if strings.ContainsRune(`<>:"/\|?*`, r) {
+ return '_'
+ }
+ return r
+ }, decodedFilename)
+
+ if safeFilename == "" || safeFilename == "." {
+ hash := md5.Sum([]byte(urlStr))
+ return fmt.Sprintf("download_%x", hash)
+ }
+
+ return safeFilename
+}
+
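+// downloadFile fetches url into filepath using up to `concurrency` ranged chunks,
+// skipping chunks already recorded in the chunk directory's metadata file.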
+func downloadFile(url string, filepath string, concurrency int) error {
+ tempDirName := getTempDirName(url)
+ dir := fmt.Sprintf(".%s.chunks", tempDirName)
+
+ err := os.MkdirAll(dir, 0755)
+ if err != nil {
+ return err
+ }
+
+ resp, err := http.Head(url)
+ if err != nil {
+ return err
+ }
+ if resp.StatusCode != http.StatusOK {
+ return fmt.Errorf("bad status: %s", resp.Status)
+ }
+
+ fileSize, err := strconv.ParseInt(resp.Header.Get("Content-Length"), 10, 64)
+ if err != nil {
+ return fmt.Errorf("failed to parse Content-Length: %v", err)
+ }
+
+ metaFile, err := os.OpenFile(dir+"/metadata", os.O_CREATE|os.O_RDWR, 0644)
+ if err != nil {
+ return err
+ }
+ defer metaFile.Close()
+
+ downloadedChunks := make(map[int]bool)
+ scanner := bufio.NewScanner(metaFile)
+ for scanner.Scan() {
+ chunkID, err := strconv.Atoi(scanner.Text())
+ if err == nil {
+ downloadedChunks[chunkID] = true
+ }
+ }
+
+ // Use ceiling division so the trailing partial chunk is still covered
+ chunkSize := (fileSize + int64(concurrency) - 1) / int64(concurrency)
+ if chunkSize < 1024*1024 { // Minimum 1MB chunk
+ chunkSize = 1024 * 1024
+ if fileSize < chunkSize {
+ chunkSize = fileSize
+ }
+ }
+
+ numChunks := int((fileSize + chunkSize - 1) / chunkSize)
+ // chunkSize never drops below fileSize/concurrency, so numChunks never exceeds concurrency
+
+ fmt.Printf("Downloading %s (%.2f MB) using %d chunks\n",
+ filepath, float64(fileSize)/(1024*1024), numChunks)
+
+ var wg sync.WaitGroup
+ var mutex sync.Mutex
+ totalDownloaded := int64(0)
+
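+ // Fetch each chunk with an HTTP Range request into its own file; finished chunk IDs
+ // are appended to the metadata file so an interrupted download can resume.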
+ for i := 0; i < numChunks; i++ {
+ if downloadedChunks[i] {
+ start := int64(i) * chunkSize
+ end := start + chunkSize - 1
+ if end >= fileSize {
+ end = fileSize - 1
+ }
+ chunkBytes := end - start + 1
+ totalDownloaded += chunkBytes
+ continue
+ }
+
+ wg.Add(1)
+ go func(chunkID int) {
+ defer wg.Done()
+
+ start := int64(chunkID) * chunkSize
+ end := start + chunkSize - 1
+ if end >= fileSize {
+ end = fileSize - 1
+ }
+
+ chunkFileName := fmt.Sprintf("%s/chunk%d", dir, chunkID)
+ chunkFile, err := os.Create(chunkFileName)
+ if err != nil {
+ fmt.Printf("Error creating chunk file %s: %v\n", chunkFileName, err)
+ return
+ }
+ defer chunkFile.Close()
+
+ req, err := http.NewRequest("GET", url, nil)
+ if err != nil {
+ fmt.Printf("Error creating request for chunk %d: %v\n", chunkID, err)
+ return
+ }
+ req.Header.Set("Range", fmt.Sprintf("bytes=%d-%d", start, end))
+
+ client := &http.Client{
+ Timeout: 30 * time.Minute, // Increase timeout for large files
+ }
+ resp, err := client.Do(req)
+ if err != nil {
+ fmt.Printf("Error downloading chunk %d: %v\n", chunkID, err)
+ return
+ }
+ defer resp.Body.Close()
+
+ if resp.StatusCode != http.StatusPartialContent && resp.StatusCode != http.StatusOK {
+ fmt.Printf("Error: bad status for chunk %d: %s\n", chunkID, resp.Status)
+ return
+ }
+
+ chunkWriter := &ProgressWriter{
+ Writer: chunkFile,
+ Total: end - start + 1,
+ Filename: filepath,
+ Downloaded: 0,
+ }
+ bytesWritten, err := io.Copy(chunkWriter, resp.Body)
+ if err != nil {
+ fmt.Printf("Error writing chunk %d: %v\n", chunkID, err)
+ return
+ }
+
+ mutex.Lock()
+ totalDownloaded += bytesWritten
+ metaFile.WriteString(fmt.Sprintf("%d\n", chunkID))
+ mutex.Unlock()
+
+ fmt.Printf("\rDownloaded chunk %d (%d bytes) \n", chunkID, bytesWritten)
+ }(i)
+ }
+
+ wg.Wait()
+
+ if totalDownloaded < fileSize {
+ return fmt.Errorf("download incomplete: %d/%d bytes", totalDownloaded, fileSize)
+ }
+
+ fmt.Println("Merging chunks...")
+ finalFile, err := os.Create(filepath)
+ if err != nil {
+ return err
+ }
+ defer finalFile.Close()
+
+ for i := 0; i < numChunks; i++ {
+ chunkFileName := fmt.Sprintf("%s/chunk%d", dir, i)
+ chunkFile, err := os.Open(chunkFileName)
+ if err != nil {
+ return err
+ }
+
+ _, err = io.Copy(finalFile, chunkFile)
+ chunkFile.Close()
+ if err != nil {
+ return err
+ }
+ }
+
+ os.RemoveAll(dir)
+ fmt.Printf("Download completed: %s (%.2f MB)\n", filepath, float64(fileSize)/(1024*1024))
+ return nil
+}
+
+func main() {
+ urlFile := flag.String("file", "", "file containing URLs to download")
+ outputDir := flag.String("dir", "downloads", "directory to save downloads")
+ concurrency := flag.Int("n", 4, "number of concurrent downloads")
+ chunkConcurrency := flag.Int("c", 4, "number of chunks per download")
+ startFrom := flag.Int("start", 0, "start from this line (skip earlier lines)")
+ retries := flag.Int("retries", 3, "number of retry attempts for failed downloads")
+ flag.Parse()
+
+ if *urlFile == "" {
+ fmt.Println("Please specify a file containing URLs with -file")
+ flag.PrintDefaults()
+ return
+ }
+
+ err := os.MkdirAll(*outputDir, 0755)
+ if err != nil {
+ fmt.Printf("Error creating output directory: %v\n", err)
+ return
+ }
+
+ file, err := os.Open(*urlFile)
+ if err != nil {
+ fmt.Printf("Error opening URL file: %v\n", err)
+ return
+ }
+ defer file.Close()
+
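+ // Buffered channel used as a counting semaphore to cap concurrent downloads at -n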
+ sem := make(chan struct{}, *concurrency)
+ var wg sync.WaitGroup
+
+ scanner := bufio.NewScanner(file)
+ lineNum := 0
+ for scanner.Scan() {
+ url := scanner.Text()
+ lineNum++
+
+ if lineNum < *startFrom {
+ continue
+ }
+
+ filename := getSafeFilename(url)
+
+ fullPath := filepath.Join(*outputDir, filename)
+
+ if _, err := os.Stat(fullPath); err == nil {
+ fmt.Printf("Skipping %s (already exists)\n", fullPath)
+ continue
+ }
+
+ wg.Add(1)
+ sem <- struct{}{} // Acquire semaphore
+ go func(url, path string, line int) {
+ defer wg.Done()
+ defer func() { <-sem }() // Release semaphore
+
+ fmt.Printf("Starting download [%d]: %s\n", line, path)
+ start := time.Now()
+
+ var downloadErr error
+ for attempt := 0; attempt < *retries; attempt++ {
+ if attempt > 0 {
+ fmt.Printf("Retry %d/%d for %s\n", attempt, *retries, path)
+ time.Sleep(time.Duration(attempt) * 2 * time.Second) // Linear backoff between retries
+ }
+
+ downloadErr = downloadFile(url, path, *chunkConcurrency)
+ if downloadErr == nil {
+ break // Download succeeded
+ }
+
+ fmt.Printf("Error on attempt %d: %v\n", attempt+1, downloadErr)
+ }
+
+ if downloadErr != nil {
+ fmt.Printf("Failed to download after %d attempts: %s - %v\n",
+ *retries, url, downloadErr)
+ return
+ }
+
+ elapsed := time.Since(start)
+ fmt.Printf("Completed download [%d]: %s (took %s)\n", line, path, elapsed)
+ }(url, fullPath, lineNum)
+ }
+
+ wg.Wait()
+ fmt.Println("All downloads completed!")
+}
diff --git a/src/parser.zig b/src/parser.zig
new file mode 100644
index 0000000..4d860a7
--- /dev/null
+++ b/src/parser.zig
@@ -0,0 +1,375 @@
+const std = @import("std");
+const ascii = std.ascii;
+const mem = std.mem;
+const fs = std.fs;
+
+pub fn main() !void {
+ var general_purpose_allocator = std.heap.GeneralPurposeAllocator(.{}){};
+ const gpa = general_purpose_allocator.allocator();
+ defer _ = general_purpose_allocator.deinit();
+
+ const args = try std.process.argsAlloc(gpa);
+ defer std.process.argsFree(gpa, args);
+
+ if (args.len < 2) {
+ std.debug.print("Usage: {s} <sql_file_path> [output_file_path] [md5_to_debug]\n", .{args[0]});
+ std.debug.print("If output_file_path is not provided, results will be printed to stdout\n", .{});
+ return;
+ }
+
+ const file_path = args[1];
+ const output_to_file = args.len >= 3;
+ const output_path = if (output_to_file) args[2] else "";
+
+ // Optional MD5 to debug - if provided, print all details for this record
+ const debug_md5 = if (args.len >= 4) args[3] else "";
+
+ var output_file: ?fs.File = null;
+ if (output_to_file) {
+ output_file = try fs.cwd().createFile(output_path, .{});
+ }
+
+ try processLargeSQLFile(file_path, output_file, debug_md5, gpa);
+
+ if (output_file) |of| {
+ of.close();
+ }
+
+ std.debug.print("SQL processing completed.\n", .{});
+}
+
+fn processLargeSQLFile(file_path: []const u8, output_file: ?fs.File, debug_md5: []const u8, allocator: std.mem.Allocator) !void {
+ std.debug.print("Opening file: {s}\n", .{file_path});
+
+ const file = try std.fs.cwd().openFile(file_path, .{});
+ defer file.close();
+
+ const file_size = try file.getEndPos();
+ std.debug.print("File size: {} bytes\n", .{file_size});
+
+ var reader = file.reader();
+
+ // Using a 1MB buffer to read chunks
+ const buffer_size = 1024 * 1024;
+ var buffer = try allocator.alloc(u8, buffer_size);
+ defer allocator.free(buffer);
+
+ var in_values_section = false;
+ var in_tuple = false;
+ var in_quotes = false;
+ var tuple_start: usize = 0;
+ var bytes_read: usize = 0;
+ var tuple_buffer = std.ArrayList(u8).init(allocator);
+ defer tuple_buffer.deinit();
+
+ var entry_count: usize = 0;
+ var last_progress_pct: usize = 0;
+
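+ // Stream the dump in 1MB reads and collect each (...) tuple that appears after
+ // "LOCK TABLES", tracking quote state so parentheses inside strings are ignored.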
+ while (true) {
+ const read_amount = try reader.read(buffer);
+ if (read_amount == 0) break; // End of file
+
+ bytes_read += read_amount;
+
+ const progress_pct = bytes_read * 100 / file_size;
+ if (progress_pct > last_progress_pct) {
+ std.debug.print("Progress: {}% ({} of {} bytes)\n", .{ progress_pct, bytes_read, file_size });
+ last_progress_pct = progress_pct;
+ }
+
+ var i: usize = 0;
+ while (i < read_amount) {
+ const c = buffer[i];
+
+ if (!in_values_section) {
+ if (c == 'L') {
+ if (i + 11 < read_amount and
+ mem.eql(u8, buffer[i .. i + 11], "LOCK TABLES"))
+ {
+ in_values_section = true;
+ std.debug.print("Found start of values section\n", .{});
+ }
+ }
+ i += 1;
+ continue;
+ }
+
+ if (c == '\'' and (i == 0 or buffer[i - 1] != '\\')) {
+ in_quotes = !in_quotes;
+ }
+
+ if (!in_quotes) {
+ if (c == '(') {
+ if (!in_tuple) {
+ in_tuple = true;
+ tuple_start = i + 1;
+ tuple_buffer.clearRetainingCapacity();
+ }
+ } else if (c == ')' and in_tuple) {
+ try tuple_buffer.append(')'); // Add the closing parenthesis
+ if (processEntryTupleFromBuffer(&tuple_buffer, output_file, debug_md5, allocator, &entry_count)) {
+ // Success - continue
+ } else |err| {
+ // Log error but continue processing
+ std.debug.print("Error processing tuple: {}\n", .{err});
+ }
+ in_tuple = false;
+ }
+ }
+
+ if (in_tuple) {
+ try tuple_buffer.append(c);
+ }
+
+ i += 1;
+ }
+ }
+
+ std.debug.print("Processed {} entries from {} bytes\n", .{ entry_count, bytes_read });
+}
+
+fn processEntryTupleFromBuffer(tuple_buffer: *std.ArrayList(u8), output_file: ?fs.File, debug_md5: []const u8, allocator: std.mem.Allocator, entry_count: *usize) !void {
+ const tuple = tuple_buffer.items;
+
+ if (tuple.len < 10) return; // Skip very small tuples
+
+ // The tuple format is (field1,field2,field3,...)
+ // We need to extract fields 1 (Title), 37 (MD5), and 40 (Locator)
+
+ var fields = std.ArrayList([]const u8).init(allocator);
+ defer fields.deinit();
+
+ var in_quotes = false;
+ var field_start: usize = 0;
+
+ var start_idx: usize = 0;
+ if (tuple.len > 0 and tuple[0] == '(') {
+ start_idx = 1;
+ field_start = 1;
+ }
+
+ for (tuple[start_idx..], start_idx..) |c, i| {
+ if (c == '\'' and (i == start_idx or tuple[i - 1] != '\\')) {
+ in_quotes = !in_quotes;
+ } else if (c == ',' and !in_quotes) {
+ try fields.append(tuple[field_start..i]);
+ field_start = i + 1;
+ } else if (c == ')' and !in_quotes and i == tuple.len - 1) {
+ // End of tuple
+ if (field_start < i) {
+ try fields.append(tuple[field_start..i]);
+ }
+ break;
+ }
+ }
+
+ if (fields.items.len < 41) {
+ return;
+ }
+
+ const id_raw = if (fields.items.len > 0) fields.items[0] else ""; // ID is field 0
+ const title_raw = if (fields.items.len > 1) fields.items[1] else ""; // Title is field 1
+ const extension_raw = if (fields.items.len > 36) fields.items[36] else ""; // Extension is field 36
+ const md5_raw = if (fields.items.len > 37) fields.items[37] else ""; // MD5 is field 37
+ const locator_raw = if (fields.items.len > 40) fields.items[40] else ""; // Locator is field 40
+ const local_raw = if (fields.items.len > 41) fields.items[41] else ""; // Local is field 41
+
+ const id = cleanSQLString(id_raw, allocator) catch try allocator.dupe(u8, "");
+ defer allocator.free(id);
+
+ const title = cleanSQLString(title_raw, allocator) catch try allocator.dupe(u8, "");
+ defer allocator.free(title);
+
+ const extension = cleanSQLString(extension_raw, allocator) catch try allocator.dupe(u8, "");
+ defer allocator.free(extension);
+
+ const md5 = cleanSQLString(md5_raw, allocator) catch try allocator.dupe(u8, "");
+ defer allocator.free(md5);
+
+ if (md5.len == 0) return; // Skip if no MD5
+
+ const locator = cleanSQLString(locator_raw, allocator) catch try allocator.dupe(u8, "");
+ defer allocator.free(locator);
+
+ const local = cleanSQLString(local_raw, allocator) catch try allocator.dupe(u8, "");
+ defer allocator.free(local);
+
+ const author_raw = if (fields.items.len > 5) fields.items[5] else "";
+ const year_raw = if (fields.items.len > 6) fields.items[6] else "";
+
+ const author = cleanSQLString(author_raw, allocator) catch try allocator.dupe(u8, "");
+ defer allocator.free(author);
+
+ const year = cleanSQLString(year_raw, allocator) catch try allocator.dupe(u8, "");
+ defer allocator.free(year);
+
+ const is_debug_entry = debug_md5.len > 0 and std.mem.eql(u8, md5, debug_md5);
+
+ // I'm not sure exactly how this ID is defined, but this heuristic works for the examples I tested,
+ // so I'm using it here:
+ // derive the folder ID from the record ID as (ID / 1000) * 1000
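+ // e.g. record ID 11234 -> folder 11000 (cf. the /main/11000/ path in the README's sample URL)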
+ var folder_id_buf: [20]u8 = undefined; // Stack buffer for folder ID
+ var folder_id: []const u8 = "0"; // Default
+
+ // First try to get the raw ID directly
+ if (id.len > 0) {
+ const id_num = std.fmt.parseInt(u32, id, 10) catch 0;
+ if (id_num > 0) {
+ // Calculate folder as ID/1000*1000
+ const folder_num = (id_num / 1000) * 1000;
+ folder_id = std.fmt.bufPrint(&folder_id_buf, "{d}", .{folder_num}) catch "0";
+
+ if (is_debug_entry) {
+ std.debug.print("Calculated folder ID: {s} from record ID: {s}\n", .{ folder_id, id });
+ }
+ } else {
+ // Error parsing, fallback
+ if (is_debug_entry) {
+ std.debug.print("Couldn't parse ID, using default folder 0\n", .{});
+ }
+ }
+ }
+
+ var filename_buffer = std.ArrayList(u8).init(allocator);
+ defer filename_buffer.deinit();
+
+ // Format: Author - Title (Year).extension
+ if (author.len > 0) {
+ try filename_buffer.appendSlice(author);
+ try filename_buffer.appendSlice(" - ");
+ }
+
+ try filename_buffer.appendSlice(title);
+
+ if (year.len > 0) {
+ try filename_buffer.appendSlice(" (");
+ try filename_buffer.appendSlice(year);
+ try filename_buffer.appendSlice(")");
+ }
+
+ if (extension.len > 0) {
+ try filename_buffer.appendSlice(".");
+ try filename_buffer.appendSlice(extension);
+ }
+
+ const filename = filename_buffer.items;
+
+ var encoded_filename = try allocator.alloc(u8, filename.len * 3); // Worst case: each char becomes %XX
+ defer allocator.free(encoded_filename);
+
+ const encoded_size = try urlEncode(filename, encoded_filename);
+
+ const url = try std.fmt.allocPrint(allocator, "https://download.books.ms/main/{s}/{s}/{s}", .{ folder_id, md5, encoded_filename[0..encoded_size] });
+ defer allocator.free(url);
+
+ if (is_debug_entry) {
+ std.debug.print("\n=== DEBUG INFO FOR MD5: {s} ===\n", .{md5});
+ std.debug.print("ID: {s}\n", .{id});
+ std.debug.print("Title: {s}\n", .{title});
+ std.debug.print("Extension: {s}\n", .{extension});
+ std.debug.print("Author: {s}\n", .{author});
+ std.debug.print("Year: {s}\n", .{year});
+ std.debug.print("Locator: {s}\n", .{locator});
+ std.debug.print("Local: {s}\n", .{local});
+ std.debug.print("Folder ID: {s}\n", .{folder_id});
+ std.debug.print("Generated URL: {s}\n", .{url});
+ std.debug.print("=== END DEBUG INFO ===\n\n", .{});
+ }
+
+ if (output_file) |of| {
+ of.writeAll(url) catch {
+ std.debug.print("Error writing to file\n", .{});
+ };
+ of.writeAll("\n") catch {};
+ } else {
+ std.debug.print("{s}\n", .{url});
+ }
+
+ entry_count.* += 1;
+}
+
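+// Strips the surrounding single quotes from a SQL string literal and resolves
+// common backslash escapes; the caller owns (and frees) the returned slice.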
+fn cleanSQLString(s: []const u8, allocator: std.mem.Allocator) ![]u8 {
+ if (s.len == 0) return allocator.dupe(u8, "");
+
+ var result = s;
+ if (result.len >= 2 and result[0] == '\'' and result[result.len - 1] == '\'') {
+ result = result[1 .. result.len - 1];
+ }
+
+ var cleaned = std.ArrayList(u8).init(allocator);
+ errdefer cleaned.deinit();
+
+ var i: usize = 0;
+ while (i < result.len) {
+ if (i + 1 < result.len and result[i] == '\\') {
+ switch (result[i + 1]) {
+ '\'', '\\', '"' => {
+ try cleaned.append(result[i + 1]);
+ i += 2;
+ },
+ 'n' => {
+ try cleaned.append('\n');
+ i += 2;
+ },
+ 't' => {
+ try cleaned.append('\t');
+ i += 2;
+ },
+ else => {
+ // Skip the backslash but include the character after it
+ i += 1;
+ try cleaned.append(result[i]);
+ i += 1;
+ },
+ }
+ } else {
+ try cleaned.append(result[i]);
+ i += 1;
+ }
+ }
+
+ return cleaned.toOwnedSlice();
+}
+
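+// Percent-encodes `input` into `buffer`; unreserved characters (RFC 3986) pass
+// through unchanged. Returns the number of bytes written.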
+fn urlEncode(input: []const u8, buffer: []u8) !usize {
+ var j: usize = 0;
+ for (input) |c| {
+ if (j + 3 >= buffer.len) {
+ // Buffer is too small, avoid overrun
+ return error.BufferTooSmall;
+ }
+
+ if (ascii.isAlphanumeric(c) or c == '-' or c == '_' or c == '.' or c == '~') {
+ buffer[j] = c;
+ j += 1;
+ } else if (c == ' ') {
+ buffer[j] = '%';
+ buffer[j + 1] = '2';
+ buffer[j + 2] = '0';
+ j += 3;
+ } else {
+ // Encode as %XX
+ buffer[j] = '%';
+
+ // Convert to hex
+ const hi = c >> 4;
+ const lo = c & 0x0F;
+
+ if (hi < 10) {
+ buffer[j + 1] = '0' + @as(u8, @intCast(hi));
+ } else {
+ buffer[j + 1] = 'A' + @as(u8, @intCast(hi - 10));
+ }
+
+ if (lo < 10) {
+ buffer[j + 2] = '0' + @as(u8, @intCast(lo));
+ } else {
+ buffer[j + 2] = 'A' + @as(u8, @intCast(lo - 10));
+ }
+
+ j += 3;
+ }
+ }
+
+ return j; // Return the number of bytes written
+}