libgenbulk

Owner: IIIlllIIIllI URL: git@github.com:nyangkosense/libgenbulk.git

parser

Commit 7a132cb4e6bf9138d3e565d030376789565bf449 by SM <seb.michalk@gmail.com> on 2025-05-20 16:46:58 +0200
diff --git a/README.md b/README.md
new file mode 100644
index 0000000..6e58f4e
--- /dev/null
+++ b/README.md
@@ -0,0 +1,67 @@
+# Libgen URL Parser & Downloader
+
+This repo contains two tools that work together to help you bulk-download books from Library Genesis (http://libgen.is/):
+
+1. `parser` - a Zig tool that extracts download URLs from Libgen SQL dumps
+2. `downloader` - a small Go tool that downloads files from those URLs
+
+The precompiled binaries in this repo were built on Debian with glibc.
+It is recommended to build them yourself from source; see below.
+
+## Building the Parser
+
+```bash
+# Install Zig (if you don't have it)
+wget https://ziglang.org/download/0.12.0/zig-linux-x86_64-0.12.0.tar.xz
+tar -xf zig-linux-x86_64-0.12.0.tar.xz
+export PATH=$PATH:$PWD/zig-linux-x86_64-0.12.0
+
+# Build it
+zig build-exe src/parser.zig
+```
+
+## Using the Parser
+
+The parser reads a Libgen SQL dump and outputs download URLs:
+
+```bash
+./parser libgen_compact.sql links.txt
+```
+
+It shows progress as it runs through the file and generates URLs like:
+`https://download.books.ms/main/11000/29c764a1af51f8f9ebfd721a6eac4a7b/Filaseta%20M.%20-%20Algebraic%20number%20theory%20%28Math%20784%29%20%281996%29.pdf`
+
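+If you want to test on a small random sample before a full bulk run, the `misc/generate_sample_urls.sh` script included in this repo picks N random lines from the URL list and writes them to a new file (the file names below are just examples):
+
+```bash
+# Pick 100 random URLs for a quick test run
+./misc/generate_sample_urls.sh 100 links.txt sample.txt
+```
+
+The resulting `sample.txt` can then be passed to the downloader described below.
+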
+## Building the Downloader
+
+```bash
+# Install Go (if you don't have it)
+# Then build the downloader
+go build src/downloader.go
+```
+
+## Using the Downloader
+
+```bash
+./downloader -file=links.txt -dir=books -n=4 -c=8
+```
+
+Where:
+- `-file` points to the URL list from the parser
+- `-dir` is where books should be saved (will be created if needed)
+- `-n` is how many books to download at once (4 is good for most connections)
+- `-c` is how many chunks to split each download into (try 8 for faster downloads)
+
+If your download gets interrupted, just run the command again - it will pick up where it left off.
+
+## Notice
+
+- If your download gets stuck, use `Ctrl+C` to stop it, then run with `-start=123` to resume from line 123 (see the example below)
+- For slower connections, reduce `-n` to 2 and `-c` to 4
+- For fast connections `-n=8 -c=16`
+- Interrupted downloads resume automatically: completed chunks are recorded and skipped on the next run
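+
+For example, to resume an interrupted run from line 123 on a slower connection (flag values and file name are just an illustration):
+
+```bash
+./downloader -file=links.txt -dir=books -n=2 -c=4 -start=123
+```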
+
+## Issues
+
+- "segmentation fault" from parser? You might be low on RAM, close other apps
+- "error downloading" from downloader? Check your internet or try with more `-retries`
+- Weird filenames? The downloader handles URL-encoding/special chars automatically
diff --git a/downloader b/downloader
new file mode 100755
index 0000000..b5a7ccb
Binary files /dev/null and b/downloader differ
diff --git a/misc/generate_sample_urls.sh b/misc/generate_sample_urls.sh
new file mode 100755
index 0000000..0063d13
--- /dev/null
+++ b/misc/generate_sample_urls.sh
@@ -0,0 +1,13 @@
+#!/bin/bash
+
+N="$1"
+INPUT="$2"
+OUTPUT="$3"
+
+if [[ -z "$N" || -z "$INPUT" || -z "$OUTPUT" ]]; then
+  echo "Usage: $0 <number_of_lines> <input_file> <output_file>"
+  exit 1
+fi
+
+shuf -n "$N" "$INPUT" > "$OUTPUT"
+
diff --git a/parser b/parser
new file mode 100755
index 0000000..9d24345
Binary files /dev/null and b/parser differ
diff --git a/src/downloader.go b/src/downloader.go
new file mode 100644
index 0000000..0e64dba
--- /dev/null
+++ b/src/downloader.go
@@ -0,0 +1,325 @@
+package main
+
+import (
+	"bufio"
+	"crypto/md5"
+	"flag"
+	"fmt"
+	"io"
+	"net/http"
+	"net/url"
+	"os"
+	"path/filepath"
+	"strconv"
+	"strings"
+	"sync"
+	"time"
+)
+
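+// ProgressWriter wraps an io.Writer and prints a running progress line for a single chunk as bytes pass through it.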
+type ProgressWriter struct {
+	io.Writer
+	Total      int64
+	Downloaded int64
+	Filename   string
+}
+
+func (pw *ProgressWriter) Write(p []byte) (int, error) {
+	n, err := pw.Writer.Write(p)
+	pw.Downloaded += int64(n)
+	percentage := float64(pw.Downloaded) / float64(pw.Total) * 100
+	fmt.Printf("\r%s: %.2f%% (%d/%d bytes)    ",
+		shortenFilename(pw.Filename), percentage, pw.Downloaded, pw.Total)
+	return n, err
+}
+
+func shortenFilename(filename string) string {
+	if len(filename) > 40 {
+		return filename[:18] + "..." + filename[len(filename)-18:]
+	}
+	return filename
+}
+
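+// getTempDirName derives a stable, URL-specific name for the temporary chunk directory.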
+func getTempDirName(url string) string {
+	hash := md5.Sum([]byte(url))
+	return fmt.Sprintf("download_%x", hash)
+}
+
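+// getSafeFilename derives a filesystem-safe name from the last path segment of the URL:
+// it URL-decodes the name, replaces characters that are invalid on common filesystems with '_',
+// and falls back to an md5-based name if the URL cannot be parsed or yields an empty name.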
+func getSafeFilename(urlStr string) string {
+	parsedURL, err := url.Parse(urlStr)
+	if err != nil {
+		hash := md5.Sum([]byte(urlStr))
+		return fmt.Sprintf("download_%x", hash)
+	}
+
+	filename := filepath.Base(parsedURL.Path)
+
+	decodedFilename, err := url.QueryUnescape(filename)
+	if err != nil {
+		decodedFilename = strings.ReplaceAll(filename, "%20", " ")
+	}
+
+	safeFilename := strings.Map(func(r rune) rune {
+		if strings.ContainsRune(`<>:"/\|?*`, r) {
+			return '_'
+		}
+		return r
+	}, decodedFilename)
+
+	if safeFilename == "" || safeFilename == "." {
+		hash := md5.Sum([]byte(urlStr))
+		return fmt.Sprintf("download_%x", hash)
+	}
+
+	return safeFilename
+}
+
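+// downloadFile downloads url to filepath by splitting it into byte ranges, fetching each range with
+// an HTTP Range request, and keeping the chunk files plus a metadata file in a hidden .chunks
+// directory so an interrupted download can resume without refetching finished chunks. On success the
+// chunks are merged into the final file and the temporary directory is removed.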
+func downloadFile(url string, filepath string, concurrency int) error {
+	tempDirName := getTempDirName(url)
+	dir := fmt.Sprintf(".%s.chunks", tempDirName)
+
+	err := os.MkdirAll(dir, 0755)
+	if err != nil {
+		return err
+	}
+
+	resp, err := http.Head(url)
+	if err != nil {
+		return err
+	}
+	defer resp.Body.Close()
+	if resp.StatusCode != http.StatusOK {
+		return fmt.Errorf("bad status: %s", resp.Status)
+	}
+
+	fileSize, err := strconv.ParseInt(resp.Header.Get("Content-Length"), 10, 64)
+	if err != nil {
+		return fmt.Errorf("failed to parse Content-Length: %v", err)
+	}
+
+	metaFile, err := os.OpenFile(dir+"/metadata", os.O_CREATE|os.O_RDWR, 0644)
+	if err != nil {
+		return err
+	}
+	defer metaFile.Close()
+
+	var downloadedChunks map[int]bool = make(map[int]bool)
+	scanner := bufio.NewScanner(metaFile)
+	for scanner.Scan() {
+		chunkID, err := strconv.Atoi(scanner.Text())
+		if err == nil {
+			downloadedChunks[chunkID] = true
+		}
+	}
+
+	chunkSize := fileSize / int64(concurrency)
+	if chunkSize < 1024*1024 { // Minimum 1MB chunk
+		chunkSize = 1024 * 1024
+		if fileSize < chunkSize {
+			chunkSize = fileSize
+		}
+	}
+
+	numChunks := int((fileSize + chunkSize - 1) / chunkSize)
+	if numChunks > concurrency {
+		numChunks = concurrency
+		// Recompute the chunk size so the clamped number of chunks still covers the whole file
+		chunkSize = (fileSize + int64(numChunks) - 1) / int64(numChunks)
+	}
+
+	fmt.Printf("Downloading %s (%.2f MB) using %d chunks\n",
+		filepath, float64(fileSize)/(1024*1024), numChunks)
+
+	var wg sync.WaitGroup
+	var mutex sync.Mutex
+	totalDownloaded := int64(0)
+
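+	// Chunks listed in the metadata file were completed by an earlier run:
+	// count their bytes as already downloaded and skip refetching them.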
+	for i := 0; i < numChunks; i++ {
+		if downloadedChunks[i] {
+			start := int64(i) * chunkSize
+			end := start + chunkSize - 1
+			if end >= fileSize {
+				end = fileSize - 1
+			}
+			chunkBytes := end - start + 1
+			totalDownloaded += chunkBytes
+			continue
+		}
+
+		wg.Add(1)
+		go func(chunkID int) {
+			defer wg.Done()
+
+			start := int64(chunkID) * chunkSize
+			end := start + chunkSize - 1
+			if end >= fileSize {
+				end = fileSize - 1
+			}
+
+			chunkFileName := fmt.Sprintf("%s/chunk%d", dir, chunkID)
+			chunkFile, err := os.Create(chunkFileName)
+			if err != nil {
+				fmt.Printf("Error creating chunk file %s: %v\n", chunkFileName, err)
+				return
+			}
+			defer chunkFile.Close()
+
+			req, err := http.NewRequest("GET", url, nil)
+			if err != nil {
+				fmt.Printf("Error creating request for chunk %d: %v\n", chunkID, err)
+				return
+			}
+			req.Header.Set("Range", fmt.Sprintf("bytes=%d-%d", start, end))
+
+			client := &http.Client{
+				Timeout: 30 * time.Minute, // Increase timeout for large files
+			}
+			resp, err := client.Do(req)
+			if err != nil {
+				fmt.Printf("Error downloading chunk %d: %v\n", chunkID, err)
+				return
+			}
+			defer resp.Body.Close()
+
+			if resp.StatusCode != http.StatusPartialContent && resp.StatusCode != http.StatusOK {
+				fmt.Printf("Error: bad status for chunk %d: %s\n", chunkID, resp.Status)
+				return
+			}
+
+			chunkWriter := &ProgressWriter{
+				Writer:     chunkFile,
+				Total:      end - start + 1,
+				Filename:   filepath,
+				Downloaded: 0,
+			}
+			bytesWritten, err := io.Copy(chunkWriter, resp.Body)
+			if err != nil {
+				fmt.Printf("Error writing chunk %d: %v\n", chunkID, err)
+				return
+			}
+
+			mutex.Lock()
+			totalDownloaded += bytesWritten
+			metaFile.WriteString(fmt.Sprintf("%d\n", chunkID))
+			mutex.Unlock()
+
+			fmt.Printf("\rDownloaded chunk %d (%d bytes)    \n", chunkID, bytesWritten)
+		}(i)
+	}
+
+	wg.Wait()
+
+	if totalDownloaded < fileSize {
+		return fmt.Errorf("download incomplete: %d/%d bytes", totalDownloaded, fileSize)
+	}
+
+	fmt.Println("Merging chunks...")
+	finalFile, err := os.Create(filepath)
+	if err != nil {
+		return err
+	}
+	defer finalFile.Close()
+
+	for i := 0; i < numChunks; i++ {
+		chunkFileName := fmt.Sprintf("%s/chunk%d", dir, i)
+		chunkFile, err := os.Open(chunkFileName)
+		if err != nil {
+			return err
+		}
+
+		_, err = io.Copy(finalFile, chunkFile)
+		chunkFile.Close()
+		if err != nil {
+			return err
+		}
+	}
+
+	os.RemoveAll(dir)
+	fmt.Printf("Download completed: %s (%.2f MB)\n", filepath, float64(fileSize)/(1024*1024))
+	return nil
+}
+
+func main() {
+	urlFile := flag.String("file", "", "file containing URLs to download")
+	outputDir := flag.String("dir", "downloads", "directory to save downloads")
+	concurrency := flag.Int("n", 4, "number of concurrent downloads")
+	chunkConcurrency := flag.Int("c", 4, "number of chunks per download")
+	startFrom := flag.Int("start", 0, "start from this line (skip earlier lines)")
+	retries := flag.Int("retries", 3, "number of retry attempts for failed downloads")
+	flag.Parse()
+
+	if *urlFile == "" {
+		fmt.Println("Please specify a file containing URLs with -file")
+		flag.PrintDefaults()
+		return
+	}
+
+	err := os.MkdirAll(*outputDir, 0755)
+	if err != nil {
+		fmt.Printf("Error creating output directory: %v\n", err)
+		return
+	}
+
+	file, err := os.Open(*urlFile)
+	if err != nil {
+		fmt.Printf("Error opening URL file: %v\n", err)
+		return
+	}
+	defer file.Close()
+
+	sem := make(chan struct{}, *concurrency)
+	var wg sync.WaitGroup
+
+	scanner := bufio.NewScanner(file)
+	lineNum := 0
+	for scanner.Scan() {
+		url := scanner.Text()
+		lineNum++
+
+		if lineNum < *startFrom {
+			continue
+		}
+
+		filename := getSafeFilename(url)
+
+		fullPath := filepath.Join(*outputDir, filename)
+
+		if _, err := os.Stat(fullPath); err == nil {
+			fmt.Printf("Skipping %s (already exists)\n", fullPath)
+			continue
+		}
+
+		wg.Add(1)
+		sem <- struct{}{} // Acquire semaphore
+		go func(url, path string, line int) {
+			defer wg.Done()
+			defer func() { <-sem }() // Release semaphore
+
+			fmt.Printf("Starting download [%d]: %s\n", line, path)
+			start := time.Now()
+
+			var downloadErr error
+			for attempt := 0; attempt < *retries; attempt++ {
+				if attempt > 0 {
+					fmt.Printf("Retry %d/%d for %s\n", attempt, *retries, path)
+					time.Sleep(time.Duration(attempt) * 2 * time.Second) // Linear backoff between retries
+				}
+
+				downloadErr = downloadFile(url, path, *chunkConcurrency)
+				if downloadErr == nil {
+					break // Download succeeded
+				}
+
+				fmt.Printf("Error on attempt %d: %v\n", attempt+1, downloadErr)
+			}
+
+			if downloadErr != nil {
+				fmt.Printf("Failed to download after %d attempts: %s - %v\n",
+					*retries, url, downloadErr)
+				return
+			}
+
+			elapsed := time.Since(start)
+			fmt.Printf("Completed download [%d]: %s (took %s)\n", line, path, elapsed)
+		}(url, fullPath, lineNum)
+	}
+
+	wg.Wait()
+	fmt.Println("All downloads completed!")
+}
diff --git a/src/parser.zig b/src/parser.zig
new file mode 100644
index 0000000..4d860a7
--- /dev/null
+++ b/src/parser.zig
@@ -0,0 +1,375 @@
+const std = @import("std");
+const ascii = std.ascii;
+const mem = std.mem;
+const fs = std.fs;
+
+pub fn main() !void {
+    var general_purpose_allocator = std.heap.GeneralPurposeAllocator(.{}){};
+    const gpa = general_purpose_allocator.allocator();
+    defer _ = general_purpose_allocator.deinit();
+
+    const args = try std.process.argsAlloc(gpa);
+    defer std.process.argsFree(gpa, args);
+
+    if (args.len < 2) {
+        std.debug.print("Usage: {s} <sql_file_path> [output_file_path] [md5_to_debug]\n", .{args[0]});
+        std.debug.print("If output_file_path is not provided, results will be printed to stdout\n", .{});
+        return;
+    }
+
+    const file_path = args[1];
+    const output_to_file = args.len >= 3;
+    const output_path = if (output_to_file) args[2] else "";
+
+    // Optional MD5 to debug - if provided, print all details for this record
+    const debug_md5 = if (args.len >= 4) args[3] else "";
+
+    var output_file: ?fs.File = null;
+    if (output_to_file) {
+        output_file = try fs.cwd().createFile(output_path, .{});
+    }
+
+    try processLargeSQLFile(file_path, output_file, debug_md5, gpa);
+
+    if (output_file) |of| {
+        of.close();
+    }
+
+    std.debug.print("SQL processing completed.\n", .{});
+}
+
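+// processLargeSQLFile streams the SQL dump in 1MB chunks, waits for the "LOCK TABLES" marker, then
+// tracks parenthesised value tuples outside quoted strings and hands each complete tuple to
+// processEntryTupleFromBuffer.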
+fn processLargeSQLFile(file_path: []const u8, output_file: ?fs.File, debug_md5: []const u8, allocator: std.mem.Allocator) !void {
+    std.debug.print("Opening file: {s}\n", .{file_path});
+
+    const file = try std.fs.cwd().openFile(file_path, .{});
+    defer file.close();
+
+    const file_size = try file.getEndPos();
+    std.debug.print("File size: {} bytes\n", .{file_size});
+
+    var reader = file.reader();
+
+    // Using a 1MB buffer to read chunks
+    const buffer_size = 1024 * 1024;
+    var buffer = try allocator.alloc(u8, buffer_size);
+    defer allocator.free(buffer);
+
+    var in_values_section = false;
+    var in_tuple = false;
+    var in_quotes = false;
+    var tuple_start: usize = 0;
+    var bytes_read: usize = 0;
+    var tuple_buffer = std.ArrayList(u8).init(allocator);
+    defer tuple_buffer.deinit();
+
+    var entry_count: usize = 0;
+    var last_progress_pct: usize = 0;
+
+    while (true) {
+        const read_amount = try reader.read(buffer);
+        if (read_amount == 0) break; // End of file
+
+        bytes_read += read_amount;
+
+        const progress_pct = bytes_read * 100 / file_size;
+        if (progress_pct > last_progress_pct) {
+            std.debug.print("Progress: {}% ({} of {} bytes)\n", .{ progress_pct, bytes_read, file_size });
+            last_progress_pct = progress_pct;
+        }
+
+        var i: usize = 0;
+        while (i < read_amount) {
+            const c = buffer[i];
+
+            if (!in_values_section) {
+                if (c == 'L') {
+                    if (i + 11 < read_amount and
+                        mem.eql(u8, buffer[i .. i + 11], "LOCK TABLES"))
+                    {
+                        in_values_section = true;
+                        std.debug.print("Found start of values section\n", .{});
+                    }
+                }
+                i += 1;
+                continue;
+            }
+
+            if (c == '\'' and (i == 0 or buffer[i - 1] != '\\')) {
+                in_quotes = !in_quotes;
+            }
+
+            if (!in_quotes) {
+                if (c == '(') {
+                    if (!in_tuple) {
+                        in_tuple = true;
+                        tuple_start = i + 1;
+                        tuple_buffer.clearRetainingCapacity();
+                    }
+                } else if (c == ')' and in_tuple) {
+                    try tuple_buffer.append(')'); // Add the closing parenthesis
+                    if (processEntryTupleFromBuffer(&tuple_buffer, output_file, debug_md5, allocator, &entry_count)) {
+                        // Success - continue
+                    } else |err| {
+                        // Log error but continue processing
+                        std.debug.print("Error processing tuple: {}\n", .{err});
+                    }
+                    in_tuple = false;
+                }
+            }
+
+            if (in_tuple) {
+                try tuple_buffer.append(c);
+            }
+
+            i += 1;
+        }
+    }
+
+    std.debug.print("Processed {} entries from {} bytes\n", .{ entry_count, bytes_read });
+}
+
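+// processEntryTupleFromBuffer splits one value tuple into fields on unquoted commas, builds a
+// download URL from the ID, Title, Author, Year, Extension and MD5 fields, and writes it to the
+// output file (or stdout if no output file was given).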
+fn processEntryTupleFromBuffer(tuple_buffer: *std.ArrayList(u8), output_file: ?fs.File, debug_md5: []const u8, allocator: std.mem.Allocator, entry_count: *usize) !void {
+    const tuple = tuple_buffer.items;
+
+    if (tuple.len < 10) return; // Skip very small tuples
+
+    // The tuple format is (field1,field2,field3,...)
+    // We need to extract fields 1 (Title), 37 (MD5), and 40 (Locator)
+
+    var fields = std.ArrayList([]const u8).init(allocator);
+    defer fields.deinit();
+
+    var in_quotes = false;
+    var field_start: usize = 0;
+
+    var start_idx: usize = 0;
+    if (tuple.len > 0 and tuple[0] == '(') {
+        start_idx = 1;
+        field_start = 1;
+    }
+
+    for (tuple[start_idx..], start_idx..) |c, i| {
+        if (c == '\'' and (i == start_idx or tuple[i - 1] != '\\')) {
+            in_quotes = !in_quotes;
+        } else if (c == ',' and !in_quotes) {
+            try fields.append(tuple[field_start..i]);
+            field_start = i + 1;
+        } else if (c == ')' and !in_quotes and i == tuple.len - 1) {
+            // End of tuple
+            if (field_start < i) {
+                try fields.append(tuple[field_start..i]);
+            }
+            break;
+        }
+    }
+
+    if (fields.items.len < 41) {
+        return;
+    }
+
+    const id_raw = if (fields.items.len > 0) fields.items[0] else ""; // ID is field 0
+    const title_raw = if (fields.items.len > 1) fields.items[1] else ""; // Title is field 1
+    const extension_raw = if (fields.items.len > 36) fields.items[36] else ""; // Extension is field 36
+    const md5_raw = if (fields.items.len > 37) fields.items[37] else ""; // MD5 is field 37
+    const locator_raw = if (fields.items.len > 40) fields.items[40] else ""; // Locator is field 40
+    const local_raw = if (fields.items.len > 41) fields.items[41] else ""; // Local is field 41
+
+    const id = cleanSQLString(id_raw, allocator) catch try allocator.dupe(u8, "");
+    defer allocator.free(id);
+
+    const title = cleanSQLString(title_raw, allocator) catch try allocator.dupe(u8, "");
+    defer allocator.free(title);
+
+    const extension = cleanSQLString(extension_raw, allocator) catch try allocator.dupe(u8, "");
+    defer allocator.free(extension);
+
+    const md5 = cleanSQLString(md5_raw, allocator) catch try allocator.dupe(u8, "");
+    defer allocator.free(md5);
+
+    if (md5.len == 0) return; // Skip if no MD5
+
+    const locator = cleanSQLString(locator_raw, allocator) catch try allocator.dupe(u8, "");
+    defer allocator.free(locator);
+
+    const local = cleanSQLString(local_raw, allocator) catch try allocator.dupe(u8, "");
+    defer allocator.free(local);
+
+    const author_raw = if (fields.items.len > 5) fields.items[5] else "";
+    const year_raw = if (fields.items.len > 6) fields.items[6] else "";
+
+    const author = cleanSQLString(author_raw, allocator) catch try allocator.dupe(u8, "");
+    defer allocator.free(author);
+
+    const year = cleanSQLString(year_raw, allocator) catch try allocator.dupe(u8, "");
+    defer allocator.free(year);
+
+    const is_debug_entry = debug_md5.len > 0 and std.mem.eql(u8, md5, debug_md5);
+
+    // It is not entirely clear how this folder ID is defined, but rounding the record ID down to the
+    // nearest 1000 (record ID / 1000 * 1000) matched the examples tested, so that logic is used here.
+    var folder_id_buf: [20]u8 = undefined; // Stack buffer for folder ID
+    var folder_id: []const u8 = "0"; // Default
+
+    // First try to get the raw ID directly
+    if (id.len > 0) {
+        const id_num = std.fmt.parseInt(u32, id, 10) catch 0;
+        if (id_num > 0) {
+            // Calculate folder as ID/1000*1000
+            const folder_num = (id_num / 1000) * 1000;
+            folder_id = std.fmt.bufPrint(&folder_id_buf, "{d}", .{folder_num}) catch "0";
+
+            if (is_debug_entry) {
+                std.debug.print("Calculated folder ID: {s} from record ID: {s}\n", .{ folder_id, id });
+            }
+        } else {
+            // Error parsing, fallback
+            if (is_debug_entry) {
+                std.debug.print("Couldn't parse ID, using default folder 0\n", .{});
+            }
+        }
+    }
+
+    var filename_buffer = std.ArrayList(u8).init(allocator);
+    defer filename_buffer.deinit();
+
+    // Format: Author - Title (Year).extension
+    if (author.len > 0) {
+        try filename_buffer.appendSlice(author);
+        try filename_buffer.appendSlice(" - ");
+    }
+
+    try filename_buffer.appendSlice(title);
+
+    if (year.len > 0) {
+        try filename_buffer.appendSlice(" (");
+        try filename_buffer.appendSlice(year);
+        try filename_buffer.appendSlice(")");
+    }
+
+    if (extension.len > 0) {
+        try filename_buffer.appendSlice(".");
+        try filename_buffer.appendSlice(extension);
+    }
+
+    const filename = filename_buffer.items;
+
+    var encoded_filename = try allocator.alloc(u8, filename.len * 3); // Worst case: each char becomes %XX
+    defer allocator.free(encoded_filename);
+
+    const encoded_size = try urlEncode(filename, encoded_filename);
+
+    const url = try std.fmt.allocPrint(allocator, "https://download.books.ms/main/{s}/{s}/{s}", .{ folder_id, md5, encoded_filename[0..encoded_size] });
+    defer allocator.free(url);
+
+    if (is_debug_entry) {
+        std.debug.print("\n=== DEBUG INFO FOR MD5: {s} ===\n", .{md5});
+        std.debug.print("ID: {s}\n", .{id});
+        std.debug.print("Title: {s}\n", .{title});
+        std.debug.print("Extension: {s}\n", .{extension});
+        std.debug.print("Author: {s}\n", .{author});
+        std.debug.print("Year: {s}\n", .{year});
+        std.debug.print("Locator: {s}\n", .{locator});
+        std.debug.print("Local: {s}\n", .{local});
+        std.debug.print("Folder ID: {s}\n", .{folder_id});
+        std.debug.print("Generated URL: {s}\n", .{url});
+        std.debug.print("=== END DEBUG INFO ===\n\n", .{});
+    }
+
+    if (output_file) |of| {
+        of.writeAll(url) catch {
+            std.debug.print("Error writing to file\n", .{});
+        };
+        of.writeAll("\n") catch {};
+    } else {
+        std.debug.print("{s}\n", .{url});
+    }
+
+    entry_count.* += 1;
+}
+
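+// cleanSQLString strips the surrounding single quotes from a raw SQL field and resolves the common
+// backslash escape sequences (\', \\, \", \n, \t).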
+fn cleanSQLString(s: []const u8, allocator: std.mem.Allocator) ![]u8 {
+    if (s.len == 0) return allocator.dupe(u8, "");
+
+    var result = s;
+    if (result.len >= 2 and result[0] == '\'' and result[result.len - 1] == '\'') {
+        result = result[1 .. result.len - 1];
+    }
+
+    var cleaned = std.ArrayList(u8).init(allocator);
+    errdefer cleaned.deinit();
+
+    var i: usize = 0;
+    while (i < result.len) {
+        if (i + 1 < result.len and result[i] == '\\') {
+            switch (result[i + 1]) {
+                '\'', '\\', '"' => {
+                    try cleaned.append(result[i + 1]);
+                    i += 2;
+                },
+                'n' => {
+                    try cleaned.append('\n');
+                    i += 2;
+                },
+                't' => {
+                    try cleaned.append('\t');
+                    i += 2;
+                },
+                else => {
+                    // Skip the backslash but include the character after it
+                    i += 1;
+                    try cleaned.append(result[i]);
+                    i += 1;
+                },
+            }
+        } else {
+            try cleaned.append(result[i]);
+            i += 1;
+        }
+    }
+
+    return cleaned.toOwnedSlice();
+}
+
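+// urlEncode percent-encodes everything except the RFC 3986 unreserved characters (alphanumerics and
+// - _ . ~) into the caller-provided buffer and returns the number of bytes written.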
+fn urlEncode(input: []const u8, buffer: []u8) !usize {
+    var j: usize = 0;
+    for (input) |c| {
+        if (j + 3 > buffer.len) {
+            // Not enough room left for a worst-case %XX escape; avoid overrunning the buffer
+            return error.BufferTooSmall;
+        }
+
+        if (ascii.isAlphanumeric(c) or c == '-' or c == '_' or c == '.' or c == '~') {
+            buffer[j] = c;
+            j += 1;
+        } else if (c == ' ') {
+            buffer[j] = '%';
+            buffer[j + 1] = '2';
+            buffer[j + 2] = '0';
+            j += 3;
+        } else {
+            // Encode as %XX
+            buffer[j] = '%';
+
+            // Convert to hex
+            const hi = c >> 4;
+            const lo = c & 0x0F;
+
+            if (hi < 10) {
+                buffer[j + 1] = '0' + @as(u8, @intCast(hi));
+            } else {
+                buffer[j + 1] = 'A' + @as(u8, @intCast(hi - 10));
+            }
+
+            if (lo < 10) {
+                buffer[j + 2] = '0' + @as(u8, @intCast(lo));
+            } else {
+                buffer[j + 2] = 'A' + @as(u8, @intCast(lo - 10));
+            }
+
+            j += 3;
+        }
+    }
+
+    return j; // Return the number of bytes written
+}