libgenbulk
Owner: IIIlllIIIllI URL: git@github.com:nyangkosense/libgenbulk.git
add filtering for lang
Commit 2abdc046923c83f617ca13d241e0cdc7f6dfa7fc by SM <seb.michalk@gmail.com> on 2025-05-21 11:11:04 +0200
diff --git a/bin/parser b/bin/parser
index 9d24345..cd3e3d3 100755
Binary files a/bin/parser and b/bin/parser differ
diff --git a/src/parser.zig b/src/parser.zig
index 4d860a7..e4b68ad 100644
--- a/src/parser.zig
+++ b/src/parser.zig
@@ -12,8 +12,9 @@ pub fn main() !void {
defer std.process.argsFree(gpa, args);
if (args.len < 2) {
- std.debug.print("Usage: {s} <sql_file_path> [output_file_path] [md5_to_debug]\n", .{args[0]});
- std.debug.print("If output_file_path is not provided, results will be printed to stdout\n", .{});
+ std.debug.print("Usage: {s} <sql_file_path> [output_file_path] [language1, language2, ...] [md5_to_debug]\n", .{args[0]});
+ std.debug.print("If output_file_path is not provided, results will be printed to stdout \n", .{});
+ std.debug.print("If no languages are specified, all languages will be included \n", .{});
return;
}
@@ -24,12 +25,40 @@ pub fn main() !void {
// Optional MD5 to debug - if provided, print all details for this record
const debug_md5 = if (args.len >= 4) args[3] else "";
+ var languages = std.ArrayList([]const u8).init(gpa);
+ defer {
+ for (languages.items) |lang| {
+ gpa.free(lang);
+ }
+ languages.deinit();
+ }
+
+ if (args.len >= 4) {
+ var lang_iter = std.mem.split(u8, args[3], ",");
+ while (lang_iter.next()) |lang| {
+ if (lang.len > 0) {
+ const normalized_lang = try gpa.dupe(u8, lang);
+ for (0..normalized_lang.len) |i| {
+ normalized_lang[i] = std.ascii.toLower(normalized_lang[i]);
+ }
+ try languages.append(normalized_lang);
+ }
+ }
+
+ std.debug.print("Filtering for languages: ", .{});
+ for (languages.items) |lang| {
+ std.debug.print("{s} ", .{lang});
+ }
+ std.debug.print("\n", .{});
+ }
+
+
var output_file: ?fs.File = null;
if (output_to_file) {
output_file = try fs.cwd().createFile(output_path, .{});
}
- try processLargeSQLFile(file_path, output_file, debug_md5, gpa);
+ try processLargeSQLFile(file_path, output_file, debug_md5, &languages, gpa);
if (output_file) |of| {
of.close();
@@ -38,7 +67,7 @@ pub fn main() !void {
std.debug.print("SQL processing completed.\n", .{});
}
-fn processLargeSQLFile(file_path: []const u8, output_file: ?fs.File, debug_md5: []const u8, allocator: std.mem.Allocator) !void {
+fn processLargeSQLFile(file_path: []const u8, output_file: ?fs.File, debug_md5: []const u8, languages: *std.ArrayList([]const u8), allocator: std.mem.Allocator) !void {
std.debug.print("Opening file: {s}\n", .{file_path});
const file = try std.fs.cwd().openFile(file_path, .{});
@@ -107,7 +136,7 @@ fn processLargeSQLFile(file_path: []const u8, output_file: ?fs.File, debug_md5:
}
} else if (c == ')' and in_tuple) {
try tuple_buffer.append(')'); // Add the closing parenthesis
- if (processEntryTupleFromBuffer(&tuple_buffer, output_file, debug_md5, allocator, &entry_count)) {
+ if (processEntryTupleFromBuffer(&tuple_buffer, output_file, debug_md5, languages, allocator, &entry_count)) {
// Success - continue
} else |err| {
// Log error but continue processing
@@ -128,7 +157,7 @@ fn processLargeSQLFile(file_path: []const u8, output_file: ?fs.File, debug_md5:
std.debug.print("Processed {} entries from {} bytes\n", .{ entry_count, bytes_read });
}
-fn processEntryTupleFromBuffer(tuple_buffer: *std.ArrayList(u8), output_file: ?fs.File, debug_md5: []const u8, allocator: std.mem.Allocator, entry_count: *usize) !void {
+fn processEntryTupleFromBuffer(tuple_buffer: *std.ArrayList(u8), output_file: ?fs.File, debug_md5: []const u8, languages: *std.ArrayList([]const u8), allocator: std.mem.Allocator, entry_count: *usize) !void {
const tuple = tuple_buffer.items;
if (tuple.len < 10) return; // Skip very small tuples
@@ -173,6 +202,10 @@ fn processEntryTupleFromBuffer(tuple_buffer: *std.ArrayList(u8), output_file: ?f
const md5_raw = if (fields.items.len > 37) fields.items[37] else ""; // MD5 is field 37
const locator_raw = if (fields.items.len > 40) fields.items[40] else ""; // Locator is field 40
const local_raw = if (fields.items.len > 41) fields.items[41] else ""; // Local is field 41
+ const language_raw = if (fields.items.len > 12) fields.items[12] else "";
+
+ const language = cleanSQLString(language_raw, allocator) catch try allocator.dupe(u8, "");
+ defer allocator.free(language);
const id = cleanSQLString(id_raw, allocator) catch try allocator.dupe(u8, "");
defer allocator.free(id);
@@ -252,6 +285,21 @@ fn processEntryTupleFromBuffer(tuple_buffer: *std.ArrayList(u8), output_file: ?f
try filename_buffer.appendSlice(extension);
}
+ // SQL LANGUAGE
+ var language_allowed = false;
+ if (languages.items.len == 0) {
+ language_allowed = true;
+ } else {
+ for (languages.items) |allowed_lang| {
+ if (std.ascii.eqlIgnoreCase(language, allowed_lang)) {
+ language_allowed = true;
+ break;
+ }
+ }
+ }
+
+ if (!language_allowed) return;
+
const filename = filename_buffer.items;
var encoded_filename = try allocator.alloc(u8, filename.len * 3); // Worst case: each char becomes %XX