Prevent duplicate regex_patterns

This commit is contained in:
Johan van Eck 2025-07-27 20:32:00 +03:00
parent d4582e7330
commit 06a7c06933
235 changed files with 467 additions and 782 deletions

View file

@ -2,11 +2,7 @@ import os
import json
import yaml
from utils.strings import get_name
# TODO: prevent duplicates by writing only unique regex patterns to files.
# As of now, negations can in some cases result in a new regex pattern.
# NOTE: all duplicate patterns would need to be tracked so that trash_id can still be matched.
from utils.strings import get_regex_pattern_name
def collect_regex_pattern(service, file_name, input_json, output_dir):
@ -28,7 +24,7 @@ def collect_regex_pattern(service, file_name, input_json, output_dir):
# Compose YAML structure
name = spec.get("name", "")
yml_data = {
"name": get_name(service, name),
"name": get_regex_pattern_name(service, name),
"pattern": pattern,
"description": "",
"tags": [],
@ -38,8 +34,13 @@ def collect_regex_pattern(service, file_name, input_json, output_dir):
# Output path
output_path = os.path.join(
output_dir,
f"{get_name(service, name)}.yml",
f"{get_regex_pattern_name(service, name)}.yml",
)
if os.path.exists(output_path):
print(f"exists{output_path}, skipping")
continue
with open(output_path, "w", encoding="utf-8") as f:
yaml.dump(yml_data, f, sort_keys=False, allow_unicode=True)
print(f"Generated: {output_path}")
@ -47,7 +48,7 @@ def collect_regex_pattern(service, file_name, input_json, output_dir):
def collect_regex_patterns(service, input_dir, output_dir):
for root, _, files in os.walk(input_dir):
for filename in files:
for filename in sorted(files):
if not filename.endswith(".json"):
continue