# get_malware_yara_rules.py
"""
This script downloads and processes YARA rules from the Yara-Rules GitHub repository.
It filters out incompatible rules, handles duplicates, and merges them into a single file.
It also creates an index file for quick rule lookup.
Since it is already included in the repo, you won't need to run it.
However, you might want to refresh the rules periodically.
"""
import json
import os
import re
import shutil
def get_rules_from_git():
    """Fetch a fresh copy of the Yara-Rules repository into ``./rules``.

    Any existing ``./rules`` directory is removed first, the repository is
    cloned with ``git``, and the clone's ``.git`` directory is deleted
    afterwards.

    Raises:
        FileNotFoundError: If the ``git`` executable cannot be found.
        subprocess.CalledProcessError: If ``git clone`` exits with a
            non-zero status.
    """
    # Imported here so the function is self-contained with respect to the
    # module's import block.
    import subprocess

    shutil.rmtree("./rules", ignore_errors=True)
    # An argument list (no shell) plus check=True makes a failed clone raise
    # instead of being silently ignored.
    subprocess.run(
        ["git", "clone", "https://github.com/Yara-Rules/rules.git"],
        check=True,
    )
    # Remove .git directory
    shutil.rmtree("./rules/.git", ignore_errors=True)
def list_yara_files():
    """Collect the paths of all YARA rule files under ``./rules/malware``.

    The tree is walked top-down. Both sub-directories and file names are
    visited in sorted order, so the returned list has the same order on
    every run regardless of the order the filesystem reports entries in.

    Returns:
        list[str]: One path per file whose extension is ``.yar`` or
        ``.yara``, each formed by joining the walked directory and the
        file name.
    """
    yara_extensions = (".yar", ".yara")
    all_yara_files = []
    for root, directories, filenames in os.walk("./rules/malware"):
        print("Processing " + root)
        # os.walk descends into `directories` in list order, so sorting the
        # list in place fixes the traversal order of the sub-directories.
        directories.sort()
        for file_name in sorted(filenames):
            if os.path.splitext(file_name)[1] in yara_extensions:
                all_yara_files.append(os.path.join(root, file_name))
    return all_yara_files
def remove_incompatible_imports(files):
    """Return the subset of ``files`` that contain no incompatible markers.

    A file is dropped when its text contains any of ``import "math"``,
    ``import "cuckoo"``, ``import "hash"`` or the substring ``imphash``.
    Undecodable bytes are ignored while reading.

    Args:
        files: Iterable of paths to YARA rule files.

    Returns:
        list: The paths that passed the filter, in their original order.
    """
    blocked_markers = (
        'import "math"',
        'import "cuckoo"',
        'import "hash"',
        "imphash",
    )
    kept = []
    for path in files:
        with open(path, errors="ignore") as handle:
            text = handle.read()
        # Plain substring checks: one hit anywhere in the file excludes it.
        if any(marker in text for marker in blocked_markers):
            continue
        kept.append(path)
    return kept
def fix_duplicated_rules(files):
    """Concatenate rule files while keeping only the first ``is__elf`` rule.

    The first line that reads ``private rule is__elf {`` (ignoring
    surrounding whitespace) is kept together with the lines that follow it.
    Every later occurrence starts a skipped region that ends with the next
    line consisting solely of ``}``. A ``"\\n"`` entry is appended after each
    file's lines.

    Args:
        files: Iterable of paths to YARA rule files, processed in order.

    Returns:
        list[str]: The retained lines of all files, ready to be joined.
    """
    elf_header = "private rule is__elf {"
    merged_lines = []
    elf_seen = False  # becomes True once the first is__elf rule is met
    skipping = False  # True while inside a duplicate is__elf rule
    for path in files:
        print("Processing " + path)
        with open(path, errors="ignore") as handle:
            for raw_line in handle:
                stripped = raw_line.strip()
                if stripped == elf_header:
                    if elf_seen:
                        skipping = True
                    else:
                        elf_seen = True
                if not skipping:
                    merged_lines.append(raw_line)
                # Checked after the append so the closing brace of a
                # duplicate is dropped along with the rest of that rule.
                if elf_seen and stripped == "}":
                    skipping = False
        merged_lines.append("\n")
    return merged_lines
def merge_rules(all_rules):
    """Write the given text fragments, concatenated, to ``malware_rules.yar``.

    Args:
        all_rules: Iterable of strings; they are joined without a separator.
    """
    with open("malware_rules.yar", "w") as output:
        merged_text = "".join(all_rules)
        output.write(merged_text)
def create_rule_index(yara_file):
    """Create an index of rule names and their definitions.

    The file is scanned for ``rule <name> [: tags] {`` headers. For each
    header the definition runs from the ``rule`` keyword to the brace that
    balances the opening one. The result is written to
    ``malware_rules_index.json`` as a mapping of rule name to definition
    text; tagged rules are additionally stored under ``"<name> : <tags>"``.

    Rules whose braces never balance are skipped rather than being stored
    with an empty definition.

    Args:
        yara_file: Path to the merged YARA rule file to index.
    """
    with open(yara_file, errors="ignore") as f:
        content = f.read()

    # Find all rule definitions, including those with tags.
    # The leading \b keeps "rule" from matching as the tail of a longer word
    # (e.g. "subrule x {").
    rule_pattern = re.compile(
        r"\brule\s+(\w+)(?:\s*:\s*([\w\s]+))?\s*{", re.MULTILINE
    )
    brace_pattern = re.compile(r"[{}]")

    # NOTE(review): the stored text starts at the `rule` keyword, so a
    # preceding `private`/`global` modifier is not part of the definition --
    # confirm whether consumers of the index need it.
    # NOTE(review): braces are counted textually, so an unbalanced brace
    # inside a string literal or comment would shift the detected end.
    rule_index = {}
    for match in rule_pattern.finditer(content):
        rule_name = match.group(1)
        tags = match.group(2).strip() if match.group(2) else None
        start_pos = match.start()

        # Find the end of the rule: the brace that brings the depth back to 0.
        end_pos = None
        depth = 0
        for brace in brace_pattern.finditer(content, start_pos):
            if brace.group() == "{":
                depth += 1
            else:
                depth -= 1
                if depth == 0:
                    end_pos = brace.end()
                    break

        if end_pos is None:
            print(f"Skipping rule {rule_name}: no matching closing brace found")
            continue

        rule_definition = content[start_pos:end_pos].strip()

        # Store the rule by its base name
        rule_index[rule_name] = rule_definition

        # If the rule has tags, also store it with the full name including tags
        if tags:
            full_rule_name = f"{rule_name} : {tags}"
            rule_index[full_rule_name] = rule_definition

    # Save the index to a JSON file
    with open("malware_rules_index.json", "w") as f:
        json.dump(rule_index, f, indent=2)

    print(f"Created rule index with {len(rule_index)} entries")
def main():
    """Run the whole refresh: clone, list, filter, de-duplicate, merge, index."""
    get_rules_from_git()
    compatible_files = remove_incompatible_imports(list_yara_files())
    merged_lines = fix_duplicated_rules(compatible_files)
    merge_rules(merged_lines)
    # The index is built from the file that merge_rules has just written.
    create_rule_index("malware_rules.yar")


if __name__ == "__main__":
    main()