From 37a6444544dd5e87104d0a5b69f8fc7ae0594dd1 Mon Sep 17 00:00:00 2001 From: HydraDragonAntivirus <142328963+HydraDragonAntivirus@users.noreply.github.com> Date: Fri, 7 Jun 2024 23:00:09 +0300 Subject: [PATCH 1/7] #52 --- yarGen.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/yarGen.py b/yarGen.py index 7c76e5a..b63cf39 100644 --- a/yarGen.py +++ b/yarGen.py @@ -589,7 +589,7 @@ def filter_string_set(string_set): goodstring = True goodcount = good_strings_db[string] # print "%s - %s" % ( goodstring, good_strings[string] ) - if args.excludegood: + if args.excludegood and string in reversedStrings: continue # UTF From ef21e90daadc4ebdb051608345de881a50ffa64c Mon Sep 17 00:00:00 2001 From: HydraDragonAntivirus <142328963+HydraDragonAntivirus@users.noreply.github.com> Date: Sat, 8 Jun 2024 13:37:07 +0300 Subject: [PATCH 2/7] reversed strings --- yarGen.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/yarGen.py b/yarGen.py index b63cf39..250e985 100644 --- a/yarGen.py +++ b/yarGen.py @@ -585,11 +585,11 @@ def filter_string_set(string_set): goodcount = 0 # Goodware Strings - if string in good_strings_db: + if string in good_strings_db and string in reversedStrings: goodstring = True goodcount = good_strings_db[string] # print "%s - %s" % ( goodstring, good_strings[string] ) - if args.excludegood and string in reversedStrings: + if args.excludegood: continue # UTF From 0c1e7d1725f01a892eebbe501a74d5a7e81e487a Mon Sep 17 00:00:00 2001 From: HydraDragonAntivirus <142328963+HydraDragonAntivirus@users.noreply.github.com> Date: Sat, 8 Jun 2024 14:10:14 +0300 Subject: [PATCH 3/7] No more reversed strings --- yarGen.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/yarGen.py b/yarGen.py index 250e985..bc4b3c0 100644 --- a/yarGen.py +++ b/yarGen.py @@ -585,7 +585,7 @@ def filter_string_set(string_set): goodcount = 0 # Goodware Strings - if string in good_strings_db and string in reversedStrings: + if string in 
good_strings_db: goodstring = True goodcount = good_strings_db[string] # print "%s - %s" % ( goodstring, good_strings[string] ) @@ -944,6 +944,7 @@ def filter_string_set(string_set): # Reversed String ----------------------------------------------------- if string[::-1] in good_strings_db: + if not args.excludegood: localStringScores[string] += 10 reversedStrings[string] = string[::-1] From a7e297c87c9377690e1e1cd59d66474959ee0749 Mon Sep 17 00:00:00 2001 From: HydraDragonAntivirus <142328963+HydraDragonAntivirus@users.noreply.github.com> Date: Wed, 6 Nov 2024 14:48:12 +0300 Subject: [PATCH 4/7] Only meaningful words --- requirements.txt | 3 ++- yarGen.py | 31 +++++++++++++++++++++++++++++++ 2 files changed, 33 insertions(+), 1 deletion(-) diff --git a/requirements.txt b/requirements.txt index a0e8d75..a725888 100644 --- a/requirements.txt +++ b/requirements.txt @@ -1,2 +1,3 @@ lief -lxml \ No newline at end of file +lxml +nltk \ No newline at end of file diff --git a/yarGen.py b/yarGen.py index bc4b3c0..2cc09d1 100644 --- a/yarGen.py +++ b/yarGen.py @@ -29,6 +29,21 @@ from hashlib import sha256 import signal as signal_module from lxml import etree +import nltk +from nltk.corpus import words +from nltk.tokenize import word_tokenize + +# Ensure that necessary NLTK resources are available +nltk.download('punkt') +nltk.download('words') + +# A simple filter function to consider only meaningful words (ignoring non-English or arbitrary symbols) +def filter_meaningful_words(word_list): + # Only allow alphabetic, lowercase words + return [word for word in word_list if word.isalpha() and word.islower()] + +# Load NLTK word corpus +nltk_words = set(words.words()) RELEVANT_EXTENSIONS = [".asp", ".vbs", ".ps", ".ps1", ".tmp", ".bas", ".bat", ".cmd", ".com", ".cpl", ".crt", ".dll", ".exe", ".msc", ".scr", ".sys", ".vb", ".vbe", ".vbs", ".wsc", @@ -579,6 +594,15 @@ def filter_string_set(string_set): utfstrings = [] for string in string_set: + # Filter meaningful words based on 
the flag + if args.meaningful_words_only: + # Tokenize the string and check if it contains any meaningful word + tokens = word_tokenize(string) + contains_meaningful_word = any(word.lower() in nltk_words for word in tokens) + + # If no meaningful word is found, skip this string + if not contains_meaningful_word: + continue # Goodware string marker goodstring = False @@ -2139,6 +2163,9 @@ def print_welcome(): group_inverse.add_argument('--nodirname', help=argparse.SUPPRESS, action='store_true', default=False) group_inverse.add_argument('--noscorefilter', help=argparse.SUPPRESS, action='store_true', default=False) + group_creation.add_argument('--meaningful-words-only', help='Only include strings containing meaningful words (default: False)', + action='store_true', default=False) + args = parser.parse_args() # Print Welcome @@ -2162,6 +2189,10 @@ def print_welcome(): print("[+] Updated databases - you can now start creating YARA rules") sys.exit(0) + # Check if the meaningful-words-only flag is set and handle accordingly + if args.meaningful_words_only: + print("[+] Only including strings containing meaningful words (non-trivial, dictionary-based).") + # Typical input erros if args.m: if os.path.isfile(args.m): From 42438a0885fbe1be391b07b772a3c9a897c21466 Mon Sep 17 00:00:00 2001 From: HydraDragonAntivirus <142328963+HydraDragonAntivirus@users.noreply.github.com> Date: Thu, 7 Nov 2024 10:46:53 +0300 Subject: [PATCH 5/7] punkt_tab added to download list --- yarGen.py | 1 + 1 file changed, 1 insertion(+) diff --git a/yarGen.py b/yarGen.py index 2cc09d1..533515f 100644 --- a/yarGen.py +++ b/yarGen.py @@ -35,6 +35,7 @@ # Ensure that necessary NLTK resources are available nltk.download('punkt') +nltk.download('punkt_tab') nltk.download('words') # A simple filter function to consider only meaningful words (ignoring non-English or arbitrary symbols) From 772d244c4537f71d0652ab1fe3f95bb83a0b2697 Mon Sep 17 00:00:00 2001 From: HydraDragonAntivirus 
<142328963+HydraDragonAntivirus@users.noreply.github.com> Date: Thu, 7 Nov 2024 11:45:28 +0300 Subject: [PATCH 6/7] minimum 4 characters --- yarGen.py | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/yarGen.py b/yarGen.py index 533515f..5c458d5 100644 --- a/yarGen.py +++ b/yarGen.py @@ -599,11 +599,13 @@ def filter_string_set(string_set): if args.meaningful_words_only: # Tokenize the string and check if it contains any meaningful word tokens = word_tokenize(string) - contains_meaningful_word = any(word.lower() in nltk_words for word in tokens) + contains_meaningful_word = any( + word.lower() in nltk_words and len(word) >= 4 for word in tokens + ) # If no meaningful word is found, skip this string if not contains_meaningful_word: - continue + continue # Goodware string marker goodstring = False From 6c92f26e7f3680d0f1a447bdd0a9ebbdaada2ef1 Mon Sep 17 00:00:00 2001 From: HydraDragonAntivirus <142328963+HydraDragonAntivirus@users.noreply.github.com> Date: Thu, 7 Nov 2024 12:56:00 +0300 Subject: [PATCH 7/7] nltk download order --- yarGen.py | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/yarGen.py b/yarGen.py index 5c458d5..ddeaeae 100644 --- a/yarGen.py +++ b/yarGen.py @@ -30,14 +30,15 @@ import signal as signal_module from lxml import etree import nltk -from nltk.corpus import words -from nltk.tokenize import word_tokenize # Ensure that necessary NLTK resources are available nltk.download('punkt') nltk.download('punkt_tab') nltk.download('words') +from nltk.corpus import words +from nltk.tokenize import word_tokenize + # A simple filter function to consider only meaningful words (ignoring non-English or arbitrary symbols) def filter_meaningful_words(word_list): # Only allow alphabetic, lowercase words