From 37a6444544dd5e87104d0a5b69f8fc7ae0594dd1 Mon Sep 17 00:00:00 2001
From: HydraDragonAntivirus
 <142328963+HydraDragonAntivirus@users.noreply.github.com>
Date: Fri, 7 Jun 2024 23:00:09 +0300
Subject: [PATCH 1/7] #52: with excludegood, skip only goodware strings found in reversedStrings

---
 yarGen.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/yarGen.py b/yarGen.py
index 7c76e5a..b63cf39 100644
--- a/yarGen.py
+++ b/yarGen.py
@@ -589,7 +589,7 @@ def filter_string_set(string_set):
             goodstring = True
             goodcount = good_strings_db[string]
             # print "%s - %s" % ( goodstring, good_strings[string] )
-            if args.excludegood:
+            if args.excludegood and string in reversedStrings:
                 continue
 
         # UTF

From ef21e90daadc4ebdb051608345de881a50ffa64c Mon Sep 17 00:00:00 2001
From: HydraDragonAntivirus
 <142328963+HydraDragonAntivirus@users.noreply.github.com>
Date: Sat, 8 Jun 2024 13:37:07 +0300
Subject: [PATCH 2/7] reversed strings

---
 yarGen.py | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/yarGen.py b/yarGen.py
index b63cf39..250e985 100644
--- a/yarGen.py
+++ b/yarGen.py
@@ -585,11 +585,11 @@ def filter_string_set(string_set):
         goodcount = 0
 
         # Goodware Strings
-        if string in good_strings_db:
+        if string in good_strings_db and string in reversedStrings:
             goodstring = True
             goodcount = good_strings_db[string]
             # print "%s - %s" % ( goodstring, good_strings[string] )
-            if args.excludegood and string in reversedStrings:
+            if args.excludegood:
                 continue
 
         # UTF

From 0c1e7d1725f01a892eebbe501a74d5a7e81e487a Mon Sep 17 00:00:00 2001
From: HydraDragonAntivirus
 <142328963+HydraDragonAntivirus@users.noreply.github.com>
Date: Sat, 8 Jun 2024 14:10:14 +0300
Subject: [PATCH 3/7] No more reversed strings

---
 yarGen.py | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/yarGen.py b/yarGen.py
index 250e985..bc4b3c0 100644
--- a/yarGen.py
+++ b/yarGen.py
@@ -585,7 +585,7 @@ def filter_string_set(string_set):
         goodcount = 0
 
         # Goodware Strings
-        if string in good_strings_db and string in reversedStrings:
+        if string in good_strings_db:
             goodstring = True
             goodcount = good_strings_db[string]
             # print "%s - %s" % ( goodstring, good_strings[string] )
@@ -944,6 +944,7 @@ def filter_string_set(string_set):
 
             # Reversed String -----------------------------------------------------
             if string[::-1] in good_strings_db:
+              if not args.excludegood:
                 localStringScores[string] += 10
                 reversedStrings[string] = string[::-1]
 

From a7e297c87c9377690e1e1cd59d66474959ee0749 Mon Sep 17 00:00:00 2001
From: HydraDragonAntivirus
 <142328963+HydraDragonAntivirus@users.noreply.github.com>
Date: Wed, 6 Nov 2024 14:48:12 +0300
Subject: [PATCH 4/7] Only meaningful words

---
 requirements.txt |  3 ++-
 yarGen.py        | 31 +++++++++++++++++++++++++++++++
 2 files changed, 33 insertions(+), 1 deletion(-)

diff --git a/requirements.txt b/requirements.txt
index a0e8d75..a725888 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -1,2 +1,3 @@
 lief
-lxml
\ No newline at end of file
+lxml
+nltk
\ No newline at end of file
diff --git a/yarGen.py b/yarGen.py
index bc4b3c0..2cc09d1 100644
--- a/yarGen.py
+++ b/yarGen.py
@@ -29,6 +29,21 @@
 from hashlib import sha256
 import signal as signal_module
 from lxml import etree
+import nltk
+from nltk.corpus import words
+from nltk.tokenize import word_tokenize
+
+# Ensure that necessary NLTK resources are available
+nltk.download('punkt')
+nltk.download('words')
+
+# A simple filter function to consider only meaningful words (ignoring non-English or arbitrary symbols)
+def filter_meaningful_words(word_list):
+    # Only allow alphabetic, lowercase words
+    return [word for word in word_list if word.isalpha() and word.islower()]
+
+# Load NLTK word corpus
+nltk_words = set(words.words())
 
 RELEVANT_EXTENSIONS = [".asp", ".vbs", ".ps", ".ps1", ".tmp", ".bas", ".bat", ".cmd", ".com", ".cpl",
                        ".crt", ".dll", ".exe", ".msc", ".scr", ".sys", ".vb", ".vbe", ".vbs", ".wsc",
@@ -579,6 +594,15 @@ def filter_string_set(string_set):
     utfstrings = []
 
     for string in string_set:
+        # Filter meaningful words based on the flag
+        if args.meaningful_words_only:
+            # Tokenize the string and check if it contains any meaningful word
+            tokens = word_tokenize(string)
+            contains_meaningful_word = any(word.lower() in nltk_words for word in tokens)
+            
+            # If no meaningful word is found, skip this string
+            if not contains_meaningful_word:
+                continue       
 
         # Goodware string marker
         goodstring = False
@@ -2139,6 +2163,9 @@ def print_welcome():
     group_inverse.add_argument('--nodirname', help=argparse.SUPPRESS, action='store_true', default=False)
     group_inverse.add_argument('--noscorefilter', help=argparse.SUPPRESS, action='store_true', default=False)
 
+    group_creation.add_argument('--meaningful-words-only', help='Only include strings containing meaningful words (default: False)',
+                             action='store_true', default=False)
+
     args = parser.parse_args()
 
     # Print Welcome
@@ -2162,6 +2189,10 @@ def print_welcome():
         print("[+] Updated databases - you can now start creating YARA rules")
         sys.exit(0)
 
+    # Check if the meaningful-words-only flag is set and handle accordingly
+    if args.meaningful_words_only:
+        print("[+] Only including strings containing meaningful words (non-trivial, dictionary-based).")
+
     # Typical input erros
     if args.m:
         if os.path.isfile(args.m):

From 42438a0885fbe1be391b07b772a3c9a897c21466 Mon Sep 17 00:00:00 2001
From: HydraDragonAntivirus
 <142328963+HydraDragonAntivirus@users.noreply.github.com>
Date: Thu, 7 Nov 2024 10:46:53 +0300
Subject: [PATCH 5/7] punkt_tab added to download list

---
 yarGen.py | 1 +
 1 file changed, 1 insertion(+)

diff --git a/yarGen.py b/yarGen.py
index 2cc09d1..533515f 100644
--- a/yarGen.py
+++ b/yarGen.py
@@ -35,6 +35,7 @@
 
 # Ensure that necessary NLTK resources are available
 nltk.download('punkt')
+nltk.download('punkt_tab')
 nltk.download('words')
 
 # A simple filter function to consider only meaningful words (ignoring non-English or arbitrary symbols)

From 772d244c4537f71d0652ab1fe3f95bb83a0b2697 Mon Sep 17 00:00:00 2001
From: HydraDragonAntivirus
 <142328963+HydraDragonAntivirus@users.noreply.github.com>
Date: Thu, 7 Nov 2024 11:45:28 +0300
Subject: [PATCH 6/7] minimum 4 characters

---
 yarGen.py | 6 ++++--
 1 file changed, 4 insertions(+), 2 deletions(-)

diff --git a/yarGen.py b/yarGen.py
index 533515f..5c458d5 100644
--- a/yarGen.py
+++ b/yarGen.py
@@ -599,11 +599,13 @@ def filter_string_set(string_set):
         if args.meaningful_words_only:
             # Tokenize the string and check if it contains any meaningful word
             tokens = word_tokenize(string)
-            contains_meaningful_word = any(word.lower() in nltk_words for word in tokens)
+            contains_meaningful_word = any(
+                word.lower() in nltk_words and len(word) >= 4 for word in tokens
+            )
             
             # If no meaningful word is found, skip this string
             if not contains_meaningful_word:
-                continue       
+                continue
 
         # Goodware string marker
         goodstring = False

From 6c92f26e7f3680d0f1a447bdd0a9ebbdaada2ef1 Mon Sep 17 00:00:00 2001
From: HydraDragonAntivirus
 <142328963+HydraDragonAntivirus@users.noreply.github.com>
Date: Thu, 7 Nov 2024 12:56:00 +0300
Subject: [PATCH 7/7] nltk download order

---
 yarGen.py | 5 +++--
 1 file changed, 3 insertions(+), 2 deletions(-)

diff --git a/yarGen.py b/yarGen.py
index 5c458d5..ddeaeae 100644
--- a/yarGen.py
+++ b/yarGen.py
@@ -30,14 +30,15 @@
 import signal as signal_module
 from lxml import etree
 import nltk
-from nltk.corpus import words
-from nltk.tokenize import word_tokenize
 
 # Ensure that necessary NLTK resources are available
 nltk.download('punkt')
 nltk.download('punkt_tab')
 nltk.download('words')
 
+from nltk.corpus import words
+from nltk.tokenize import word_tokenize
+
 # A simple filter function to consider only meaningful words (ignoring non-English or arbitrary symbols)
 def filter_meaningful_words(word_list):
     # Only allow alphabetic, lowercase words