No more reversed goodware strings #53

Open · wants to merge 7 commits into master
3 changes: 2 additions & 1 deletion requirements.txt
@@ -1,2 +1,3 @@
 lief
-lxml
+lxml
+nltk
36 changes: 36 additions & 0 deletions yarGen.py
@@ -29,6 +29,23 @@
 from hashlib import sha256
 import signal as signal_module
 from lxml import etree
+import nltk
+
+# Ensure that necessary NLTK resources are available
+nltk.download('punkt')
+nltk.download('punkt_tab')
+nltk.download('words')
+
+from nltk.corpus import words
+from nltk.tokenize import word_tokenize
+
+# A simple filter function to consider only meaningful words (ignoring non-English or arbitrary symbols)
+def filter_meaningful_words(word_list):
+    # Only allow alphabetic, lowercase words
+    return [word for word in word_list if word.isalpha() and word.islower()]
+
+# Load NLTK word corpus
+nltk_words = set(words.words())
 
 RELEVANT_EXTENSIONS = [".asp", ".vbs", ".ps", ".ps1", ".tmp", ".bas", ".bat", ".cmd", ".com", ".cpl",
                        ".crt", ".dll", ".exe", ".msc", ".scr", ".sys", ".vb", ".vbe", ".vbs", ".wsc",
@@ -579,6 +596,17 @@ def filter_string_set(string_set):
     utfstrings = []
 
     for string in string_set:
+        # Filter meaningful words based on the flag
+        if args.meaningful_words_only:
+            # Tokenize the string and check if it contains any meaningful word
+            tokens = word_tokenize(string)
+            contains_meaningful_word = any(
+                word.lower() in nltk_words and len(word) >= 4 for word in tokens
+            )
+
+            # If no meaningful word is found, skip this string
+            if not contains_meaningful_word:
+                continue
 
         # Goodware string marker
         goodstring = False
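
For reference, here is a small standalone sketch (not part of the diff) of the check the new block performs for each string: tokenize it and keep it only if some token of at least four characters appears in the NLTK word list. The sample strings and expected outputs are illustrative assumptions.

# Standalone sketch of the per-string check added above (illustrative only).
import nltk
nltk.download('punkt', quiet=True)
nltk.download('punkt_tab', quiet=True)
nltk.download('words', quiet=True)

from nltk.corpus import words
from nltk.tokenize import word_tokenize

nltk_words = set(words.words())

def contains_meaningful_word(s, min_len=4):
    # True if any token is a dictionary word of at least min_len characters
    return any(t.lower() in nltk_words and len(t) >= min_len
               for t in word_tokenize(s))

print(contains_meaningful_word("Connection to remote host failed"))  # expected True
print(contains_meaningful_word("q0xZ!!93aa"))                        # expected False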
@@ -944,6 +972,7 @@ def filter_string_set(string_set):
 
         # Reversed String -----------------------------------------------------
         if string[::-1] in good_strings_db:
-            localStringScores[string] += 10
-            reversedStrings[string] = string[::-1]
+            if not args.excludegood:
+                localStringScores[string] += 10
+                reversedStrings[string] = string[::-1]
 
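A toy illustration of the behavioural change in this hunk (the database content and variable names below are made up): a string whose reversal appears in the goodware string DB now receives the +10 bump only when goodware strings are not being excluded.

# Toy illustration of the guarded bonus; good_strings_db content is made up.
good_strings_db = {"kernel32.dll"}
local_string_scores = {"lld.23lenrek": 0}
exclude_good = True  # corresponds to running yarGen with --excludegood

for s in local_string_scores:
    if s[::-1] in good_strings_db:
        if not exclude_good:
            local_string_scores[s] += 10

print(local_string_scores)  # {'lld.23lenrek': 0} - no bonus while excluding goodware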
@@ -2138,6 +2167,9 @@ def print_welcome():
     group_inverse.add_argument('--nodirname', help=argparse.SUPPRESS, action='store_true', default=False)
     group_inverse.add_argument('--noscorefilter', help=argparse.SUPPRESS, action='store_true', default=False)
 
+    group_creation.add_argument('--meaningful-words-only', help='Only include strings containing meaningful words (default: False)',
+                                action='store_true', default=False)
+
     args = parser.parse_args()
 
     # Print Welcome
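
Because the new option name contains dashes, argparse exposes it as args.meaningful_words_only, which is the attribute the earlier hunks read. A minimal standalone check:

# Minimal check: argparse turns '--meaningful-words-only' into the
# attribute 'meaningful_words_only' on the parsed namespace.
import argparse

parser = argparse.ArgumentParser()
parser.add_argument('--meaningful-words-only', action='store_true', default=False)

args = parser.parse_args(['--meaningful-words-only'])
print(args.meaningful_words_only)  # True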
@@ -2161,6 +2193,10 @@ def print_welcome():
         print("[+] Updated databases - you can now start creating YARA rules")
         sys.exit(0)
 
+    # Check if the meaningful-words-only flag is set and handle accordingly
+    if args.meaningful_words_only:
+        print("[+] Only including strings containing meaningful words (non-trivial, dictionary-based).")
+
     # Typical input erros
     if args.m:
         if os.path.isfile(args.m):
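
A hypothetical invocation that combines the new flag with the existing goodware exclusion (the sample directory path is an assumption, not taken from the PR):

python yarGen.py -m ./malware_samples --excludegood --meaningful-words-only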