diff --git a/floss/language/rust/decode_utf8.py b/floss/language/rust/decode_utf8.py index d2e8253ea..3637e97de 100644 --- a/floss/language/rust/decode_utf8.py +++ b/floss/language/rust/decode_utf8.py @@ -1,14 +1,16 @@ # Copyright (C) 2023 Mandiant, Inc. All Rights Reserved. -import pefile +import sys import logging -import argparse import pathlib -import sys +import argparse + +import pefile MIN_STR_LEN = 4 logger = logging.getLogger(__name__) + def get_rdata_section(pe: pefile.PE) -> pefile.SectionStructure: for section in pe.sections: if section.Name.startswith(b".rdata\x00"): @@ -16,7 +18,11 @@ def get_rdata_section(pe: pefile.PE) -> pefile.SectionStructure: raise ValueError("no .rdata section found") -def extract_utf8_strings(pe, min_length=MIN_STR_LEN): + +def extract_utf8_strings(pe: pefile.PE, min_length=MIN_STR_LEN): + """ + Extracts UTF-8 strings from the .rdata section of a PE file. + """ try: rdata_section = get_rdata_section(pe) except ValueError as e: @@ -27,7 +33,7 @@ def extract_utf8_strings(pe, min_length=MIN_STR_LEN): character_and_index = [] - # Reference: https://en.wikipedia.org/wiki/UTF-8 + # Reference: https://en.wikipedia.org/wiki/UTF-8 for i in range(0, len(strings)): # for 1 byte @@ -37,34 +43,36 @@ def extract_utf8_strings(pe, min_length=MIN_STR_LEN): # for 2 bytes elif strings[i] & 0xE0 == 0xC0: - temp = strings[i] << 8 | strings[i+1] + temp = strings[i] << 8 | strings[i + 1] character = temp.to_bytes(2, "big").decode("utf-8", "ignore") i += 1 character_and_index.append([character, i, 2]) # for 3 bytes elif strings[i] & 0xF0 == 0xE0: - temp = strings[i] << 16 | strings[i+1] << 8 | strings[i+2] + temp = strings[i] << 16 | strings[i + 1] << 8 | strings[i + 2] character = temp.to_bytes(3, "big").decode("utf-8", "ignore") i += 2 character_and_index.append([character, i, 3]) # for 4 bytes elif strings[i] & 0xF8 == 0xF0: - temp = strings[i] << 24 | strings[i+1] << 16 | strings[i+2] << 8 | strings[i+3] + temp = strings[i] << 24 | strings[i + 1] << 16 | strings[i + 2] << 8 | strings[i + 3] character = temp.to_bytes(4, "big").decode("utf-8", "ignore") i += 3 character_and_index.append([character, i, 4]) - - strings = [] # string, start index, end index + strings = [] # string, start index, end index # check for consecutive characters and convert to string for i in range(0, len(character_and_index)): if i == 0: strings.append([character_and_index[i][0], character_and_index[i][1], character_and_index[i][1]]) else: - if character_and_index[i-1][1] + character_and_index[i-1][2] == character_and_index[i][1] and character_and_index[i][0].isprintable() == True: + if ( + character_and_index[i - 1][1] + character_and_index[i - 1][2] == character_and_index[i][1] + and character_and_index[i][0].isprintable() == True + ): strings[-1][0] += character_and_index[i][0] strings[-1][2] = character_and_index[i][1] else: @@ -102,4 +110,4 @@ def main(argv=None): if __name__ == "__main__": - sys.exit(main()) \ No newline at end of file + sys.exit(main())