-
Notifications
You must be signed in to change notification settings - Fork 4
/
genOEPFeatureVectors.py
100 lines (73 loc) · 4.4 KB
/
genOEPFeatureVectors.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
import os
import sys
import numpy as np
import pefile
import hashlib
# Builds one-hot "bytes around the entry point" feature vectors from raw PE
# samples. The script runs pefile directly on the raw samples; no
# mastiff-generated files are involved.
#
# Usage: python genOEPFeatureVectors.py <sample_directory>
#
# For every file found under <sample_directory> it appends:
#   * one row to FEATURE_MATRIX_PATH,
#   * the same row to FEATURE_VECTOR_DIR/<md5>.txt,
#   * the name "<md5>.txt" to OEP_NAMES_PATH.
#
# The code is written to run unchanged on Python 2.7 and Python 3.

WINDOW_SIZE = 255  # bytes sampled per file; also the number of one-hot slots per byte
LEAD_BYTES = 127   # how many of those bytes lie before the entry-point offset

OEP_NAMES_PATH = "/home/tarun/Desktop/BenignOEPNames"
FEATURE_MATRIX_PATH = "/home/tarun/Desktop/BenignOEPmatrix.txt"
FEATURE_VECTOR_DIR = "/home/tarun/Desktop/BenignOEP/"


def slice_entry_window(data, entry_point):
    """Return up to WINDOW_SIZE bytes of *data* starting LEAD_BYTES before *entry_point*.

    An empty byte string is returned when the window would start before the
    beginning of the file or at/after its end. The original script reached
    the same (empty) result for a negative start by accident, because
    ``file.read(negative)`` consumes the whole file; here it is explicit and
    never reads or allocates more than the window itself.
    """
    start = entry_point - LEAD_BYTES
    if start < 0:
        return b""
    return data[start:start + WINDOW_SIZE]


def encode_window(window):
    """Return the flattened one-hot encoding of *window* as a ', '-joined string.

    The result always holds WINDOW_SIZE * WINDOW_SIZE values: one group of
    WINDOW_SIZE slots per window position, with positions past the end of
    *window* left as all zeros.
    """
    matrix = np.zeros((WINDOW_SIZE, WINDOW_SIZE), dtype=int)
    # bytearray() yields integers on both Python 2 (str) and Python 3 (bytes).
    for position, value in enumerate(bytearray(window[:WINDOW_SIZE])):
        # NOTE(review): 256 possible byte values are mapped onto 255 slots, so
        # value 0 wraps to index -1 and shares the last slot with value 255.
        # Kept as-is because widening the slots would change the matrix layout
        # that existing output files already use.
        matrix[position, value - 1] = 1
    return ', '.join('%d' % cell for cell in matrix.ravel().tolist())


def build_feature_row(file_path, data, sample_hash):
    """Return the CSV feature row for one sample, printing a diagnostic if it is degenerate.

    *file_path* is parsed with pefile to find the entry point, *data* is the
    file's raw content, and *sample_hash* is only used in diagnostics. Files
    that pefile rejects with PEFormatError get an all-zero row.
    """
    try:
        entry_point = int(pefile.PE(file_path).OPTIONAL_HEADER.AddressOfEntryPoint)
    except pefile.PEFormatError:
        print("DOS Header magic not found. - All zeroes in file:  %s" % sample_hash)
        return encode_window(b"")

    # NOTE(review): AddressOfEntryPoint is an RVA, but it is used here as a raw
    # file offset, exactly as the original script did. Mapping it with
    # pe.get_offset_from_rva() would change the extracted features - confirm
    # with the dataset owner before switching.
    window = slice_entry_window(data, entry_point)
    if not window:
        print("OffsetValuesArray Length 0 in file:  %s" % sample_hash)
    elif len(window) < WINDOW_SIZE:
        print("OffsetValuesArray Length < 255 in file:  %s \t%d" % (sample_hash, len(window)))
    return encode_window(window)


def main(argv=None):
    """Walk the directory given as the first argument and append features for every file.

    *argv* defaults to ``sys.argv``. Exits with a usage message when the
    directory argument is missing.
    """
    argv = sys.argv if argv is None else argv
    if len(argv) < 2:
        sys.exit("usage: genOEPFeatureVectors.py <sample_directory>")

    walk_dir = argv[1]
    print('walk_dir = ' + walk_dir)
    # Resolve to an absolute path up front so every root yielded by os.walk is
    # absolute too, even if the working directory changes later.
    walk_dir = os.path.abspath(walk_dir)
    print('walk_dir (absolute) = ' + walk_dir)

    cnt = 0
    with open(OEP_NAMES_PATH, 'a') as oep_names, \
            open(FEATURE_MATRIX_PATH, 'a') as feature_matrix:
        for root, _subdirs, files in os.walk(walk_dir):
            for sample_name in files:
                file_path = os.path.join(root, sample_name)
                with open(file_path, 'rb') as sample:
                    data = sample.read()
                # MD5 only names the sample; it is not used for security.
                sample_hash = hashlib.md5(data).hexdigest()
                cnt += 1

                # NOTE(review): each row ends with a NUL character before the
                # newline; preserved because existing output uses this format.
                row = build_feature_row(file_path, data, sample_hash) + '\0' + '\n'

                with open(FEATURE_VECTOR_DIR + sample_hash + ".txt", 'a') as feature_vector:
                    feature_vector.write(row)
                feature_matrix.write(row)
                oep_names.write(sample_hash + ".txt \n")  # add filename to list of files processed

    print('\n Count: %d' % cnt)  # print number of files processed


if __name__ == "__main__":
    main()