-
Notifications
You must be signed in to change notification settings - Fork 4
/
OLD_genOEPFeatureVectors.py
96 lines (70 loc) · 4.46 KB
/
OLD_genOEPFeatureVectors.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
import os
import sys
import re
import numpy as np
# Build one-hot feature vectors from the bytes surrounding each sample's
# original entry point (OEP).
#
# Usage: python OLD_genOEPFeatureVectors.py <walk_dir>
#
# <walk_dir> is searched recursively for "peinfo-full.txt" reports. The name
# of the directory holding a report is taken as the sample hash, and the raw
# binary is looked up under RAW_BINARY_PREFIX + <hash>.

# Output locations; all of them are opened in append mode, so repeated runs
# accumulate rows instead of overwriting earlier results.
OEP_NAMES_PATH = "/home/tarun/Desktop/OEPNames"
FEATURE_MATRIX_PATH = "/home/tarun/Desktop/NEWfinalOEPmatrix.txt"
FEATURE_VECTOR_DIR = "/home/tarun/Desktop/OEP/"
RAW_BINARY_PREFIX = "/home/tarun/Documents/extracted VirusShare_00189/raw/VirusShare_"
PEINFO_FILENAME = "peinfo-full.txt"

# The window covers byte offsets [-127, +127] around the entry point.
WINDOW_RADIUS = 127
WINDOW_SIZE = 2 * WINDOW_RADIUS + 1  # 255 bytes
# Each byte becomes SLOT_WIDTH binary features: value v in 1..255 sets
# feature v - 1, and value 0 sets none.
SLOT_WIDTH = 255
ZERO_SLOT_CSV = ', '.join(['0'] * SLOT_WIDTH)

# Applied to the lower-cased report text, hence the lower-case key.
OEP_LINE_RE = re.compile(r"\baddressofentrypoint:.*\b")


def one_hot_csv(byte_value):
    """Return the CSV text of the one-hot slot for a single byte value.

    byte_value is an int in [0, 255]. Values 1..255 set exactly one of the
    SLOT_WIDTH features; 0 yields an all-zero slot. (Indexing with
    ``byte_value - 1`` unconditionally would send 0 to index -1, i.e. the
    same feature as 255, making the two bytes indistinguishable.)
    """
    slot = np.zeros((SLOT_WIDTH,), dtype=int)
    if byte_value > 0:
        slot[byte_value - 1] = 1
    return ', '.join(['%d' % bit for bit in slot])


def encode_window(window_bytes, lead_pad=0):
    """Return exactly WINDOW_SIZE CSV slots for the bytes of a window.

    lead_pad all-zero slots are placed first (window positions that fall
    before the start of the file); all-zero slots are appended after the
    encoded bytes when the file ends before the window does. Every row
    therefore has the same width, and slot i always describes offset
    ``i - WINDOW_RADIUS`` relative to the entry point.
    """
    slots = [ZERO_SLOT_CSV] * lead_pad
    # bytearray() yields ints on both Python 2 and Python 3.
    slots.extend(one_hot_csv(value) for value in bytearray(window_bytes))
    slots.extend([ZERO_SLOT_CSV] * (WINDOW_SIZE - len(slots)))
    return slots


def read_oep_window(binary, entry_point):
    """Read the window around entry_point from a seekable binary stream.

    Returns ``(slots, bytes_read)`` where slots is the list produced by
    encode_window() and bytes_read is the number of bytes actually found
    inside the window.

    Raises OverflowError when the offset is too large for the platform's
    seek().
    """
    start = entry_point - WINDOW_RADIUS
    # A negative start must not be passed on as a size or offset: read(-n)
    # would consume the whole file and leave nothing for the window.
    lead_pad = min(max(-start, 0), WINDOW_SIZE)
    binary.seek(max(start, 0))
    window = binary.read(WINDOW_SIZE - lead_pad)
    return encode_window(window, lead_pad), len(window)


def main():
    """Walk the directory given as argv[1] and append one feature row per sample."""
    if len(sys.argv) < 2:
        sys.exit('usage: %s <walk_dir>' % sys.argv[0])

    walk_dir = sys.argv[1]
    print('walk_dir = ' + walk_dir)
    # Resolve to an absolute path immediately so that every root yielded by
    # os.walk is absolute even if the working directory changes later.
    walk_dir = os.path.abspath(walk_dir)
    print('walk_dir (absolute) = ' + os.path.abspath(walk_dir))

    cnt = 0
    with open(OEP_NAMES_PATH, 'a') as oep_names, \
            open(FEATURE_MATRIX_PATH, 'a') as feature_matrix:
        for root, subdirs, files in os.walk(walk_dir):
            if PEINFO_FILENAME not in files:
                continue
            vx_hash = os.path.basename(os.path.normpath(root))

            with open(os.path.join(root, PEINFO_FILENAME)) as report:
                lines = OEP_LINE_RE.findall(report.read().lower())
            if not lines:
                continue
            cnt += 1

            oep_vector = []
            for line in lines:
                # First token after the colon, with or without a space
                # between the key and the value.
                oep = line.split(':', 1)[1].split()[0]
                # NOTE(review): in the PE format AddressOfEntryPoint is an
                # RVA, not a raw file offset. It is used directly as a file
                # offset here, as before -- confirm that this is what the
                # peinfo report provides.
                entry_point = int(oep, 16)
                with open(RAW_BINARY_PREFIX + vx_hash, 'rb') as binary:
                    try:
                        slots, found = read_oep_window(binary, entry_point)
                    except OverflowError:
                        print('Starting address int too large to convert '
                              'to C long in file:  %s' % vx_hash)
                        slots, found = encode_window(b''), None
                if found == 0:
                    print('OffsetValuesArray Length 0 in file:  %s' % vx_hash)
                elif found is not None and found < WINDOW_SIZE:
                    print('%s \t %d' % (vx_hash, found))
                oep_vector.extend(slots)

            # One row per sample: its own vector file plus the shared matrix.
            row = ', '.join(oep_vector) + '\0' + '\n'
            with open(FEATURE_VECTOR_DIR + vx_hash + ".txt", 'a') as feature_vector:
                feature_vector.write(row)
            feature_matrix.write(row)
            # Record the vector file name in the list of processed samples.
            oep_names.write(vx_hash + ".txt \n")

    print('\n Count: %d' % cnt)  # number of samples processed


if __name__ == "__main__":
    main()