Skip to content

Commit

Permalink
add stuff
Browse files Browse the repository at this point in the history
  • Loading branch information
Te-k committed Jul 16, 2016
1 parent 19c1e1c commit 7a7188e
Show file tree
Hide file tree
Showing 206 changed files with 285 additions and 0 deletions.
1 change: 1 addition & 0 deletions .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -2,6 +2,7 @@
__pycache__/
*.py[cod]
*$py.class
*.swp

# C extensions
*.so
Expand Down
189 changes: 189 additions & 0 deletions checkpe.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,189 @@
#! /usr/bin/python2
import pefile
import os
import array
import math
import pickle
from sklearn.externals import joblib
import sys
import argparse

def get_entropy(data):
    """Return the Shannon entropy (base 2) of a sequence of byte values.

    Each element of ``data`` is either an integer or a one-character
    string; strings are converted with ``ord``. An empty input yields 0.0.
    """
    total = len(data)
    if total == 0:
        return 0.0

    # One counter slot per possible byte value (0-255).
    counts = array.array('L', [0]*256)
    for item in data:
        slot = item if isinstance(item, int) else ord(item)
        counts[slot] += 1

    entropy = 0
    for count in counts:
        if not count:
            # Values that never occur contribute nothing to the sum.
            continue
        probability = float(count) / total
        entropy -= probability*math.log(probability, 2)

    return entropy

def get_resources(pe):
    """Extract resources :
    [entropy, size]

    Walks the three levels of the resource directory (type, id, language)
    and returns one ``[entropy, size]`` pair per language-level entry.
    If anything raises while walking, the pairs collected so far are
    returned.
    """
    found = []
    if not hasattr(pe, 'DIRECTORY_ENTRY_RESOURCE'):
        return found

    try:
        for type_entry in pe.DIRECTORY_ENTRY_RESOURCE.entries:
            if not hasattr(type_entry, 'directory'):
                continue
            for id_entry in type_entry.directory.entries:
                if not hasattr(id_entry, 'directory'):
                    continue
                for lang_entry in id_entry.directory.entries:
                    blob = pe.get_data(
                        lang_entry.data.struct.OffsetToData,
                        lang_entry.data.struct.Size)
                    blob_size = lang_entry.data.struct.Size
                    found.append([get_entropy(blob), blob_size])
    except Exception:
        # Best effort: keep whatever was gathered before the failure.
        return found
    return found

def get_version_info(pe):
    """Return version infos

    Builds a dict from three places of the parsed PE object ``pe``:

    * every key/value pair of each ``StringTable`` found in a
      ``StringFileInfo`` entry of ``pe.FileInfo``;
    * the first key/value pair of each ``Var`` found in a ``VarFileInfo``
      entry of ``pe.FileInfo``;
    * selected ``pe.VS_FIXEDFILEINFO`` fields, stored under the keys
      ``flags``, ``os``, ``type``, ``file_version``, ``product_version``,
      ``signature`` and ``struct_version`` (only when that attribute exists).

    Raises AttributeError when ``pe`` has no ``FileInfo`` attribute and
    IndexError when a ``Var`` entry is empty.
    """
    res = {}
    for fileinfo in pe.FileInfo:
        if fileinfo.Key == 'StringFileInfo':
            for st in fileinfo.StringTable:
                for key, value in st.entries.items():
                    res[key] = value
        if fileinfo.Key == 'VarFileInfo':
            for var in fileinfo.Var:
                # Materialise items() once: indexing it directly fails where
                # items() returns a view instead of a list.
                key, value = list(var.entry.items())[0]
                res[key] = value
    if hasattr(pe, 'VS_FIXEDFILEINFO'):
        fixed = pe.VS_FIXEDFILEINFO
        res['flags'] = fixed.FileFlags
        res['os'] = fixed.FileOS
        res['type'] = fixed.FileType
        res['file_version'] = fixed.FileVersionLS
        res['product_version'] = fixed.ProductVersionLS
        res['signature'] = fixed.Signature
        res['struct_version'] = fixed.StrucVersion
    return res

def _mean_min_max(values):
    """Return ``(mean, min, max)`` of a list, or three zeros when it is empty."""
    if not values:
        return 0, 0, 0
    return sum(values)/float(len(values)), min(values), max(values)


def extract_infos(fpath):
    """Parse the PE file at ``fpath`` and return its features as a dict.

    The dict maps feature names to numbers: raw FILE_HEADER and
    OPTIONAL_HEADER fields, then statistics about sections, imports,
    exports, resources, the load configuration and the version
    information. Parts that the file does not have are reported as 0.

    Exceptions raised by ``pefile.PE`` while opening or parsing the file
    are not caught here and propagate to the caller.
    """
    res = {}
    pe = pefile.PE(fpath)

    file_header = pe.FILE_HEADER
    for field in ('Machine', 'SizeOfOptionalHeader', 'Characteristics'):
        res[field] = getattr(file_header, field)

    optional_header = pe.OPTIONAL_HEADER
    for field in (
            'MajorLinkerVersion', 'MinorLinkerVersion', 'SizeOfCode',
            'SizeOfInitializedData', 'SizeOfUninitializedData',
            'AddressOfEntryPoint', 'BaseOfCode', 'ImageBase',
            'SectionAlignment', 'FileAlignment',
            'MajorOperatingSystemVersion', 'MinorOperatingSystemVersion',
            'MajorImageVersion', 'MinorImageVersion',
            'MajorSubsystemVersion', 'MinorSubsystemVersion',
            'SizeOfImage', 'SizeOfHeaders', 'CheckSum', 'Subsystem',
            'DllCharacteristics', 'SizeOfStackReserve', 'SizeOfStackCommit',
            # SizeOfHeapCommit used to be stored under 'SizeOfStackCommit',
            # overwriting the stack value and leaving this key missing.
            'SizeOfHeapReserve', 'SizeOfHeapCommit',
            'LoaderFlags', 'NumberOfRvaAndSizes'):
        res[field] = getattr(optional_header, field)
    # The optional header does not always carry BaseOfData; fall back to 0.
    res['BaseOfData'] = getattr(optional_header, 'BaseOfData', 0)

    # Sections (a file without any section yields zeros instead of crashing)
    sections = pe.sections
    res['SectionsNb'] = len(sections)
    mean, lowest, highest = _mean_min_max([s.get_entropy() for s in sections])
    res['SectionsMeanEntropy'] = mean
    res['SectionsMinEntropy'] = lowest
    res['SectionsMaxEntropy'] = highest
    mean, lowest, highest = _mean_min_max([s.SizeOfRawData for s in sections])
    res['SectionsMeanRawsize'] = mean
    res['SectionsMinRawsize'] = lowest
    res['SectionsMaxRawsize'] = highest
    mean, lowest, highest = _mean_min_max([s.Misc_VirtualSize for s in sections])
    res['SectionsMeanVirtualsize'] = mean
    res['SectionsMinVirtualsize'] = lowest
    # NOTE(review): this key lacks the 's' of its siblings ('Section' vs
    # 'Sections'). It is kept as is because features are looked up by exact
    # name; confirm against the training columns before renaming.
    res['SectionMaxVirtualsize'] = highest

    # Imports
    try:
        import_entries = pe.DIRECTORY_ENTRY_IMPORT
        imports = [imp for entry in import_entries for imp in entry.imports]
        nb_ordinal = len([imp for imp in imports if imp.name is None])
        res['ImportsNbDLL'] = len(import_entries)
        res['ImportsNb'] = len(imports)
        res['ImportsNbOrdinal'] = nb_ordinal
    except AttributeError:
        # No import directory
        res['ImportsNbDLL'] = 0
        res['ImportsNb'] = 0
        res['ImportsNbOrdinal'] = 0

    # Exports
    try:
        res['ExportNb'] = len(pe.DIRECTORY_ENTRY_EXPORT.symbols)
    except AttributeError:
        # No export
        res['ExportNb'] = 0

    # Resources: each item is an [entropy, size] pair
    resources = get_resources(pe)
    res['ResourcesNb'] = len(resources)
    mean, lowest, highest = _mean_min_max([r[0] for r in resources])
    res['ResourcesMeanEntropy'] = mean
    res['ResourcesMinEntropy'] = lowest
    res['ResourcesMaxEntropy'] = highest
    mean, lowest, highest = _mean_min_max([r[1] for r in resources])
    res['ResourcesMeanSize'] = mean
    res['ResourcesMinSize'] = lowest
    res['ResourcesMaxSize'] = highest

    # Load configuration size
    try:
        res['LoadConfigurationSize'] = pe.DIRECTORY_ENTRY_LOAD_CONFIG.struct.Size
    except AttributeError:
        res['LoadConfigurationSize'] = 0

    # Version information size: number of entries that could be collected
    try:
        res['VersionInformationSize'] = len(get_version_info(pe))
    except AttributeError:
        res['VersionInformationSize'] = 0
    return res

if __name__ == '__main__':
    # Command line entry point: classify one PE file with the saved model.
    parser = argparse.ArgumentParser(description='Detect malicious files')
    parser.add_argument('FILE', help='File to be tested')
    args = parser.parse_args()

    # Load classifier and the ordered list of feature names it expects.
    # NOTE: both files are pickles and unpickling can run arbitrary code,
    # so only use a classifier directory that you trust.
    clf = joblib.load('classifier/classifier.pkl')
    with open('classifier/features.pkl', 'r') as features_file:
        features = pickle.loads(features_file.read())

    data = extract_infos(args.FILE)

    # Build the feature vector in the order stored in features.pkl.
    pe_features = [data[name] for name in features]

    res = clf.predict([pe_features])[0]
    # Use the parsed argument rather than sys.argv[1], which is not the
    # file name when the command line starts with e.g. '--'.
    print('The file %s is %s' % (
        os.path.basename(args.FILE),
        ['malicious', 'legitimate'][res])
    )
Binary file added classifier/classifier.pkl
Binary file not shown.
Binary file added classifier/classifier.pkl_01.npy
Binary file not shown.
Binary file added classifier/classifier.pkl_02.npy
Binary file not shown.
Binary file added classifier/classifier.pkl_03.npy
Binary file not shown.
Binary file added classifier/classifier.pkl_04.npy
Binary file not shown.
Binary file added classifier/classifier.pkl_05.npy
Binary file not shown.
Binary file added classifier/classifier.pkl_06.npy
Binary file not shown.
Binary file added classifier/classifier.pkl_07.npy
Binary file not shown.
Binary file added classifier/classifier.pkl_08.npy
Binary file not shown.
Binary file added classifier/classifier.pkl_09.npy
Binary file not shown.
Binary file added classifier/classifier.pkl_10.npy
Binary file not shown.
Binary file added classifier/classifier.pkl_100.npy
Binary file not shown.
Binary file added classifier/classifier.pkl_101.npy
Binary file not shown.
Binary file added classifier/classifier.pkl_102.npy
Binary file not shown.
Binary file added classifier/classifier.pkl_103.npy
Binary file not shown.
Binary file added classifier/classifier.pkl_104.npy
Binary file not shown.
Binary file added classifier/classifier.pkl_105.npy
Binary file not shown.
Binary file added classifier/classifier.pkl_106.npy
Binary file not shown.
Binary file added classifier/classifier.pkl_107.npy
Binary file not shown.
Binary file added classifier/classifier.pkl_108.npy
Binary file not shown.
Binary file added classifier/classifier.pkl_109.npy
Binary file not shown.
Binary file added classifier/classifier.pkl_11.npy
Binary file not shown.
Binary file added classifier/classifier.pkl_110.npy
Binary file not shown.
Binary file added classifier/classifier.pkl_111.npy
Binary file not shown.
Binary file added classifier/classifier.pkl_112.npy
Binary file not shown.
Binary file added classifier/classifier.pkl_113.npy
Binary file not shown.
Binary file added classifier/classifier.pkl_114.npy
Binary file not shown.
Binary file added classifier/classifier.pkl_115.npy
Binary file not shown.
Binary file added classifier/classifier.pkl_116.npy
Binary file not shown.
Binary file added classifier/classifier.pkl_117.npy
Binary file not shown.
Binary file added classifier/classifier.pkl_118.npy
Binary file not shown.
Binary file added classifier/classifier.pkl_119.npy
Binary file not shown.
Binary file added classifier/classifier.pkl_12.npy
Binary file not shown.
Binary file added classifier/classifier.pkl_120.npy
Binary file not shown.
Binary file added classifier/classifier.pkl_121.npy
Binary file not shown.
Binary file added classifier/classifier.pkl_122.npy
Binary file not shown.
Binary file added classifier/classifier.pkl_123.npy
Binary file not shown.
Binary file added classifier/classifier.pkl_124.npy
Binary file not shown.
Binary file added classifier/classifier.pkl_125.npy
Binary file not shown.
Binary file added classifier/classifier.pkl_126.npy
Binary file not shown.
Binary file added classifier/classifier.pkl_127.npy
Binary file not shown.
Binary file added classifier/classifier.pkl_128.npy
Binary file not shown.
Binary file added classifier/classifier.pkl_129.npy
Binary file not shown.
Binary file added classifier/classifier.pkl_13.npy
Binary file not shown.
Binary file added classifier/classifier.pkl_130.npy
Binary file not shown.
Binary file added classifier/classifier.pkl_131.npy
Binary file not shown.
Binary file added classifier/classifier.pkl_132.npy
Binary file not shown.
Binary file added classifier/classifier.pkl_133.npy
Binary file not shown.
Binary file added classifier/classifier.pkl_134.npy
Binary file not shown.
Binary file added classifier/classifier.pkl_135.npy
Binary file not shown.
Binary file added classifier/classifier.pkl_136.npy
Binary file not shown.
Binary file added classifier/classifier.pkl_137.npy
Binary file not shown.
Binary file added classifier/classifier.pkl_138.npy
Binary file not shown.
Binary file added classifier/classifier.pkl_139.npy
Binary file not shown.
Binary file added classifier/classifier.pkl_14.npy
Binary file not shown.
Binary file added classifier/classifier.pkl_140.npy
Binary file not shown.
Binary file added classifier/classifier.pkl_141.npy
Binary file not shown.
Binary file added classifier/classifier.pkl_142.npy
Binary file not shown.
Binary file added classifier/classifier.pkl_143.npy
Binary file not shown.
Binary file added classifier/classifier.pkl_144.npy
Binary file not shown.
Binary file added classifier/classifier.pkl_145.npy
Binary file not shown.
Binary file added classifier/classifier.pkl_146.npy
Binary file not shown.
Binary file added classifier/classifier.pkl_147.npy
Binary file not shown.
Binary file added classifier/classifier.pkl_148.npy
Binary file not shown.
Binary file added classifier/classifier.pkl_149.npy
Binary file not shown.
Binary file added classifier/classifier.pkl_15.npy
Binary file not shown.
Binary file added classifier/classifier.pkl_150.npy
Binary file not shown.
Binary file added classifier/classifier.pkl_151.npy
Binary file not shown.
Binary file added classifier/classifier.pkl_152.npy
Binary file not shown.
Binary file added classifier/classifier.pkl_153.npy
Binary file not shown.
Binary file added classifier/classifier.pkl_154.npy
Binary file not shown.
Binary file added classifier/classifier.pkl_155.npy
Binary file not shown.
Binary file added classifier/classifier.pkl_156.npy
Binary file not shown.
Binary file added classifier/classifier.pkl_157.npy
Binary file not shown.
Binary file added classifier/classifier.pkl_158.npy
Binary file not shown.
Binary file added classifier/classifier.pkl_159.npy
Binary file not shown.
Binary file added classifier/classifier.pkl_16.npy
Binary file not shown.
Binary file added classifier/classifier.pkl_160.npy
Binary file not shown.
Binary file added classifier/classifier.pkl_161.npy
Binary file not shown.
Binary file added classifier/classifier.pkl_162.npy
Binary file not shown.
Binary file added classifier/classifier.pkl_163.npy
Binary file not shown.
Binary file added classifier/classifier.pkl_164.npy
Binary file not shown.
Binary file added classifier/classifier.pkl_165.npy
Binary file not shown.
Binary file added classifier/classifier.pkl_166.npy
Binary file not shown.
Binary file added classifier/classifier.pkl_167.npy
Binary file not shown.
Binary file added classifier/classifier.pkl_168.npy
Binary file not shown.
Binary file added classifier/classifier.pkl_169.npy
Binary file not shown.
Binary file added classifier/classifier.pkl_17.npy
Binary file not shown.
Binary file added classifier/classifier.pkl_170.npy
Binary file not shown.
Binary file added classifier/classifier.pkl_171.npy
Binary file not shown.
Binary file added classifier/classifier.pkl_172.npy
Binary file not shown.
Binary file added classifier/classifier.pkl_173.npy
Binary file not shown.
Binary file added classifier/classifier.pkl_174.npy
Binary file not shown.
Binary file added classifier/classifier.pkl_175.npy
Binary file not shown.
Binary file added classifier/classifier.pkl_176.npy
Binary file not shown.
Binary file added classifier/classifier.pkl_177.npy
Binary file not shown.
Binary file added classifier/classifier.pkl_178.npy
Binary file not shown.
Binary file added classifier/classifier.pkl_179.npy
Binary file not shown.
Binary file added classifier/classifier.pkl_18.npy
Binary file not shown.
Binary file added classifier/classifier.pkl_180.npy
Binary file not shown.
Binary file added classifier/classifier.pkl_181.npy
Binary file not shown.
Binary file added classifier/classifier.pkl_182.npy
Binary file not shown.
Binary file added classifier/classifier.pkl_183.npy
Binary file not shown.
Binary file added classifier/classifier.pkl_184.npy
Binary file not shown.
Binary file added classifier/classifier.pkl_185.npy
Binary file not shown.
Binary file added classifier/classifier.pkl_186.npy
Binary file not shown.
Binary file added classifier/classifier.pkl_187.npy
Binary file not shown.
Binary file added classifier/classifier.pkl_188.npy
Binary file not shown.
Binary file added classifier/classifier.pkl_189.npy
Binary file not shown.
Binary file added classifier/classifier.pkl_19.npy
Binary file not shown.
Binary file added classifier/classifier.pkl_190.npy
Binary file not shown.
Binary file added classifier/classifier.pkl_191.npy
Binary file not shown.
Binary file added classifier/classifier.pkl_192.npy
Binary file not shown.
Binary file added classifier/classifier.pkl_193.npy
Binary file not shown.
Binary file added classifier/classifier.pkl_194.npy
Binary file not shown.
Binary file added classifier/classifier.pkl_195.npy
Binary file not shown.
Binary file added classifier/classifier.pkl_196.npy
Binary file not shown.
Binary file added classifier/classifier.pkl_197.npy
Binary file not shown.
Binary file added classifier/classifier.pkl_198.npy
Binary file not shown.
Binary file added classifier/classifier.pkl_199.npy
Binary file not shown.
Binary file added classifier/classifier.pkl_20.npy
Binary file not shown.
Binary file added classifier/classifier.pkl_200.npy
Binary file not shown.
Binary file added classifier/classifier.pkl_201.npy
Binary file not shown.
Binary file added classifier/classifier.pkl_21.npy
Binary file not shown.
Binary file added classifier/classifier.pkl_22.npy
Binary file not shown.
Binary file added classifier/classifier.pkl_23.npy
Binary file not shown.
Binary file added classifier/classifier.pkl_24.npy
Binary file not shown.
Binary file added classifier/classifier.pkl_25.npy
Binary file not shown.
Binary file added classifier/classifier.pkl_26.npy
Binary file not shown.
Binary file added classifier/classifier.pkl_27.npy
Binary file not shown.
Binary file added classifier/classifier.pkl_28.npy
Binary file not shown.
Binary file added classifier/classifier.pkl_29.npy
Binary file not shown.
Binary file added classifier/classifier.pkl_30.npy
Binary file not shown.
Binary file added classifier/classifier.pkl_31.npy
Binary file not shown.
Binary file added classifier/classifier.pkl_32.npy
Binary file not shown.
Binary file added classifier/classifier.pkl_33.npy
Binary file not shown.
Binary file added classifier/classifier.pkl_34.npy
Binary file not shown.
Binary file added classifier/classifier.pkl_35.npy
Binary file not shown.
Binary file added classifier/classifier.pkl_36.npy
Binary file not shown.
Binary file added classifier/classifier.pkl_37.npy
Binary file not shown.
Binary file added classifier/classifier.pkl_38.npy
Binary file not shown.
Binary file added classifier/classifier.pkl_39.npy
Binary file not shown.
Binary file added classifier/classifier.pkl_40.npy
Binary file not shown.
Binary file added classifier/classifier.pkl_41.npy
Binary file not shown.
Binary file added classifier/classifier.pkl_42.npy
Binary file not shown.
Binary file added classifier/classifier.pkl_43.npy
Binary file not shown.
Binary file added classifier/classifier.pkl_44.npy
Binary file not shown.
Binary file added classifier/classifier.pkl_45.npy
Binary file not shown.
Binary file added classifier/classifier.pkl_46.npy
Binary file not shown.
Binary file added classifier/classifier.pkl_47.npy
Binary file not shown.
Binary file added classifier/classifier.pkl_48.npy
Binary file not shown.
Binary file added classifier/classifier.pkl_49.npy
Binary file not shown.
Binary file added classifier/classifier.pkl_50.npy
Binary file not shown.
Binary file added classifier/classifier.pkl_51.npy
Binary file not shown.
Binary file added classifier/classifier.pkl_52.npy
Binary file not shown.
Binary file added classifier/classifier.pkl_53.npy
Binary file not shown.
Binary file added classifier/classifier.pkl_54.npy
Binary file not shown.
Binary file added classifier/classifier.pkl_55.npy
Binary file not shown.
Binary file added classifier/classifier.pkl_56.npy
Binary file not shown.
Binary file added classifier/classifier.pkl_57.npy
Binary file not shown.
Binary file added classifier/classifier.pkl_58.npy
Binary file not shown.
Binary file added classifier/classifier.pkl_59.npy
Binary file not shown.
Binary file added classifier/classifier.pkl_60.npy
Binary file not shown.
Binary file added classifier/classifier.pkl_61.npy
Binary file not shown.
Binary file added classifier/classifier.pkl_62.npy
Binary file not shown.
Binary file added classifier/classifier.pkl_63.npy
Binary file not shown.
Binary file added classifier/classifier.pkl_64.npy
Binary file not shown.
Binary file added classifier/classifier.pkl_65.npy
Binary file not shown.
Binary file added classifier/classifier.pkl_66.npy
Binary file not shown.
Binary file added classifier/classifier.pkl_67.npy
Binary file not shown.
Binary file added classifier/classifier.pkl_68.npy
Binary file not shown.
Binary file added classifier/classifier.pkl_69.npy
Binary file not shown.
Binary file added classifier/classifier.pkl_70.npy
Binary file not shown.
Binary file added classifier/classifier.pkl_71.npy
Binary file not shown.
Binary file added classifier/classifier.pkl_72.npy
Binary file not shown.
Binary file added classifier/classifier.pkl_73.npy
Binary file not shown.
Binary file added classifier/classifier.pkl_74.npy
Binary file not shown.
Binary file added classifier/classifier.pkl_75.npy
Binary file not shown.
Binary file added classifier/classifier.pkl_76.npy
Binary file not shown.
Binary file added classifier/classifier.pkl_77.npy
Binary file not shown.
Binary file added classifier/classifier.pkl_78.npy
Binary file not shown.
Binary file added classifier/classifier.pkl_79.npy
Binary file not shown.
Binary file added classifier/classifier.pkl_80.npy
Binary file not shown.
Binary file added classifier/classifier.pkl_81.npy
Binary file not shown.
Binary file added classifier/classifier.pkl_82.npy
Binary file not shown.
Binary file added classifier/classifier.pkl_83.npy
Binary file not shown.
Binary file added classifier/classifier.pkl_84.npy
Binary file not shown.
Binary file added classifier/classifier.pkl_85.npy
Binary file not shown.
Binary file added classifier/classifier.pkl_86.npy
Binary file not shown.
Binary file added classifier/classifier.pkl_87.npy
Binary file not shown.
Binary file added classifier/classifier.pkl_88.npy
Binary file not shown.
Binary file added classifier/classifier.pkl_89.npy
Binary file not shown.
Binary file added classifier/classifier.pkl_90.npy
Binary file not shown.
Binary file added classifier/classifier.pkl_91.npy
Binary file not shown.
Binary file added classifier/classifier.pkl_92.npy
Binary file not shown.
Binary file added classifier/classifier.pkl_93.npy
Binary file not shown.
Binary file added classifier/classifier.pkl_94.npy
Binary file not shown.
Binary file added classifier/classifier.pkl_95.npy
Binary file not shown.
Binary file added classifier/classifier.pkl_96.npy
Binary file not shown.
Binary file added classifier/classifier.pkl_97.npy
Binary file not shown.
Binary file added classifier/classifier.pkl_98.npy
Binary file not shown.
Binary file added classifier/classifier.pkl_99.npy
Binary file not shown.
26 changes: 26 additions & 0 deletions classifier/features.pkl
Original file line number Diff line number Diff line change
@@ -0,0 +1,26 @@
(lp0
S'Machine'
p1
aS'Characteristics'
p2
aS'ImageBase'
p3
aS'MajorOperatingSystemVersion'
p4
aS'MinorOperatingSystemVersion'
p5
aS'MajorSubsystemVersion'
p6
aS'Subsystem'
p7
aS'DllCharacteristics'
p8
aS'SizeOfStackReserve'
p9
aS'SectionsMaxEntropy'
p10
aS'ResourcesMinEntropy'
p11
aS'ResourcesMaxEntropy'
p12
a.
69 changes: 69 additions & 0 deletions learning.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,69 @@
import pandas as pd
import numpy as np
import pickle
import sklearn.ensemble as ske
from sklearn import cross_validation, tree, linear_model
from sklearn.feature_selection import SelectFromModel
from sklearn.externals import joblib
from sklearn.naive_bayes import GaussianNB
from sklearn.metrics import confusion_matrix

# Train several classifiers on data.csv, keep the best one and save it with
# the list of selected feature names in the classifier directory.
data = pd.read_csv('data.csv', sep='|')
# Keep the feature column names next to the matrix so that names are looked
# up by position in X, whatever the position of the dropped columns.
feature_frame = data.drop(['Name', 'md5', 'legitimate'], axis=1)
feature_names = feature_frame.columns
X = feature_frame.values
y = data['legitimate'].values

print('Researching important feature based on %i total features\n' % X.shape[1])

# Feature selection using Trees Classifier
fsel = ske.ExtraTreesClassifier().fit(X, y)
model = SelectFromModel(fsel, prefit=True)
X_new = model.transform(X)
nb_features = X_new.shape[1]

X_train, X_test, y_train, y_test = cross_validation.train_test_split(X_new, y, test_size=0.2)

print('%i features identified as important:' % nb_features)

# Indices (in X) of the columns kept by the selector, in the same order as
# the columns of X_new.
selected = model.get_support(indices=True)
importances = fsel.feature_importances_

# Display the selected features from most to least important.
ranked = sorted(selected, key=lambda idx: importances[idx], reverse=True)
for rank, idx in enumerate(ranked):
    print("%d. feature %s (%f)" % (rank + 1, feature_names[idx], importances[idx]))

# The saved names must follow the column order of X_new, because the
# prediction script builds its feature vector in the order of this list.
features = [feature_names[idx] for idx in selected]

#Algorithm comparison
algorithms = {
    "DecisionTree": tree.DecisionTreeClassifier(max_depth=10),
    "RandomForest": ske.RandomForestClassifier(n_estimators=50),
    "GradientBoosting": ske.GradientBoostingClassifier(n_estimators=50),
    "AdaBoost": ske.AdaBoostClassifier(n_estimators=100),
    "GNB": GaussianNB()
}

results = {}
print("\nNow testing algorithms")
for algo in algorithms:
    clf = algorithms[algo]
    clf.fit(X_train, y_train)
    score = clf.score(X_test, y_test)
    print("%s : %f %%" % (algo, score*100))
    results[algo] = score

winner = max(results, key=results.get)
print('\nWinner algorithm is %s with a %f %% success' % (winner, results[winner]*100))

# Save the algorithm and the feature list for later predictions
print('Saving algorithm and feature list in classifier directory...')
joblib.dump(algorithms[winner], 'classifier/classifier.pkl')
with open('classifier/features.pkl', 'w') as features_file:
    features_file.write(pickle.dumps(features))
print('Saved')

# Identify false and true positive rates
# mt[i][j] counts the test samples whose true label is i and whose predicted
# label is j, so "positive" below refers to the class labelled 1.
clf = algorithms[winner]
res = clf.predict(X_test)
mt = confusion_matrix(y_test, res)
print("False positive rate : %f %%" % ((mt[0][1] / float(sum(mt[0])))*100))
print('False negative rate : %f %%' % ( (mt[1][0] / float(sum(mt[1]))*100)))

0 comments on commit 7a7188e

Please sign in to comment.