-
Notifications
You must be signed in to change notification settings - Fork 36
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
- Loading branch information
Showing
206 changed files
with
285 additions
and
0 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -2,6 +2,7 @@ | |
__pycache__/ | ||
*.py[cod] | ||
*$py.class | ||
*.swp | ||
|
||
# C extensions | ||
*.so | ||
|
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,189 @@ | ||
#! /usr/bin/python2 | ||
import pefile | ||
import os | ||
import array | ||
import math | ||
import pickle | ||
from sklearn.externals import joblib | ||
import sys | ||
import argparse | ||
|
||
def get_entropy(data):
    """Return the Shannon entropy of *data* in bits per byte (0.0 - 8.0).

    Accepts either a bytes object (Python 3) or a str of raw bytes
    (Python 2); an empty input yields 0.0.
    """
    total = len(data)
    if total == 0:
        return 0.0

    # Histogram of the 256 possible byte values.
    counts = [0] * 256
    for byte in data:
        if not isinstance(byte, int):
            byte = ord(byte)
        counts[byte] += 1

    # H = -sum(p * log2(p)) over the observed byte frequencies.
    entropy = 0.0
    for count in counts:
        if count:
            p = count / float(total)
            entropy -= p * math.log(p, 2)
    return entropy
|
||
def get_resources(pe):
    """Extract [entropy, size] pairs for every resource of *pe*.

    Walks the three-level resource directory (type -> id -> language) and
    records the Shannon entropy and raw byte size of each resource's data.
    Returns a (possibly empty) list; if the walk raises midway, whatever
    was collected so far is returned (best-effort on malformed PEs).
    """
    resources = []
    if not hasattr(pe, 'DIRECTORY_ENTRY_RESOURCE'):
        return resources
    try:
        for resource_type in pe.DIRECTORY_ENTRY_RESOURCE.entries:
            if not hasattr(resource_type, 'directory'):
                continue
            for resource_id in resource_type.directory.entries:
                if not hasattr(resource_id, 'directory'):
                    continue
                for resource_lang in resource_id.directory.entries:
                    size = resource_lang.data.struct.Size
                    data = pe.get_data(resource_lang.data.struct.OffsetToData, size)
                    resources.append([get_entropy(data), size])
    except Exception:
        # Deliberately broad: malware often ships corrupt resource trees.
        # Keep what we parsed rather than aborting the whole scan.
        return resources
    return resources
|
||
def get_version_info(pe):
    """Return a dict of version-resource values of *pe*.

    Collects StringFileInfo key/value pairs, VarFileInfo entries, and the
    numeric VS_FIXEDFILEINFO fields when present. Raises AttributeError if
    the PE has no FileInfo attribute (callers catch this).
    """
    # NOTE(review): assumes pe.FileInfo is a flat list of info structures,
    # as in older pefile releases -- confirm against the pinned pefile version.
    res = {}
    for fileinfo in pe.FileInfo:
        if fileinfo.Key == 'StringFileInfo':
            for st in fileinfo.StringTable:
                for key, value in st.entries.items():
                    res[key] = value
        if fileinfo.Key == 'VarFileInfo':
            for var in fileinfo.Var:
                # BUG FIX: dict.items() is a non-subscriptable view on
                # Python 3; materialize it before indexing (also valid on
                # Python 2, where it is already a list).
                entry_items = list(var.entry.items())
                res[entry_items[0][0]] = entry_items[0][1]
    if hasattr(pe, 'VS_FIXEDFILEINFO'):
        res['flags'] = pe.VS_FIXEDFILEINFO.FileFlags
        res['os'] = pe.VS_FIXEDFILEINFO.FileOS
        res['type'] = pe.VS_FIXEDFILEINFO.FileType
        res['file_version'] = pe.VS_FIXEDFILEINFO.FileVersionLS
        res['product_version'] = pe.VS_FIXEDFILEINFO.ProductVersionLS
        res['signature'] = pe.VS_FIXEDFILEINFO.Signature
        res['struct_version'] = pe.VS_FIXEDFILEINFO.StrucVersion
    return res
|
||
def extract_infos(fpath):
    """Extract the numeric PE features of the file at *fpath*.

    Returns a flat {feature_name: number} dict covering the COFF/optional
    headers, section statistics, import/export counts, resource statistics,
    load-config size and version-info size. Raises pefile.PEFormatError if
    the file is not a valid PE.
    """
    res = {}
    pe = pefile.PE(fpath)

    # --- COFF file header ---
    res['Machine'] = pe.FILE_HEADER.Machine
    res['SizeOfOptionalHeader'] = pe.FILE_HEADER.SizeOfOptionalHeader
    res['Characteristics'] = pe.FILE_HEADER.Characteristics

    # --- Optional header ---
    res['MajorLinkerVersion'] = pe.OPTIONAL_HEADER.MajorLinkerVersion
    res['MinorLinkerVersion'] = pe.OPTIONAL_HEADER.MinorLinkerVersion
    res['SizeOfCode'] = pe.OPTIONAL_HEADER.SizeOfCode
    res['SizeOfInitializedData'] = pe.OPTIONAL_HEADER.SizeOfInitializedData
    res['SizeOfUninitializedData'] = pe.OPTIONAL_HEADER.SizeOfUninitializedData
    res['AddressOfEntryPoint'] = pe.OPTIONAL_HEADER.AddressOfEntryPoint
    res['BaseOfCode'] = pe.OPTIONAL_HEADER.BaseOfCode
    try:
        # PE32+ (64-bit) images have no BaseOfData field.
        res['BaseOfData'] = pe.OPTIONAL_HEADER.BaseOfData
    except AttributeError:
        res['BaseOfData'] = 0
    res['ImageBase'] = pe.OPTIONAL_HEADER.ImageBase
    res['SectionAlignment'] = pe.OPTIONAL_HEADER.SectionAlignment
    res['FileAlignment'] = pe.OPTIONAL_HEADER.FileAlignment
    res['MajorOperatingSystemVersion'] = pe.OPTIONAL_HEADER.MajorOperatingSystemVersion
    res['MinorOperatingSystemVersion'] = pe.OPTIONAL_HEADER.MinorOperatingSystemVersion
    res['MajorImageVersion'] = pe.OPTIONAL_HEADER.MajorImageVersion
    res['MinorImageVersion'] = pe.OPTIONAL_HEADER.MinorImageVersion
    res['MajorSubsystemVersion'] = pe.OPTIONAL_HEADER.MajorSubsystemVersion
    res['MinorSubsystemVersion'] = pe.OPTIONAL_HEADER.MinorSubsystemVersion
    res['SizeOfImage'] = pe.OPTIONAL_HEADER.SizeOfImage
    res['SizeOfHeaders'] = pe.OPTIONAL_HEADER.SizeOfHeaders
    res['CheckSum'] = pe.OPTIONAL_HEADER.CheckSum
    res['Subsystem'] = pe.OPTIONAL_HEADER.Subsystem
    res['DllCharacteristics'] = pe.OPTIONAL_HEADER.DllCharacteristics
    res['SizeOfStackReserve'] = pe.OPTIONAL_HEADER.SizeOfStackReserve
    res['SizeOfStackCommit'] = pe.OPTIONAL_HEADER.SizeOfStackCommit
    res['SizeOfHeapReserve'] = pe.OPTIONAL_HEADER.SizeOfHeapReserve
    # BUG FIX: this value was previously stored under 'SizeOfStackCommit',
    # silently clobbering the real stack-commit value and never recording
    # the heap commit at all.
    res['SizeOfHeapCommit'] = pe.OPTIONAL_HEADER.SizeOfHeapCommit
    res['LoaderFlags'] = pe.OPTIONAL_HEADER.LoaderFlags
    res['NumberOfRvaAndSizes'] = pe.OPTIONAL_HEADER.NumberOfRvaAndSizes

    # --- Sections ---
    # List comprehensions instead of map(): on Python 3, map() returns a
    # one-shot iterator, so len() and re-iteration below would fail.
    res['SectionsNb'] = len(pe.sections)
    entropies = [section.get_entropy() for section in pe.sections]
    res['SectionsMeanEntropy'] = sum(entropies) / float(len(entropies))
    res['SectionsMinEntropy'] = min(entropies)
    res['SectionsMaxEntropy'] = max(entropies)
    raw_sizes = [section.SizeOfRawData for section in pe.sections]
    res['SectionsMeanRawsize'] = sum(raw_sizes) / float(len(raw_sizes))
    res['SectionsMinRawsize'] = min(raw_sizes)
    res['SectionsMaxRawsize'] = max(raw_sizes)
    virtual_sizes = [section.Misc_VirtualSize for section in pe.sections]
    res['SectionsMeanVirtualsize'] = sum(virtual_sizes) / float(len(virtual_sizes))
    res['SectionsMinVirtualsize'] = min(virtual_sizes)
    # NOTE: key deliberately kept as 'SectionMaxVirtualsize' (no plural 's')
    # -- a saved feature list may reference this exact spelling.
    res['SectionMaxVirtualsize'] = max(virtual_sizes)

    # --- Imports ---
    try:
        res['ImportsNbDLL'] = len(pe.DIRECTORY_ENTRY_IMPORT)
        imports = sum([entry.imports for entry in pe.DIRECTORY_ENTRY_IMPORT], [])
        res['ImportsNb'] = len(imports)
        # Imports by ordinal have no name; count them with a generator
        # instead of len(filter(...)), which fails on Python 3.
        res['ImportsNbOrdinal'] = sum(1 for imp in imports if imp.name is None)
    except AttributeError:
        # No import directory.
        res['ImportsNbDLL'] = 0
        res['ImportsNb'] = 0
        res['ImportsNbOrdinal'] = 0

    # --- Exports ---
    try:
        res['ExportNb'] = len(pe.DIRECTORY_ENTRY_EXPORT.symbols)
    except AttributeError:
        # No export directory.
        res['ExportNb'] = 0

    # --- Resources ---
    resources = get_resources(pe)
    res['ResourcesNb'] = len(resources)
    if resources:
        entropies = [entropy for entropy, _ in resources]
        res['ResourcesMeanEntropy'] = sum(entropies) / float(len(entropies))
        res['ResourcesMinEntropy'] = min(entropies)
        res['ResourcesMaxEntropy'] = max(entropies)
        sizes = [size for _, size in resources]
        res['ResourcesMeanSize'] = sum(sizes) / float(len(sizes))
        res['ResourcesMinSize'] = min(sizes)
        res['ResourcesMaxSize'] = max(sizes)
    else:
        res['ResourcesMeanEntropy'] = 0
        res['ResourcesMinEntropy'] = 0
        res['ResourcesMaxEntropy'] = 0
        res['ResourcesMeanSize'] = 0
        res['ResourcesMinSize'] = 0
        res['ResourcesMaxSize'] = 0

    # --- Load-configuration directory size ---
    try:
        res['LoadConfigurationSize'] = pe.DIRECTORY_ENTRY_LOAD_CONFIG.struct.Size
    except AttributeError:
        res['LoadConfigurationSize'] = 0

    # --- Version information ---
    try:
        res['VersionInformationSize'] = len(get_version_info(pe))
    except AttributeError:
        # No version resource (get_version_info touches pe.FileInfo).
        res['VersionInformationSize'] = 0
    return res
|
||
if __name__ == '__main__':
    parser = argparse.ArgumentParser(description='Detect malicious files')
    parser.add_argument('FILE', help='File to be tested')
    args = parser.parse_args()

    # Load the trained classifier and its ordered feature-name list.
    # NOTE(review): joblib/pickle deserialization executes arbitrary code;
    # only load model files from a trusted source.
    clf = joblib.load('classifier/classifier.pkl')
    # BUG FIX: pickle data is binary -- open with 'rb' (text mode breaks on
    # Python 3 and can corrupt reads on Windows) and close the handle.
    with open('classifier/features.pkl', 'rb') as f:
        features = pickle.loads(f.read())

    data = extract_infos(args.FILE)

    # Order the extracted values exactly as the model was trained.
    pe_features = [data[name] for name in features]

    res = clf.predict([pe_features])[0]
    # Use the parsed argument rather than raw sys.argv for consistency
    # with argparse above. Label index: 0 -> malicious, 1 -> legitimate.
    print('The file %s is %s' % (
        os.path.basename(args.FILE),
        ['malicious', 'legitimate'][res])
    )
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,26 @@ | ||
(lp0 | ||
S'Machine' | ||
p1 | ||
aS'Characteristics' | ||
p2 | ||
aS'ImageBase' | ||
p3 | ||
aS'MajorOperatingSystemVersion' | ||
p4 | ||
aS'MinorOperatingSystemVersion' | ||
p5 | ||
aS'MajorSubsystemVersion' | ||
p6 | ||
aS'Subsystem' | ||
p7 | ||
aS'DllCharacteristics' | ||
p8 | ||
aS'SizeOfStackReserve' | ||
p9 | ||
aS'SectionsMaxEntropy' | ||
p10 | ||
aS'ResourcesMinEntropy' | ||
p11 | ||
aS'ResourcesMaxEntropy' | ||
p12 | ||
a. |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,69 @@ | ||
"""Train and select a malware classifier from the features in data.csv.

Runs tree-based feature selection, compares several sklearn algorithms,
and saves the winning model plus the ordered feature-name list into the
classifier/ directory for later predictions.
"""
import pandas as pd
import numpy as np
import pickle
import sklearn.ensemble as ske
from sklearn import tree, linear_model
# sklearn.cross_validation was renamed to sklearn.model_selection in 0.18
# and later removed entirely; support both so old and new installs work.
try:
    from sklearn.model_selection import train_test_split
except ImportError:
    from sklearn.cross_validation import train_test_split
from sklearn.feature_selection import SelectFromModel
from sklearn.externals import joblib
from sklearn.naive_bayes import GaussianNB
from sklearn.metrics import confusion_matrix

data = pd.read_csv('data.csv', sep='|')
# Drop identifier columns; every remaining column is a numeric feature.
X = data.drop(['Name', 'md5', 'legitimate'], axis=1).values
y = data['legitimate'].values

print('Researching important feature based on %i total features\n' % X.shape[1])

# Feature selection using a tree ensemble's impurity-based importances.
fsel = ske.ExtraTreesClassifier().fit(X, y)
model = SelectFromModel(fsel, prefit=True)
X_new = model.transform(X)
nb_features = X_new.shape[1]

X_train, X_test, y_train, y_test = train_test_split(X_new, y, test_size=0.2)

features = []

print('%i features identified as important:' % nb_features)

indices = np.argsort(fsel.feature_importances_)[::-1][:nb_features]
for f in range(nb_features):
    print("%d. feature %s (%f)" % (f + 1,
                                   data.columns[2 + indices[f]],
                                   fsel.feature_importances_[indices[f]]))

# XXX: keep the feature order aligned with the columns of X_new -- iterate
# the selected column indices in ascending order, not importance order.
# The +2 offset skips the 'Name' and 'md5' columns dropped from X above.
for f in sorted(np.argsort(fsel.feature_importances_)[::-1][:nb_features]):
    features.append(data.columns[2 + f])

# Algorithm comparison.
algorithms = {
    "DecisionTree": tree.DecisionTreeClassifier(max_depth=10),
    "RandomForest": ske.RandomForestClassifier(n_estimators=50),
    "GradientBoosting": ske.GradientBoostingClassifier(n_estimators=50),
    "AdaBoost": ske.AdaBoostClassifier(n_estimators=100),
    "GNB": GaussianNB()
}

results = {}
print("\nNow testing algorithms")
for algo in algorithms:
    clf = algorithms[algo]
    clf.fit(X_train, y_train)
    score = clf.score(X_test, y_test)
    print("%s : %f %%" % (algo, score * 100))
    results[algo] = score

winner = max(results, key=results.get)
print('\nWinner algorithm is %s with a %f %% success' % (winner, results[winner] * 100))

# Save the algorithm and the feature list for later predictions.
print('Saving algorithm and feature list in classifier directory...')
joblib.dump(algorithms[winner], 'classifier/classifier.pkl')
# BUG FIX: pickle data is binary -- open with 'wb' (text mode breaks on
# Python 3 and can corrupt the file on Windows) and close the handle.
with open('classifier/features.pkl', 'wb') as f:
    f.write(pickle.dumps(features))
print('Saved')

# Identify false positive/negative rates of the winning model on the
# held-out test split (confusion matrix rows: true class, cols: predicted).
clf = algorithms[winner]
res = clf.predict(X_test)
mt = confusion_matrix(y_test, res)
print("False positive rate : %f %%" % ((mt[0][1] / float(sum(mt[0]))) * 100))
print('False negative rate : %f %%' % ((mt[1][0] / float(sum(mt[1]))) * 100))