Commit

Merge pull request #1 from viral98/master
Fixed files to support Python 3, Added python notebook
Te-k authored Mar 28, 2020
2 parents 8efa97e + 1f63538 commit 7bee2f0
Showing 3 changed files with 250 additions and 23 deletions.
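
The changes below are routine Python 2 → 3 fixes applied across the repository: map() and filter() now return lazy iterators and are wrapped in list(), train_test_split is imported from sklearn.model_selection instead of the removed sklearn.cross_validation, and pickled data is written in binary mode. A minimal, self-contained sketch of the first two patterns (synthetic data, not the PE dataset used by these scripts):

from sklearn.datasets import make_classification
from sklearn.model_selection import train_test_split  # replaces sklearn.cross_validation

# map() and filter() return lazy iterators in Python 3; wrap them in list()
# before calling len(), min()/max(), or iterating over the result twice.
values = list(map(lambda x: x * 2, range(5)))
evens = list(filter(lambda x: x % 2 == 0, values))
print(len(values), min(values), max(values), evens)

# The split helper is unchanged, only its import location moved.
X, y = make_classification(n_samples=100, n_features=5, random_state=0)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)
print(X_train.shape, X_test.shape)
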
227 changes: 227 additions & 0 deletions Malware-Analysis.ipynb
@@ -0,0 +1,227 @@
{
"cells": [
{
"cell_type": "code",
"execution_count": 7,
"metadata": {},
"outputs": [],
"source": [
"import pandas as pd\n",
"import numpy as np\n",
"import pickle\n",
"import sklearn.ensemble as ske\n",
"from sklearn import tree, linear_model\n",
"from sklearn.model_selection import train_test_split\n",
"from sklearn.feature_selection import SelectFromModel\n",
"from sklearn.externals import joblib\n",
"from sklearn.naive_bayes import GaussianNB\n",
"from sklearn.metrics import confusion_matrix\n",
"\n",
"data = pd.read_csv('data.csv', sep='|')\n",
"X = data.drop(['Name', 'md5', 'legitimate'], axis=1).values\n",
"y = data['legitimate'].values\n",
"\n"
]
},
{
"cell_type": "code",
"execution_count": 8,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Researching important feature based on 54 total features\n",
"\n"
]
},
{
"name": "stderr",
"output_type": "stream",
"text": [
"c:\\users\\tagdi\\appdata\\local\\programs\\python\\python37\\lib\\site-packages\\sklearn\\ensemble\\forest.py:245: FutureWarning: The default value of n_estimators will change from 10 in version 0.20 to 100 in 0.22.\n",
" \"10 in version 0.20 to 100 in 0.22.\", FutureWarning)\n"
]
}
],
"source": [
"print('Researching important feature based on %i total features\\n' % X.shape[1])\n",
"\n",
"# Feature selection using Trees Classifier\n",
"fsel = ske.ExtraTreesClassifier().fit(X, y)\n",
"model = SelectFromModel(fsel, prefit=True)\n",
"X_new = model.transform(X)\n",
"nb_features = X_new.shape[1]\n",
"\n",
"X_train, X_test, y_train, y_test = train_test_split(X_new, y ,test_size=0.2)\n",
"\n",
"features = []\n",
"\n"
]
},
{
"cell_type": "code",
"execution_count": 9,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"12 features identified as important:\n",
"1. feature Characteristics (0.174459)\n",
"2. feature Machine (0.153884)\n",
"3. feature DllCharacteristics (0.132542)\n",
"4. feature SectionsMaxEntropy (0.074540)\n",
"5. feature Subsystem (0.055532)\n",
"6. feature SizeOfStackReserve (0.047293)\n",
"7. feature ImageBase (0.046986)\n",
"8. feature SectionsNb (0.038547)\n",
"9. feature MajorSubsystemVersion (0.033042)\n",
"10. feature VersionInformationSize (0.031814)\n",
"11. feature MajorOperatingSystemVersion (0.022208)\n",
"12. feature SectionsMinEntropy (0.019704)\n"
]
}
],
"source": [
"print('%i features identified as important:' % nb_features)\n",
"\n",
"indices = np.argsort(fsel.feature_importances_)[::-1][:nb_features]\n",
"for f in range(nb_features):\n",
" print(\"%d. feature %s (%f)\" % (f + 1, data.columns[2+indices[f]], fsel.feature_importances_[indices[f]]))\n",
"\n",
"# XXX : take care of the feature order\n",
"for f in sorted(np.argsort(fsel.feature_importances_)[::-1][:nb_features]):\n",
" features.append(data.columns[2+f])\n",
"\n",
"\n"
]
},
{
"cell_type": "code",
"execution_count": 10,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"\n",
"Now testing algorithms\n",
"DecisionTree : 99.036581 %\n",
"RandomForest : 99.398769 %\n",
"GradientBoosting : 98.772184 %\n",
"AdaBoost : 98.678015 %\n",
"GNB : 70.449113 %\n"
]
}
],
"source": [
"#Algorithm comparison\n",
"algorithms = {\n",
" \"DecisionTree\": tree.DecisionTreeClassifier(max_depth=10),\n",
" \"RandomForest\": ske.RandomForestClassifier(n_estimators=50),\n",
" \"GradientBoosting\": ske.GradientBoostingClassifier(n_estimators=50),\n",
" \"AdaBoost\": ske.AdaBoostClassifier(n_estimators=100),\n",
" \"GNB\": GaussianNB()\n",
" }\n",
"\n",
"results = {}\n",
"print(\"\\nNow testing algorithms\")\n",
"for algo in algorithms:\n",
" clf = algorithms[algo]\n",
" clf.fit(X_train, y_train)\n",
" score = clf.score(X_test, y_test)\n",
" print(\"%s : %f %%\" % (algo, score*100))\n",
" results[algo] = score\n",
"\n"
]
},
{
"cell_type": "code",
"execution_count": 11,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"\n",
"Winner algorithm is RandomForest with a 99.398769 % success\n"
]
}
],
"source": [
"winner = max(results, key=results.get)\n",
"print('\\n Algorithm with highest accuracy on train/test is %s with a %f %% success' % (winner, results[winner]*100))\n"
]
},
{
"cell_type": "code",
"execution_count": 13,
"metadata": {},
"outputs": [],
"source": [
"\n",
"# print('Saving algorithm and feature list in classifier directory...')\n",
"# joblib.dump(algorithms[winner], 'classifier/classifier.pkl')\n",
"# open('classifier/features.pkl', 'w').write(pickle.dumps(features))\n",
"# print('Saved')\n",
"\n"
]
},
{
"cell_type": "code",
"execution_count": 14,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"False positive rate : 0.462701 %\n",
"False negative rate : 0.931487 %\n"
]
}
],
"source": [
"# Identify false and true positive rates\n",
"clf = algorithms[winner]\n",
"res = clf.predict(X_test)\n",
"mt = confusion_matrix(y_test, res)\n",
"print(\"False positive rate : %f %%\" % ((mt[0][1] / float(sum(mt[0])))*100))\n",
"print('False negative rate : %f %%' % ( (mt[1][0] / float(sum(mt[1]))*100)))"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": []
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.7.4"
}
},
"nbformat": 4,
"nbformat_minor": 2
}
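
The last code cell above derives error rates from the confusion matrix. With scikit-learn's convention, the rows of confusion_matrix are the true classes and the columns the predicted classes, sorted by label, so mt[0][1] counts class-0 samples predicted as class 1 and mt[1][0] counts class-1 samples predicted as class 0. A small illustration with toy labels (not taken from the malware dataset):

import numpy as np
from sklearn.metrics import confusion_matrix

# Toy ground truth and predictions over labels {0, 1}.
y_test = np.array([0, 0, 0, 0, 1, 1, 1, 1])
res = np.array([0, 0, 0, 1, 1, 1, 0, 1])

mt = confusion_matrix(y_test, res)
# mt[0][1]: true 0 predicted 1; mt[1][0]: true 1 predicted 0.
print("False positive rate : %f %%" % (mt[0][1] / float(sum(mt[0])) * 100))  # 25 %
print("False negative rate : %f %%" % (mt[1][0] / float(sum(mt[1])) * 100))  # 25 %
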
39 changes: 19 additions & 20 deletions checkpe.py
@@ -10,16 +10,16 @@

def get_entropy(data):
    if len(data) == 0:
-        return 0.0
+        return 0.0
    occurences = array.array('L', [0]*256)
    for x in data:
-        occurences[x if isinstance(x, int) else ord(x)] += 1
+        occurences[x if isinstance(x, int) else ord(x)] += 1

    entropy = 0
    for x in occurences:
-        if x:
-            p_x = float(x) / len(data)
-            entropy -= p_x*math.log(p_x, 2)
+        if x:
+            p_x = float(x) / len(data)
+            entropy -= p_x*math.log(p_x, 2)

    return entropy

@@ -28,7 +28,7 @@ def get_resources(pe):
    [entropy, size]"""
    resources = []
    if hasattr(pe, 'DIRECTORY_ENTRY_RESOURCE'):
-        try:
+        try:
            for resource_type in pe.DIRECTORY_ENTRY_RESOURCE.entries:
                if hasattr(resource_type, 'directory'):
                    for resource_id in resource_type.directory.entries:
@@ -104,25 +104,27 @@ def extract_infos(fpath):

    # Sections
    res['SectionsNb'] = len(pe.sections)
-    entropy = map(lambda x:x.get_entropy(), pe.sections)
+    entropy = list(map(lambda x:x.get_entropy(), pe.sections))
    res['SectionsMeanEntropy'] = sum(entropy)/float(len(entropy))
    res['SectionsMinEntropy'] = min(entropy)
    res['SectionsMaxEntropy'] = max(entropy)
-    raw_sizes = map(lambda x:x.SizeOfRawData, pe.sections)
+
+
+    raw_sizes = list(map(lambda x:x.SizeOfRawData, pe.sections))
    res['SectionsMeanRawsize'] = sum(raw_sizes)/float(len(raw_sizes))
    res['SectionsMinRawsize'] = min(raw_sizes)
    res['SectionsMaxRawsize'] = max(raw_sizes)
-    virtual_sizes = map(lambda x:x.Misc_VirtualSize, pe.sections)
+    virtual_sizes = list(map(lambda x:x.Misc_VirtualSize, pe.sections))
    res['SectionsMeanVirtualsize'] = sum(virtual_sizes)/float(len(virtual_sizes))
    res['SectionsMinVirtualsize'] = min(virtual_sizes)
    res['SectionMaxVirtualsize'] = max(virtual_sizes)

    #Imports
    try:
        res['ImportsNbDLL'] = len(pe.DIRECTORY_ENTRY_IMPORT)
-        imports = sum([x.imports for x in pe.DIRECTORY_ENTRY_IMPORT], [])
+        imports = list(sum([x.imports for x in pe.DIRECTORY_ENTRY_IMPORT], []))
        res['ImportsNb'] = len(imports)
-        res['ImportsNbOrdinal'] = len(filter(lambda x:x.name is None, imports))
+        res['ImportsNbOrdinal'] = len(list(filter(lambda x:x.name is None, imports)))
    except AttributeError:
        res['ImportsNbDLL'] = 0
        res['ImportsNb'] = 0
@@ -138,11 +140,11 @@ def extract_infos(fpath):
    resources= get_resources(pe)
    res['ResourcesNb'] = len(resources)
    if len(resources)> 0:
-        entropy = map(lambda x:x[0], resources)
+        entropy = list(map(lambda x:x[0], resources))
        res['ResourcesMeanEntropy'] = sum(entropy)/float(len(entropy))
        res['ResourcesMinEntropy'] = min(entropy)
        res['ResourcesMaxEntropy'] = max(entropy)
-        sizes = map(lambda x:x[1], resources)
+        sizes = list(map(lambda x:x[1], resources))
        res['ResourcesMeanSize'] = sum(sizes)/float(len(sizes))
        res['ResourcesMinSize'] = min(sizes)
        res['ResourcesMaxSize'] = max(sizes)
@@ -179,15 +181,12 @@ def extract_infos(fpath):
        os.path.dirname(os.path.realpath(__file__)),
        'classifier/classifier.pkl'
    ))
-    features = pickle.loads(open(os.path.join(
-        os.path.dirname(os.path.realpath(__file__)),
-        'classifier/features.pkl'),
-        'r').read()
-    )
+
+    with open('classifier/features.pkl', 'rb') as f:
+        features = pickle.load(f)

    data = extract_infos(args.FILE)

-    pe_features = map(lambda x:data[x], features)
+    pe_features = list(map(lambda x:data[x], features))

    res= clf.predict([pe_features])[0]
    print('The file %s is %s' % (
7 changes: 4 additions & 3 deletions learning.py
@@ -2,7 +2,8 @@
import numpy as np
import pickle
import sklearn.ensemble as ske
-from sklearn import cross_validation, tree, linear_model
+from sklearn import tree, linear_model
+from sklearn.model_selection import train_test_split
from sklearn.feature_selection import SelectFromModel
from sklearn.externals import joblib
from sklearn.naive_bayes import GaussianNB
@@ -20,7 +21,7 @@
X_new = model.transform(X)
nb_features = X_new.shape[1]

-X_train, X_test, y_train, y_test = cross_validation.train_test_split(X_new, y ,test_size=0.2)
+X_train, X_test, y_train, y_test = train_test_split(X_new, y ,test_size=0.2)

features = []

@@ -58,7 +59,7 @@
# Save the algorithm and the feature list for later predictions
print('Saving algorithm and feature list in classifier directory...')
joblib.dump(algorithms[winner], 'classifier/classifier.pkl')
-open('classifier/features.pkl', 'w').write(pickle.dumps(features))
+open('classifier/features.pkl', 'wb').write(pickle.dumps(features))
print('Saved')

# Identify false and true positive rates
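
The 'w' → 'wb' change above matters because pickle.dumps() returns bytes in Python 3, and writing bytes to a text-mode handle raises a TypeError; checkpe.py correspondingly reads the feature list back from a binary-mode handle. A small sketch of the round trip (the file name and feature names here are illustrative; the scripts use classifier/features.pkl):

import pickle

features = ['Machine', 'Characteristics', 'SectionsMaxEntropy']  # illustrative list

# Binary mode is required: pickle.dumps() returns bytes in Python 3.
with open('features.pkl', 'wb') as f:
    f.write(pickle.dumps(features))

# The prediction script loads the saved list from a binary-mode handle.
with open('features.pkl', 'rb') as f:
    assert pickle.load(f) == features
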