Commit

Merge pull request #1 from viral98/master
Fixed files to support Python 3, Added python notebook
Te-k authored Mar 28, 2020
2 parents 8efa97e + 1f63538 commit 7bee2f0
Showing 3 changed files with 250 additions and 23 deletions.
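
The changes below are routine Python 2 → 3 fixes applied across the repository: map() and filter() now return lazy iterators and are wrapped in list(), train_test_split is imported from sklearn.model_selection instead of the removed sklearn.cross_validation, and pickled data is written in binary mode. A minimal, self-contained sketch of the first two patterns (synthetic data, not the PE dataset used by these scripts):

from sklearn.datasets import make_classification
from sklearn.model_selection import train_test_split  # replaces sklearn.cross_validation

# map() and filter() return lazy iterators in Python 3; wrap them in list()
# before calling len(), min()/max(), or iterating over the result twice.
values = list(map(lambda x: x * 2, range(5)))
evens = list(filter(lambda x: x % 2 == 0, values))
print(len(values), min(values), max(values), evens)

# The split helper is unchanged, only its import location moved.
X, y = make_classification(n_samples=100, n_features=5, random_state=0)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)
print(X_train.shape, X_test.shape)
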
227 changes: 227 additions & 0 deletions Malware-Analysis.ipynb
@@ -0,0 +1,227 @@
{
"cells": [
{
"cell_type": "code",
"execution_count": 7,
"metadata": {},
"outputs": [],
"source": [
"import pandas as pd\n",
"import numpy as np\n",
"import pickle\n",
"import sklearn.ensemble as ske\n",
"from sklearn import tree, linear_model\n",
"from sklearn.model_selection import train_test_split\n",
"from sklearn.feature_selection import SelectFromModel\n",
"from sklearn.externals import joblib\n",
"from sklearn.naive_bayes import GaussianNB\n",
"from sklearn.metrics import confusion_matrix\n",
"\n",
"data = pd.read_csv('data.csv', sep='|')\n",
"X = data.drop(['Name', 'md5', 'legitimate'], axis=1).values\n",
"y = data['legitimate'].values\n",
"\n"
]
},
{
"cell_type": "code",
"execution_count": 8,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Researching important feature based on 54 total features\n",
"\n"
]
},
{
"name": "stderr",
"output_type": "stream",
"text": [
"c:\\users\\tagdi\\appdata\\local\\programs\\python\\python37\\lib\\site-packages\\sklearn\\ensemble\\forest.py:245: FutureWarning: The default value of n_estimators will change from 10 in version 0.20 to 100 in 0.22.\n",
" \"10 in version 0.20 to 100 in 0.22.\", FutureWarning)\n"
]
}
],
"source": [
"print('Researching important feature based on %i total features\\n' % X.shape[1])\n",
"\n",
"# Feature selection using Trees Classifier\n",
"fsel = ske.ExtraTreesClassifier().fit(X, y)\n",
"model = SelectFromModel(fsel, prefit=True)\n",
"X_new = model.transform(X)\n",
"nb_features = X_new.shape[1]\n",
"\n",
"X_train, X_test, y_train, y_test = train_test_split(X_new, y ,test_size=0.2)\n",
"\n",
"features = []\n",
"\n"
]
},
{
"cell_type": "code",
"execution_count": 9,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"12 features identified as important:\n",
"1. feature Characteristics (0.174459)\n",
"2. feature Machine (0.153884)\n",
"3. feature DllCharacteristics (0.132542)\n",
"4. feature SectionsMaxEntropy (0.074540)\n",
"5. feature Subsystem (0.055532)\n",
"6. feature SizeOfStackReserve (0.047293)\n",
"7. feature ImageBase (0.046986)\n",
"8. feature SectionsNb (0.038547)\n",
"9. feature MajorSubsystemVersion (0.033042)\n",
"10. feature VersionInformationSize (0.031814)\n",
"11. feature MajorOperatingSystemVersion (0.022208)\n",
"12. feature SectionsMinEntropy (0.019704)\n"
]
}
],
"source": [
"print('%i features identified as important:' % nb_features)\n",
"\n",
"indices = np.argsort(fsel.feature_importances_)[::-1][:nb_features]\n",
"for f in range(nb_features):\n",
" print(\"%d. feature %s (%f)\" % (f + 1, data.columns[2+indices[f]], fsel.feature_importances_[indices[f]]))\n",
"\n",
"# XXX : take care of the feature order\n",
"for f in sorted(np.argsort(fsel.feature_importances_)[::-1][:nb_features]):\n",
" features.append(data.columns[2+f])\n",
"\n",
"\n"
]
},
{
"cell_type": "code",
"execution_count": 10,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"\n",
"Now testing algorithms\n",
"DecisionTree : 99.036581 %\n",
"RandomForest : 99.398769 %\n",
"GradientBoosting : 98.772184 %\n",
"AdaBoost : 98.678015 %\n",
"GNB : 70.449113 %\n"
]
}
],
"source": [
"#Algorithm comparison\n",
"algorithms = {\n",
" \"DecisionTree\": tree.DecisionTreeClassifier(max_depth=10),\n",
" \"RandomForest\": ske.RandomForestClassifier(n_estimators=50),\n",
" \"GradientBoosting\": ske.GradientBoostingClassifier(n_estimators=50),\n",
" \"AdaBoost\": ske.AdaBoostClassifier(n_estimators=100),\n",
" \"GNB\": GaussianNB()\n",
" }\n",
"\n",
"results = {}\n",
"print(\"\\nNow testing algorithms\")\n",
"for algo in algorithms:\n",
" clf = algorithms[algo]\n",
" clf.fit(X_train, y_train)\n",
" score = clf.score(X_test, y_test)\n",
" print(\"%s : %f %%\" % (algo, score*100))\n",
" results[algo] = score\n",
"\n"
]
},
{
"cell_type": "code",
"execution_count": 11,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"\n",
"Winner algorithm is RandomForest with a 99.398769 % success\n"
]
}
],
"source": [
"winner = max(results, key=results.get)\n",
"print('\\n Algorithm with highest accuracy on train/test is %s with a %f %% success' % (winner, results[winner]*100))\n"
]
},
{
"cell_type": "code",
"execution_count": 13,
"metadata": {},
"outputs": [],
"source": [
"\n",
"# print('Saving algorithm and feature list in classifier directory...')\n",
"# joblib.dump(algorithms[winner], 'classifier/classifier.pkl')\n",
"# open('classifier/features.pkl', 'w').write(pickle.dumps(features))\n",
"# print('Saved')\n",
"\n"
]
},
{
"cell_type": "code",
"execution_count": 14,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"False positive rate : 0.462701 %\n",
"False negative rate : 0.931487 %\n"
]
}
],
"source": [
"# Identify false and true positive rates\n",
"clf = algorithms[winner]\n",
"res = clf.predict(X_test)\n",
"mt = confusion_matrix(y_test, res)\n",
"print(\"False positive rate : %f %%\" % ((mt[0][1] / float(sum(mt[0])))*100))\n",
"print('False negative rate : %f %%' % ( (mt[1][0] / float(sum(mt[1]))*100)))"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": []
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.7.4"
}
},
"nbformat": 4,
"nbformat_minor": 2
}
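
The last code cell above derives error rates from the confusion matrix. With scikit-learn's convention, the rows of confusion_matrix are the true classes and the columns the predicted classes, sorted by label, so mt[0][1] counts class-0 samples predicted as class 1 and mt[1][0] counts class-1 samples predicted as class 0. A small illustration with toy labels (not taken from the malware dataset):

import numpy as np
from sklearn.metrics import confusion_matrix

# Toy ground truth and predictions over labels {0, 1}.
y_test = np.array([0, 0, 0, 0, 1, 1, 1, 1])
res = np.array([0, 0, 0, 1, 1, 1, 0, 1])

mt = confusion_matrix(y_test, res)
# mt[0][1]: true 0 predicted 1; mt[1][0]: true 1 predicted 0.
print("False positive rate : %f %%" % (mt[0][1] / float(sum(mt[0])) * 100))  # 25 %
print("False negative rate : %f %%" % (mt[1][0] / float(sum(mt[1])) * 100))  # 25 %
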
39 changes: 19 additions & 20 deletions checkpe.py
@@ -10,16 +10,16 @@

def get_entropy(data):
    if len(data) == 0:
-        return 0.0
+        return 0.0
    occurences = array.array('L', [0]*256)
    for x in data:
-        occurences[x if isinstance(x, int) else ord(x)] += 1
+        occurences[x if isinstance(x, int) else ord(x)] += 1

    entropy = 0
    for x in occurences:
-        if x:
-            p_x = float(x) / len(data)
-            entropy -= p_x*math.log(p_x, 2)
+        if x:
+            p_x = float(x) / len(data)
+            entropy -= p_x*math.log(p_x, 2)

    return entropy

@@ -28,7 +28,7 @@ def get_resources(pe):
    [entropy, size]"""
    resources = []
    if hasattr(pe, 'DIRECTORY_ENTRY_RESOURCE'):
-        try:
+        try:
            for resource_type in pe.DIRECTORY_ENTRY_RESOURCE.entries:
                if hasattr(resource_type, 'directory'):
                    for resource_id in resource_type.directory.entries:
@@ -104,25 +104,27 @@ def extract_infos(fpath):

    # Sections
    res['SectionsNb'] = len(pe.sections)
-    entropy = map(lambda x:x.get_entropy(), pe.sections)
+    entropy = list(map(lambda x:x.get_entropy(), pe.sections))
    res['SectionsMeanEntropy'] = sum(entropy)/float(len(entropy))
    res['SectionsMinEntropy'] = min(entropy)
    res['SectionsMaxEntropy'] = max(entropy)
-    raw_sizes = map(lambda x:x.SizeOfRawData, pe.sections)
+
+
+    raw_sizes = list(map(lambda x:x.SizeOfRawData, pe.sections))
    res['SectionsMeanRawsize'] = sum(raw_sizes)/float(len(raw_sizes))
    res['SectionsMinRawsize'] = min(raw_sizes)
    res['SectionsMaxRawsize'] = max(raw_sizes)
-    virtual_sizes = map(lambda x:x.Misc_VirtualSize, pe.sections)
+    virtual_sizes = list(map(lambda x:x.Misc_VirtualSize, pe.sections))
    res['SectionsMeanVirtualsize'] = sum(virtual_sizes)/float(len(virtual_sizes))
    res['SectionsMinVirtualsize'] = min(virtual_sizes)
    res['SectionMaxVirtualsize'] = max(virtual_sizes)

    #Imports
    try:
        res['ImportsNbDLL'] = len(pe.DIRECTORY_ENTRY_IMPORT)
-        imports = sum([x.imports for x in pe.DIRECTORY_ENTRY_IMPORT], [])
+        imports = list(sum([x.imports for x in pe.DIRECTORY_ENTRY_IMPORT], []))
        res['ImportsNb'] = len(imports)
-        res['ImportsNbOrdinal'] = len(filter(lambda x:x.name is None, imports))
+        res['ImportsNbOrdinal'] = len(list(filter(lambda x:x.name is None, imports)))
    except AttributeError:
        res['ImportsNbDLL'] = 0
        res['ImportsNb'] = 0
@@ -138,11 +140,11 @@ def extract_infos(fpath):
    resources= get_resources(pe)
    res['ResourcesNb'] = len(resources)
    if len(resources)> 0:
-        entropy = map(lambda x:x[0], resources)
+        entropy = list(map(lambda x:x[0], resources))
        res['ResourcesMeanEntropy'] = sum(entropy)/float(len(entropy))
        res['ResourcesMinEntropy'] = min(entropy)
        res['ResourcesMaxEntropy'] = max(entropy)
-        sizes = map(lambda x:x[1], resources)
+        sizes = list(map(lambda x:x[1], resources))
        res['ResourcesMeanSize'] = sum(sizes)/float(len(sizes))
        res['ResourcesMinSize'] = min(sizes)
        res['ResourcesMaxSize'] = max(sizes)
@@ -179,15 +181,12 @@ def extract_infos(fpath):
        os.path.dirname(os.path.realpath(__file__)),
        'classifier/classifier.pkl'
    ))
-    features = pickle.loads(open(os.path.join(
-        os.path.dirname(os.path.realpath(__file__)),
-        'classifier/features.pkl'),
-        'r').read()
-    )
+
+    with open('classifier/features.pkl', 'rb') as f:
+        features = pickle.load(f)

    data = extract_infos(args.FILE)

-    pe_features = map(lambda x:data[x], features)
+    pe_features = list(map(lambda x:data[x], features))

    res= clf.predict([pe_features])[0]
    print('The file %s is %s' % (
7 changes: 4 additions & 3 deletions learning.py
@@ -2,7 +2,8 @@
import numpy as np
import pickle
import sklearn.ensemble as ske
-from sklearn import cross_validation, tree, linear_model
+from sklearn import tree, linear_model
+from sklearn.model_selection import train_test_split
from sklearn.feature_selection import SelectFromModel
from sklearn.externals import joblib
from sklearn.naive_bayes import GaussianNB
@@ -20,7 +21,7 @@
X_new = model.transform(X)
nb_features = X_new.shape[1]

-X_train, X_test, y_train, y_test = cross_validation.train_test_split(X_new, y ,test_size=0.2)
+X_train, X_test, y_train, y_test = train_test_split(X_new, y ,test_size=0.2)

features = []

@@ -58,7 +59,7 @@
# Save the algorithm and the feature list for later predictions
print('Saving algorithm and feature list in classifier directory...')
joblib.dump(algorithms[winner], 'classifier/classifier.pkl')
-open('classifier/features.pkl', 'w').write(pickle.dumps(features))
+open('classifier/features.pkl', 'wb').write(pickle.dumps(features))
print('Saved')

# Identify false and true positive rates
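
The 'w' → 'wb' change above matters because pickle.dumps() returns bytes in Python 3, and writing bytes to a text-mode handle raises a TypeError; checkpe.py correspondingly reads the feature list back from a binary-mode handle. A small sketch of the round trip (the file name and feature names here are illustrative; the scripts use classifier/features.pkl):

import pickle

features = ['Machine', 'Characteristics', 'SectionsMaxEntropy']  # illustrative list

# Binary mode is required: pickle.dumps() returns bytes in Python 3.
with open('features.pkl', 'wb') as f:
    f.write(pickle.dumps(features))

# The prediction script loads the saved list from a binary-mode handle.
with open('features.pkl', 'rb') as f:
    assert pickle.load(f) == features
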