Add files via upload

zamanzadeh · Aug 28, 2024 · 6df4eac · 6df4eac
1 parent 546a000
commit 6df4eac
Showing 1 changed file with 292 additions and 0 deletions.
diff --git a/Evaluation.ipynb b/Evaluation.ipynb
@@ -0,0 +1,292 @@
+{
+ "cells": [
+  {
+   "cell_type": "code",
+   "execution_count": 1,
+   "id": "f82ba5b3",
+   "metadata": {
+    "scrolled": true
+   },
+   "outputs": [],
+   "source": [
+    "import numpy as np\n",
+    "import pandas as pd\n",
+    "import os\n",
+    "from sklearn import metrics\n",
+    "from sklearn.metrics import roc_auc_score, average_precision_score, confusion_matrix, roc_curve, precision_recall_curve"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 2,
+   "id": "a587f335",
+   "metadata": {
+    "scrolled": true
+   },
+   "outputs": [],
+   "source": [
+    "def adjust_predicts(label, predict=None, calc_latency=False):\n",
+    "    \n",
+    "    label = np.asarray(label)\n",
+    "    latency = 0\n",
+    "    \n",
+    "    actual = label > 0.1\n",
+    "    anomaly_state = False\n",
+    "    anomaly_count = 0\n",
+    "    for i in range(len(actual)):\n",
+    "        if actual[i] and predict[i] and not anomaly_state:\n",
+    "                anomaly_state = True\n",
+    "                anomaly_count += 1\n",
+    "                for j in range(i, 0, -1):\n",
+    "                    if not actual[j]:\n",
+    "                        break\n",
+    "                    else:\n",
+    "                        if not predict[j]:\n",
+    "                            predict[j] = True\n",
+    "                            latency += 1\n",
+    "        elif not actual[i]:\n",
+    "            anomaly_state = False\n",
+    "        if anomaly_state:\n",
+    "            predict[i] = True\n",
+    "        \n",
+    "    MCM = metrics.multilabel_confusion_matrix(actual, predict, labels = [1, 0])\n",
+    "\n",
+    "    pa_tn = MCM[0][0, 0]\n",
+    "    pa_tp = MCM[0][1, 1]\n",
+    "    pa_fp = MCM[0][0, 1]\n",
+    "    pa_fn = MCM[0][1, 0]\n",
+    "        \n",
+    "    prec = pa_tp / (pa_tp + pa_fp)\n",
+    "    rec = pa_tp / (pa_tp + pa_fn)\n",
+    "    f1_score = 2 * (prec * rec) / (prec + rec)\n",
+    "    if calc_latency:\n",
+    "        return predict, latency / (anomaly_count + 1e-4), pa_tp, pa_tn, pa_fp, pa_fn, prec , rec, f1_score\n",
+    "    else:\n",
+    "        return predict, prec, rec, f1_score, pa_tp, pa_tn, pa_fp, pa_fn,"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 3,
+   "id": "4460860e",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "def add_summary_statistics(res_df):\n",
+    "    # Compute the sum of 'best_tp', 'best_tn', 'best_fp', 'best_fn'\n",
+    "    sum_best_tp = res_df['best_tp'].sum()\n",
+    "    sum_best_tn = res_df['best_tn'].sum()\n",
+    "    sum_best_fp = res_df['best_fp'].sum()\n",
+    "    sum_best_fn = res_df['best_fn'].sum()\n",
+    "\n",
+    "    # Calculate precision, recall and f1 score\n",
+    "    precision = sum_best_tp / (sum_best_tp + sum_best_fp) if (sum_best_tp + sum_best_fp) > 0 else 0\n",
+    "    recall = sum_best_tp / (sum_best_tp + sum_best_fn) if (sum_best_tp + sum_best_fn) > 0 else 0\n",
+    "    f1_score = 2 * (precision * recall) / (precision + recall) if (precision + recall) > 0 else 0\n",
+    "\n",
+    "    # Calculate the average and std of 'roc' and 'pr'\n",
+    "    roc_avg = res_df['roc'].mean()\n",
+    "    roc_std = res_df['roc'].std()\n",
+    "    pr_avg = res_df['pr'].mean()\n",
+    "    pr_std = res_df['pr'].std()\n",
+    "\n",
+    "    # Build the summary row (summed counts, precision/recall/F1 from those sums, mean 'roc' and 'pr') and the std row\n",
+    "    summary_row = pd.Series({\n",
+    "        'best_tp': sum_best_tp,\n",
+    "        'best_tn': sum_best_tn,\n",
+    "        'best_fp': sum_best_fp,\n",
+    "        'best_fn': sum_best_fn,\n",
+    "        'best_pre': precision,\n",
+    "        'best_rec': recall,\n",
+    "        'b_f_1': f1_score,\n",
+    "        'roc': roc_avg,\n",
+    "        'pr': pr_avg\n",
+    "    })\n",
+    "\n",
+    "    std_row = pd.Series({\n",
+    "        'roc': roc_std,\n",
+    "        'pr': pr_std\n",
+    "    })\n",
+    "\n",
+    "    # Append the rows to the dataframe\n",
+    "    res_df = res_df._append(summary_row, ignore_index=True)\n",
+    "    res_df = res_df._append(std_row, ignore_index=True)\n",
+    "    \n",
+    "    return res_df"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 4,
+   "id": "af5cb8af",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "def add_summary_statistics_pa(res_df):\n",
+    "    # Compute the sum of 'pa_tp', 'pa_tn', 'pa_fp', 'pa_fn'\n",
+    "    sum_pa_tp = res_df['pa_tp'].sum()\n",
+    "    sum_pa_tn = res_df['pa_tn'].sum()\n",
+    "    sum_pa_fp = res_df['pa_fp'].sum()\n",
+    "    sum_pa_fn = res_df['pa_fn'].sum()\n",
+    "\n",
+    "    # Calculate precision, recall and f1 score\n",
+    "    precision = sum_pa_tp / (sum_pa_tp + sum_pa_fp) if (sum_pa_tp + sum_pa_fp) > 0 else 0\n",
+    "    recall = sum_pa_tp / (sum_pa_tp + sum_pa_fn) if (sum_pa_tp + sum_pa_fn) > 0 else 0\n",
+    "    f1_score = 2 * (precision * recall) / (precision + recall) if (precision + recall) > 0 else 0\n",
+    "\n",
+    "\n",
+    "    # Build the summary row: summed counts plus precision/recall/F1 computed from those sums\n",
+    "    summary_row = pd.Series({\n",
+    "        'pa_tp': sum_pa_tp,\n",
+    "        'pa_tn': sum_pa_tn,\n",
+    "        'pa_fp': sum_pa_fp,\n",
+    "        'pa_fn': sum_pa_fn,\n",
+    "        'pa_pre': precision,\n",
+    "        'pa_rec': recall,\n",
+    "        'pa_f1': f1_score,\n",
+    "    })\n",
+    "\n",
+    "\n",
+    "    # Append the row to the dataframe\n",
+    "    res_df = res_df._append(summary_row, ignore_index=True)\n",
+    "    \n",
+    "    return res_df"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 1,
+   "id": "9bc18dd8",
+   "metadata": {
+    "scrolled": true
+   },
+   "outputs": [],
+   "source": [
+    "res_df = pd.DataFrame(columns=['name', 'tp', 'tn', 'fp', 'fn', 'roc', 'pr', \n",
+    "                               'best_tp', 'best_tn', 'best_fp', 'best_fn', 'best_pre', 'best_rec', 'b_f_1']) \n",
+    "\n",
+    "pa_df = pd.DataFrame(columns=['name', 'pa_tp', 'pa_tn', 'pa_fp', 'pa_fn', 'pa_pre', 'pa_rec', 'pa_f1', 'latency'])\n",
+    "\n",
+    "\n",
+    "with open('datasets/MSL_SMAP/labeled_anomalies.csv', 'r') as file:\n",
+    "    csv_reader = pd.read_csv(file, delimiter=',')\n",
+    "\n",
+    "data_info = csv_reader[csv_reader['spacecraft'] == 'MSL']\n",
+    "\n",
+    "\n",
+    "# data_info = os.listdir('../datasets/KPI/train/')\n",
+    "\n",
+    "# data_info = os.listdir(os.path.join('datasets', 'A1Benchmark'))  \n",
+    "\n",
+    "# data_info = os.listdir('../datasets/SMD/train/')\n",
+    "# files = [file for file in data_info if file.startswith('machine-')]\n",
+    "\n",
+    "\n",
+    "for filename in data_info['chan_id']:\n",
+    "    if filename!='.json':\n",
+    "        print(filename)\n",
+    "        df_train = pd.read_csv(\"results/MSL/\" + filename + \"/classification/classification_trainprobs.csv\")\n",
+    "        df_test = pd.read_csv(\"results/MSL/\" + filename + \"/classification/classification_testprobs.csv\")\n",
+    "        cl_num = df_train.shape[1] - 1\n",
+    "\n",
+    "        df_train['Class'] = np.where((df_train['Class'] == 0), 0, 1)\n",
+    "        df_train['pred']=df_train[df_train.columns[0:cl_num]].idxmax(axis=1)\n",
+    "\n",
+    "        score_col = df_train['pred'].value_counts().idxmax()\n",
+    "        \n",
+    "        df_test['Class'] = np.where((df_test['Class'] == 0), 0, 1)\n",
+    "        df_test['pred'] = df_test[df_test.columns[0:cl_num]].idxmax(axis=1)\n",
+    "        \n",
+    "        roc_auc, pr_auc, best_tn, best_tp, best_fp, best_fn, best_pre, best_rec, best_f1 = 0, 0, 0, 0, 0, 0, 0, 0, 0\n",
+    "        try:\n",
+    "\n",
+    "            df_test['pred'] = np.where((df_test['pred'] == score_col), 0, 1)\n",
+    "\n",
+    "            MCM = metrics.multilabel_confusion_matrix(df_test['Class'], df_test['pred'], labels = [1, 0])\n",
+    "\n",
+    "            tn = MCM[0][0, 0]\n",
+    "            tp = MCM[0][1, 1]\n",
+    "            fp = MCM[0][0, 1]\n",
+    "            fn = MCM[0][1, 0]\n",
+    "\n",
+    "            pre=tp/(tp+fp)\n",
+    "            recall = tp/(tp+fn)\n",
+    "            f_1 = 2*pre*recall/(pre+recall)\n",
+    "            print('f-1 : ', f_1)\n",
+    "\n",
+    "            scores = 1-df_test[score_col]\n",
+    "            # Calculate AU-ROC\n",
+    "            roc_auc = roc_auc_score(df_test['Class'], scores)\n",
+    "            print('AU-ROC : ', roc_auc)\n",
+    "\n",
+    "            # Calculate AU-PR\n",
+    "            pr_auc = average_precision_score(df_test['Class'], scores)\n",
+    "            print('AU-PR : ', pr_auc)\n",
+    "\n",
+    "            fpr, tpr, thresholds = roc_curve(df_test['Class'], scores, pos_label=1)\n",
+    "            precision, recall, thresholds = precision_recall_curve(df_test['Class'], scores, pos_label=1)\n",
+    "\n",
+    "\n",
+    "            res = pd.DataFrame()\n",
+    "            res['pre'] = precision\n",
+    "            res['rec'] = recall\n",
+    "            res['f1'] = 2*res['pre']*res['rec'] / (res['pre']+res['rec'])\n",
+    "            best_idx = res['f1'].argmax()\n",
+    "            best_f1 = res['f1'][best_idx]\n",
+    "            best_pre = res['pre'][best_idx]\n",
+    "            best_rec = res['rec'][best_idx]\n",
+    "            best_thr = thresholds[best_idx]\n",
+    "            print('Best f1 : ', best_f1, 'best_thr', best_thr)\n",
+    "            anomalies = [True if s >= best_thr else False for s in scores]\n",
+    "\n",
+    "            best_tn, best_fp, best_fn, best_tp = confusion_matrix(df_test['Class'], anomalies).ravel()\n",
+    "        except ValueError:\n",
+    "            pass\n",
+    "\n",
+    "        new_row = pd.Series([filename, tp, tn, fp, fn, roc_auc, pr_auc, best_tp, best_tn, best_fp, best_fn, best_pre, best_rec, best_f1],\n",
+    "                                index=['name', 'tp', 'tn', 'fp', 'fn', 'roc', 'pr', 'best_tp', 'best_tn', 'best_fp', 'best_fn', 'best_pre', 'best_rec', 'b_f_1'])\n",
+    "        res_df = res_df._append(new_row, ignore_index=True)\n",
+    "        \n",
+    "        \n",
+    "        pa_f1 = -1\n",
+    "        for thr in thresholds:\n",
+    "            preds_pa = [True if s >= thr else False for s in scores]\n",
+    "            pa_prediction, t_latency, t_tp, t_tn, t_fp, t_fn, t_pre, t_rec, t_f1 = adjust_predicts(df_test['Class'], preds_pa, True)\n",
+    "            if t_f1 > pa_f1:\n",
+    "                latency, pa_tp, pa_tn, pa_fp, pa_fn, pa_pre, pa_rec, pa_f1 = t_latency, t_tp, t_tn, t_fp, t_fn, t_pre, t_rec, t_f1\n",
+    "                \n",
+    "        new_row1 = pd.Series([filename, pa_tp, pa_tn, pa_fp, pa_fn, pa_pre, pa_rec, pa_f1, latency],\n",
+    "                                index=['name', 'pa_tp', 'pa_tn', 'pa_fp', 'pa_fn', 'pa_pre', 'pa_rec', 'pa_f1', 'latency'])   \n",
+    "        pa_df = pa_df._append(new_row1, ignore_index=True)\n",
+    "        \n",
+    "    \n",
+    "res_df = add_summary_statistics(res_df)\n",
+    "res_df.to_csv('msl_results.csv')\n",
+    "\n",
+    "pa_df = add_summary_statistics_pa(pa_df)\n",
+    "pa_df.to_csv('msl_results_pa.csv')"
+   ]
+  }
+ ],
+ "metadata": {
+  "kernelspec": {
+   "display_name": "tsenv",
+   "language": "python",
+   "name": "tsenv"
+  },
+  "language_info": {
+   "codemirror_mode": {
+    "name": "ipython",
+    "version": 3
+   },
+   "file_extension": ".py",
+   "mimetype": "text/x-python",
+   "name": "python",
+   "nbconvert_exporter": "python",
+   "pygments_lexer": "ipython3",
+   "version": "3.9.10"
+  }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 5
+}