Fraud_Detection.py
import pandas as pd
import seaborn
import networkx as nx
import matplotlib.pyplot as plt
import numpy as np
import os
import time
import cPickle as pickle
from sklearn.linear_model import LogisticRegression
from sklearn.cross_validation import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn import cross_validation
from sklearn.metrics import roc_curve, auc
import d3py
from sqlalchemy import create_engine
import Get_Data as get_data
import statistical_analysis
import graph_analysis
import ml_analysis
import logging

'''
This is the main script that runs the entire data flow from beginning to end. Refer to the data flow diagram in the README for an overview of the various components.
'''
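
# Minimal logging setup (an assumption -- the original file never configures the
# logging module, so the logging.debug calls below are dropped by default).
# Adjust or remove this if logging is configured elsewhere in the project.
logging.basicConfig(level=logging.DEBUG,
                    format='%(asctime)s %(levelname)s %(message)s')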

def Database_Insertion(df, table):
    """
    Description: This function inserts the possible fraudsters identified from the graph analysis into the database.
    INPUT:
        df - type: DataFrame - the list of fraudster IDs
        table - type: string - the table to be inserted into
    OUTPUT: No output. Just stores the data in the database.
    """
    engine = create_engine('postgresql://skanajan:abcdef@localhost:5432/fraud_detector')
    df.to_sql(table, engine, if_exists='replace')
    logging.debug("in Database_Insertion")
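
# Example usage (hypothetical numbers; assumes the PostgreSQL instance configured
# above is reachable and that phone numbers serve as the fraudster IDs):
#   suspects = pd.DataFrame(['15551234567', '15559876543'],
#                           columns=['possible_fraudster_phone_number'])
#   Database_Insertion(suspects, 'possible_fraudsters')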

def RunComplete():
    """
    Description: This is the main function that ties all the flows together. Currently the flows are tied together by passing around lists of anomalies or fraudsters.
    Longer-term work would entail replacing these lists with database queries.
    INPUT: None.
    OUTPUT: No output. The possible fraudsters found along the way are stored in the database.
    """
    logging.debug("In Run Complete")
    start = time.time()
    # Load the raw call records and the previously trained Random Forest classifier.
    df = get_data.GetData("hack_small.csv")[0:400000]
    classifier_file = "RF_phone_Fraud.pickle"
    Classifier = pickle.load(open(classifier_file, "rb"))
    # Flag anomalies with the statistical and machine-learning analyses, then combine them.
    StatAnomalies = statistical_analysis.Statistical_Analysis(df)
    feature_set = ['answind','origpricingdestid','routingdestcd','supp_orgno','cvrsn_dur','attempts','cust_orgno','pricingdestid']
    MLAnomalies = ml_analysis.FindMLAnomalies(df, feature_set, 1, classifier_file)
    ML_Stat_Anomalies = set(StatAnomalies).union(set(MLAnomalies))
    print "Total Anomalies are ", len(ML_Stat_Anomalies)
    logging.debug("Anomalies are %s", StatAnomalies)
    # Build the call graph, restrict it to the combined anomalies, and look for suspicious structure.
    graph_filename = "edgelist.csv"
    graph_analysis.Generate_Edgelist(df, graph_filename)
    Anomalies_Subgraph = graph_analysis.Generate_Graph(graph_filename, ML_Stat_Anomalies)
    GraphAnalysisResults = graph_analysis.GraphAnalysis(Anomalies_Subgraph)
    GraphAnomalies = graph_analysis.Find_Graph_Anomalies(GraphAnalysisResults)
    Database_Insertion(pd.DataFrame(GraphAnomalies, columns=['possible_fraudster_phone_number']), 'possible_fraudsters')
    logging.debug("graph anomalies are %s", GraphAnomalies)
    # Visualize these graph anomalies and let the user confirm or reject them.
    # Draw_Suspect_Fraud_Node(GraphAnomalies[0], Anomalies_Subgraph)
    Confirmed_Fraudster_Phone_Numbers = GraphAnomalies  # assumption for now
    # Retrain the classifier with the confirmed fraudsters so the next run improves.
    ml_analysis.TrainMLClassifier(df, Confirmed_Fraudster_Phone_Numbers, feature_set, classifier_file)
    logging.debug("elapsed %s seconds", time.time() - start)
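
# Sketch of the longer-term, query-driven flow mentioned in the docstring above
# (an assumption, not part of the current pipeline): downstream steps could read
# the stored suspects back out of Postgres instead of passing lists around, e.g.
#   engine = create_engine('postgresql://skanajan:abcdef@localhost:5432/fraud_detector')
#   suspects = pd.read_sql_table('possible_fraudsters', engine)
#   phone_numbers = suspects['possible_fraudster_phone_number'].tolist()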

if __name__ == '__main__':
    RunComplete()