Creating results folder if absent, corrected README #2

Open — wants to merge 14 commits into base: main
Binary file removed .DS_Store
Binary file not shown.
45 changes: 26 additions & 19 deletions README.md
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
# Reinforcement Learning Complex Detection
This reinforcement learning algorithm is a machine learning method for complex detection in networks. Using known communities, it is trained and learns to find new complexes in the network.
This is a reinforcement learning algorithm for community detection in networks. Trained on known communities, it learns to find new communities in a network.

# Installation:
Requires Python 3
@@ -10,38 +10,45 @@ Requirements installation:
1. For a toy network, use input_toy.yaml
2. For hu.MAP, use the input file input_humap.yaml


# Instructions:
To run this pipeline on a new network, construct an input file similar to input_toy.yaml specifying where to find the required inputs.
1. Specify input options relating to network: Set options dir_nm (directory containing the network) and netf_nm (file name of the network)
2. Specify input options relating to known communities in network: If you already have sepearated known communities into train and test communitites, specify their paths in the options comf_nm and comf_test_nm (relative to the directory specified in the option:dir_nm) Otherwise, Split complex list into train and test: Set option split_flag = 1 Verify that train test size distributions in figure are the similar. Also check that number of training complexes is not too low by looking at the res_metrics.out file. Set options comf_nm and comf_test_nm with these two files. All the above paths are set relative to the directory specified in the option:dir_nm Make sure to change the option split_flag back to 0 after this step
1. Specify the network input file: Set options dir_nm (directory containing the network) and netf_nm (file name of the network)
2. Specify the paths to the train and test community files in the options comf_nm and comf_test_nm (relative to the directory specified in the option dir_nm)
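For reference, a minimal input file covering just these options might look like the sketch below. Field names follow the input_toy.yaml excerpts elsewhere in this repository; the file paths are placeholders for illustration, not actual repository files:

```yaml
# Step 1: network location
dir_nm: "toy_network"         # directory containing the network
netf_nm: "/toy_network.txt"   # network edge-list file, relative to dir_nm
sep: " "                      # column separator in the edge list

# Step 2: known communities, paths relative to dir_nm
comf_nm: "/train_complexes.txt"      # training communities
comf_test_nm: "/test_complexes.txt"  # test communities

# Output prefix, relative to dir_nm
out_comp_nm: "/results/res"
```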

An example bash script to run the RL pipeline after the above steps is shown below: This is for hu.MAP complexes
An example bash script to run the RL pipeline after the above steps is shown below, for complexes learned on the human PPI network hu.MAP 1.0:
```
#!/bin/bash
mtype = humap
input_file_name = input_$mtype.yaml
graph_file = hu.MAP_network_experiments/input_data/humap_network_weighted_edge_lists.txt
input_training_file = hu.MAP_network_experiments/intermediate_output_results_data/training_CORUM_complexes_node_lists.txt
input_testing_file = hu.MAP_network_experiments/intermediate_output_results_data/testing_CORUM_complexes_node_lists.txt
out_dir_name = /results_$mtype
train_results = $out_dir_name/train_results
pred_results = $out_dir_name/pred_results
id_map_path = convert_ids/humap_gene_id_name_map.txt

mtype=humap
input_file_name=input_$mtype.yaml
graph_file=hu.MAP_network/input_data/humap_network_weighted_edge_lists.txt
input_training_file=hu.MAP_network/intermediate_data/training_CORUM_complexes_node_lists.txt
input_testing_file=hu.MAP_network/intermediate_data/testing_CORUM_complexes_node_lists.txt
mkdir results_$mtype
out_dir_name=./results_$mtype
train_results=$out_dir_name/train_results
pred_results=$out_dir_name/pred_results
id_map_path=convert_ids/humap_gene_id_name_map.txt

echo Training Algorithm....
python3 functions/main_training.py --input_training_file $input_training_file --graph_file $graph_file --train_results $train_results
python3 main_training.py --input_training_file $input_training_file --graph_file $graph_file --train_results $train_results

echo Predicting new complexes from known communities...
python3 functions/main_prediction.py --graph_file $graph_file --train_results $train_results --out_dir_name $out_dir_name --pred_results $pred_results
python3 main_prediction.py --graph_file $graph_file --train_results $train_results --out_dir_name $out_dir_name --pred_results $pred_results

echo Merging similar communities...
python3 functions/postprocessing.py --input_file_name $input_file_name --graph_file $graph_file --out_dir_name $out_dir_name --pred_results $pred_results --train_results $train_results --input_training_file $input_training_file --input_testing_file $input_testing_file --id_map_path $id_map_path
python3 postprocessing.py --input_file_name $input_file_name --graph_file $graph_file --out_dir_name $out_dir_name --pred_results $pred_results --train_results $train_results --input_training_file $input_training_file --input_testing_file $input_testing_file --id_map_path $id_map_path

echo Comparing predicted and known communities...
python3 functions/eval_complex_RL --input_file_name $input_file_name --input_training_file $input_training_file --input_testing_file $input_testing_file --out_dir_name $out_dir_name
python3 eval_complex_RL.py --input_file_name $input_file_name --input_training_file $input_training_file --input_testing_file $input_testing_file --out_dir_name $out_dir_name --id_name_path $id_map_path

```

# Additional tips:
## Additional tips:
For each of the scripts, optional arguments can be viewed by running: python3 script_name.py --help
Add any desired arguments directly to the command in the terminal.

# References:
M. V. Palukuri, R. S. Patil, and E. M. Marcotte, “Molecular complex detection in protein interaction networks through reinforcement learning.” bioRxiv, p. 2022.06.20.496772. doi: [10.1101/2022.06.20.496772](https://www.biorxiv.org/content/10.1101/2022.06.20.496772v1).

Interactive visualizations of complexes learned by the RL algorithm on two human PPI networks, hu.MAP 1.0 and hu.MAP 2.0 are available here: [https://marcottelab.github.io/RL_humap_prediction/](https://marcottelab.github.io/RL_humap_prediction/)
4 changes: 2 additions & 2 deletions convert_humap_ids2names.py
@@ -184,6 +184,6 @@ def convert2names_wscores(complexes, filename, G, filename_edges, ids_map):
convert_edges_wscore(lines, G, filename_edges, id_name_map)


def convert2names_wscores_matches(complex_matches, filename):
id_name_map = read_gene_id_map()
def convert2names_wscores_matches(complex_matches, filename, id_name_map_path):
id_name_map = read_gene_id_map(id_name_map_path)
convert_nodes_matches_wscore(complex_matches, filename, id_name_map)
38 changes: 19 additions & 19 deletions eval_cmplx_sc.py
@@ -16,11 +16,11 @@



def write_best_matches(best_matches_for_known,out_comp_nm,dir_nm,suffix):
def write_best_matches(best_matches_for_known,out_comp_nm,dir_nm,suffix,id_name_map):

sorted_matches = sorted(best_matches_for_known,key=lambda x: x[2],reverse=True)
if dir_nm == "humap":
convert2names_wscores_matches(sorted_matches, out_comp_nm + suffix + '_known_pred_matches_names.out')
convert2names_wscores_matches(sorted_matches, out_comp_nm + suffix + '_known_pred_matches_names.out',id_name_map)

with open(out_comp_nm + suffix + '_known_pred_matches.out', "w") as fn:
fn_write = fn.write
@@ -127,7 +127,7 @@ def f1_similarity(P,T):
return F1_score, C


def one2one_matches(known_complex_nodes_list, fin_list_graphs, N_pred_comp, N_test_comp,out_comp_nm,suffix,dir_nm):
def one2one_matches(known_complex_nodes_list, fin_list_graphs, N_pred_comp, N_test_comp,out_comp_nm,suffix,dir_nm, id_name_map):

Metric = np_zeros((N_test_comp, N_pred_comp))
Common_nodes = np_zeros((N_test_comp, N_pred_comp))
@@ -174,8 +174,8 @@ def one2one_matches(known_complex_nodes_list, fin_list_graphs, N_pred_comp, N_te
avg_f1_score = (avged_f1_score4known + avged_f1_score4pred)/2
net_f1_score = 2 * avged_f1_score4known * avged_f1_score4pred / (avged_f1_score4known + avged_f1_score4pred)

write_best_matches(best_matches_4known,out_comp_nm,dir_nm,'_best4known' + suffix)
write_best_matches(best_matches_4predicted,out_comp_nm,dir_nm,'_best4predicted' + suffix)
write_best_matches(best_matches_4known,out_comp_nm,dir_nm,'_best4known' + suffix, id_name_map)
write_best_matches(best_matches_4predicted,out_comp_nm,dir_nm,'_best4predicted' + suffix, id_name_map)

prec_MMR, recall_MMR, f1_MMR, max_matching_edges = f1_mmr(Metric)

@@ -295,28 +295,28 @@ def remove_unknown_prots(fin_list_graphs_orig, prot_list):
return fin_list_graphs


def compute_metrics(known_complex_nodes_list, fin_list_graphs,out_comp_nm,N_test_comp,N_pred_comp,inputs,suffix):
def compute_metrics(known_complex_nodes_list, fin_list_graphs,out_comp_nm,N_test_comp,N_pred_comp,inputs,suffix, id_name_map):

if N_test_comp != 0 and N_pred_comp != 0:
Precision, Recall, F1_score = node_comparison_prec_recall(known_complex_nodes_list,fin_list_graphs, N_pred_comp, N_test_comp, inputs["eval_p"],out_comp_nm+suffix)

avg_f1_score, net_f1_score,PPV,Sn,acc_unbiased,prec_MMR, recall_MMR, f1_MMR,n_matches = one2one_matches(known_complex_nodes_list, fin_list_graphs, N_pred_comp, N_test_comp,out_comp_nm,suffix,inputs['dir_nm'])
avg_f1_score, net_f1_score,PPV,Sn,acc_unbiased,prec_MMR, recall_MMR, f1_MMR,n_matches = one2one_matches(known_complex_nodes_list, fin_list_graphs, N_pred_comp, N_test_comp,out_comp_nm,suffix,inputs['dir_nm'], id_name_map)

with open(out_comp_nm + '_metrics.out', "a") as fid:
print("No. of matches in MMR = ", n_matches, file=fid)
print("MMR Precision = %.3f" % prec_MMR, file=fid)
print("MMR Recall = %.3f" % recall_MMR, file=fid)
print("MMR F1 score = %.3f" % f1_MMR, file=fid)
print("Net F1 score = %.3f" % net_f1_score, file=fid)
print("FMM Precision = %.3f" % prec_MMR, file=fid)
print("FMM Recall = %.3f" % recall_MMR, file=fid)
print("FMM F1 score = %.3f" % f1_MMR, file=fid)
print("CMMF = %.3f" % net_f1_score, file=fid)
print("Unbiased PPV = %.3f" % PPV, file=fid)
print("Unbiased Sn = %.3f" % Sn, file=fid)
print("Unbiased accuracy= %.3f" % acc_unbiased, file=fid)
print("Unbiased accuracy (UnSPA)= %.3f" % acc_unbiased, file=fid)
print("Net Averaged F1 score (Average of Precision and Recall based on F1 score) = %.3f" % avg_f1_score, file=fid)
print("Prediction Precision = %.3f" % Precision, file=fid)
print("Prediction Recall = %.3f" % Recall, file=fid)
print("Prediction F1 score = %.3f" % F1_score, file=fid)
print("Qi et al Precision = %.3f" % Precision, file=fid)
print("Qi et al Recall = %.3f" % Recall, file=fid)
print("Qi et al F1 score = %.3f" % F1_score, file=fid)

def eval_complex(rf=0, rf_nm=0, inputs={}, known_complex_nodes_list=[], prot_list=[], fin_list_graphs=[], out_comp_nm = '',suffix="both"):
def eval_complex(rf=0, rf_nm=0, inputs={}, known_complex_nodes_list=[], prot_list=[], fin_list_graphs=[], out_comp_nm = '',suffix="both", id_name_map = ""):
# rf - read flag to read complexes from file
logging_info("Evaluating complexes..." + suffix)
if rf == 1:
@@ -338,7 +338,7 @@ def eval_complex(rf=0, rf_nm=0, inputs={}, known_complex_nodes_list=[], prot_lis
print("No. of Predicted complexes = ", N_pred_comp, file=fid)
print("\n -- Metrics on complexes with all proteins -- ", file=fid)
print(out_comp_nm)
compute_metrics(known_complex_nodes_list, fin_list_graphs, out_comp_nm,N_test_comp,N_pred_comp,inputs,suffix+'_all_prots')
compute_metrics(known_complex_nodes_list, fin_list_graphs, out_comp_nm,N_test_comp,N_pred_comp,inputs,suffix+'_all_prots',id_name_map)

fin_list_graphs = remove_unknown_prots(fin_list_graphs, prot_list)
plot_size_dists(known_complex_nodes_list, fin_list_graphs, sizes_orig, out_comp_nm)
@@ -348,8 +348,8 @@ def eval_complex(rf=0, rf_nm=0, inputs={}, known_complex_nodes_list=[], prot_lis
print("No. of Predicted complexes after removing non-gold std proteins = ", N_pred_comp, file=fid)
print("\n -- Metrics on complexes with only gold std proteins -- ", file=fid)

compute_metrics(known_complex_nodes_list, fin_list_graphs, out_comp_nm,N_test_comp,N_pred_comp,inputs,suffix+'_gold_std_prots')
compute_metrics(known_complex_nodes_list, fin_list_graphs, out_comp_nm,N_test_comp,N_pred_comp,inputs,suffix+'_gold_std_prots', id_name_map)
with open(out_comp_nm + '_metrics.out', "a") as fid:
print("-- Finished writing main metrics -- \n", file=fid)

logging_info("Finished evaluating basic metrics for complexes " + suffix)
logging_info("Finished evaluating basic metrics for complexes " + suffix)
10 changes: 5 additions & 5 deletions eval_complex_RL.py
@@ -1,8 +1,7 @@
from argparse import ArgumentParser as argparse_ArgumentParser, ArgumentParser
from pickle import load as pickle_load
from yaml import load as yaml_load, dump as yaml_dump, Loader as yaml_Loader
from eval_cmplx_sc import eval_complex
from eval_cmplx_sc import remove_unknown_prots
from eval_cmplx_sc import eval_complex, remove_unknown_prots
from main6_eval import run_metrics
import os
def main():
@@ -13,6 +13,7 @@ def main():
parser.add_argument("--input_testing_file", default="", help="Testing Graph file path")
parser.add_argument("--out_dir_name", default="", help="Output directory name")
parser.add_argument("--evaluate_additional_metrics", default=1, help="complexes file name")
parser.add_argument("--id_name_path", default="", help="Path for id to gene name file")
args = parser.parse_args()
print(args.input_file_name)
with open(args.input_file_name, 'r') as f:
@@ -52,7 +52,7 @@ def main():
# Remove all proteins in Predicted complexes that are not present in known complex protein list
fin_list_graphs = remove_unknown_prots(fin_list_graphs_orig, prot_list)
suffix = ''
eval_complex(0, 0, inputs, known_complex_nodes_list, prot_list, fin_list_graphs, out_comp_nm, suffix="_train")
eval_complex(0, 0, inputs, known_complex_nodes_list, prot_list, fin_list_graphs, out_comp_nm, suffix="_train", id_name_map = args.id_name_path)
if args.evaluate_additional_metrics:
try:
run_metrics(known_complex_nodes_list, fin_list_graphs, out_comp_nm, "_train")
@@ -75,7 +75,7 @@ def main():
# Remove all proteins in Predicted complexes that are not present in known complex protein list
fin_list_graphs = remove_unknown_prots(fin_list_graphs_orig, prot_list)
suffix = ''
eval_complex(0, 0, inputs, known_complex_nodes_list, prot_list, fin_list_graphs, out_comp_nm,suffix="_train")
eval_complex(0, 0, inputs, known_complex_nodes_list, prot_list, fin_list_graphs, out_comp_nm,suffix="_train", id_name_map = args.id_name_path)

if args.evaluate_additional_metrics:
try:
@@ -109,7 +109,7 @@ def main():
N_pred_comp = len(fin_list_graphs)
suffix = ''

eval_complex(0, 0, inputs, known_complex_nodes_list, prot_list, fin_list_graphs, out_comp_nm,suffix="_train")
eval_complex(0, 0, inputs, known_complex_nodes_list, prot_list, fin_list_graphs, out_comp_nm,suffix="_train", id_name_map = args.id_name_path)

if args.evaluate_additional_metrics:
try:
Binary file removed hu.MAP_network/.DS_Store
Binary file not shown.
24 changes: 24 additions & 0 deletions humap_steps.sh
@@ -0,0 +1,24 @@
#!/bin/bash

mtype=humap
input_file_name=input_$mtype.yaml
graph_file=hu.MAP_network/input_data/humap_network_weighted_edge_lists.txt
input_training_file=hu.MAP_network/intermediate_data/training_CORUM_complexes_node_lists.txt
input_testing_file=hu.MAP_network/intermediate_data/testing_CORUM_complexes_node_lists.txt
mkdir results_$mtype
out_dir_name=./results_$mtype
train_results=$out_dir_name/train_results
pred_results=$out_dir_name/pred_results
id_map_path=convert_ids/humap_gene_id_name_map.txt

echo Training Algorithm....
python3 main_training.py --input_training_file $input_training_file --graph_file $graph_file --train_results $train_results

echo Predicting new complexes from known communities...
python3 main_prediction.py --graph_file $graph_file --train_results $train_results --out_dir_name $out_dir_name --pred_results $pred_results

echo Merging similar communities...
python3 postprocessing.py --input_file_name $input_file_name --graph_file $graph_file --out_dir_name $out_dir_name --pred_results $pred_results --train_results $train_results --input_training_file $input_training_file --input_testing_file $input_testing_file --id_map_path $id_map_path

echo Comparing predicted and known communities...
python3 eval_complex_RL.py --input_file_name $input_file_name --input_training_file $input_training_file --input_testing_file $input_testing_file --out_dir_name $out_dir_name --id_name_path $id_map_path
32 changes: 0 additions & 32 deletions input_humap.yaml
@@ -6,43 +6,11 @@ comf_nm: "/res_train_complexes_new_73_more.txt"
comf_test_nm: "/res_test_complexes_new_73_more.txt" # Make sure no extra rows are present
comf_nm_all: "/all_complexes.txt"
out_comp_nm: "/results_qi0.325/res"
scale_factor: 10 # Number of times negatives should be higher than positives
use_full: 1
split_flag: 0
fact: 0.99
perc_transfer: 0.275
mode: non_gen # gen means only feature extraction, non_gen is all
# -------------------Training parameters--------------------------------
feats: 6

classifier_file: "humap/results_73_neg_unif_10x/res_classifiers_new.txt" # or remove new - CHECK
model_type: "tpot" # Options: tpot, NN
train_feat_mat: "humap/results_73_neg_unif_10x/res_train_dat.csv"
test_feat_mat: "humap/results_73_neg_unif_10x/res_test_dat.csv"
model_name: "tpot_select" #Options: FF_1hidden, log_reg, SVM, rand_forest, extra_trees, estimator_SVM
model_dir: "/results_73_neg_unif_10x/res"
# --------------------Search parameters ------------------------------
seed_mode: "all_nodes" # Options:all_nodes_known_comp, all_nodes, n_nodes,cliques
num_comp: 5 # Options: 10, 7778, 1500 - only for n_nodes mode
classi_thresh: 0.5

run_mode: "parallel" # Options: serial, parallel
max_size_thres: 11
search_method: "isa" # isa, metropolis, search_top_neigs, search_max_neig

# All methods except max_neig
use_all_neigs: 1
thres_neig: 30 # Maximum number of neighbors sampled for checking
min_thres_neig_sorted: 100 # Threshold above which only a percentage of neigs are considered as per sorted weights
perc: 0.7 # Percentage of neighbors to check for adding new node
explore_prob: 0.01 # use 0.1 for top_neigs

# Metropolis algorithm params
prob_metropolis: 0.1

# ISA params
T0: 1.75
alpha: 0.005

over_t: 0.325 # Overlap threshold = 0.7/0.9
overlap_method: "qi" # testing_qi_0.3 or 1
34 changes: 0 additions & 34 deletions input_toy.yaml
@@ -3,44 +3,10 @@ dir_nm: "toy_network" # Options: toy_network, toy_network_old, humap, humap2
sep: " " # Options: " ", "\t"
out_comp_nm: "/results/res"
split_flag: 0
fact: 0.7
perc_transfer: 0.2
use_full: 1
scale_factor: 1.1 # Number of times negatives should be higher than positives
mode: non_gen # gen means only feature extraction, non_gen is all
# -------------------Training parameters--------------------------------
feats: 6

model_type: "tpot" # Options: tpot, NN
train_feat_mat: "toy_network/results_train_dat.csv"
test_feat_mat: "toy_network/results_train_dat.csv"
model_name: "SVM" #Options: FF_1hidden, log_reg, SVM, rand_forest, extra_trees
# humap with separted train and test sets - tpot result - extra_trees

model_dir: "/results/res"
# --------------------Search parameters ------------------------------
seed_mode: "all_nodes" # Options:all_nodes_known_comp, all_nodes, n_nodes, cliques
num_comp: 40 # Options: 10, 7778, 1500 - only for n_nodes mode

run_mode: "parallel" # Options: serial, parallel
max_size_thres: 50

search_method: "search_top_neigs" # isa, metropolis, search_top_neigs, search_max_neig

# All methods except search_max_neig
# No. of neighbors considered params
use_all_neigs: 1
thres_neig: 30 # Maximum number of neighbors sampled for checking
min_thres_neig_sorted: 30 # Threshold above which only a percentage of neigs are considered as per sorted weights
perc: 0.7 # Percentage of neighbors to check for adding new node

explore_prob: 0.01 # use 0.1 for top_neigs
# Metropolis algorithm params
prob_metropolis: 0.1

# ISA params
T0: 0.88
alpha: 1.8

over_t: 0.1 # Overlap threshold = 0.7/0.9
infer_overlap_threshold: "y"