Fix bug
- ALO headers in cluster_metrics and cafe are in wrong order
DRL committed Dec 30, 2016
1 parent dbd389d commit f6b821f
Showing 4 changed files with 461 additions and 65 deletions.
kinfin.py: 8 changes (4 additions, 4 deletions)
@@ -709,10 +709,10 @@ def get_header_line(self, filetype, attribute):
cluster_metrics_header.append("attribute_cluster_type")
cluster_metrics_header.append("protein_span_mean")
cluster_metrics_header.append("protein_span_sd")
cluster_metrics_header += ["%s_count" % level for level in aloCollection.ALO_by_level_by_attribute[attribute]]
cluster_metrics_header += ["%s_count" % level for level in sorted(aloCollection.ALO_by_level_by_attribute[attribute])]
if not attribute == "PROTEOME":
cluster_metrics_header += ["%s_median" % level for level in aloCollection.ALO_by_level_by_attribute[attribute]]
cluster_metrics_header += ["%s_cov" % level for level in aloCollection.ALO_by_level_by_attribute[attribute]]
cluster_metrics_header += ["%s_median" % level for level in sorted(aloCollection.ALO_by_level_by_attribute[attribute])]
cluster_metrics_header += ["%s_cov" % level for level in sorted(aloCollection.ALO_by_level_by_attribute[attribute])]
return "\t".join(cluster_metrics_header)
elif filetype == "cluster_metrics_domains":
cluster_metrics_domains_header = []
@@ -742,7 +742,7 @@ def get_header_line(self, filetype, attribute):
cafe_header = []
cafe_header.append("Description")
cafe_header.append("ID")
-            for level in aloCollection.ALO_by_level_by_attribute['PROTEOME']:
+            for level in sorted(aloCollection.ALO_by_level_by_attribute['PROTEOME']):
cafe_header.append(level)
return "\t".join(cafe_header)
elif filetype == "pairwise_representation_test":
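All three hunks apply the same fix: wrapping the level iteration in sorted() so that per-level columns come out in a deterministic, alphabetical order. A minimal sketch of the failure mode, with hypothetical attribute levels (in Python 2.7, plain dict iteration order is arbitrary rather than alphabetical, so headers built from an unsorted iteration could disagree with value columns emitted in sorted order elsewhere):

    levels = {'nematoda': [], 'outgroup': [], 'platyhelminthes': []}  # hypothetical ALO_by_level_by_attribute[attribute]
    print ["%s_count" % level for level in levels]          # arbitrary, hash-dependent order
    print ["%s_count" % level for level in sorted(levels)]  # ['nematoda_count', 'outgroup_count', 'platyhelminthes_count']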
src/compare_across_IVs.py: 230 changes (230 additions, 0 deletions)
@@ -0,0 +1,230 @@
#!/usr/bin/env python
# -*- coding: utf-8 -*-

"""
usage: compare_across_IVs.py -i <FILE> [-o <STRING>]
[-h|--help]
Options:
-h --help show this
-i, --infile <FILE> Infile
-o, --outprefix <STRING> Output prefix
"""

import sys
from docopt import docopt
from os.path import exists

def parse_groups(group_f, headers=None, clusters=None):
    # headers: dict of protein_id -> clusterID (or None), clusters: dict of
    # clusterID -> clusterID (or None); one of the two lookups must be supplied
    output = {}
    with open(group_f) as group_fh:
        for line in group_fh:
            clusterID, protein_string = line.rstrip("\n").split(": ")
            proteins = protein_string.split(" ")
            if headers:
                for protein in proteins:
                    if protein in headers:
                        if headers[protein] is None:
                            headers[protein] = clusterID
                            output[clusterID] = proteins
                        else:
                            sys.exit("[-] protein %s found more than once" % protein)
            else:
                if clusterID in clusters:
                    if clusters[clusterID] is None:
                        clusters[clusterID] = clusterID
                        output[clusterID] = proteins
                    else:
                        sys.exit("[-] cluster %s found more than once" % clusterID)
    return output

def read_file(infile):
if not infile or not exists(infile):
sys.exit("[ERROR] - File '%s' does not exist." % (infile))
if infile.endswith(".gz"):
import gzip
with gzip.open(infile) as fh:
for line in fh:
yield line.rstrip("\n")
else:
with open(infile) as fh:
for line in fh:
yield line.rstrip("\n")

class BarChartObj():
def __init__(self, IV):
self.IV = IV
self.proteins_conserved = {'non-singletons-split' : set(), 'non-singletons' : set(), 'singletons' : set()}
self.proteins_non_conserved = {'non-singletons-split' : set(), 'non-singletons' : set(), 'singletons' : set()}
self.proteins_singletons = {'non-singletons-split' : set(), 'non-singletons' : set(), 'singletons' : set()}


class DataCollection():
def __init__(self):
self.clusterCollections = []
self.protein_ids_by_hierarchality_pattern = {} # tuples of hierarchality
self.hierarchality_pattern_by_protein_id = {}
self.hierarchality_pattern_by_protein_ids = {}
self.protein_ids_trajectory = {}
self.cluster_ids_trajectory = {}


    def add_to_hierarchy(self, protein_ids_hierarchy, protein_ids):
        # register protein_ids in the hierarchy lookup on first sighting
        if protein_ids not in protein_ids_hierarchy:
            protein_ids_hierarchy[protein_ids] = []

def add_clusterCollection(self, clusterCollection):
barChartObj = BarChartObj(clusterCollection.IV)

cluster_counts_by_split = {'conserved': {'singleton': 0, 'non_singleton' : 0}, 'hierarchical' : {'singleton': 0, 'non_singleton' : 0}, 'non_hierarchical' : 0}
protein_counts_by_split = {'conserved': {'singleton': 0, 'non_singleton' : 0}, 'hierarchical' : {'singleton': 0, 'non_singleton' : 0}, 'non_hierarchical' : 0}
cluster_ids_by_split = {'conserved': {'singleton': set(), 'non_singleton' : set()}, 'hierarchical' : {'singleton': set(), 'non_singleton' : set()}, 'non_hierarchical' : set()}
protein_ids_by_split = {'conserved': {'singleton': set(), 'non_singleton' : set()}, 'hierarchical' : {'singleton': set(), 'non_singleton' : set()}, 'non_hierarchical' : set()}
        if self.clusterCollections == []:
            # first clustering: everything counts as conserved
            self.clusterCollections.append(clusterCollection)
            cluster_counts_by_split['conserved']['singleton'] += clusterCollection.cluster_singleton_count
            cluster_counts_by_split['conserved']['non_singleton'] += clusterCollection.cluster_non_singleton_count
            protein_counts_by_split['conserved']['singleton'] += clusterCollection.protein_singleton_count
            protein_counts_by_split['conserved']['non_singleton'] += clusterCollection.protein_non_singleton_count
        else:
            parent_clusterCollection = self.clusterCollections[-1]
            child_clusterCollection = clusterCollection
            non_hierarchical_cluster_ids = set()
            non_hierarchical_cluster_count = 0
            non_hierarchical_split_count = 0
            hierarchical_cluster_count = 0
            hierarchical_split_count = 0

for parent_protein_ids, prev_cluster_id in parent_clusterCollection.cluster_id_by_protein_ids.items():
# for each previous cluster
if parent_protein_ids in child_clusterCollection.cluster_id_by_protein_ids:
# if cluster is present in current clustering => conserved, no-split
split = 'None'
else:
# cluster is split somehow: hierarchical (singleton/non-singleton), non-hierarchical (non-singleton)
protein_ids_split_in_singleton = []
protein_ids_split_in_non_singleton = []
for parent_protein_id in parent_protein_ids:
# for each protein in previous cluster
                        if frozenset([parent_protein_id]) in child_clusterCollection.cluster_id_by_protein_ids:
                            # protein now forms a singleton cluster in the child clustering
                            protein_ids_split_in_singleton.append(parent_protein_id)
else:
# not a singleton
protein_ids_split_in_non_singleton.append(parent_protein_id)
if protein_ids_split_in_non_singleton:
non_singleton_clusters = set()
for protein_id in protein_ids_split_in_non_singleton:
non_singleton_clusters.add(clusterCollection.cluster_id_by_protein_id[protein_id])
for non_singleton_cluster in non_singleton_clusters:
# check if additional proteins are present
                            protein_ids_in_non_singleton_cluster = set(clusterCollection.clusterObjs_by_cluster_id[non_singleton_cluster].protein_ids)
                            additional_protein_ids = protein_ids_in_non_singleton_cluster.difference(parent_protein_ids)
if additional_protein_ids:
if not non_singleton_cluster in non_hierarchical_cluster_ids:
non_hierarchical_cluster_count += 1
non_hierarchical_cluster_ids.add(non_singleton_cluster)
non_hierarchical_split_count += 1
else:
hierarchical_cluster_count += 1
hierarchical_split_count += 1

"non-hierarchical"




                    else:
                        # previous cluster split entirely into singletons
                        if len(parent_protein_ids) == 1:
                            hierarchality = "split_singleton"
                        else:
                            hierarchality = "split_non_singleton"
                        for protein_id in parent_protein_ids:
                            if protein_id not in self.hierarchality_pattern_by_protein_id:
                                self.hierarchality_pattern_by_protein_id[protein_id] = []
                            self.hierarchality_pattern_by_protein_id[protein_id].append(hierarchality)
                        if parent_protein_ids not in self.hierarchality_pattern_by_protein_ids:
                            self.hierarchality_pattern_by_protein_ids[parent_protein_ids] = []
                        self.hierarchality_pattern_by_protein_ids[parent_protein_ids].append(hierarchality)
self.clusterCollections.append(clusterCollection)

def generate_output(self):
for protein_ids, hierarchality_patterns in self.hierarchality_pattern_by_protein_ids.items():
print "proteins: %s" % (len(protein_ids)), hierarchality_patterns

class ClusterObj():
def __init__(self, IV, cluster_id, protein_ids):
self.IV = IV
self.cluster_id = cluster_id
self.protein_ids = protein_ids
self.protein_count = len(protein_ids)
        self.singleton = len(protein_ids) == 1
self.validated = False

class ClusterCollection():
def __init__(self, IV, cluster_f):
self.IV = IV
self.cluster_f = cluster_f

self.cluster_id_by_protein_id = {}
self.cluster_id_by_protein_ids = {}
self.clusterObjs_by_cluster_id = {}

self.cluster_ids_by_cluster_status = {}

self.cluster_count = 0
self.protein_count = 0
self.cluster_singleton_count = 0
self.protein_singleton_count = 0
self.cluster_non_singleton_count = 0
self.protein_non_singleton_count = 0

def parse_cluster_f(self):
print "[+] Parsing cluster file %s ..." % (self.cluster_f)
for line in read_file(self.cluster_f):
cluster_id_string, protein_string = line.rstrip("\n").split(": ")
cluster_id = "%s:%s" % (self.IV, cluster_id_string)
protein_ids = protein_string.split(" ")
clusterObj = ClusterObj(self.IV, cluster_id, protein_ids)
# count
            if clusterObj.singleton:
                self.cluster_singleton_count += 1
                self.protein_singleton_count += clusterObj.protein_count
            else:
                self.cluster_non_singleton_count += 1
                self.protein_non_singleton_count += clusterObj.protein_count
            self.cluster_count += 1
            self.protein_count += clusterObj.protein_count
# add
for protein_id in clusterObj.protein_ids:
self.cluster_id_by_protein_id[protein_id] = clusterObj.cluster_id
self.cluster_id_by_protein_ids[frozenset(clusterObj.protein_ids)] = clusterObj.cluster_id
self.clusterObjs_by_cluster_id[clusterObj.cluster_id] = clusterObj

class InputObj():
def __init__(self, input_f):
self.input_file_by_IV = self.parse_input_f(input_f)

def parse_input_f(self, input_f):
print "[+] Parsing input file %s ..." % (input_f)
input_file_by_IV = {}
for line in read_file(input_f):
temp = line.split()
IV = temp[0]
cluster_f = temp[1]
input_file_by_IV[IV] = cluster_f
return input_file_by_IV

if __name__ == "__main__":
__version__ = 0.1
args = docopt(__doc__)

input_f = args['--infile']
out_prefix = args['--outprefix']

inputObj = InputObj(input_f)
print "[+] Start ..."
dataCollection = DataCollection()
for IV, cluster_f in sorted(inputObj.input_file_by_IV.items()):
clusterCollection = ClusterCollection(IV, cluster_f)
clusterCollection.parse_cluster_f()
dataCollection.add_clusterCollection(clusterCollection)
dataCollection.generate_output()
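A hypothetical invocation, for orientation (file names and contents are illustrative): the -i file is whitespace-delimited, mapping each inflation value (IV) to a cluster file, and each cluster file uses the "clusterID: protein protein ..." format that parse_cluster_f() expects:

    $ cat iv_files.txt
    IV1.5   clusters.I15.txt
    IV2.0   clusters.I20.txt
    $ cat clusters.I15.txt
    0: speciesA.p1 speciesA.p2 speciesB.p1
    1: speciesB.p2
    $ python src/compare_across_IVs.py -i iv_files.txt -o test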
src/create_network.py: 82 changes (21 additions, 61 deletions)
@@ -4,6 +4,7 @@
"""
usage: create_network.py -c <FILE> -s <FILE> -a <STR>
[-n <STR>] [-t <FLOAT>] [-e <STR>]
+                         [--colour_f <FILE>]
[-o <STR>] [-d]
[-h|--help]
@@ -17,6 +18,7 @@
-t, --weight_threshold <FLOAT> Minimal weight threshold (default: 0.0)
-e, --exclude_proteome_ids <STR>    Proteomes to be excluded (separated by ",")
-d, --draw_all_edges Draw all edges (as opposed to only filtered ones)
+    --colour_f <FILE>               CSV file containing colours of attribute levels
-o, --out_prefix <STR> Outprefix (default: graph)
"""
@@ -66,37 +68,6 @@ def read_file(f):
for line in fh:
yield line.rstrip("\n")

-def pie(ax, values, **kwargs):
-    total = sum(values)
-    def formatter(pct):
-        return '{:0.0f}\n({:0.1f}%)'.format(pct*total/100, pct)
-    wedges, _, labels = ax.pie(values, autopct=formatter, **kwargs)
-    return wedges
-
-def plot_icons():
-    fig, ax = plt.subplots()
-    ax.axis('equal')
-
-    width = 0.35
-    kwargs = dict(colors=['#66FF66', '#9999FF', '#FF9999'], startangle=90)
-
-    outside = pie(ax, [96, 124, 88], radius=1, pctdistance=1-width/2, **kwargs)
-    inside = pie(ax, [45, 87, 77], radius=1-width,
-                 pctdistance=1 - (width/2) / (1-width), **kwargs)
-    plt.setp(inside + outside, width=width, edgecolor='white')
-
-    ax.legend(inside[::-1], ['Hardware', 'Software', 'Services'], frameon=False)
-
-    kwargs = dict(size=13, color='white', va='center', fontweight='bold')
-    ax.text(0, 0, 'Year 2005', ha='center',
-            bbox=dict(boxstyle='round', facecolor='blue', edgecolor='none'),
-            **kwargs)
-    ax.annotate('Year 2006', (0, 0), xytext=(np.radians(-45), 1.1),
-                bbox=dict(boxstyle='round', facecolor='green', edgecolor='none'),
-                textcoords='polar', ha='left', **kwargs)
-
-    plt.show()

class ProteomeObj():
def __init__(self, proteome_id, label, colour, idx):
self.proteome_id = proteome_id
@@ -209,21 +180,6 @@ def generate_edges(normalisation, weight_by_positional_edges):
pass
return edges_positional

-def binomial(n,k):
-    """
-    A fast way to calculate binomial coefficients by Andrew Dalke (contrib).
-    """
-    if 0 <= k <= n:
-        ntok = 1
-        ktok = 1
-        for t in xrange(1, min(k, n - k) + 1):
-            ntok *= n
-            ktok *= t
-            n -= 1
-        return ntok // ktok
-    else:
-        return 0


def generate_proteomeObjs():
proteomeObj_by_proteome_id = {}
@@ -245,20 +201,6 @@ def check_normalisation(normalisation_algo):
def check_exclude_proteome_ids(exclude_proteome_ids):
return set([proteome_id for proteome_id in exclude_proteome_ids.split(",")])

-def santisise_args(args):
-    sane_args = {}
-    sane_args['--cluster_stats'] = check_file(args['--cluster_stats'])
-    sane_args['--species_classification'] = check_file(args['--species_classification'])
-    sane_args['--attribute'] = args['--attribute']
-    sane_args['--normalisation'] = check_normalisation(args['--normalisation'])
-    sane_args['--weight_threshold'] = float(args['--weight_threshold'])
-    if args['--exclude_proteome_ids']:
-        sane_args['--exclude_proteome_ids'] = check_exclude_proteome_ids(args['--exclude_proteome_ids'])
-    else:
-        sane_args['--exclude_proteome_ids'] = set()
-    sane_args['--draw_all_edges'] = args['--draw_all_edges']
-    sane_args['--out_prefix'] = args['--out_prefix']
-    return sane_args

def generate_outpath_by_name(target_attribute, normalisation, weight_threshold, out_prefix):
outpath_by_name = {}
@@ -458,6 +400,22 @@ def filter_edges(edges_positional, weight_threshold, edge_count):
edge_count['excluded_weight'] = edge_count.get('excluded_weight', 0) + 1
return edges_positional_filtered, edge_count

+def santisise_args(args):
+    sane_args = {}
+    sane_args['--cluster_stats'] = check_file(args['--cluster_stats'])
+    sane_args['--colour_f'] = check_file(args['--colour_f'])
+    sane_args['--species_classification'] = check_file(args['--species_classification'])
+    sane_args['--attribute'] = args['--attribute']
+    sane_args['--normalisation'] = check_normalisation(args['--normalisation'])
+    sane_args['--weight_threshold'] = float(args['--weight_threshold'])
+    if args['--exclude_proteome_ids']:
+        sane_args['--exclude_proteome_ids'] = check_exclude_proteome_ids(args['--exclude_proteome_ids'])
+    else:
+        sane_args['--exclude_proteome_ids'] = set()
+    sane_args['--draw_all_edges'] = args['--draw_all_edges']
+    sane_args['--out_prefix'] = args['--out_prefix']
+    return sane_args

if __name__ == "__main__":
'''
IDEAS:
@@ -470,7 +428,7 @@
src/create_network.py -c ~/Dropbox/projects/manuscripts/50helminths/results/50helminth.20161115.master.kinfin_results/PROTEOME/PROTEOME.cluster_stats.txt -s ~/Dropbox/projects/manuscripts/50helminths/results/50helminth.20161115.master.kinfin_results/PROTEOME/PROTEOME.cluster_stats.txt -a superphylum -n "max_weight" -t 0.30 -o test -d
'''
-    __version__ = 0.1
+    __version__ = 0.2

NORMALISATION_ALGOS = set(["max_weight", "none"])
FIGSIZE = (16,16)
@@ -485,11 +443,13 @@
exclude_proteome_ids = sane_args['--exclude_proteome_ids']
draw_all_edges = sane_args['--draw_all_edges']
out_prefix = sane_args['--out_prefix']
+    colour_f = sane_args['--colour_f']

results = {}
outpath_by_name = generate_outpath_by_name(target_attribute, normalisation, weight_threshold, out_prefix)

proteome_ids, label_by_proteome_id, proteome_ids_by_label = parse_species_classification(species_classification_f)
+    colour_by_proteome_id = parse_colour_f(colour_f)
results['proteome_count'] = len(proteome_ids)
results['labels'] = ",".join(["%s (%s)" % (label, len(proteomes)) for label, proteomes in proteome_ids_by_label.items()])
proteomeObj_by_proteome_id = generate_proteomeObjs()
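parse_colour_f() is called above but sits outside the hunks shown in this diff. A minimal sketch of a compatible implementation, assuming the colour file is a headerless two-column CSV mapping a proteome ID (or attribute level) to a colour; the format is an assumption, not confirmed by this commit:

    def parse_colour_f(colour_f):
        # hypothetical format: one 'proteome_id,#rrggbb' pair per line
        colour_by_proteome_id = {}
        for line in read_file(colour_f):
            proteome_id, colour = line.split(",")
            colour_by_proteome_id[proteome_id] = colour
        return colour_by_proteome_id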