Fix bug
- ALO headers in cluster_metrics and cafe are in wrong order
DRL committed Dec 30, 2016
1 parent dbd389d commit f6b821f
Showing 4 changed files with 461 additions and 65 deletions.
kinfin.py: 8 changes (4 additions, 4 deletions)
@@ -709,10 +709,10 @@ def get_header_line(self, filetype, attribute):
cluster_metrics_header.append("attribute_cluster_type")
cluster_metrics_header.append("protein_span_mean")
cluster_metrics_header.append("protein_span_sd")
cluster_metrics_header += ["%s_count" % level for level in aloCollection.ALO_by_level_by_attribute[attribute]]
cluster_metrics_header += ["%s_count" % level for level in sorted(aloCollection.ALO_by_level_by_attribute[attribute])]
if not attribute == "PROTEOME":
cluster_metrics_header += ["%s_median" % level for level in aloCollection.ALO_by_level_by_attribute[attribute]]
cluster_metrics_header += ["%s_cov" % level for level in aloCollection.ALO_by_level_by_attribute[attribute]]
cluster_metrics_header += ["%s_median" % level for level in sorted(aloCollection.ALO_by_level_by_attribute[attribute])]
cluster_metrics_header += ["%s_cov" % level for level in sorted(aloCollection.ALO_by_level_by_attribute[attribute])]
return "\t".join(cluster_metrics_header)
elif filetype == "cluster_metrics_domains":
cluster_metrics_domains_header = []
@@ -742,7 +742,7 @@ def get_header_line(self, filetype, attribute):
cafe_header = []
cafe_header.append("Description")
cafe_header.append("ID")
-            for level in aloCollection.ALO_by_level_by_attribute['PROTEOME']:
+            for level in sorted(aloCollection.ALO_by_level_by_attribute['PROTEOME']):
cafe_header.append(level)
return "\t".join(cafe_header)
elif filetype == "pairwise_representation_test":
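All three hunks apply the same fix: wrapping the level iteration in sorted() so that per-level columns come out in a deterministic, alphabetical order. A minimal sketch of the failure mode, with hypothetical attribute levels (in Python 2.7, plain dict iteration order is arbitrary rather than alphabetical, so headers built from an unsorted iteration could disagree with value columns emitted in sorted order elsewhere):

    levels = {'nematoda': [], 'outgroup': [], 'platyhelminthes': []}  # hypothetical ALO_by_level_by_attribute[attribute]
    print ["%s_count" % level for level in levels]          # arbitrary, hash-dependent order
    print ["%s_count" % level for level in sorted(levels)]  # ['nematoda_count', 'outgroup_count', 'platyhelminthes_count']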
src/compare_across_IVs.py: 230 changes (230 additions, 0 deletions)
@@ -0,0 +1,230 @@
#!/usr/bin/env python
# -*- coding: utf-8 -*-

"""
usage: compare_across_IVs.py -i <FILE> [-o <STRING>]
[-h|--help]
Options:
-h --help show this
-i, --infile <FILE> Infile
-o, --outprefix <STRING> Output prefix
"""

import sys
from docopt import docopt
from os.path import exists

def parse_groups(group_f, headers=None, clusters=None):
    # headers: dict of protein_id -> clusterID (or None), clusters: dict of
    # clusterID -> clusterID (or None); one of the two lookups must be supplied
    output = {}
    with open(group_f) as group_fh:
        for line in group_fh:
            clusterID, protein_string = line.rstrip("\n").split(": ")
            proteins = protein_string.split(" ")
            if headers:
                for protein in proteins:
                    if protein in headers:
                        if headers[protein] is None:
                            headers[protein] = clusterID
                            output[clusterID] = proteins
                        else:
                            sys.exit("[-] protein %s found more than once" % protein)
            else:
                if clusterID in clusters:
                    if clusters[clusterID] is None:
                        clusters[clusterID] = clusterID
                        output[clusterID] = proteins
                    else:
                        sys.exit("[-] cluster %s found more than once" % clusterID)
    return output

def read_file(infile):
if not infile or not exists(infile):
sys.exit("[ERROR] - File '%s' does not exist." % (infile))
if infile.endswith(".gz"):
import gzip
with gzip.open(infile) as fh:
for line in fh:
yield line.rstrip("\n")
else:
with open(infile) as fh:
for line in fh:
yield line.rstrip("\n")

class BarChartObj():
def __init__(self, IV):
self.IV = IV
self.proteins_conserved = {'non-singletons-split' : set(), 'non-singletons' : set(), 'singletons' : set()}
self.proteins_non_conserved = {'non-singletons-split' : set(), 'non-singletons' : set(), 'singletons' : set()}
self.proteins_singletons = {'non-singletons-split' : set(), 'non-singletons' : set(), 'singletons' : set()}


class DataCollection():
def __init__(self):
self.clusterCollections = []
self.protein_ids_by_hierarchality_pattern = {} # tuples of hierarchality
self.hierarchality_pattern_by_protein_id = {}
self.hierarchality_pattern_by_protein_ids = {}
self.protein_ids_trajectory = {}
self.cluster_ids_trajectory = {}


    def add_to_hierarchy(self, protein_ids_hierarchy, protein_ids):
        # register protein_ids in the hierarchy lookup on first sighting
        if protein_ids not in protein_ids_hierarchy:
            protein_ids_hierarchy[protein_ids] = []

def add_clusterCollection(self, clusterCollection):
barChartObj = BarChartObj(clusterCollection.IV)

cluster_counts_by_split = {'conserved': {'singleton': 0, 'non_singleton' : 0}, 'hierarchical' : {'singleton': 0, 'non_singleton' : 0}, 'non_hierarchical' : 0}
protein_counts_by_split = {'conserved': {'singleton': 0, 'non_singleton' : 0}, 'hierarchical' : {'singleton': 0, 'non_singleton' : 0}, 'non_hierarchical' : 0}
cluster_ids_by_split = {'conserved': {'singleton': set(), 'non_singleton' : set()}, 'hierarchical' : {'singleton': set(), 'non_singleton' : set()}, 'non_hierarchical' : set()}
protein_ids_by_split = {'conserved': {'singleton': set(), 'non_singleton' : set()}, 'hierarchical' : {'singleton': set(), 'non_singleton' : set()}, 'non_hierarchical' : set()}
        if self.clusterCollections == []:
            # first clustering: everything counts as conserved
            self.clusterCollections.append(clusterCollection)
            cluster_counts_by_split['conserved']['singleton'] += clusterCollection.cluster_singleton_count
            cluster_counts_by_split['conserved']['non_singleton'] += clusterCollection.cluster_non_singleton_count
            protein_counts_by_split['conserved']['singleton'] += clusterCollection.protein_singleton_count
            protein_counts_by_split['conserved']['non_singleton'] += clusterCollection.protein_non_singleton_count
        else:
            parent_clusterCollection = self.clusterCollections[-1]
            child_clusterCollection = clusterCollection
            non_hierarchical_cluster_ids = set()
            non_hierarchical_cluster_count = 0
            non_hierarchical_split_count = 0
            hierarchical_cluster_count = 0
            hierarchical_split_count = 0

for parent_protein_ids, prev_cluster_id in parent_clusterCollection.cluster_id_by_protein_ids.items():
# for each previous cluster
if parent_protein_ids in child_clusterCollection.cluster_id_by_protein_ids:
# if cluster is present in current clustering => conserved, no-split
split = 'None'
else:
# cluster is split somehow: hierarchical (singleton/non-singleton), non-hierarchical (non-singleton)
protein_ids_split_in_singleton = []
protein_ids_split_in_non_singleton = []
for parent_protein_id in parent_protein_ids:
# for each protein in previous cluster
                        if frozenset([parent_protein_id]) in child_clusterCollection.cluster_id_by_protein_ids:
                            # protein now forms a singleton cluster in the child clustering
                            protein_ids_split_in_singleton.append(parent_protein_id)
else:
# not a singleton
protein_ids_split_in_non_singleton.append(parent_protein_id)
if protein_ids_split_in_non_singleton:
non_singleton_clusters = set()
for protein_id in protein_ids_split_in_non_singleton:
non_singleton_clusters.add(clusterCollection.cluster_id_by_protein_id[protein_id])
for non_singleton_cluster in non_singleton_clusters:
# check if additional proteins are present
                            protein_ids_in_non_singleton_cluster = set(clusterCollection.clusterObjs_by_cluster_id[non_singleton_cluster].protein_ids)
                            additional_protein_ids = protein_ids_in_non_singleton_cluster.difference(parent_protein_ids)
if additional_protein_ids:
if not non_singleton_cluster in non_hierarchical_cluster_ids:
non_hierarchical_cluster_count += 1
non_hierarchical_cluster_ids.add(non_singleton_cluster)
non_hierarchical_split_count += 1
else:
hierarchical_cluster_count += 1
hierarchical_split_count += 1

"non-hierarchical"




                    else:
                        # previous cluster split entirely into singletons
                        if len(parent_protein_ids) == 1:
                            hierarchality = "split_singleton"
                        else:
                            hierarchality = "split_non_singleton"
                        for protein_id in parent_protein_ids:
                            if protein_id not in self.hierarchality_pattern_by_protein_id:
                                self.hierarchality_pattern_by_protein_id[protein_id] = []
                            self.hierarchality_pattern_by_protein_id[protein_id].append(hierarchality)
                        if parent_protein_ids not in self.hierarchality_pattern_by_protein_ids:
                            self.hierarchality_pattern_by_protein_ids[parent_protein_ids] = []
                        self.hierarchality_pattern_by_protein_ids[parent_protein_ids].append(hierarchality)
self.clusterCollections.append(clusterCollection)

def generate_output(self):
for protein_ids, hierarchality_patterns in self.hierarchality_pattern_by_protein_ids.items():
print "proteins: %s" % (len(protein_ids)), hierarchality_patterns

class ClusterObj():
def __init__(self, IV, cluster_id, protein_ids):
self.IV = IV
self.cluster_id = cluster_id
self.protein_ids = protein_ids
self.protein_count = len(protein_ids)
        self.singleton = len(protein_ids) == 1
self.validated = False

class ClusterCollection():
def __init__(self, IV, cluster_f):
self.IV = IV
self.cluster_f = cluster_f

self.cluster_id_by_protein_id = {}
self.cluster_id_by_protein_ids = {}
self.clusterObjs_by_cluster_id = {}

self.cluster_ids_by_cluster_status = {}

self.cluster_count = 0
self.protein_count = 0
self.cluster_singleton_count = 0
self.protein_singleton_count = 0
self.cluster_non_singleton_count = 0
self.protein_non_singleton_count = 0

def parse_cluster_f(self):
print "[+] Parsing cluster file %s ..." % (self.cluster_f)
for line in read_file(self.cluster_f):
cluster_id_string, protein_string = line.rstrip("\n").split(": ")
cluster_id = "%s:%s" % (self.IV, cluster_id_string)
protein_ids = protein_string.split(" ")
clusterObj = ClusterObj(self.IV, cluster_id, protein_ids)
# count
            if clusterObj.singleton:
                self.cluster_singleton_count += 1
                self.protein_singleton_count += clusterObj.protein_count
            else:
                self.cluster_non_singleton_count += 1
                self.protein_non_singleton_count += clusterObj.protein_count
            self.cluster_count += 1
            self.protein_count += clusterObj.protein_count
# add
for protein_id in clusterObj.protein_ids:
self.cluster_id_by_protein_id[protein_id] = clusterObj.cluster_id
self.cluster_id_by_protein_ids[frozenset(clusterObj.protein_ids)] = clusterObj.cluster_id
self.clusterObjs_by_cluster_id[clusterObj.cluster_id] = clusterObj

class InputObj():
def __init__(self, input_f):
self.input_file_by_IV = self.parse_input_f(input_f)

def parse_input_f(self, input_f):
print "[+] Parsing input file %s ..." % (input_f)
input_file_by_IV = {}
for line in read_file(input_f):
temp = line.split()
IV = temp[0]
cluster_f = temp[1]
input_file_by_IV[IV] = cluster_f
return input_file_by_IV

if __name__ == "__main__":
__version__ = 0.1
args = docopt(__doc__)

input_f = args['--infile']
out_prefix = args['--outprefix']

inputObj = InputObj(input_f)
print "[+] Start ..."
dataCollection = DataCollection()
for IV, cluster_f in sorted(inputObj.input_file_by_IV.items()):
clusterCollection = ClusterCollection(IV, cluster_f)
clusterCollection.parse_cluster_f()
dataCollection.add_clusterCollection(clusterCollection)
dataCollection.generate_output()
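A hypothetical invocation, for orientation (file names and contents are illustrative): the -i file is whitespace-delimited, mapping each inflation value (IV) to a cluster file, and each cluster file uses the "clusterID: protein protein ..." format that parse_cluster_f() expects:

    $ cat iv_files.txt
    IV1.5   clusters.I15.txt
    IV2.0   clusters.I20.txt
    $ cat clusters.I15.txt
    0: speciesA.p1 speciesA.p2 speciesB.p1
    1: speciesB.p2
    $ python src/compare_across_IVs.py -i iv_files.txt -o test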
src/create_network.py: 82 changes (21 additions, 61 deletions)
@@ -4,6 +4,7 @@
"""
usage: create_network.py -c <FILE> -s <FILE> -a <STR>
[-n <STR>] [-t <FLOAT>] [-e <STR>]
+                         [--colour_f <FILE>]
[-o <STR>] [-d]
[-h|--help]
@@ -17,6 +18,7 @@
-t, --weight_threshold <FLOAT> Minimal weight threshold (default: 0.0)
-e, --exclude_proteome_ids <STR>    Proteomes to be excluded (separated by ",")
-d, --draw_all_edges Draw all edges (as opposed to only filtered ones)
+    --colour_f <FILE>               CSV file containing colours of attribute levels
-o, --out_prefix <STR> Outprefix (default: graph)
"""
@@ -66,37 +68,6 @@ def read_file(f):
for line in fh:
yield line.rstrip("\n")

-def pie(ax, values, **kwargs):
-    total = sum(values)
-    def formatter(pct):
-        return '{:0.0f}\n({:0.1f}%)'.format(pct*total/100, pct)
-    wedges, _, labels = ax.pie(values, autopct=formatter, **kwargs)
-    return wedges
-
-def plot_icons():
-    fig, ax = plt.subplots()
-    ax.axis('equal')
-
-    width = 0.35
-    kwargs = dict(colors=['#66FF66', '#9999FF', '#FF9999'], startangle=90)
-
-    outside = pie(ax, [96, 124, 88], radius=1, pctdistance=1-width/2, **kwargs)
-    inside = pie(ax, [45, 87, 77], radius=1-width,
-                 pctdistance=1 - (width/2) / (1-width), **kwargs)
-    plt.setp(inside + outside, width=width, edgecolor='white')
-
-    ax.legend(inside[::-1], ['Hardware', 'Software', 'Services'], frameon=False)
-
-    kwargs = dict(size=13, color='white', va='center', fontweight='bold')
-    ax.text(0, 0, 'Year 2005', ha='center',
-            bbox=dict(boxstyle='round', facecolor='blue', edgecolor='none'),
-            **kwargs)
-    ax.annotate('Year 2006', (0, 0), xytext=(np.radians(-45), 1.1),
-                bbox=dict(boxstyle='round', facecolor='green', edgecolor='none'),
-                textcoords='polar', ha='left', **kwargs)
-
-    plt.show()

class ProteomeObj():
def __init__(self, proteome_id, label, colour, idx):
self.proteome_id = proteome_id
@@ -209,21 +180,6 @@ def generate_edges(normalisation, weight_by_positional_edges):
pass
return edges_positional

-def binomial(n,k):
-    """
-    A fast way to calculate binomial coefficients by Andrew Dalke (contrib).
-    """
-    if 0 <= k <= n:
-        ntok = 1
-        ktok = 1
-        for t in xrange(1, min(k, n - k) + 1):
-            ntok *= n
-            ktok *= t
-            n -= 1
-        return ntok // ktok
-    else:
-        return 0


def generate_proteomeObjs():
proteomeObj_by_proteome_id = {}
@@ -245,20 +201,6 @@ def check_normalisation(normalisation_algo):
def check_exclude_proteome_ids(exclude_proteome_ids):
return set([proteome_id for proteome_id in exclude_proteome_ids.split(",")])

-def santisise_args(args):
-    sane_args = {}
-    sane_args['--cluster_stats'] = check_file(args['--cluster_stats'])
-    sane_args['--species_classification'] = check_file(args['--species_classification'])
-    sane_args['--attribute'] = args['--attribute']
-    sane_args['--normalisation'] = check_normalisation(args['--normalisation'])
-    sane_args['--weight_threshold'] = float(args['--weight_threshold'])
-    if args['--exclude_proteome_ids']:
-        sane_args['--exclude_proteome_ids'] = check_exclude_proteome_ids(args['--exclude_proteome_ids'])
-    else:
-        sane_args['--exclude_proteome_ids'] = set()
-    sane_args['--draw_all_edges'] = args['--draw_all_edges']
-    sane_args['--out_prefix'] = args['--out_prefix']
-    return sane_args

def generate_outpath_by_name(target_attribute, normalisation, weight_threshold, out_prefix):
outpath_by_name = {}
@@ -458,6 +400,22 @@ def filter_edges(edges_positional, weight_threshold, edge_count):
edge_count['excluded_weight'] = edge_count.get('excluded_weight', 0) + 1
return edges_positional_filtered, edge_count

+def santisise_args(args):
+    sane_args = {}
+    sane_args['--cluster_stats'] = check_file(args['--cluster_stats'])
+    sane_args['--colour_f'] = check_file(args['--colour_f'])
+    sane_args['--species_classification'] = check_file(args['--species_classification'])
+    sane_args['--attribute'] = args['--attribute']
+    sane_args['--normalisation'] = check_normalisation(args['--normalisation'])
+    sane_args['--weight_threshold'] = float(args['--weight_threshold'])
+    if args['--exclude_proteome_ids']:
+        sane_args['--exclude_proteome_ids'] = check_exclude_proteome_ids(args['--exclude_proteome_ids'])
+    else:
+        sane_args['--exclude_proteome_ids'] = set()
+    sane_args['--draw_all_edges'] = args['--draw_all_edges']
+    sane_args['--out_prefix'] = args['--out_prefix']
+    return sane_args

if __name__ == "__main__":
'''
IDEAS:
@@ -470,7 +428,7 @@
src/create_network.py -c ~/Dropbox/projects/manuscripts/50helminths/results/50helminth.20161115.master.kinfin_results/PROTEOME/PROTEOME.cluster_stats.txt -s ~/Dropbox/projects/manuscripts/50helminths/results/50helminth.20161115.master.kinfin_results/PROTEOME/PROTEOME.cluster_stats.txt -a superphylum -n "max_weight" -t 0.30 -o test -d
'''
-    __version__ = 0.1
+    __version__ = 0.2

NORMALISATION_ALGOS = set(["max_weight", "none"])
FIGSIZE = (16,16)
@@ -485,11 +443,13 @@
exclude_proteome_ids = sane_args['--exclude_proteome_ids']
draw_all_edges = sane_args['--draw_all_edges']
out_prefix = sane_args['--out_prefix']
+    colour_f = sane_args['--colour_f']

results = {}
outpath_by_name = generate_outpath_by_name(target_attribute, normalisation, weight_threshold, out_prefix)

proteome_ids, label_by_proteome_id, proteome_ids_by_label = parse_species_classification(species_classification_f)
+    colour_by_proteome_id = parse_colour_f(colour_f)
results['proteome_count'] = len(proteome_ids)
results['labels'] = ",".join(["%s (%s)" % (label, len(proteomes)) for label, proteomes in proteome_ids_by_label.items()])
proteomeObj_by_proteome_id = generate_proteomeObjs()
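parse_colour_f() is called above but sits outside the hunks shown in this diff. A minimal sketch of a compatible implementation, assuming the colour file is a headerless two-column CSV mapping a proteome ID (or attribute level) to a colour; the format is an assumption, not confirmed by this commit:

    def parse_colour_f(colour_f):
        # hypothetical format: one 'proteome_id,#rrggbb' pair per line
        colour_by_proteome_id = {}
        for line in read_file(colour_f):
            proteome_id, colour = line.split(",")
            colour_by_proteome_id[proteome_id] = colour
        return colour_by_proteome_id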