Metanetx dictionaries update (#57)
* chore: update metanetx dictionaries

* fix: back compatibility for python2 - pickle protocol

* Do not filter chem_prop by database
sgalkina authored and phantomas1234 committed Aug 1, 2016
1 parent 6a9beb5 commit 393d1bc
Showing 17 changed files with 69 additions and 40,364 deletions.
Binary file modified cameo/data/metanetx.pickle
Binary file modified cameo/data/metanetx_chem_prop.pklz
40,313 changes: 1 addition & 40,312 deletions cameo/models/universal_models/metanetx_universal_model_bigg.json

Large diffs are not rendered by default.
@@ -351,7 +351,7 @@ def _extend_model(self, original_exchanges):
 
         logger.info("Adding reactions from universal model to host model.")
         new_reactions = list()
-        original_model_metabolites = [self.mapping.get(m.id[0:-2], m.id) for
+        original_model_metabolites = [self.mapping.get('bigg:' + m.id[0:-2], m.id) for
                                       r in original_exchanges for m, coeff in six.iteritems(r.metabolites)
                                       if len(r.metabolites) == 1 and coeff < 0 < r.upper_bound]
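The one-line change above moves the lookup to namespace-prefixed keys while keeping the raw metabolite ID as a fallback. A minimal sketch of the pattern, with a hypothetical slice of the mapping dictionary:

# Hypothetical all2mnx-style mapping: keys carry their source namespace,
# so 'atp' from BIGG cannot collide with an 'atp' from another database.
mapping = {'bigg:atp': 'MNXM3', 'kegg:C00002': 'MNXM3'}

met_id = 'atp_c'  # BIGG-style ID; the trailing '_c' is the compartment

# dict.get falls back to the original ID when no MetaNetX entry exists.
mapped = mapping.get('bigg:' + met_id[0:-2], met_id)
print(mapped)  # 'MNXM3'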
2 changes: 1 addition & 1 deletion cameo/strain_design/pathway_prediction/util.py
@@ -34,7 +34,7 @@ def create_adapter_reactions(original_metabolites, database, mapping, compartmen

        name = metabolite.id[0:-2]
         try:
-            mapped_name = mapping[name]
+            mapped_name = mapping['bigg:' + name]  # assuming that model uses BIGG ids
         except KeyError:
             continue
             # print name, 'N/A'
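The same namespacing appears in create_adapter_reactions, where metabolites without a MetaNetX entry are skipped via the KeyError branch. A self-contained sketch of that loop, using hypothetical IDs and mapping entries:

mapping = {'bigg:glc__D': 'MNXM41', 'bigg:atp': 'MNXM3'}
metabolite_ids = ['glc__D_e', 'atp_c', 'foo_c']

mapped = {}
for met_id in metabolite_ids:
    name = met_id[0:-2]  # strip the compartment suffix, e.g. '_c'
    try:
        mapped[met_id] = mapping['bigg:' + name]
    except KeyError:
        continue  # no MetaNetX cross-reference for this metabolite
print(mapped)  # {'glc__D_e': 'MNXM41', 'atp_c': 'MNXM3'}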
Binary file modified data/metanetx/chem_prop.tsv.gz
Binary file modified data/metanetx/chem_xref.tsv.gz
Binary file modified data/metanetx/comp_prop.tsv.gz
Binary file modified data/metanetx/comp_xref.tsv.gz
Binary file modified data/metanetx/reac_prop.tsv.gz
Binary file modified data/metanetx/reac_xref.tsv.gz
1 change: 1 addition & 0 deletions requirements.txt
@@ -14,3 +14,4 @@ future>=0.15.2
 lazy-object-proxy==1.2.0
 IProgress==0.2
 palettable>=2.1.1
+requests>=2.10.0
107 changes: 61 additions & 46 deletions scripts/parse_metanetx.py
@@ -14,7 +14,11 @@

 import logging
 import re
+import gzip
+import pickle
+import sys
 
+import requests
 import optlang
 from cobra.core.Formula import Formula
 from cobra.io.json import save_json_model
@@ -119,20 +123,61 @@ def construct_universal_model(list_of_db_prefixes):
     return model


+def load_metanetx_files():
+    BASE_URL = 'http://www.metanetx.org/cgi-bin/mnxget/mnxref/{}.tsv'
+    for filename in ['chem_prop', 'chem_xref', 'reac_prop', 'reac_xref', 'comp_prop', 'comp_xref']:
+        response = requests.get(BASE_URL.format(filename))
+        filepath = '../data/metanetx/{}.tsv.gz'.format(filename)
+        compress_by_lines(response, filepath)
+
+
+def compress_by_lines(response, filepath):
+    prev_line = next(response.iter_lines())
+    with gzip.open(filepath, 'wb') as f:
+        for line in response.iter_lines(decode_unicode=response.encoding):
+            if line.startswith('#'):
+                prev_line = line
+                continue
+            if prev_line:
+                f.write(str.encode(prev_line + '\n'))
+                prev_line = None
+            f.write(str.encode(line + '\n'))
+
+
+def add_to_all_mapping(dataframe, mapping):
+    for other_id, mnx_id in dataframe[['XREF', 'MNX_ID']].values:
+        cleaned_key = _apply_sanitize_rules(
+            _apply_sanitize_rules(other_id, REVERSE_ID_SANITIZE_RULES_SIMPHENY),
+            ID_SANITIZE_RULES_TAB_COMPLETION)
+        mapping[cleaned_key] = mnx_id
+
+
+def add_to_bigg_mapping(xref, bigg2mnx, mnx2bigg):
+    bigg_selection = xref[['bigg' in blub for blub in xref.XREF]]
+    sanitized_XREF = [
+        _apply_sanitize_rules(_apply_sanitize_rules(id, REVERSE_ID_SANITIZE_RULES_SIMPHENY),
+                              ID_SANITIZE_RULES_TAB_COMPLETION) for id in bigg_selection.XREF]
+    bigg2mnx.update(dict(zip(sanitized_XREF, bigg_selection.MNX_ID)))
+    mnx2bigg.update(dict(zip(bigg_selection.MNX_ID, sanitized_XREF)))

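compress_by_lines drops the long comment preamble of each MNXref dump but keeps the last '#'-prefixed line, which holds the column names: prev_line buffers each comment line, and only the one seen last is flushed when the first data row arrives. A minimal sketch of that buffering over a toy list of lines, leaving requests and gzip out:

# Toy stand-in for response.iter_lines(); real files are tab-separated.
lines = ['# MetaNetX 2.0', '# generated 2016', '#XREF MNX_ID', 'bigg:atp MNXM3']

prev_line = None
kept = []
for line in lines:
    if line.startswith('#'):
        prev_line = line  # remember only the latest comment line
        continue
    if prev_line:
        kept.append(prev_line)  # the first data row flushes the header
        prev_line = None
    kept.append(line)
print(kept)  # ['#XREF MNX_ID', 'bigg:atp MNXM3']

This is also why the read_table calls below lose their skiprows arguments: the downloaded files now start directly at the header row.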

 if __name__ == '__main__':
 
     import logging
 
     logging.basicConfig(level='INFO')
 
+    if len(sys.argv) > 1 and sys.argv[1] == '--load':
+        load_metanetx_files()
+
     # load metanetx data
-    chem_xref = read_table('../data/metanetx/chem_xref.tsv.gz', skiprows=124, compression='gzip')
+    chem_xref = read_table('../data/metanetx/chem_xref.tsv.gz', compression='gzip')
     chem_xref.columns = [name.replace('#', '') for name in chem_xref.columns]
-    reac_xref = read_table('../data/metanetx/reac_xref.tsv.gz', skiprows=107, compression='gzip')
+    reac_xref = read_table('../data/metanetx/reac_xref.tsv.gz', compression='gzip')
     reac_xref.columns = [name.replace('#', '') for name in reac_xref.columns]
-    reac_prop = read_table('../data/metanetx/reac_prop.tsv.gz', skiprows=107, compression='gzip', index_col=0)
+    reac_prop = read_table('../data/metanetx/reac_prop.tsv.gz', compression='gzip', index_col=0)
     reac_prop.columns = [name.replace('#', '') for name in reac_prop.columns]
-    chem_prop = read_table('../data/metanetx/chem_prop.tsv.gz', skiprows=125, compression='gzip', index_col=0,
+    chem_prop = read_table('../data/metanetx/chem_prop.tsv.gz', compression='gzip', index_col=0,
                            names=['name', 'formula', 'charge', 'mass', 'InChI', 'SMILES', 'source'])
 
     # replace NaN with None
@@ -141,41 +186,16 @@ def construct_universal_model(list_of_db_prefixes):
    REVERSE_ID_SANITIZE_RULES_SIMPHENY = [(value, key) for key, value in ID_SANITIZE_RULES_SIMPHENY]
 
     metanetx = dict()
+    metanetx['all2mnx'] = dict()
+    metanetx['bigg2mnx'] = dict()
+    metanetx['mnx2bigg'] = dict()
-    # Metabolites
-    bigg_selection = chem_xref[['bigg' in blub for blub in chem_xref.XREF]]
-    sanitized_XREF = [
-        _apply_sanitize_rules(_apply_sanitize_rules(id.replace('bigg:', ''), REVERSE_ID_SANITIZE_RULES_SIMPHENY),
-                              ID_SANITIZE_RULES_TAB_COMPLETION) for id in bigg_selection.XREF]
-    bigg2mnx = dict(zip(sanitized_XREF, bigg_selection.MNX_ID))
-    mnx2bigg = dict(zip(bigg_selection.MNX_ID, sanitized_XREF))
-
-    # Reactions
-    bigg_selection = reac_xref[['bigg' in blub for blub in reac_xref.XREF]]
-    sanitized_XREF = [
-        _apply_sanitize_rules(_apply_sanitize_rules(id.replace('bigg:', ''), REVERSE_ID_SANITIZE_RULES_SIMPHENY),
-                              ID_SANITIZE_RULES_TAB_COMPLETION) for id in bigg_selection.XREF]
-    bigg2mnx.update(dict(zip(sanitized_XREF, bigg_selection.MNX_ID)))
-    mnx2bigg.update(dict(zip(bigg_selection.MNX_ID, sanitized_XREF)))
+    for xref in [chem_xref, reac_xref]:
+        add_to_bigg_mapping(xref, metanetx['bigg2mnx'], metanetx['mnx2bigg'])
+        add_to_all_mapping(xref, metanetx['all2mnx'])
 
-    # put into final result dict
-    metanetx['bigg2mnx'] = bigg2mnx
-    metanetx['mnx2bigg'] = mnx2bigg
-
-    all2mnx = dict()
-    for other_id, mnx_id in chem_xref[['XREF', 'MNX_ID']].values:
-        cleaned_key = _apply_sanitize_rules(
-            _apply_sanitize_rules(other_id.split(':')[1], REVERSE_ID_SANITIZE_RULES_SIMPHENY),
-            ID_SANITIZE_RULES_TAB_COMPLETION)
-        all2mnx[cleaned_key] = mnx_id
-    for other_id, mnx_id in reac_xref[['XREF', 'MNX_ID']].values:
-        cleaned_key = _apply_sanitize_rules(
-            _apply_sanitize_rules(other_id.split(':')[1], REVERSE_ID_SANITIZE_RULES_SIMPHENY),
-            ID_SANITIZE_RULES_TAB_COMPLETION)
-        all2mnx[cleaned_key] = mnx_id
-
-    metanetx['all2mnx'] = all2mnx
-    # with open('../cameo/data/metanetx.pickle', 'wb') as f:
-    #     pickle.dump(metanetx, f)
+    with open('../cameo/data/metanetx.pickle', 'wb') as f:
+        pickle.dump(metanetx, f, protocol=2)
 
     # generate universal reaction models
     db_combinations = [('bigg',), ('rhea',), ('bigg', 'rhea'), ('bigg', 'rhea', 'kegg'),
@@ -186,14 +206,9 @@
        from cobra.io.json import _REQUIRED_REACTION_ATTRIBUTES
 
         _REQUIRED_REACTION_ATTRIBUTES.add('annotation')
-        # d_model = _to_dict(universal_model)
         with open('../cameo/models/universal_models/{model_name}.json'.format(model_name=universal_model.id), 'w') as f:
             save_json_model(universal_model, f)
-            # json.dump(d_model, f)
-        # save_json_model(universal_model, '../cameo/models/universal_models/{model_name}.json'.format(model_name=universal_model.id))
-    chem_prop_filtered = chem_prop[
-        [any([source.startswith(db) for db in ('bigg', 'rhea', 'kegg', 'brenda', 'chebi')]) for source in
-         chem_prop.source]]
-    chem_prop_filtered = chem_prop_filtered.dropna(subset=['name'])
-    # with gzip.open('../cameo/data/metanetx_chem_prop.pklz', 'wb') as f:
-    #     pickle.dump(chem_prop_filtered, f)
+    chem_prop_filtered = chem_prop.dropna(subset=['name'])
+    with gzip.open('../cameo/data/metanetx_chem_prop.pklz', 'wb') as f:
+        pickle.dump(chem_prop_filtered, f, protocol=2)
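The refactored __main__ builds all three dictionaries by passing both xref tables through the helpers added above. A toy run of the same selection logic on a two-row frame, with the _apply_sanitize_rules calls omitted; note that all2mnx keys now keep their namespace prefix (the old code stripped it with other_id.split(':')[1]), which is exactly what the 'bigg:' + ... lookups earlier in this commit rely on:

import pandas as pd

# Hypothetical two-row xref table with the columns the helpers use.
xref = pd.DataFrame({'XREF': ['bigg:atp', 'kegg:C00002'],
                     'MNX_ID': ['MNXM3', 'MNXM3']})

bigg2mnx, mnx2bigg, all2mnx = {}, {}, {}

# Same selection as add_to_bigg_mapping, minus the sanitize rules.
bigg_rows = xref[['bigg' in x for x in xref.XREF]]
bigg2mnx.update(dict(zip(bigg_rows.XREF, bigg_rows.MNX_ID)))
mnx2bigg.update(dict(zip(bigg_rows.MNX_ID, bigg_rows.XREF)))

# Same as add_to_all_mapping: every cross-reference, keyed by its full ID.
for other_id, mnx_id in xref[['XREF', 'MNX_ID']].values:
    all2mnx[other_id] = mnx_id

print(bigg2mnx)  # {'bigg:atp': 'MNXM3'}
print(all2mnx)   # {'bigg:atp': 'MNXM3', 'kegg:C00002': 'MNXM3'}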

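Both pickle.dump calls now pass protocol=2, matching the "back compatibility for python2" message: Python 3 defaults to a pickle protocol that Python 2 cannot read, while protocol 2 loads under both interpreters. A minimal round-trip sketch with a hypothetical file name:

import pickle

mapping = {'bigg:atp': 'MNXM3'}

# Protocol 2 is the highest protocol Python 2 understands, so data dumped
# from Python 3 this way can still be unpickled by a Python 2 client.
with open('metanetx_demo.pickle', 'wb') as f:
    pickle.dump(mapping, f, protocol=2)

with open('metanetx_demo.pickle', 'rb') as f:
    print(pickle.load(f))  # {'bigg:atp': 'MNXM3'}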