-
Notifications
You must be signed in to change notification settings - Fork 0
/
restructure_anno.py
98 lines (83 loc) · 3.7 KB
/
restructure_anno.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
#!/bin/env python
#######################################################################
# Copyright (C) 2022 Julian Dosch
#
# This file is part of main.
#
# get_domain_importance is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# get_domain_importance is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with PathwayTrace. If not, see <http://www.gnu.org/licenses/>.
#
#######################################################################
import argparse
import json
import os
from pathlib import Path
def option_parse():
    """Parse the command line and hand the values over to ``main``.

    Options:
        -i / --inPath        path to the input annotation json (required)
        -o / --outPath       output directory (required)
        -c / --genesPerFile  number of genes per output file (default 100)
    """
    cli = argparse.ArgumentParser(
        epilog="This script restructures the annotation file format into a mapped version where each feature instance of a protein gets an id. This is to make saving linearized architectures less data heavy."
    )
    # Separate groups so that --help lists mandatory and optional flags apart.
    mandatory_group = cli.add_argument_group('required arguments')
    optional_group = cli.add_argument_group('optional arguments')
    mandatory_group.add_argument(
        "-i", "--inPath",
        type=str, default='.', required=True,
        help="path to input json",
    )
    mandatory_group.add_argument(
        "-o", "--outPath",
        type=str, default='.', required=True,
        help="path to output directory. Name will be based on input file.",
    )
    optional_group.add_argument(
        "-c", "--genesPerFile",
        type=int, default=100, required=False,
        help="Number of genes in one file. Higher number result in less file with bigger size. ",
    )
    parsed = cli.parse_args()
    main(parsed.inPath, parsed.outPath, parsed.genesPerFile)
def main(inpath, outpath, filesize):
    """Split the gene-centric annotation into numbered JSON chunk files.

    The restructured annotation returned by ``read_input`` is written to
    ``outpath`` in chunks of ``filesize`` genes. Chunk files are named after
    their zero-padded, nine-digit running index (``000000000.json``, ...).
    Afterwards ``index.json`` is written; it maps every gene id to the index
    of the chunk file that contains it (key ``'genes'``) and stores the index
    of the last chunk file written (key ``'#files'``).

    Args:
        inpath: Path to the input annotation json.
        outpath: Directory that receives the chunk files and ``index.json``.
        filesize: Number of genes stored in one chunk file.
    """
    arc = read_input(inpath)
    index = 0
    output = {}
    indexout = {'genes': {}}
    for gene, annotation in arc.items():
        output[gene] = annotation
        indexout['genes'][gene] = index
        # Flush once the chunk holds exactly ``filesize`` genes. The previous
        # ``count > filesize`` test only triggered after ``filesize + 1``
        # genes, so every file was one gene larger than requested.
        if len(output) >= filesize:
            save2json(output, str(index).rjust(9, '0'), outpath)
            output = {}
            index += 1
    # The trailing chunk is always written, even when it is empty, so that a
    # file exists for every index from 0 up to and including '#files'.
    save2json(output, str(index).rjust(9, '0'), outpath)
    indexout['#files'] = index
    save2json(indexout, 'index', outpath)
def save2json(dict2save, name, directory):
    """Serialize ``dict2save`` as JSON into ``<directory>/<name>.json``.

    Args:
        dict2save: JSON-serializable object to write.
        name: File name without the ``.json`` extension.
        directory: Target directory; created (including parents) if missing.
    """
    target_dir = Path(directory)
    target_dir.mkdir(parents=True, exist_ok=True)
    json_out = json.dumps(dict2save, ensure_ascii=False)
    # ensure_ascii=False leaves non-ASCII characters unescaped, so the file
    # has to be written as UTF-8 explicitly instead of relying on the locale
    # default encoding. write_text also closes the handle if writing fails.
    (target_dir / (name + '.json')).write_text(json_out, encoding='utf-8')
def read_input(inpath):
    """Read the isoform annotation json and restructure it per gene.

    The input must contain a top-level ``'feature'`` object whose keys are
    protein identifiers of the form ``gene_id|protein_id|transcript_id``.
    Below each identifier the ``'length'`` entry is copied as is; every other
    entry is treated as an annotation tool that maps feature names to objects
    with an ``'instance'`` list. The first two elements of each instance are
    kept.

    Args:
        inpath: Path to the input annotation json.

    Returns:
        A dict ``{gene_id: {protein_id: {'fmap': {...}, 'length': ...}}}``
        where ``'fmap'`` maps a running integer id (starting at 0 for every
        protein) to ``(feature, instance[0], instance[1])``. ``'length'`` is
        only present when the input provides it.

    Raises:
        ValueError: If an identifier does not split into exactly three
            ``|``-separated parts.
    """
    # JSON is UTF-8 by specification; decode explicitly rather than with the
    # locale default, and release the file handle before restructuring.
    with open(inpath, 'r', encoding='utf-8') as infile:
        features = json.load(infile)['feature']

    fa_map = {}
    for protid, annotation in features.items():
        gid, pid, _tid = protid.split('|')
        entry = {'fmap': {}}
        fa_map.setdefault(gid, {})[pid] = entry
        i = 0  # instance ids restart for every protein
        for tool, tool_features in annotation.items():
            if tool == 'length':
                entry['length'] = tool_features
                continue
            for feature, details in tool_features.items():
                for instance in details['instance']:
                    entry['fmap'][i] = (feature, instance[0], instance[1])
                    i += 1
    return fa_map
# Script entry point: only parse the command line when executed directly,
# not when the module is imported.
if "__main__" == __name__:
    option_parse()