-
Notifications
You must be signed in to change notification settings - Fork 0
/
XML_prototype.py
98 lines (73 loc) · 2.61 KB
/
XML_prototype.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
#Program to import a specified LRG file and export corresponding fasta
import sys
import xml.etree.ElementTree as etree
import glob
import os
#Read input arguments - should be
# [0] - program name
# [1] - Input XML file name
# [2] - Output file name (optional argument)

#Read input file name from arguments
if len(sys.argv) < 2:
    #Fail with a usage message instead of a bare IndexError
    sys.exit('Usage: XML_prototype.py <input .xml file> [output file name]')
fileName = sys.argv[1]

#Check file name is valid .xml
#(explicit check rather than assert, which is stripped when run with -O)
if not fileName.endswith('.xml'):
    sys.exit('You have the wrong input file')

#Scan for the optional argument specifying genomic/protein etc (extension)
#Not implemented yet - the intended logic was:
#try:
#    option = sys.argv[3]
#except:
#    option = '-g'

#Read in the specified input file into a variable
try:
    tree = etree.parse(fileName)
except IOError as fileNotPresent:
    #Single-argument print(...) behaves the same on Python 2 and Python 3
    print("The specified file cannot be located: " + str(fileNotPresent.filename))
    #Non-zero status so callers can tell the run failed
    sys.exit(1)
except etree.ParseError as badXml:
    print("The specified file is not well-formed XML: " + str(badXml))
    sys.exit(1)
root = tree.getroot()

#Check the version of the file we are opening is correct
#(.get so a missing attribute is reported the same way as a wrong version)
if root.attrib.get('schema_version') != '1.8':
    sys.exit('This file is not the correct version')
#Read output file title from arguments
#The argument is documented as optional, so fall back to 'output' (the file
#name this script opens below) when it is not supplied
fileOutTitle = sys.argv[2] if len(sys.argv) > 2 else 'output'

#Check that the specified output file does not already exist
#List all files in the output directory (empty list if it does not exist yet)
outputDir = '/home/swc/XML_Parser/outputFiles'
existingOutputFiles = os.listdir(outputDir) if os.path.isdir(outputDir) else []
#os.path.join inserts the path separator the plain concatenation was missing
fileOut = os.path.join(outputDir, fileOutTitle)
#os.listdir returns bare file names, so compare the title, not the full path
if fileOutTitle in existingOutputFiles:
    print('The output file already exists in the present directory')
    print('Would you like to override the file? y/n')
    #raw_input only exists on Python 2; input is its Python 3 equivalent
    try:
        ask = raw_input
    except NameError:
        ask = input
    userChoice = ask('> ')
    if userChoice.strip().lower() == 'n':
        sys.exit()

#Open the specified output file
#fileOutPath = '/home/swc/XML_Parser/outputFiles'
#fileOut = open(fileOutPath, 'w')
#NOTE(review): this still opens 'output' in the current working directory in
#append mode, not the fileOut path checked above - confirm which is intended
out = open('output',"a")

#Locate the fixed_annotation section used by the functions below
fixannot = root.find('fixed_annotation')
if fixannot is None:
    #find() returns None when the element is absent; stop here with a clear
    #message rather than failing later on fixannot.iter()
    sys.exit('The input file has no fixed_annotation section')
def getseqelement(annotation=None):
    """Return the genomic sequence found under the fixed annotation.

    Collects the text of every ``sequence`` element below *annotation* and
    checks that the chosen sequence consists only of the letters A, T, C
    and G.

    annotation -- element to search; when omitted the module-level
                  ``fixannot`` element is used.

    Returns the validated sequence string. If no sequence is found, or it is
    empty or contains any other character, prints a message and exits with
    status 1.
    """
    if annotation is None:
        annotation = fixannot

    # The original appended to a list that was never created.
    sequences = [element.text for element in annotation.iter('sequence')]

    # NOTE(review): the original read an undefined name `genseq`. This assumes
    # the genomic sequence is the first <sequence> in document order - confirm
    # against the LRG file layout.
    genseq = sequences[0] if sequences else None

    nucleotides = set('ATCG')
    if not genseq or not nucleotides.issuperset(genseq):
        print("this is not the genomic sequence")
        # Non-zero status: this is a failure, not a normal finish.
        sys.exit(1)
    return genseq
#Find the exon elements of the transcript named "t1"
#(exons stays an empty list when no such transcript is present)
exons = []
for items in fixannot.iter("transcript"):
    if items.get('name') != "t1":
        continue
    exons = items.iter('exon')
def get_exoncoord(annotation=None):
    """Print and return the coordinates of each exon of transcript "t1".

    For every ``exon`` element under the transcript whose ``name`` is "t1",
    each child element is inspected. Children whose ``coord_system`` ends in
    't' or 'p' followed by digits are skipped; for the remaining ones the
    ``start`` and ``end`` attributes are printed and collected.

    annotation -- element to search; when omitted the module-level
                  ``fixannot`` element is used.

    Returns a list of ``(label, start, end)`` tuples, with start and end as
    ints, in document order. The list is empty when there is no "t1"
    transcript.
    """
    if annotation is None:
        annotation = fixannot

    exons = []
    for items in annotation.iter(tag="transcript"):
        if items.attrib.get('name') == "t1":
            exons = list(items.iter('exon'))

    exonCoords = []
    for exon in exons:
        exonNumber = exon.attrib['label']
        for coordinates in exon:
            coordSystem = coordinates.attrib.get('coord_system')
            if not coordSystem:
                # Child without a coordinate system: nothing to report.
                continue
            # The original tested coord_system[-2] against 't' / 'p'
            # (presumably the transcript and protein systems), which only
            # works for a one-digit suffix. Stripping all trailing digits
            # first applies the same test to suffixes of any length.
            if coordSystem.rstrip('0123456789')[-1:] in ('t', 'p'):
                continue
            startIndex = int(coordinates.attrib['start'])
            endIndex = int(coordinates.attrib['end'])
            # Joined with single spaces to reproduce the spacing of the
            # original comma-separated print statement on Python 2 and 3.
            print(' '.join(str(part) for part in (
                'For exon ', exonNumber, ', the start is ', startIndex,
                ' and the end is ', endIndex)))
            exonCoords.append((exonNumber, startIndex, endIndex))
    return exonCoords