Skip to content

Commit

Permalink
Merge pull request #11 from stjude/modupe-patch
Browse files Browse the repository at this point in the history
Update BAM2GFF_gtftogenes
  • Loading branch information
madetunj authored Apr 19, 2021
2 parents 2b01ec7 + 2cb41db commit 975a372
Showing 1 changed file with 67 additions and 69 deletions.
136 changes: 67 additions & 69 deletions bin/BAM2GFF_gtftogenes.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,25 +3,23 @@
Generate genomic coordinates of all promoters, 5'UTR, 3'UTR, CDS
'''

import sys
import os
import argparse

if not os.path.exists('annotation'):
os.makedirs('annotation')

#initialize outputfiles
PSEUDOGFF = open('annotation/genes.gff', 'w')
PROMOTERSGFF = open('annotation/promoters.gff', 'w')
UPSTREAMGFF = open('annotation/upstream.gff', 'w')
DOWNSTREAMGFF = open('annotation/downstream.gff', 'w')

def parse_genelocations(chromz, results, flank):
""" Parse genomic regions
Args:
chromz (dict) : Chromosomal sizes
results (string) : Individual gene coordinates
flank (int) : genomic distance from start/end site
"""

#initialize outputfiles
PROMOTERSGFF = open('annotation/promoters.gff', 'a')
UPSTREAMGFF = open('annotation/upstream.gff', 'a')
DOWNSTREAMGFF = open('annotation/downstream.gff', 'a')

lines = results.split("\t")
lines[3] = int(lines[3])
lines[4] = int(lines[4])
Expand Down Expand Up @@ -60,79 +58,79 @@ def parse_genelocations(chromz, results, flank):
downstart, downend, "\t".join(lines[5:])))

def main():
usage = "usage: %prog -g [GTF/GFF file] -f [FEATURE TYPE (gene/transcript)] \
-c [CHROMSIZES] -d [DISTANCE(bp) from start/end site]"
usage = "usage: " + sys.argv[0] + " -g [GTF/GFF file] -f [FEATURE TYPE (gene/transcript)] " \
"-c [CHROMSIZES] -d [DISTANCE(bp) from start/end site] [-h help]"
parser = argparse.ArgumentParser(usage=usage)
parser.add_argument("-g", "--gtf", dest="gtf", required=True,
help="Enter .gtf/gff file to be processed.")
parser.add_argument("-d", "--dist", dest="distance", default=2000, type=int,
help="Distance (bp) from site [TSS/TES].")
parser.add_argument("-d", "--dist", dest="distance", required=False,
default=2000, type=int, help="Distance (bp) from site [TSS/TES].")
parser.add_argument("-c", "--chrom", dest="chrom", required=True,
help="Enter ucsc chrom sizes file to be processed.")
help="Enter UCSC chrom sizes file to be processed.")

options = parser.parse_args()
print(options)
#print(options)

if not os.path.exists('annotation'):
os.makedirs('annotation')

flank = options.distance #flank distance from TSS / TES

if options.chrom and options.gtf:
chromsizefile = open(options.chrom, 'r')
chrom_sizes = {}
for line in chromsizefile:
line = line.split('\t')
chrom_sizes[line[0]] = line[1].rstrip("\n")
chromsizefile = open(options.chrom, 'r')
chrom_sizes = {}
for line in chromsizefile:
line = line.split('\t')
chrom_sizes[line[0]] = line[1].rstrip("\n")

#feature = options.feature
gtf_file = open(options.gtf, 'r')

feature_dict = {}
if not options.gtf.split('.')[-1] == 'gtf':
sys.exit("ERROR :\tGTF required")

#reading the feature type to be used
for line in gtf_file:
if not line.startswith('#'):
lines = line.split("\t")
feature_dict[lines[2]] = lines[2]
#feature = options.feature
gtf_name = options.gtf
gtf_file = open(options.gtf, 'r')

feature_dict = {}

if 'transcript' in feature_dict:
feature = "transcript"
elif 'gene' in feature_dict:
feature = "gene"
else:
sys.exit("ERROR :\tGTF with either transcript/gene annotation is needed")

print("NOTE :\tFeature type used is '%s'" %(feature))
#reading the feature type to be used
for line in gtf_file:
if not line.startswith('#'):
lines = line.split("\t")
feature_dict[lines[2]] = lines[2]

if 'transcript' in feature_dict:
feature = "transcript"
elif 'gene' in feature_dict:
feature = "gene"
else:
sys.exit("ERROR :\tGTF/GFF with either transcript/gene annotation is needed")

#reading the gff_file to parse regions
gff_file = open(options.gtf, 'r')
print("NOTE :\tFeature type used is '%s'" %(feature))

if options.gtf.split('.')[-1] == 'gff':
for line in gff_file:
if not line.startswith('#'):
lines = line.split("\t")
if not lines[0].startswith('chr'):
lines[0] = "chr"+lines[0]
if lines[2] == feature:
results = ("{0}\t{1}".format(lines[0], "\t".join(lines[1:])))
PSEUDOGFF.write(results+"\n")
parse_genelocations(chrom_sizes, results, flank)
elif options.gtf.split('.')[-1] == 'gtf':
for line in gff_file:
if not line.startswith('#'):
lines = line.split("\t")
if not lines[0].startswith('chr'):
lines[0] = "chr"+lines[0]
if lines[2] == feature:
newline = lines[8].split(' ')
results = ("{0}\t{1}\t{2}={3}".format(lines[0],
"\t".join(lines[1:8]),
newline[0], newline[1]))
PSEUDOGFF.write(results + "\n")
parse_genelocations(chrom_sizes, results, flank)
else:
parser.print_help()

#reading the gff_file to parse regions
gff_file = open(options.gtf, 'r')

#initialize output files
PSEUDOGFF = open('annotation/genes.gff', 'w')
PROMOTERSGFF = open('annotation/promoters.gff', 'w')
UPSTREAMGFF = open('annotation/upstream.gff', 'w')
DOWNSTREAMGFF = open('annotation/downstream.gff', 'w')

for line in gff_file:
if not line.startswith('#'):
lines = line.rstrip("\n").split("\t")
if not lines[0].startswith('chr'):
lines[0] = "chr"+lines[0]
if lines[2] == feature:
if gtf_name.split('.')[-1] == 'gff' or gtf_name.split('.')[-1] == 'gff3':
newline = lines[8].split(';')
transcript = [s for s in newline if "transcript_id=" in s]
results = ("{0}\t{1}\t{2}".format(lines[0], "\t".join(lines[1:8]), transcript[0]))
elif gtf_name.split('.')[-1] == 'gtf':
newline = lines[8].split(' ')
results = ("{0}\t{1}\t{2}={3}".format(lines[0],
"\t".join(lines[1:8]),
newline[0], newline[1].strip('";')))
else:
sys.exit("ERROR :\tFailed to process %s" %(gtf_name))
PSEUDOGFF.write(results+"\n")
parse_genelocations(chrom_sizes, results, flank)

if __name__ == "__main__":
main()

0 comments on commit 975a372

Please sign in to comment.