diff --git a/README.md b/README.md index fa1b5b9..63aa7b7 100755 --- a/README.md +++ b/README.md @@ -13,14 +13,14 @@ Executable anywhere as long as the PATHTO is correctly specified. ├── README.md - ├── BAM2GFF-call.sh : bash wrapper + ├── BAM2GFF-call.sh : bash wrapper script ├── lib └── utils.py : utilities method │   - └── src + └── bin ├── BAM2GFF_main.py : calculates density of .bam reads in .gff regions @@ -35,7 +35,7 @@ Executable anywhere as long as the PATHTO is correctly specified. * samtools * R version > 3.4 - * python2 + * python3 ================================================================= diff --git a/bin/BAM2GFF-call.sh b/bin/BAM2GFF-call.sh index f66f9ba..3057565 100755 --- a/bin/BAM2GFF-call.sh +++ b/bin/BAM2GFF-call.sh @@ -1,19 +1,10 @@ #!/bin/bash # # BAM TO GFF to detect Read density across gene transcripting regions +# Template wrapper shell script # # Edited from Original version 12/11/2019 -############################################################## -# ##### Please replace PATHTO with your own directory ###### # -############################################################## -# NOTE: We now assume the PATH and PYTHONPATH has been set up before -# entering this script -#PATHTO=/PATH/TO/BAM2GFF -#PYTHONPATH=$PATHTO/lib -#export PYTHONPATH -#export PATH=$PATH:$PATHTO/src - if [ $# -lt 3 ]; then echo "" echo 1>&2 Usage: $0 ["GTF file"] ["feature type"] ["BAM file"] ["CHROM SIZES"] ["SAMPLENAME"] @@ -39,7 +30,8 @@ CHROMSIZES=$4 #sample name SAMPLENAME=$5 -SAMPLENAME=${SAMPLENAME:=BAMFILE##*/} +TEMPFILE=${BAMFILE##*/} +SAMPLENAME=${SAMPLENAME:=${TEMPFILE%%.*}} echo "#############################################" echo "###### BAM2GFF v1 ######" @@ -61,16 +53,20 @@ echo # BAM TO GFF main code # mkdir -p matrix -echo "Working on GeneBody Region\nBAM2GFF_main.py -b $BAMFILE -i annotation/genes.gff -m 100 -o matrix/genebody.txt" +echo "Working on GeneBody Region" +echo "BAM2GFF_main.py -b $BAMFILE -i annotation/genes.gff -m 100 -o matrix/genebody.txt" BAM2GFF_main.py -b $BAMFILE -i annotation/genes.gff -m 100 -o matrix/genebody.txt echo -echo "Working on Upstream Region\nBAM2GFF_main.py -b $BAMFILE -i annotation/upstream.gff -m 50 -o matrix/upstream.txt" +echo "Working on Upstream Region" +echo "BAM2GFF_main.py -b $BAMFILE -i annotation/upstream.gff -m 50 -o matrix/upstream.txt" BAM2GFF_main.py -b $BAMFILE -i annotation/upstream.gff -m 50 -o matrix/upstream.txt echo -echo "Working on Downstream Region\nBAM2GFF_main.py -b $BAMFILE -i annotation/downstream.gff -m 50 -o matrix/downstream.txt" +echo "Working on Downstream Region" +echo "BAM2GFF_main.py -b $BAMFILE -i annotation/downstream.gff -m 50 -o matrix/downstream.txt" BAM2GFF_main.py -b $BAMFILE -i annotation/downstream.gff -m 50 -o matrix/downstream.txt echo -echo "Working on Promoter Region\nBAM2GFF_main.py -b $BAMFILE -i annotation/promoters.gff -m 100 -o matrix/promoters.txt" +echo "Working on Promoter Region" +echo "BAM2GFF_main.py -b $BAMFILE -i annotation/promoters.gff -m 100 -o matrix/promoters.txt" BAM2GFF_main.py -b $BAMFILE -i annotation/promoters.gff -m 100 -o matrix/promoters.txt echo diff --git a/bin/BAM2GFF_gtftogenes.py b/bin/BAM2GFF_gtftogenes.py index 374e4bb..8a0a2ea 100755 --- a/bin/BAM2GFF_gtftogenes.py +++ b/bin/BAM2GFF_gtftogenes.py @@ -1,4 +1,4 @@ -#!/usr/bin/env python2 +#!/usr/bin/env python3 #get region both left and right of position of interest. import os diff --git a/bin/BAM2GFF_main.py b/bin/BAM2GFF_main.py index 9d15c1f..2e96b32 100755 --- a/bin/BAM2GFF_main.py +++ b/bin/BAM2GFF_main.py @@ -1,6 +1,7 @@ -#!/usr/bin/env python2 +#!/usr/bin/env python3 #bamToGFF.py - +# modified for python3 #1/8/20 + ''' The MIT License (MIT) Copyright (c) 2013 Charles Lin @@ -64,19 +65,19 @@ def getUniquelyMappingReads(bamFile): bamName = fullPath.split('/')[-1].split('.')[0] pathFolder = join(fullPath.split('/')[0:-1],'/') bamFiles = os.listdir(pathFolder) - statsFile = filter(lambda x: x.count(bamName) ==1 and x.count('stats.') ==1,bamFiles) + statsFile = [x for x in bamFiles if x.count(bamName) ==1 and x.count('stats.') ==1] if len(statsFile) == 1: - print('USING STATS FILE %s' % (pathFolder+'/'+statsFile[0])) + print(('USING STATS FILE %s' % (pathFolder+'/'+statsFile[0]))) samDict = parseSamHeader(pathFolder+'/'+statsFile[0]) return int(samDict['UniquelyMappingSequenceTags']) elif len(statsFile) > 1: - statsFile = filter(lambda x: x.count(bamName) ==1 and x.count('stats.concise') ==1,bamFiles) - print('USING STATS FILE %s' % (pathFolder+'/'+statsFile[0])) + statsFile = [x for x in bamFiles if x.count(bamName) ==1 and x.count('stats.concise') ==1] + print(('USING STATS FILE %s' % (pathFolder+'/'+statsFile[0]))) samDict = parseSamHeader(pathFolder+'/'+statsFile[0]) return int(samDict['UniquelyMappingSequenceTags']) else: - print('no precomputed stats file found for %s.' % (bamFile)) + print(('no precomputed stats file found for %s.' % (bamFile))) return None @@ -116,12 +117,7 @@ def mapBamToGFF(bamFile,gff,sense = 'both',unique = 0,extension = 200,floor = 0, MMR = 1 unique = True - - - print('using a MMR value of %s' % (MMR)) - - senseTrans = maketrans('-+.','+-+') - + print(('using a MMR value of %s' % (MMR))) if type(gff) == str: gff = parseTable(gff,'\t') @@ -155,7 +151,7 @@ def mapBamToGFF(bamFile,gff,sense = 'both',unique = 0,extension = 200,floor = 0, for line in gff: line = line[0:9] if ticker%100 == 0: - print ticker + print(ticker) ticker+=1 gffLocus = Locus(line[0],int(line[3]),int(line[4]),line[6],line[1]) searchLocus = makeSearchLocus(gffLocus,int(extension),int(extension)) @@ -170,11 +166,11 @@ def mapBamToGFF(bamFile,gff,sense = 'both',unique = 0,extension = 200,floor = 0, locus = Locus(locus.chr(),locus.start()-extension,locus.end(),locus.sense(),locus.ID()) extendedReads.append(locus) if gffLocus.sense() == '+' or gffLocus.sense == '.': - senseReads = filter(lambda x:x.sense() == '+' or x.sense() == '.',extendedReads) - antiReads = filter(lambda x:x.sense() == '-',extendedReads) + senseReads = [x for x in extendedReads if x.sense() == '+' or x.sense() == '.'] + antiReads = [x for x in extendedReads if x.sense() == '-'] else: - senseReads = filter(lambda x:x.sense() == '-' or x.sense() == '.',extendedReads) - antiReads = filter(lambda x:x.sense() == '+',extendedReads) + senseReads = [x for x in extendedReads if x.sense() == '-' or x.sense() == '.'] + antiReads = [x for x in extendedReads if x.sense() == '+'] #at this point can output starts onto the GFF unless density is called @@ -195,12 +191,12 @@ def mapBamToGFF(bamFile,gff,sense = 'both',unique = 0,extension = 200,floor = 0, antiHash[x]+=1 #now apply flooring and filtering for coordinates - keys = uniquify(senseHash.keys()+antiHash.keys()) + keys = uniquify(list(senseHash.keys())+list(antiHash.keys())) if floor > 0: - keys = filter(lambda x: (senseHash[x]+antiHash[x]) > floor,keys) + keys = [x for x in keys if (senseHash[x]+antiHash[x]) > floor] #coordinate filtering - keys = filter(lambda x: gffLocus.start() < x < gffLocus.end(),keys) + keys = [x for x in keys if gffLocus.start() < x < gffLocus.end()] if clusterGram or matrix: clusterLine = [gffLocus.ID(),gffLocus.__str__()] @@ -219,7 +215,7 @@ def mapBamToGFF(bamFile,gff,sense = 'both',unique = 0,extension = 200,floor = 0, while n self.len(): - print "this locus is sad %s. please debug me" % (self.__str__()) - print "locus length is %s" % (self.len()) - print "phastBases are %s" % (phastBases) + print("this locus is sad %s. please debug me" % (self.__str__())) + print("locus length is %s" % (self.len())) + print("phastBases are %s" % (phastBases)) return phastSum/self.len() @@ -676,20 +677,20 @@ def __init__(self,loci,windowSize=50): for lcs in loci: self.__addLocus(lcs) def __addLocus(self,lcs): - if not(self.__loci.has_key(lcs)): + if not(lcs in self.__loci): self.__loci[lcs] = None if lcs.sense()=='.': chrKeyList = [lcs.chr()+'+', lcs.chr()+'-'] else: chrKeyList = [lcs.chr()+lcs.sense()] for chrKey in chrKeyList: - if not(self.__chrToCoordToLoci.has_key(chrKey)): self.__chrToCoordToLoci[chrKey] = dict() + if not(chrKey in self.__chrToCoordToLoci): self.__chrToCoordToLoci[chrKey] = dict() for n in self.__getKeyRange(lcs): - if not(self.__chrToCoordToLoci[chrKey].has_key(n)): self.__chrToCoordToLoci[chrKey][n] = [] + if not(n in self.__chrToCoordToLoci[chrKey]): self.__chrToCoordToLoci[chrKey][n] = [] self.__chrToCoordToLoci[chrKey][n].append(lcs) def __getKeyRange(self,locus): start = locus.start() / self.__winSize end = locus.end() / self.__winSize + 1 ## add 1 because of the range - return range(start,end) + return list(range(start,end)) def __len__(self): return len(self.__loci) @@ -697,9 +698,9 @@ def append(self,new): self.__addLocus(new) def extend(self,newList): for lcs in newList: self.__addLocus(lcs) def hasLocus(self,locus): - return self.__loci.has_key(locus) + return locus in self.__loci def remove(self,old): - if not(self.__loci.has_key(old)): raise ValueError("requested locus isn't in collection") + if not(old in self.__loci): raise ValueError("requested locus isn't in collection") del self.__loci[old] if old.sense()=='.': senseList = ['+','-'] else: senseList = [old.sense()] @@ -708,7 +709,7 @@ def remove(self,old): self.__chrToCoordToLoci[old.chr()+sense][k].remove(old) def getWindowSize(self): return self.__winSize - def getLoci(self): return self.__loci.keys() + def getLoci(self): return list(self.__loci.keys()) def getSize(self): size = 0 @@ -721,8 +722,8 @@ def getChrList(self): # i need to remove the strand info from the chromosome keys and make # them non-redundant. tempKeys = dict() - for k in self.__chrToCoordToLoci.keys(): tempKeys[k[:-1]] = None - return tempKeys.keys() + for k in list(self.__chrToCoordToLoci.keys()): tempKeys[k[:-1]] = None + return list(tempKeys.keys()) def __subsetHelper(self,locus,sense): sense = sense.lower() @@ -736,12 +737,12 @@ def __subsetHelper(self,locus,sense): else: raise ValueError("sense value was inappropriate: '"+sense+"'.") for s in filter(lamb, senses): chrKey = locus.chr()+s - if self.__chrToCoordToLoci.has_key(chrKey): + if chrKey in self.__chrToCoordToLoci: for n in self.__getKeyRange(locus): - if self.__chrToCoordToLoci[chrKey].has_key(n): + if n in self.__chrToCoordToLoci[chrKey]: for lcs in self.__chrToCoordToLoci[chrKey][n]: matches[lcs] = None - return matches.keys() + return list(matches.keys()) # sense can be 'sense' (default), 'antisense', or 'both' # returns all members of the collection that overlap the locus @@ -750,12 +751,12 @@ def getOverlap(self,locus,sense='sense'): ### now, get rid of the ones that don't really overlap realMatches = dict() if sense=='sense' or sense=='both': - for i in filter(lambda lcs: lcs.overlaps(locus), matches): + for i in [lcs for lcs in matches if lcs.overlaps(locus)]: realMatches[i] = None if sense=='antisense' or sense=='both': - for i in filter(lambda lcs: lcs.overlapsAntisense(locus), matches): + for i in [lcs for lcs in matches if lcs.overlapsAntisense(locus)]: realMatches[i] = None - return realMatches.keys() + return list(realMatches.keys()) # sense can be 'sense' (default), 'antisense', or 'both' # returns all members of the collection that are contained by the locus @@ -764,12 +765,12 @@ def getContained(self,locus,sense='sense'): ### now, get rid of the ones that don't really overlap realMatches = dict() if sense=='sense' or sense=='both': - for i in filter(lambda lcs: locus.contains(lcs), matches): + for i in [lcs for lcs in matches if locus.contains(lcs)]: realMatches[i] = None if sense=='antisense' or sense=='both': - for i in filter(lambda lcs: locus.containsAntisense(lcs), matches): + for i in [lcs for lcs in matches if locus.containsAntisense(lcs)]: realMatches[i] = None - return realMatches.keys() + return list(realMatches.keys()) # sense can be 'sense' (default), 'antisense', or 'both' # returns all members of the collection that contain the locus @@ -778,12 +779,12 @@ def getContainers(self,locus,sense='sense'): ### now, get rid of the ones that don't really overlap realMatches = dict() if sense=='sense' or sense=='both': - for i in filter(lambda lcs: lcs.contains(locus), matches): + for i in [lcs for lcs in matches if lcs.contains(locus)]: realMatches[i] = None if sense=='antisense' or sense=='both': - for i in filter(lambda lcs: lcs.containsAntisense(locus), matches): + for i in [lcs for lcs in matches if lcs.containsAntisense(locus)]: realMatches[i] = None - return realMatches.keys() + return list(realMatches.keys()) def stitchCollection(self,stitchWindow=1,sense='both'): @@ -887,8 +888,8 @@ def __init__(self,name,chr,sense,txCoords,cdCoords,exStarts,exEnds,commonName='' else: self._cdLocus = Locus(chr,min(cdCoords),max(cdCoords),sense) - exStarts = sorted(map(lambda i: i, exStarts)) - exEnds = sorted(map(lambda i: i, exEnds)) + exStarts = sorted([i for i in exStarts]) + exEnds = sorted([i for i in exEnds]) self._txExons = [] self._cdExons = [] @@ -942,9 +943,9 @@ def chr(self): return self._txLocus.chr() def sense(self): return self._txLocus.sense() def txLocus(self): return self._txLocus ## locus of full transcript def cdLocus(self): return self._cdLocus ## locus from start codon to end codon - def txExons(self): return map(lambda i: i, self._txExons) ## list of loci - def cdExons(self): return map(lambda i: i, self._cdExons) ## list of loci - def introns(self): return map(lambda i: i, self._introns) ## list of loci + def txExons(self): return [i for i in self._txExons] ## list of loci + def cdExons(self): return [i for i in self._cdExons] ## list of loci + def introns(self): return [i for i in self._introns] ## list of loci def fpUtr(self): return self._fpUTR ## locus def tpUtr(self): return self._tpUTR ## locus def isCoding(self): return not(self._cdLocus.start()==0 and self._cdLocus.end()==0) # boolean; is this gene protein-coding? @@ -1113,10 +1114,11 @@ def getRawReads(self,locus,sense,unique = False,includeJxnReads = False,printCom print(command) getReads = subprocess.Popen(command,stdin = subprocess.PIPE,stderr = subprocess.PIPE,stdout = subprocess.PIPE,shell = True) reads = getReads.communicate() - reads = reads[0].split('\n')[:-1] + reads = reads[0].decode("utf-8") + reads = reads.split('\n')[:-1] reads = [read.split('\t') for read in reads] if includeJxnReads == False: - reads = filter(lambda x: x[5].count('N') < 1,reads) + reads = [x for x in reads if x[5].count('N') < 1] #convertDict = {'16':'-','0':'+','64':'+','65':'+','80':'-','81':'-','129':'+','145':'-'} #convertDict = {'16':'-','0':'+','64':'+','65':'+','80':'-','81':'-','129':'+','145':'-','256':'+','272':'-','99':'+','147':'-'} @@ -1185,7 +1187,7 @@ def readsToLoci(self,reads,IDtag = 'sequence,seqID,none'): #then it filters out the '' and converts them to integers #only works for reads that span one junction - [first,gap,second] = [int(x) for x in filter(lambda x: len(x) > 0, re.findall(numPattern,read[5]))][0:3] + [first,gap,second] = [int(x) for x in [x for x in re.findall(numPattern,read[5]) if len(x) > 0]][0:3] if IDtag == 'sequence': loci.append(Locus(chrom,start,start+first,strand,ID[0:first])) loci.append(Locus(chrom,start+first+gap,start+first+gap+second,strand,ID[first:])) @@ -1282,7 +1284,7 @@ def order(x, NoneIsLast = True, decreasing = False): omitNone = True n = len(x) - ix = range(n) + ix = list(range(n)) if None not in x: ix.sort(reverse = decreasing, key = lambda j : x[j]) else: @@ -1294,7 +1296,7 @@ def key(i, x = x): return not(elem is None), elem else: return elem is None, elem - ix = range(n) + ix = list(range(n)) ix.sort(key=key, reverse=decreasing) if omitNone: @@ -1387,7 +1389,7 @@ def gffToFasta(genome,directory,gff,UCSC = True,useID=False): def pair(nuc): pairDict = {'A':'T','C':'G','G':'C','T':'A','U':'A','a':'t','c':'g','g':'c','t':'a','u':'a'} - if pairDict.has_key(nuc): + if nuc in pairDict: return pairDict[nuc] else: return nuc @@ -1398,13 +1400,13 @@ def pair(nuc): def revComp(seq,rev = True, RNA=False): if rev: - revComp = join(map(pair,seq[::-1]),'') + revComp = join(list(map(pair,seq[::-1])),'') else: - revComp = join(map(pair,seq),'') + revComp = join(list(map(pair,seq)),'') if RNA: revComp = revComp.replace('T','U') # Not sure this is defined (!) revComp = revComp.replace('t','u') else: revComp = revComp.replace('U','T') revComp = revComp.replace('u','t') - return revComp \ No newline at end of file + return revComp