Skip to content

Commit

Permalink
Merge branch 'issue_26'
Browse files Browse the repository at this point in the history
 * Bug fix for issue #26
 * Let user specify logging level in log file
 * Small formatting changes
  • Loading branch information
kbseah committed Jun 28, 2021
2 parents ab44371 + 340c390 commit 1b8cab2
Show file tree
Hide file tree
Showing 4 changed files with 71 additions and 38 deletions.
8 changes: 7 additions & 1 deletion bin/bleties
Original file line number Diff line number Diff line change
Expand Up @@ -13,6 +13,12 @@ parser = argparse.ArgumentParser(
formatter_class=argparse.ArgumentDefaultsHelpFormatter)
parser.add_argument(
"--log", default="bleties.log", help="Path to write log file")
parser.add_argument(
"--log_level", default="INFO",
help="""
Logging level; choose from: ERROR WARNING INFO DEBUG.
On screen messages will be at INFO level; see log file for DEBUG messages.
""")
parser.add_argument(
"-v", "--version", action="version", version=f"BleTIES {__version__}")
parser.add_argument(
Expand Down Expand Up @@ -375,7 +381,7 @@ if args.check_env:
# Logging
logging.basicConfig(
format='[%(asctime)s] %(name)-12s %(levelname)-8s %(message)s',
level=logging.DEBUG, filename=args.log, filemode="a")
level=args.log_level, filename=args.log, filemode="a")
console = logging.StreamHandler()
console.setLevel(logging.INFO)
formatter = logging.Formatter("%(name)-24s %(levelname)-8s %(message)s")
Expand Down
86 changes: 56 additions & 30 deletions bleties/Milraa.py
Original file line number Diff line number Diff line change
Expand Up @@ -230,8 +230,8 @@ def getPointers(seq, start, end, iesseq, name):
str
Putative pointer sequence
int, int
Adjusted IES start and IES end positions, such that the pointer sequence
in the MDS is to the _right_ of the insert.
Adjusted IES start and IES end positions, such that the pointer
sequence in the MDS is to the _right_ of the insert.
"""
# Give provisional name to this indel if none is supplied
if not name:
Expand All @@ -243,6 +243,7 @@ def getPointers(seq, start, end, iesseq, name):
indel = "D"
iesseq = seq[start-1: end]
else:
logger.error(f'Start less than end for feature {name}')
raise Exception(f"Start cannot be less than end, feature {name}")

# Remove gap characters from ies sequence
Expand All @@ -251,21 +252,28 @@ def getPointers(seq, start, end, iesseq, name):
elif isinstance(iesseq, SeqRecord):
iesseq = iesseq.seq.ungap("-")
else:
logger.error(
f'iesseq not of type str or SeqRecord but is {str(type(iesseq))}')
raise Exception(
f"iesseq must be of type str or SeqRecord but is {type(iesseq)}")
f"iesseq must be of type str or SeqRecord but is {str(type(iesseq))}")

pointer = ""
pointerstart, pointerend = start, end
leftcheck = ""
rightcheck = ""
# check left of IES
i = 0
while iesseq[i] == seq[end+i]:
leftcheck += iesseq[i]
i += 1
# break out of loop if the next position runs off edge
if i >= len(iesseq) or end+i >= len(seq):
break
if end < len(seq):
# Do not check pointer if insert position at end of reference seq
i = 0
while iesseq[i] == seq[end+i]:
leftcheck += iesseq[i]
i += 1
# break out of loop if the next position runs off edge
if i >= len(iesseq) or end+i >= len(seq):
break
else:
logger.debug(
f'insert position {str(end)} at or beyond length of ref sequence {str(len(seq))}: skip check left pointer')
# check right of IES
i = -1
if indel == "I": # beacuse GFF puts zero-length features to right of coordinate
Expand Down Expand Up @@ -544,8 +552,9 @@ def subreadNamesToZmwCoverage(qnames):
QNAME of a PacBio subread has the following convention:
{movieName}/{holeNumber}/{qStart}_{qEnd}
We want to count the number of holes (ZMWs), because a given hole may result
in multiple subreads.
We want to count the number of holes (ZMWs), because a given hole may
result in multiple subreads.
Parameters
----------
Expand Down Expand Up @@ -594,11 +603,11 @@ def __init__(self, alnfile, alnformat, refgenome):
Internally represented by:
_insSeqDict -- dict of sequences of detected inserts/deletions. Keys:
contig (str) -> startpos (int) -> endpos (int) -> indel len (int) ->
list of dicts containing data on the mapped read segments and their
coordinates. The coordinates startpos and endpos in the keys are 1-
based GFF convention. The coordinates in the dicts are 0-based
python convention
contig (str) -> startpos (int) -> endpos (int) -> indel len (int)
-> list of dicts containing data on the mapped read segments and
their coordinates. The coordinates startpos and endpos in the keys
are 1- based GFF convention. The coordinates in the dicts are
0-based python convention
_alnfile -- as below
_alnformat -- as below
_refgenome -- as below
Expand Down Expand Up @@ -654,10 +663,10 @@ def dump(self):
def _addIndelsFromCigar(self, alignedsegment, minlength):
"""Check if alignment contains indels above minimum length, and record
the corresponding breakpoints relative to the reference, and the insert
length.
length.
If the indel is an insert, insert length > 0.
If the indel is a deletion, insert length = 0.
If the indel is an insert, insert length > 0.
If the indel is a deletion, insert length = 0.
Recorded in self._insSeqDict, keyed by contig -> start pos -> end pos
-> insert length -> dict, where dict has fields
Expand All @@ -673,7 +682,8 @@ def _addIndelsFromCigar(self, alignedsegment, minlength):
minlength : int
Minimum length of indel for it to be recorded
"""
# Look for inserts that are completely spanned by the read (i.e. I operations)
# Look for inserts that are completely spanned by the read (i.e. I
# operations)
rname = alignedsegment.reference_name
qname = alignedsegment.query_name
ins_tuples = getCigarOpQuerySeqs(alignedsegment.query_sequence,
Expand Down Expand Up @@ -713,6 +723,7 @@ def _addIndelsFromCigar(self, alignedsegment, minlength):
self._insSeqDict[rname][indelstart][indelend][indellen].append(
record)


def findPutativeIes(self, minlength, ctg=None, start=None, stop=None):
"""Search alignment for clips and indels to identify putative IESs.
Record them in the _insSeqDict
Expand All @@ -736,6 +747,7 @@ def findPutativeIes(self, minlength, ctg=None, start=None, stop=None):
# Find indels (putative IESs) over the minimum length and record them
self._addIndelsFromCigar(line, minlength)


def reportPutativeIesInsertFuzzy(self, mininsbreaks, mindelbreaks, dist_threshold=0.05):
"""Report putative IESs where insert lengths do not match exactly
Expand Down Expand Up @@ -939,6 +951,7 @@ def reportAdjustPointers(self, ctg, ins_start, ins_end, consseq, breakpointid):
out['pp_pointer_end'] = ppend
return(ins_start, ins_end, out)


def reportPutativeIes(self, mininsbreaks, mindelbreaks):
"""After clips and indels have been recorded, report putative IESs above
the minimum coverage, and if the input alignment is a BAM file, then
Expand Down Expand Up @@ -1053,6 +1066,7 @@ def reportPutativeIes(self, mininsbreaks, mindelbreaks):

return(gff, outseq)


def reportIndelConsensusSeq(self, ctg, indelstart, indelend, indellen):
"""Report consensus of indel sequence
Expand Down Expand Up @@ -1089,6 +1103,7 @@ def reportIndelConsensusSeq(self, ctg, indelstart, indelend, indellen):
indelseq = str(self._refgenome[ctg].seq[indelstart - 1:indelend])
return(SeqRecord(Seq(indelseq, generic_dna)))


def reportIndelConsensusSeqFuzzy(self, ctg, indelstart, indelend, indellens):
"""Report consensus alignment of insert sequence when length of insert
is fuzzy matched
Expand Down Expand Up @@ -1178,6 +1193,7 @@ def reportIndelReadMismatchPc(
non_mm.append(mismatch_pc)
return(ins_mm, non_mm)


def getJuncClustersSeqs(self, clusters, rname, minseqs=10):
"""Get reference positions and alignment details for a set of
coordinate clusters
Expand Down Expand Up @@ -1211,8 +1227,10 @@ def getJuncClustersSeqs(self, clusters, rname, minseqs=10):
jcs.append({"positions": i, "seqs": seqs})
return(jcs)

def extractFlankingFromJcs(self, jcs, rname,
margin=100, lower_threshold=None, upper_threshold=None):

def extractFlankingFromJcs(
self, jcs, rname, margin=100, lower_threshold=None,
upper_threshold=None):
"""Extract insert of interest and flanking region from mapped reads
Parameters
Expand All @@ -1236,8 +1254,9 @@ def extractFlankingFromJcs(self, jcs, rname,
Returns
-------
list
list of SeqRecord of the extracted insert sequences and the flanking
regions from mapped query sequences, Seq.id is the query QNAME
list of SeqRecord of the extracted insert sequences and the
flanking regions from mapped query sequences, Seq.id is the query
QNAME
tuple
range of coordinates (min, max) on reference contig where the
extracted insert sequences are positioned
Expand Down Expand Up @@ -1285,7 +1304,9 @@ def extractFlankingFromJcs(self, jcs, rname,
else:
return(None, None)

def spoaConsensusToFlanking(self, seqs, rname, rstart, rend, margin=100, mode=1):

def spoaConsensusToFlanking(
self, seqs, rname, rstart, rend, margin=100, mode=1):
"""Get consensus of subreads and align to flanking regions on reference
genome
Expand All @@ -1301,8 +1322,8 @@ def spoaConsensusToFlanking(self, seqs, rname, rstart, rend, margin=100, mode=1)
Parameters
----------
seqs : list
List of SeqRecord objects representing subread fragments that map to
region of interest and contain insert.
List of SeqRecord objects representing subread fragments that map
to region of interest and contain insert.
rname : str
Name of the contig of interest
rstart : int
Expand Down Expand Up @@ -1352,6 +1373,7 @@ def spoaConsensusToFlanking(self, seqs, rname, rstart, rend, margin=100, mode=1)
f"Inserts at {rname} {str(rstart)}-{str(rend)} anomalous; flanking + insert shorter than ref")
return(None)


def reportPutativeIesInsertSubreads(self, mininsbreaks, mindelbreaks,
margin=100, max_cluster_dist=5,
len_threshold=0.25):
Expand Down Expand Up @@ -1425,12 +1447,14 @@ def reportPutativeIesInsertSubreads(self, mininsbreaks, mindelbreaks,
jcs, rname, margin, 1.0-len_threshold, 1.0+len_threshold)
aln = None
if extr and coords:
# logging.debug(f"contig {rname} coordinates {str(coords[0])} {str(coords[1])}")
logger.debug(f"contig {rname} coordinates {str(coords[0])} {str(coords[1])}")
aln = self.spoaConsensusToFlanking(
extr, rname, coords[0], coords[1], margin=100, mode=1)
extr, rname, coords[0], coords[1], margin, mode=1)
if aln:
consseq, adjpos = findLongestInsert(
aln, rname, coords[0]-margin)
logger.debug(f'consseq {str(consseq)}')
logger.debug(f'adjpos {str(adjpos)}')
# Complain if predicted insert location from findLongestInsert
# is more than 5 bp away from the approximate insert location
problem = None
Expand Down Expand Up @@ -1482,6 +1506,8 @@ def reportPutativeIesInsertSubreads(self, mininsbreaks, mindelbreaks,
Seq(consseq), id=breakpointid, description=";".join(attr)+";")
# Find pointers if present
if len(consseq) > 0:
logger.debug(f'report adjust pointers')
logger.debug(f'rname {rname} gffpos {str(gffpos)} consseq {str(consseq.seq)} breakpointid {breakpointid}')
gffpos, gffpos, pointerdict = self.reportAdjustPointers(
rname, gffpos, gffpos, consseq, breakpointid)
for i in pointerdict:
Expand Down
2 changes: 1 addition & 1 deletion bleties/__init__.py
Original file line number Diff line number Diff line change
@@ -1,2 +1,2 @@
__all__ = ["main", "Milraa", "Milret", "Milcor", "Miltel", "Insert"]
__version__ = "v0.1.9"
__version__ = "v0.1.10-alpha"
13 changes: 7 additions & 6 deletions bleties/main.py
Original file line number Diff line number Diff line change
Expand Up @@ -24,19 +24,19 @@ def check_env():
# Get versions
dep_vers = {}
dep_vers['pysam'] = pysam.__version__
dep_vers['biopython'] = biopython_v
dep_vers['biopython'] = biopython_v
# Each program reports its version string differently
dep_vers['htslib'] = run(['htsfile','--version'], capture_output=True).stdout.decode().split("\n")[0]
dep_vers['htslib'] = run(['htsfile', '--version'], capture_output=True).stdout.decode().split("\n")[0]
dep_vers['htslib'] = re.search(r"\(htslib\) (\S+)", dep_vers['htslib']).group(1)
dep_vers['spoa'] = run(['spoa','--version'], capture_output=True).stdout.decode().rstrip()
dep_vers['NCRF'] = run(['NCRF','--version'], capture_output=True).stderr.decode().split("\n")[0]
dep_vers['spoa'] = run(['spoa', '--version'], capture_output=True).stdout.decode().rstrip()
dep_vers['NCRF'] = run(['NCRF', '--version'], capture_output=True).stderr.decode().split("\n")[0]
dep_vers['NCRF'] = re.search(r"version (\S+)", dep_vers['NCRF']).group(1)
dep_vers['muscle'] = run(['muscle','-version'], capture_output=True).stdout.decode().rstrip()
dep_vers['muscle'] = run(['muscle', '-version'], capture_output=True).stdout.decode().rstrip()
dep_vers['muscle'] = re.search(r"v\S+", dep_vers['muscle']).group(0)
# Get paths
dep_paths = {}
for prog in ['spoa', 'NCRF', 'muscle']:
dep_paths[prog] = run(['which',prog], capture_output=True).stdout.decode().rstrip()
dep_paths[prog] = run(['which', prog], capture_output=True).stdout.decode().rstrip()
out = []
for prog in dep_vers:
if prog in dep_paths:
Expand Down Expand Up @@ -108,6 +108,7 @@ def read_bam_ref(bam, fasta=None):


def run_boilerplate(logger):
"""Basic info on BleTIES run for log"""
logger.info(f"BleTIES {__version__}")
logger.info("Dependencies: ")
for dep in check_env():
Expand Down

0 comments on commit 1b8cab2

Please sign in to comment.