Merge branch 'issue_26'

* Bug fix for issue #26 * Let user specify logging level in log file * Small formatting changes
Swart-lab · Jun 28, 2021 · 1b8cab2 · 1b8cab2
2 parents ab44371 + 340c390
commit 1b8cab2
Show file tree

Hide file tree

Showing 4 changed files with 71 additions and 38 deletions.
diff --git a/bin/bleties b/bin/bleties
@@ -13,6 +13,12 @@ parser = argparse.ArgumentParser(
     formatter_class=argparse.ArgumentDefaultsHelpFormatter)
 parser.add_argument(
     "--log", default="bleties.log", help="Path to write log file")
+parser.add_argument(
+    "--log_level", default="INFO",
+    help="""
+    Logging level; choose from: ERROR WARNING INFO DEBUG.
+    On screen messages will be at INFO level; see log file for DEBUG messages.
+    """)
 parser.add_argument(
     "-v", "--version", action="version", version=f"BleTIES {__version__}")
 parser.add_argument(
@@ -375,7 +381,7 @@ if args.check_env:
 # Logging
 logging.basicConfig(
     format='[%(asctime)s] %(name)-12s %(levelname)-8s %(message)s',
-    level=logging.DEBUG, filename=args.log, filemode="a")
+    level=args.log_level, filename=args.log, filemode="a")
 console = logging.StreamHandler()
 console.setLevel(logging.INFO)
 formatter = logging.Formatter("%(name)-24s %(levelname)-8s %(message)s")

diff --git a/bleties/Milraa.py b/bleties/Milraa.py
@@ -230,8 +230,8 @@ def getPointers(seq, start, end, iesseq, name):
     str
         Putative pointer sequence
     int, int
-        Adjusted IES start and IES end positions, such that the pointer sequence
-        in the MDS is to the _right_ of the insert.
+        Adjusted IES start and IES end positions, such that the pointer
+        sequence in the MDS is to the _right_ of the insert.
     """
     # Give provisional name to this indel if none is supplied
     if not name:
@@ -243,6 +243,7 @@ def getPointers(seq, start, end, iesseq, name):
         indel = "D"
         iesseq = seq[start-1: end]
     else:
+        logger.error(f'Start less than end for feature {name}')
         raise Exception(f"Start cannot be less than end, feature {name}")
 
     # Remove gap characters from ies sequence
@@ -251,21 +252,28 @@ def getPointers(seq, start, end, iesseq, name):
     elif isinstance(iesseq, SeqRecord):
         iesseq = iesseq.seq.ungap("-")
     else:
+        logger.error(
+            f'iesseq not of type str or SeqRecord but is {str(type(iesseq))}')
         raise Exception(
-            f"iesseq must be of type str or SeqRecord but is {type(iesseq)}")
+            f"iesseq must be of type str or SeqRecord but is {str(type(iesseq))}")
 
     pointer = ""
     pointerstart, pointerend = start, end
     leftcheck = ""
     rightcheck = ""
     # check left of IES
-    i = 0
-    while iesseq[i] == seq[end+i]:
-        leftcheck += iesseq[i]
-        i += 1
-        # break out of loop if the next position runs off edge
-        if i >= len(iesseq) or end+i >= len(seq):
-            break
+    if end < len(seq):
+        # Do not check pointer if insert position at end of reference seq
+        i = 0
+        while iesseq[i] == seq[end+i]:
+            leftcheck += iesseq[i]
+            i += 1
+            # break out of loop if the next position runs off edge
+            if i >= len(iesseq) or end+i >= len(seq):
+                break
+    else:
+        logger.debug(
+            f'insert position {str(end)} at or beyond length of ref sequence {str(len(seq))}: skip check left pointer')
     # check right of IES
     i = -1
     if indel == "I":  # beacuse GFF puts zero-length features to right of coordinate
@@ -544,8 +552,9 @@ def subreadNamesToZmwCoverage(qnames):
 
     QNAME of a PacBio subread has the following convention:
     {movieName}/{holeNumber}/{qStart}_{qEnd}
-    We want to count the number of holes (ZMWs), because a given hole may result
-    in multiple subreads.
+
+    We want to count the number of holes (ZMWs), because a given hole may
+    result in multiple subreads.
 
     Parameters
     ----------
@@ -594,11 +603,11 @@ def __init__(self, alnfile, alnformat, refgenome):
 
         Internally represented by:
         _insSeqDict -- dict of sequences of detected inserts/deletions. Keys:
-            contig (str) -> startpos (int) -> endpos (int) -> indel len (int) ->
-            list of dicts containing data on the mapped read segments and their
-            coordinates. The coordinates startpos and endpos in the keys are 1-
-            based GFF convention. The coordinates in the dicts are 0-based
-            python convention
+            contig (str) -> startpos (int) -> endpos (int) -> indel len (int)
+            -> list of dicts containing data on the mapped read segments and
+            their coordinates. The coordinates startpos and endpos in the keys
+            are 1- based GFF convention. The coordinates in the dicts are
+            0-based python convention
         _alnfile -- as below
         _alnformat -- as below
         _refgenome -- as below
@@ -654,10 +663,10 @@ def dump(self):
     def _addIndelsFromCigar(self, alignedsegment, minlength):
         """Check if alignment contains indels above minimum length, and record
         the corresponding breakpoints relative to the reference, and the insert
-        length. 
+        length.
 
-        If the indel is an insert, insert length > 0. 
-        If the indel is a deletion, insert length = 0. 
+        If the indel is an insert, insert length > 0.
+        If the indel is a deletion, insert length = 0.
 
         Recorded in self._insSeqDict, keyed by contig -> start pos -> end pos
         -> insert length -> dict, where dict has fields
@@ -673,7 +682,8 @@ def _addIndelsFromCigar(self, alignedsegment, minlength):
         minlength : int
             Minimum length of indel for it to be recorded
         """
-        # Look for inserts that are completely spanned by the read (i.e. I operations)
+        # Look for inserts that are completely spanned by the read (i.e. I
+        # operations)
         rname = alignedsegment.reference_name
         qname = alignedsegment.query_name
         ins_tuples = getCigarOpQuerySeqs(alignedsegment.query_sequence,
@@ -713,6 +723,7 @@ def _addIndelsFromCigar(self, alignedsegment, minlength):
                     self._insSeqDict[rname][indelstart][indelend][indellen].append(
                         record)
 
+
     def findPutativeIes(self, minlength, ctg=None, start=None, stop=None):
         """Search alignment for clips and indels to identify putative IESs.
         Record them in the _insSeqDict
@@ -736,6 +747,7 @@ def findPutativeIes(self, minlength, ctg=None, start=None, stop=None):
                 # Find indels (putative IESs) over the minimum length and record them
                 self._addIndelsFromCigar(line, minlength)
 
+
     def reportPutativeIesInsertFuzzy(self, mininsbreaks, mindelbreaks, dist_threshold=0.05):
         """Report putative IESs where insert lengths do not match exactly
 
@@ -939,6 +951,7 @@ def reportAdjustPointers(self, ctg, ins_start, ins_end, consseq, breakpointid):
             out['pp_pointer_end'] = ppend
         return(ins_start, ins_end, out)
 
+
     def reportPutativeIes(self, mininsbreaks, mindelbreaks):
         """After clips and indels have been recorded, report putative IESs above
         the minimum coverage, and if the input alignment is a BAM file, then
@@ -1053,6 +1066,7 @@ def reportPutativeIes(self, mininsbreaks, mindelbreaks):
 
         return(gff, outseq)
 
+
     def reportIndelConsensusSeq(self, ctg, indelstart, indelend, indellen):
         """Report consensus of indel sequence
 
@@ -1089,6 +1103,7 @@ def reportIndelConsensusSeq(self, ctg, indelstart, indelend, indellen):
             indelseq = str(self._refgenome[ctg].seq[indelstart - 1:indelend])
             return(SeqRecord(Seq(indelseq, generic_dna)))
 
+
     def reportIndelConsensusSeqFuzzy(self, ctg, indelstart, indelend, indellens):
         """Report consensus alignment of insert sequence when length of insert
         is fuzzy matched
@@ -1178,6 +1193,7 @@ def reportIndelReadMismatchPc(
                 non_mm.append(mismatch_pc)
         return(ins_mm, non_mm)
 
+
     def getJuncClustersSeqs(self, clusters, rname, minseqs=10):
         """Get reference positions and alignment details for a set of
         coordinate clusters
@@ -1211,8 +1227,10 @@ def getJuncClustersSeqs(self, clusters, rname, minseqs=10):
                 jcs.append({"positions": i, "seqs": seqs})
         return(jcs)
 
-    def extractFlankingFromJcs(self, jcs, rname,
-                               margin=100, lower_threshold=None, upper_threshold=None):
+
+    def extractFlankingFromJcs(
+            self, jcs, rname, margin=100, lower_threshold=None,
+            upper_threshold=None):
         """Extract insert of interest and flanking region from mapped reads
 
         Parameters
@@ -1236,8 +1254,9 @@ def extractFlankingFromJcs(self, jcs, rname,
         Returns
         -------
         list
-            list of SeqRecord of the extracted insert sequences and the flanking
-            regions from mapped query sequences, Seq.id is the query QNAME
+            list of SeqRecord of the extracted insert sequences and the
+            flanking regions from mapped query sequences, Seq.id is the query
+            QNAME
         tuple
             range of coordinates (min, max) on reference contig where the
             extracted insert sequences are positioned
@@ -1285,7 +1304,9 @@ def extractFlankingFromJcs(self, jcs, rname,
         else:
             return(None, None)
 
-    def spoaConsensusToFlanking(self, seqs, rname, rstart, rend, margin=100, mode=1):
+
+    def spoaConsensusToFlanking(
+            self, seqs, rname, rstart, rend, margin=100, mode=1):
         """Get consensus of subreads and align to flanking regions on reference
         genome
 
@@ -1301,8 +1322,8 @@ def spoaConsensusToFlanking(self, seqs, rname, rstart, rend, margin=100, mode=1)
         Parameters
         ----------
         seqs : list
-            List of SeqRecord objects representing subread fragments that map to 
-            region of interest and contain insert.
+            List of SeqRecord objects representing subread fragments that map
+            to region of interest and contain insert.
         rname : str
             Name of the contig of interest
         rstart : int
@@ -1352,6 +1373,7 @@ def spoaConsensusToFlanking(self, seqs, rname, rstart, rend, margin=100, mode=1)
                 f"Inserts at {rname} {str(rstart)}-{str(rend)} anomalous; flanking + insert shorter than ref")
             return(None)
 
+
     def reportPutativeIesInsertSubreads(self, mininsbreaks, mindelbreaks,
                                         margin=100, max_cluster_dist=5,
                                         len_threshold=0.25):
@@ -1425,12 +1447,14 @@ def reportPutativeIesInsertSubreads(self, mininsbreaks, mindelbreaks,
                     jcs, rname, margin, 1.0-len_threshold, 1.0+len_threshold)
                 aln = None
                 if extr and coords:
-                    # logging.debug(f"contig {rname} coordinates {str(coords[0])} {str(coords[1])}")
+                    logger.debug(f"contig {rname} coordinates {str(coords[0])} {str(coords[1])}")
                     aln = self.spoaConsensusToFlanking(
-                        extr, rname, coords[0], coords[1], margin=100, mode=1)
+                        extr, rname, coords[0], coords[1], margin, mode=1)
                 if aln:
                     consseq, adjpos = findLongestInsert(
                         aln, rname, coords[0]-margin)
+                    logger.debug(f'consseq {str(consseq)}')
+                    logger.debug(f'adjpos {str(adjpos)}')
                     # Complain if predicted insert location from findLongestInsert
                     # is more than 5 bp away from the approximate insert location
                     problem = None
@@ -1482,6 +1506,8 @@ def reportPutativeIesInsertSubreads(self, mininsbreaks, mindelbreaks,
                         Seq(consseq), id=breakpointid, description=";".join(attr)+";")
                     # Find pointers if present
                     if len(consseq) > 0:
+                        logger.debug(f'report adjust pointers')
+                        logger.debug(f'rname {rname} gffpos {str(gffpos)} consseq {str(consseq.seq)} breakpointid {breakpointid}')
                         gffpos, gffpos, pointerdict = self.reportAdjustPointers(
                             rname, gffpos, gffpos, consseq, breakpointid)
                         for i in pointerdict:

diff --git a/bleties/__init__.py b/bleties/__init__.py
@@ -1,2 +1,2 @@
 __all__ = ["main", "Milraa", "Milret", "Milcor", "Miltel", "Insert"]
-__version__ = "v0.1.9"
+__version__ = "v0.1.10-alpha"
diff --git a/bleties/main.py b/bleties/main.py
@@ -24,19 +24,19 @@ def check_env():
     # Get versions
     dep_vers = {}
     dep_vers['pysam'] = pysam.__version__
-    dep_vers['biopython'] =  biopython_v
+    dep_vers['biopython'] = biopython_v
     # Each program reports its version string differently
-    dep_vers['htslib'] = run(['htsfile','--version'], capture_output=True).stdout.decode().split("\n")[0]
+    dep_vers['htslib'] = run(['htsfile', '--version'], capture_output=True).stdout.decode().split("\n")[0]
     dep_vers['htslib'] = re.search(r"\(htslib\) (\S+)", dep_vers['htslib']).group(1)
-    dep_vers['spoa'] = run(['spoa','--version'], capture_output=True).stdout.decode().rstrip()
-    dep_vers['NCRF'] = run(['NCRF','--version'], capture_output=True).stderr.decode().split("\n")[0]
+    dep_vers['spoa'] = run(['spoa', '--version'], capture_output=True).stdout.decode().rstrip()
+    dep_vers['NCRF'] = run(['NCRF', '--version'], capture_output=True).stderr.decode().split("\n")[0]
     dep_vers['NCRF'] = re.search(r"version (\S+)", dep_vers['NCRF']).group(1)
-    dep_vers['muscle'] = run(['muscle','-version'], capture_output=True).stdout.decode().rstrip()
+    dep_vers['muscle'] = run(['muscle', '-version'], capture_output=True).stdout.decode().rstrip()
     dep_vers['muscle'] = re.search(r"v\S+", dep_vers['muscle']).group(0)
     # Get paths
     dep_paths = {}
     for prog in ['spoa', 'NCRF', 'muscle']:
-        dep_paths[prog] = run(['which',prog], capture_output=True).stdout.decode().rstrip()
+        dep_paths[prog] = run(['which', prog], capture_output=True).stdout.decode().rstrip()
     out = []
     for prog in dep_vers:
         if prog in dep_paths:
@@ -108,6 +108,7 @@ def read_bam_ref(bam, fasta=None):
 
 
 def run_boilerplate(logger):
+    """Basic info on BleTIES run for log"""
     logger.info(f"BleTIES {__version__}")
     logger.info("Dependencies: ")
     for dep in check_env():