Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Generate output patch #56

Open
wants to merge 9 commits into
base: master
Choose a base branch
from
319 changes: 226 additions & 93 deletions BESST/GenerateOutput.py
Original file line number Diff line number Diff line change
@@ -1,20 +1,23 @@
'''
Created on Sep 29, 2011

@author: ksahlin


Updates on march, 2017
Updates on august, 2018
@contributor: sletort

This file is part of BESST.

BESST is free software: you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation, either version 3 of the License, or
(at your option) any later version.

BESST is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.

You should have received a copy of the GNU General Public License
along with BESST. If not, see <http://www.gnu.org/licenses/>.
'''
Expand Down Expand Up @@ -91,118 +94,250 @@ def WriteToF(F, Contigs, list_of_contigs):
info_list.append((cont_obj.name, cont_obj.direction, cont_obj.position, cont_obj.length, cont_obj.sequence)) #,cont_obj.links
if cont_obj.position < 0:
print('Write to F: Position is negative!', cont_obj.position, cont_obj.name, cont_obj.direction)
#del Contigs[cont_obj.name]
#del Contigs[cont_obj.name]
F.append(info_list)
return(F)

# ==================================
def compute_kmer_overlap( end1,end2 ):
# should be a class method of Scaffold ?
i = len(end1)
while i > 0:
if end1[-i:] == end2[:i]:
return i
i -= 1
return i

# thoses classes are based on AGP concept
# Scaffold is 'Object' in AGP concept
class Component(object):
"""'Component' is for what compose the 'Object', 'Contig' and 'Gap'."""
i = 0 # an instance counter, a way to have id for component.

def __init__( self, id, len, seq ):
super( Component, self ).__init__()
Component.i += 1

self._len = len
self._id = id if( id != None ) else Component.i
self._seq = seq

# position of the first/last contig base that belong to the scaffold
# contig coordinate
self._scaff_start = 1
self._scaff_end = len

def __del__( self ):
Component.i -= 1

@property
def seq( self ):
return self._seq

@property
def len( self ):
return self._len

@property
def id( self ):
return self._id

@property
def scaff_start( self ):
return self._scaff_start
@scaff_start.setter
def scaff_start( self, value ):
self._scaff_start = value
#~ self._scaff_end -= value

@property
def scaff_end( self ):
return self._scaff_end

def get_scaff_ctg_coords( self ):
return [ self.scaff_start, self.scaff_end ]

def __str__( self ):
s = "\t".join([ str(x) for x in [self._id, self._len] ])
s += "\t".join([ str(x) for x in [ '\ncoords' ]+self.get_scaff_ctg_coords() ])
s += "\n" + self._seq
return s

class Contig( Component ):
'''a Contig is a Component with a direction(=strand).

This class provide a builder and sequence is returned regarding direction.
'''
def __init__( self, id, len, direction, seq ):
super( Contig, self ).__init__( id, len, seq )
self.__direction = direction

@classmethod
def buildFromTuple( cls, t_ctg ):
return cls( t_ctg[0], t_ctg[3], t_ctg[1], t_ctg[4] )

@property
def seq( self ):
if self.direction:
return self._seq
else:
return RevComp( self._seq,rev_nuc )

@property
def direction( self ):
return self.__direction

class Gap( Component ):
'''a Gap is a component whose sequence is a repetition of N or n.'''

def __init__( self, char, len ):
super( Gap, self ).__init__( None, len, char*len )

# ==================================

class Scaffold(object):
"""docstring for Scaffold"""
def __init__(self, name, param, info_tuple):
super(Scaffold, self).__init__()
self.name = name
super( Scaffold, self ).__init__()
self.name = name
self.param = param

self.seqs = [x[4] for x in info_tuple]
self.gaps = list(map(lambda x,y : y[2] - (x[2] + x[3]), info_tuple[:-1],info_tuple[1:]))
self.directions = [x[1] for x in info_tuple]
self.positions = [(x[2],x[2] + x[3] - 1) for x in info_tuple]
self.contigs = [x[0] for x in info_tuple]

def check_kmer_overlap(self,end1,end2):
i = len(end1)
while i > 0:
if end1[-i:] == end2[:i]:
return i
i -= 1
return i

def get_sequence(self, string, direction):
if direction:
return string
else:
return RevComp(string,rev_nuc)
self.l_components = []
self.__scaff_len = 0
self.__buildComponents( info_tuple )

@property
def scaff_len( self ):
return self.__scaff_len

def make_fasta_string(self,fasta_file):
fasta = []
#fasta.append('>{0}\n'.format(self.name))
fasta.append('>'+str(self.name)+'\n')
# first contig
def __incorporateComponent( self, o_compo, compo_pos=0 ):
ctg_offset = 0

fasta.append( self.get_sequence(self.seqs[0], self.directions[0]))
if( 0 == len( self.l_components ) ):
# the first contig align all along the scaffold
s_start = compo_pos + 1 # if the first contig start on base 1 of the scaffold, use 1 make it easier to understand
# compo_pos + 1 to turn to 1-based.
else:
s_start = self.scaff_len + 1 # the base following the last one

for i in range(len(self.seqs)-1):
gap = self.gaps[i]
if gap <= 2*self.param.std_dev_ins_size:
overlap = self.check_kmer_overlap( self.get_sequence(self.seqs[i], self.directions[i])[-self.param.max_contig_overlap:], self.get_sequence(self.seqs[i+1], self.directions[i+1])[:self.param.max_contig_overlap])
if overlap >= 20:
fasta.append('n' + self.get_sequence(self.seqs[i+1], self.directions[i+1])[overlap:])
print('merging {0} bp here'.format(overlap), file=self.param.information_file)
else:
#print gap
if gap <= 1:
fasta.append('n' + self.get_sequence(self.seqs[i+1], self.directions[i+1]))
else:
fasta.append('N'*int(gap) + self.get_sequence(self.seqs[i+1], self.directions[i+1]))
else:
if gap <= 1:
fasta.append('n' + self.get_sequence(self.seqs[i+1], self.directions[i+1]))
else:
fasta.append('N'*int(gap) + self.get_sequence(self.seqs[i+1], self.directions[i+1]))
s_end = s_start + o_compo.len - o_compo.scaff_start
self.__scaff_len = s_end
self.l_components.append([ o_compo, s_start, s_end ])

def __buildComponents( self, l_info_tuples ):
o_ctg = Contig.buildFromTuple( l_info_tuples[0] )

self.__incorporateComponent( o_ctg, l_info_tuples[0][2] )

print(''.join([ x for x in fasta]), file=fasta_file)
max_gap_size = 2*self.param.std_dev_ins_size # not good var name, max_small_gap_size ?
max_overlap = self.param.max_contig_overlap

def make_AGP_string(self, AGP_file):
for i in range( 1, len( l_info_tuples ) ):
o_next = Contig.buildFromTuple( l_info_tuples[i] )
next_pos = l_info_tuples[i][2] +1 # 1-based

gap_estimate = next_pos - ( l_info_tuples[i-1][2]+1 + o_ctg.len )
if( max_gap_size < gap_estimate ):
#~ print "***Big Gap"
o_gap = Gap( 'N', gap_estimate )
else:
# compute the real overlap
prev_seq = o_ctg.seq[-max_overlap:]
next_seq = o_next.seq[:max_overlap]
overlap = compute_kmer_overlap( prev_seq, next_seq )

#~ print "Overlap = " + str( overlap )
if( 20 <= overlap ): # big overlap
#~ print "***Big Overlap"
print( 'merging {0} bp here'.format(overlap), file=self.param.information_file )
o_gap = Gap( 'n', 1 )
o_next.scaff_start = overlap + 1 # +1 to turn 1-based
elif( gap_estimate <= 1 ): # supposed overlap, but not found
#~ print "***Supposed overlap"
o_gap = Gap( 'n', 1 )
else:
o_gap = Gap( 'N', gap_estimate )

self.__incorporateComponent( o_gap )
self.__incorporateComponent( o_next )
o_ctg = o_next

def make_fasta_string( self,fasta_file ):
'''write a fasta sequence from the object.

Builds a header and concatenates Component sequences.
When no overlapping, offset will be 0.
'''
l_fasta = [ '>'+str(self.name)+'\n' ]
for o_compo in [ l_[0] for l_ in self.l_components ]:
offset = o_compo.scaff_start - 1 # scaff_start is 1-based
l_fasta.append( o_compo.seq[offset:] )

print( ''.join( l_fasta ), file=fasta_file )

def make_AGP_string( self, AGP_file ):
'''write AGP rows from the object.

AGP row starts with scaff_id, scaff_start, scaff_end.
Follows the component number, type.
Then depending on type we have
for type=contig id, scaff_start, scaff_end, orientation
for type=gap len, gap_type, linkage, links_evidence
'''
component_count = 0
for i in range( len(self.seqs) ):
sign = '+' if self.directions[i] else '-'
if i > 0 and self.gaps[i-1] > 0:
component_count += 1
obj_start = self.positions[i-1][1] + 2
obj_end = self.positions[i][0]
compo_len = self.gaps[i-1]
l_elts = [ self.name, obj_start, obj_end, component_count ]
l_elts +=[ 'N', compo_len, 'scaffold', 'yes', 'paired-ends' ]
print('\t'.join([ str(x) for x in l_elts ]), file=AGP_file)

for l_compo in self.l_components:
component_count += 1
obj_start = self.positions[i][0] + 1
obj_end = self.positions[i][1] + 1
compo_len = self.positions[i][1] - self.positions[i][0] + 1
l_elts = [ self.name, obj_start, obj_end, component_count ]
l_elts +=[ 'W', self.contigs[i],'1', compo_len, sign ]
print('\t'.join([ str(x) for x in l_elts ]), file=AGP_file)
o_compo = l_compo[0]
l_elts = [ self.name, l_compo[1], l_compo[2], component_count ]

if( isinstance( o_compo, Gap ) ):
l_component = [ 'N', o_compo.len, 'scaffold', 'yes', 'paired-ends' ]
else:
strand = '+' if o_compo.direction else '-'
l_component = [ 'W', o_compo.id ]
l_component += o_compo.get_scaff_ctg_coords()
l_component.append( strand )

print( '\t'.join([ str(x) for x in l_elts + l_component ]), file=AGP_file )

def make_GFF_string( self, gff_file ):
'''write GFF rows from the object.

GFF row starts with scaff_id, source, component type, scaff_start, scaff_end.
Follows score, strand, phase and ends with attributes.
Note: we only provide ID and Name as attributes.
'''
source = 'besst_assembly'
for i in range( len(self.seqs) ):
sign = '+' if self.directions[i] else '-'
if i > 0 and self.gaps[i-1] > 0:
obj_start = self.positions[i-1][1] + 2
obj_end = self.positions[i][0]
l_attrs = []
l_elts = [ self.name, source, 'gap', obj_start, obj_end ]
l_elts += [ '.', '.', '.', ';'.join( l_attrs ) ]
print('\t'.join([ str(x) for x in l_elts ]), file=gff_file)

obj_start = self.positions[i][0] + 1
obj_end = self.positions[i][1] + 1
name = "_".join( self.contigs[i].split('_',2)[:2] ) # only kept NODE_XXXX from ID
l_attrs = [ 'ID='+self.contigs[i], 'Name='+name ]
l_elts = [ self.name, source, 'contig', obj_start, obj_end ]
l_elts += [ '.', sign, '.', ';'.join( l_attrs ) ]
print('\t'.join([ str(x) for x in l_elts ]), file=gff_file)

def PrintOutput(F, Information, output_dest, param, pass_nr):

for l_compo in self.l_components:
o_compo = l_compo[0]

( score, strand, phase ) = ( '.', '.', '.' )
if( isinstance( o_compo, Gap ) ):
type_ = 'gap'
l_attrs = []
else:
type_ = 'contig'
strand = '+' if o_compo.direction else '-'
name = "_".join( o_compo.id.split('_',2)[:2] ) # only kept NODE_XXXX from ID
l_attrs = [ 'ID='+ o_compo.id, 'Name='+name ]

l_elts = [ self.name, source, type_, l_compo[1], l_compo[2] ]
l_elts += [ score, strand, phase, ';'.join( l_attrs ) ]

print( '\t'.join([ str(x) for x in l_elts ]), file=gff_file )


def PrintOutput(F, param, pass_nr):
try:
os.mkdir(param.output_directory + '/pass' + str(pass_nr))
except OSError:
#directory is already created
pass
#contigs_before=len(C_dict)
contigs_after = len(F)
print('(super)Contigs after scaffolding: ' + str(contigs_after) + '\n', file=Information)
print('(super)Contigs after scaffolding: ' + str(contigs_after) + '\n', file=param.information_file)
gff_file = open(param.output_directory + '/pass' + str(pass_nr) + '/info-pass' + str(pass_nr) + '.gff', 'w')
AGP_file = open(param.output_directory + '/pass' + str(pass_nr) + '/info-pass' + str(pass_nr) + '.agp', 'w')
print('##gff-version 3', file=gff_file)
Expand All @@ -219,8 +354,8 @@ def PrintOutput(F, Information, output_dest, param, pass_nr):


s = Scaffold('scaffold_' + str(header_index) + "_uid_" + unique_id , param, scaf)
s.make_fasta_string(fasta_file)
s.make_AGP_string(AGP_file)
s.make_fasta_string( fasta_file )
s.make_AGP_string( AGP_file )
s.make_GFF_string( gff_file )

return()
Expand All @@ -230,5 +365,3 @@ def RevComp(string, rev_nuc):
#rev_nuc={'A':'T','C':'G','G':'C','T':'A','N':'N','X':'X'}
rev_comp = ''.join([rev_nuc[nucl] for nucl in reversed(string)])
return(rev_comp)


2 changes: 1 addition & 1 deletion runBESST
Original file line number Diff line number Diff line change
Expand Up @@ -215,7 +215,7 @@ def main(options):
F = GO.WriteToF(F, Contigs, list_of_contigs)


GO.PrintOutput(F, Information, param.output_directory, param, i + 1)
GO.PrintOutput(F, param, i + 1)

if not options.separate_repeats:
destination = open("{0}/pass{1}/Scaffolds_pass{1}.fa".format(param.output_directory, str(i+1) ), 'wb')
Expand Down