Skip to content

Commit

Permalink
patched genbank parser and builder to handle BASE COUNT. (#386)
Browse files Browse the repository at this point in the history
  • Loading branch information
TimothyStiles authored Oct 12, 2023
1 parent f2310db commit 2d5d8c4
Show file tree
Hide file tree
Showing 2 changed files with 33 additions and 1 deletion.
1 change: 1 addition & 0 deletions data/sample.gbk
Original file line number Diff line number Diff line change
Expand Up @@ -77,6 +77,7 @@ FEATURES Location/Qualifiers
KDDQIITETEVFDEFRSSLNSLIMHLEKLPKVNDDTITFEAVINAIELELGHKLDRNR
RVDSLEEKAEIERDSNWVKCQEDENLPDNNGFQPPKIKLTSLVGSDVGPLIIHQFSEK
LISGDDKILNGVYSQYEEGESIFGSLF"
BASE COUNT 67070277 a 48055043 c 48111528 g 67244164 t 18475410 n
ORIGIN
1 gatcctccat atacaacggt atctccacct caggtttaga tctcaacaac ggaaccattg
61 ccgacatgag acagttaggt atcgtcgaga gttacaagct aaaacgagca gtagtcagct
Expand Down
33 changes: 32 additions & 1 deletion io/genbank/genbank.go
Original file line number Diff line number Diff line change
Expand Up @@ -57,6 +57,7 @@ type Meta struct {
Origin string `json:"origin"`
Locus Locus `json:"locus"`
References []Reference `json:"references"`
BaseCount []BaseCount `json:"base_count"`
Other map[string]string `json:"other"`
Name string `json:"name"`
SequenceHash string `json:"sequence_hash"`
Expand Down Expand Up @@ -109,6 +110,12 @@ type Location struct {
SubLocations []Location `json:"sub_locations"`
}

// BaseCount is a single entry from a GenBank "BASE COUNT" line: one base
// symbol paired with the number reported for it. For example, the line
// "BASE COUNT 67070277 a 48055043 c" yields {Base: "a", Count: 67070277}
// and {Base: "c", Count: 48055043}.
//
// NOTE(review): unlike the neighboring Meta and Location fields, these fields
// carry no json tags, so encoding/json will use the Go field names "Base" and
// "Count" as keys — confirm that is intended.
type BaseCount struct {
// Base is the symbol that follows the number on the BASE COUNT line, e.g. "a" or "n".
Base string
// Count is the integer that precedes the symbol on the BASE COUNT line.
Count int
}

// Precompiled regular expressions:
var (
basePairRegex = regexp.MustCompile(` \d* \w{2} `)
Expand Down Expand Up @@ -315,6 +322,13 @@ func BuildMulti(sequences []Genbank) ([]byte, error) {
gbkString.WriteString(BuildFeatureString(feature))
}

if len(sequence.Meta.BaseCount) > 0 {
gbkString.WriteString("BASE COUNT ")
for _, baseCount := range sequence.Meta.BaseCount {
gbkString.WriteString(strconv.Itoa(baseCount.Count) + " " + baseCount.Base + " ")
}
gbkString.WriteString("\n")
}
// start writing sequence section.
gbkString.WriteString("ORIGIN\n")

Expand Down Expand Up @@ -378,7 +392,7 @@ type parseLoopParameters struct {
emptyAttribute bool
sequenceBuilder strings.Builder
parseStep string
genbank Genbank // since we are scanning lines we need a Genbank struct to store the data outside the loop.// since we are scanning lines we need a Genbank struct to store the data outside the loop.
genbank Genbank // since we are scanning lines we need a Genbank struct to store the data outside the loop.
feature Feature
features []Feature
metadataTag string
Expand Down Expand Up @@ -484,6 +498,23 @@ func ParseMultiNth(r io.Reader, count int) ([]Genbank, error) {
}
case "features":

baseCountFlag := strings.Contains(line, "BASE COUNT") // example string for BASE COUNT: "BASE COUNT 67070277 a 48055043 c 48111528 g 67244164 t 18475410 n"
if baseCountFlag {
fields := strings.Fields(line)
for countIndex := 2; countIndex < len(fields)-1; countIndex += 2 { // starts at two because we don't want to include "BASE COUNT" in our fields
count, err := strconv.Atoi(fields[countIndex])
if err != nil {
return []Genbank{}, err
}

baseCount := BaseCount{
Base: fields[countIndex+1],
Count: count,
}
parameters.genbank.Meta.BaseCount = append(parameters.genbank.Meta.BaseCount, baseCount)
}
break
}
// Switch to sequence parsing
originFlag := strings.Contains(line, "ORIGIN") // we detect the beginning of the sequence with "ORIGIN"
if originFlag {
Expand Down

0 comments on commit 2d5d8c4

Please sign in to comment.