diff --git a/io/genbank/error.go b/io/genbank/error.go new file mode 100644 index 00000000..7795815b --- /dev/null +++ b/io/genbank/error.go @@ -0,0 +1,27 @@ +package genbank + +import "fmt" + +// A GenbankSyntaxError denotes a sytntax error in +// a Genbank flatfile. +type GenbankSyntaxError struct { + Line uint + Context string + Msg string + InnerErr error +} + +// Error returns a human-readable error message. +func (gse GenbankSyntaxError) Error() string { + msg := gse.Msg + if gse.InnerErr != nil { + msg = fmt.Errorf("%v: %w", msg, gse.InnerErr).Error() + } + + return fmt.Sprintf("syntax error at line %v: %v\n%v\t%v", gse.Line, msg, gse.Line, gse.Context) +} + +// Unwrap returns any errors underlying the syntax error, if applicable. +func (gse GenbankSyntaxError) Unwrap() error { + return gse.InnerErr +} diff --git a/io/genbank/example_test.go b/io/genbank/example_test.go deleted file mode 100644 index 89b128b8..00000000 --- a/io/genbank/example_test.go +++ /dev/null @@ -1,168 +0,0 @@ -package genbank_test - -import ( - "bytes" - "fmt" - "os" - "path/filepath" - - "github.com/bebop/poly/io/genbank" -) - -// This example shows how to open a genbank file and search for a gene given -// its name. After finding it, notes about the particular gene are read. -func Example_basic() { - sequences, _ := genbank.Read("../../data/puc19.gbk") - for _, feature := range sequences.Features { - if feature.Attributes["gene"] == "bla" { - fmt.Println(feature.Attributes["note"]) - } - } - // Output: confers resistance to ampicillin, carbenicillin, andrelated antibiotics -} - -func ExampleRead() { - sequence, _ := genbank.Read("../../data/puc19.gbk") - fmt.Println(sequence.Meta.Locus.ModificationDate) - // Output: 22-OCT-2019 -} - -func ExampleWrite() { - tmpDataDir, err := os.MkdirTemp("", "data-*") - if err != nil { - fmt.Println(err.Error()) - } - defer os.RemoveAll(tmpDataDir) - - sequences, _ := genbank.Read("../../data/puc19.gbk") - - tmpGbkFilePath := filepath.Join(tmpDataDir, "puc19.gbk") - _ = genbank.Write(sequences, tmpGbkFilePath) - - testSequence, _ := genbank.Read(tmpGbkFilePath) - - fmt.Println(testSequence.Meta.Locus.ModificationDate) - // Output: 22-OCT-2019 -} - -func ExampleBuild() { - sequences, _ := genbank.Read("../../data/puc19.gbk") - gbkBytes, _ := genbank.Build(sequences) - testSequence, _ := genbank.Parse(bytes.NewReader(gbkBytes)) - - fmt.Println(testSequence.Meta.Locus.ModificationDate) - // Output: 22-OCT-2019 -} - -func ExampleParse() { - file, _ := os.Open("../../data/puc19.gbk") - sequence, _ := genbank.Parse(file) - - fmt.Println(sequence.Meta.Locus.ModificationDate) - // Output: 22-OCT-2019 -} - -func ExampleReadMulti() { - sequences, err := genbank.ReadMulti("../../data/multiGbk_test.seq") - if err != nil { - fmt.Println(err.Error()) - } - - fmt.Println(sequences[1].Meta.Locus.ModificationDate) - // Output: 05-FEB-1999 -} - -func ExampleWriteMulti() { - tmpDataDir, err := os.MkdirTemp("", "data-*") - if err != nil { - fmt.Println(err.Error()) - } - - sequences, _ := genbank.ReadMulti("../../data/multiGbk_test.seq") - tmpGbkFilePath := filepath.Join(tmpDataDir, "multiGbk_test.seq") - - err = genbank.WriteMulti(sequences, tmpGbkFilePath) - - if err != nil { - fmt.Println(err.Error()) - } - - testSequences, _ := genbank.ReadMulti(tmpGbkFilePath) - isEqual := sequences[1].Meta.Locus.ModificationDate == testSequences[1].Meta.Locus.ModificationDate - fmt.Println(isEqual) - // Output: true -} - -func ExampleBuildMulti() { - sequences, _ := genbank.ReadMulti("../../data/multiGbk_test.seq") - gbkBytes, _ := genbank.BuildMulti(sequences) - testSequences, _ := genbank.ParseMulti(bytes.NewReader(gbkBytes)) - - isEqual := sequences[1].Meta.Locus.ModificationDate == testSequences[1].Meta.Locus.ModificationDate - fmt.Println(isEqual) - // Output: true -} - -func ExampleParseMulti() { - file, _ := os.Open("../../data/multiGbk_test.seq") - sequences, _ := genbank.ParseMulti(file) - fmt.Println(sequences[1].Meta.Locus.ModificationDate) - // Output: 05-FEB-1999 -} - -func ExampleGenbank_AddFeature() { - // Sequence for greenflourescent protein (GFP) that we're using as test data for this example. - gfpSequence := "ATGGCTAGCAAAGGAGAAGAACTTTTCACTGGAGTTGTCCCAATTCTTGTTGAATTAGATGGTGATGTTAATGGGCACAAATTTTCTGTCAGTGGAGAGGGTGAAGGTGATGCTACATACGGAAAGCTTACCCTTAAATTTATTTGCACTACTGGAAAACTACCTGTTCCATGGCCAACACTTGTCACTACTTTCTCTTATGGTGTTCAATGCTTTTCCCGTTATCCGGATCATATGAAACGGCATGACTTTTTCAAGAGTGCCATGCCCGAAGGTTATGTACAGGAACGCACTATATCTTTCAAAGATGACGGGAACTACAAGACGCGTGCTGAAGTCAAGTTTGAAGGTGATACCCTTGTTAATCGTATCGAGTTAAAAGGTATTGATTTTAAAGAAGATGGAAACATTCTCGGACACAAACTCGAGTACAACTATAACTCACACAATGTATACATCACGGCAGACAAACAAAAGAATGGAATCAAAGCTAACTTCAAAATTCGCCACAACATTGAAGATGGATCCGTTCAACTAGCAGACCATTATCAACAAAATACTCCAATTGGCGATGGCCCTGTCCTTTTACCAGACAACCATTACCTGTCGACACAATCTGCCCTTTCGAAAGATCCCAACGAAAAGCGTGACCACATGGTCCTTCTTGAGTTTGTAACTGCTGCTGGGATTACACATGGCATGGATGAGCTCTACAAATAA" - - // initialize sequence and feature structs. - var sequence genbank.Genbank - var feature genbank.Feature - - // set the initialized sequence struct's sequence. - sequence.Sequence = gfpSequence - - // Set the initialized feature name and sequence location. - feature.Description = "Green Fluorescent Protein" - feature.Location = genbank.Location{} - feature.Location.Start = 0 - feature.Location.End = len(sequence.Sequence) - - // Add the GFP feature to the sequence struct. - _ = sequence.AddFeature(&feature) - - // get the GFP feature sequence string from the sequence struct. - featureSequence, _ := feature.GetSequence() - - // check to see if the feature was inserted properly into the sequence. - fmt.Println(gfpSequence == featureSequence) - - // Output: true -} - -func ExampleFeature_GetSequence() { - // Sequence for greenflourescent protein (GFP) that we're using as test data for this example. - gfpSequence := "ATGGCTAGCAAAGGAGAAGAACTTTTCACTGGAGTTGTCCCAATTCTTGTTGAATTAGATGGTGATGTTAATGGGCACAAATTTTCTGTCAGTGGAGAGGGTGAAGGTGATGCTACATACGGAAAGCTTACCCTTAAATTTATTTGCACTACTGGAAAACTACCTGTTCCATGGCCAACACTTGTCACTACTTTCTCTTATGGTGTTCAATGCTTTTCCCGTTATCCGGATCATATGAAACGGCATGACTTTTTCAAGAGTGCCATGCCCGAAGGTTATGTACAGGAACGCACTATATCTTTCAAAGATGACGGGAACTACAAGACGCGTGCTGAAGTCAAGTTTGAAGGTGATACCCTTGTTAATCGTATCGAGTTAAAAGGTATTGATTTTAAAGAAGATGGAAACATTCTCGGACACAAACTCGAGTACAACTATAACTCACACAATGTATACATCACGGCAGACAAACAAAAGAATGGAATCAAAGCTAACTTCAAAATTCGCCACAACATTGAAGATGGATCCGTTCAACTAGCAGACCATTATCAACAAAATACTCCAATTGGCGATGGCCCTGTCCTTTTACCAGACAACCATTACCTGTCGACACAATCTGCCCTTTCGAAAGATCCCAACGAAAAGCGTGACCACATGGTCCTTCTTGAGTTTGTAACTGCTGCTGGGATTACACATGGCATGGATGAGCTCTACAAATAA" - - // initialize sequence and feature structs. - var sequence genbank.Genbank - var feature genbank.Feature - - // set the initialized sequence struct's sequence. - sequence.Sequence = gfpSequence - - // Set the initialized feature name and sequence location. - feature.Description = "Green Fluorescent Protein" - feature.Location.Start = 0 - feature.Location.End = len(sequence.Sequence) - - // Add the GFP feature to the sequence struct. - _ = sequence.AddFeature(&feature) - - // get the GFP feature sequence string from the sequence struct. - featureSequence, _ := feature.GetSequence() - - // check to see if the feature was inserted properly into the sequence. - fmt.Println(gfpSequence == featureSequence) - - // Output: true -} diff --git a/io/genbank/genbank.go b/io/genbank/genbank.go deleted file mode 100644 index 6ef8cb43..00000000 --- a/io/genbank/genbank.go +++ /dev/null @@ -1,1028 +0,0 @@ -/* -Package genbank provides genbank parsers and writers. - -GenBank is a flat text file format developed in the 1980s to annotate genetic -sequences, and has since become the standard for sharing annotated genetic -sequences. - -This package provides a parser and writer to convert between the GenBank file -format and the more general Genbank struct. -*/ -package genbank - -import ( - "bufio" - "bytes" - "fmt" - "io" - "os" - "regexp" - "strconv" - "strings" - - "github.com/bebop/poly/transform" - "github.com/lunny/log" - "github.com/mitchellh/go-wordwrap" -) - -/****************************************************************************** - -GBK specific IO related things begin here. - -******************************************************************************/ - -var ( - readFileFn = os.ReadFile - parseMultiNthFn = ParseMultiNth - parseReferencesFn = parseReferences -) - -// Genbank is the main struct for the Genbank file format. -type Genbank struct { - Meta Meta - Features []Feature - Sequence string // will be changed and include reader, writer, and byte slice. -} - -// Meta holds the meta data for Genbank and other annotated sequence files. -type Meta struct { - Date string `json:"date"` - Definition string `json:"definition"` - Accession string `json:"accession"` - Version string `json:"version"` - Keywords string `json:"keywords"` - Organism string `json:"organism"` - Source string `json:"source"` - Taxonomy []string `json:"taxonomy"` - Origin string `json:"origin"` - Locus Locus `json:"locus"` - References []Reference `json:"references"` - BaseCount []BaseCount `json:"base_count"` - Other map[string]string `json:"other"` - Name string `json:"name"` - SequenceHash string `json:"sequence_hash"` - SequenceHashFunction string `json:"hash_function"` -} - -// Feature holds the information for a feature in a Genbank file and other annotated sequence files. -type Feature struct { - Type string `json:"type"` - Description string `json:"description"` - Attributes map[string]string `json:"attributes"` - SequenceHash string `json:"sequence_hash"` - SequenceHashFunction string `json:"hash_function"` - Sequence string `json:"sequence"` - Location Location `json:"location"` - ParentSequence *Genbank `json:"-"` -} - -// Reference holds information for one reference in a Meta struct. -type Reference struct { - Authors string `json:"authors"` - Title string `json:"title"` - Journal string `json:"journal"` - PubMed string `json:"pub_med"` - Remark string `json:"remark"` - Range string `json:"range"` - Consortium string `json:"consortium"` -} - -// Locus holds Locus information in a Meta struct. -type Locus struct { - Name string `json:"name"` - SequenceLength string `json:"sequence_length"` - MoleculeType string `json:"molecule_type"` - GenbankDivision string `json:"genbank_division"` - ModificationDate string `json:"modification_date"` - SequenceCoding string `json:"sequence_coding"` - Circular bool `json:"circular"` -} - -// Location is a struct that holds the location of a feature. -type Location struct { - Start int `json:"start"` - End int `json:"end"` - Complement bool `json:"complement"` - Join bool `json:"join"` - FivePrimePartial bool `json:"five_prime_partial"` - ThreePrimePartial bool `json:"three_prime_partial"` - GbkLocationString string `json:"gbk_location_string"` - SubLocations []Location `json:"sub_locations"` -} - -// BaseCount is a struct that holds the base counts for a sequence. -type BaseCount struct { - Base string - Count int -} - -// Precompiled regular expressions: -var ( - basePairRegex = regexp.MustCompile(` \d* \w{2} `) - circularRegex = regexp.MustCompile(` circular `) - modificationDateRegex = regexp.MustCompile(`\d{2}-[A-Z]{3}-\d{4}`) - partialRegex = regexp.MustCompile("<|>") - sequenceRegex = regexp.MustCompile("[^a-zA-Z]+") -) - -// AddFeature adds a feature to a Genbank struct. -func (sequence *Genbank) AddFeature(feature *Feature) error { - feature.ParentSequence = sequence - sequence.Features = append(sequence.Features, *feature) - return nil -} - -// GetSequence returns the sequence of a feature. -func (feature Feature) GetSequence() (string, error) { - return getFeatureSequence(feature, feature.Location) -} - -// getFeatureSequence takes a feature and location object and returns a sequence string. -func getFeatureSequence(feature Feature, location Location) (string, error) { - var sequenceBuffer bytes.Buffer - var sequenceString string - parentSequence := feature.ParentSequence.Sequence - - if len(location.SubLocations) == 0 { - sequenceBuffer.WriteString(parentSequence[location.Start:location.End]) - } else { - for _, subLocation := range location.SubLocations { - sequence, _ := getFeatureSequence(feature, subLocation) - - sequenceBuffer.WriteString(sequence) - } - } - - // reverse complements resulting string if needed. - if location.Complement { - sequenceString = transform.ReverseComplement(sequenceBuffer.String()) - } else { - sequenceString = sequenceBuffer.String() - } - - return sequenceString, nil -} - -// Read reads a GBK file from path and returns a Genbank struct. -func Read(path string) (Genbank, error) { - genbankSlice, err := ReadMultiNth(path, 1) - if err != nil { - return Genbank{}, err - } - genbank := genbankSlice[0] - return genbank, err -} - -// ReadMulti reads a multi Gbk from path and parses it into a slice of Genbank structs. -func ReadMulti(path string) ([]Genbank, error) { - return ReadMultiNth(path, -1) -} - -// ReadMultiNth reads a multi Gbk from path and parses N entries into a slice of Genbank structs. -func ReadMultiNth(path string, count int) ([]Genbank, error) { - file, err := os.Open(path) - if err != nil { - return []Genbank{}, err - } - - sequence, err := parseMultiNthFn(file, count) - if err != nil { - return []Genbank{}, err - } - - return sequence, nil -} - -// Write takes an Genbank list and a path string and writes out a genbank record to that path. -func Write(sequences Genbank, path string) error { - // build function always returns nil error. - // This is for API consistency in case we need to - // add error handling in the future. - gbk, _ := Build(sequences) - - err := os.WriteFile(path, gbk, 0644) - return err -} - -// WriteMulti takes a slice of Genbank structs and a path string and writes out a multi genbank record to that path. -func WriteMulti(sequences []Genbank, path string) error { - // buildmulti function always returns nil error. - // This is for API consistency in case we need to - // add error handling in the future. - gbk, _ := BuildMulti(sequences) - - err := os.WriteFile(path, gbk, 0644) - return err -} - -// Build builds a GBK byte slice to be written out to db or file. -func Build(gbk Genbank) ([]byte, error) { - gbkSlice := []Genbank{gbk} - multiGBK, err := BuildMulti(gbkSlice) - return multiGBK, err -} - -// BuildMulti builds a MultiGBK byte slice to be written out to db or file. -func BuildMulti(sequences []Genbank) ([]byte, error) { - var gbkString bytes.Buffer - for _, sequence := range sequences { - locus := sequence.Meta.Locus - var shape string - - if locus.Circular { - shape = "circular" - } else { - shape = "linear" - } - - fivespace := generateWhiteSpace(subMetaIndex) - - // building locus - locusData := locus.Name + fivespace + locus.SequenceLength + " bp" + fivespace + locus.MoleculeType + fivespace + shape + fivespace + locus.GenbankDivision + fivespace + locus.ModificationDate - locusString := "LOCUS " + locusData + "\n" - gbkString.WriteString(locusString) - - // building other standard meta features - definitionString := buildMetaString("DEFINITION", sequence.Meta.Definition) - gbkString.WriteString(definitionString) - - accessionString := buildMetaString("ACCESSION", sequence.Meta.Accession) - gbkString.WriteString(accessionString) - - versionString := buildMetaString("VERSION", sequence.Meta.Version) - gbkString.WriteString(versionString) - - keywordsString := buildMetaString("KEYWORDS", sequence.Meta.Keywords) - gbkString.WriteString(keywordsString) - - sourceString := buildMetaString("SOURCE", sequence.Meta.Source) - gbkString.WriteString(sourceString) - - organismString := buildMetaString(" ORGANISM", sequence.Meta.Organism) - gbkString.WriteString(organismString) - - if len(sequence.Meta.Taxonomy) > 0 { - var taxonomyString strings.Builder - for i, taxonomyData := range sequence.Meta.Taxonomy { - taxonomyString.WriteString(taxonomyData) - if len(sequence.Meta.Taxonomy) == i+1 { - taxonomyString.WriteString(".") - } else { - taxonomyString.WriteString("; ") - } - } - gbkString.WriteString(buildMetaString("", taxonomyString.String())) - } - - // building references - // TODO: could use reflection to get keys and make more general. - for referenceIndex, reference := range sequence.Meta.References { - referenceString := buildMetaString("REFERENCE", fmt.Sprintf("%d %s", referenceIndex+1, reference.Range)) - gbkString.WriteString(referenceString) - - if reference.Authors != "" { - authorsString := buildMetaString(" AUTHORS", reference.Authors) - gbkString.WriteString(authorsString) - } - - if reference.Title != "" { - titleString := buildMetaString(" TITLE", reference.Title) - gbkString.WriteString(titleString) - } - - if reference.Journal != "" { - journalString := buildMetaString(" JOURNAL", reference.Journal) - gbkString.WriteString(journalString) - } - - if reference.PubMed != "" { - pubMedString := buildMetaString(" PUBMED", reference.PubMed) - gbkString.WriteString(pubMedString) - } - if reference.Consortium != "" { - consrtmString := buildMetaString(" CONSRTM", reference.Consortium) - gbkString.WriteString(consrtmString) - } - } - - // building other meta fields that are catch all - otherKeys := make([]string, 0, len(sequence.Meta.Other)) - for key := range sequence.Meta.Other { - otherKeys = append(otherKeys, key) - } - - for _, otherKey := range otherKeys { - otherString := buildMetaString(otherKey, sequence.Meta.Other[otherKey]) - gbkString.WriteString(otherString) - } - - // start writing features section. - gbkString.WriteString("FEATURES Location/Qualifiers\n") - for _, feature := range sequence.Features { - gbkString.WriteString(BuildFeatureString(feature)) - } - - if len(sequence.Meta.BaseCount) > 0 { - gbkString.WriteString("BASE COUNT ") - for _, baseCount := range sequence.Meta.BaseCount { - gbkString.WriteString(strconv.Itoa(baseCount.Count) + " " + baseCount.Base + " ") - } - gbkString.WriteString("\n") - } - // start writing sequence section. - gbkString.WriteString("ORIGIN\n") - - // iterate over every character in sequence range. - for index, base := range sequence.Sequence { - // if 60th character add newline then whitespace and index number and space before adding next base. - if index%60 == 0 { - if index != 0 { - gbkString.WriteString("\n") - } - lineNumberString := strconv.Itoa(index + 1) // genbank indexes at 1 for some reason - leadingWhiteSpaceLength := 9 - len(lineNumberString) // <- I wish I was kidding - for i := 0; i < leadingWhiteSpaceLength; i++ { - gbkString.WriteString(" ") - } - gbkString.WriteString(lineNumberString + " ") - gbkString.WriteRune(base) - // if base index is divisible by ten add a space (genbank convention) - } else if index%10 == 0 { - gbkString.WriteString(" ") - gbkString.WriteRune(base) - // else just add the base. - } else { - gbkString.WriteRune(base) - } - } - // finish genbank file with "//" on newline (again a genbank convention) - gbkString.WriteString("\n//\n") - } - - return gbkString.Bytes(), nil -} - -// Parse takes in a reader representing a single gbk/gb/genbank file and parses it into a Genbank struct. -func Parse(r io.Reader) (Genbank, error) { - genbankSlice, err := parseMultiNthFn(r, 1) - - if err != nil { - return Genbank{}, err - } - - return genbankSlice[0], err -} - -// ParseMulti takes in a reader representing a multi gbk/gb/genbank file and parses it into a slice of Genbank structs. -func ParseMulti(r io.Reader) ([]Genbank, error) { - genbankSlice, err := parseMultiNthFn(r, -1) - - if err != nil { - return []Genbank{}, err - } - - return genbankSlice, err -} - -type parseLoopParameters struct { - newLocation bool - quoteActive bool - attribute string - attributeValue string - emptyAttribute bool - sequenceBuilder strings.Builder - parseStep string - genbank Genbank // since we are scanning lines we need a Genbank struct to store the data outside the loop. - feature Feature - features []Feature - metadataTag string - metadataData []string //this stutters but will remain to make it easier to batch rename variables when compared to parameters.metadataTag. - genbankStarted bool - currentLine string - prevline string - multiLineFeature bool -} - -// method to init loop parameters -func (params *parseLoopParameters) init() { - params.newLocation = true - params.feature.Attributes = make(map[string]string) - params.parseStep = "metadata" - params.genbankStarted = false - params.genbank.Meta.Other = make(map[string]string) -} - -// ParseMultiNth takes in a reader representing a multi gbk/gb/genbank file and parses the first n records into a slice of Genbank structs. -func ParseMultiNth(r io.Reader, count int) ([]Genbank, error) { - scanner := bufio.NewScanner(r) - var genbanks []Genbank - - // Sequence setup - - var parameters parseLoopParameters - parameters.init() - - // Loop through each line of the file - for lineNum := 0; scanner.Scan(); lineNum++ { - // get line from scanner and split it - line := scanner.Text() - splitLine := strings.Split(strings.TrimSpace(line), " ") - - prevline := parameters.currentLine - parameters.currentLine = line - parameters.prevline = prevline - - // keep scanning until we find the start of the first record - if !parameters.genbankStarted { - // We detect the beginning of a new genbank file with "LOCUS" - locusFlag := strings.Contains(line, "LOCUS") - - if locusFlag { - parameters = parseLoopParameters{} - parameters.init() - parameters.genbank.Meta.Locus = parseLocus(line) - parameters.genbankStarted = true - } - continue - } - - switch parameters.parseStep { - case "metadata": - // Handle empty lines - if len(line) == 0 { - return genbanks, fmt.Errorf("Empty metadata line on line %d", lineNum) - } - - // If we are currently reading a line, we need to figure out if it is a new meta line. - if string(line[0]) != " " || parameters.metadataTag == "FEATURES" { - // If this is true, it means we are beginning a new meta tag. In that case, let's save - // the older data, and then continue along. - switch parameters.metadataTag { - case "DEFINITION": - parameters.genbank.Meta.Definition = parseMetadata(parameters.metadataData) - case "ACCESSION": - parameters.genbank.Meta.Accession = parseMetadata(parameters.metadataData) - case "VERSION": - parameters.genbank.Meta.Version = parseMetadata(parameters.metadataData) - case "KEYWORDS": - parameters.genbank.Meta.Keywords = parseMetadata(parameters.metadataData) - case "SOURCE": - parameters.genbank.Meta.Source, parameters.genbank.Meta.Organism, parameters.genbank.Meta.Taxonomy = getSourceOrganism(parameters.metadataData) - case "REFERENCE": - reference, err := parseReferencesFn(parameters.metadataData) - if err != nil { - return []Genbank{}, fmt.Errorf("Failed in parsing reference above line %d. Got error: %s", lineNum, err) - } - parameters.genbank.Meta.References = append(parameters.genbank.Meta.References, reference) - - case "FEATURES": - parameters.parseStep = "features" - - // We know that we are now parsing features, so lets initialize our first feature - parameters.feature.Type = strings.TrimSpace(splitLine[0]) - parameters.feature.Location.GbkLocationString = strings.TrimSpace(splitLine[len(splitLine)-1]) - parameters.newLocation = true - - continue - - default: - if parameters.metadataTag != "" { - parameters.genbank.Meta.Other[parameters.metadataTag] = parseMetadata(parameters.metadataData) - } - } - - parameters.metadataTag = strings.TrimSpace(splitLine[0]) - parameters.metadataData = []string{strings.TrimSpace(line[len(parameters.metadataTag):])} - } else { - parameters.metadataData = append(parameters.metadataData, line) - } - case "features": - - baseCountFlag := strings.Contains(line, "BASE COUNT") // example string for BASE COUNT: "BASE COUNT 67070277 a 48055043 c 48111528 g 67244164 t 18475410 n" - if baseCountFlag { - fields := strings.Fields(line) - for countIndex := 2; countIndex < len(fields)-1; countIndex += 2 { // starts at two because we don't want to include "BASE COUNT" in our fields - count, err := strconv.Atoi(fields[countIndex]) - if err != nil { - return []Genbank{}, err - } - - baseCount := BaseCount{ - Base: fields[countIndex+1], - Count: count, - } - parameters.genbank.Meta.BaseCount = append(parameters.genbank.Meta.BaseCount, baseCount) - } - break - } - // Switch to sequence parsing - originFlag := strings.Contains(line, "ORIGIN") // we detect the beginning of the sequence with "ORIGIN" - if originFlag { - parameters.parseStep = "sequence" - - // save our completed attribute / qualifier string to the current feature - if parameters.attributeValue != "" { - parameters.feature.Attributes[parameters.attribute] = parameters.attributeValue - parameters.features = append(parameters.features, parameters.feature) - parameters.attributeValue = "" - parameters.attribute = "" - parameters.feature = Feature{} - parameters.feature.Attributes = make(map[string]string) - } else { - parameters.features = append(parameters.features, parameters.feature) - } - - // add our features to the genbank - for _, feature := range parameters.features { - location, err := parseLocation(feature.Location.GbkLocationString) - if err != nil { - return []Genbank{}, err - } - feature.Location = location - err = parameters.genbank.AddFeature(&feature) - if err != nil { - return []Genbank{}, err - } - } - continue - } // end sequence parsing flag logic - - // check if current line contains anything but whitespace - trimmedLine := strings.TrimSpace(line) - if len(trimmedLine) < 1 { - continue - } - - // determine if current line is a new top level feature - if countLeadingSpaces(parameters.currentLine) < countLeadingSpaces(parameters.prevline) || parameters.prevline == "FEATURES" { - // save our completed attribute / qualifier string to the current feature - if parameters.attributeValue != "" { - parameters.feature.Attributes[parameters.attribute] = parameters.attributeValue - parameters.features = append(parameters.features, parameters.feature) - parameters.attributeValue = "" - parameters.attribute = "" - parameters.feature = Feature{} - parameters.feature.Attributes = make(map[string]string) - } - - // } - // checks for empty types - if parameters.feature.Type != "" { - parameters.features = append(parameters.features, parameters.feature) - } - - parameters.feature = Feature{} - parameters.feature.Attributes = make(map[string]string) - - // An initial feature line looks like this: `source 1..2686` with a type separated by its location - if len(splitLine) < 2 { - return genbanks, fmt.Errorf("Feature line malformed on line %d. Got line: %s", lineNum, line) - } - parameters.feature.Type = strings.TrimSpace(splitLine[0]) - parameters.feature.Location.GbkLocationString = strings.TrimSpace(splitLine[len(splitLine)-1]) - parameters.multiLineFeature = false // without this we can't tell if something is a multiline feature or multiline qualifier - } else if !strings.Contains(parameters.currentLine, "/") { // current line is continuation of a feature or qualifier (sub-constituent of a feature) - // if it's a continuation of the current feature, add it to the location - if !strings.Contains(parameters.currentLine, "\"") && (countLeadingSpaces(parameters.currentLine) > countLeadingSpaces(parameters.prevline) || parameters.multiLineFeature) { - parameters.feature.Location.GbkLocationString += strings.TrimSpace(line) - parameters.multiLineFeature = true // without this we can't tell if something is a multiline feature or multiline qualifier - } else { // it's a continued line of a qualifier - removeAttributeValueQuotes := strings.Replace(trimmedLine, "\"", "", -1) - - parameters.attributeValue = parameters.attributeValue + removeAttributeValueQuotes - } - } else if strings.Contains(parameters.currentLine, "/") { // current line is a new qualifier - trimmedCurrentLine := strings.TrimSpace(parameters.currentLine) - if trimmedCurrentLine[0] != '/' { // if we have an exception case, like (adenine(1518)-N(6)/adenine(1519)-N(6))- - parameters.attributeValue = parameters.attributeValue + trimmedCurrentLine - continue - } - // save our completed attribute / qualifier string to the current feature - if parameters.attributeValue != "" || parameters.emptyAttribute { - parameters.feature.Attributes[parameters.attribute] = parameters.attributeValue - parameters.emptyAttribute = false - } - parameters.attributeValue = "" - splitAttribute := strings.Split(line, "=") - trimmedSpaceAttribute := strings.TrimSpace(splitAttribute[0]) - removedForwardSlashAttribute := strings.Replace(trimmedSpaceAttribute, "/", "", 1) - - parameters.attribute = removedForwardSlashAttribute - - var removeAttributeValueQuotes string - if len(splitAttribute) == 1 { // handle case of ` /pseudo `, which has no text - removeAttributeValueQuotes = "" - parameters.emptyAttribute = true - } else { // this is normally triggered - removeAttributeValueQuotes = strings.Replace(splitAttribute[1], "\"", "", -1) - } - parameters.attributeValue = removeAttributeValueQuotes - parameters.multiLineFeature = false // without this we can't tell if something is a multiline feature or multiline qualifier - } - - case "sequence": - if len(line) < 2 { // throw error if line is malformed - return genbanks, fmt.Errorf("Too short line found while parsing genbank sequence on line %d. Got line: %s", lineNum, line) - } else if line[0:2] == "//" { // end of sequence - parameters.genbank.Sequence = parameters.sequenceBuilder.String() - - genbanks = append(genbanks, parameters.genbank) - parameters.genbankStarted = false - parameters.sequenceBuilder.Reset() - } else { // add line to total sequence - parameters.sequenceBuilder.WriteString(sequenceRegex.ReplaceAllString(line, "")) - } - default: - log.Warnf("Unknown parse step: %s", parameters.parseStep) - parameters.genbankStarted = false - } - } - return genbanks, nil -} - -func countLeadingSpaces(line string) int { - return len(line) - len(strings.TrimLeft(line, " ")) -} - -func parseMetadata(metadataData []string) string { - var outputMetadata string - if len(metadataData) == 0 { - return "." - } - for _, data := range metadataData { - outputMetadata = outputMetadata + strings.TrimSpace(data) + " " - } - outputMetadata = outputMetadata[:len(outputMetadata)-1] // Remove trailing whitespace - return outputMetadata -} - -func parseReferences(metadataData []string) (Reference, error) { - var reference Reference - var err error - rangeIndex := strings.Index(metadataData[0], "(") - if rangeIndex != -1 { - reference.Range = metadataData[0][rangeIndex:] - } - var referenceKey string - var referenceValue string - - if len(metadataData) == 1 { - return Reference{}, fmt.Errorf("Got reference with no additional information") - } - - referenceKey = strings.Split(strings.TrimSpace(metadataData[1]), " ")[0] - referenceValue = strings.TrimSpace(metadataData[1][len(referenceKey)+2:]) - for index := 2; index < len(metadataData); index++ { - if len(metadataData[index]) > 3 { - if metadataData[index][3] != ' ' { - err = reference.addKey(referenceKey, referenceValue) - if err != nil { - return reference, err - } - referenceKey = strings.Split(strings.TrimSpace(metadataData[index]), " ")[0] - referenceValue = strings.TrimSpace(metadataData[index][len(referenceKey)+2:]) - } else { - // Otherwise, simply append the next metadata. - referenceValue = referenceValue + " " + strings.TrimSpace(metadataData[index]) - } - } - } - err = reference.addKey(referenceKey, referenceValue) - if err != nil { - return reference, err - } - - return reference, nil -} - -func (reference *Reference) addKey(referenceKey string, referenceValue string) error { - switch referenceKey { - case "AUTHORS": - reference.Authors = referenceValue - case "TITLE": - reference.Title = referenceValue - case "JOURNAL": - reference.Journal = referenceValue - case "PUBMED": - reference.PubMed = referenceValue - case "REMARK": - reference.Remark = referenceValue - case "CONSRTM": - reference.Consortium = referenceValue - default: - return fmt.Errorf("ReferenceKey not in [AUTHORS, TITLE, JOURNAL, PUBMED, REMARK, CONSRTM]. Got: %s", referenceKey) - } - return nil -} - -var genBankMoleculeTypes = []string{ - "DNA", - "genomic DNA", - "genomic RNA", - "mRNA", - "tRNA", - "rRNA", - "other RNA", - "other DNA", - "transcribed RNA", - "viral cRNA", - "unassigned DNA", - "unassigned RNA", -} - -// used in parseLocus function though it could be useful elsewhere. -var genbankDivisions = []string{ - "PRI", //primate sequences - "ROD", //rodent sequences - "MAM", //other mamallian sequences - "VRT", //other vertebrate sequences - "INV", //invertebrate sequences - "PLN", //plant, fungal, and algal sequences - "BCT", //bacterial sequences - "VRL", //viral sequences - "PHG", //bacteriophage sequences - "SYN", //synthetic sequences - "UNA", //unannotated sequences - "EST", //EST sequences (expressed sequence tags) - "PAT", //patent sequences - "STS", //STS sequences (sequence tagged sites) - "GSS", //GSS sequences (genome survey sequences) - "HTG", //HTG sequences (high-throughput genomic sequences) - "HTC", //unfinished high-throughput cDNA sequencing - "ENV", //environmental sampling sequences -} - -// TODO rewrite with proper error handling. -// parses locus from provided string. -func parseLocus(locusString string) Locus { - locus := Locus{} - - locusSplit := strings.Split(strings.TrimSpace(locusString), " ") - - var filteredLocusSplit []string - for i := range locusSplit { - if locusSplit[i] != "" { - filteredLocusSplit = append(filteredLocusSplit, locusSplit[i]) - } - } - - locus.Name = filteredLocusSplit[1] - - // sequence length and coding - baseSequenceLength := basePairRegex.FindString(locusString) - if baseSequenceLength != "" { - splitBaseSequenceLength := strings.Split(strings.TrimSpace(baseSequenceLength), " ") - if len(splitBaseSequenceLength) == 2 { - locus.SequenceLength = splitBaseSequenceLength[0] - locus.SequenceCoding = splitBaseSequenceLength[1] - } - } - - // molecule type - for _, moleculeType := range genBankMoleculeTypes { - moleculeRegex, _ := regexp.Compile(moleculeType) - match := string(moleculeRegex.Find([]byte(locusString))) - if match != "" { - locus.MoleculeType = match - break - } - } - - // circularity flag - if circularRegex.Match([]byte(locusString)) { - locus.Circular = true - } - - // genbank division - for _, genbankDivision := range genbankDivisions { - genbankDivisionRegex, _ := regexp.Compile(genbankDivision) - match := string(genbankDivisionRegex.Find([]byte(locusString))) - if match != "" { - locus.GenbankDivision = match - break - } - } - - // ModificationDate - locus.ModificationDate = modificationDateRegex.FindString(locusString) - - return locus -} - -// indices for random points of interests on a gbk line. -const subMetaIndex = 5 -const qualifierIndex = 21 - -func getSourceOrganism(metadataData []string) (string, string, []string) { - source := strings.TrimSpace(metadataData[0]) - var organism string - var taxonomy []string - for iterator := 1; iterator < len(metadataData); iterator++ { - dataLine := metadataData[iterator] - headString := strings.Split(strings.TrimSpace(dataLine), " ")[0] - if headString == "ORGANISM" { - index := strings.Index(dataLine, `ORGANISM`) - organism = strings.TrimSpace(dataLine[index+len("ORGANISM"):]) - continue - } - for _, taxonomyData := range strings.Split(strings.TrimSpace(dataLine), ";") { - taxonomyDataTrimmed := strings.TrimSpace(taxonomyData) - // Taxonomy ends with a ".", which we check for here - if len(taxonomyDataTrimmed) > 1 { - if taxonomyDataTrimmed[len(taxonomyDataTrimmed)-1] == '.' { - taxonomyDataTrimmed = taxonomyDataTrimmed[:len(taxonomyDataTrimmed)-1] - } - taxonomy = append(taxonomy, taxonomyDataTrimmed) - } - } - } - return source, organism, taxonomy -} - -func parseLocation(locationString string) (Location, error) { - var location Location - location.GbkLocationString = locationString - if !strings.ContainsAny(locationString, "(") { // Case checks for simple expression of x..x - if !strings.ContainsAny(locationString, ".") { //Case checks for simple expression x - position, err := strconv.Atoi(locationString) - if err != nil { - return Location{}, err - } - location = Location{Start: position, End: position} - } else { - // to remove FivePrimePartial and ThreePrimePartial indicators from start and end before converting to int. - startEndSplit := strings.Split(locationString, "..") - start, err := strconv.Atoi(partialRegex.ReplaceAllString(startEndSplit[0], "")) - if err != nil { - return Location{}, err - } - end, err := strconv.Atoi(partialRegex.ReplaceAllString(startEndSplit[1], "")) - if err != nil { - return Location{}, err - } - location = Location{Start: start - 1, End: end} - } - } else { - firstOuterParentheses := strings.Index(locationString, "(") - expression := locationString[firstOuterParentheses+1 : strings.LastIndex(locationString, ")")] - switch command := locationString[0:firstOuterParentheses]; command { - case "join": - location.Join = true - // This case checks for join(complement(x..x),complement(x..x)), or any more complicated derivatives - if strings.ContainsAny(expression, "(") { - firstInnerParentheses := strings.Index(expression, "(") - ParenthesesCount := 1 - prevSubLocationStart := 0 - for i := firstInnerParentheses + 1; i < len(expression); i++ { // "(" is at 0, so we start at 1 - switch expression[i] { - case '(': - ParenthesesCount++ - case ')': - ParenthesesCount-- - case ',': - if ParenthesesCount == 0 { - parsedSubLocation, err := parseLocation(expression[prevSubLocationStart:i]) - if err != nil { - return Location{}, err - } - parsedSubLocation.GbkLocationString = locationString - location.SubLocations = append(location.SubLocations, parsedSubLocation) - prevSubLocationStart = i + 1 - } - } - } - if ParenthesesCount != 0 { - return Location{}, fmt.Errorf("Unbalanced parentheses") - } - parsedSubLocation, err := parseLocation(expression[prevSubLocationStart:]) - if err != nil { - return Location{}, err - } - parsedSubLocation.GbkLocationString = locationString - location.SubLocations = append(location.SubLocations, parsedSubLocation) - } else { // This is the default join(x..x,x..x) - for _, numberRange := range strings.Split(expression, ",") { - joinLocation, err := parseLocation(numberRange) - if err != nil { - return Location{}, err - } - location.SubLocations = append(location.SubLocations, joinLocation) - } - } - - case "complement": - // location.Complement = true - subLocation, err := parseLocation(expression) - if err != nil { - return Location{}, err - } - subLocation.Complement = true - subLocation.GbkLocationString = locationString - location.SubLocations = append(location.SubLocations, subLocation) - } - } - - if strings.Contains(locationString, "<") { - location.FivePrimePartial = true - } - - if strings.Contains(locationString, ">") { - location.ThreePrimePartial = true - } - - // if excess root node then trim node. Maybe should just be handled with second arg? - if location.Start == 0 && location.End == 0 && !location.Join && !location.Complement { - location = location.SubLocations[0] - } - - return location, nil -} - -// buildMetaString is a helper function to build the meta section of genbank files. -func buildMetaString(name string, data string) string { - keyWhitespaceTrailLength := 12 - len(name) // I wish I was kidding. - var keyWhitespaceTrail string - for i := 0; i < keyWhitespaceTrailLength; i++ { - keyWhitespaceTrail += " " - } - name += keyWhitespaceTrail - wrappedData := wordwrap.WrapString(data, 68) - splitData := strings.Split(wrappedData, "\n") - var returnData string - for index, datum := range splitData { - if index == 0 { - returnData = name + datum + "\n" - } else { - returnData += generateWhiteSpace(12) + datum + "\n" - } - } - - return returnData -} - -// BuildLocationString is a recursive function that takes a location object and creates a gbk location string for Build() -func BuildLocationString(location Location) string { - var locationString string - - if location.Complement { - location.Complement = false - locationString = "complement(" + BuildLocationString(location) + ")" - } else if location.Join { - locationString = "join(" - for _, sublocation := range location.SubLocations { - locationString += BuildLocationString(sublocation) + "," - } - locationString = strings.TrimSuffix(locationString, ",") + ")" - } else { - locationString = strconv.Itoa(location.Start+1) + ".." + strconv.Itoa(location.End) - if location.FivePrimePartial { - locationString = "<" + locationString - } - - if location.ThreePrimePartial { - locationString += ">" - } - } - return locationString -} - -// BuildFeatureString is a helper function to build gbk feature strings for Build() -func BuildFeatureString(feature Feature) string { - whiteSpaceTrailLength := 16 - len(feature.Type) // I wish I was kidding. - whiteSpaceTrail := generateWhiteSpace(whiteSpaceTrailLength) - var location string - - if feature.Location.GbkLocationString != "" { - location = feature.Location.GbkLocationString - } else { - location = BuildLocationString(feature.Location) - } - featureHeader := generateWhiteSpace(subMetaIndex) + feature.Type + whiteSpaceTrail + location + "\n" - returnString := featureHeader - - qualifierKeys := make([]string, 0, len(feature.Attributes)) - for key := range feature.Attributes { - qualifierKeys = append(qualifierKeys, key) - } - - for _, qualifier := range qualifierKeys { - returnString += generateWhiteSpace(qualifierIndex) + "/" + qualifier + "=\"" + feature.Attributes[qualifier] + "\"\n" - } - return returnString -} - -func generateWhiteSpace(length int) string { - var spaceBuilder strings.Builder - - for i := 0; i < length; i++ { - spaceBuilder.WriteString(" ") - } - - return spaceBuilder.String() -} - -/****************************************************************************** - -GBK specific IO related things end here. - -******************************************************************************/ diff --git a/io/genbank/genbank_test.go b/io/genbank/genbank_test.go deleted file mode 100644 index a0ac784c..00000000 --- a/io/genbank/genbank_test.go +++ /dev/null @@ -1,804 +0,0 @@ -package genbank - -import ( - "encoding/json" - "errors" - "fmt" - "io" - "os" - "path/filepath" - "strings" - "testing" - - "reflect" - - "github.com/bebop/poly/transform" - "github.com/google/go-cmp/cmp" - "github.com/google/go-cmp/cmp/cmpopts" - "github.com/stretchr/testify/assert" -) - -/****************************************************************************** - -Gbk/gb/genbank related benchmarks begin here. - -******************************************************************************/ - -var singleGbkPaths = []string{ - "../../data/t4_intron.gb", - "../../data/puc19.gbk", - "../../data/puc19_snapgene.gb", - "../../data/benchling.gb", - "../../data/phix174.gb", - "../../data/sample.gbk", - // "../../data/pichia_chr1_head.gb", -} - -func TestGbkIO(t *testing.T) { - tmpDataDir, err := os.MkdirTemp("", "data-*") - if err != nil { - t.Error(err) - } - defer os.RemoveAll(tmpDataDir) - - // test single gbk read, write, build, parse - for _, gbkPath := range singleGbkPaths { - gbk, _ := Read(gbkPath) - - tmpGbkFilePath := filepath.Join(tmpDataDir, filepath.Base(gbkPath)) - _ = Write(gbk, tmpGbkFilePath) - - writeTestGbk, _ := Read(tmpGbkFilePath) - if diff := cmp.Diff(gbk, writeTestGbk, []cmp.Option{cmpopts.IgnoreFields(Feature{}, "ParentSequence")}...); diff != "" { - t.Errorf("Parsing the output of Build() does not produce the same output as parsing the original file, \"%s\", read with Read(). Got this diff:\n%s", filepath.Base(gbkPath), diff) - } - } // end test single gbk read, write, build, parse -} - -func TestMultiLineFeatureParse(t *testing.T) { - pichia, _ := Read("../../data/pichia_chr1_head.gb") - var multilineOutput string - for _, feature := range pichia.Features { - multilineOutput = feature.Location.GbkLocationString - } - - if multilineOutput != "join(<459260..459456,459556..459637,459685..459739,459810..>460126)" { - t.Errorf("Failed to parse multiline genbank feature string") - } -} - -func TestMultiGenbankIO(t *testing.T) { - tmpDataDir, err := os.MkdirTemp("", "data-*") - if err != nil { - t.Error(err) - } - defer os.RemoveAll(tmpDataDir) - - // Test multiline Genbank features - gbkPath := "../../data/multiGbk_test.seq" - multiGbk, _ := ReadMulti(gbkPath) - tmpGbkFilePath := filepath.Join(tmpDataDir, filepath.Base(gbkPath)) - _ = WriteMulti(multiGbk, tmpGbkFilePath) - - writeTestGbk, _ := ReadMulti(tmpGbkFilePath) - - if diff := cmp.Diff(multiGbk, writeTestGbk, []cmp.Option{cmpopts.IgnoreFields(Feature{}, "ParentSequence")}...); diff != "" { - t.Errorf("Parsing the output of Build() does not produce the same output as parsing the original file, \"%s\", read with Read(). Got this diff:\n%s", filepath.Base(gbkPath), diff) - } -} - -func TestGbkLocationStringBuilder(t *testing.T) { - tmpDataDir, err := os.MkdirTemp("", "data-*") - if err != nil { - t.Error(err) - } - defer os.RemoveAll(tmpDataDir) - - scrubbedGbk, err := Read("../../data/sample.gbk") - if err != nil { - t.Error(err) - } - - // removing gbkLocationString from features to allow testing for gbkLocationBuilder - for featureIndex := range scrubbedGbk.Features { - scrubbedGbk.Features[featureIndex].Location.GbkLocationString = "" - } - - tmpGbkFilePath := filepath.Join(tmpDataDir, "sample.gbk") - _ = Write(scrubbedGbk, tmpGbkFilePath) - - testInputGbk, _ := Read("../../data/sample.gbk") - testOutputGbk, _ := Read(tmpGbkFilePath) - - if diff := cmp.Diff(testInputGbk, testOutputGbk, []cmp.Option{cmpopts.IgnoreFields(Feature{}, "ParentSequence")}...); diff != "" { - t.Errorf("Issue with partial location building. Parsing the output of Build() does not produce the same output as parsing the original file read with Read(). Got this diff:\n%s", diff) - } -} - -func TestGbLocationStringBuilder(t *testing.T) { - tmpDataDir, err := os.MkdirTemp("", "data-*") - if err != nil { - t.Error(err) - } - defer os.RemoveAll(tmpDataDir) - - scrubbedGb, _ := Read("../../data/t4_intron.gb") - - // removing gbkLocationString from features to allow testing for gbkLocationBuilder - for featureIndex := range scrubbedGb.Features { - scrubbedGb.Features[featureIndex].Location.GbkLocationString = "" - } - - tmpGbFilePath := filepath.Join(tmpDataDir, "t4_intron_test.gb") - _ = Write(scrubbedGb, tmpGbFilePath) - - testInputGb, _ := Read("../../data/t4_intron.gb") - testOutputGb, _ := Read(tmpGbFilePath) - - if diff := cmp.Diff(testInputGb, testOutputGb, []cmp.Option{cmpopts.IgnoreFields(Feature{}, "ParentSequence")}...); diff != "" { - t.Errorf("Issue with either Join or complement location building. Parsing the output of Build() does not produce the same output as parsing the original file read with Read(). Got this diff:\n%s", diff) - } -} - -func TestPartialLocationParseRegression(t *testing.T) { - gbk, _ := Read("../../data/sample.gbk") - - for _, feature := range gbk.Features { - if feature.Location.GbkLocationString == "687..3158>" && (feature.Location.Start != 686 || feature.Location.End != 3158) { - t.Errorf("Partial location for three prime location parsing has failed. Parsing the output of Build() does not produce the same output as parsing the original file read with Read()") - } - } - gbk, err := Read("../../data/sample.gbk") - if err != nil { - t.Errorf("Failed to read sample.gbk. Got err: %s", err) - } - - for _, feature := range gbk.Features { - if feature.Location.GbkLocationString == "687..3158>" && (feature.Location.Start != 686 || feature.Location.End != 3158) { - t.Errorf("Partial location for three prime location parsing has failed. Parsing the output of Build() does not produce the same output as parsing the original file read with Read(). Got location start %d and location end %d. Expected 687..3158>.", feature.Location.Start, feature.Location.End) - } else if feature.Location.GbkLocationString == "<1..206" && (feature.Location.Start != 0 || feature.Location.End != 206) { - t.Errorf("Partial location for five prime location parsing has failed. Parsing the output of Build() does not produce the same output as parsing the original file read with Read().") - } - } -} - -func TestSubLocationStringParseRegression(t *testing.T) { - location := "join(complement(5306942..5307394),complement(5304401..5305029),complement(5303328..5303393),complement(5301928..5302004))" - parsedLocation, err := parseLocation(location) - if err != nil { - t.Errorf("Failed to parse location string. Got err: %s", err) - } - jsonFile, err := os.Open("../../data/parseLocationRegressionTest.json") - // if we os.Open returns an error then handle it - if err != nil { - fmt.Println(err) - } - defer jsonFile.Close() - - byteValue, _ := io.ReadAll(jsonFile) - var testParsedLocation Location - err = json.Unmarshal(byteValue, &testParsedLocation) - if err != nil { - t.Errorf("Failed to unmarshal json. Got err: %s", err) - } - - if diff := cmp.Diff(parsedLocation, testParsedLocation); diff != "" { - t.Errorf("Failed to parse sublocation string. Got this diff:\n%s", diff) - } -} - -func TestSnapgeneGenbankRegression(t *testing.T) { - snapgene, err := Read("../../data/puc19_snapgene.gb") - - if snapgene.Sequence == "" { - t.Errorf("Parsing snapgene returned an empty string. Got error: %s", err) - } -} - -func TestGetSequenceMethod(t *testing.T) { - gbk, _ := Read("../../data/t4_intron.gb") - - // Check to see if GetSequence method works on Features struct - feature, _ := gbk.Features[1].GetSequence() - seq := "atgagattacaacgccagagcatcaaagattcagaagttagaggtaaatggtattttaatatcatcggtaaagattctgaacttgttgaaaaagctgaacatcttttacgtgatatgggatgggaagatgaatgcgatggatgtcctctttatgaagacggagaaagcgcaggattttggatttaccattctgacgtcgagcagtttaaagctgattggaaaattgtgaaaaagtctgtttga" - if feature != seq { - t.Errorf("Feature GetSequence method has failed. Got this:\n%s instead of \n%s", feature, seq) - } -} - -func TestLocationParser(t *testing.T) { - gbk, _ := Read("../../data/t4_intron.gb") - - // Read 1..243 - feature, _ := gbk.Features[1].GetSequence() - seq := "atgagattacaacgccagagcatcaaagattcagaagttagaggtaaatggtattttaatatcatcggtaaagattctgaacttgttgaaaaagctgaacatcttttacgtgatatgggatgggaagatgaatgcgatggatgtcctctttatgaagacggagaaagcgcaggattttggatttaccattctgacgtcgagcagtttaaagctgattggaaaattgtgaaaaagtctgtttga" - if feature != seq { - t.Errorf("Feature sequence parser has changed on test '1..243'. Got this:\n%s instead of \n%s", feature, seq) - } - - // Read join(893..1441,2459..2770) - featureJoin, _ := gbk.Features[6].GetSequence() - seqJoin := "atgaaacaataccaagatttaattaaagacatttttgaaaatggttatgaaaccgatgatcgtacaggcacaggaacaattgctctgttcggatctaaattacgctgggatttaactaaaggttttcctgcggtaacaactaagaagctcgcctggaaagcttgcattgctgagctaatatggtttttatcaggaagcacaaatgtcaatgatttacgattaattcaacacgattcgttaatccaaggcaaaacagtctgggatgaaaattacgaaaatcaagcaaaagatttaggataccatagcggtgaacttggtccaatttatggaaaacagtggcgtgattttggtggtgtagaccaaattatagaagttattgatcgtattaaaaaactgccaaatgataggcgtcaaattgtttctgcatggaatccagctgaacttaaatatatggcattaccgccttgtcatatgttctatcagtttaatgtgcgtaatggctatttggatttgcagtggtatcaacgctcagtagatgttttcttgggtctaccgtttaatattgcgtcatatgctacgttagttcatattgtagctaagatgtgtaatcttattccaggggatttgatattttctggtggtaatactcatatctatatgaatcacgtagaacaatgtaaagaaattttgaggcgtgaacctaaagagctttgtgagctggtaataagtggtctaccttataaattccgatatctttctactaaagaacaattaaaatatgttcttaaacttaggcctaaagatttcgttcttaacaactatgtatcacaccctcctattaaaggaaagatggcggtgtaa" - if featureJoin != seqJoin { - t.Errorf("Feature sequence parser has changed on test 'join(893..1441,2459..2770)'. Got this:\n%s instead of \n%s", featureJoin, seqJoin) - } - - // Read complement(2791..3054) - featureComplement, _ := gbk.Features[10].GetSequence() - seqComplement := "ttattcactacccggcatagacggcccacgctggaataattcgtcatattgtttttccgttaaaacagtaatatcgtagtaacagtcagaagaagttttaactgtggaaattttattatcaaaatactcacgagtcattttatgagtatagtattttttaccataaatggtaataggctgttctggtcctggaacttctaactcgcttgggttaggaagtgtaaaaagaactacaccagaagtatctttaaatcgtaaaatcat" - if featureComplement != seqComplement { - t.Errorf("Feature sequence parser has changed on test 'complement(2791..3054)'. Got this:\n%s instead of \n%s", featureComplement, seqComplement) - } - - // Read join(complement(315..330),complement(339..896)) - // Note: it is known that some software, like Snapgene, assumes that since both strands are in the reverse direction - // that the first sequence should be appended to the reverse sequence, instead of the second sequence - // getting appended to the first. Biopython appends the second sequence to the first, and that is logically - // the most obvious thing to do, so we are implementing it that way. - featureJoinComplement, _ := gbk.Features[3].GetSequence() - seqJoinComplement := "ataccaatttaatcattcatttatatactgattccgtaagggttgttacttcatctattttataccaatgcgtttcaaccatttcacgcttgcttatatcatcaagaaaacttgcgtctaattgaactgttgaattaacacgatgccttttaacgatgcgagaaacaactacttcatctgcataaggtaatgcagcatataacagagcaggcccgccaattacacttactttagaattctgatcaagcatagtttcgaatggtgcattagggcttgacacttgaatttcgccgccagaaatgtaagttatatattgctcccaagtaatatagaaatgtgctaaatcgccgtctttagttacaggataatcacgcgcaaggtcacacaccacaatatggctacgaccaggaagtaatgtaggcaatgactggaacgttttagcacccataatcataattgtgccttcagtacgagctttaaaattctggaggtcctttttaactcgtccccatggtaaaccatcacctaaaccgaatgctaattcattaaagccgtcgaccgttttagttggaga" - if featureJoinComplement != seqJoinComplement { - t.Errorf("Feature sequence parser has changed on test 'join(complement(315..330),complement(339..896))'. Got this:\n%s instead of \n%s", featureJoinComplement, seqJoinComplement) - } - - // Read complement(join(893..1098,1101..2770)) - featureComplementJoin, _ := gbk.Features[5].GetSequence() - seqComplementJoin := "ttacaccgccatctttcctttaataggagggtgtgatacatagttgttaagaacgaaatctttaggcctaagtttaagaacatattttaattgttctttagtagaaagatatcggaatttataaggtagaccacttattaccagctcacaaagctctttaggttcacgcctcaaaatttctttacattgttctacgtgattcatatagatatgagtattaccaccagaaaatatcaaatcccctggaataagattacacatcttagctacaatatgaactaacgtagcatatgacgcaatattaaacggtagcattatgttcagataaggtcgttaatcttaccccggaattatatccagctgcatgtcaccatgcagagcagactatatctccaacttgttaaagcaagttgtctatcgtttcgagtcacttgaccctactccccaaagggatagtcgttaggcatttatgtagaaccaattccatttatcagattttacacgataagtaactaatccagacgaaattttaaaatgtctagctgcatctgctgcacaatcaaaaataaccccatcacatgaaatctttttaatattactaggctttttacctttcatcttttctgatattttagatttagttatgtctgaatgcttatgattaaagaatgaattattttcacctgaacgatttctgcatttactacaagtataagcagaagtttgtatgcgaacaccgcacttacaaaacttatgggtttctggattccaacgcccgtttttacttccgggtttactgtaaagagctttccgaccatcaggtccaagtttaagcatcttagctttaacagtttcagaacgtttcttaataatttcttcttttaatggatgcgtagaacatgtatcaccaaacgttgcatcagcaatattgtatccattaattttagaattaagctctttaatccaaaaattttctcgttcaataatcaaatctttctcatatggaatttcttccaaaatagaacattcaaacacattaccatgtttgttaaaagacctctgaagttttatagaagaatggcatcctttttctaaatctttaaaatgcctcttccatctcttttcaaaatctttagcacttcctacatatactttattgtttaaagtatttttaatctgataaattccgcttttcataaatacctctttaaatatagaagtatttattaaagggcaagtcctacaatttagcacgggattgtctactagagaggttccccgtttagatagattacaagtataagtcaccttatactcaggcctcaattaacccaagaaaacatctactgagcgttgataccactgcaaatccaaatagccattacgcacattaaactgatagaacatatgacaaggcggtaatgccatatatttaagttcagctggattccatgcagaaacaatttgacgcctatcatttggcagttttttaatacgatcaataacttctataatttggtctacaccaccaaaatcacgccactgttttccataaattggaccaagttcaccgctatggtatcctaaatcttttgcttgattttcgtaattttcatcccagactgttttgccttggattaacgaatcgtgttgaattaatcgtaaatcatacatttgtgcttcctgataaaaaccatattagctcagcaatgcaagctttccaggcgagcttcttagttgttaccgcaggaaaacctttagttaaatcccagcgtaatttagatccgaacagagcaattgttcctgtgcctgtacgatcatcggtttcataaccattttcaaaaatgtctttaattaaatcttggtattgtttcat" - if featureComplementJoin != seqComplementJoin { - t.Errorf("Feature sequence parser has changed on test 'complement(join(893..1098,1101..2770))'. Got this:\n%s instead of \n%s", featureComplementJoin, seqComplementJoin) - } -} - -func TestGenbankNewlineParsingRegression(t *testing.T) { - gbk, _ := Read("../../data/puc19.gbk") - - for _, feature := range gbk.Features { - if feature.Location.Start == 410 && feature.Location.End == 1750 && feature.Type == "CDS" { - if feature.Attributes["product"] != "chromosomal replication initiator informational ATPase" { - t.Errorf("Newline parsing has failed.") - } - break - } - } -} - -func BenchmarkRead(b *testing.B) { - for i := 0; i < b.N; i++ { - _, _ = Read("../../data/bsub.gbk") - } -} - -func BenchmarkRead1(b *testing.B) { BenchmarkRead(b) } -func BenchmarkRead10(b *testing.B) { BenchmarkRead(b) } -func BenchmarkRead100(b *testing.B) { BenchmarkRead(b) } -func BenchmarkRead1000(b *testing.B) { BenchmarkRead(b) } -func BenchmarkRead10000(b *testing.B) { BenchmarkRead(b) } - -/****************************************************************************** - -Gbk/gb/genbank related benchmarks end here. - -******************************************************************************/ - -func TestBenchlingGenbank(t *testing.T) { - sequence, _ := Read("../../data/benchling.gb") - - if len(sequence.Features) != 17 { - t.Errorf("Parsing benchling genbank file not returned the correct quantity of features") - } -} - -func TestParse(t *testing.T) { - type args struct { - r io.Reader - } - tests := []struct { - name string - args args - want Genbank - wantErr bool - }{ - // TODO: Add test cases. - // empty line in genbank meta data - // { - - // name: "empty line in genbank meta data", - // args: args{r: strings.NewReader("LOCUS puc19.gbk 2686 bp DNA circular 22-OCT-2019")}, - // wantErr: true, - // }, - } - for _, tt := range tests { - t.Run(tt.name, func(t *testing.T) { - got, err := Parse(tt.args.r) - if (err != nil) != tt.wantErr { - t.Errorf("Parse() error = %v, wantErr %v", err, tt.wantErr) - return - } - if !reflect.DeepEqual(got, tt.want) { - t.Errorf("Parse() = %v, want %v", got, tt.want) - } - }) - } -} - -func TestParseMulti(t *testing.T) { - type args struct { - r io.Reader - } - tests := []struct { - name string - args args - want []Genbank - wantErr bool - }{ - // TODO: Add test cases. - } - for _, tt := range tests { - t.Run(tt.name, func(t *testing.T) { - got, err := ParseMulti(tt.args.r) - if (err != nil) != tt.wantErr { - t.Errorf("ParseMulti() error = %v, wantErr %v", err, tt.wantErr) - return - } - if !reflect.DeepEqual(got, tt.want) { - t.Errorf("ParseMulti() = %v, want %v", got, tt.want) - } - }) - } -} - -// this was hand-written and tests the same as the above suite. -func TestFeature_GetSequence_Legacy(t *testing.T) { - // This test is a little too complex and contrived for an example function. - // Essentially, it's testing GetSequence()'s ability to parse and retrieve sequences from complex location structures. - // This was originally covered in the old package system it was not covered in the new package system so I decided to include it here. - - // Sequence for greenflourescent protein (GFP) that we're using as test data for this example. - gfpSequence := "ATGGCTAGCAAAGGAGAAGAACTTTTCACTGGAGTTGTCCCAATTCTTGTTGAATTAGATGGTGATGTTAATGGGCACAAATTTTCTGTCAGTGGAGAGGGTGAAGGTGATGCTACATACGGAAAGCTTACCCTTAAATTTATTTGCACTACTGGAAAACTACCTGTTCCATGGCCAACACTTGTCACTACTTTCTCTTATGGTGTTCAATGCTTTTCCCGTTATCCGGATCATATGAAACGGCATGACTTTTTCAAGAGTGCCATGCCCGAAGGTTATGTACAGGAACGCACTATATCTTTCAAAGATGACGGGAACTACAAGACGCGTGCTGAAGTCAAGTTTGAAGGTGATACCCTTGTTAATCGTATCGAGTTAAAAGGTATTGATTTTAAAGAAGATGGAAACATTCTCGGACACAAACTCGAGTACAACTATAACTCACACAATGTATACATCACGGCAGACAAACAAAAGAATGGAATCAAAGCTAACTTCAAAATTCGCCACAACATTGAAGATGGATCCGTTCAACTAGCAGACCATTATCAACAAAATACTCCAATTGGCGATGGCCCTGTCCTTTTACCAGACAACCATTACCTGTCGACACAATCTGCCCTTTCGAAAGATCCCAACGAAAAGCGTGACCACATGGTCCTTCTTGAGTTTGTAACTGCTGCTGGGATTACACATGGCATGGATGAGCTCTACAAATAA" - - sequenceLength := len(gfpSequence) - - // Splitting the sequence into two parts to make a multi-location feature. - sequenceFirstHalf := gfpSequence[:sequenceLength/2] - sequenceSecondHalf := transform.ReverseComplement(gfpSequence[sequenceLength/2:]) // This feature is reverse complemented. - - // rejoining the two halves into a single string where the second half of the sequence is reverse complemented. - gfpSequenceModified := sequenceFirstHalf + sequenceSecondHalf - - // initialize sequence and feature structs. - var sequence Genbank - var feature Feature - - // set the initialized sequence struct's sequence. - sequence.Sequence = gfpSequenceModified - // initialize sublocations to be usedin the feature. - - var subLocation Location - var subLocationReverseComplemented Location - - subLocation.Start = 0 - subLocation.End = sequenceLength / 2 - - subLocationReverseComplemented.Start = sequenceLength / 2 - subLocationReverseComplemented.End = sequenceLength - subLocationReverseComplemented.Complement = true // According to genbank complement means reverse complement. What a country. - - feature.Description = "Green Fluorescent Protein" - feature.Location.SubLocations = []Location{subLocation, subLocationReverseComplemented} - - // Add the GFP feature to the sequence struct. - _ = sequence.AddFeature(&feature) - - // get the GFP feature sequence string from the sequence struct. - featureSequence, _ := feature.GetSequence() - - // check to see if the feature was inserted properly into the sequence. - if gfpSequence != featureSequence { - t.Error("Feature sequence was not properly retrieved.") - } -} - -func Test_parseLoopParameters_init(t *testing.T) { - type fields struct { - newLocation bool - quoteActive bool - attribute string - attributeValue string - sequenceBuilder strings.Builder - parseStep string - genbank Genbank - feature Feature - features []Feature - metadataTag string - metadataData []string - genbankStarted bool - } - tests := []struct { - name string - fields fields - }{ - // TODO: Add test cases. - } - for _, tt := range tests { - t.Run(tt.name, func(t *testing.T) { - params := &parseLoopParameters{ - newLocation: tt.fields.newLocation, - quoteActive: tt.fields.quoteActive, - attribute: tt.fields.attribute, - attributeValue: tt.fields.attributeValue, - sequenceBuilder: tt.fields.sequenceBuilder, - parseStep: tt.fields.parseStep, - genbank: tt.fields.genbank, - feature: tt.fields.feature, - features: tt.fields.features, - metadataTag: tt.fields.metadataTag, - metadataData: tt.fields.metadataData, - genbankStarted: tt.fields.genbankStarted, - } - params.init() - }) - } -} - -func TestParseMultiNth(t *testing.T) { - type args struct { - r io.Reader - count int - } - tests := []struct { - name string - args args - want []Genbank - wantErr bool - }{ - // TODO: Add test cases. - } - for _, tt := range tests { - t.Run(tt.name, func(t *testing.T) { - got, err := ParseMultiNth(tt.args.r, tt.args.count) - if (err != nil) != tt.wantErr { - t.Errorf("ParseMultiNth() error = %v, wantErr %v", err, tt.wantErr) - return - } - if !reflect.DeepEqual(got, tt.want) { - t.Errorf("ParseMultiNth() = %v, want %v", got, tt.want) - } - }) - } -} - -func Test_parseMetadata(t *testing.T) { - type args struct { - metadataData []string - } - tests := []struct { - name string - args args - want string - }{ - // TODO: Add test cases. - } - for _, tt := range tests { - t.Run(tt.name, func(t *testing.T) { - if got := parseMetadata(tt.args.metadataData); got != tt.want { - t.Errorf("parseMetadata() = %v, want %v", got, tt.want) - } - }) - } -} - -func Test_parseReferences(t *testing.T) { - type args struct { - metadataData []string - } - tests := []struct { - name string - args args - want Reference - wantErr bool - }{ - // TODO: Add test cases. - } - for _, tt := range tests { - t.Run(tt.name, func(t *testing.T) { - got, err := parseReferences(tt.args.metadataData) - if (err != nil) != tt.wantErr { - t.Errorf("parseReferences() error = %v, wantErr %v", err, tt.wantErr) - return - } - if !reflect.DeepEqual(got, tt.want) { - t.Errorf("parseReferences() = %v, want %v", got, tt.want) - } - }) - } -} - -func TestReference_addKey(t *testing.T) { - type fields struct { - Authors string - Title string - Journal string - PubMed string - Remark string - Range string - } - type args struct { - referenceKey string - referenceValue string - } - tests := []struct { - name string - fields fields - args args - wantErr bool - }{ - // TODO: Add test cases. - } - for _, tt := range tests { - t.Run(tt.name, func(t *testing.T) { - reference := &Reference{ - Authors: tt.fields.Authors, - Title: tt.fields.Title, - Journal: tt.fields.Journal, - PubMed: tt.fields.PubMed, - Remark: tt.fields.Remark, - Range: tt.fields.Range, - } - if err := reference.addKey(tt.args.referenceKey, tt.args.referenceValue); (err != nil) != tt.wantErr { - t.Errorf("Reference.addKey() error = %v, wantErr %v", err, tt.wantErr) - } - }) - } -} - -func Test_parseLocus(t *testing.T) { - type args struct { - locusString string - } - tests := []struct { - name string - args args - want Locus - }{ - // TODO: Add test cases. - } - for _, tt := range tests { - t.Run(tt.name, func(t *testing.T) { - if got := parseLocus(tt.args.locusString); !reflect.DeepEqual(got, tt.want) { - t.Errorf("parseLocus() = %v, want %v", got, tt.want) - } - }) - } -} - -func TestRead(t *testing.T) { - type args struct { - path string - } - tests := []struct { - name string - args args - want Genbank - wantErr bool - }{ - // TODO: Add test cases. - { - name: "error on missing file", - args: args{ - path: "../../afdaljhdfa.txt", - }, - wantErr: true, - }, - { - name: "error on malformed file", - args: args{ - path: "../../data/malformed_read_test.gbk", - }, - wantErr: true, - }, - } - for _, tt := range tests { - t.Run(tt.name, func(t *testing.T) { - got, err := Read(tt.args.path) - if (err != nil) != tt.wantErr { - t.Errorf("Read() error = %v, wantErr %v", err, tt.wantErr) - return - } - if !reflect.DeepEqual(got, tt.want) { - t.Errorf("Read() = %v, want %v", got, tt.want) - } - }) - } -} - -func TestReadMulti(t *testing.T) { - type args struct { - path string - } - tests := []struct { - name string - args args - want []Genbank - wantErr bool - }{ - // TODO: Add test cases. - } - for _, tt := range tests { - t.Run(tt.name, func(t *testing.T) { - got, err := ReadMulti(tt.args.path) - if (err != nil) != tt.wantErr { - t.Errorf("ReadMulti() error = %v, wantErr %v", err, tt.wantErr) - return - } - if !reflect.DeepEqual(got, tt.want) { - t.Errorf("ReadMulti() = %v, want %v", got, tt.want) - } - }) - } -} - -func Test_parseLocation(t *testing.T) { - type args struct { - locationString string - } - tests := []struct { - name string - args args - want Location - wantErr bool - }{ - // TODO: Add test cases. - } - for _, tt := range tests { - t.Run(tt.name, func(t *testing.T) { - got, err := parseLocation(tt.args.locationString) - if (err != nil) != tt.wantErr { - t.Errorf("parseLocation() error = %v, wantErr %v", err, tt.wantErr) - return - } - if !reflect.DeepEqual(got, tt.want) { - t.Errorf("parseLocation() = %v, want %v", got, tt.want) - } - }) - } -} - -func Test_buildMetaString(t *testing.T) { - type args struct { - name string - data string - } - tests := []struct { - name string - args args - want string - }{ - // TODO: Add test cases. - } - for _, tt := range tests { - t.Run(tt.name, func(t *testing.T) { - if got := buildMetaString(tt.args.name, tt.args.data); got != tt.want { - t.Errorf("buildMetaString() = %v, want %v", got, tt.want) - } - }) - } -} - -func TestBuildLocationString(t *testing.T) { - type args struct { - location Location - } - tests := []struct { - name string - args args - want string - }{ - // TODO: Add test cases. - } - for _, tt := range tests { - t.Run(tt.name, func(t *testing.T) { - if got := BuildLocationString(tt.args.location); got != tt.want { - t.Errorf("BuildLocationString() = %v, want %v", got, tt.want) - } - }) - } -} - -func Test_generateWhiteSpace(t *testing.T) { - type args struct { - length int - } - tests := []struct { - name string - args args - want string - }{ - // TODO: Add test cases. - } - for _, tt := range tests { - t.Run(tt.name, func(t *testing.T) { - if got := generateWhiteSpace(tt.args.length); got != tt.want { - t.Errorf("generateWhiteSpace() = %v, want %v", got, tt.want) - } - }) - } -} - -func TestRead_error(t *testing.T) { - readErr := errors.New("open /tmp/file: no such file or directory") - oldReadFileFn := readFileFn - readFileFn = func(filename string) ([]byte, error) { - return nil, readErr - } - defer func() { - readFileFn = oldReadFileFn - }() - _, err := Read("/tmp/file") - assert.EqualError(t, err, readErr.Error()) -} - -func TestBuildFeatureString(t *testing.T) { - feature := Feature{ - Type: "test type", - Description: "a description", - Location: Location{ - GbkLocationString: "gbk location", - }, - } - str := BuildFeatureString(feature) - assert.Equal(t, str, " test type gbk location\n") -} - -func TestParse_error(t *testing.T) { - parseMultiErr := errors.New("parse error") - oldParseMultiNthFn := parseMultiNthFn - parseMultiNthFn = func(r io.Reader, count int) ([]Genbank, error) { - return nil, parseMultiErr - } - defer func() { - parseMultiNthFn = oldParseMultiNthFn - }() - _, err := Parse(strings.NewReader("")) - assert.EqualError(t, err, parseMultiErr.Error()) - - _, err = ParseMulti(strings.NewReader("")) - assert.EqualError(t, err, parseMultiErr.Error()) -} - -func TestParseReferences_error(t *testing.T) { - parseReferencesErr := errors.New("Failed in parsing reference above line 13. Got error: ") - oldParseReferencesFn := parseReferencesFn - parseReferencesFn = func(metadataData []string) (Reference, error) { - return Reference{}, errors.New("") - } - defer func() { - parseReferencesFn = oldParseReferencesFn - }() - file, _ := os.Open("../../data/puc19.gbk") - _, err := parseMultiNthFn(file, 1) - assert.EqualError(t, err, parseReferencesErr.Error()) -} - -func TestIssue303Regression(t *testing.T) { - seq, _ := Read("../../data/puc19_303_regression.gbk") - expectedAttribute := "16S rRNA(adenine(1518)-N(6)/adenine(1519)-N(6))-dimethyltransferase" - for _, feature := range seq.Features { - if feature.Attributes["locus_tag"] == "JCVISYN3A_0004" && feature.Type == "CDS" { - if feature.Attributes["product"] != expectedAttribute { - t.Errorf("Failed to get proper expected attribute. Got: %s Expected: %s", feature.Attributes["product"], expectedAttribute) - } - } - if feature.Attributes["locus_tag"] == "JCVISYN3A_0051" && feature.Type == "CDS" { - if _, ok := feature.Attributes["pseudo"]; !ok { - t.Errorf("pseudo should be in attributes") - } - } - } -} - -func TestConsortiumRegression(t *testing.T) { - _, err := Read("../../data/puc19_consrtm.gbk") - if err != nil { - t.Errorf("Failed to read consrtm. Got err: %s", err) - } -} diff --git a/io/genbank/parser.go b/io/genbank/parser.go new file mode 100644 index 00000000..dc4e0f72 --- /dev/null +++ b/io/genbank/parser.go @@ -0,0 +1,747 @@ +package genbank + +import ( + "bufio" + "fmt" + "io" + "regexp" + "strconv" + "strings" + "time" + "unicode" +) + +/* PARSER TYPE DEFINITIONS */ + +// A Parser stores state during parsing of a Genbank +// flatfile. +type Parser struct { + reader *bufio.Reader + line uint + currLine string + peekedLine string +} + +// A parseContext holds the data structures currently being parsed. +type parseContext struct { + entry *Entry + reference *Reference +} + +// NewParser instantiates a new Genbank parser to +// parse a flatfile in an io.Reader. +func NewParser(reader io.Reader) *Parser { + return &Parser{ + reader: bufio.NewReader(reader), + line: 0, + } +} + +// A tokenRange holds indices in which a token is expected to occur. +type tokenRange struct { + start int + end int +} + +/* PARSER UTILITY FUNCTIONS */ + +func (p *Parser) makeSyntaxError(msg string, innerError ...error) GenbankSyntaxError { + res := GenbankSyntaxError{ + Msg: msg, + Line: p.line, + Context: p.currLine, + } + if len(innerError) > 0 { + res.InnerErr = innerError[0] + } + + return res +} + +func (p *Parser) makeWrongContextError(linetype lineType) GenbankSyntaxError { + switch linetype { + case keyword: + return p.makeSyntaxError("unexpected keyword found") + case empty: + return p.makeSyntaxError("unexpected empty line found") + case eof: + return p.makeSyntaxError("unexpected end of file found") + case subKeyword: + return p.makeSyntaxError("subkeyword found outside of keyword context") + case entryEnd: + return p.makeSyntaxError("unexpected end of entry found") + case continuation: + return p.makeSyntaxError("continuation found outside of keyword context") + case featureKey: + return p.makeSyntaxError("feature key found outside of FEATURES keyword context") + case sequence: + return p.makeSyntaxError("sequence found outside of ORIGIN keyword context") + default: + return p.makeSyntaxError("unknown linetype found, please report this to repository maintainers") + } +} + +// readLine reads a line from the underlying reader. +// Does not return a line's newline. +func (p *Parser) readLine() (string, error) { + // If we previously peeked a line, make sure we output it + // before we start reading new lines. Otherwise, read + // from the underlying reader. + var res string + if p.peekedLine != "" { + res = p.peekedLine + } else { + var err error + res, err = p.reader.ReadString('\n') + if err != nil && !(err == io.EOF && res != "") { // Don't return an error if we are reading the last line + return "", err + } + } + + p.peekedLine = "" + p.line++ + p.currLine = res + trimmed, _ := strings.CutSuffix(res, "\n") + return trimmed, nil +} + +// peekLine returns the next line without consuming it from +// the underlying reader. Repeated calls to peekLine return +// the same result. Does not return a line's newline. +func (p *Parser) peekLine() (string, error) { + // If we've already peeked this line, simply return it again. + if p.peekedLine != "" { + res, _ := strings.CutSuffix(p.peekedLine, "\n") + return res, nil + } + + res, err := p.reader.ReadString('\n') + if err != nil && !(err == io.EOF && res != "") { // Don't return an error if we are reading the last line + return "", err + } + + p.peekedLine = res + res, _ = strings.CutSuffix(res, "\n") + return res, nil +} + +type lineType int + +const ( + empty lineType = iota + subKeyword + continuation + featureKey + sequence + entryEnd + keyword + eof +) + +// Determine which type of information the next line encodes +// (see Genbank spec 3.4.2). +func (p *Parser) peekNextLineType() (lineType, error) { + nextLine, err := p.peekLine() + if err == io.EOF { + return eof, nil + } else if err != nil { + return empty, err + } + + switch { + case len(strings.TrimSpace(nextLine)) == 0: + return empty, nil + case unicode.IsLetter(rune(nextLine[0])): + return keyword, nil + case strings.HasPrefix(nextLine, "//"): + return entryEnd, nil + case unicode.IsLetter(rune(nextLine[2])): + return subKeyword, nil + case !unicode.IsSpace(rune(nextLine[5])): + return featureKey, nil + } + + if _, err := strconv.Atoi(strings.TrimSpace(nextLine[:9])); err == nil { + return sequence, nil + } else if len(nextLine) > 0 { + return continuation, nil + } + + return empty, fmt.Errorf("unrecognized line type") +} + +func (p *Parser) dispatchByPrefix(pCtx parseContext, funcs map[string]func(parseContext) error) (err error) { + line, err := p.peekLine() + if err != nil { + return err + } + + for prefix, parseFunc := range funcs { + if strings.HasPrefix(line, prefix) { + return parseFunc(pCtx) + } + } + + p.readLine() // We only peeked previously, so we have to read for proper error reporting + return p.makeSyntaxError("keyword expected, none found") +} + +/* MAIN PARSER FUNCTIONS */ + +// Parse parses a Genbank file from a Parser's +// io.Reader. +func (p *Parser) Parse() (Genbank, error) { + res := Genbank{} + + header, err := p.parseHeader() + if err != nil { + return res, fmt.Errorf("failed to parse header: %w", err) + } + res.Header = header + + res.Entries = make([]Entry, 0) + var entry Entry + reachedEOF := false + for !reachedEOF { + entry, reachedEOF, err = p.parseEntry() + if err != nil { + return res, fmt.Errorf("failed to parse entry %v: %w", len(res.Entries), err) + } + res.Entries = append(res.Entries, entry) + + } + + return res, nil +} + +const headerDateLayout = "January 2 2006" + +var fileNameRange = tokenRange{start: 0, end: 10} +var releaseNumRange = tokenRange{start: 47, end: 52} +var entryNumRange = tokenRange{start: 0, end: 8} +var baseNumRange = tokenRange{start: 15, end: 26} +var sequenceNumRange = tokenRange{start: 39, end: 47} + +func (p *Parser) parseHeader() (Header, error) { + res := Header{} + + fileNameLine, err := p.readLine() + if err != nil { + return res, fmt.Errorf("failed to read file name: %w", err) + } + fileName := strings.TrimSpace(fileNameLine[fileNameRange.start:fileNameRange.end]) + if fileName == "" { + return res, p.makeSyntaxError("empty file name") + } + res.FileName = fileName + + dateLine, err := p.readLine() + if err != nil { + return res, fmt.Errorf("failed to read date: %w", err) + } + date, err := time.Parse(headerDateLayout, strings.TrimSpace(dateLine)) + if err != nil { + return res, p.makeSyntaxError("failed to parse date", err) + } + res.Date = date + + p.readLine() // Skip empty line + + releaseLine, err := p.readLine() + if err != nil { + return res, fmt.Errorf("failed to read release: %w", err) + } + if len(releaseLine) < releaseNumRange.end { + return res, p.makeSyntaxError("release line too short") + } + release := strings.Split(strings.TrimSpace(releaseLine[releaseNumRange.start:releaseNumRange.end]), ".") + if len(release) != 2 { + return res, p.makeSyntaxError("malformed release version") + } + res.MajorRelease, err = strconv.Atoi(release[0]) + if err != nil { + return res, p.makeSyntaxError("failed to parse major release", err) + } + res.MinorRelease, err = strconv.Atoi(release[1]) + if err != nil { + return res, p.makeSyntaxError("failed to parse minor release", err) + } + + p.readLine() // Skip empty line + + titleLine, err := p.readLine() + if err != nil { + return res, fmt.Errorf("failed to read title: %w", err) + } + res.Title = strings.TrimSpace(titleLine) + + p.readLine() // Skip empty line + + statsLine, err := p.readLine() + if err != nil { + return res, fmt.Errorf("failed to read file statistics: %w", err) + } + if len(statsLine) < sequenceNumRange.end { + return res, p.makeSyntaxError("file statistics line too short") + } + res.NumEntries, err = strconv.Atoi(strings.TrimSpace(statsLine[entryNumRange.start:entryNumRange.end])) + if err != nil { + return res, p.makeSyntaxError("failed to parse number of entries/loci", err) + } + res.NumBases, err = strconv.Atoi(strings.TrimSpace(statsLine[baseNumRange.start:baseNumRange.end])) + if err != nil { + return res, p.makeSyntaxError("failed to parse number of bases", err) + } + res.NumSequences, err = strconv.Atoi(strings.TrimSpace(statsLine[sequenceNumRange.start:sequenceNumRange.end])) + if err != nil { + return res, p.makeSyntaxError("failed to parse number of sequences", err) + } + + p.readLine() // Skip empty line + + return res, nil +} + +func (p *Parser) parseEntry() (entry Entry, reachedEOF bool, err error) { + keywordFuncs := map[string]func(parseContext) error{ + "LOCUS": p.parseLocus, + "DEFINITION": p.parseDefinition, + "ACCESSION": p.parseAccession, + "VERSION": p.parseVersion, + "DBLINK": p.parseDBLink, + "KEYWORDS": p.parseKeywords, + "SEGMENT": p.parseSegment, + "SOURCE": p.parseSource, + } + + res := Entry{} + pCtx := parseContext{entry: &res} + for { + linetype, err := p.peekNextLineType() + if err != nil { + return res, false, fmt.Errorf("could not parse entry: %w", err) + } + + switch linetype { + case keyword: + err = p.dispatchByPrefix(pCtx, keywordFuncs) + if err == io.EOF { + return res, true, nil + } else if err != nil { + return res, false, err + } else if _, err := p.peekLine(); err == io.EOF { + return res, true, nil + } + + case empty: + p.readLine() // We need to read the empty line to advance the reader + case eof: + return res, true, nil + case entryEnd: + return res, false, nil + default: + return res, false, p.makeWrongContextError(linetype) + } + } +} + +/* KEYWORD PARSING FUNCTIONS */ + +const locusDateLayout = "02-Jan-2006" + +// See Genbank spec 3.4.4 +func (p *Parser) parseLocus(pCtx parseContext) error { + line, err := p.readLine() + if err != nil { + return fmt.Errorf("failed to read LOCUS line: %w", err) + } + + re := regexp.MustCompile(`\s+`) + tokens := re.Split(line, -1) + if len(tokens) != 8 { + return fmt.Errorf("LOCUS line should have 8 tokens, got %v", len(tokens)) + } + + pCtx.entry.Name = tokens[1] + + length, err := strconv.Atoi(tokens[2]) + if err != nil { + return p.makeSyntaxError("failed to parse sequence length", err) + } + pCtx.entry.Length = length + + molecule := strings.Split(tokens[4], "-") + if len(molecule) == 2 { + pCtx.entry.Strandedness = Strandedness(molecule[0]) + pCtx.entry.MoleculeType = MoleculeType(molecule[1]) + } else { + pCtx.entry.Strandedness = Unknown + pCtx.entry.MoleculeType = MoleculeType(molecule[0]) + } + + pCtx.entry.MoleculeToplogy = MoleculeToplogy(tokens[5]) + + pCtx.entry.DivisionCode = DivisionCode(tokens[6]) + + date, err := time.Parse(locusDateLayout, tokens[7]) + if err != nil { + return p.makeSyntaxError("failed to parse entry date", err) + } + pCtx.entry.UpdateDate = date + + return nil +} + +// See Genbank spec 3.4.5 +func (p *Parser) parseDefinition(pCtx parseContext) error { + line, err := p.readLine() + if err != nil { + return fmt.Errorf("failed to read DEFINITION line: %w", err) + } + + after, _ := strings.CutPrefix(line, "DEFINITION") + after = strings.TrimSpace(after) + pCtx.entry.Definition = appendLine(pCtx.entry.Definition, after) + + for { + linetype, err := p.peekNextLineType() + if err != nil { + return err + } + + switch linetype { + case empty: + _, err := p.readLine() + if err != nil { + return err + } + pCtx.entry.Definition = appendLine(pCtx.entry.Definition, "\n") + case continuation: + line, err := p.readLine() + if err != nil { + return err + } + pCtx.entry.Definition = appendLine(pCtx.entry.Definition, line[10:]) + default: + return nil + } + } +} + +// See Genbank spec 3.4.6 +func (p *Parser) parseAccession(pCtx parseContext) error { + line, err := p.readLine() + if err != nil { + return fmt.Errorf("failed to read ACCESSION line: %w", err) + } + + after, _ := strings.CutPrefix(line, "ACCESSION") + after = strings.TrimSpace(after) + pCtx.entry.Accession += after + + return nil +} + +// See Genbank spec 3.4.7.1 +func (p *Parser) parseVersion(pCtx parseContext) error { + line, err := p.readLine() + if err != nil { + return fmt.Errorf("failed to read VERSION line: %w", err) + } + + after, _ := strings.CutPrefix(line, "VERSION") + accessionVersion := strings.Split(strings.TrimSpace(after), ".") + if len(accessionVersion) != 2 { + return p.makeSyntaxError("could not parse accession version") + } + version, err := strconv.Atoi(accessionVersion[1]) + if err != nil { + return p.makeSyntaxError("accession version should be an integer") + } + pCtx.entry.AccessionVersion = version + + return nil +} + +// See Genbank spec 3.4.7.2 +func (p *Parser) parseDBLink(pCtx parseContext) error { + line, err := p.readLine() + if err != nil { + return fmt.Errorf("failed to read DBLINK line: %w", err) + } + + if pCtx.entry.DatabaseLinks == nil { + pCtx.entry.DatabaseLinks = make(map[string][]string) + } + + after, _ := strings.CutPrefix(line, "DBLINK") + crossRefType, crossRefs, err := parseRawDBLink(after) + if err != nil { + return p.makeSyntaxError("failed to parse DBLINK entry", err) + } + pCtx.entry.DatabaseLinks[crossRefType] = append(pCtx.entry.DatabaseLinks[crossRefType], crossRefs...) + + for { + lineType, err := p.peekNextLineType() + if err == io.EOF { + return nil + } else if err != nil { + return err + } + + switch lineType { + case empty: + _, err := p.readLine() + if err != nil { + return err + } + case continuation: + line, err := p.readLine() + if err != nil { + return err + } + crossRefType, crossRefs, err := parseRawDBLink(line) + if err != nil { + return p.makeSyntaxError("failed to parse DBLINK entry", err) + } + pCtx.entry.DatabaseLinks[crossRefType] = append(pCtx.entry.DatabaseLinks[crossRefType], crossRefs...) + default: + return nil + } + } +} + +func parseRawDBLink(data string) (crossRefType string, crossRefs []string, err error) { + before, after, found := strings.Cut(data, ":") + if !found { + return "", nil, fmt.Errorf("should be in the format TYPE:REFERENCE") + } + + crossRefType = strings.TrimSpace(before) + for _, crossRef := range strings.Split(after, ",") { + crossRefs = append(crossRefs, strings.TrimSpace(crossRef)) + } + + return +} + +// See Genbank spec section 3.4.8 +func (p *Parser) parseKeywords(pCtx parseContext) error { + line, err := p.readLine() + if err != nil { + return fmt.Errorf("failed to read KEYWORDS line: %w", err) + } + + after, _ := strings.CutPrefix(line, "KEYWORDS") + after, endsWithPeriod := strings.CutSuffix(after, ".") + if !endsWithPeriod { + return p.makeSyntaxError("KEYWORDS line must end with a period") + } + keywords := strings.Split(after, ";") + for _, keyword := range keywords { + pCtx.entry.Keywords = append(pCtx.entry.Keywords, strings.TrimSpace(keyword)) + } + + return nil +} + +// See Genbank spec section 3.4.9 +func (p *Parser) parseSegment(pCtx parseContext) error { + line, err := p.readLine() + if err != nil { + return fmt.Errorf("failed to read KEYWORDS line: %w", err) + } + + after, _ := strings.CutPrefix(line, "SEGMENT") + segmentStr, totalSegmentsStr, found := strings.Cut(after, " of ") + if !found { + return p.makeSyntaxError("malformed SEGMENT line, should be in the form 'n of m'") + } + segment, err := strconv.Atoi(strings.TrimSpace(segmentStr)) + if err != nil { + return p.makeSyntaxError("could not parse segment number", err) + } + totalSegments, err := strconv.Atoi(strings.TrimSpace(totalSegmentsStr)) + if err != nil { + return p.makeSyntaxError("could not parse total number of segments", err) + } + + pCtx.entry.Segment = segment + pCtx.entry.TotalSegments = totalSegments + + return nil + +} + +// See Genbank spec section 3.4.10 +func (p *Parser) parseSource(pCtx parseContext) error { + line, err := p.readLine() + if err != nil { + return fmt.Errorf("failed to read SOURCE line: %w", err) + } + + after, _ := strings.CutPrefix(line, "SOURCE") + after, endsWithPeriod := strings.CutSuffix(strings.TrimSpace(after), ".") + if !endsWithPeriod { + return p.makeSyntaxError("SOURCE line must end with a period") + } + pCtx.entry.Source.Name = strings.TrimSpace(after) + + return p.parseOrganism(pCtx) + +} + +// See Genbank spec section 3.4.10 +func (p *Parser) parseOrganism(pCtx parseContext) error { + organismLine, err := p.readLine() + if err != nil { + return fmt.Errorf("failed to read ORGANISM line: %w", err) + } + + after, organismKeyword := strings.CutPrefix(organismLine, " ORGANISM") + if !organismKeyword { + return p.makeSyntaxError("SOURCE keyword must be followed by ORGANISM subkeyword") + } + after, endsWithPeriod := strings.CutSuffix(strings.TrimSpace(after), ".") + if !endsWithPeriod { + return p.makeSyntaxError("ORGANISM line must end with a period") + } + + pCtx.entry.Source.ScientificName = strings.TrimSpace(after) + + return p.parseTaxonomy(pCtx) +} + +// See Genbank spec section 3.4.10 +func (p *Parser) parseTaxonomy(pCtx parseContext) error { + for { + lineType, err := p.peekNextLineType() + if err != nil { + return err + } + + switch lineType { + case empty: + p.readLine() // skip empty lines + case continuation: + line, err := p.readLine() + if err != nil { + return err + } + + lineType, err := p.peekNextLineType() + if err != nil { + return err + } + if lineType != continuation && !strings.HasSuffix(line, ".") { + return p.makeSyntaxError("final line of ORGANISM subkeyword taxonomy must end in a period") + } else if lineType == continuation && !strings.HasSuffix(line, ";") { + return p.makeSyntaxError("ORGANISM subkeyword taxonomy lines must end in a semicolon") + } + + taxa := strings.Split(strings.TrimSpace(line[:len(line)-1]), ";") + for _, taxon := range taxa { + pCtx.entry.Source.Taxonomy = append(pCtx.entry.Source.Taxonomy, strings.TrimSpace(taxon)) + } + default: + return nil + } + + } +} + +// See Genbank spec section 3.4.11 +func (p *Parser) parseReference(pCtx parseContext) error { + line, err := p.readLine() + if err != nil { + return err + } + + after, _ := strings.CutPrefix(line, "REFERENCE") + after = strings.TrimSpace(after) + + re := regexp.MustCompile(`(\d+)\s+\(bases (\d)+ to (\d+)`) + matches := re.FindStringSubmatch(after) + if matches == nil { + return p.makeSyntaxError("malformed REFERENCE line") + } + + refNum, _ := strconv.Atoi(matches[0]) + basesFrom, _ := strconv.ParseUint(matches[1], 10, 0) + basesTo, _ := strconv.ParseUint(matches[2], 10, 0) + + pCtx.entry.References = append(pCtx.entry.References, Reference{ + Number: refNum, + BaseRange: Range{ + From: uint(basesFrom), + To: uint(basesTo), + }, + }) + pCtx.reference = &pCtx.entry.References[len(pCtx.entry.References)-1] + + subKeywordFuncs := map[string]func(parseContext) error{} + + for { + lineType, err := p.peekNextLineType() + if err != nil { + return fmt.Errorf("could not parse REFERENCE keyword: %w", err) + } + + switch lineType { + case subKeyword: + err = p.dispatchByPrefix(pCtx, subKeywordFuncs) + if err != nil { + return err + } + case empty: + p.readLine() // We need to read the empty line to advance the reader + + default: + pCtx.reference = nil + return nil + } + } +} + +// See Genbank spec section 3.4.11 +func (p *Parser) parseAuthors(pCtx parseContext) error { + line, err := p.readLine() + if err != nil { + return err + } + + authors, _ := strings.CutPrefix(line, "AUTHORS") + authors = strings.TrimSpace(authors) + for { + lineType, err := p.peekNextLineType() + if err != nil { + return fmt.Errorf("could not parse AUTHORS subkeyword: %w", err) + } + switch lineType { + case continuation: + line, err := p.readLine() + if err != nil { + return err + } + + authors += strings.TrimSpace(line[:len(line)-1]) + case empty: + p.readLine() // We need to read the empty line to advance the reader + default: + return nil + } + } +} + +/* OTHER UTILITY FUNCTIONS */ + +// appendLine appends a line to s. If s is empty, simply +// returns append. If s is not empty, ensures that s is followed +// by a newline and then what is contained in apend. +func appendLine(s string, append string) string { + if s == "" { + return append + } + + return strings.TrimSuffix(s, "\n") + "\n" + append +} diff --git a/io/genbank/parser_test.go b/io/genbank/parser_test.go new file mode 100644 index 00000000..b3cd10d8 --- /dev/null +++ b/io/genbank/parser_test.go @@ -0,0 +1,203 @@ +package genbank + +import ( + "strings" + "testing" + "time" + + "github.com/google/go-cmp/cmp" +) + +func TestParseHeaderErrs(t *testing.T) { + testcases := []struct { + name string + data string + }{{ + name: "fails on premature EOF", + data: `GBBCT1.SEQ Genetic Sequence Data Bank + October 15 2023 + + NCBI-GenBank Flat File Release`, + }} + + for _, tc := range testcases { + t.Run(tc.name, func(t *testing.T) { + p := NewParser(strings.NewReader(tc.data)) + got, err := p.parseHeader() + + if err == nil { + t.Fatalf("no error returned by parseHeader on invalid input, got: %v", got) + } + }) + } + +} + +func TestParseHeaderRoundTrip(t *testing.T) { + testcases := []struct { + name string + data Header + }{{ + name: "parses example header from spec", + data: Header{ + FileName: "GBBCT1.SEQ", + Date: time.Date(2023, time.October, 15, 0, 0, 0, 0, time.UTC), + Title: "Bacterial Sequences (Part 1)", + NumEntries: 51396, + NumBases: 92682287, + NumSequences: 51396, + MajorRelease: 258, + MinorRelease: 0, + }}, { + name: "parses header with largest possible ints in stats line", + data: Header{ + FileName: "GBBCT1.SEQ", + Date: time.Date(2023, time.October, 15, 0, 0, 0, 0, time.UTC), + Title: "Bacterial Sequences (Part 1)", + NumEntries: 99999999, + NumBases: 99999999999, + NumSequences: 99999999, + MajorRelease: 258, + MinorRelease: 0, + }}, { + name: "parses header with smallest possible ints in stats line", + data: Header{ + FileName: "GBBCT1.SEQ", + Date: time.Date(2023, time.October, 15, 0, 0, 0, 0, time.UTC), + Title: "Bacterial Sequences (Part 1)", + NumEntries: 1, + NumBases: 1, + NumSequences: 1, + MajorRelease: 258, + MinorRelease: 0, + }}} + + for _, tc := range testcases { + t.Run(tc.name, func(t *testing.T) { + headerStr := tc.data.String() + + p := NewParser(strings.NewReader(headerStr)) + got, err := p.parseHeader() + + if err != nil { + t.Fatalf("unexpected error from parseHeader: %v", err) + } + + if diff := cmp.Diff(tc.data, got); diff != "" { + t.Fatalf("parseHeader returned incorrect header after round trip, (-want, +got): %v", diff) + } + + }) + } +} + +func TestParseEntry(t *testing.T) { + testcases := []struct { + name string + data string + want Entry + wantReachedEOF bool + wantErr bool + }{{ + name: "parses LOCUS keyword", + data: "LOCUS CP032762 5868661 bp DNA circular BCT 15-OCT-2018", + wantReachedEOF: true, + want: Entry{ + Name: "CP032762", + Length: 5868661, + Strandedness: Unknown, + MoleculeType: DNA, + MoleculeToplogy: Circular, + DivisionCode: BacterialDivision, + UpdateDate: time.Date(2018, time.October, 15, 0, 0, 0, 0, time.UTC), + }, + }, { + name: "parses DEFINITION keyword", + data: "DEFINITION test", + wantReachedEOF: true, + want: Entry{Definition: "test"}, + }, { + name: "parses multiline DEFINITION keyword", + data: "DEFINITION test\n another line\n\n yet another line", + wantReachedEOF: true, + want: Entry{Definition: "test\nanother line\n\nyet another line"}, + }, { + name: "parses repeated DEFINITION keywords", + data: "DEFINITION test\nDEFINITION another line\n\n yet another line", + wantReachedEOF: true, + want: Entry{Definition: "test\nanother line\n\nyet another line"}, + }, { + name: "parses only first entry in reader", + data: `DEFINITION test +// +DEFINITION another test`, + want: Entry{Definition: "test"}, + }, { + name: "parses VERSION keyword", + data: "VERSION ASDF.130", + want: Entry{AccessionVersion: 130}, + wantReachedEOF: true, + }, { + name: "parses ACCESSION keyword", + data: "ACCESSION HIIII", + want: Entry{Accession: "HIIII"}, + wantReachedEOF: true, + }, { + name: "parses simple DBLINK keyword", + data: "DBLINK SomeDB: someReference", + want: Entry{DatabaseLinks: map[string][]string{"SomeDB": {"someReference"}}}, + wantReachedEOF: true, + }, { + name: "parses DBLINK keyword with multiple refs for a single DB", + data: "DBLINK SomeDB: someReference, anotherReference", + want: Entry{DatabaseLinks: map[string][]string{"SomeDB": {"someReference", "anotherReference"}}}, + wantReachedEOF: true, + }, { + name: "parses DBLINK keyword with multiple DBs", + data: "DBLINK SomeDB: someReference, anotherReference\n AnotherDB: yetAnotherReference", + want: Entry{DatabaseLinks: map[string][]string{"SomeDB": {"someReference", "anotherReference"}, "AnotherDB": {"yetAnotherReference"}}}, + wantReachedEOF: true, + }, { + name: "parses KEYWORDS keyword", + data: "KEYWORDS first; second; last.", + want: Entry{Keywords: []string{"first", "second", "last"}}, + wantReachedEOF: true, + }, { + name: "KEYWORDS line must end with period", + data: "KEYWORDS first; second; last", + wantErr: true, + }, { + name: "parses SOURCE keyword", + data: `SOURCE common name (more info) molecule type. + ORGANISM Scientific name. + Taxon 1; Taxon 2; Taxon 3; + Taxon 4; Taxon 5. +`, + want: Entry{Source: Source{ + Name: "common name (more info) molecule type", + ScientificName: "Scientific name", + Taxonomy: []string{"Taxon 1", "Taxon 2", "Taxon 3", "Taxon 4", "Taxon 5"}, + }}, + wantReachedEOF: true, + }} + + for _, tc := range testcases { + t.Run(tc.name, func(t *testing.T) { + p := NewParser(strings.NewReader(tc.data)) + got, gotReachedEOF, err := p.parseEntry() + + if gotErr := err != nil; gotErr != tc.wantErr { + t.Fatalf("incorrect error returned from parseEntry, wantErr: %v, err: %v", tc.wantErr, err) + } + + if gotReachedEOF != tc.wantReachedEOF { + t.Fatalf("incorrect reached EOF status returned from parseEntry, want: %v, got: %v", tc.wantReachedEOF, gotReachedEOF) + } + + if diff := cmp.Diff(tc.want, got); !tc.wantErr && diff != "" { + t.Fatalf("parseEntry returned incorrect Entry, (-want, +got): %v", diff) + } + + }) + } +} diff --git a/io/genbank/types.go b/io/genbank/types.go new file mode 100644 index 00000000..4f44c9f5 --- /dev/null +++ b/io/genbank/types.go @@ -0,0 +1,173 @@ +/* +Package genbank provides genbank parsers and writers. + +GenBank is a flat text file format developed in the 1980s to annotate genetic +sequences, and has since become the standard for sharing annotated genetic +sequences. A full specification of the GenBank flatfile format can be found at +https://www.ncbi.nlm.nih.gov/genbank/release/current/. + +This package provides a parser and writer to convert between the GenBank file +format and the more general GenBank struct. +*/ +package genbank + +import "time" + +// Genbank is the main struct for the Genbank file format. +type Genbank struct { + Header Header + Entries []Entry +} + +// Header holds the information at the beginning of all +// Genbank files (see Genbank spec section 3.1) +type Header struct { + FileName string + Title string + Date time.Time + MajorRelease int + MinorRelease int + NumEntries int + NumBases int + NumSequences int +} + +// Strandedness represents whether a sequence is +// single-, double-, or mixed-stranded (see Genbank +// spec section 3.4.4.2). +type Strandedness string + +const ( + DoubleStranded Strandedness = "ds" + SingleStranded Strandedness = "ss" + MixedStranded Strandedness = "ms" + Unknown Strandedness = "unknown" +) + +// MoleculeType represents what kind of molecule a sequence +// consists of (see Genbank spec section 3.4.4.2). +type MoleculeType string + +const ( + NucleicAcid MoleculeType = "NA" + DNA MoleculeType = "DNA" + RNA MoleculeType = "RNA" + TransferRNA MoleculeType = "tRNA" + RibosomalRNA MoleculeType = "rRNA" + MessengerRNA MoleculeType = "mRNA" + SmallNuclearRNA MoleculeType = "uRNA" + ViralCRNA MoleculeType = "cRNA" +) + +// MoleculeType represents the topology of a molecule +// (see Genbank spec section 3.4.4.2). +type MoleculeToplogy string + +const ( + Circular MoleculeToplogy = "circular" + Linear MoleculeToplogy = "linear" +) + +// DivisionCode represents which division of Genbank an entry +// belongs to (see Genbank spec section 3.4.4.2). +type DivisionCode string + +const ( + PrimateDivision DivisionCode = "PRI" + RodentDivision DivisionCode = "ROD" + OtherMammalianDivision DivisionCode = "MAM" + OtherVertebrateDivision DivisionCode = "VRT" + InvertebrateDivision DivisionCode = "INV" + PlantFungalAlgalDivision DivisionCode = "PLN" + BacterialDivision DivisionCode = "BCT" + ViralDivision DivisionCode = "VRL" + BacteriophageDivision DivisionCode = "PHG" + SyntheticDivision DivisionCode = "SYN" + UnnanotatedDivision DivisionCode = "UNA" + ExpressedSequenceTagDivision DivisionCode = "EST" + PatentDivision DivisionCode = "PAT" + SequenceTaggedSiteDivision DivisionCode = "STS" + GenomeSurveyDivision DivisionCode = "GSS" + HighThroughputGenomicDivision DivisionCode = "HTG" + HighThroughputCDNADivision DivisionCode = "HTC" + EnvironmentalSamplingDivision DivisionCode = "ENV" + ConstructedDivision DivisionCode = "CON" + TranscriptomeShotgunDivision DivisionCode = "TSA" +) + +type Range struct { + From uint + To uint +} + +// Reference is a reference to a publication (see +// Genbank spec section 3.4.11). +type Reference struct { + Number int + BaseRange Range + Authors string + Title string + Medline string + PubMed string + PageNumbers string + Remark string + Year string +} + +type Book struct { + Reference + Editors []string + BookTitle string + PublisherName string + PublisherLocation string +} + +type Article struct { + Reference + Journal string + Volume string +} + +// Source describes the source of an entry (see +// Genbank spec section 3.4.10). +type Source struct { + Name string + ScientificName string + Taxonomy []string +} + +// A Feature represents genes, gene products, and +// other areas of significance in an entry (see +// Genbank spec section 3.4.12). +type Feature struct { + Key string + Location Location + Qualifiers map[string][]string +} + +type Location string + +// Entry represents a single entry in the Genbank file +// (see Genbank spec section 3.1). +type Entry struct { + Name string + Length int + Strandedness Strandedness + MoleculeType MoleculeType + MoleculeToplogy MoleculeToplogy + DivisionCode DivisionCode + UpdateDate time.Time + Definition string + Accession string + AccessionVersion int + CrossReferences map[string][]string + References []Reference + Source Source + Features []Feature + Origin string + Sequence string + DatabaseLinks map[string][]string + Keywords []string + Segment int + TotalSegments int +} diff --git a/io/genbank/write.go b/io/genbank/write.go new file mode 100644 index 00000000..044234b6 --- /dev/null +++ b/io/genbank/write.go @@ -0,0 +1,56 @@ +package genbank + +import ( + "fmt" + "strings" +) + +const ( + dbNameIdx = 22 + dateIdx = 27 + titleIdx = 25 +) + +// String converts a Header to its Genbank flatfile representation. +func (h Header) String() string { + builder := strings.Builder{} + + // Line 1: File name and database name + builder.WriteString(h.FileName) + builder.WriteString(strings.Repeat(" ", dbNameIdx-len(h.FileName))) + builder.WriteString("Genetic Sequence Data Bank") + builder.WriteRune('\n') + + // Line 2: Date + builder.WriteString(strings.Repeat(" ", dateIdx)) + builder.WriteString(h.Date.Format(headerDateLayout)) + builder.WriteRune('\n') + + // Line 3: Blank + builder.WriteRune('\n') + + // Line 4: Genbank release number + builder.WriteString(fmt.Sprintf(" NCBI-GenBank Flat File Release %v.%v", h.MajorRelease, h.MinorRelease)) + builder.WriteRune('\n') + + // Line 5: Blank + builder.WriteRune('\n') + + // Line 6: File title + builder.WriteString(strings.Repeat(" ", titleIdx)) + builder.WriteString(h.Title) + builder.WriteRune('\n') + + // Line 7: Blank + builder.WriteRune('\n') + + // Line 8: File statistics + builder.WriteString(fmt.Sprintf( + "%8v loci, %11v bases, from %8v reported sequences", + h.NumEntries, + h.NumBases, + h.NumSequences)) + builder.WriteRune('\n') + + return builder.String() +}