Skip to content

Commit

Permalink
Added Fasta parser (#113)
Browse files Browse the repository at this point in the history
* Added fasta parser subpackage

* Updated for 100% test coverage

* Small update to comments

* Deprecated fasta command line options

* Removed "fasta" from fasta parser

* Fixed linter issue with bool operation
  • Loading branch information
Koeng101 authored Jun 5, 2021
1 parent dd81ed2 commit 17f27e4
Show file tree
Hide file tree
Showing 9 changed files with 251 additions and 220 deletions.
127 changes: 0 additions & 127 deletions io.go
Original file line number Diff line number Diff line change
Expand Up @@ -383,133 +383,6 @@ JSON specific IO related things end here.

/******************************************************************************
FASTA specific IO related things begin here.
******************************************************************************/

// ParseFASTA parses a Sequence struct from a FASTA file and adds appropriate pointers to the structs.
//
// Each record becomes one Feature: Description holds the header text after
// the leading '>', and SequenceLocation holds the [Start, End) byte offsets of
// that record's residues within the concatenated Sequence.Sequence string.
// Blank lines and lines starting with ';' are ignored, and a trailing "\r" is
// stripped from every line so CRLF files parse the same as LF files.
//
// Sequence data that appears before any header is kept as a record with an
// empty Description. The final record is always emitted, whether or not the
// input ends with a newline.
func ParseFASTA(file []byte) Sequence {
	var sequence Sequence
	var features []Feature
	var sequenceBuffer bytes.Buffer

	var feature Feature
	start := 0
	// pending is true once the record being accumulated has a header or
	// at least one line of sequence data.
	pending := false

	// closeRecord finalizes the record being accumulated, if there is one.
	closeRecord := func() {
		if !pending {
			return
		}
		end := sequenceBuffer.Len()
		feature.SequenceLocation.Start = start
		feature.SequenceLocation.End = end
		features = append(features, feature)

		// the next record starts where this one ended
		start = end
		feature = Feature{}
		pending = false
	}

	for _, line := range strings.Split(string(file), "\n") {
		line = strings.TrimSuffix(line, "\r")

		// skip empty lines and comments
		if len(line) == 0 || line[0] == ';' {
			continue
		}

		// a header closes the previous record and opens a new one
		if line[0] == '>' {
			closeRecord()
			feature.Description = line[1:]
			pending = true
			continue
		}

		sequenceBuffer.WriteString(line)
		pending = true
	}

	// flush the last record; there is no following header to trigger it
	closeRecord()

	sequence.Sequence = sequenceBuffer.String()

	// add features last so that internal pointer to parent sequence is accurate
	for _, parsed := range features {
		sequence.AddFeature(parsed)
	}

	return sequence
}

// BuildFASTA builds a FASTA string from a Sequence struct.
//
// Every feature with a non-empty Description is written as a ">" header line
// followed by its sequence, with a line break inserted after every 70th
// character. Features without a Description are skipped. Each feature except
// the last one in sequence.Features is followed by a blank line.
func BuildFASTA(sequence Sequence) []byte {
	const maxLineLength = 70

	var out bytes.Buffer
	lastIndex := len(sequence.Features) - 1

	for index, feature := range sequence.Features {
		// features without a description are not written out
		if feature.Description == "" {
			continue
		}

		// header line
		out.WriteString(">" + feature.Description + "\n")

		// sequence body, wrapped at maxLineLength
		for offset, residue := range feature.GetSequence() {
			out.WriteRune(residue)
			if (offset+1)%maxLineLength == 0 {
				out.WriteString("\n")
			}
		}

		// separate this record from the next one
		if index != lastIndex {
			out.WriteString("\n\n")
		}
	}

	return out.Bytes()
}

// ReadFASTA reads a Sequence struct from a FASTA file.
//
// The read error is discarded because the signature has no way to report it;
// when the file cannot be read, ParseFASTA is called with whatever bytes
// ioutil.ReadFile returned.
func ReadFASTA(path string) Sequence {
	contents, _ := ioutil.ReadFile(path)
	return ParseFASTA(contents)
}

// WriteFASTA writes a Sequence struct out to FASTA.
//
// The file is written with mode 0644. The write error is discarded because
// the signature has no way to report it.
func WriteFASTA(sequence Sequence, path string) {
	encoded := BuildFASTA(sequence)
	_ = ioutil.WriteFile(path, encoded, 0644)
}

/******************************************************************************
FASTA specific IO related things end here.
******************************************************************************/

/******************************************************************************
GBK specific IO related things begin here.
******************************************************************************/
Expand Down
80 changes: 0 additions & 80 deletions io_test.go
Original file line number Diff line number Diff line change
@@ -1,7 +1,6 @@
package poly

import (
"bytes"
"fmt"
"io/ioutil"
"os"
Expand All @@ -21,7 +20,6 @@ File is structured as so:
Gff - io tests, and benchmarks.
Gbk/gb/genbank - benchmarks.
JSON - io tests.
FASTA - fasta tests.
******************************************************************************/

Expand Down Expand Up @@ -403,84 +401,6 @@ JSON related tests end here.

/******************************************************************************
FASTA related tests begin here.
******************************************************************************/

// ExampleReadFASTA shows basic usage for ReadFASTA: load a FASTA file from
// disk and print the description of its first record.
func ExampleReadFASTA() {
	parsed := ReadFASTA("data/base.fasta")
	firstDescription := parsed.Features[0].Description
	fmt.Println(firstDescription)
	// Output: gi|5524211|gb|AAD44166.1| cytochrome b [Elephas maximus maximus]
}

// ExampleParseFASTA shows basic usage for ParseFASTA: parse FASTA bytes that
// are already in memory and print the description of the first record.
func ExampleParseFASTA() {
	// the read error is ignored; the example relies on the bundled data file
	raw, _ := ioutil.ReadFile("data/base.fasta")
	parsed := ParseFASTA(raw)
	firstDescription := parsed.Features[0].Description
	fmt.Println(firstDescription)
	// Output: gi|5524211|gb|AAD44166.1| cytochrome b [Elephas maximus maximus]
}

func ExampleBuildFASTA() {
sequence := ReadFASTA("data/base.fasta") // get example data
fasta := BuildFASTA(sequence) // build a fasta byte array
firstLine := string(bytes.Split(fasta, []byte("\n"))[0])

fmt.Println(firstLine)
// Output: >gi|5524211|gb|AAD44166.1| cytochrome b [Elephas maximus maximus]
}

// ExampleWriteFASTA shows basic usage for WriteFASTA: write a Sequence to a
// temporary file, read it back, and print the first record's description.
func ExampleWriteFASTA() {
	scratchDir, err := ioutil.TempDir("", "data-*")
	if err != nil {
		fmt.Println(err.Error())
	}
	defer os.RemoveAll(scratchDir)

	outputPath := filepath.Join(scratchDir, "base.fasta")

	// get example data and write it out again
	original := ReadFASTA("data/base.fasta")
	WriteFASTA(original, outputPath)

	// read it in again
	roundTripped := ReadFASTA(outputPath)

	fmt.Println(roundTripped.Features[0].Description)
	// Output: gi|5524211|gb|AAD44166.1| cytochrome b [Elephas maximus maximus]
}

// TestFASTAIO checks that a FASTA file survives a read/write/read round trip:
// the Sequence parsed from the rewritten file must equal the Sequence parsed
// from the original, ignoring the ParentSequence field of each Feature.
func TestFASTAIO(t *testing.T) {
	tmpDataDir, err := ioutil.TempDir("", "data-*")
	if err != nil {
		// Without a temp dir the output path would resolve to the current
		// working directory, so stop here instead of continuing.
		t.Fatal(err)
	}
	defer os.RemoveAll(tmpDataDir)

	inputFilename := "data/base.fasta"
	tmpFASTAFilePath := filepath.Join(tmpDataDir, "test.fasta")

	// read FASTA file
	testSequence := ReadFASTA(inputFilename)

	// write FASTA file
	WriteFASTA(testSequence, tmpFASTAFilePath)

	// read back and diff
	readTestSequence := ReadFASTA(tmpFASTAFilePath)

	if diff := cmp.Diff(testSequence, readTestSequence, cmpopts.IgnoreFields(Feature{}, "ParentSequence")); diff != "" {
		t.Errorf(" mismatch (-want +got):\n%s", diff)
	}
}

/******************************************************************************
FASTA related tests end here.
******************************************************************************/

/******************************************************************************
GbkMulti/GbkFlat related tests begin here.
******************************************************************************/
Expand Down
File renamed without changes.
Binary file added parsers/fasta/data/uniprot_1mb_test.fasta.gz
Binary file not shown.
Loading

0 comments on commit 17f27e4

Please sign in to comment.