Skip to content

Commit

Permalink
Version 0.1.3.14. Ability to parse modifications in Fasta. (#49)
Browse files Browse the repository at this point in the history
* Version 0.1.3.14. Ability to parse modifications in Fasta.

* Fixes

* Fixes again
  • Loading branch information
vks4git authored Feb 10, 2021
1 parent a63d943 commit 939918a
Show file tree
Hide file tree
Showing 8 changed files with 372 additions and 41 deletions.
4 changes: 4 additions & 0 deletions ChangeLog.md
Original file line number Diff line number Diff line change
Expand Up @@ -2,6 +2,10 @@

## [Unreleased]

## [0.1.3.14] - 2021-02-10
### Changed
- Added the ability to parse modifications in FASTA files.

## [0.1.3.13] - 2021-01-02
### Changed
- Allow `QuickCheck-2.14`.
Expand Down
2 changes: 1 addition & 1 deletion package.yaml
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
name: cobot-io
version: 0.1.3.13
version: 0.1.3.14
github: "biocad/cobot-io"
license: BSD3
category: Bio
Expand Down
20 changes: 11 additions & 9 deletions src/Bio/FASTA.hs
Original file line number Diff line number Diff line change
Expand Up @@ -2,25 +2,27 @@

module Bio.FASTA
( module T
, WritableFastaToken (..)
, fromFile
, toFile
, fastaP
, fastaPGeneric
) where

import Bio.FASTA.Parser
import Bio.FASTA.Type as T
import Bio.FASTA.Writer (fastaToText)
import Control.Monad.IO.Class (MonadIO, liftIO)
import Data.Attoparsec.Text (parseOnly)
import Data.Text.IO (readFile, writeFile)
import Control.Monad.IO.Class (MonadIO, liftIO)
import Data.Attoparsec.Text (parseOnly)
import Data.Text.IO (readFile, writeFile)
#if !MIN_VERSION_base(4,13,0)
import Control.Monad.Fail (MonadFail (..))
import Prelude hiding (fail, readFile, writeFile)
import Control.Monad.Fail (MonadFail (..))
import Prelude hiding (fail, readFile, writeFile)
#else
import Prelude hiding (readFile, writeFile)
import Prelude hiding (readFile, writeFile)
#endif

import Bio.FASTA.Parser
import Bio.FASTA.Type as T
import Bio.FASTA.Writer (WritableFastaToken (..), fastaToText)

-- | Reads 'FastaSequence' from given file.
--
fromFile :: (MonadFail m, MonadIO m) => FilePath -> m (Fasta Char)
Expand Down
127 changes: 118 additions & 9 deletions src/Bio/FASTA/Parser.hs
Original file line number Diff line number Diff line change
@@ -1,35 +1,46 @@
{-# OPTIONS_GHC -fno-warn-orphans #-}

module Bio.FASTA.Parser
( fastaP
, fastaPGeneric
) where

import Bio.FASTA.Type (Fasta, FastaItem (..))
import Bio.FASTA.Type (Fasta, FastaItem (..), ModItem (..), Modification (..),
ParsableFastaToken (..))
import Bio.Sequence (BareSequence, bareSequence)
import Control.Applicative ((<|>))
import Data.Attoparsec.Text (Parser, char, choice, endOfInput, endOfLine, many', many1', satisfy,
skipWhile, takeWhile)
import Data.Char (isLetter, isSpace)
skipWhile, string, takeWhile, try)
import Data.Char (isAlphaNum, isLetter, isSpace)
import Data.Text (Text, strip)
import Prelude hiding (takeWhile)

-- | Plain characters: a token is any single character accepted by the
-- supplied predicate.
instance ParsableFastaToken Char where
  parseToken predicate = satisfy predicate

-- | Tokens that may be modifications: a bracketed modification
-- ('modificationP') is tried first; otherwise a single character accepted by
-- the predicate is wrapped in 'Letter'.
instance ParsableFastaToken ModItem where
  parseToken predicate = modified <|> plainLetter
    where
      modified    = fmap Mod modificationP
      plainLetter = fmap Letter (satisfy predicate)


-- | Parser of a .fasta file.
--
-- Delegates to 'fastaPGeneric' with 'isLetter' as the character predicate
-- that is handed down to the sequence-token parser.
--
-- NOTE(review): two type signatures appear below; this looks like the pre-
-- and post-change lines of a rendered diff — confirm against the real file.
fastaP :: Parser (Fasta Char)
fastaP :: ParsableFastaToken a => Parser (Fasta a)
fastaP = fastaPGeneric isLetter

-- | Generic FASTA parser: parses zero or more records ('item'), passing the
-- given character predicate down to the sequence-token parser.
--
-- NOTE(review): two type signatures appear below; this looks like the pre-
-- and post-change lines of a rendered diff — confirm against the real file.
fastaPGeneric :: (Char -> Bool) -> Parser (Fasta Char)
fastaPGeneric :: ParsableFastaToken a => (Char -> Bool) -> Parser (Fasta a)
fastaPGeneric = many' . item

-- | Parses a single FASTA record: the header line ('seqName') followed by
-- its sequence ('fastaSeq'), then skips any whitespace that follows.
--
-- NOTE(review): two type signatures appear below; this looks like the pre-
-- and post-change lines of a rendered diff — confirm against the real file.
item :: (Char -> Bool) -> Parser (FastaItem Char)
item :: ParsableFastaToken a => (Char -> Bool) -> Parser (FastaItem a)
item predicate = (FastaItem <$> seqName <*> fastaSeq predicate) <* skipWhile isSpace

-- | Parses a header line: a leading @>@, optional tabs, then everything up
-- to the end of the line. The captured name is returned with surrounding
-- whitespace stripped.
seqName :: Parser Text
seqName = do
  _ <- char '>'
  tabs
  rawName <- takeWhile (\c -> c `notElem` ['\n', '\r'])
  tabs
  eol
  pure (strip rawName)

-- | Parses zero or more sequence lines ('line') and concatenates their
-- tokens into a single sequence via 'bareSequence'.
--
-- NOTE(review): two type signatures appear below; this looks like the pre-
-- and post-change lines of a rendered diff — confirm against the real file.
fastaSeq :: (Char -> Bool) -> Parser (BareSequence Char)
fastaSeq :: ParsableFastaToken a => (Char -> Bool) -> Parser (BareSequence a)
fastaSeq predicate = bareSequence . mconcat <$> many' (line predicate)

-- | Parses one line of a sequence: one or more groups of tokens, each group
-- optionally followed by spaces, terminated by 'eol'. The tokens of all
-- groups are concatenated into one list.
--
-- NOTE(review): two signatures and two bodies appear below (one using
-- @satisfy@, one using @parseToken@); this looks like the pre- and
-- post-change lines of a rendered diff — confirm against the real file.
line :: (Char -> Bool) -> Parser String
line predicate = concat <$> many1' (many1' (satisfy predicate) <* many' (char ' ')) <* eol
line :: ParsableFastaToken a => (Char -> Bool) -> Parser [a]
line predicate = concat <$> many1' (many1' (parseToken predicate) <* many' (char ' ')) <* eol

-- | End of a line: optional tabs followed by either one or more line
-- endings ('slashN') or the end of the input.
eol :: Parser ()
eol = do
  tabs
  choice [slashN, endOfInput]
Expand All @@ -39,3 +50,101 @@ slashN = () <$ many1' endOfLine

-- | Consumes zero or more tab characters and discards them.
tabs :: Parser ()
tabs = many' (char '\t') *> pure ()

-- | Parses a bracketed modification tag such as @[A*]@ or @[5Bio]@.
--
-- The known tags are tried one after another in the order listed below; if
-- none of them matches, 'unknownP' is used as the final alternative.
modificationP :: Parser Modification
modificationP = foldr orElse unknownP knownTags
  where
    -- Try one literal tag; on failure continue with the remaining parsers.
    orElse :: (Text, Modification) -> Parser Modification -> Parser Modification
    orElse (tag, modification) rest = (modification <$ string tag) <|> rest

    -- Literal tag text paired with the constructor it denotes.
    knownTags :: [(Text, Modification)]
    knownTags =
      [ ("[A*]",       Mod_A_Star)
      , ("[C*]",       Mod_C_Star)
      , ("[G*]",       Mod_G_Star)
      , ("[T*]",       Mod_T_Star)
      , ("[rA]",       Mod_rA)
      , ("[rC]",       Mod_rC)
      , ("[rG]",       Mod_rG)
      , ("[rU]",       Mod_rU)
      , ("[+A]",       Mod_Plus_A)
      , ("[+C]",       Mod_Plus_C)
      , ("[+G]",       Mod_Plus_G)
      , ("[+T]",       Mod_Plus_T)
      , ("[rAf]",      Mod_rAf)
      , ("[rCf]",      Mod_rCf)
      , ("[rGf]",      Mod_rGf)
      , ("[rUf]",      Mod_rUf)
      , ("[mA]",       Mod_mA)
      , ("[mC]",       Mod_mC)
      , ("[mG]",       Mod_mG)
      , ("[mU]",       Mod_mU)
      , ("[mA*]",      Mod_mA_Star)
      , ("[mC*]",      Mod_mC_Star)
      , ("[mG*]",      Mod_mG_Star)
      , ("[mU*]",      Mod_mU_Star)
      , ("[dU]",       Mod_dU)
      , ("[5Bio]",     Mod_5Bio)
      , ("[iBio]",     Mod_iBio)
      , ("[56FAM]",    Mod_56FAM)
      , ("[36FAM]",    Mod_36FAM)
      , ("[5HEX]",     Mod_5HEX)
      , ("[5TMR]",     Mod_5TMR)
      , ("[3BHQ1]",    Mod_3BHQ1)
      , ("[3BHQ2]",    Mod_3BHQ2)
      , ("[5NH2]",     Mod_5NH2)
      , ("[3NH2]",     Mod_3NH2)
      , ("[5PO4]",     Mod_5PO4)
      , ("[3PO4]",     Mod_3PO4)
      , ("[3BioTEG]",  Mod_3BioTEG)
      , ("[C12]",      Mod_C12)
      , ("[NHSdT]",    Mod_NHSdT)
      , ("[5Mal]",     Mod_5Mal)
      , ("[5thio]",    Mod_5thio)
      , ("[3thio]",    Mod_3thio)
      , ("[3azide]",   Mod_3azide)
      , ("[3alkine]",  Mod_3alkine)
      , ("[5CholTEG]", Mod_5CholTEG)
      , ("[3CholTEG]", Mod_3CholTEG)
      , ("[5C10]",     Mod_5C10)
      , ("[5Alk]",     Mod_5Alk)
      , ("[GC]",       Mod_GC)
      , ("[GT]",       Mod_GT)
      , ("[AT]",       Mod_AT)
      , ("[TG]",       Mod_TG)
      , ("[AC]",       Mod_AC)
      , ("[CC]",       Mod_CC)
      , ("[AA]",       Mod_AA)
      , ("[TC]",       Mod_TC)
      , ("[TT]",       Mod_TT)
      , ("[CG]",       Mod_CG)
      , ("[GG]",       Mod_GG)
      , ("[AG]",       Mod_AG)
      , ("[GA]",       Mod_GA)
      , ("[CA]",       Mod_CA)
      , ("[CT]",       Mod_CT)
      , ("[TA]",       Mod_TA)
      , ("[AAA]",      Mod_AAA)
      , ("[AAC]",      Mod_AAC)
      , ("[ACT]",      Mod_ACT)
      , ("[ATC]",      Mod_ATC)
      , ("[ATG]",      Mod_ATG)
      , ("[CAG]",      Mod_CAG)
      , ("[AGA]",      Mod_AGA)
      , ("[CAT]",      Mod_CAT)
      , ("[CCG]",      Mod_CCG)
      , ("[CGT]",      Mod_CGT)
      , ("[CTG]",      Mod_CTG)
      , ("[GAA]",      Mod_GAA)
      , ("[GAC]",      Mod_GAC)
      , ("[GCT]",      Mod_GCT)
      , ("[GGT]",      Mod_GGT)
      , ("[GTT]",      Mod_GTT)
      , ("[TAC]",      Mod_TAC)
      , ("[TCT]",      Mod_TCT)
      , ("[TGC]",      Mod_TGC)
      , ("[TGG]",      Mod_TGG)
      , ("[TTC]",      Mod_TTC)
      , ("[TTT]",      Mod_TTT)
      ]

-- | Fallback for bracketed tags that are not in the known list: accepts
-- @[@, one or more characters that are alphanumeric or one of @+ - * _@,
-- then @]@, and returns the whole tag (brackets included) wrapped in
-- 'Unknown'.
unknownP :: Parser Modification
unknownP = try $ Unknown . bracketed <$> (char '[' *> many1' (satisfy isTagChar) <* char ']')
  where
    -- Characters allowed between the brackets.
    isTagChar :: Char -> Bool
    isTagChar c = isAlphaNum c || c `elem` ['+', '-', '*', '_']

    -- Re-attach the brackets that were consumed around the tag body.
    bracketed :: String -> String
    bracketed body = "[" <> body <> "]"
Loading

0 comments on commit 939918a

Please sign in to comment.