-
Notifications
You must be signed in to change notification settings - Fork 1
/
multseq.go
126 lines (119 loc) · 3.16 KB
/
multseq.go
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
package gophy
import (
"bufio"
"io"
"log"
"os"
"strconv"
"strings"
)
// MSeq minimal seq struct
type MSeq struct {
NM string
SQ string
SQs []string //SQ seperated by spaces
}
// GetEmpiricalBaseFreqsMS get the empirical base freqs from the seqs
func GetEmpiricalBaseFreqsMS(seqs []MSeq, numstates int) (bf []float64) {
bf = make([]float64, numstates)
statecounts := make([]int, numstates)
total := 0
for _, j := range seqs {
for _, m := range j.SQs {
v, _ := strconv.Atoi(m)
statecounts[v]++
}
}
for _, i := range statecounts {
total += i
}
for i := range statecounts {
bf[i] = float64(statecounts[i]) / float64(total)
}
return
}
//ReadMSeqsFromFile obvious
// file should be fasta in format with starts seperated by spaces and starting at 0 going to whatever
func ReadMSeqsFromFile(filen string) (seqs []MSeq, numstates int) {
file, err := os.Open(filen)
if err != nil {
log.Fatal(err)
}
defer file.Close()
var csq string
var cnm string
first := true
reader := bufio.NewReader(file)
numstates = 0
for {
st, err := reader.ReadString('\n')
if len(st) > 0 {
if string(st[0]) == ">" {
if first == true {
first = false
cnm = strings.TrimRight(st[1:], "\n")
} else {
cs := MSeq{cnm, strings.ToUpper(csq), strings.Split(strings.ToUpper(csq), " ")}
for _, i := range cs.SQs {
sa, _ := strconv.Atoi(i)
if (sa + 1) > numstates {
numstates = sa + 1
}
}
seqs = append(seqs, cs)
csq = ""
cnm = strings.TrimRight(st[1:], "\n")
}
} else {
csq += strings.TrimRight(st, "\n")
}
}
if err == io.EOF {
break
}
if err != nil {
log.Printf("read %v bytes: %v", st, err)
break
}
}
// get the last one
cs := MSeq{cnm, strings.ToUpper(csq), strings.Split(strings.ToUpper(csq), " ")}
seqs = append(seqs, cs)
return
}
// ReadPatternsMSeqsFromFile return the seqs and patternsint
func ReadPatternsMSeqsFromFile(sfn string) (seqs map[string][]string,
patternsint map[int]float64, nsites int, bf []float64, numstates int) {
nsites = 0
seqs = map[string][]string{}
seqnames := make([]string, 0)
mseqs, numstates := ReadMSeqsFromFile(sfn)
for _, i := range mseqs {
seqs[i.NM] = i.SQs
seqnames = append(seqnames, i.NM)
nsites = len(i.SQs)
}
// get the site patternas
bf = GetEmpiricalBaseFreqsMS(mseqs, numstates)
//patterns, patternsint, gapsites, constant, uninformative, _ := GetSitePatterns(seqs, nsites, seqnames)
_, patternsint, _, _, _, _ = GetSitePatternsMS(mseqs, GetMap(numstates), nsites)
//list of sites
//fmt.Fprintln(os.Stderr, "nsites:", nsites)
//fmt.Fprintln(os.Stderr, "patterns:", len(patterns), len(patternsint))
//fmt.Fprintln(os.Stderr, "onlygaps:", len(gapsites))
//fmt.Fprintln(os.Stderr, "constant:", len(constant))
//fmt.Fprintln(os.Stderr, "uninformative:", len(uninformative))
return
}
//GetMap based on states without the MultStateModel struct
func GetMap(numStates int) (charMap map[string][]int) {
charMap = make(map[string][]int)
charMap["-"] = make([]int, numStates)
charMap["N"] = make([]int, numStates)
for i := 0; i < numStates; i++ {
charMap[strconv.Itoa(i)] = []int{i}
charMap["-"][i] = i
charMap["N"][i] = i
}
return
}