-
Notifications
You must be signed in to change notification settings - Fork 0
/
genomefile.go
114 lines (102 loc) · 2.29 KB
/
genomefile.go
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
// read genome files
package ggd_utils
import (
"bytes"
"fmt"
"io"
"log"
"regexp"
"strconv"
"strings"
"github.com/brentp/xopen"
)
// GenomeFile has the Lengths and Orders of the chromosomes.
type GenomeFile struct {
Lengths map[string]int
Order map[string]int
path string
ReMap map[string]string
}
// Less checks if one chromosome occurs before the other.
func (g *GenomeFile) Less(a, b string) bool {
return g.Order[a] <= g.Order[b]
}
func readChromosomMappings(fname string) map[string]string {
if fname == "" {
return nil
}
rdr, err := xopen.Ropen(fname)
result := make(map[string]string)
if err != nil {
log.Fatalf("[gsort] unable to open chromosome maping file: %s", fname)
}
warned := false
for {
line, err := rdr.ReadString('\n')
if len(line) > 0 {
toks := strings.Split(strings.TrimSpace(line), "\t")
if len(toks) == 1 {
toks = append(toks, "[unknown]"+toks[0])
if !warned {
log.Printf("[gsort] warning unmappable chromosome: %s.", toks[0])
warned = true
}
}
result[toks[0]] = toks[1]
}
if err == io.EOF {
break
}
if err != nil {
log.Printf("[gsort] error reading chromosome maping file: %s. %s", fname, err)
}
}
return result
}
// ReadGenomeFile returns a GenomeFile struct
func ReadGenomeFile(path string, chromsomeMappings string) (*GenomeFile, error) {
rdr, err := xopen.Ropen(path)
if err != nil {
return nil, err
}
gf := &GenomeFile{path: path, Lengths: make(map[string]int, 50), Order: make(map[string]int, 50)}
defer rdr.Close()
gf.ReMap = readChromosomMappings(chromsomeMappings)
space := regexp.MustCompile("\\s+")
// allow us to bypass a header. found indicates when we have a usable line.
found := false
for {
line, err := rdr.ReadBytes('\n')
line = bytes.TrimSpace(line)
if len(line) > 0 {
if line[0] == '#' {
continue
}
}
if err == io.EOF {
break
}
if err != nil {
return nil, err
}
if len(line) == 0 {
continue
}
toks := space.Split(string(line), -1)
chrom := toks[0]
length, err := strconv.Atoi(toks[1])
if err != nil {
if !found {
continue
}
return nil, err
}
found = true
gf.Lengths[chrom] = length
gf.Order[chrom] = len(gf.Order)
}
if !found {
return nil, fmt.Errorf("no usable lengths found for %s\n", path)
}
return gf, nil
}