forked from nlnwa/gowarc
-
Notifications
You must be signed in to change notification settings - Fork 0
/
digest.go
209 lines (191 loc) · 6.31 KB
/
digest.go
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
/*
* Copyright 2021 National Library of Norway.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package gowarc
import (
"crypto/md5"
"crypto/sha1"
"crypto/sha256"
"crypto/sha512"
"encoding/base32"
"encoding/base64"
"encoding/hex"
"fmt"
"hash"
"io"
"strings"
)
// digestEncoding identifies the textual encoding of a digest value.
//
// The supported encodings are the Base16, Base32 and Base64 constants. For any
// other value, encode falls back to returning the raw sum bytes as a string.
type digestEncoding uint8
// encode renders the current sum of digest as text using the encoding d.
//
// Base16 yields upper-case hexadecimal; Base32 and Base64 use the padded
// standard alphabets. For any other encoding value the raw sum bytes are
// returned converted to a string.
func (d digestEncoding) encode(digest *digest) string {
	sum := digest.Sum(nil)
	if d == Base16 {
		return strings.ToUpper(hex.EncodeToString(sum))
	}
	if d == Base32 {
		return base32.StdEncoding.EncodeToString(sum)
	}
	if d == Base64 {
		return base64.StdEncoding.EncodeToString(sum)
	}
	return string(sum)
}
// Digest encodings understood by this package.
const (
	// unknown is the zero value; it is not one of the supported encodings.
	unknown digestEncoding = iota
	// Base16 is hexadecimal encoding (rendered in upper case by encode).
	Base16
	// Base32 is the standard, padded base32 encoding.
	Base32
	// Base64 is the standard, padded base64 encoding.
	Base64
)
// detectEncoding guesses the encoding of digest from its length.
//
// algorithm must be one of "md5", "sha1", "sha256" or "sha512" (lower case).
// The length of digest is compared with the Base16, padded Base32 and padded
// Base64 lengths of that algorithm's sum. defaultEncoding is returned when no
// length matches, which includes an empty digest, and when the algorithm is
// not recognised.
func detectEncoding(algorithm, digest string, defaultEncoding digestEncoding) digestEncoding {
	var size int
	switch algorithm {
	case "md5":
		size = md5.Size
		// For md5 the Base16 and the padded Base32 forms are both 32 characters
		// long, so the length alone is ambiguous. Only the Base32 form ends
		// with '=' padding.
		if len(digest) == 2*size {
			if strings.HasSuffix(digest, "=") {
				return Base32
			}
			return Base16
		}
	case "sha1":
		size = sha1.Size
	case "sha256":
		size = sha256.Size
	case "sha512":
		size = sha512.Size
	default:
		// Without a known sum size every expected length below would be zero,
		// and an empty digest would be misreported as Base16.
		return defaultEncoding
	}

	switch len(digest) {
	case 2 * size:
		return Base16
	case base32.StdEncoding.EncodedLen(size):
		return Base32
	case base64.StdEncoding.EncodedLen(size):
		return Base64
	}
	return defaultEncoding
}
// digest is a utility for parsing, creating and validating WARC block and payload digests.
//
// Typical usage is to create a digest from a WARC record's WARC-Block-Digest or WARC-Payload-Digest field,
// write the content to the digest (it implements io.Writer), and finally call validate to check whether the
// submitted digest value equals the computed value. For this usage create the digest with newDigestFromField.
//
// For new records a digest can be calculated by creating one with newDigest, passing the preferred algorithm.
// Then write the content to the digest and call format to get a string suitable for WARC digest fields.
//
// NOTE: newDigest constructs this struct with positional literals, so the field order must not change.
type digest struct {
// Embedded hash implementation that performs the actual computation.
hash.Hash
// name is the algorithm name; format and validate emit it as the prefix before the colon.
name string
// hash is the digest value given at construction, or the value stored by updateDigest.
hash string
// count is a byte counter advanced by Write.
count int64
// encoding selects how the computed sum is rendered as text.
encoding digestEncoding
}
// Write feeds p to the embedded hash and adds len(p) to the byte counter.
//
// The return values are those of the embedded hash's Write; the hash.Hash
// contract states that it never returns an error.
func (d *digest) Write(p []byte) (n int, err error) {
	d.count += int64(len(p))
	n, err = d.Hash.Write(p)
	return n, err
}
// Sum appends the current hash to b and returns the resulting slice.
//
// It does not change the underlying hash state. It also leaves the byte
// counter untouched: b is only the destination prefix for the sum and is not
// data fed into the hash, so it must not be counted as written content.
func (d *digest) Sum(b []byte) []byte {
	return d.Hash.Sum(b)
}
// format returns "<name>:<encoded sum>", the form expected in
// WARC-Block-Digest and WARC-Payload-Digest fields. The sum is the currently
// computed value rendered with the digest's encoding.
func (d *digest) format() string {
	value := d.encoding.encode(d)
	return fmt.Sprintf("%s:%s", d.name, value)
}
// validate checks the computed digest value against the digest string the
// digest was instantiated with. It returns nil when they are equal and an
// error naming both values otherwise.
func (d *digest) validate() error {
	computed := d.encoding.encode(d)
	if d.hash == computed {
		return nil
	}
	return fmt.Errorf("wrong digest: expected %s:%s, computed: %s:%s", d.name, d.hash, d.name, computed)
}
// updateDigest replaces the stored digest string with the currently computed
// value, rendered with the digest's encoding.
func (d *digest) updateDigest() {
	computed := d.encoding.encode(d)
	d.hash = computed
}
// newDigest creates a new digest from the value of a WARC digest-field or from scratch.
//
// digestString has the format <algorithm>[:[<digestValue>]] where algorithm is one of md5, sha1, sha256 or
// sha512, matched case-insensitively.
//
// The encoding is deduced from the length of the digestValue. When only the algorithm is submitted, or the
// digestValue has the wrong length for every supported encoding, defaultEncoding is used.
//
// An error is returned when the algorithm is missing or not supported.
func newDigest(digestString string, defaultEncoding digestEncoding) (*digest, error) {
	parts := strings.SplitN(digestString, ":", 2)
	algorithm := strings.ToLower(parts[0])
	if algorithm == "" {
		return nil, fmt.Errorf("missing algorithm")
	}

	var value string
	if len(parts) > 1 {
		value = parts[1]
	}

	encoding := detectEncoding(algorithm, value, defaultEncoding)
	if encoding < Base64 {
		// base16 and base32 encodings are case insensitive, so normalise to the
		// upper-case form that encode produces.
		value = strings.ToUpper(value)
	}

	d := &digest{name: algorithm, hash: value, encoding: encoding}
	// The empty algorithm was rejected above, so only named algorithms remain.
	switch algorithm {
	case "md5":
		d.Hash = md5.New()
	case "sha1":
		d.Hash = sha1.New()
	case "sha256":
		d.Hash = sha256.New()
	case "sha512":
		d.Hash = sha512.New()
	default:
		return nil, fmt.Errorf("unsupported digest algorithm '%s'", algorithm)
	}
	return d, nil
}
// newDigestFromField builds a digest for the given digest-field of wr.
//
// When the record's WARC header has warcDigestField, the digest is parsed from
// that field's value. Otherwise a digest is created from the default algorithm
// set in the record's options. In both cases the options' default encoding is
// passed on to newDigest.
func newDigestFromField(wr *warcRecord, warcDigestField string) (d *digest, err error) {
	if !wr.WarcHeader().Has(warcDigestField) {
		return newDigest(wr.opts.defaultDigestAlgorithm, wr.opts.defaultDigestEncoding)
	}
	return newDigest(wr.WarcHeader().Get(warcDigestField), wr.opts.defaultDigestEncoding)
}
// digestFilterReader is a reader that passes reads through to src and writes
// every chunk of data it reads to each of the digests.
type digestFilterReader struct {
// src is the underlying reader the data is read from.
src io.Reader
// digests receive a copy of all data read from src.
digests []*digest
}
// newDigestFilterReader returns a digestFilterReader that reads from src and
// updates each of the given digests with the data read.
func newDigestFilterReader(src io.Reader, digests ...*digest) *digestFilterReader {
	r := new(digestFilterReader)
	r.src = src
	r.digests = digests
	return r
}
// Read reads from the underlying source into p and writes the bytes actually
// read to every attached digest. The byte count and error from the source are
// returned unchanged.
func (d digestFilterReader) Read(p []byte) (n int, err error) {
	n, err = d.src.Read(p)
	if n <= 0 {
		return n, err
	}
	chunk := p[:n]
	for i := range d.digests {
		// Deliberately ignore the error: a digest may end up wrong, but the
		// client still gets the data it asked for.
		_, _ = d.digests[i].Write(chunk)
	}
	return n, err
}