Skip to content

Commit

Permalink
filelist: only scan parts of bucket when listing files
Browse files Browse the repository at this point in the history
  • Loading branch information
adamdecaf committed Mar 29, 2024
1 parent 280944f commit 07d5087
Show file tree
Hide file tree
Showing 4 changed files with 154 additions and 6 deletions.
4 changes: 2 additions & 2 deletions go.mod
Original file line number Diff line number Diff line change
Expand Up @@ -5,21 +5,22 @@ module github.com/moov-io/ach-web-viewer
go 1.20

require (
cloud.google.com/go/storage v1.39.1
github.com/gorilla/mux v1.8.1
github.com/markbates/pkger v0.17.1
github.com/moov-io/ach v1.36.1
github.com/moov-io/base v0.48.5
github.com/moov-io/cryptfs v0.7.1
github.com/stretchr/testify v1.9.0
gocloud.dev v0.37.0
golang.org/x/sync v0.6.0
)

require (
cloud.google.com/go v0.112.1 // indirect
cloud.google.com/go/compute v1.25.0 // indirect
cloud.google.com/go/compute/metadata v0.2.3 // indirect
cloud.google.com/go/iam v1.1.6 // indirect
cloud.google.com/go/storage v1.39.1 // indirect
github.com/ProtonMail/go-crypto v0.0.0-20230828082145-3c4c8a2d2371 // indirect
github.com/beorn7/perks v1.0.1 // indirect
github.com/cenkalti/backoff/v3 v3.2.2 // indirect
Expand Down Expand Up @@ -83,7 +84,6 @@ require (
golang.org/x/exp v0.0.0-20231206192017-f3f8817b8deb // indirect
golang.org/x/net v0.22.0 // indirect
golang.org/x/oauth2 v0.18.0 // indirect
golang.org/x/sync v0.6.0 // indirect
golang.org/x/sys v0.18.0 // indirect
golang.org/x/text v0.14.0 // indirect
golang.org/x/time v0.5.0 // indirect
Expand Down
76 changes: 72 additions & 4 deletions pkg/filelist/bucket.go
Original file line number Diff line number Diff line change
Expand Up @@ -9,10 +9,13 @@ import (
"path/filepath"

"github.com/moov-io/ach-web-viewer/pkg/service"
"github.com/moov-io/ach-web-viewer/pkg/yyyymmdd"
"github.com/moov-io/cryptfs"

"cloud.google.com/go/storage"
"gocloud.dev/blob"
_ "gocloud.dev/blob/gcsblob"
"golang.org/x/sync/errgroup"
)

type bucketLister struct {
Expand Down Expand Up @@ -71,9 +74,7 @@ func (ls *bucketLister) GetFiles(opts ListOpts) (Files, error) {
SourceType: "Bucket",
}
for i := range ls.paths {
files, err := ls.listFiles(opts, ls.buck.List(&blob.ListOptions{
Prefix: ls.paths[i],
}))
files, err := ls.listFiles(opts, ls.paths[i])
if err != nil {
return out, fmt.Errorf("error reading %s bucket path: %v", ls.paths[i], err)
}
Expand Down Expand Up @@ -122,7 +123,74 @@ func (ls *bucketLister) maybeDecrypt(r io.Reader) ([]byte, error) {
return initial, err
}

func (ls *bucketLister) listFiles(opts ListOpts, cur *blob.ListIterator) ([]File, error) {
// listFiles lists the files under pathPrefix, picking a listing strategy
// based on what the underlying bucket can be converted to.
//
// When the bucket exposes a *storage.Client through As, the GCS-specific
// lister is used; otherwise the generic CDK lister is used.
func (ls *bucketLister) listFiles(opts ListOpts, pathPrefix string) ([]File, error) {
	// The client value itself is not used; a successful As only signals that
	// the bucket is backed by Google Cloud Storage.
	var client *storage.Client
	if !ls.buck.As(&client) {
		return ls.listFilesFromCDKBucket(opts, pathPrefix)
	}
	return ls.listFilesFromGCSBucket(opts, pathPrefix)
}

// listFilesFromGCSBucket lists files under pathPrefix, issuing one listing per
// date prefix produced by yyyymmdd.Prefixes(opts.StartDate, opts.EndDate).
//
// Each listing sets storage.Query.MatchGlob to "<pathPrefix>/*/<datePrefix>*/*"
// through the BeforeList hook. The listings run concurrently and their results
// are concatenated in date-prefix order.
//
// If any listing fails, the first error reported by the errgroup is returned
// and no files are returned.
func (ls *bucketLister) listFilesFromGCSBucket(opts ListOpts, pathPrefix string) ([]File, error) {
	datePrefixes := yyyymmdd.Prefixes(opts.StartDate, opts.EndDate)

	// One slot per date prefix. Every goroutine writes only to its own slot,
	// so no locking is needed and no goroutine can be left blocked on a send.
	found := make([][]File, len(datePrefixes))

	var g errgroup.Group
	for i, datePrefix := range datePrefixes {
		// Re-declare per iteration: before Go 1.22 the range variables are
		// shared by every closure created in the loop.
		i, datePrefix := i, datePrefix

		g.Go(func() error {
			beforeList := func(as func(interface{}) bool) error {
				var q *storage.Query
				if as(&q) {
					q.MatchGlob = fmt.Sprintf("%s/*/%s*/*", pathPrefix, datePrefix)
				}
				return nil
			}

			listOptions := &blob.ListOptions{
				Prefix:     pathPrefix, // + "/",
				BeforeList: beforeList,
			}

			files, err := ls.listFilesFromCursor(opts, ls.buck.List(listOptions))
			if err != nil {
				return err
			}
			found[i] = files
			return nil
		})
	}

	// Wait returns only after every goroutine has finished, so all slots are
	// fully written before they are read below.
	if err := g.Wait(); err != nil {
		return nil, err
	}

	var out []File
	for _, files := range found {
		out = append(out, files...)
	}
	return out, nil
}

// listFilesFromCDKBucket lists files under pathPrefix through the generic
// blob bucket listing call, passing "/" as the delimiter, and hands the
// resulting iterator to listFilesFromCursor.
func (ls *bucketLister) listFilesFromCDKBucket(opts ListOpts, pathPrefix string) ([]File, error) {
	listOptions := &blob.ListOptions{
		Prefix:    pathPrefix,
		Delimiter: "/",
	}
	cursor := ls.buck.List(listOptions)
	return ls.listFilesFromCursor(opts, cursor)
}

func (ls *bucketLister) listFilesFromCursor(opts ListOpts, cur *blob.ListIterator) ([]File, error) {
var out []File
for {
obj, err := cur.Next(context.Background())
Expand Down
41 changes: 41 additions & 0 deletions pkg/yyyymmdd/prefix.go
Original file line number Diff line number Diff line change
@@ -0,0 +1,41 @@
package yyyymmdd

import (
"slices"
"time"
)

// Prefixes returns the sorted, de-duplicated date prefixes (yyyy-mm-dd format,
// normally with the final day digit removed) that together cover every
// calendar day from start through end, inclusive.
//
// Each day contributes its formatted date with the last digit chopped off, so
// a single prefix spans up to ten days. The one exception is an end date that
// falls on the 10th, 20th or 30th: it is emitted in full so the remaining days
// of that span are not matched.
//
// Only the calendar date of each argument is used; the time of day is ignored.
// A nil slice is returned when start's date is after end's date.
//
// Examples:
//
//	2023-12-23 to 2023-12-31 produces 2023-12-2, 2023-12-3
//	2023-12-23 to 2024-01-10 produces 2023-12-2, 2023-12-3, 2024-01-0, 2024-01-10
func Prefixes(start, end time.Time) []string {
	// Reduce both bounds to midnight UTC of their calendar date. Comparing raw
	// instants would drop the final day whenever start has a later time of day
	// than end, and stepping by 24h could skip a day across a DST change.
	sy, sm, sd := start.Date()
	ey, em, ed := end.Date()
	last := time.Date(ey, em, ed, 0, 0, 0, 0, time.UTC)

	var out []string

	// For now just iterate over each day and chop off the trailing day digit.
	for day := time.Date(sy, sm, sd, 0, 0, 0, 0, time.UTC); !day.After(last); day = day.AddDate(0, 0, 1) {
		ts := day.Format("2006-01-02")

		// Keep the full date only for the end date itself (year included) when
		// it is the 10th, 20th or 30th. Matching on month and day alone would
		// also fire on the same day of an earlier year and yield a prefix that
		// overlaps the chopped one for that span.
		isRoundEndDay := day.Equal(last) && last.Day()%10 == 0
		if !isRoundEndDay {
			ts = ts[:len(ts)-1] // chop off the last digit
		}

		out = append(out, ts)
	}

	slices.Sort(out)
	return slices.Compact(out)
}
39 changes: 39 additions & 0 deletions pkg/yyyymmdd/prefix_test.go
Original file line number Diff line number Diff line change
@@ -0,0 +1,39 @@
package yyyymmdd

import (
"testing"
"time"

"github.com/stretchr/testify/require"
)

// TestPrefixes checks the prefixes generated for a fixed start date
// (2023-12-23) against a series of progressively later end dates.
func TestPrefixes(t *testing.T) {
	utcDay := func(year int, month time.Month, day int) time.Time {
		return time.Date(year, month, day, 0, 0, 0, 0, time.UTC)
	}

	start := utcDay(2023, time.December, 23)

	cases := []struct {
		end  time.Time
		want []string
	}{
		{
			end:  utcDay(2023, time.December, 31),
			want: []string{"2023-12-2", "2023-12-3"},
		},
		{
			end:  utcDay(2024, time.January, 4),
			want: []string{"2023-12-2", "2023-12-3", "2024-01-0"},
		},
		{
			end:  utcDay(2024, time.January, 10),
			want: []string{"2023-12-2", "2023-12-3", "2024-01-0", "2024-01-10"},
		},
		{
			end:  utcDay(2024, time.January, 11),
			want: []string{"2023-12-2", "2023-12-3", "2024-01-0", "2024-01-1"},
		},
		{
			end:  utcDay(2024, time.January, 20),
			want: []string{"2023-12-2", "2023-12-3", "2024-01-0", "2024-01-1", "2024-01-20"},
		},
		{
			end:  utcDay(2024, time.January, 25),
			want: []string{"2023-12-2", "2023-12-3", "2024-01-0", "2024-01-1", "2024-01-2"},
		},
		{
			end:  utcDay(2024, time.January, 30),
			want: []string{"2023-12-2", "2023-12-3", "2024-01-0", "2024-01-1", "2024-01-2", "2024-01-30"},
		},
	}

	for _, tc := range cases {
		require.ElementsMatch(t, tc.want, Prefixes(start, tc.end))
	}
}

0 comments on commit 07d5087

Please sign in to comment.