Skip to content

Commit

Permalink
claat: use crc64 checksum for image filenames
Browse files Browse the repository at this point in the history
This de-duplicates images and reduces the number of image copies on disk.
  • Loading branch information
x1ddos committed May 14, 2016
1 parent 3f1f485 commit 8ed4599
Show file tree
Hide file tree
Showing 4 changed files with 61 additions and 59 deletions.
2 changes: 1 addition & 1 deletion claat/VERSION
Original file line number Diff line number Diff line change
@@ -1 +1 @@
0.3.2
0.4.0
62 changes: 42 additions & 20 deletions claat/export.go
Original file line number Diff line number Diff line change
Expand Up @@ -16,7 +16,6 @@ package main

import (
"bytes"
"crypto/md5"
"encoding/json"
"flag"
"fmt"
Expand Down Expand Up @@ -91,21 +90,17 @@ func exportCodelab(src string) (*types.Meta, error) {
Updated: &lastmod,
}

// rewritten image urls
var imap map[string]string

dir := *output // output dir or stdout
if !isStdout(dir) {
dir = codelabDir(dir, meta)
imap = rewriteImages(clab.Steps)
// download codelab assets to disk, and rewrite image URLs
mdir := filepath.Join(dir, imgDirname)
if _, err := slurpImages(client, mdir, clab.Steps); err != nil {
return nil, err
}
}
// write codelab and its metadata to disk
if err := writeCodelab(dir, clab.Codelab, ctx); err != nil {
return nil, err
}
// slurp codelab assets to disk, if any
mdir := filepath.Join(dir, imgDirname)
return meta, downloadImages(client, mdir, imap)
return meta, writeCodelab(dir, clab.Codelab, ctx)
}

// writeCodelab stores codelab main content in ctx.Format and its metadata
Expand Down Expand Up @@ -151,20 +146,47 @@ func writeCodelab(dir string, clab *types.Codelab, ctx *types.Context) error {
return err
}

// rewriteImages returns a mapping of local codelab asset file
// to its original URL.
// The local filename is the MD5 hash of the original URL.
func rewriteImages(steps []*types.Step) map[string]string {
var imap = make(map[string]string)
// slurpImages downloads every image referenced by steps into dir, using one
// goroutine per image, and creates dir first if it does not exist.
// When a download succeeds, the image node's Src is rewritten to point at the
// stored file under imgDirname.
// It returns a mapping of local file name to original image URL, together
// with the first error reported by any of the downloads.
func slurpImages(client *http.Client, dir string, steps []*types.Step) (map[string]string, error) {
// make sure img dir exists
if err := os.MkdirAll(dir, 0755); err != nil {
return nil, err
}

// res carries the outcome of a single image download.
type res struct {
url, file string
err error
}

// The receive loop further down reads exactly count results, one per
// goroutine started here, so every send has completed before the
// deferred close runs.
ch := make(chan *res, 100)
defer close(ch)
var count int
for _, st := range steps {
nodes := imageNodes(st.Content.Nodes)
count += len(nodes)
for _, n := range nodes {
file := fmt.Sprintf("%x.png", md5.Sum([]byte(n.Src)))
imap[file] = n.Src
n.Src = filepath.Join(imgDirname, file)
go func(n *types.ImageNode) {
// keep the original URL: Src is overwritten below on success
url := n.Src
file, err := slurpBytes(client, dir, url, 5)
if err == nil {
n.Src = filepath.Join(imgDirname, file)
}
ch <- &res{url, file, err}
}(n)
}
}
return imap

var err error
imap := make(map[string]string, count)
for i := 0; i < count; i++ {
r := <-ch
// NOTE(review): failed downloads are recorded too, keyed by whatever
// file name slurpBytes returned alongside the error — confirm intended.
imap[r.file] = r.url
if r.err != nil && err == nil {
// record first error
err = fmt.Errorf("%s => %s: %v", r.url, r.file, r.err)
}
}

return imap, err
}

// imageNodes filters out everything except types.NodeImage nodes, recursively.
Expand Down
39 changes: 9 additions & 30 deletions claat/fetch.go
Original file line number Diff line number Diff line change
Expand Up @@ -17,6 +17,7 @@ package main
import (
"encoding/json"
"fmt"
"hash/crc64"
"io"
"io/ioutil"
"math"
Expand Down Expand Up @@ -237,44 +238,22 @@ func fetchDriveFile(id string, nometa bool) (*resource, error) {
}, nil
}

// downloadImages concurrently fetches every image listed in imap and stores
// each of them inside dir, creating dir beforehand if needed.
// The imap argument maps a local file name to the URL the image is fetched from.
// It returns nil when imap is empty, otherwise the first error received
// from any of the downloads.
func downloadImages(client *http.Client, dir string, imap map[string]string) error {
	total := len(imap)
	if total == 0 {
		return nil
	}
	// the target directory has to exist before any download writes to it
	if err := os.MkdirAll(dir, 0755); err != nil {
		return err
	}

	// one slot per download, so no goroutine blocks on send
	// even when this function returns early on an error
	results := make(chan error, total)
	for file, src := range imap {
		go func(dst, u string) {
			results <- slurpBytes(client, dst, u, 5)
		}(filepath.Join(dir, file), src)
	}
	for i := 0; i < total; i++ {
		if err := <-results; err != nil {
			return err
		}
	}
	return nil
}
var crcTable = crc64.MakeTable(crc64.ECMA)

// slurpBytes fetches a resource from url using retryGet, retrying the fetch
// at most n times, and writes the response body to a file inside dir.
// The file is named after the CRC-64 checksum of the body, formatted as
// "<hex>.png", so identical content always maps to the same file name.
// It returns that file name. On a fetch or read error the returned name is
// empty; a write error is returned together with the computed name.
func slurpBytes(client *http.Client, dst, url string, n int) error {
func slurpBytes(client *http.Client, dir, url string, n int) (string, error) {
res, err := retryGet(client, url, n)
if err != nil {
return err
return "", err
}
defer res.Body.Close()
b, err := ioutil.ReadAll(res.Body)
if err != nil {
return err
return "", err
}
return ioutil.WriteFile(dst, b, 0644)
// the file name is derived from the content, not from the URL
crc := crc64.Checksum(b, crcTable)
file := fmt.Sprintf("%x.png", crc)
dst := filepath.Join(dir, file)
return file, ioutil.WriteFile(dst, b, 0644)
}

// retryGet tries to GET specified url up to n times.
Expand Down
17 changes: 9 additions & 8 deletions claat/update.go
Original file line number Diff line number Diff line change
Expand Up @@ -93,24 +93,25 @@ func updateCodelab(dir string) (*types.Meta, error) {
updated := types.ContextTime(clab.mod)
meta.Context.Updated = &updated

// update image references before writing codelab
imgmap := rewriteImages(clab.Steps)
basedir := filepath.Join(dir, "..")
newdir := codelabDir(basedir, &clab.Meta)
if err := writeCodelab(newdir, clab.Codelab, &meta.Context); err != nil {
return nil, err
}
imgdir := filepath.Join(newdir, imgDirname)

// slurp codelab assets to disk
// slurp codelab assets to disk and rewrite image URLs
var client *http.Client
if clab.typ == srcGoogleDoc {
client, err = driveClient()
if err != nil {
return nil, err
}
}
imgdir := filepath.Join(newdir, imgDirname)
if err := downloadImages(client, imgdir, imgmap); err != nil {
imgmap, err := slurpImages(client, imgdir, clab.Steps)
if err != nil {
return nil, err
}

// write codelab and its metadata
if err := writeCodelab(newdir, clab.Codelab, &meta.Context); err != nil {
return nil, err
}

Expand Down

0 comments on commit 8ed4599

Please sign in to comment.