diff --git a/gateway/gateway.go b/gateway/gateway.go index dbba37f41e..770808fe4c 100644 --- a/gateway/gateway.go +++ b/gateway/gateway.go @@ -15,6 +15,7 @@ import ( "github.com/d5/tengo/v2/stdlib" lru "github.com/hashicorp/golang-lru" "github.com/kyokomi/emoji/v2" + "github.com/philippgille/gokv" "github.com/sirupsen/logrus" ) @@ -29,14 +30,17 @@ type Gateway struct { Message chan config.Message Name string Messages *lru.Cache + MessageStore gokv.Store + CanonicalStore gokv.Store logger *logrus.Entry } type BrMsgID struct { - br *bridge.Bridge - ID string + Protocol string + DestName string ChannelID string + ID string } const apiProtocol = "api" @@ -59,12 +63,41 @@ func New(rootLogger *logrus.Logger, cfg *config.Gateway, r *Router) *Gateway { if err := gw.AddConfig(cfg); err != nil { logger.Errorf("Failed to add configuration to gateway: %#v", err) } + + persistentMessageStorePath, usePersistent := gw.Config.GetString("PersistentMessageStorePath") + if usePersistent { + rootPath := fmt.Sprintf("%s/%s", persistentMessageStorePath, gw.Name) + os.MkdirAll(rootPath, os.ModePerm) + + gw.MessageStore = gw.getMessageMapStore(fmt.Sprintf("%s/Messages", rootPath)) + gw.CanonicalStore = gw.getMessageMapStore(fmt.Sprintf("%s/Canonical", rootPath)) + } + return gw } +func (gw *Gateway) SetMessageMap(canonicalMsgID string, msgIDs []*BrMsgID) { + _, usePersistent := gw.Config.GetString("PersistentMessageStorePath") + if usePersistent { + gw.setDestMessagesToStore(canonicalMsgID, msgIDs) + } else { + gw.Messages.Add(canonicalMsgID, msgIDs) + } +} + // FindCanonicalMsgID returns the ID under which a message was stored in the cache. 
func (gw *Gateway) FindCanonicalMsgID(protocol string, mID string) string { ID := protocol + " " + mID + + _, usePersistent := gw.Config.GetString("PersistentMessageStorePath") + if usePersistent { + return gw.getCanonicalMessageFromStore(ID) + } else { + return gw.getCanonicalMessageFromMemCache(ID) + } +} + +func (gw *Gateway) getCanonicalMessageFromMemCache(ID string) string { if gw.Messages.Contains(ID) { return ID } @@ -259,13 +292,26 @@ func (gw *Gateway) getDestChannel(msg *config.Message, dest bridge.Bridge) []con } func (gw *Gateway) getDestMsgID(msgID string, dest *bridge.Bridge, channel *config.ChannelInfo) string { + var destID string + + _, usePersistent := gw.Config.GetString("PersistentMessageStorePath") + if usePersistent { + destID = gw.getDestMessagesFromStore(msgID, dest, channel) + } else { + destID = gw.getDestMessageFromMemCache(msgID, dest, channel) + } + + return strings.Replace(destID, dest.Protocol+" ", "", 1) +} + +func (gw *Gateway) getDestMessageFromMemCache(msgID string, dest *bridge.Bridge, channel *config.ChannelInfo) string { if res, ok := gw.Messages.Get(msgID); ok { IDs := res.([]*BrMsgID) for _, id := range IDs { // check protocol, bridge name and channelname // for people that reuse the same bridge multiple times. 
see #342 - if dest.Protocol == id.br.Protocol && dest.Name == id.br.Name && channel.ID == id.ChannelID { - return strings.Replace(id.ID, dest.Protocol+" ", "", 1) + if dest.Protocol == id.Protocol && dest.Name == id.DestName && channel.ID == id.ChannelID { + return id.ID } } } diff --git a/gateway/handlers.go b/gateway/handlers.go index 44cefe4506..1d35b849b8 100644 --- a/gateway/handlers.go +++ b/gateway/handlers.go @@ -231,7 +231,13 @@ func (gw *Gateway) handleMessage(rmsg *config.Message, dest *bridge.Bridge) []*B if msgID == "" { continue } - brMsgIDs = append(brMsgIDs, &BrMsgID{dest, dest.Protocol + " " + msgID, channel.ID}) + brMsgIDs = append(brMsgIDs, + &BrMsgID{ + Protocol: dest.Protocol, + DestName: dest.Name, + ChannelID: channel.ID, + ID: msgID, + }) } return brMsgIDs } diff --git a/gateway/persistent.go b/gateway/persistent.go new file mode 100644 index 0000000000..63534f069a --- /dev/null +++ b/gateway/persistent.go @@ -0,0 +1,83 @@ +package gateway + +import ( + "github.com/42wim/matterbridge/bridge" + "github.com/42wim/matterbridge/bridge/config" + "github.com/philippgille/gokv" + "github.com/philippgille/gokv/badgerdb" + "github.com/philippgille/gokv/encoding" +) + +func (gw *Gateway) getMessageMapStore(path string) gokv.Store { + options := badgerdb.Options{ + Dir: path, + Codec: encoding.Gob, + } + + store, err := badgerdb.NewStore(options) + if err != nil { + gw.logger.Error(err) + gw.logger.Errorf("Could not connect to db: %s", path) + } + + return store +} + +func (gw *Gateway) getCanonicalMessageFromStore(messageID string) string { + if messageID == "" { + return "" + } + + canonicalMsgID := new(string) + found, err := gw.CanonicalStore.Get(messageID, canonicalMsgID) + if err != nil { + gw.logger.Error(err) + } + + if found { + return *canonicalMsgID + } + + return "" +} + +func (gw *Gateway) setCanonicalMessageToStore(messageID string, canonicalMsgID string) { + err := gw.CanonicalStore.Set(messageID, canonicalMsgID) + if err != nil { + 
gw.logger.Error(err) + } +} + +func (gw *Gateway) getDestMessagesFromStore(canonicalMsgID string, dest *bridge.Bridge, channel *config.ChannelInfo) string { + if canonicalMsgID == "" { + return "" + } + + destMessageIds := new([]BrMsgID) + found, err := gw.MessageStore.Get(canonicalMsgID, destMessageIds) + if err != nil { + gw.logger.Error(err) + } + + if found { + for _, id := range *destMessageIds { + // check protocol, bridge name and channelname + // for people that reuse the same bridge multiple times. see #342 + if dest.Protocol == id.Protocol && dest.Name == id.DestName && channel.ID == id.ChannelID { + return id.ID + } + } + } + return "" +} + +func (gw *Gateway) setDestMessagesToStore(canonicalMsgID string, msgIDs []*BrMsgID) { + for _, msgID := range msgIDs { + gw.setCanonicalMessageToStore(msgID.Protocol+" "+msgID.ID, canonicalMsgID) + } + + err := gw.MessageStore.Set(canonicalMsgID, msgIDs) + if err != nil { + gw.logger.Error(err) + } +} diff --git a/gateway/router.go b/gateway/router.go index cc6eb75205..c2738f49b2 100644 --- a/gateway/router.go +++ b/gateway/router.go @@ -167,9 +167,15 @@ func (r *Router) handleReceive() { // we're adding the original message as a "dest message" // as when we get the dest messages for a delete the source message isnt in the list // therefore the delete doesnt happen on the source platform. 
- msgIDs = append(msgIDs, &BrMsgID{srcBridge, srcBridge.Protocol + " " + msg.ID, msg.Channel + srcBridge.Account}) - - gw.Messages.Add(msg.Protocol+" "+msg.ID, msgIDs) + msgIDs = append(msgIDs, + &BrMsgID{ + Protocol: srcBridge.Protocol, + DestName: srcBridge.Name, + ChannelID: msg.Channel + srcBridge.Account, + ID: msg.ID, + }) + + gw.SetMessageMap(msg.Protocol+" "+msg.ID, msgIDs) } } } diff --git a/go.mod b/go.mod index c5b7bdab76..6a6df3fc62 100644 --- a/go.mod +++ b/go.mod @@ -61,11 +61,14 @@ require ( require ( filippo.io/edwards25519 v1.0.0 // indirect + github.com/AndreasBriese/bbloom v0.0.0-20190825152654-46b345b51c96 // indirect github.com/Benau/go_rlottie v0.0.0-20210807002906-98c1b2421989 // indirect github.com/Jeffail/gabs v1.4.0 // indirect github.com/apex/log v1.9.0 // indirect github.com/av-elier/go-decimal-to-rational v0.0.0-20191127152832-89e6aad02ecf // indirect github.com/blang/semver v3.5.1+incompatible // indirect + github.com/dgraph-io/badger v1.6.0 // indirect + github.com/dgryski/go-farm v0.0.0-20190423205320-6a90982ecee2 // indirect github.com/dustin/go-humanize v1.0.0 // indirect github.com/dyatlov/go-opengraph v0.0.0-20210112100619-dae8665a5b09 // indirect github.com/francoispqt/gojay v1.2.13 // indirect @@ -107,6 +110,10 @@ require ( github.com/pelletier/go-toml v1.9.5 // indirect github.com/pelletier/go-toml/v2 v2.0.6 // indirect github.com/philhofer/fwd v1.1.1 // indirect + github.com/philippgille/gokv v0.6.0 // indirect + github.com/philippgille/gokv/badgerdb v0.6.0 // indirect + github.com/philippgille/gokv/encoding v0.0.0-20191011213304-eb77f15b9c61 // indirect + github.com/philippgille/gokv/util v0.0.0-20191011213304-eb77f15b9c61 // indirect github.com/pkg/errors v0.9.1 // indirect github.com/pmezard/go-difflib v1.0.0 // indirect github.com/remyoudompheng/bigfft v0.0.0-20230129092748-24d4a6f8daec // indirect diff --git a/go.sum b/go.sum index 4f5f3470ff..8bf5051ecc 100644 --- a/go.sum +++ b/go.sum @@ -67,6 +67,8 @@ 
git.apache.org/thrift.git v0.12.0/go.mod h1:fPE2ZNJGynbRyZ4dJvy6G277gSllfV2HJqbl github.com/42wim/go-gitter v0.0.0-20170828205020-017310c2d557 h1:IZtuWGfzQnKnCSu+vl8WGLhpVQ5Uvy3rlSwqXSg+sQg= github.com/42wim/go-gitter v0.0.0-20170828205020-017310c2d557/go.mod h1:jL0YSXMs/txjtGJ4PWrmETOk6KUHMDPMshgQZlTeB3Y= github.com/AndreasBriese/bbloom v0.0.0-20190306092124-e2d15f34fcf9/go.mod h1:bOvUY6CB00SOBii9/FifXqc0awNKxLFCL/+pkDPuyl8= +github.com/AndreasBriese/bbloom v0.0.0-20190825152654-46b345b51c96 h1:cTp8I5+VIoKjsnZuH8vjyaysT/ses3EvZeaV/1UkF2M= +github.com/AndreasBriese/bbloom v0.0.0-20190825152654-46b345b51c96/go.mod h1:bOvUY6CB00SOBii9/FifXqc0awNKxLFCL/+pkDPuyl8= github.com/Azure/azure-pipeline-go v0.2.3/go.mod h1:x841ezTBIMG6O3lAcl8ATHnsOPVl2bqk7S3ta6S6u4k= github.com/Azure/azure-sdk-for-go v16.2.1+incompatible/go.mod h1:9XXNKU+eRnpl9moKnB4QOLf1HestfXbmab5FXxiDBjc= github.com/Azure/azure-sdk-for-go v26.5.0+incompatible/go.mod h1:9XXNKU+eRnpl9moKnB4QOLf1HestfXbmab5FXxiDBjc= @@ -459,10 +461,12 @@ github.com/dchote/go-openal v0.0.0-20171116030048-f4a9a141d372/go.mod h1:74z+CYu github.com/denisenkom/go-mssqldb v0.0.0-20200620013148-b91950f658ec/go.mod h1:xbL0rPBG9cCiLr28tMa8zpbdarY27NDyej4t/EjAShU= github.com/denisenkom/go-mssqldb v0.10.0/go.mod h1:xbL0rPBG9cCiLr28tMa8zpbdarY27NDyej4t/EjAShU= github.com/denverdino/aliyungo v0.0.0-20190125010748-a747050bb1ba/go.mod h1:dV8lFg6daOBZbT6/BDGIz6Y3WFGn8juu6G+CQ6LHtl0= +github.com/dgraph-io/badger v1.6.0 h1:DshxFxZWXUcO0xX476VJC07Xsr6ZCBVRHKZ93Oh7Evo= github.com/dgraph-io/badger v1.6.0/go.mod h1:zwt7syl517jmP8s94KqSxTlM6IMsdhYy6psNgSztDR4= github.com/dgrijalva/jwt-go v0.0.0-20170104182250-a601269ab70c/go.mod h1:E3ru+11k8xSBh+hMPgOLZmtrrCbhqsmaPHjLKYnJCaQ= github.com/dgrijalva/jwt-go v3.2.0+incompatible/go.mod h1:E3ru+11k8xSBh+hMPgOLZmtrrCbhqsmaPHjLKYnJCaQ= github.com/dgryski/dgoogauth v0.0.0-20190221195224-5a805980a5f3/go.mod h1:hEfFauPHz7+NnjR/yHJGhrKo1Za+zStgwUETx3yzqgY= +github.com/dgryski/go-farm 
v0.0.0-20190423205320-6a90982ecee2 h1:tdlZCpZ/P9DhczCTSixgIKmwPv6+wP5DGjqLYw5SUiA= github.com/dgryski/go-farm v0.0.0-20190423205320-6a90982ecee2/go.mod h1:SqUrOPUnsFjfmXRMNPybcSiG0BgUW2AuFH8PAnS2iTw= github.com/dgryski/go-rendezvous v0.0.0-20200823014737-9f7001d12a5f/go.mod h1:cuUVRXasLTGF7a8hSLbxyZXjz+1KgoB3wDUb6vlszIc= github.com/dgryski/go-sip13 v0.0.0-20181026042036-e10d5fee7954/go.mod h1:vAd38F8PWV+bWy6jNmig1y/TA+kYO4g3RSRF0IAv0no= @@ -1344,6 +1348,17 @@ github.com/peterbourgon/diskv v2.0.1+incompatible/go.mod h1:uqqh8zWWbv1HBMNONnaR github.com/philhofer/fwd v1.0.0/go.mod h1:gk3iGcWd9+svBvR0sR+KPcfE+RNWozjowpeBVG3ZVNU= github.com/philhofer/fwd v1.1.1 h1:GdGcTjf5RNAxwS4QLsiMzJYj5KEvPJD3Abr261yRQXQ= github.com/philhofer/fwd v1.1.1/go.mod h1:gk3iGcWd9+svBvR0sR+KPcfE+RNWozjowpeBVG3ZVNU= +github.com/philippgille/gokv v0.0.0-20191001201555-5ac9a20de634/go.mod h1:OCoWPt+mbYuTO1FUVrQ2SxQU0oaaHBsn6lRhFX3JHOc= +github.com/philippgille/gokv v0.5.1-0.20191011213304-eb77f15b9c61/go.mod h1:OCoWPt+mbYuTO1FUVrQ2SxQU0oaaHBsn6lRhFX3JHOc= +github.com/philippgille/gokv v0.6.0 h1:fNEx/tSwV73nzlYd3iRYB8F+SEVJNNFzH1gsaT8SK2c= +github.com/philippgille/gokv v0.6.0/go.mod h1:tjXRFw9xDHgxLS8WJdfYotKGWp8TWqu4RdXjMDG/XBo= +github.com/philippgille/gokv/badgerdb v0.6.0 h1:4Qigf2SpyXLF8KaM5nA5/D/0aD/bZevuAnrW4ZsDsjA= +github.com/philippgille/gokv/badgerdb v0.6.0/go.mod h1:3u2avs8gtmCc0R0Bw4jKV8aaDfLb5V9JToSASyhpFGM= +github.com/philippgille/gokv/encoding v0.0.0-20191011213304-eb77f15b9c61 h1:IgQDuUPuEFVf22mBskeCLAtvd5c9XiiJG2UYud6eGHI= +github.com/philippgille/gokv/encoding v0.0.0-20191011213304-eb77f15b9c61/go.mod h1:SjxSrCoeYrYn85oTtroyG1ePY8aE72nvLQlw8IYwAN8= +github.com/philippgille/gokv/test v0.0.0-20191011213304-eb77f15b9c61/go.mod h1:EUc+s9ONc1+VOr9NUEd8S0YbGRrQd/gz/p+2tvwt12s= +github.com/philippgille/gokv/util v0.0.0-20191011213304-eb77f15b9c61 h1:ril/jI0JgXNjPWwDkvcRxlZ09kgHXV2349xChjbsQ4o= +github.com/philippgille/gokv/util v0.0.0-20191011213304-eb77f15b9c61/go.mod 
h1:2dBhsJgY/yVIkjY5V3AnDUxUbEPzT6uQ3LvoVT8TR20= github.com/phpdave11/gofpdf v1.4.2/go.mod h1:zpO6xFn9yxo3YLyMvW8HcKWVdbNqgIfOOp2dXMnm1mY= github.com/phpdave11/gofpdi v1.0.12/go.mod h1:vBmVV0Do6hSBHC8uKUQ71JGW+ZGQq74llk/7bXwjDoI= github.com/pierrec/lz4 v2.0.5+incompatible/go.mod h1:pdkljMzZIN41W+lC3N2tnIh5sFi+IEE17M5jbnwPHcY= @@ -1916,6 +1931,7 @@ golang.org/x/net v0.0.0-20190813141303-74dc4d7220e7/go.mod h1:z5CRVTTTmAJ677TzLL golang.org/x/net v0.0.0-20190827160401-ba9fcec4b297/go.mod h1:z5CRVTTTmAJ677TzLLGU+0bjPO0LkuOLi4/5GtJWs/s= golang.org/x/net v0.0.0-20190923162816-aa69164e4478/go.mod h1:z5CRVTTTmAJ677TzLLGU+0bjPO0LkuOLi4/5GtJWs/s= golang.org/x/net v0.0.0-20191004110552-13f9640d40b9/go.mod h1:z5CRVTTTmAJ677TzLLGU+0bjPO0LkuOLi4/5GtJWs/s= +golang.org/x/net v0.0.0-20191011234655-491137f69257/go.mod h1:z5CRVTTTmAJ677TzLLGU+0bjPO0LkuOLi4/5GtJWs/s= golang.org/x/net v0.0.0-20191112182307-2180aed22343/go.mod h1:z5CRVTTTmAJ677TzLLGU+0bjPO0LkuOLi4/5GtJWs/s= golang.org/x/net v0.0.0-20191209160850-c0dbc17a3553/go.mod h1:z5CRVTTTmAJ677TzLLGU+0bjPO0LkuOLi4/5GtJWs/s= golang.org/x/net v0.0.0-20200114155413-6afb5195e5aa/go.mod h1:z5CRVTTTmAJ677TzLLGU+0bjPO0LkuOLi4/5GtJWs/s= @@ -2049,6 +2065,7 @@ golang.org/x/sys v0.0.0-20190924154521-2837fb4f24fe/go.mod h1:h1NjWce9XRLGQEsW7w golang.org/x/sys v0.0.0-20191001151750-bb3f8db39f24/go.mod h1:h1NjWce9XRLGQEsW7wpKNCjG9DtNlClVuFLEZdDNbEs= golang.org/x/sys v0.0.0-20191005200804-aed5e4c7ecf9/go.mod h1:h1NjWce9XRLGQEsW7wpKNCjG9DtNlClVuFLEZdDNbEs= golang.org/x/sys v0.0.0-20191008105621-543471e840be/go.mod h1:h1NjWce9XRLGQEsW7wpKNCjG9DtNlClVuFLEZdDNbEs= +golang.org/x/sys v0.0.0-20191010194322-b09406accb47/go.mod h1:h1NjWce9XRLGQEsW7wpKNCjG9DtNlClVuFLEZdDNbEs= golang.org/x/sys v0.0.0-20191022100944-742c48ecaeb7/go.mod h1:h1NjWce9XRLGQEsW7wpKNCjG9DtNlClVuFLEZdDNbEs= golang.org/x/sys v0.0.0-20191026070338-33540a1f6037/go.mod h1:h1NjWce9XRLGQEsW7wpKNCjG9DtNlClVuFLEZdDNbEs= golang.org/x/sys v0.0.0-20191112214154-59a1497f0cea/go.mod 
h1:h1NjWce9XRLGQEsW7wpKNCjG9DtNlClVuFLEZdDNbEs= diff --git a/vendor/github.com/AndreasBriese/bbloom/.travis.yml b/vendor/github.com/AndreasBriese/bbloom/.travis.yml new file mode 100644 index 0000000000..4f2ee4d973 --- /dev/null +++ b/vendor/github.com/AndreasBriese/bbloom/.travis.yml @@ -0,0 +1 @@ +language: go diff --git a/vendor/github.com/AndreasBriese/bbloom/LICENSE b/vendor/github.com/AndreasBriese/bbloom/LICENSE new file mode 100644 index 0000000000..4b20050e84 --- /dev/null +++ b/vendor/github.com/AndreasBriese/bbloom/LICENSE @@ -0,0 +1,35 @@ +bbloom.go + +// The MIT License (MIT) +// Copyright (c) 2014 Andreas Briese, eduToolbox@Bri-C GmbH, Sarstedt + +// Permission is hereby granted, free of charge, to any person obtaining a copy of +// this software and associated documentation files (the "Software"), to deal in +// the Software without restriction, including without limitation the rights to +// use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of +// the Software, and to permit persons to whom the Software is furnished to do so, +// subject to the following conditions: + +// The above copyright notice and this permission notice shall be included in all +// copies or substantial portions of the Software. + +// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS +// FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR +// COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER +// IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN +// CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. + +siphash.go + +// https://github.com/dchest/siphash +// +// Written in 2012 by Dmitry Chestnykh. 
+// +// To the extent possible under law, the author have dedicated all copyright +// and related and neighboring rights to this software to the public domain +// worldwide. This software is distributed without any warranty. +// http://creativecommons.org/publicdomain/zero/1.0/ +// +// Package siphash implements SipHash-2-4, a fast short-input PRF +// created by Jean-Philippe Aumasson and Daniel J. Bernstein. diff --git a/vendor/github.com/AndreasBriese/bbloom/README.md b/vendor/github.com/AndreasBriese/bbloom/README.md new file mode 100644 index 0000000000..d7413c33fa --- /dev/null +++ b/vendor/github.com/AndreasBriese/bbloom/README.md @@ -0,0 +1,131 @@ +## bbloom: a bitset Bloom filter for go/golang +=== + +[![Build Status](https://travis-ci.org/AndreasBriese/bbloom.png?branch=master)](http://travis-ci.org/AndreasBriese/bbloom) + +package implements a fast bloom filter with real 'bitset' and JSONMarshal/JSONUnmarshal to store/reload the Bloom filter. + +NOTE: the package uses unsafe.Pointer to set and read the bits from the bitset. If you're uncomfortable with using the unsafe package, please consider using my bloom filter package at github.com/AndreasBriese/bloom + +=== + +changelog 11/2015: new thread safe methods AddTS(), HasTS(), AddIfNotHasTS() following a suggestion from Srdjan Marinovic (github @a-little-srdjan), who used this to code a bloomfilter cache. + +This bloom filter was developed to strengthen a website-log database and was tested and optimized for this log-entry mask: "2014/%02i/%02i %02i:%02i:%02i /info.html". +Nonetheless bbloom should work with any other form of entries. + +~~Hash function is a modified Berkeley DB sdbm hash (to optimize for smaller strings). sdbm http://www.cse.yorku.ca/~oz/hash.html~~ + +Found sipHash (SipHash-2-4, a fast short-input PRF created by Jean-Philippe Aumasson and Daniel J. Bernstein.) to be about as fast. 
sipHash had been ported by Dimtry Chestnyk to Go (github.com/dchest/siphash ) + +Minimum hashset size is: 512 ([4]uint64; will be set automatically). + +###install + +```sh +go get github.com/AndreasBriese/bbloom +``` + +###test ++ change to folder ../bbloom ++ create wordlist in file "words.txt" (you might use `python permut.py`) ++ run 'go test -bench=.' within the folder + +```go +go test -bench=. +``` + +~~If you've installed the GOCONVEY TDD-framework http://goconvey.co/ you can run the tests automatically.~~ + +using go's testing framework now (have in mind that the op timing is related to 65536 operations of Add, Has, AddIfNotHas respectively) + +### usage + +after installation add + +```go +import ( + ... + "github.com/AndreasBriese/bbloom" + ... + ) +``` + +at your header. In the program use + +```go +// create a bloom filter for 65536 items and 1 % wrong-positive ratio +bf := bbloom.New(float64(1<<16), float64(0.01)) + +// or +// create a bloom filter with 650000 for 65536 items and 7 locs per hash explicitly +// bf = bbloom.New(float64(650000), float64(7)) +// or +bf = bbloom.New(650000.0, 7.0) + +// add one item +bf.Add([]byte("butter")) + +// Number of elements added is exposed now +// Note: ElemNum will not be included in JSON export (for compatability to older version) +nOfElementsInFilter := bf.ElemNum + +// check if item is in the filter +isIn := bf.Has([]byte("butter")) // should be true +isNotIn := bf.Has([]byte("Butter")) // should be false + +// 'add only if item is new' to the bloomfilter +added := bf.AddIfNotHas([]byte("butter")) // should be false because 'butter' is already in the set +added = bf.AddIfNotHas([]byte("buTTer")) // should be true because 'buTTer' is new + +// thread safe versions for concurrent use: AddTS, HasTS, AddIfNotHasTS +// add one item +bf.AddTS([]byte("peanutbutter")) +// check if item is in the filter +isIn = bf.HasTS([]byte("peanutbutter")) // should be true +isNotIn = bf.HasTS([]byte("peanutButter")) // should be 
false +// 'add only if item is new' to the bloomfilter +added = bf.AddIfNotHasTS([]byte("butter")) // should be false because 'peanutbutter' is already in the set +added = bf.AddIfNotHasTS([]byte("peanutbuTTer")) // should be true because 'penutbuTTer' is new + +// convert to JSON ([]byte) +Json := bf.JSONMarshal() + +// bloomfilters Mutex is exposed for external un-/locking +// i.e. mutex lock while doing JSON conversion +bf.Mtx.Lock() +Json = bf.JSONMarshal() +bf.Mtx.Unlock() + +// restore a bloom filter from storage +bfNew := bbloom.JSONUnmarshal(Json) + +isInNew := bfNew.Has([]byte("butter")) // should be true +isNotInNew := bfNew.Has([]byte("Butter")) // should be false + +``` + +to work with the bloom filter. + +### why 'fast'? + +It's about 3 times faster than William Fitzgeralds bitset bloom filter https://github.com/willf/bloom . And it is about so fast as my []bool set variant for Boom filters (see https://github.com/AndreasBriese/bloom ) but having a 8times smaller memory footprint: + + + Bloom filter (filter size 524288, 7 hashlocs) + github.com/AndreasBriese/bbloom 'Add' 65536 items (10 repetitions): 6595800 ns (100 ns/op) + github.com/AndreasBriese/bbloom 'Has' 65536 items (10 repetitions): 5986600 ns (91 ns/op) + github.com/AndreasBriese/bloom 'Add' 65536 items (10 repetitions): 6304684 ns (96 ns/op) + github.com/AndreasBriese/bloom 'Has' 65536 items (10 repetitions): 6568663 ns (100 ns/op) + + github.com/willf/bloom 'Add' 65536 items (10 repetitions): 24367224 ns (371 ns/op) + github.com/willf/bloom 'Test' 65536 items (10 repetitions): 21881142 ns (333 ns/op) + github.com/dataence/bloom/standard 'Add' 65536 items (10 repetitions): 23041644 ns (351 ns/op) + github.com/dataence/bloom/standard 'Check' 65536 items (10 repetitions): 19153133 ns (292 ns/op) + github.com/cabello/bloom 'Add' 65536 items (10 repetitions): 131921507 ns (2012 ns/op) + github.com/cabello/bloom 'Contains' 65536 items (10 repetitions): 131108962 ns (2000 ns/op) + +(on MBPro15 
OSX10.8.5 i7 4Core 2.4Ghz) + + +With 32bit bloom filters (bloom32) using modified sdbm, bloom32 does hashing with only 2 bit shifts, one xor and one substraction per byte. smdb is about as fast as fnv64a but gives less collisions with the dataset (see mask above). bloom.New(float64(10 * 1<<16),float64(7)) populated with 1<<16 random items from the dataset (see above) and tested against the rest results in less than 0.05% collisions. diff --git a/vendor/github.com/AndreasBriese/bbloom/bbloom.go b/vendor/github.com/AndreasBriese/bbloom/bbloom.go new file mode 100644 index 0000000000..c36948fcf8 --- /dev/null +++ b/vendor/github.com/AndreasBriese/bbloom/bbloom.go @@ -0,0 +1,284 @@ +// The MIT License (MIT) +// Copyright (c) 2014 Andreas Briese, eduToolbox@Bri-C GmbH, Sarstedt + +// Permission is hereby granted, free of charge, to any person obtaining a copy of +// this software and associated documentation files (the "Software"), to deal in +// the Software without restriction, including without limitation the rights to +// use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of +// the Software, and to permit persons to whom the Software is furnished to do so, +// subject to the following conditions: + +// The above copyright notice and this permission notice shall be included in all +// copies or substantial portions of the Software. + +// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS +// FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR +// COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER +// IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN +// CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
+ +// 2019/08/25 code revision to reduce unsafe use +// Parts are adopted from the fork at ipfs/bbloom after performance rev by +// Steve Allen (https://github.com/Stebalien) +// (see https://github.com/ipfs/bbloom/blob/master/bbloom.go) +// -> func Has +// -> func set +// -> func add + +package bbloom + +import ( + "bytes" + "encoding/json" + "log" + "math" + "sync" + "unsafe" +) + +// helper +// not needed anymore by Set +// var mask = []uint8{1, 2, 4, 8, 16, 32, 64, 128} + +func getSize(ui64 uint64) (size uint64, exponent uint64) { + if ui64 < uint64(512) { + ui64 = uint64(512) + } + size = uint64(1) + for size < ui64 { + size <<= 1 + exponent++ + } + return size, exponent +} + +func calcSizeByWrongPositives(numEntries, wrongs float64) (uint64, uint64) { + size := -1 * numEntries * math.Log(wrongs) / math.Pow(float64(0.69314718056), 2) + locs := math.Ceil(float64(0.69314718056) * size / numEntries) + return uint64(size), uint64(locs) +} + +// New +// returns a new bloomfilter +func New(params ...float64) (bloomfilter Bloom) { + var entries, locs uint64 + if len(params) == 2 { + if params[1] < 1 { + entries, locs = calcSizeByWrongPositives(params[0], params[1]) + } else { + entries, locs = uint64(params[0]), uint64(params[1]) + } + } else { + log.Fatal("usage: New(float64(number_of_entries), float64(number_of_hashlocations)) i.e. New(float64(1000), float64(3)) or New(float64(number_of_entries), float64(number_of_hashlocations)) i.e. 
New(float64(1000), float64(0.03))") + } + size, exponent := getSize(uint64(entries)) + bloomfilter = Bloom{ + Mtx: &sync.Mutex{}, + sizeExp: exponent, + size: size - 1, + setLocs: locs, + shift: 64 - exponent, + } + bloomfilter.Size(size) + return bloomfilter +} + +// NewWithBoolset +// takes a []byte slice and number of locs per entry +// returns the bloomfilter with a bitset populated according to the input []byte +func NewWithBoolset(bs *[]byte, locs uint64) (bloomfilter Bloom) { + bloomfilter = New(float64(len(*bs)<<3), float64(locs)) + for i, b := range *bs { + *(*uint8)(unsafe.Pointer(uintptr(unsafe.Pointer(&bloomfilter.bitset[0])) + uintptr(i))) = b + } + return bloomfilter +} + +// bloomJSONImExport +// Im/Export structure used by JSONMarshal / JSONUnmarshal +type bloomJSONImExport struct { + FilterSet []byte + SetLocs uint64 +} + +// JSONUnmarshal +// takes JSON-Object (type bloomJSONImExport) as []bytes +// returns Bloom object +func JSONUnmarshal(dbData []byte) Bloom { + bloomImEx := bloomJSONImExport{} + json.Unmarshal(dbData, &bloomImEx) + buf := bytes.NewBuffer(bloomImEx.FilterSet) + bs := buf.Bytes() + bf := NewWithBoolset(&bs, bloomImEx.SetLocs) + return bf +} + +// +// Bloom filter +type Bloom struct { + Mtx *sync.Mutex + ElemNum uint64 + bitset []uint64 + sizeExp uint64 + size uint64 + setLocs uint64 + shift uint64 +} + +// <--- http://www.cse.yorku.ca/~oz/hash.html +// modified Berkeley DB Hash (32bit) +// hash is casted to l, h = 16bit fragments +// func (bl Bloom) absdbm(b *[]byte) (l, h uint64) { +// hash := uint64(len(*b)) +// for _, c := range *b { +// hash = uint64(c) + (hash << 6) + (hash << bl.sizeExp) - hash +// } +// h = hash >> bl.shift +// l = hash << bl.shift >> bl.shift +// return l, h +// } + +// Update: found sipHash of Jean-Philippe Aumasson & Daniel J. 
Bernstein to be even faster than absdbm() +// https://131002.net/siphash/ +// siphash was implemented for Go by Dmitry Chestnykh https://github.com/dchest/siphash + +// Add +// set the bit(s) for entry; Adds an entry to the Bloom filter +func (bl *Bloom) Add(entry []byte) { + l, h := bl.sipHash(entry) + for i := uint64(0); i < bl.setLocs; i++ { + bl.set((h + i*l) & bl.size) + bl.ElemNum++ + } +} + +// AddTS +// Thread safe: Mutex.Lock the bloomfilter for the time of processing the entry +func (bl *Bloom) AddTS(entry []byte) { + bl.Mtx.Lock() + defer bl.Mtx.Unlock() + bl.Add(entry) +} + +// Has +// check if bit(s) for entry is/are set +// returns true if the entry was added to the Bloom Filter +func (bl Bloom) Has(entry []byte) bool { + l, h := bl.sipHash(entry) + res := true + for i := uint64(0); i < bl.setLocs; i++ { + res = res && bl.isSet((h+i*l)&bl.size) + // https://github.com/ipfs/bbloom/commit/84e8303a9bfb37b2658b85982921d15bbb0fecff + // // Branching here (early escape) is not worth it + // // This is my conclusion from benchmarks + // // (prevents loop unrolling) + // switch bl.IsSet((h + i*l) & bl.size) { + // case false: + // return false + // } + } + return res +} + +// HasTS +// Thread safe: Mutex.Lock the bloomfilter for the time of processing the entry +func (bl *Bloom) HasTS(entry []byte) bool { + bl.Mtx.Lock() + defer bl.Mtx.Unlock() + return bl.Has(entry) +} + +// AddIfNotHas +// Only Add entry if it's not present in the bloomfilter +// returns true if entry was added +// returns false if entry was allready registered in the bloomfilter +func (bl Bloom) AddIfNotHas(entry []byte) (added bool) { + if bl.Has(entry) { + return added + } + bl.Add(entry) + return true +} + +// AddIfNotHasTS +// Tread safe: Only Add entry if it's not present in the bloomfilter +// returns true if entry was added +// returns false if entry was allready registered in the bloomfilter +func (bl *Bloom) AddIfNotHasTS(entry []byte) (added bool) { + bl.Mtx.Lock() + defer 
bl.Mtx.Unlock() + return bl.AddIfNotHas(entry) +} + +// Size +// make Bloom filter with as bitset of size sz +func (bl *Bloom) Size(sz uint64) { + bl.bitset = make([]uint64, sz>>6) +} + +// Clear +// resets the Bloom filter +func (bl *Bloom) Clear() { + bs := bl.bitset + for i := range bs { + bs[i] = 0 + } +} + +// Set +// set the bit[idx] of bitsit +func (bl *Bloom) set(idx uint64) { + // ommit unsafe + // *(*uint8)(unsafe.Pointer(uintptr(unsafe.Pointer(&bl.bitset[idx>>6])) + uintptr((idx%64)>>3))) |= mask[idx%8] + bl.bitset[idx>>6] |= 1 << (idx % 64) +} + +// IsSet +// check if bit[idx] of bitset is set +// returns true/false +func (bl *Bloom) isSet(idx uint64) bool { + // ommit unsafe + // return (((*(*uint8)(unsafe.Pointer(uintptr(unsafe.Pointer(&bl.bitset[idx>>6])) + uintptr((idx%64)>>3)))) >> (idx % 8)) & 1) == 1 + return bl.bitset[idx>>6]&(1<<(idx%64)) != 0 +} + +// JSONMarshal +// returns JSON-object (type bloomJSONImExport) as []byte +func (bl Bloom) JSONMarshal() []byte { + bloomImEx := bloomJSONImExport{} + bloomImEx.SetLocs = uint64(bl.setLocs) + bloomImEx.FilterSet = make([]byte, len(bl.bitset)<<3) + for i := range bloomImEx.FilterSet { + bloomImEx.FilterSet[i] = *(*byte)(unsafe.Pointer(uintptr(unsafe.Pointer(&bl.bitset[0])) + uintptr(i))) + } + data, err := json.Marshal(bloomImEx) + if err != nil { + log.Fatal("json.Marshal failed: ", err) + } + return data +} + +// // alternative hashFn +// func (bl Bloom) fnv64a(b *[]byte) (l, h uint64) { +// h64 := fnv.New64a() +// h64.Write(*b) +// hash := h64.Sum64() +// h = hash >> 32 +// l = hash << 32 >> 32 +// return l, h +// } +// +// // <-- http://partow.net/programming/hashfunctions/index.html +// // citation: An algorithm proposed by Donald E. Knuth in The Art Of Computer Programming Volume 3, +// // under the topic of sorting and search chapter 6.4. 
+// // modified to fit with boolset-length +// func (bl Bloom) DEKHash(b *[]byte) (l, h uint64) { +// hash := uint64(len(*b)) +// for _, c := range *b { +// hash = ((hash << 5) ^ (hash >> bl.shift)) ^ uint64(c) +// } +// h = hash >> bl.shift +// l = hash << bl.sizeExp >> bl.sizeExp +// return l, h +// } diff --git a/vendor/github.com/AndreasBriese/bbloom/sipHash.go b/vendor/github.com/AndreasBriese/bbloom/sipHash.go new file mode 100644 index 0000000000..a91d8199b2 --- /dev/null +++ b/vendor/github.com/AndreasBriese/bbloom/sipHash.go @@ -0,0 +1,225 @@ +// Written in 2012 by Dmitry Chestnykh. +// +// To the extent possible under law, the author have dedicated all copyright +// and related and neighboring rights to this software to the public domain +// worldwide. This software is distributed without any warranty. +// http://creativecommons.org/publicdomain/zero/1.0/ +// +// Package siphash implements SipHash-2-4, a fast short-input PRF +// created by Jean-Philippe Aumasson and Daniel J. Bernstein. + +package bbloom + +// Hash returns the 64-bit SipHash-2-4 of the given byte slice with two 64-bit +// parts of 128-bit key: k0 and k1. +func (bl Bloom) sipHash(p []byte) (l, h uint64) { + // Initialization. + v0 := uint64(8317987320269560794) // k0 ^ 0x736f6d6570736575 + v1 := uint64(7237128889637516672) // k1 ^ 0x646f72616e646f6d + v2 := uint64(7816392314733513934) // k0 ^ 0x6c7967656e657261 + v3 := uint64(8387220255325274014) // k1 ^ 0x7465646279746573 + t := uint64(len(p)) << 56 + + // Compression. + for len(p) >= 8 { + + m := uint64(p[0]) | uint64(p[1])<<8 | uint64(p[2])<<16 | uint64(p[3])<<24 | + uint64(p[4])<<32 | uint64(p[5])<<40 | uint64(p[6])<<48 | uint64(p[7])<<56 + + v3 ^= m + + // Round 1. + v0 += v1 + v1 = v1<<13 | v1>>51 + v1 ^= v0 + v0 = v0<<32 | v0>>32 + + v2 += v3 + v3 = v3<<16 | v3>>48 + v3 ^= v2 + + v0 += v3 + v3 = v3<<21 | v3>>43 + v3 ^= v0 + + v2 += v1 + v1 = v1<<17 | v1>>47 + v1 ^= v2 + v2 = v2<<32 | v2>>32 + + // Round 2. 
+ v0 += v1 + v1 = v1<<13 | v1>>51 + v1 ^= v0 + v0 = v0<<32 | v0>>32 + + v2 += v3 + v3 = v3<<16 | v3>>48 + v3 ^= v2 + + v0 += v3 + v3 = v3<<21 | v3>>43 + v3 ^= v0 + + v2 += v1 + v1 = v1<<17 | v1>>47 + v1 ^= v2 + v2 = v2<<32 | v2>>32 + + v0 ^= m + p = p[8:] + } + + // Compress last block. + switch len(p) { + case 7: + t |= uint64(p[6]) << 48 + fallthrough + case 6: + t |= uint64(p[5]) << 40 + fallthrough + case 5: + t |= uint64(p[4]) << 32 + fallthrough + case 4: + t |= uint64(p[3]) << 24 + fallthrough + case 3: + t |= uint64(p[2]) << 16 + fallthrough + case 2: + t |= uint64(p[1]) << 8 + fallthrough + case 1: + t |= uint64(p[0]) + } + + v3 ^= t + + // Round 1. + v0 += v1 + v1 = v1<<13 | v1>>51 + v1 ^= v0 + v0 = v0<<32 | v0>>32 + + v2 += v3 + v3 = v3<<16 | v3>>48 + v3 ^= v2 + + v0 += v3 + v3 = v3<<21 | v3>>43 + v3 ^= v0 + + v2 += v1 + v1 = v1<<17 | v1>>47 + v1 ^= v2 + v2 = v2<<32 | v2>>32 + + // Round 2. + v0 += v1 + v1 = v1<<13 | v1>>51 + v1 ^= v0 + v0 = v0<<32 | v0>>32 + + v2 += v3 + v3 = v3<<16 | v3>>48 + v3 ^= v2 + + v0 += v3 + v3 = v3<<21 | v3>>43 + v3 ^= v0 + + v2 += v1 + v1 = v1<<17 | v1>>47 + v1 ^= v2 + v2 = v2<<32 | v2>>32 + + v0 ^= t + + // Finalization. + v2 ^= 0xff + + // Round 1. + v0 += v1 + v1 = v1<<13 | v1>>51 + v1 ^= v0 + v0 = v0<<32 | v0>>32 + + v2 += v3 + v3 = v3<<16 | v3>>48 + v3 ^= v2 + + v0 += v3 + v3 = v3<<21 | v3>>43 + v3 ^= v0 + + v2 += v1 + v1 = v1<<17 | v1>>47 + v1 ^= v2 + v2 = v2<<32 | v2>>32 + + // Round 2. + v0 += v1 + v1 = v1<<13 | v1>>51 + v1 ^= v0 + v0 = v0<<32 | v0>>32 + + v2 += v3 + v3 = v3<<16 | v3>>48 + v3 ^= v2 + + v0 += v3 + v3 = v3<<21 | v3>>43 + v3 ^= v0 + + v2 += v1 + v1 = v1<<17 | v1>>47 + v1 ^= v2 + v2 = v2<<32 | v2>>32 + + // Round 3. + v0 += v1 + v1 = v1<<13 | v1>>51 + v1 ^= v0 + v0 = v0<<32 | v0>>32 + + v2 += v3 + v3 = v3<<16 | v3>>48 + v3 ^= v2 + + v0 += v3 + v3 = v3<<21 | v3>>43 + v3 ^= v0 + + v2 += v1 + v1 = v1<<17 | v1>>47 + v1 ^= v2 + v2 = v2<<32 | v2>>32 + + // Round 4. 
+ v0 += v1 + v1 = v1<<13 | v1>>51 + v1 ^= v0 + v0 = v0<<32 | v0>>32 + + v2 += v3 + v3 = v3<<16 | v3>>48 + v3 ^= v2 + + v0 += v3 + v3 = v3<<21 | v3>>43 + v3 ^= v0 + + v2 += v1 + v1 = v1<<17 | v1>>47 + v1 ^= v2 + v2 = v2<<32 | v2>>32 + + // return v0 ^ v1 ^ v2 ^ v3 + + hash := v0 ^ v1 ^ v2 ^ v3 + h = hash >> bl.shift + l = hash << bl.shift >> bl.shift + return l, h + +} diff --git a/vendor/github.com/AndreasBriese/bbloom/words.txt b/vendor/github.com/AndreasBriese/bbloom/words.txt new file mode 100644 index 0000000000..ad86a31ac5 --- /dev/null +++ b/vendor/github.com/AndreasBriese/bbloom/words.txt @@ -0,0 +1,140 @@ +2014/01/01 00:00:00 /info.html +2014/01/01 00:00:00 /info.html +2014/01/01 00:00:01 /info.html +2014/01/01 00:00:02 /info.html +2014/01/01 00:00:03 /info.html +2014/01/01 00:00:04 /info.html +2014/01/01 00:00:05 /info.html +2014/01/01 00:00:06 /info.html +2014/01/01 00:00:07 /info.html +2014/01/01 00:00:08 /info.html +2014/01/01 00:00:09 /info.html +2014/01/01 00:00:10 /info.html +2014/01/01 00:00:11 /info.html +2014/01/01 00:00:12 /info.html +2014/01/01 00:00:13 /info.html +2014/01/01 00:00:14 /info.html +2014/01/01 00:00:15 /info.html +2014/01/01 00:00:16 /info.html +2014/01/01 00:00:17 /info.html +2014/01/01 00:00:18 /info.html +2014/01/01 00:00:19 /info.html +2014/01/01 00:00:20 /info.html +2014/01/01 00:00:21 /info.html +2014/01/01 00:00:22 /info.html +2014/01/01 00:00:23 /info.html +2014/01/01 00:00:24 /info.html +2014/01/01 00:00:25 /info.html +2014/01/01 00:00:26 /info.html +2014/01/01 00:00:27 /info.html +2014/01/01 00:00:28 /info.html +2014/01/01 00:00:29 /info.html +2014/01/01 00:00:30 /info.html +2014/01/01 00:00:31 /info.html +2014/01/01 00:00:32 /info.html +2014/01/01 00:00:33 /info.html +2014/01/01 00:00:34 /info.html +2014/01/01 00:00:35 /info.html +2014/01/01 00:00:36 /info.html +2014/01/01 00:00:37 /info.html +2014/01/01 00:00:38 /info.html +2014/01/01 00:00:39 /info.html +2014/01/01 00:00:40 /info.html +2014/01/01 00:00:41 /info.html 
+2014/01/01 00:00:42 /info.html +2014/01/01 00:00:43 /info.html +2014/01/01 00:00:44 /info.html +2014/01/01 00:00:45 /info.html +2014/01/01 00:00:46 /info.html +2014/01/01 00:00:47 /info.html +2014/01/01 00:00:48 /info.html +2014/01/01 00:00:49 /info.html +2014/01/01 00:00:50 /info.html +2014/01/01 00:00:51 /info.html +2014/01/01 00:00:52 /info.html +2014/01/01 00:00:53 /info.html +2014/01/01 00:00:54 /info.html +2014/01/01 00:00:55 /info.html +2014/01/01 00:00:56 /info.html +2014/01/01 00:00:57 /info.html +2014/01/01 00:00:58 /info.html +2014/01/01 00:00:59 /info.html +2014/01/01 00:01:00 /info.html +2014/01/01 00:01:01 /info.html +2014/01/01 00:01:02 /info.html +2014/01/01 00:01:03 /info.html +2014/01/01 00:01:04 /info.html +2014/01/01 00:01:05 /info.html +2014/01/01 00:01:06 /info.html +2014/01/01 00:01:07 /info.html +2014/01/01 00:01:08 /info.html +2014/01/01 00:01:09 /info.html +2014/01/01 00:01:10 /info.html +2014/01/01 00:01:11 /info.html +2014/01/01 00:01:12 /info.html +2014/01/01 00:01:13 /info.html +2014/01/01 00:01:14 /info.html +2014/01/01 00:01:15 /info.html +2014/01/01 00:01:16 /info.html +2014/01/01 00:01:17 /info.html +2014/01/01 00:01:18 /info.html +2014/01/01 00:01:19 /info.html +2014/01/01 00:01:20 /info.html +2014/01/01 00:01:21 /info.html +2014/01/01 00:01:22 /info.html +2014/01/01 00:01:23 /info.html +2014/01/01 00:01:24 /info.html +2014/01/01 00:01:25 /info.html +2014/01/01 00:01:26 /info.html +2014/01/01 00:01:27 /info.html +2014/01/01 00:01:28 /info.html +2014/01/01 00:01:29 /info.html +2014/01/01 00:01:30 /info.html +2014/01/01 00:01:31 /info.html +2014/01/01 00:01:32 /info.html +2014/01/01 00:01:33 /info.html +2014/01/01 00:01:34 /info.html +2014/01/01 00:01:35 /info.html +2014/01/01 00:01:36 /info.html +2014/01/01 00:01:37 /info.html +2014/01/01 00:01:38 /info.html +2014/01/01 00:01:39 /info.html +2014/01/01 00:01:40 /info.html +2014/01/01 00:01:41 /info.html +2014/01/01 00:01:42 /info.html +2014/01/01 00:01:43 /info.html +2014/01/01 
00:01:44 /info.html +2014/01/01 00:01:45 /info.html +2014/01/01 00:01:46 /info.html +2014/01/01 00:01:47 /info.html +2014/01/01 00:01:48 /info.html +2014/01/01 00:01:49 /info.html +2014/01/01 00:01:50 /info.html +2014/01/01 00:01:51 /info.html +2014/01/01 00:01:52 /info.html +2014/01/01 00:01:53 /info.html +2014/01/01 00:01:54 /info.html +2014/01/01 00:01:55 /info.html +2014/01/01 00:01:56 /info.html +2014/01/01 00:01:57 /info.html +2014/01/01 00:01:58 /info.html +2014/01/01 00:01:59 /info.html +2014/01/01 00:02:00 /info.html +2014/01/01 00:02:01 /info.html +2014/01/01 00:02:02 /info.html +2014/01/01 00:02:03 /info.html +2014/01/01 00:02:04 /info.html +2014/01/01 00:02:05 /info.html +2014/01/01 00:02:06 /info.html +2014/01/01 00:02:07 /info.html +2014/01/01 00:02:08 /info.html +2014/01/01 00:02:09 /info.html +2014/01/01 00:02:10 /info.html +2014/01/01 00:02:11 /info.html +2014/01/01 00:02:12 /info.html +2014/01/01 00:02:13 /info.html +2014/01/01 00:02:14 /info.html +2014/01/01 00:02:15 /info.html +2014/01/01 00:02:16 /info.html +2014/01/01 00:02:17 /info.html +2014/01/01 00:02:18 /info.html diff --git a/vendor/github.com/dgraph-io/badger/.gitignore b/vendor/github.com/dgraph-io/badger/.gitignore new file mode 100644 index 0000000000..e3efdf58f4 --- /dev/null +++ b/vendor/github.com/dgraph-io/badger/.gitignore @@ -0,0 +1,2 @@ +p/ +badger-test*/ diff --git a/vendor/github.com/dgraph-io/badger/.golangci.yml b/vendor/github.com/dgraph-io/badger/.golangci.yml new file mode 100644 index 0000000000..fecb8644b8 --- /dev/null +++ b/vendor/github.com/dgraph-io/badger/.golangci.yml @@ -0,0 +1,27 @@ +run: + tests: false + +linters-settings: + lll: + line-length: 100 + +linters: + disable-all: true + enable: + - errcheck + - ineffassign + - gas + - gofmt + - golint + - gosimple + - govet + - lll + - varcheck + - unused + +issues: + exclude-rules: + - linters: + - gosec + text: "G404: " + \ No newline at end of file diff --git a/vendor/github.com/dgraph-io/badger/.travis.yml 
b/vendor/github.com/dgraph-io/badger/.travis.yml new file mode 100644 index 0000000000..7c58e56d2d --- /dev/null +++ b/vendor/github.com/dgraph-io/badger/.travis.yml @@ -0,0 +1,24 @@ +language: go + +go: + - "1.11" + - "1.12" + +matrix: + include: + - os: osx +notifications: + email: false + slack: + secure: X7uBLWYbuUhf8QFE16CoS5z7WvFR8EN9j6cEectMW6mKZ3vwXGwVXRIPsgUq/606DsQdCCx34MR8MRWYGlu6TBolbSe9y0EP0i46yipPz22YtuT7umcVUbGEyx8MZKgG0v1u/zA0O4aCsOBpGAA3gxz8h3JlEHDt+hv6U8xRsSllVLzLSNb5lwxDtcfEDxVVqP47GMEgjLPM28Pyt5qwjk7o5a4YSVzkfdxBXxd3gWzFUWzJ5E3cTacli50dK4GVfiLcQY2aQYoYO7AAvDnvP+TPfjDkBlUEE4MUz5CDIN51Xb+WW33sX7g+r3Bj7V5IRcF973RiYkpEh+3eoiPnyWyxhDZBYilty3b+Hysp6d4Ov/3I3ll7Bcny5+cYjakjkMH3l9w3gs6Y82GlpSLSJshKWS8vPRsxFe0Pstj6QSJXTd9EBaFr+l1ScXjJv/Sya9j8N9FfTuOTESWuaL1auX4Y7zEEVHlA8SCNOO8K0eTfxGZnC/YcIHsR8rePEAcFxfOYQppkyLF/XvAtnb/LMUuu0g4y2qNdme6Oelvyar1tFEMRtbl4mRCdu/krXBFtkrsfUaVY6WTPdvXAGotsFJ0wuA53zGVhlcd3+xAlSlR3c1QX95HIMeivJKb5L4nTjP+xnrmQNtnVk+tG4LSH2ltuwcZSSczModtcBmRefrk= + +env: + global: + - secure: CRkV2+/jlO0gXzzS50XGxfMS117FNwiVjxNY/LeWq06RKD+dDCPxTJl3JCNe3l0cYEPAglV2uMMYukDiTqJ7e+HI4nh4N4mv6lwx39N8dAvJe1x5ITS2T4qk4kTjuQb1Q1vw/ZOxoQqmvNKj2uRmBdJ/HHmysbRJ1OzCWML3OXdUwJf0AYlJzTjpMfkOKr7sTtE4rwyyQtd4tKH1fGdurgI9ZuFd9qvYxK2qcJhsQ6CNqMXt+7FkVkN1rIPmofjjBTNryzUr4COFXuWH95aDAif19DeBW4lbNgo1+FpDsrgmqtuhl6NAuptI8q/imow2KXBYJ8JPXsxW8DVFj0IIp0RCd3GjaEnwBEbxAyiIHLfW7AudyTS/dJOvZffPqXnuJ8xj3OPIdNe4xY0hWl8Ju2HhKfLOAHq7VadHZWd3IHLil70EiL4/JLD1rNbMImUZisFaA8pyrcIvYYebjOnk4TscwKFLedClRSX1XsMjWWd0oykQtrdkHM2IxknnBpaLu7mFnfE07f6dkG0nlpyu4SCLey7hr5FdcEmljA0nIxTSYDg6035fQkBEAbe7hlESOekkVNT9IZPwG+lmt3vU4ofi6NqNbJecOuSB+h36IiZ9s4YQtxYNnLgW14zjuFGGyT5smc3IjBT7qngDjKIgyrSVoRkY/8udy9qbUgvBeW8= + +before_script: +- go get github.com/mattn/goveralls +script: +- bash contrib/cover.sh $HOME/build coverage.out || travis_terminate 1 +- goveralls -service=travis-ci -coverprofile=coverage.out || true +- goveralls -coverprofile=coverage.out -service=travis-ci diff --git 
a/vendor/github.com/dgraph-io/badger/CHANGELOG.md b/vendor/github.com/dgraph-io/badger/CHANGELOG.md new file mode 100644 index 0000000000..e381a4b7c9 --- /dev/null +++ b/vendor/github.com/dgraph-io/badger/CHANGELOG.md @@ -0,0 +1,190 @@ +# Changelog +All notable changes to this project will be documented in this file. + +The format is based on [Keep a Changelog](http://keepachangelog.com/en/1.0.0/) +and this project adheres to [Serialization Versioning](VERSIONING.md). + +## [Unreleased] + +## [1.6.0] - 2019-07-01 + +This is a release including almost 200 commits, so expect many changes - some of them +not backward compatible. + +Regarding backward compatibility in Badger versions, you might be interested in reading +[VERSIONING.md](VERSIONING.md). + +_Note_: The hashes in parentheses correspond to the commits that impacted the given feature. + +### New APIs + +- badger.DB + - DropPrefix (291295e) + - Flatten (7e41bba) + - KeySplits (4751ef1) + - MaxBatchCount (b65e2a3) + - MaxBatchSize (b65e2a3) + - PrintKeyValueHistogram (fd59907) + - Subscribe (26128a7) + - Sync (851e462) + +- badger.DefaultOptions() and badger.LSMOnlyOptions() (91ce687) + - badger.Options.WithX methods + +- badger.Entry (e9447c9) + - NewEntry + - WithMeta + - WithDiscard + - WithTTL + +- badger.Item + - KeySize (fd59907) + - ValueSize (5242a99) + +- badger.IteratorOptions + - PickTable (7d46029, 49a49e3) + - Prefix (7d46029) + +- badger.Logger (fbb2778) + +- badger.Options + - CompactL0OnClose (7e41bba) + - Logger (3f66663) + - LogRotatesToFlush (2237832) + +- badger.Stream (14cbd89, 3258067) +- badger.StreamWriter (7116e16) +- badger.TableInfo.KeyCount (fd59907) +- badger.TableManifest (2017987) +- badger.Tx.NewKeyIterator (49a49e3) +- badger.WriteBatch (6daccf9, 7e78e80) + +### Modified APIs + +#### Breaking changes: + +- badger.DefaultOptions and badger.LSMOnlyOptions are now functions rather than variables (91ce687) +- badger.Item.Value now receives a function that returns an error (439fd46) 
+- badger.Txn.Commit doesn't receive any params now (6daccf9) +- badger.DB.Tables now receives a boolean (76b5341) + +#### Not breaking changes: + +- badger.LSMOptions changed values (799c33f) +- badger.DB.NewIterator now allows multiple iterators per RO txn (41d9656) +- badger.Options.TableLoadingMode's new default is options.MemoryMap (6b97bac) + +### Removed APIs + +- badger.ManagedDB (d22c0e8) +- badger.Options.DoNotCompact (7e41bba) +- badger.Txn.SetWithX (e9447c9) + +### Tools: + +- badger bank disect (13db058) +- badger bank test (13db058) --mmap (03870e3) +- badger fill (7e41bba) +- badger flatten (7e41bba) +- badger info --histogram (fd59907) --history --lookup --show-keys --show-meta --with-prefix (09e9b63) --show-internal (fb2eed9) +- badger benchmark read (239041e) +- badger benchmark write (6d3b67d) + +## [1.5.5] - 2019-06-20 + +* Introduce support for Go Modules + +## [1.5.3] - 2018-07-11 +Bug Fixes: +* Fix a panic caused due to item.vptr not copying over vs.Value, when looking + for a move key. + +## [1.5.2] - 2018-06-19 +Bug Fixes: +* Fix the way move key gets generated. +* If a transaction has unclosed, or multiple iterators running simultaneously, + throw a panic. Every iterator must be properly closed. At any point in time, + only one iterator per transaction can be running. This is to avoid bugs in a + transaction data structure which is thread unsafe. + +* *Warning: This change might cause panics in user code. Fix is to properly + close your iterators, and only have one running at a time per transaction.* + +## [1.5.1] - 2018-06-04 +Bug Fixes: +* Fix for infinite yieldItemValue recursion. #503 +* Fix recursive addition of `badgerMove` prefix. https://github.com/dgraph-io/badger/commit/2e3a32f0ccac3066fb4206b28deb39c210c5266f +* Use file size based window size for sampling, instead of fixing it to 10MB. #501 + +Cleanup: +* Clarify comments and documentation. +* Move badger tool one directory level up. 
+ +## [1.5.0] - 2018-05-08 +* Introduce `NumVersionsToKeep` option. This option is used to discard many + versions of the same key, which saves space. +* Add a new `SetWithDiscard` method, which would indicate that all the older + versions of the key are now invalid. Those versions would be discarded during + compactions. +* Value log GC moves are now bound to another keyspace to ensure latest versions + of data are always at the top in LSM tree. +* Introduce `ValueLogMaxEntries` to restrict the number of key-value pairs per + value log file. This helps bound the time it takes to garbage collect one + file. + +## [1.4.0] - 2018-05-04 +* Make mmap-ing of value log optional. +* Run GC multiple times, based on recorded discard statistics. +* Add MergeOperator. +* Force compact L0 on close (#439). +* Add truncate option to warn about data loss (#452). +* Discard key versions during compaction (#464). +* Introduce new `LSMOnlyOptions`, to make Badger act like a typical LSM based DB. + +Bug fix: +* (Temporary) Check max version across all tables in Get (removed in next + release). +* Update commit and read ts while loading from backup. +* Ensure all transaction entries are part of the same value log file. +* On commit, run unlock callbacks before doing writes (#413). +* Wait for goroutines to finish before closing iterators (#421). + +## [1.3.0] - 2017-12-12 +* Add `DB.NextSequence()` method to generate monotonically increasing integer + sequences. +* Add `DB.Size()` method to return the size of LSM and value log files. +* Tweaked mmap code to make Windows 32-bit builds work. +* Tweaked build tags on some files to make iOS builds work. +* Fix `DB.PurgeOlderVersions()` to not violate some constraints. + +## [1.2.0] - 2017-11-30 +* Expose a `Txn.SetEntry()` method to allow setting the key-value pair + and all the metadata at the same time. + +## [1.1.1] - 2017-11-28 +* Fix bug where txn.Get was returning key deleted in same transaction. 
+* Fix race condition while decrementing reference in oracle. +* Update doneCommit in the callback for CommitAsync. +* Iterator see writes of current txn. + +## [1.1.0] - 2017-11-13 +* Create Badger directory if it does not exist when `badger.Open` is called. +* Added `Item.ValueCopy()` to avoid deadlocks in long-running iterations +* Fixed 64-bit alignment issues to make Badger run on Arm v7 + +## [1.0.1] - 2017-11-06 +* Fix an uint16 overflow when resizing key slice + +[Unreleased]: https://github.com/dgraph-io/badger/compare/v1.6.0...HEAD +[1.6.0]: https://github.com/dgraph-io/badger/compare/v1.5.5...v1.6.0 +[1.5.5]: https://github.com/dgraph-io/badger/compare/v1.5.3...v1.5.5 +[1.5.3]: https://github.com/dgraph-io/badger/compare/v1.5.2...v1.5.3 +[1.5.2]: https://github.com/dgraph-io/badger/compare/v1.5.1...v1.5.2 +[1.5.1]: https://github.com/dgraph-io/badger/compare/v1.5.0...v1.5.1 +[1.5.0]: https://github.com/dgraph-io/badger/compare/v1.4.0...v1.5.0 +[1.4.0]: https://github.com/dgraph-io/badger/compare/v1.3.0...v1.4.0 +[1.3.0]: https://github.com/dgraph-io/badger/compare/v1.2.0...v1.3.0 +[1.2.0]: https://github.com/dgraph-io/badger/compare/v1.1.1...v1.2.0 +[1.1.1]: https://github.com/dgraph-io/badger/compare/v1.1.0...v1.1.1 +[1.1.0]: https://github.com/dgraph-io/badger/compare/v1.0.1...v1.1.0 +[1.0.1]: https://github.com/dgraph-io/badger/compare/v1.0.0...v1.0.1 diff --git a/vendor/github.com/dgraph-io/badger/CODE_OF_CONDUCT.md b/vendor/github.com/dgraph-io/badger/CODE_OF_CONDUCT.md new file mode 100644 index 0000000000..bf7bbc29dc --- /dev/null +++ b/vendor/github.com/dgraph-io/badger/CODE_OF_CONDUCT.md @@ -0,0 +1,5 @@ +# Code of Conduct + +Our Code of Conduct can be found here: + +https://dgraph.io/conduct diff --git a/vendor/github.com/dgraph-io/badger/LICENSE b/vendor/github.com/dgraph-io/badger/LICENSE new file mode 100644 index 0000000000..d9a10c0d8e --- /dev/null +++ b/vendor/github.com/dgraph-io/badger/LICENSE @@ -0,0 +1,176 @@ + Apache License + Version 
2.0, January 2004 + http://www.apache.org/licenses/ + + TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION + + 1. Definitions. + + "License" shall mean the terms and conditions for use, reproduction, + and distribution as defined by Sections 1 through 9 of this document. + + "Licensor" shall mean the copyright owner or entity authorized by + the copyright owner that is granting the License. + + "Legal Entity" shall mean the union of the acting entity and all + other entities that control, are controlled by, or are under common + control with that entity. For the purposes of this definition, + "control" means (i) the power, direct or indirect, to cause the + direction or management of such entity, whether by contract or + otherwise, or (ii) ownership of fifty percent (50%) or more of the + outstanding shares, or (iii) beneficial ownership of such entity. + + "You" (or "Your") shall mean an individual or Legal Entity + exercising permissions granted by this License. + + "Source" form shall mean the preferred form for making modifications, + including but not limited to software source code, documentation + source, and configuration files. + + "Object" form shall mean any form resulting from mechanical + transformation or translation of a Source form, including but + not limited to compiled object code, generated documentation, + and conversions to other media types. + + "Work" shall mean the work of authorship, whether in Source or + Object form, made available under the License, as indicated by a + copyright notice that is included in or attached to the work + (an example is provided in the Appendix below). + + "Derivative Works" shall mean any work, whether in Source or Object + form, that is based on (or derived from) the Work and for which the + editorial revisions, annotations, elaborations, or other modifications + represent, as a whole, an original work of authorship. 
For the purposes + of this License, Derivative Works shall not include works that remain + separable from, or merely link (or bind by name) to the interfaces of, + the Work and Derivative Works thereof. + + "Contribution" shall mean any work of authorship, including + the original version of the Work and any modifications or additions + to that Work or Derivative Works thereof, that is intentionally + submitted to Licensor for inclusion in the Work by the copyright owner + or by an individual or Legal Entity authorized to submit on behalf of + the copyright owner. For the purposes of this definition, "submitted" + means any form of electronic, verbal, or written communication sent + to the Licensor or its representatives, including but not limited to + communication on electronic mailing lists, source code control systems, + and issue tracking systems that are managed by, or on behalf of, the + Licensor for the purpose of discussing and improving the Work, but + excluding communication that is conspicuously marked or otherwise + designated in writing by the copyright owner as "Not a Contribution." + + "Contributor" shall mean Licensor and any individual or Legal Entity + on behalf of whom a Contribution has been received by Licensor and + subsequently incorporated within the Work. + + 2. Grant of Copyright License. Subject to the terms and conditions of + this License, each Contributor hereby grants to You a perpetual, + worldwide, non-exclusive, no-charge, royalty-free, irrevocable + copyright license to reproduce, prepare Derivative Works of, + publicly display, publicly perform, sublicense, and distribute the + Work and such Derivative Works in Source or Object form. + + 3. Grant of Patent License. 
Subject to the terms and conditions of + this License, each Contributor hereby grants to You a perpetual, + worldwide, non-exclusive, no-charge, royalty-free, irrevocable + (except as stated in this section) patent license to make, have made, + use, offer to sell, sell, import, and otherwise transfer the Work, + where such license applies only to those patent claims licensable + by such Contributor that are necessarily infringed by their + Contribution(s) alone or by combination of their Contribution(s) + with the Work to which such Contribution(s) was submitted. If You + institute patent litigation against any entity (including a + cross-claim or counterclaim in a lawsuit) alleging that the Work + or a Contribution incorporated within the Work constitutes direct + or contributory patent infringement, then any patent licenses + granted to You under this License for that Work shall terminate + as of the date such litigation is filed. + + 4. Redistribution. You may reproduce and distribute copies of the + Work or Derivative Works thereof in any medium, with or without + modifications, and in Source or Object form, provided that You + meet the following conditions: + + (a) You must give any other recipients of the Work or + Derivative Works a copy of this License; and + + (b) You must cause any modified files to carry prominent notices + stating that You changed the files; and + + (c) You must retain, in the Source form of any Derivative Works + that You distribute, all copyright, patent, trademark, and + attribution notices from the Source form of the Work, + excluding those notices that do not pertain to any part of + the Derivative Works; and + + (d) If the Work includes a "NOTICE" text file as part of its + distribution, then any Derivative Works that You distribute must + include a readable copy of the attribution notices contained + within such NOTICE file, excluding those notices that do not + pertain to any part of the Derivative Works, in at least one + of 
the following places: within a NOTICE text file distributed + as part of the Derivative Works; within the Source form or + documentation, if provided along with the Derivative Works; or, + within a display generated by the Derivative Works, if and + wherever such third-party notices normally appear. The contents + of the NOTICE file are for informational purposes only and + do not modify the License. You may add Your own attribution + notices within Derivative Works that You distribute, alongside + or as an addendum to the NOTICE text from the Work, provided + that such additional attribution notices cannot be construed + as modifying the License. + + You may add Your own copyright statement to Your modifications and + may provide additional or different license terms and conditions + for use, reproduction, or distribution of Your modifications, or + for any such Derivative Works as a whole, provided Your use, + reproduction, and distribution of the Work otherwise complies with + the conditions stated in this License. + + 5. Submission of Contributions. Unless You explicitly state otherwise, + any Contribution intentionally submitted for inclusion in the Work + by You to the Licensor shall be under the terms and conditions of + this License, without any additional terms or conditions. + Notwithstanding the above, nothing herein shall supersede or modify + the terms of any separate license agreement you may have executed + with Licensor regarding such Contributions. + + 6. Trademarks. This License does not grant permission to use the trade + names, trademarks, service marks, or product names of the Licensor, + except as required for reasonable and customary use in describing the + origin of the Work and reproducing the content of the NOTICE file. + + 7. Disclaimer of Warranty. 
Unless required by applicable law or + agreed to in writing, Licensor provides the Work (and each + Contributor provides its Contributions) on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or + implied, including, without limitation, any warranties or conditions + of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A + PARTICULAR PURPOSE. You are solely responsible for determining the + appropriateness of using or redistributing the Work and assume any + risks associated with Your exercise of permissions under this License. + + 8. Limitation of Liability. In no event and under no legal theory, + whether in tort (including negligence), contract, or otherwise, + unless required by applicable law (such as deliberate and grossly + negligent acts) or agreed to in writing, shall any Contributor be + liable to You for damages, including any direct, indirect, special, + incidental, or consequential damages of any character arising as a + result of this License or out of the use or inability to use the + Work (including but not limited to damages for loss of goodwill, + work stoppage, computer failure or malfunction, or any and all + other commercial damages or losses), even if such Contributor + has been advised of the possibility of such damages. + + 9. Accepting Warranty or Additional Liability. While redistributing + the Work or Derivative Works thereof, You may choose to offer, + and charge a fee for, acceptance of support, warranty, indemnity, + or other liability obligations and/or rights consistent with this + License. However, in accepting such obligations, You may act only + on Your own behalf and on Your sole responsibility, not on behalf + of any other Contributor, and only if You agree to indemnify, + defend, and hold each Contributor harmless for any liability + incurred by, or claims asserted against, such Contributor by reason + of your accepting any such warranty or additional liability. 
+ + END OF TERMS AND CONDITIONS diff --git a/vendor/github.com/dgraph-io/badger/README.md b/vendor/github.com/dgraph-io/badger/README.md new file mode 100644 index 0000000000..fe033d9cb5 --- /dev/null +++ b/vendor/github.com/dgraph-io/badger/README.md @@ -0,0 +1,859 @@ +# BadgerDB [![GoDoc](https://godoc.org/github.com/dgraph-io/badger?status.svg)](https://godoc.org/github.com/dgraph-io/badger) [![Go Report Card](https://goreportcard.com/badge/github.com/dgraph-io/badger)](https://goreportcard.com/report/github.com/dgraph-io/badger) [![Sourcegraph](https://sourcegraph.com/github.com/dgraph-io/badger/-/badge.svg)](https://sourcegraph.com/github.com/dgraph-io/badger?badge) [![Build Status](https://teamcity.dgraph.io/guestAuth/app/rest/builds/buildType:(id:Badger_UnitTests)/statusIcon.svg)](https://teamcity.dgraph.io/viewLog.html?buildTypeId=Badger_UnitTests&buildId=lastFinished&guest=1) ![Appveyor](https://ci.appveyor.com/api/projects/status/github/dgraph-io/badger?branch=master&svg=true) [![Coverage Status](https://coveralls.io/repos/github/dgraph-io/badger/badge.svg?branch=master)](https://coveralls.io/github/dgraph-io/badger?branch=master) + +![Badger mascot](images/diggy-shadow.png) + +BadgerDB is an embeddable, persistent and fast key-value (KV) database +written in pure Go. It's meant to be a performant alternative to non-Go-based +key-value stores like [RocksDB](https://github.com/facebook/rocksdb). + +## Project Status [Jun 26, 2019] + +Badger is stable and is being used to serve data sets worth hundreds of +terabytes. Badger supports concurrent ACID transactions with serializable +snapshot isolation (SSI) guarantees. A Jepsen-style bank test runs nightly for +8h, with the `--race` flag, and ensures maintenance of transactional guarantees. +Badger has also been tested to work with filesystem level anomalies, to ensure +persistence and consistency. + +Badger v1.0 was released in Nov 2017, and the latest version that is data-compatible +with v1.0 is v1.6.0. 
+ +Badger v2.0, a new release coming up very soon will use a new storage format which won't +be compatible with all of the v1.x. The [Changelog] is kept fairly up-to-date. + +For more details on our version naming schema please read [Choosing a version](#choosing-a-version). + +[Changelog]:https://github.com/dgraph-io/badger/blob/master/CHANGELOG.md + +## Table of Contents + * [Getting Started](#getting-started) + + [Installing](#installing) + - [Choosing a version](#choosing-a-version) + + [Opening a database](#opening-a-database) + + [Transactions](#transactions) + - [Read-only transactions](#read-only-transactions) + - [Read-write transactions](#read-write-transactions) + - [Managing transactions manually](#managing-transactions-manually) + + [Using key/value pairs](#using-keyvalue-pairs) + + [Monotonically increasing integers](#monotonically-increasing-integers) + * [Merge Operations](#merge-operations) + + [Setting Time To Live(TTL) and User Metadata on Keys](#setting-time-to-livettl-and-user-metadata-on-keys) + + [Iterating over keys](#iterating-over-keys) + - [Prefix scans](#prefix-scans) + - [Key-only iteration](#key-only-iteration) + + [Stream](#stream) + + [Garbage Collection](#garbage-collection) + + [Database backup](#database-backup) + + [Memory usage](#memory-usage) + + [Statistics](#statistics) + * [Resources](#resources) + + [Blog Posts](#blog-posts) + * [Contact](#contact) + * [Design](#design) + + [Comparisons](#comparisons) + + [Benchmarks](#benchmarks) + * [Other Projects Using Badger](#other-projects-using-badger) + * [Frequently Asked Questions](#frequently-asked-questions) + +## Getting Started + +### Installing +To start using Badger, install Go 1.11 or above and run `go get`: + +```sh +$ go get github.com/dgraph-io/badger/... +``` + +This will retrieve the library and install the `badger` command line +utility into your `$GOBIN` path. 
+ +#### Choosing a version + +BadgerDB is a pretty special package from the point of view that the most important change we can +make to it is not on its API but rather on how data is stored on disk. + +This is why we follow a version naming schema that differs from Semantic Versioning. + +- New major versions are released when the data format on disk changes in an incompatible way. +- New minor versions are released whenever the API changes but data compatibility is maintained. + Note that the changes to the API could be backward-incompatible - unlike Semantic Versioning. +- New patch versions are released when there are no changes to the data format or the API. + +Following these rules: + +- v1.5.0 and v1.6.0 can be used on top of the same files without any concerns, as their major + version is the same, therefore the data format on disk is compatible. +- v1.6.0 and v2.0.0 are data incompatible as their major version implies, so files created with + v1.6.0 will need to be converted into the new format before they can be used by v2.0.0. + +For a longer explanation on the reasons behind using a new versioning naming schema, you can read +[VERSIONING.md](VERSIONING.md). + +### Opening a database +The top-level object in Badger is a `DB`. It represents multiple files on disk +in specific directories, which contain the data for a single database. + +To open your database, use the `badger.Open()` function, with the appropriate +options. The `Dir` and `ValueDir` options are mandatory and must be +specified by the client. They can be set to the same value to simplify things. + +```go +package main + +import ( + "log" + + badger "github.com/dgraph-io/badger" +) + +func main() { + // Open the Badger database located in the /tmp/badger directory. + // It will be created if it doesn't exist. 
+ db, err := badger.Open(badger.DefaultOptions("/tmp/badger")) + if err != nil { + log.Fatal(err) + } + defer db.Close() +  // Your code here… +} +``` + +Please note that Badger obtains a lock on the directories so multiple processes +cannot open the same database at the same time. + +### Transactions + +#### Read-only transactions +To start a read-only transaction, you can use the `DB.View()` method: + +```go +err := db.View(func(txn *badger.Txn) error { +  // Your code here… +  return nil +}) +``` + +You cannot perform any writes or deletes within this transaction. Badger +ensures that you get a consistent view of the database within this closure. Any +writes that happen elsewhere after the transaction has started, will not be +seen by calls made within the closure. + +#### Read-write transactions +To start a read-write transaction, you can use the `DB.Update()` method: + +```go +err := db.Update(func(txn *badger.Txn) error { +  // Your code here… +  return nil +}) +``` + +All database operations are allowed inside a read-write transaction. + +Always check the returned error value. If you return an error +within your closure it will be passed through. + +An `ErrConflict` error will be reported in case of a conflict. Depending on the state +of your application, you have the option to retry the operation if you receive +this error. + +An `ErrTxnTooBig` will be reported in case the number of pending writes/deletes in +the transaction exceeds a certain limit. In that case, it is best to commit the +transaction and start a new transaction immediately.
Here is an example (we are +not checking for errors in some places for simplicity): + +```go +updates := make(map[string]string) +txn := db.NewTransaction(true) +for k,v := range updates { + if err := txn.Set([]byte(k),[]byte(v)); err == ErrTxnTooBig { + _ = txn.Commit() + txn = db.NewTransaction(true) + _ = txn.Set([]byte(k),[]byte(v)) + } +} +_ = txn.Commit() +``` + +#### Managing transactions manually +The `DB.View()` and `DB.Update()` methods are wrappers around the +`DB.NewTransaction()` and `Txn.Commit()` methods (or `Txn.Discard()` in case of +read-only transactions). These helper methods will start the transaction, +execute a function, and then safely discard your transaction if an error is +returned. This is the recommended way to use Badger transactions. + +However, sometimes you may want to manually create and commit your +transactions. You can use the `DB.NewTransaction()` function directly, which +takes in a boolean argument to specify whether a read-write transaction is +required. For read-write transactions, it is necessary to call `Txn.Commit()` +to ensure the transaction is committed. For read-only transactions, calling +`Txn.Discard()` is sufficient. `Txn.Commit()` also calls `Txn.Discard()` +internally to cleanup the transaction, so just calling `Txn.Commit()` is +sufficient for read-write transaction. However, if your code doesn’t call +`Txn.Commit()` for some reason (for e.g it returns prematurely with an error), +then please make sure you call `Txn.Discard()` in a `defer` block. Refer to the +code below. + +```go +// Start a writable transaction. +txn := db.NewTransaction(true) +defer txn.Discard() + +// Use the transaction... +err := txn.Set([]byte("answer"), []byte("42")) +if err != nil { + return err +} + +// Commit the transaction and check for error. +if err := txn.Commit(); err != nil { + return err +} +``` + +The first argument to `DB.NewTransaction()` is a boolean stating if the transaction +should be writable. 
+ +Badger allows an optional callback to the `Txn.Commit()` method. Normally, the +callback can be set to `nil`, and the method will return after all the writes +have succeeded. However, if this callback is provided, the `Txn.Commit()` +method returns as soon as it has checked for any conflicts. The actual writing +to the disk happens asynchronously, and the callback is invoked once the +writing has finished, or an error has occurred. This can improve the throughput +of the application in some cases. But it also means that a transaction is not +durable until the callback has been invoked with a `nil` error value. + +### Using key/value pairs +To save a key/value pair, use the `Txn.Set()` method: + +```go +err := db.Update(func(txn *badger.Txn) error { + err := txn.Set([]byte("answer"), []byte("42")) + return err +}) +``` + +A key/value pair can also be saved by first creating an `Entry`, then setting this +`Entry` using `Txn.SetEntry()`. `Entry` also exposes methods to set properties +on it. + +```go +err := db.Update(func(txn *badger.Txn) error { + e := NewEntry([]byte("answer"), []byte("42")) + err := txn.SetEntry(e) + return err +}) +``` + +This will set the value of the `"answer"` key to `"42"`. To retrieve this +value, we can use the `Txn.Get()` method: + +```go +err := db.View(func(txn *badger.Txn) error { + item, err := txn.Get([]byte("answer")) + handle(err) + + var valNot, valCopy []byte + err = item.Value(func(val []byte) error { + // This func with val would only be called if item.Value encounters no error. + + // Accessing val here is valid. + fmt.Printf("The answer is: %s\n", val) + + // Copying or parsing val is valid. + valCopy = append([]byte{}, val...) + + // Assigning val slice to another variable is NOT OK. + valNot = val // Do not do this. + return nil + }) + handle(err) + + // DO NOT access val here. It is the most common cause of bugs. + fmt.Printf("NEVER do this. %s\n", valNot) + + // You must copy it to use it outside item.Value(...).
+ fmt.Printf("The answer is: %s\n", valCopy) + + // Alternatively, you could also use item.ValueCopy(). + valCopy, err = item.ValueCopy(nil) + handle(err) + fmt.Printf("The answer is: %s\n", valCopy) + + return nil +}) +``` + +`Txn.Get()` returns `ErrKeyNotFound` if the value is not found. + +Please note that values returned from `Get()` are only valid while the +transaction is open. If you need to use a value outside of the transaction +then you must use `copy()` to copy it to another byte slice. + +Use the `Txn.Delete()` method to delete a key. + +### Monotonically increasing integers + +To get unique monotonically increasing integers with strong durability, you can +use the `DB.GetSequence` method. This method returns a `Sequence` object, which +is thread-safe and can be used concurrently via various goroutines. + +Badger would lease a range of integers to hand out from memory, with the +bandwidth provided to `DB.GetSequence`. The frequency at which disk writes are +done is determined by this lease bandwidth and the frequency of `Next` +invocations. Setting the bandwidth too low would cause more disk writes; setting it +too high would result in wasted integers if Badger is closed or crashes. +To avoid wasted integers, call `Release` before closing Badger. + +```go +seq, err := db.GetSequence(key, 1000) +defer seq.Release() +for { + num, err := seq.Next() +} +``` + +### Merge Operations +Badger provides support for ordered merge operations. You can define a func +of type `MergeFunc` which takes in an existing value, and a value to be +_merged_ with it. It returns a new value which is the result of the _merge_ +operation. All values are specified in byte arrays. For example, here is a merge +function (`add`) which appends a `[]byte` value to an existing `[]byte` value. + +```Go +// Merge function to append one byte slice to another +func add(originalValue, newValue []byte) []byte { + return append(originalValue, newValue...)
+} +``` + +This function can then be passed to the `DB.GetMergeOperator()` method, along +with a key, and a duration value. The duration specifies how often the merge +function is run on values that have been added using the `MergeOperator.Add()` +method. + +`MergeOperator.Get()` method can be used to retrieve the cumulative value of the key +associated with the merge operation. + +```Go +key := []byte("merge") + +m := db.GetMergeOperator(key, add, 200*time.Millisecond) +defer m.Stop() + +m.Add([]byte("A")) +m.Add([]byte("B")) +m.Add([]byte("C")) + +res, _ := m.Get() // res should have value ABC encoded +``` + +Example: Merge operator which increments a counter + +```Go +func uint64ToBytes(i uint64) []byte { + var buf [8]byte + binary.BigEndian.PutUint64(buf[:], i) + return buf[:] +} + +func bytesToUint64(b []byte) uint64 { + return binary.BigEndian.Uint64(b) +} + +// Merge function to add two uint64 numbers +func add(existing, new []byte) []byte { + return uint64ToBytes(bytesToUint64(existing) + bytesToUint64(new)) +} +``` +It can be used as +```Go +key := []byte("merge") + +m := db.GetMergeOperator(key, add, 200*time.Millisecond) +defer m.Stop() + +m.Add(uint64ToBytes(1)) +m.Add(uint64ToBytes(2)) +m.Add(uint64ToBytes(3)) + +res, _ := m.Get() // res should have value 6 encoded +``` + +### Setting Time To Live(TTL) and User Metadata on Keys +Badger allows setting an optional Time to Live (TTL) value on keys. Once the TTL has +elapsed, the key will no longer be retrievable and will be eligible for garbage +collection. A TTL can be set as a `time.Duration` value using the `Entry.WithTTL()` +and `Txn.SetEntry()` API methods. + +```go +err := db.Update(func(txn *badger.Txn) error { + e := NewEntry([]byte("answer"), []byte("42")).WithTTL(time.Hour) + err := txn.SetEntry(e) + return err +}) +``` + +An optional user metadata value can be set on each key. A user metadata value +is represented by a single byte. 
It can be used to set certain bits along +with the key to aid in interpreting or decoding the key-value pair. User +metadata can be set using `Entry.WithMeta()` and `Txn.SetEntry()` API methods. + +```go +err := db.Update(func(txn *badger.Txn) error { + e := NewEntry([]byte("answer"), []byte("42")).WithMeta(byte(1)) + err := txn.SetEntry(e) + return err +}) +``` + +`Entry` APIs can be used to add the user metadata and TTL for same key. This `Entry` +then can be set using `Txn.SetEntry()`. + +```go +err := db.Update(func(txn *badger.Txn) error { + e := NewEntry([]byte("answer"), []byte("42")).WithMeta(byte(1)).WithTTL(time.Hour) + err := txn.SetEntry(e) + return err +}) +``` + +### Iterating over keys +To iterate over keys, we can use an `Iterator`, which can be obtained using the +`Txn.NewIterator()` method. Iteration happens in byte-wise lexicographical sorting +order. + + +```go +err := db.View(func(txn *badger.Txn) error { + opts := badger.DefaultIteratorOptions + opts.PrefetchSize = 10 + it := txn.NewIterator(opts) + defer it.Close() + for it.Rewind(); it.Valid(); it.Next() { + item := it.Item() + k := item.Key() + err := item.Value(func(v []byte) error { + fmt.Printf("key=%s, value=%s\n", k, v) + return nil + }) + if err != nil { + return err + } + } + return nil +}) +``` + +The iterator allows you to move to a specific point in the list of keys and move +forward or backward through the keys one at a time. + +By default, Badger prefetches the values of the next 100 items. You can adjust +that with the `IteratorOptions.PrefetchSize` field. However, setting it to +a value higher than GOMAXPROCS (which we recommend to be 128 or higher) +shouldn’t give any additional benefits. You can also turn off the fetching of +values altogether. See section below on key-only iteration. 
+ +#### Prefix scans +To iterate over a key prefix, you can combine `Seek()` and `ValidForPrefix()`: + +```go +db.View(func(txn *badger.Txn) error { + it := txn.NewIterator(badger.DefaultIteratorOptions) + defer it.Close() + prefix := []byte("1234") + for it.Seek(prefix); it.ValidForPrefix(prefix); it.Next() { + item := it.Item() + k := item.Key() + err := item.Value(func(v []byte) error { + fmt.Printf("key=%s, value=%s\n", k, v) + return nil + }) + if err != nil { + return err + } + } + return nil +}) +``` + +#### Key-only iteration +Badger supports a unique mode of iteration called _key-only_ iteration. It is +several orders of magnitude faster than regular iteration, because it involves +access to the LSM-tree only, which is usually resident entirely in RAM. To +enable key-only iteration, you need to set the `IteratorOptions.PrefetchValues` +field to `false`. This can also be used to do sparse reads for selected keys +during an iteration, by calling `item.Value()` only when required. + +```go +err := db.View(func(txn *badger.Txn) error { + opts := badger.DefaultIteratorOptions + opts.PrefetchValues = false + it := txn.NewIterator(opts) + defer it.Close() + for it.Rewind(); it.Valid(); it.Next() { + item := it.Item() + k := item.Key() + fmt.Printf("key=%s\n", k) + } + return nil +}) +``` + +### Stream +Badger provides a Stream framework, which concurrently iterates over all or a +portion of the DB, converting data into custom key-values, and streams it out +serially to be sent over the network, written to disk, or even written back to +Badger. This is a much faster way to iterate over Badger than using a single +Iterator. Stream supports Badger in both managed and normal mode. + +Stream uses the natural boundaries created by SSTables within the LSM tree, to +quickly generate key ranges. Each goroutine then picks a range and runs an +iterator to iterate over it.
Each iterator iterates over all versions of values +and is created from the same transaction, thus working over a snapshot of the +DB. Every time a new key is encountered, it calls `ChooseKey(item)`, followed +by `KeyToList(key, itr)`. This allows a user to select or reject that key, and +if selected, convert the value versions into custom key-values. The goroutine +batches up 4MB worth of key-values, before sending it over to a channel. +Another goroutine further batches up data from this channel using *smart +batching* algorithm and calls `Send` serially. + +This framework is designed for high throughput key-value iteration, spreading +the work of iteration across many goroutines. `DB.Backup` uses this framework to +provide full and incremental backups quickly. Dgraph is a heavy user of this +framework. In fact, this framework was developed and used within Dgraph, before +getting ported over to Badger. + +```go +stream := db.NewStream() +// db.NewStreamAt(readTs) for managed mode. + +// -- Optional settings +stream.NumGo = 16 // Set number of goroutines to use for iteration. +stream.Prefix = []byte("some-prefix") // Leave nil for iteration over the whole DB. +stream.LogPrefix = "Badger.Streaming" // For identifying stream logs. Outputs to Logger. + +// ChooseKey is called concurrently for every key. If left nil, assumes true by default. +stream.ChooseKey = func(item *badger.Item) bool { + return bytes.HasSuffix(item.Key(), []byte("er")) +} + +// KeyToList is called concurrently for chosen keys. This can be used to convert +// Badger data into custom key-values. If nil, uses stream.ToList, a default +// implementation, which picks all valid key-values. +stream.KeyToList = nil + +// -- End of optional settings. + +// Send is called serially, while Stream.Orchestrate is running. +stream.Send = func(list *pb.KVList) error { + return proto.MarshalText(w, list) // Write to w. 
+} + +// Run the stream +if err := stream.Orchestrate(context.Background()); err != nil { + return err +} +// Done. +``` + +### Garbage Collection +Badger values need to be garbage collected, because of two reasons: + +* Badger keeps values separately from the LSM tree. This means that the compaction operations +that clean up the LSM tree do not touch the values at all. Values need to be cleaned up +separately. + +* Concurrent read/write transactions could leave behind multiple values for a single key, because they +are stored with different versions. These could accumulate, and take up unneeded space beyond the +time these older versions are needed. + +Badger relies on the client to perform garbage collection at a time of their choosing. It provides +the following method, which can be invoked at an appropriate time: + +* `DB.RunValueLogGC()`: This method is designed to do garbage collection while + Badger is online. Along with randomly picking a file, it uses statistics generated by the + LSM-tree compactions to pick files that are likely to lead to maximum space + reclamation. It is recommended to be called during periods of low activity in + your system, or periodically. One call would only result in removal of at max + one log file. As an optimization, you could also immediately re-run it whenever + it returns nil error (indicating a successful value log GC), as shown below. + + ```go + ticker := time.NewTicker(5 * time.Minute) + defer ticker.Stop() + for range ticker.C { + again: + err := db.RunValueLogGC(0.7) + if err == nil { + goto again + } + } + ``` + +* `DB.PurgeOlderVersions()`: This method is **DEPRECATED** since v1.5.0. Now, Badger's LSM tree automatically discards older/invalid versions of keys. + +**Note: The RunValueLogGC method would not garbage collect the latest value log.** + +### Database backup +There are two public API methods `DB.Backup()` and `DB.Load()` which can be +used to do online backups and restores. 
Badger v0.9 provides a CLI tool +`badger`, which can do offline backup/restore. Make sure you have `$GOPATH/bin` +in your PATH to use this tool. + +The command below will create a version-agnostic backup of the database, to a +file `badger.bak` in the current working directory: + +``` +badger backup --dir <path/to/badgerdb> +``` + +To restore `badger.bak` in the current working directory to a new database: + +``` +badger restore --dir <path/to/badgerdb> +``` + +See `badger --help` for more details. + +If you have a Badger database that was created using v0.8 (or below), you can +use the `badger_backup` tool provided in v0.8.1, and then restore it using the +command above to upgrade your database to work with the latest version. + +``` +badger_backup --dir <path/to/badgerdb> --backup-file badger.bak +``` + +We recommend all users to use the `Backup` and `Restore` APIs and tools. However, +Badger is also rsync-friendly because all files are immutable, barring the +latest value log which is append-only. So, rsync can be used as a rudimentary way +to perform a backup. In the following script, we repeat rsync to ensure that the +LSM tree remains consistent with the MANIFEST file while doing a full backup. + +``` +#!/bin/bash +set -o history +set -o histexpand +# Makes a complete copy of a Badger database directory. +# Repeat rsync if the MANIFEST and SSTables are updated. +rsync -avz --delete db/ dst +while !! | grep -q "(MANIFEST\|\.sst)$"; do :; done +``` + +### Memory usage +Badger's memory usage can be managed by tweaking several options available in +the `Options` struct that is passed in when opening the database using +`DB.Open`. + +- `Options.ValueLogLoadingMode` can be set to `options.FileIO` (instead of the + default `options.MemoryMap`) to avoid memory-mapping log files. This can be + useful in environments with low RAM. +- Number of memtables (`Options.NumMemtables`) + - If you modify `Options.NumMemtables`, also adjust `Options.NumLevelZeroTables` and + `Options.NumLevelZeroTablesStall` accordingly.
+- Number of concurrent compactions (`Options.NumCompactors`) +- Mode in which LSM tree is loaded (`Options.TableLoadingMode`) +- Size of table (`Options.MaxTableSize`) +- Size of value log file (`Options.ValueLogFileSize`) + +If you want to decrease the memory usage of Badger instance, tweak these +options (ideally one at a time) until you achieve the desired +memory usage. + +### Statistics +Badger records metrics using the [expvar] package, which is included in the Go +standard library. All the metrics are documented in [y/metrics.go][metrics] +file. + +`expvar` package adds a handler in to the default HTTP server (which has to be +started explicitly), and serves up the metrics at the `/debug/vars` endpoint. +These metrics can then be collected by a system like [Prometheus], to get +better visibility into what Badger is doing. + +[expvar]: https://golang.org/pkg/expvar/ +[metrics]: https://github.com/dgraph-io/badger/blob/master/y/metrics.go +[Prometheus]: https://prometheus.io/ + +## Resources + +### Blog Posts +1. [Introducing Badger: A fast key-value store written natively in +Go](https://open.dgraph.io/post/badger/) +2. [Make Badger crash resilient with ALICE](https://blog.dgraph.io/post/alice/) +3. [Badger vs LMDB vs BoltDB: Benchmarking key-value databases in Go](https://blog.dgraph.io/post/badger-lmdb-boltdb/) +4. [Concurrent ACID Transactions in Badger](https://blog.dgraph.io/post/badger-txn/) + +## Design +Badger was written with these design goals in mind: + +- Write a key-value database in pure Go. +- Use latest research to build the fastest KV database for data sets spanning terabytes. +- Optimize for SSDs. + +Badger’s design is based on a paper titled _[WiscKey: Separating Keys from +Values in SSD-conscious Storage][wisckey]_. 
+ +[wisckey]: https://www.usenix.org/system/files/conference/fast16/fast16-papers-lu.pdf + +### Comparisons +| Feature | Badger | RocksDB | BoltDB | +| ------- | ------ | ------- | ------ | +| Design | LSM tree with value log | LSM tree only | B+ tree | +| High Read throughput | Yes | No | Yes | +| High Write throughput | Yes | Yes | No | +| Designed for SSDs | Yes (with latest research <sup>1</sup>) | Not specifically <sup>2</sup> | No | +| Embeddable | Yes | Yes | Yes | +| Sorted KV access | Yes | Yes | Yes | +| Pure Go (no Cgo) | Yes | No | Yes | +| Transactions | Yes, ACID, concurrent with SSI<sup>3</sup> | Yes (but non-ACID) | Yes, ACID | +| Snapshots | Yes | Yes | Yes | +| TTL support | Yes | Yes | No | +| 3D access (key-value-version) | Yes<sup>4</sup> | No | No | + +<sup>1</sup> The [WISCKEY paper][wisckey] (on which Badger is based) saw big +wins with separating values from keys, significantly reducing the write +amplification compared to a typical LSM tree. + +<sup>2</sup> RocksDB is an SSD optimized version of LevelDB, which was designed specifically for rotating disks. +As such RocksDB's design isn't aimed at SSDs. + +<sup>3</sup> SSI: Serializable Snapshot Isolation. For more details, see the blog post [Concurrent ACID Transactions in Badger](https://blog.dgraph.io/post/badger-txn/). + +<sup>4</sup> Badger provides direct access to value versions via its Iterator API. +Users can also specify how many versions to keep per key via Options. + +### Benchmarks +We have run comprehensive benchmarks against RocksDB, Bolt and LMDB. The +benchmarking code, and the detailed logs for the benchmarks can be found in the +[badger-bench] repo. More explanation, including graphs, can be found in the blog posts (linked +above). + +[badger-bench]: https://github.com/dgraph-io/badger-bench + +## Other Projects Using Badger +Below is a list of known projects that use Badger: + +* [0-stor](https://github.com/zero-os/0-stor) - Single device object store. +* [Dgraph](https://github.com/dgraph-io/dgraph) - Distributed graph database.
+* [Dispatch Protocol](https://github.com/dispatchlabs/disgo) - Blockchain protocol for distributed application data analytics. +* [Sandglass](https://github.com/celrenheit/sandglass) - distributed, horizontally scalable, persistent, time sorted message queue. +* [Usenet Express](https://usenetexpress.com/) - Serving over 300TB of data with Badger. +* [go-ipfs](https://github.com/ipfs/go-ipfs) - Go client for the InterPlanetary File System (IPFS), a new hypermedia distribution protocol. +* [gorush](https://github.com/appleboy/gorush) - A push notification server written in Go. +* [emitter](https://github.com/emitter-io/emitter) - Scalable, low latency, distributed pub/sub broker with message storage, uses MQTT, gossip and badger. +* [GarageMQ](https://github.com/valinurovam/garagemq) - AMQP server written in Go. +* [RedixDB](https://alash3al.github.io/redix/) - A real-time persistent key-value store with the same redis protocol. +* [BBVA](https://github.com/BBVA/raft-badger) - Raft backend implementation using BadgerDB for Hashicorp raft. +* [Riot](https://github.com/go-ego/riot) - An open-source, distributed search engine. +* [Fantom](https://github.com/Fantom-foundation/go-lachesis) - aBFT Consensus platform for distributed applications. +* [decred](https://github.com/decred/dcrdata) - An open, progressive, and self-funding cryptocurrency with a system of community-based governance integrated into its blockchain. +* [OpenNetSys](https://github.com/opennetsys/c3-go) - Create useful dApps in any software language. +* [HoneyTrap](https://github.com/honeytrap/honeytrap) - An extensible and opensource system for running, monitoring and managing honeypots. +* [Insolar](https://github.com/insolar/insolar) - Enterprise-ready blockchain platform. +* [IoTeX](https://github.com/iotexproject/iotex-core) - The next generation of the decentralized network for IoT powered by scalability- and privacy-centric blockchains. 
+* [go-sessions](https://github.com/kataras/go-sessions) - The sessions manager for Go net/http and fasthttp. +* [Babble](https://github.com/mosaicnetworks/babble) - BFT Consensus platform for distributed applications. +* [Tormenta](https://github.com/jpincas/tormenta) - Embedded object-persistence layer / simple JSON database for Go projects. +* [BadgerHold](https://github.com/timshannon/badgerhold) - An embeddable NoSQL store for querying Go types built on Badger +* [Goblero](https://github.com/didil/goblero) - Pure Go embedded persistent job queue backed by BadgerDB +* [Surfline](https://www.surfline.com) - Serving global wave and weather forecast data with Badger. +* [Cete](https://github.com/mosuka/cete) - Simple and highly available distributed key-value store built on Badger. Makes it easy bringing up a cluster of Badger with Raft consensus algorithm by hashicorp/raft. +* [Volument](https://volument.com/) - A new take on website analytics backed by Badger. + +If you are using Badger in a project please send a pull request to add it to the list. + +## Frequently Asked Questions +- **My writes are getting stuck. Why?** + +**Update: With the new `Value(func(v []byte))` API, this deadlock can no longer +happen.** + +The following is true for users on Badger v1.x. + +This can happen if a long running iteration with `Prefetch` is set to false, but +a `Item::Value` call is made internally in the loop. That causes Badger to +acquire read locks over the value log files to avoid value log GC removing the +file from underneath. As a side effect, this also blocks a new value log GC +file from being created, when the value log file boundary is hit. + +Please see Github issues [#293](https://github.com/dgraph-io/badger/issues/293) +and [#315](https://github.com/dgraph-io/badger/issues/315). + +There are multiple workarounds during iteration: + +1. Use `Item::ValueCopy` instead of `Item::Value` when retrieving value. +1. Set `Prefetch` to true. 
Badger would then copy over the value and release the + file lock immediately. +1. When `Prefetch` is false, don't call `Item::Value` and do a pure key-only + iteration. This might be useful if you just want to delete a lot of keys. +1. Do the writes in a separate transaction after the reads. + +- **My writes are really slow. Why?** + +Are you creating a new transaction for every single key update, and waiting for +it to `Commit` fully before creating a new one? This will lead to very low +throughput. + +We have created the `WriteBatch` API which provides a way to batch up +many updates into a single transaction and `Commit` that transaction using +callbacks to avoid blocking. This amortizes the cost of a transaction really +well, and provides the most efficient way to do bulk writes. + +```go +wb := db.NewWriteBatch() +defer wb.Cancel() + +for i := 0; i < N; i++ { + err := wb.Set(key(i), value(i), 0) // Will create txns as needed. + handle(err) +} +handle(wb.Flush()) // Wait for all txns to finish. +``` + +Note that the `WriteBatch` API does not allow any reads. For read-modify-write +workloads, you should be using the `Transaction` API. + +- **I don't see any disk write. Why?** + +If you're using Badger with `SyncWrites=false`, then your writes might not be written to the value log +and won't get synced to disk immediately. Writes to the LSM tree are done in memory first, before they +get compacted to disk. The compaction would only happen once `MaxTableSize` has been reached. So, if +you're doing a few writes and then checking, you might not see anything on disk. Once you `Close` +the database, you'll see these writes on disk. + +- **Reverse iteration doesn't give me the right results.** + +Just like forward iteration goes to the first key which is equal or greater than the SEEK key, reverse iteration goes to the first key which is equal or lesser than the SEEK key. Therefore, SEEK key would not be part of the results.
You can typically add a `0xff` byte as a suffix to the SEEK key to include it in the results. See the following issues: [#436](https://github.com/dgraph-io/badger/issues/436) and [#347](https://github.com/dgraph-io/badger/issues/347). + +- **Which instances should I use for Badger?** + +We recommend using instances which provide local SSD storage, without any limit +on the maximum IOPS. In AWS, these are storage optimized instances like i3. They +provide local SSDs which clock 100K IOPS over 4KB blocks easily. + +- **I'm getting a closed channel error. Why?** + +``` +panic: close of closed channel +panic: send on closed channel +``` + +If you're seeing panics like the ones above, this would be because you're operating on a closed DB. This can happen if you call `Close()` before sending a write, or multiple times. You should ensure that you only call `Close()` once, and all your read/write operations finish before closing. + +- **Are there any Go specific settings that I should use?** + +We *highly* recommend setting a high number for GOMAXPROCS, which allows Go to +observe the full IOPS throughput provided by modern SSDs. In Dgraph, we have set +it to 128. For more details, [see this +thread](https://groups.google.com/d/topic/golang-nuts/jPb_h3TvlKE/discussion). + +- **Are there any Linux-specific settings that I should use?** + +We recommend setting max file descriptors to a high number depending upon the expected size of your data. + +## Contact +- Please use [discuss.dgraph.io](https://discuss.dgraph.io) for questions, feature requests and discussions. +- Please use [Github issue tracker](https://github.com/dgraph-io/badger/issues) for filing bugs or feature requests. +- Join [![Slack Status](http://slack.dgraph.io/badge.svg)](http://slack.dgraph.io). +- Follow us on Twitter [@dgraphlabs](https://twitter.com/dgraphlabs).
+ diff --git a/vendor/github.com/dgraph-io/badger/VERSIONING.md b/vendor/github.com/dgraph-io/badger/VERSIONING.md new file mode 100644 index 0000000000..a890a36ffb --- /dev/null +++ b/vendor/github.com/dgraph-io/badger/VERSIONING.md @@ -0,0 +1,47 @@ +# Serialization Versioning: Semantic Versioning for databases + +Semantic Versioning, commonly known as SemVer, is a great idea that has been very widely adopted as +a way to decide how to name software versions. The whole concept is very well summarized on +semver.org with the following lines: + +> Given a version number MAJOR.MINOR.PATCH, increment the: +> +> 1. MAJOR version when you make incompatible API changes, +> 2. MINOR version when you add functionality in a backwards-compatible manner, and +> 3. PATCH version when you make backwards-compatible bug fixes. +> +> Additional labels for pre-release and build metadata are available as extensions to the +> MAJOR.MINOR.PATCH format. + +Unfortunately, API changes are not the most important changes for libraries that serialize data for +later consumption. For these libraries, such as BadgerDB, changes to the API are much easier to +handle than change to the data format used to store data on disk. + +## Serialization Version specification + +Serialization Versioning, like Semantic Versioning, uses 3 numbers and also calls them +MAJOR.MINOR.PATCH, but the semantics of the numbers are slightly modified: + +Given a version number MAJOR.MINOR.PATCH, increment the: + +- MAJOR version when you make changes that require a transformation of the dataset before it can be +used again. +- MINOR version when old datasets are still readable but the API might have changed in +backwards-compatible or incompatible ways. +- PATCH version when you make backwards-compatible bug fixes. + +Additional labels for pre-release and build metadata are available as extensions to the +MAJOR.MINOR.PATCH format. 
+ +Following this naming strategy, migration from v1.x to v2.x requires a migration strategy for your +existing dataset, and as such has to be carefully planned. Migrations in between different minor +versions (e.g. v1.5.x and v1.6.x) might break your build, as the API *might* have changed, but once +your code compiles there's no need for any data migration. Lastly, changes in between two different +patch versions should never break your build or dataset. + +For more background on our decision to adopt Serialization Versioning, read the blog post +[Semantic Versioning, Go Modules, and Databases][blog] and the original proposal on +[this comment on Dgraph's Discuss forum][discuss]. + +[blog]: https://blog.dgraph.io/post/serialization-versioning/ +[discuss]: https://discuss.dgraph.io/t/go-modules-on-badger-and-dgraph/4662/7 \ No newline at end of file diff --git a/vendor/github.com/dgraph-io/badger/appveyor.yml b/vendor/github.com/dgraph-io/badger/appveyor.yml new file mode 100644 index 0000000000..afa54ca0ad --- /dev/null +++ b/vendor/github.com/dgraph-io/badger/appveyor.yml @@ -0,0 +1,49 @@ +# version format +version: "{build}" + +# Operating system (build VM template) +os: Windows Server 2012 R2 + +# Platform. +platform: x64 + +clone_folder: c:\gopath\src\github.com\dgraph-io\badger + +# Environment variables +environment: + GOVERSION: 1.8.3 + GOPATH: c:\gopath + GO111MODULE: on + +# scripts that run after cloning repository +install: + - set PATH=%GOPATH%\bin;c:\go\bin;%PATH% + - go version + - go env + - python --version + +# To run your custom scripts instead of automatic MSBuild +build_script: + # We need to disable firewall - https://github.com/appveyor/ci/issues/1579#issuecomment-309830648 + - ps: Disable-NetFirewallRule -DisplayName 'File and Printer Sharing (SMB-Out)' + - cd c:\gopath\src\github.com\dgraph-io\badger + - git branch + - go get -t ./... 
+ +# To run your custom scripts instead of automatic tests +test_script: + # Unit tests + - ps: Add-AppveyorTest "Unit Tests" -Outcome Running + - go test -v github.com/dgraph-io/badger/... + - go test -v -vlog_mmap=false github.com/dgraph-io/badger/... + - ps: Update-AppveyorTest "Unit Tests" -Outcome Passed + +notifications: + - provider: Email + to: + - pawan@dgraph.io + on_build_failure: true + on_build_status_changed: true +# to disable deployment +deploy: off + diff --git a/vendor/github.com/dgraph-io/badger/backup.go b/vendor/github.com/dgraph-io/badger/backup.go new file mode 100644 index 0000000000..2569b31005 --- /dev/null +++ b/vendor/github.com/dgraph-io/badger/backup.go @@ -0,0 +1,244 @@ +/* + * Copyright 2017 Dgraph Labs, Inc. and Contributors + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package badger + +import ( + "bufio" + "bytes" + "context" + "encoding/binary" + "io" + + "github.com/dgraph-io/badger/pb" + "github.com/dgraph-io/badger/y" +) + +// Backup is a wrapper function over Stream.Backup to generate full and incremental backups of the +// DB. For more control over how many goroutines are used to generate the backup, or if you wish to +// backup only a certain range of keys, use Stream.Backup directly. 
+func (db *DB) Backup(w io.Writer, since uint64) (uint64, error) { + stream := db.NewStream() + stream.LogPrefix = "DB.Backup" + return stream.Backup(w, since) +} + +// Backup dumps a protobuf-encoded list of all entries in the database into the +// given writer, that are newer than the specified version. It returns a +// timestamp indicating when the entries were dumped which can be passed into a +// later invocation to generate an incremental dump, of entries that have been +// added/modified since the last invocation of Stream.Backup(). +// +// This can be used to backup the data in a database at a given point in time. +func (stream *Stream) Backup(w io.Writer, since uint64) (uint64, error) { + stream.KeyToList = func(key []byte, itr *Iterator) (*pb.KVList, error) { + list := &pb.KVList{} + for ; itr.Valid(); itr.Next() { + item := itr.Item() + if !bytes.Equal(item.Key(), key) { + return list, nil + } + if item.Version() < since { + // Ignore versions less than given timestamp, or skip older + // versions of the given key. + return list, nil + } + + var valCopy []byte + if !item.IsDeletedOrExpired() { + // No need to copy value, if item is deleted or expired. + var err error + valCopy, err = item.ValueCopy(nil) + if err != nil { + stream.db.opt.Errorf("Key [%x, %d]. Error while fetching value [%v]\n", + item.Key(), item.Version(), err) + return nil, err + } + } + + // clear txn bits + meta := item.meta &^ (bitTxn | bitFinTxn) + kv := &pb.KV{ + Key: item.KeyCopy(nil), + Value: valCopy, + UserMeta: []byte{item.UserMeta()}, + Version: item.Version(), + ExpiresAt: item.ExpiresAt(), + Meta: []byte{meta}, + } + list.Kv = append(list.Kv, kv) + + switch { + case item.DiscardEarlierVersions(): + // If we need to discard earlier versions of this item, add a delete + // marker just below the current version. 
+ list.Kv = append(list.Kv, &pb.KV{ + Key: item.KeyCopy(nil), + Version: item.Version() - 1, + Meta: []byte{bitDelete}, + }) + return list, nil + + case item.IsDeletedOrExpired(): + return list, nil + } + } + return list, nil + } + + var maxVersion uint64 + stream.Send = func(list *pb.KVList) error { + for _, kv := range list.Kv { + if maxVersion < kv.Version { + maxVersion = kv.Version + } + } + return writeTo(list, w) + } + + if err := stream.Orchestrate(context.Background()); err != nil { + return 0, err + } + return maxVersion, nil +} + +func writeTo(list *pb.KVList, w io.Writer) error { + if err := binary.Write(w, binary.LittleEndian, uint64(list.Size())); err != nil { + return err + } + buf, err := list.Marshal() + if err != nil { + return err + } + _, err = w.Write(buf) + return err +} + +// KVLoader is used to write KVList objects in to badger. It can be used to restore a backup. +type KVLoader struct { + db *DB + throttle *y.Throttle + entries []*Entry +} + +// NewKVLoader returns a new instance of KVLoader. +func (db *DB) NewKVLoader(maxPendingWrites int) *KVLoader { + return &KVLoader{ + db: db, + throttle: y.NewThrottle(maxPendingWrites), + } +} + +// Set writes the key-value pair to the database. +func (l *KVLoader) Set(kv *pb.KV) error { + var userMeta, meta byte + if len(kv.UserMeta) > 0 { + userMeta = kv.UserMeta[0] + } + if len(kv.Meta) > 0 { + meta = kv.Meta[0] + } + + l.entries = append(l.entries, &Entry{ + Key: y.KeyWithTs(kv.Key, kv.Version), + Value: kv.Value, + UserMeta: userMeta, + ExpiresAt: kv.ExpiresAt, + meta: meta, + }) + if len(l.entries) >= 1000 { + return l.send() + } + return nil +} + +func (l *KVLoader) send() error { + if err := l.throttle.Do(); err != nil { + return err + } + if err := l.db.batchSetAsync(l.entries, func(err error) { + l.throttle.Done(err) + }); err != nil { + return err + } + + l.entries = make([]*Entry, 0, 1000) + return nil +} + +// Finish is meant to be called after all the key-value pairs have been loaded. 
+func (l *KVLoader) Finish() error { + if len(l.entries) > 0 { + if err := l.send(); err != nil { + return err + } + } + return l.throttle.Finish() +} + +// Load reads a protobuf-encoded list of all entries from a reader and writes +// them to the database. This can be used to restore the database from a backup +// made by calling DB.Backup(). If more complex logic is needed to restore a badger +// backup, the KVLoader interface should be used instead. +// +// DB.Load() should be called on a database that is not running any other +// concurrent transactions while it is running. +func (db *DB) Load(r io.Reader, maxPendingWrites int) error { + br := bufio.NewReaderSize(r, 16<<10) + unmarshalBuf := make([]byte, 1<<10) + + ldr := db.NewKVLoader(maxPendingWrites) + for { + var sz uint64 + err := binary.Read(br, binary.LittleEndian, &sz) + if err == io.EOF { + break + } else if err != nil { + return err + } + + if cap(unmarshalBuf) < int(sz) { + unmarshalBuf = make([]byte, sz) + } + + if _, err = io.ReadFull(br, unmarshalBuf[:sz]); err != nil { + return err + } + + list := &pb.KVList{} + if err := list.Unmarshal(unmarshalBuf[:sz]); err != nil { + return err + } + + for _, kv := range list.Kv { + if err := ldr.Set(kv); err != nil { + return err + } + + // Update nextTxnTs, memtable stores this + // timestamp in badger head when flushed. + if kv.Version >= db.orc.nextTxnTs { + db.orc.nextTxnTs = kv.Version + 1 + } + } + } + + if err := ldr.Finish(); err != nil { + return err + } + db.orc.txnMark.Done(db.orc.nextTxnTs - 1) + return nil +} diff --git a/vendor/github.com/dgraph-io/badger/batch.go b/vendor/github.com/dgraph-io/badger/batch.go new file mode 100644 index 0000000000..c94e0fed47 --- /dev/null +++ b/vendor/github.com/dgraph-io/badger/batch.go @@ -0,0 +1,162 @@ +/* + * Copyright 2018 Dgraph Labs, Inc. and Contributors + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. 
+ * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package badger + +import ( + "sync" + + "github.com/dgraph-io/badger/y" +) + +// WriteBatch holds the necessary info to perform batched writes. +type WriteBatch struct { + sync.Mutex + txn *Txn + db *DB + throttle *y.Throttle + err error +} + +// NewWriteBatch creates a new WriteBatch. This provides a way to conveniently do a lot of writes, +// batching them up as tightly as possible in a single transaction and using callbacks to avoid +// waiting for them to commit, thus achieving good performance. This API hides away the logic of +// creating and committing transactions. Due to the nature of SSI guaratees provided by Badger, +// blind writes can never encounter transaction conflicts (ErrConflict). +func (db *DB) NewWriteBatch() *WriteBatch { + return &WriteBatch{ + db: db, + txn: db.newTransaction(true, true), + throttle: y.NewThrottle(16), + } +} + +// SetMaxPendingTxns sets a limit on maximum number of pending transactions while writing batches. +// This function should be called before using WriteBatch. Default value of MaxPendingTxns is +// 16 to minimise memory usage. +func (wb *WriteBatch) SetMaxPendingTxns(max int) { + wb.throttle = y.NewThrottle(max) +} + +// Cancel function must be called if there's a chance that Flush might not get +// called. If neither Flush or Cancel is called, the transaction oracle would +// never get a chance to clear out the row commit timestamp map, thus causing an +// unbounded memory consumption. Typically, you can call Cancel as a defer +// statement right after NewWriteBatch is called. 
+// +// Note that any committed writes would still go through despite calling Cancel. +func (wb *WriteBatch) Cancel() { + if err := wb.throttle.Finish(); err != nil { + wb.db.opt.Errorf("WatchBatch.Cancel error while finishing: %v", err) + } + wb.txn.Discard() +} + +func (wb *WriteBatch) callback(err error) { + // sync.WaitGroup is thread-safe, so it doesn't need to be run inside wb.Lock. + defer wb.throttle.Done(err) + if err == nil { + return + } + + wb.Lock() + defer wb.Unlock() + if wb.err != nil { + return + } + wb.err = err +} + +// SetEntry is the equivalent of Txn.SetEntry. +func (wb *WriteBatch) SetEntry(e *Entry) error { + wb.Lock() + defer wb.Unlock() + + if err := wb.txn.SetEntry(e); err != ErrTxnTooBig { + return err + } + // Txn has reached it's zenith. Commit now. + if cerr := wb.commit(); cerr != nil { + return cerr + } + // This time the error must not be ErrTxnTooBig, otherwise, we make the + // error permanent. + if err := wb.txn.SetEntry(e); err != nil { + wb.err = err + return err + } + return nil +} + +// Set is equivalent of Txn.Set(). +func (wb *WriteBatch) Set(k, v []byte) error { + e := &Entry{Key: k, Value: v} + return wb.SetEntry(e) +} + +// Delete is equivalent of Txn.Delete. +func (wb *WriteBatch) Delete(k []byte) error { + wb.Lock() + defer wb.Unlock() + + if err := wb.txn.Delete(k); err != ErrTxnTooBig { + return err + } + if err := wb.commit(); err != nil { + return err + } + if err := wb.txn.Delete(k); err != nil { + wb.err = err + return err + } + return nil +} + +// Caller to commit must hold a write lock. +func (wb *WriteBatch) commit() error { + if wb.err != nil { + return wb.err + } + if err := wb.throttle.Do(); err != nil { + return err + } + wb.txn.CommitWith(wb.callback) + wb.txn = wb.db.newTransaction(true, true) + wb.txn.readTs = 0 // We're not reading anything. + return wb.err +} + +// Flush must be called at the end to ensure that any pending writes get committed to Badger. 
Flush +// returns any error stored by WriteBatch. +func (wb *WriteBatch) Flush() error { + wb.Lock() + _ = wb.commit() + wb.txn.Discard() + wb.Unlock() + + if err := wb.throttle.Finish(); err != nil { + return err + } + + return wb.err +} + +// Error returns any errors encountered so far. No commits would be run once an error is detected. +func (wb *WriteBatch) Error() error { + wb.Lock() + defer wb.Unlock() + return wb.err +} diff --git a/vendor/github.com/dgraph-io/badger/compaction.go b/vendor/github.com/dgraph-io/badger/compaction.go new file mode 100644 index 0000000000..931d56664c --- /dev/null +++ b/vendor/github.com/dgraph-io/badger/compaction.go @@ -0,0 +1,210 @@ +/* + * Copyright 2017 Dgraph Labs, Inc. and Contributors + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +package badger + +import ( + "bytes" + "fmt" + "log" + "math" + "sync" + + "golang.org/x/net/trace" + + "github.com/dgraph-io/badger/table" + "github.com/dgraph-io/badger/y" +) + +type keyRange struct { + left []byte + right []byte + inf bool +} + +var infRange = keyRange{inf: true} + +func (r keyRange) String() string { + return fmt.Sprintf("[left=%x, right=%x, inf=%v]", r.left, r.right, r.inf) +} + +func (r keyRange) equals(dst keyRange) bool { + return bytes.Equal(r.left, dst.left) && + bytes.Equal(r.right, dst.right) && + r.inf == dst.inf +} + +func (r keyRange) overlapsWith(dst keyRange) bool { + if r.inf || dst.inf { + return true + } + + // If my left is greater than dst right, we have no overlap. + if y.CompareKeys(r.left, dst.right) > 0 { + return false + } + // If my right is less than dst left, we have no overlap. + if y.CompareKeys(r.right, dst.left) < 0 { + return false + } + // We have overlap. + return true +} + +func getKeyRange(tables []*table.Table) keyRange { + if len(tables) == 0 { + return keyRange{} + } + smallest := tables[0].Smallest() + biggest := tables[0].Biggest() + for i := 1; i < len(tables); i++ { + if y.CompareKeys(tables[i].Smallest(), smallest) < 0 { + smallest = tables[i].Smallest() + } + if y.CompareKeys(tables[i].Biggest(), biggest) > 0 { + biggest = tables[i].Biggest() + } + } + return keyRange{ + left: y.KeyWithTs(y.ParseKey(smallest), math.MaxUint64), + right: y.KeyWithTs(y.ParseKey(biggest), 0), + } +} + +type levelCompactStatus struct { + ranges []keyRange + delSize int64 +} + +func (lcs *levelCompactStatus) debug() string { + var b bytes.Buffer + for _, r := range lcs.ranges { + b.WriteString(r.String()) + } + return b.String() +} + +func (lcs *levelCompactStatus) overlapsWith(dst keyRange) bool { + for _, r := range lcs.ranges { + if r.overlapsWith(dst) { + return true + } + } + return false +} + +func (lcs *levelCompactStatus) remove(dst keyRange) bool { + final := lcs.ranges[:0] + var found bool + for _, r := 
range lcs.ranges { + if !r.equals(dst) { + final = append(final, r) + } else { + found = true + } + } + lcs.ranges = final + return found +} + +type compactStatus struct { + sync.RWMutex + levels []*levelCompactStatus +} + +func (cs *compactStatus) toLog(tr trace.Trace) { + cs.RLock() + defer cs.RUnlock() + + tr.LazyPrintf("Compaction status:") + for i, l := range cs.levels { + if l.debug() == "" { + continue + } + tr.LazyPrintf("[%d] %s", i, l.debug()) + } +} + +func (cs *compactStatus) overlapsWith(level int, this keyRange) bool { + cs.RLock() + defer cs.RUnlock() + + thisLevel := cs.levels[level] + return thisLevel.overlapsWith(this) +} + +func (cs *compactStatus) delSize(l int) int64 { + cs.RLock() + defer cs.RUnlock() + return cs.levels[l].delSize +} + +type thisAndNextLevelRLocked struct{} + +// compareAndAdd will check whether we can run this compactDef. That it doesn't overlap with any +// other running compaction. If it can be run, it would store this run in the compactStatus state. +func (cs *compactStatus) compareAndAdd(_ thisAndNextLevelRLocked, cd compactDef) bool { + cs.Lock() + defer cs.Unlock() + + level := cd.thisLevel.level + + y.AssertTruef(level < len(cs.levels)-1, "Got level %d. Max levels: %d", level, len(cs.levels)) + thisLevel := cs.levels[level] + nextLevel := cs.levels[level+1] + + if thisLevel.overlapsWith(cd.thisRange) { + return false + } + if nextLevel.overlapsWith(cd.nextRange) { + return false + } + // Check whether this level really needs compaction or not. Otherwise, we'll end up + // running parallel compactions for the same level. + // Update: We should not be checking size here. Compaction priority already did the size checks. + // Here we should just be executing the wish of others. 
+ + thisLevel.ranges = append(thisLevel.ranges, cd.thisRange) + nextLevel.ranges = append(nextLevel.ranges, cd.nextRange) + thisLevel.delSize += cd.thisSize + return true +} + +func (cs *compactStatus) delete(cd compactDef) { + cs.Lock() + defer cs.Unlock() + + level := cd.thisLevel.level + y.AssertTruef(level < len(cs.levels)-1, "Got level %d. Max levels: %d", level, len(cs.levels)) + + thisLevel := cs.levels[level] + nextLevel := cs.levels[level+1] + + thisLevel.delSize -= cd.thisSize + found := thisLevel.remove(cd.thisRange) + found = nextLevel.remove(cd.nextRange) && found + + if !found { + this := cd.thisRange + next := cd.nextRange + fmt.Printf("Looking for: [%q, %q, %v] in this level.\n", this.left, this.right, this.inf) + fmt.Printf("This Level:\n%s\n", thisLevel.debug()) + fmt.Println() + fmt.Printf("Looking for: [%q, %q, %v] in next level.\n", next.left, next.right, next.inf) + fmt.Printf("Next Level:\n%s\n", nextLevel.debug()) + log.Fatal("keyRange not found") + } +} diff --git a/vendor/github.com/dgraph-io/badger/db.go b/vendor/github.com/dgraph-io/badger/db.go new file mode 100644 index 0000000000..21bb22d6f0 --- /dev/null +++ b/vendor/github.com/dgraph-io/badger/db.go @@ -0,0 +1,1468 @@ +/* + * Copyright 2017 Dgraph Labs, Inc. and Contributors + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +package badger + +import ( + "bytes" + "context" + "encoding/binary" + "encoding/hex" + "expvar" + "io" + "math" + "os" + "path/filepath" + "sort" + "strconv" + "sync" + "sync/atomic" + "time" + + "github.com/dgraph-io/badger/options" + "github.com/dgraph-io/badger/pb" + "github.com/dgraph-io/badger/skl" + "github.com/dgraph-io/badger/table" + "github.com/dgraph-io/badger/y" + humanize "github.com/dustin/go-humanize" + "github.com/pkg/errors" + "golang.org/x/net/trace" +) + +var ( + badgerPrefix = []byte("!badger!") // Prefix for internal keys used by badger. + head = []byte("!badger!head") // For storing value offset for replay. + txnKey = []byte("!badger!txn") // For indicating end of entries in txn. + badgerMove = []byte("!badger!move") // For key-value pairs which got moved during GC. + lfDiscardStatsKey = []byte("!badger!discard") // For storing lfDiscardStats +) + +type closers struct { + updateSize *y.Closer + compactors *y.Closer + memtable *y.Closer + writes *y.Closer + valueGC *y.Closer + pub *y.Closer +} + +type callback func(kv *pb.KVList) + +// DB provides the various functions required to interact with Badger. +// DB is thread-safe. +type DB struct { + sync.RWMutex // Guards list of inmemory tables, not individual reads and writes. + + dirLockGuard *directoryLockGuard + // nil if Dir and ValueDir are the same + valueDirGuard *directoryLockGuard + + closers closers + elog trace.EventLog + mt *skl.Skiplist // Our latest (actively written) in-memory table + imm []*skl.Skiplist // Add here only AFTER pushing to flushChan. + opt Options + manifest *manifestFile + lc *levelsController + vlog valueLog + vhead valuePointer // less than or equal to a pointer to the last vlog value put into mt + writeCh chan *request + flushChan chan flushTask // For flushing memtables. + closeOnce sync.Once // For closing DB only once. + + // Number of log rotates since the last memtable flush. We will access this field via atomic + // functions. 
Since we are not going to use any 64bit atomic functions, there is no need for + // 64 bit alignment of this struct(see #311). + logRotates int32 + + blockWrites int32 + + orc *oracle + + pub *publisher +} + +const ( + kvWriteChCapacity = 1000 +) + +func (db *DB) replayFunction() func(Entry, valuePointer) error { + type txnEntry struct { + nk []byte + v y.ValueStruct + } + + var txn []txnEntry + var lastCommit uint64 + + toLSM := func(nk []byte, vs y.ValueStruct) { + for err := db.ensureRoomForWrite(); err != nil; err = db.ensureRoomForWrite() { + db.elog.Printf("Replay: Making room for writes") + time.Sleep(10 * time.Millisecond) + } + db.mt.Put(nk, vs) + } + + first := true + return func(e Entry, vp valuePointer) error { // Function for replaying. + if first { + db.elog.Printf("First key=%q\n", e.Key) + } + first = false + + if db.orc.nextTxnTs < y.ParseTs(e.Key) { + db.orc.nextTxnTs = y.ParseTs(e.Key) + } + + nk := make([]byte, len(e.Key)) + copy(nk, e.Key) + var nv []byte + meta := e.meta + if db.shouldWriteValueToLSM(e) { + nv = make([]byte, len(e.Value)) + copy(nv, e.Value) + } else { + nv = make([]byte, vptrSize) + vp.Encode(nv) + meta = meta | bitValuePointer + } + + v := y.ValueStruct{ + Value: nv, + Meta: meta, + UserMeta: e.UserMeta, + ExpiresAt: e.ExpiresAt, + } + + if e.meta&bitFinTxn > 0 { + txnTs, err := strconv.ParseUint(string(e.Value), 10, 64) + if err != nil { + return errors.Wrapf(err, "Unable to parse txn fin: %q", e.Value) + } + y.AssertTrue(lastCommit == txnTs) + y.AssertTrue(len(txn) > 0) + // Got the end of txn. Now we can store them. + for _, t := range txn { + toLSM(t.nk, t.v) + } + txn = txn[:0] + lastCommit = 0 + + } else if e.meta&bitTxn > 0 { + txnTs := y.ParseTs(nk) + if lastCommit == 0 { + lastCommit = txnTs + } + if lastCommit != txnTs { + db.opt.Warningf("Found an incomplete txn at timestamp %d. 
Discarding it.\n", + lastCommit) + txn = txn[:0] + lastCommit = txnTs + } + te := txnEntry{nk: nk, v: v} + txn = append(txn, te) + + } else { + // This entry is from a rewrite. + toLSM(nk, v) + + // We shouldn't get this entry in the middle of a transaction. + y.AssertTrue(lastCommit == 0) + y.AssertTrue(len(txn) == 0) + } + return nil + } +} + +// Open returns a new DB object. +func Open(opt Options) (db *DB, err error) { + opt.maxBatchSize = (15 * opt.MaxTableSize) / 100 + opt.maxBatchCount = opt.maxBatchSize / int64(skl.MaxNodeSize) + + if opt.ValueThreshold > ValueThresholdLimit { + return nil, ErrValueThreshold + } + + if opt.ReadOnly { + // Can't truncate if the DB is read only. + opt.Truncate = false + // Do not perform compaction in read only mode. + opt.CompactL0OnClose = false + } + + for _, path := range []string{opt.Dir, opt.ValueDir} { + dirExists, err := exists(path) + if err != nil { + return nil, y.Wrapf(err, "Invalid Dir: %q", path) + } + if !dirExists { + if opt.ReadOnly { + return nil, errors.Errorf("Cannot find directory %q for read-only open", path) + } + // Try to create the directory + err = os.Mkdir(path, 0700) + if err != nil { + return nil, y.Wrapf(err, "Error Creating Dir: %q", path) + } + } + } + absDir, err := filepath.Abs(opt.Dir) + if err != nil { + return nil, err + } + absValueDir, err := filepath.Abs(opt.ValueDir) + if err != nil { + return nil, err + } + var dirLockGuard, valueDirLockGuard *directoryLockGuard + dirLockGuard, err = acquireDirectoryLock(opt.Dir, lockFile, opt.ReadOnly) + if err != nil { + return nil, err + } + defer func() { + if dirLockGuard != nil { + _ = dirLockGuard.release() + } + }() + if absValueDir != absDir { + valueDirLockGuard, err = acquireDirectoryLock(opt.ValueDir, lockFile, opt.ReadOnly) + if err != nil { + return nil, err + } + defer func() { + if valueDirLockGuard != nil { + _ = valueDirLockGuard.release() + } + }() + } + if !(opt.ValueLogFileSize <= 2<<30 && opt.ValueLogFileSize >= 1<<20) { + 
return nil, ErrValueLogSize + } + if !(opt.ValueLogLoadingMode == options.FileIO || + opt.ValueLogLoadingMode == options.MemoryMap) { + return nil, ErrInvalidLoadingMode + } + manifestFile, manifest, err := openOrCreateManifestFile(opt.Dir, opt.ReadOnly) + if err != nil { + return nil, err + } + defer func() { + if manifestFile != nil { + _ = manifestFile.close() + } + }() + + db = &DB{ + imm: make([]*skl.Skiplist, 0, opt.NumMemtables), + flushChan: make(chan flushTask, opt.NumMemtables), + writeCh: make(chan *request, kvWriteChCapacity), + opt: opt, + manifest: manifestFile, + elog: trace.NewEventLog("Badger", "DB"), + dirLockGuard: dirLockGuard, + valueDirGuard: valueDirLockGuard, + orc: newOracle(opt), + pub: newPublisher(), + } + + // Calculate initial size. + db.calculateSize() + db.closers.updateSize = y.NewCloser(1) + go db.updateSize(db.closers.updateSize) + db.mt = skl.NewSkiplist(arenaSize(opt)) + + // newLevelsController potentially loads files in directory. + if db.lc, err = newLevelsController(db, &manifest); err != nil { + return nil, err + } + + if !opt.ReadOnly { + db.closers.compactors = y.NewCloser(1) + db.lc.startCompact(db.closers.compactors) + + db.closers.memtable = y.NewCloser(1) + go func() { + _ = db.flushMemtable(db.closers.memtable) // Need levels controller to be up. + }() + } + + headKey := y.KeyWithTs(head, math.MaxUint64) + // Need to pass with timestamp, lsm get removes the last 8 bytes and compares key + vs, err := db.get(headKey) + if err != nil { + return nil, errors.Wrap(err, "Retrieving head") + } + db.orc.nextTxnTs = vs.Version + var vptr valuePointer + if len(vs.Value) > 0 { + vptr.Decode(vs.Value) + } + + replayCloser := y.NewCloser(1) + go db.doWrites(replayCloser) + + if err = db.vlog.open(db, vptr, db.replayFunction()); err != nil { + return db, err + } + replayCloser.SignalAndWait() // Wait for replay to be applied first. + + // Let's advance nextTxnTs to one more than whatever we observed via + // replaying the logs. 
+ db.orc.txnMark.Done(db.orc.nextTxnTs) + // In normal mode, we must update readMark so older versions of keys can be removed during + // compaction when run in offline mode via the flatten tool. + db.orc.readMark.Done(db.orc.nextTxnTs) + db.orc.incrementNextTs() + + db.writeCh = make(chan *request, kvWriteChCapacity) + db.closers.writes = y.NewCloser(1) + go db.doWrites(db.closers.writes) + + db.closers.valueGC = y.NewCloser(1) + go db.vlog.waitOnGC(db.closers.valueGC) + + db.closers.pub = y.NewCloser(1) + go db.pub.listenForUpdates(db.closers.pub) + + valueDirLockGuard = nil + dirLockGuard = nil + manifestFile = nil + return db, nil +} + +// Close closes a DB. It's crucial to call it to ensure all the pending updates make their way to +// disk. Calling DB.Close() multiple times would still only close the DB once. +func (db *DB) Close() error { + var err error + db.closeOnce.Do(func() { + err = db.close() + }) + return err +} + +func (db *DB) close() (err error) { + db.elog.Printf("Closing database") + + if err := db.vlog.flushDiscardStats(); err != nil { + return errors.Wrap(err, "failed to flush discard stats") + } + + atomic.StoreInt32(&db.blockWrites, 1) + + // Stop value GC first. + db.closers.valueGC.SignalAndWait() + + // Stop writes next. + db.closers.writes.SignalAndWait() + + db.closers.pub.SignalAndWait() + + // Now close the value log. + if vlogErr := db.vlog.Close(); vlogErr != nil { + err = errors.Wrap(vlogErr, "DB.Close") + } + + // Make sure that block writer is done pushing stuff into memtable! + // Otherwise, you will have a race condition: we are trying to flush memtables + // and remove them completely, while the block / memtable writer is still + // trying to push stuff into the memtable. This will also resolve the value + // offset problem: as we push into memtable, we update value offsets there. 
+ if !db.mt.Empty() { + db.elog.Printf("Flushing memtable") + for { + pushedFlushTask := func() bool { + db.Lock() + defer db.Unlock() + y.AssertTrue(db.mt != nil) + select { + case db.flushChan <- flushTask{mt: db.mt, vptr: db.vhead}: + db.imm = append(db.imm, db.mt) // Flusher will attempt to remove this from s.imm. + db.mt = nil // Will segfault if we try writing! + db.elog.Printf("pushed to flush chan\n") + return true + default: + // If we fail to push, we need to unlock and wait for a short while. + // The flushing operation needs to update s.imm. Otherwise, we have a deadlock. + // TODO: Think about how to do this more cleanly, maybe without any locks. + } + return false + }() + if pushedFlushTask { + break + } + time.Sleep(10 * time.Millisecond) + } + } + db.stopCompactions() + + // Force Compact L0 + // We don't need to care about cstatus since no parallel compaction is running. + if db.opt.CompactL0OnClose { + err := db.lc.doCompact(compactionPriority{level: 0, score: 1.73}) + switch err { + case errFillTables: + // This error only means that there might be enough tables to do a compaction. So, we + // should not report it to the end user to avoid confusing them. 
+ case nil: + db.opt.Infof("Force compaction on level 0 done") + default: + db.opt.Warningf("While forcing compaction on level 0: %v", err) + } + } + + if lcErr := db.lc.close(); err == nil { + err = errors.Wrap(lcErr, "DB.Close") + } + db.elog.Printf("Waiting for closer") + db.closers.updateSize.SignalAndWait() + db.orc.Stop() + + db.elog.Finish() + + if db.dirLockGuard != nil { + if guardErr := db.dirLockGuard.release(); err == nil { + err = errors.Wrap(guardErr, "DB.Close") + } + } + if db.valueDirGuard != nil { + if guardErr := db.valueDirGuard.release(); err == nil { + err = errors.Wrap(guardErr, "DB.Close") + } + } + if manifestErr := db.manifest.close(); err == nil { + err = errors.Wrap(manifestErr, "DB.Close") + } + + // Fsync directories to ensure that lock file, and any other removed files whose directory + // we haven't specifically fsynced, are guaranteed to have their directory entry removal + // persisted to disk. + if syncErr := syncDir(db.opt.Dir); err == nil { + err = errors.Wrap(syncErr, "DB.Close") + } + if syncErr := syncDir(db.opt.ValueDir); err == nil { + err = errors.Wrap(syncErr, "DB.Close") + } + + return err +} + +const ( + lockFile = "LOCK" +) + +// Sync syncs database content to disk. This function provides +// more control to user to sync data whenever required. +func (db *DB) Sync() error { + return db.vlog.sync(math.MaxUint32) +} + +// getMemtables returns the current memtables and get references. +func (db *DB) getMemTables() ([]*skl.Skiplist, func()) { + db.RLock() + defer db.RUnlock() + + tables := make([]*skl.Skiplist, len(db.imm)+1) + + // Get mutable memtable. + tables[0] = db.mt + tables[0].IncrRef() + + // Get immutable memtables. + last := len(db.imm) - 1 + for i := range db.imm { + tables[i+1] = db.imm[last-i] + tables[i+1].IncrRef() + } + return tables, func() { + for _, tbl := range tables { + tbl.DecrRef() + } + } +} + +// get returns the value in memtable or disk for given key. +// Note that value will include meta byte. 
+// +// IMPORTANT: We should never write an entry with an older timestamp for the same key, We need to +// maintain this invariant to search for the latest value of a key, or else we need to search in all +// tables and find the max version among them. To maintain this invariant, we also need to ensure +// that all versions of a key are always present in the same table from level 1, because compaction +// can push any table down. +// +// Update (Sep 22, 2018): To maintain the above invariant, and to allow keys to be moved from one +// value log to another (while reclaiming space during value log GC), we have logically moved this +// need to write "old versions after new versions" to the badgerMove keyspace. Thus, for normal +// gets, we can stop going down the LSM tree once we find any version of the key (note however that +// we will ALWAYS skip versions with ts greater than the key version). However, if that key has +// been moved, then for the corresponding movekey, we'll look through all the levels of the tree +// to ensure that we pick the highest version of the movekey present. +func (db *DB) get(key []byte) (y.ValueStruct, error) { + tables, decr := db.getMemTables() // Lock should be released. + defer decr() + + var maxVs *y.ValueStruct + var version uint64 + if bytes.HasPrefix(key, badgerMove) { + // If we are checking badgerMove key, we should look into all the + // levels, so we can pick up the newer versions, which might have been + // compacted down the tree. + maxVs = &y.ValueStruct{} + version = y.ParseTs(key) + } + + y.NumGets.Add(1) + for i := 0; i < len(tables); i++ { + vs := tables[i].Get(key) + y.NumMemtableGets.Add(1) + if vs.Meta == 0 && vs.Value == nil { + continue + } + // Found a version of the key. For user keyspace, return immediately. For move keyspace, + // continue iterating, unless we found a version == given key version. 
+ if maxVs == nil || vs.Version == version { + return vs, nil + } + if maxVs.Version < vs.Version { + *maxVs = vs + } + } + return db.lc.get(key, maxVs) +} + +func (db *DB) updateHead(ptrs []valuePointer) { + var ptr valuePointer + for i := len(ptrs) - 1; i >= 0; i-- { + p := ptrs[i] + if !p.IsZero() { + ptr = p + break + } + } + if ptr.IsZero() { + return + } + + db.Lock() + defer db.Unlock() + y.AssertTrue(!ptr.Less(db.vhead)) + db.vhead = ptr +} + +var requestPool = sync.Pool{ + New: func() interface{} { + return new(request) + }, +} + +func (db *DB) shouldWriteValueToLSM(e Entry) bool { + return len(e.Value) < db.opt.ValueThreshold +} + +func (db *DB) writeToLSM(b *request) error { + if len(b.Ptrs) != len(b.Entries) { + return errors.Errorf("Ptrs and Entries don't match: %+v", b) + } + + for i, entry := range b.Entries { + if entry.meta&bitFinTxn != 0 { + continue + } + if db.shouldWriteValueToLSM(*entry) { // Will include deletion / tombstone case. + db.mt.Put(entry.Key, + y.ValueStruct{ + Value: entry.Value, + Meta: entry.meta, + UserMeta: entry.UserMeta, + ExpiresAt: entry.ExpiresAt, + }) + } else { + var offsetBuf [vptrSize]byte + db.mt.Put(entry.Key, + y.ValueStruct{ + Value: b.Ptrs[i].Encode(offsetBuf[:]), + Meta: entry.meta | bitValuePointer, + UserMeta: entry.UserMeta, + ExpiresAt: entry.ExpiresAt, + }) + } + } + return nil +} + +// writeRequests is called serially by only one goroutine. +func (db *DB) writeRequests(reqs []*request) error { + if len(reqs) == 0 { + return nil + } + + done := func(err error) { + for _, r := range reqs { + r.Err = err + r.Wg.Done() + } + } + db.elog.Printf("writeRequests called. 
Writing to value log") + + err := db.vlog.write(reqs) + if err != nil { + done(err) + return err + } + + db.elog.Printf("Sending updates to subscribers") + db.pub.sendUpdates(reqs) + db.elog.Printf("Writing to memtable") + var count int + for _, b := range reqs { + if len(b.Entries) == 0 { + continue + } + count += len(b.Entries) + var i uint64 + for err = db.ensureRoomForWrite(); err == errNoRoom; err = db.ensureRoomForWrite() { + i++ + if i%100 == 0 { + db.elog.Printf("Making room for writes") + } + // We need to poll a bit because both hasRoomForWrite and the flusher need access to s.imm. + // When flushChan is full and you are blocked there, and the flusher is trying to update s.imm, + // you will get a deadlock. + time.Sleep(10 * time.Millisecond) + } + if err != nil { + done(err) + return errors.Wrap(err, "writeRequests") + } + if err := db.writeToLSM(b); err != nil { + done(err) + return errors.Wrap(err, "writeRequests") + } + db.updateHead(b.Ptrs) + } + done(nil) + db.elog.Printf("%d entries written", count) + return nil +} + +func (db *DB) sendToWriteCh(entries []*Entry) (*request, error) { + if atomic.LoadInt32(&db.blockWrites) == 1 { + return nil, ErrBlockedWrites + } + var count, size int64 + for _, e := range entries { + size += int64(e.estimateSize(db.opt.ValueThreshold)) + count++ + } + if count >= db.opt.maxBatchCount || size >= db.opt.maxBatchSize { + return nil, ErrTxnTooBig + } + + // We can only service one request because we need each txn to be stored in a contigous section. + // Txns should not interleave among other txns or rewrites. + req := requestPool.Get().(*request) + req.Entries = entries + req.Wg = sync.WaitGroup{} + req.Wg.Add(1) + req.IncrRef() // for db write + req.IncrRef() // for publisher updates + db.writeCh <- req // Handled in doWrites. 
+ y.NumPuts.Add(int64(len(entries))) + + return req, nil +} + +func (db *DB) doWrites(lc *y.Closer) { + defer lc.Done() + pendingCh := make(chan struct{}, 1) + + writeRequests := func(reqs []*request) { + if err := db.writeRequests(reqs); err != nil { + db.opt.Errorf("writeRequests: %v", err) + } + <-pendingCh + } + + // This variable tracks the number of pending writes. + reqLen := new(expvar.Int) + y.PendingWrites.Set(db.opt.Dir, reqLen) + + reqs := make([]*request, 0, 10) + for { + var r *request + select { + case r = <-db.writeCh: + case <-lc.HasBeenClosed(): + goto closedCase + } + + for { + reqs = append(reqs, r) + reqLen.Set(int64(len(reqs))) + + if len(reqs) >= 3*kvWriteChCapacity { + pendingCh <- struct{}{} // blocking. + goto writeCase + } + + select { + // Either push to pending, or continue to pick from writeCh. + case r = <-db.writeCh: + case pendingCh <- struct{}{}: + goto writeCase + case <-lc.HasBeenClosed(): + goto closedCase + } + } + + closedCase: + close(db.writeCh) + for r := range db.writeCh { // Flush the channel. + reqs = append(reqs, r) + } + + pendingCh <- struct{}{} // Push to pending before doing a write. + writeRequests(reqs) + return + + writeCase: + go writeRequests(reqs) + reqs = make([]*request, 0, 10) + reqLen.Set(0) + } +} + +// batchSet applies a list of badger.Entry. If a request level error occurs it +// will be returned. +// Check(kv.BatchSet(entries)) +func (db *DB) batchSet(entries []*Entry) error { + req, err := db.sendToWriteCh(entries) + if err != nil { + return err + } + + return req.Wait() +} + +// batchSetAsync is the asynchronous version of batchSet. It accepts a callback +// function which is called when all the sets are complete. If a request level +// error occurs, it will be passed back via the callback. 
+// err := kv.BatchSetAsync(entries, func(err error)) { +// Check(err) +// } +func (db *DB) batchSetAsync(entries []*Entry, f func(error)) error { + req, err := db.sendToWriteCh(entries) + if err != nil { + return err + } + go func() { + err := req.Wait() + // Write is complete. Let's call the callback function now. + f(err) + }() + return nil +} + +var errNoRoom = errors.New("No room for write") + +// ensureRoomForWrite is always called serially. +func (db *DB) ensureRoomForWrite() error { + var err error + db.Lock() + defer db.Unlock() + + // Here we determine if we need to force flush memtable. Given we rotated log file, it would + // make sense to force flush a memtable, so the updated value head would have a chance to be + // pushed to L0. Otherwise, it would not go to L0, until the memtable has been fully filled, + // which can take a lot longer if the write load has fewer keys and larger values. This force + // flush, thus avoids the need to read through a lot of log files on a crash and restart. + // Above approach is quite simple with small drawback. We are calling ensureRoomForWrite before + // inserting every entry in Memtable. We will get latest db.head after all entries for a request + // are inserted in Memtable. If we have done >= db.logRotates rotations, then while inserting + // first entry in Memtable, below condition will be true and we will endup flushing old value of + // db.head. Hence we are limiting no of value log files to be read to db.logRotates only. + forceFlush := atomic.LoadInt32(&db.logRotates) >= db.opt.LogRotatesToFlush + + if !forceFlush && db.mt.MemSize() < db.opt.MaxTableSize { + return nil + } + + y.AssertTrue(db.mt != nil) // A nil mt indicates that DB is being closed. + select { + case db.flushChan <- flushTask{mt: db.mt, vptr: db.vhead}: + // After every memtable flush, let's reset the counter. + atomic.StoreInt32(&db.logRotates, 0) + + // Ensure value log is synced to disk so this memtable's contents wouldn't be lost. 
+ err = db.vlog.sync(db.vhead.Fid) + if err != nil { + return err + } + + db.opt.Debugf("Flushing memtable, mt.size=%d size of flushChan: %d\n", + db.mt.MemSize(), len(db.flushChan)) + // We manage to push this task. Let's modify imm. + db.imm = append(db.imm, db.mt) + db.mt = skl.NewSkiplist(arenaSize(db.opt)) + // New memtable is empty. We certainly have room. + return nil + default: + // We need to do this to unlock and allow the flusher to modify imm. + return errNoRoom + } +} + +func arenaSize(opt Options) int64 { + return opt.MaxTableSize + opt.maxBatchSize + opt.maxBatchCount*int64(skl.MaxNodeSize) +} + +// WriteLevel0Table flushes memtable. +func writeLevel0Table(ft flushTask, f io.Writer) error { + iter := ft.mt.NewIterator() + defer iter.Close() + b := table.NewTableBuilder() + defer b.Close() + for iter.SeekToFirst(); iter.Valid(); iter.Next() { + if len(ft.dropPrefix) > 0 && bytes.HasPrefix(iter.Key(), ft.dropPrefix) { + continue + } + if err := b.Add(iter.Key(), iter.Value()); err != nil { + return err + } + } + _, err := f.Write(b.Finish()) + return err +} + +type flushTask struct { + mt *skl.Skiplist + vptr valuePointer + dropPrefix []byte +} + +// handleFlushTask must be run serially. +func (db *DB) handleFlushTask(ft flushTask) error { + // There can be a scnerio, when empty memtable is flushed. For example, memtable is empty and + // after writing request to value log, rotation count exceeds db.LogRotatesToFlush. + if ft.mt.Empty() { + return nil + } + + // Store badger head even if vptr is zero, need it for readTs + db.opt.Debugf("Storing value log head: %+v\n", ft.vptr) + db.elog.Printf("Storing offset: %+v\n", ft.vptr) + offset := make([]byte, vptrSize) + ft.vptr.Encode(offset) + + // Pick the max commit ts, so in case of crash, our read ts would be higher than all the + // commits. 
+ headTs := y.KeyWithTs(head, db.orc.nextTs()) + ft.mt.Put(headTs, y.ValueStruct{Value: offset}) + + fileID := db.lc.reserveFileID() + fd, err := y.CreateSyncedFile(table.NewFilename(fileID, db.opt.Dir), true) + if err != nil { + return y.Wrap(err) + } + + // Don't block just to sync the directory entry. + dirSyncCh := make(chan error) + go func() { dirSyncCh <- syncDir(db.opt.Dir) }() + + err = writeLevel0Table(ft, fd) + dirSyncErr := <-dirSyncCh + + if err != nil { + db.elog.Errorf("ERROR while writing to level 0: %v", err) + return err + } + if dirSyncErr != nil { + // Do dir sync as best effort. No need to return due to an error there. + db.elog.Errorf("ERROR while syncing level directory: %v", dirSyncErr) + } + + tbl, err := table.OpenTable(fd, db.opt.TableLoadingMode, nil) + if err != nil { + db.elog.Printf("ERROR while opening table: %v", err) + return err + } + // We own a ref on tbl. + err = db.lc.addLevel0Table(tbl) // This will incrRef (if we don't error, sure) + _ = tbl.DecrRef() // Releases our ref. + return err +} + +// flushMemtable must keep running until we send it an empty flushTask. If there +// are errors during handling the flush task, we'll retry indefinitely. +func (db *DB) flushMemtable(lc *y.Closer) error { + defer lc.Done() + + for ft := range db.flushChan { + if ft.mt == nil { + // We close db.flushChan now, instead of sending a nil ft.mt. + continue + } + for { + err := db.handleFlushTask(ft) + if err == nil { + // Update s.imm. Need a lock. + db.Lock() + // This is a single-threaded operation. ft.mt corresponds to the head of + // db.imm list. Once we flush it, we advance db.imm. The next ft.mt + // which would arrive here would match db.imm[0], because we acquire a + // lock over DB when pushing to flushChan. + // TODO: This logic is dirty AF. Any change and this could easily break. + y.AssertTrue(ft.mt == db.imm[0]) + db.imm = db.imm[1:] + ft.mt.DecrRef() // Return memory. + db.Unlock() + + break + } + // Encountered error. 
Retry indefinitely. + db.opt.Errorf("Failure while flushing memtable to disk: %v. Retrying...\n", err) + time.Sleep(time.Second) + } + } + return nil +} + +func exists(path string) (bool, error) { + _, err := os.Stat(path) + if err == nil { + return true, nil + } + if os.IsNotExist(err) { + return false, nil + } + return true, err +} + +// This function does a filewalk, calculates the size of vlog and sst files and stores it in +// y.LSMSize and y.VlogSize. +func (db *DB) calculateSize() { + newInt := func(val int64) *expvar.Int { + v := new(expvar.Int) + v.Add(val) + return v + } + + totalSize := func(dir string) (int64, int64) { + var lsmSize, vlogSize int64 + err := filepath.Walk(dir, func(path string, info os.FileInfo, err error) error { + if err != nil { + return err + } + ext := filepath.Ext(path) + if ext == ".sst" { + lsmSize += info.Size() + } else if ext == ".vlog" { + vlogSize += info.Size() + } + return nil + }) + if err != nil { + db.elog.Printf("Got error while calculating total size of directory: %s", dir) + } + return lsmSize, vlogSize + } + + lsmSize, vlogSize := totalSize(db.opt.Dir) + y.LSMSize.Set(db.opt.Dir, newInt(lsmSize)) + // If valueDir is different from dir, we'd have to do another walk. + if db.opt.ValueDir != db.opt.Dir { + _, vlogSize = totalSize(db.opt.ValueDir) + } + y.VlogSize.Set(db.opt.Dir, newInt(vlogSize)) +} + +func (db *DB) updateSize(lc *y.Closer) { + defer lc.Done() + + metricsTicker := time.NewTicker(time.Minute) + defer metricsTicker.Stop() + + for { + select { + case <-metricsTicker.C: + db.calculateSize() + case <-lc.HasBeenClosed(): + return + } + } +} + +// RunValueLogGC triggers a value log garbage collection. +// +// It picks value log files to perform GC based on statistics that are collected +// duing compactions. If no such statistics are available, then log files are +// picked in random order. The process stops as soon as the first log file is +// encountered which does not result in garbage collection. 
+// +// When a log file is picked, it is first sampled. If the sample shows that we +// can discard at least discardRatio space of that file, it would be rewritten. +// +// If a call to RunValueLogGC results in no rewrites, then an ErrNoRewrite is +// thrown indicating that the call resulted in no file rewrites. +// +// We recommend setting discardRatio to 0.5, thus indicating that a file be +// rewritten if half the space can be discarded. This results in a lifetime +// value log write amplification of 2 (1 from original write + 0.5 rewrite + +// 0.25 + 0.125 + ... = 2). Setting it to higher value would result in fewer +// space reclaims, while setting it to a lower value would result in more space +// reclaims at the cost of increased activity on the LSM tree. discardRatio +// must be in the range (0.0, 1.0), both endpoints excluded, otherwise an +// ErrInvalidRequest is returned. +// +// Only one GC is allowed at a time. If another value log GC is running, or DB +// has been closed, this would return an ErrRejected. +// +// Note: Every time GC is run, it would produce a spike of activity on the LSM +// tree. +func (db *DB) RunValueLogGC(discardRatio float64) error { + if discardRatio >= 1.0 || discardRatio <= 0.0 { + return ErrInvalidRequest + } + + // Find head on disk + headKey := y.KeyWithTs(head, math.MaxUint64) + // Need to pass with timestamp, lsm get removes the last 8 bytes and compares key + val, err := db.lc.get(headKey, nil) + if err != nil { + return errors.Wrap(err, "Retrieving head from on-disk LSM") + } + + var head valuePointer + if len(val.Value) > 0 { + head.Decode(val.Value) + } + + // Pick a log file and run GC + return db.vlog.runGC(discardRatio, head) +} + +// Size returns the size of lsm and value log files in bytes. It can be used to decide how often to +// call RunValueLogGC. 
+func (db *DB) Size() (lsm, vlog int64) { + if y.LSMSize.Get(db.opt.Dir) == nil { + lsm, vlog = 0, 0 + return + } + lsm = y.LSMSize.Get(db.opt.Dir).(*expvar.Int).Value() + vlog = y.VlogSize.Get(db.opt.Dir).(*expvar.Int).Value() + return +} + +// Sequence represents a Badger sequence. +type Sequence struct { + sync.Mutex + db *DB + key []byte + next uint64 + leased uint64 + bandwidth uint64 +} + +// Next would return the next integer in the sequence, updating the lease by running a transaction +// if needed. +func (seq *Sequence) Next() (uint64, error) { + seq.Lock() + defer seq.Unlock() + if seq.next >= seq.leased { + if err := seq.updateLease(); err != nil { + return 0, err + } + } + val := seq.next + seq.next++ + return val, nil +} + +// Release the leased sequence to avoid wasted integers. This should be done right +// before closing the associated DB. However it is valid to use the sequence after +// it was released, causing a new lease with full bandwidth. +func (seq *Sequence) Release() error { + seq.Lock() + defer seq.Unlock() + err := seq.db.Update(func(txn *Txn) error { + var buf [8]byte + binary.BigEndian.PutUint64(buf[:], seq.next) + return txn.SetEntry(NewEntry(seq.key, buf[:])) + }) + if err != nil { + return err + } + seq.leased = seq.next + return nil +} + +func (seq *Sequence) updateLease() error { + return seq.db.Update(func(txn *Txn) error { + item, err := txn.Get(seq.key) + if err == ErrKeyNotFound { + seq.next = 0 + } else if err != nil { + return err + } else { + var num uint64 + if err := item.Value(func(v []byte) error { + num = binary.BigEndian.Uint64(v) + return nil + }); err != nil { + return err + } + seq.next = num + } + + lease := seq.next + seq.bandwidth + var buf [8]byte + binary.BigEndian.PutUint64(buf[:], lease) + if err = txn.SetEntry(NewEntry(seq.key, buf[:])); err != nil { + return err + } + seq.leased = lease + return nil + }) +} + +// GetSequence would initiate a new sequence object, generating it from the stored lease, if +// 
available, in the database. Sequence can be used to get a list of monotonically increasing +// integers. Multiple sequences can be created by providing different keys. Bandwidth sets the +// size of the lease, determining how many Next() requests can be served from memory. +// +// GetSequence is not supported on ManagedDB. Calling this would result in a panic. +func (db *DB) GetSequence(key []byte, bandwidth uint64) (*Sequence, error) { + if db.opt.managedTxns { + panic("Cannot use GetSequence with managedDB=true.") + } + + switch { + case len(key) == 0: + return nil, ErrEmptyKey + case bandwidth == 0: + return nil, ErrZeroBandwidth + } + seq := &Sequence{ + db: db, + key: key, + next: 0, + leased: 0, + bandwidth: bandwidth, + } + err := seq.updateLease() + return seq, err +} + +// Tables gets the TableInfo objects from the level controller. If withKeysCount +// is true, TableInfo objects also contain counts of keys for the tables. +func (db *DB) Tables(withKeysCount bool) []TableInfo { + return db.lc.getTableInfo(withKeysCount) +} + +// KeySplits can be used to get rough key ranges to divide up iteration over +// the DB. +func (db *DB) KeySplits(prefix []byte) []string { + var splits []string + // We just want table ranges here and not keys count. + for _, ti := range db.Tables(false) { + // We don't use ti.Left, because that has a tendency to store !badger + // keys. + if bytes.HasPrefix(ti.Right, prefix) { + splits = append(splits, string(ti.Right)) + } + } + sort.Strings(splits) + return splits +} + +// MaxBatchCount returns max possible entries in batch +func (db *DB) MaxBatchCount() int64 { + return db.opt.maxBatchCount +} + +// MaxBatchSize returns max possible batch size +func (db *DB) MaxBatchSize() int64 { + return db.opt.maxBatchSize +} + +func (db *DB) stopCompactions() { + // Stop memtable flushes. + if db.closers.memtable != nil { + close(db.flushChan) + db.closers.memtable.SignalAndWait() + } + // Stop compactions. 
+ if db.closers.compactors != nil { + db.closers.compactors.SignalAndWait() + } +} + +func (db *DB) startCompactions() { + // Resume compactions. + if db.closers.compactors != nil { + db.closers.compactors = y.NewCloser(1) + db.lc.startCompact(db.closers.compactors) + } + if db.closers.memtable != nil { + db.flushChan = make(chan flushTask, db.opt.NumMemtables) + db.closers.memtable = y.NewCloser(1) + go func() { + _ = db.flushMemtable(db.closers.memtable) + }() + } +} + +// Flatten can be used to force compactions on the LSM tree so all the tables fall on the same +// level. This ensures that all the versions of keys are colocated and not split across multiple +// levels, which is necessary after a restore from backup. During Flatten, live compactions are +// stopped. Ideally, no writes are going on during Flatten. Otherwise, it would create competition +// between flattening the tree and new tables being created at level zero. +func (db *DB) Flatten(workers int) error { + db.stopCompactions() + defer db.startCompactions() + + compactAway := func(cp compactionPriority) error { + db.opt.Infof("Attempting to compact with %+v\n", cp) + errCh := make(chan error, 1) + for i := 0; i < workers; i++ { + go func() { + errCh <- db.lc.doCompact(cp) + }() + } + var success int + var rerr error + for i := 0; i < workers; i++ { + err := <-errCh + if err != nil { + rerr = err + db.opt.Warningf("While running doCompact with %+v. Error: %v\n", cp, err) + } else { + success++ + } + } + if success == 0 { + return rerr + } + // We could do at least one successful compaction. So, we'll consider this a success. + db.opt.Infof("%d compactor(s) succeeded. One or more tables from level %d compacted.\n", + success, cp.level) + return nil + } + + hbytes := func(sz int64) string { + return humanize.Bytes(uint64(sz)) + } + + for { + db.opt.Infof("\n") + var levels []int + for i, l := range db.lc.levels { + sz := l.getTotalSize() + db.opt.Infof("Level: %d. %8s Size. 
%8s Max.\n", + i, hbytes(l.getTotalSize()), hbytes(l.maxTotalSize)) + if sz > 0 { + levels = append(levels, i) + } + } + if len(levels) <= 1 { + prios := db.lc.pickCompactLevels() + if len(prios) == 0 || prios[0].score <= 1.0 { + db.opt.Infof("All tables consolidated into one level. Flattening done.\n") + return nil + } + if err := compactAway(prios[0]); err != nil { + return err + } + continue + } + // Create an artificial compaction priority, to ensure that we compact the level. + cp := compactionPriority{level: levels[0], score: 1.71} + if err := compactAway(cp); err != nil { + return err + } + } +} + +func (db *DB) prepareToDrop() func() { + if db.opt.ReadOnly { + panic("Attempting to drop data in read-only mode.") + } + // Stop accepting new writes. + atomic.StoreInt32(&db.blockWrites, 1) + + // Make all pending writes finish. The following will also close writeCh. + db.closers.writes.SignalAndWait() + db.opt.Infof("Writes flushed. Stopping compactions now...") + + // Stop all compactions. + db.stopCompactions() + return func() { + db.opt.Infof("Resuming writes") + db.startCompactions() + + db.writeCh = make(chan *request, kvWriteChCapacity) + db.closers.writes = y.NewCloser(1) + go db.doWrites(db.closers.writes) + + // Resume writes. + atomic.StoreInt32(&db.blockWrites, 0) + } +} + +// DropAll would drop all the data stored in Badger. It does this in the following way. +// - Stop accepting new writes. +// - Pause memtable flushes and compactions. +// - Pick all tables from all levels, create a changeset to delete all these +// tables and apply it to manifest. +// - Pick all log files from value log, and delete all of them. Restart value log files from zero. +// - Resume memtable flushes and compactions. +// +// NOTE: DropAll is resilient to concurrent writes, but not to reads. It is up to the user to not do +// any reads while DropAll is going on, otherwise they may result in panics. 
Ideally, both reads and +// writes are paused before running DropAll, and resumed after it is finished. +func (db *DB) DropAll() error { + f, err := db.dropAll() + if err != nil { + return err + } + if f == nil { + panic("both error and returned function cannot be nil in DropAll") + } + f() + return nil +} + +func (db *DB) dropAll() (func(), error) { + db.opt.Infof("DropAll called. Blocking writes...") + f := db.prepareToDrop() + + // Block all foreign interactions with memory tables. + db.Lock() + defer db.Unlock() + + // Remove inmemory tables. Calling DecrRef for safety. Not sure if they're absolutely needed. + db.mt.DecrRef() + for _, mt := range db.imm { + mt.DecrRef() + } + db.imm = db.imm[:0] + db.mt = skl.NewSkiplist(arenaSize(db.opt)) // Set it up for future writes. + + num, err := db.lc.dropTree() + if err != nil { + return nil, err + } + db.opt.Infof("Deleted %d SSTables. Now deleting value logs...\n", num) + + num, err = db.vlog.dropAll() + if err != nil { + return nil, err + } + db.vhead = valuePointer{} // Zero it out. + db.lc.nextFileID = 1 + db.opt.Infof("Deleted %d value log files. DropAll done.\n", num) + return f, nil +} + +// DropPrefix would drop all the keys with the provided prefix. It does this in the following way: +// - Stop accepting new writes. +// - Stop memtable flushes and compactions. +// - Flush out all memtables, skipping over keys with the given prefix, Kp. +// - Write out the value log header to memtables when flushing, so we don't accidentally bring Kp +// back after a restart. +// - Compact L0->L1, skipping over Kp. +// - Compact rest of the levels, Li->Li, picking tables which have Kp. +// - Resume memtable flushes, compactions and writes. +func (db *DB) DropPrefix(prefix []byte) error { + db.opt.Infof("DropPrefix called on %s. Blocking writes...", hex.Dump(prefix)) + f := db.prepareToDrop() + defer f() + + // Block all foreign interactions with memory tables. 
+ db.Lock() + defer db.Unlock() + + db.imm = append(db.imm, db.mt) + for _, memtable := range db.imm { + if memtable.Empty() { + memtable.DecrRef() + continue + } + task := flushTask{ + mt: memtable, + // Ensure that the head of value log gets persisted to disk. + vptr: db.vhead, + dropPrefix: prefix, + } + db.opt.Debugf("Flushing memtable") + if err := db.handleFlushTask(task); err != nil { + db.opt.Errorf("While trying to flush memtable: %v", err) + return err + } + memtable.DecrRef() + } + db.imm = db.imm[:0] + db.mt = skl.NewSkiplist(arenaSize(db.opt)) + + // Drop prefixes from the levels. + if err := db.lc.dropPrefix(prefix); err != nil { + return err + } + db.opt.Infof("DropPrefix done") + return nil +} + +// Subscribe can be used watch key changes for the given key prefix. +func (db *DB) Subscribe(ctx context.Context, cb callback, prefix []byte, prefixes ...[]byte) error { + if cb == nil { + return ErrNilCallback + } + prefixes = append(prefixes, prefix) + c := y.NewCloser(1) + recvCh, id := db.pub.newSubscriber(c, prefixes...) + slurp := func(batch *pb.KVList) { + defer func() { + if len(batch.GetKv()) > 0 { + cb(batch) + } + }() + for { + select { + case kvs := <-recvCh: + batch.Kv = append(batch.Kv, kvs.Kv...) + default: + return + } + } + } + for { + select { + case <-c.HasBeenClosed(): + slurp(new(pb.KVList)) + // Drain if any pending updates. + c.Done() + // No need to delete here. Closer will be called only while + // closing DB. Subscriber will be deleted by cleanSubscribers. + return nil + case <-ctx.Done(): + c.Done() + db.pub.deleteSubscriber(id) + // Delete the subscriber to avoid further updates. 
+ return ctx.Err() + case batch := <-recvCh: + slurp(batch) + } + } +} diff --git a/vendor/github.com/dgraph-io/badger/dir_unix.go b/vendor/github.com/dgraph-io/badger/dir_unix.go new file mode 100644 index 0000000000..d56e6e821a --- /dev/null +++ b/vendor/github.com/dgraph-io/badger/dir_unix.go @@ -0,0 +1,118 @@ +// +build !windows + +/* + * Copyright 2017 Dgraph Labs, Inc. and Contributors + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package badger + +import ( + "fmt" + "io/ioutil" + "os" + "path/filepath" + + "github.com/dgraph-io/badger/y" + "github.com/pkg/errors" + "golang.org/x/sys/unix" +) + +// directoryLockGuard holds a lock on a directory and a pid file inside. The pid file isn't part +// of the locking mechanism, it's just advisory. +type directoryLockGuard struct { + // File handle on the directory, which we've flocked. + f *os.File + // The absolute path to our pid file. + path string + // Was this a shared lock for a read-only database? + readOnly bool +} + +// acquireDirectoryLock gets a lock on the directory (using flock). If +// this is not read-only, it will also write our pid to +// dirPath/pidFileName for convenience. +func acquireDirectoryLock(dirPath string, pidFileName string, readOnly bool) ( + *directoryLockGuard, error) { + // Convert to absolute path so that Release still works even if we do an unbalanced + // chdir in the meantime. 
+ absPidFilePath, err := filepath.Abs(filepath.Join(dirPath, pidFileName)) + if err != nil { + return nil, errors.Wrap(err, "cannot get absolute path for pid lock file") + } + f, err := os.Open(dirPath) + if err != nil { + return nil, errors.Wrapf(err, "cannot open directory %q", dirPath) + } + opts := unix.LOCK_EX | unix.LOCK_NB + if readOnly { + opts = unix.LOCK_SH | unix.LOCK_NB + } + + err = unix.Flock(int(f.Fd()), opts) + if err != nil { + f.Close() + return nil, errors.Wrapf(err, + "Cannot acquire directory lock on %q. Another process is using this Badger database.", + dirPath) + } + + if !readOnly { + // Yes, we happily overwrite a pre-existing pid file. We're the + // only read-write badger process using this directory. + err = ioutil.WriteFile(absPidFilePath, []byte(fmt.Sprintf("%d\n", os.Getpid())), 0666) + if err != nil { + f.Close() + return nil, errors.Wrapf(err, + "Cannot write pid file %q", absPidFilePath) + } + } + return &directoryLockGuard{f, absPidFilePath, readOnly}, nil +} + +// Release deletes the pid file and releases our lock on the directory. +func (guard *directoryLockGuard) release() error { + var err error + if !guard.readOnly { + // It's important that we remove the pid file first. + err = os.Remove(guard.path) + } + + if closeErr := guard.f.Close(); err == nil { + err = closeErr + } + guard.path = "" + guard.f = nil + + return err +} + +// openDir opens a directory for syncing. +func openDir(path string) (*os.File, error) { return os.Open(path) } + +// When you create or delete a file, you have to ensure the directory entry for the file is synced +// in order to guarantee the file is visible (if the system crashes). (See the man page for fsync, +// or see https://github.com/coreos/etcd/issues/6368 for an example.) 
+func syncDir(dir string) error { + f, err := openDir(dir) + if err != nil { + return errors.Wrapf(err, "While opening directory: %s.", dir) + } + err = y.FileSync(f) + closeErr := f.Close() + if err != nil { + return errors.Wrapf(err, "While syncing directory: %s.", dir) + } + return errors.Wrapf(closeErr, "While closing directory: %s.", dir) +} diff --git a/vendor/github.com/dgraph-io/badger/dir_windows.go b/vendor/github.com/dgraph-io/badger/dir_windows.go new file mode 100644 index 0000000000..60f982e2c5 --- /dev/null +++ b/vendor/github.com/dgraph-io/badger/dir_windows.go @@ -0,0 +1,110 @@ +// +build windows + +/* + * Copyright 2017 Dgraph Labs, Inc. and Contributors + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package badger + +// OpenDir opens a directory in windows with write access for syncing. +import ( + "os" + "path/filepath" + "syscall" + + "github.com/pkg/errors" +) + +// FILE_ATTRIBUTE_TEMPORARY - A file that is being used for temporary storage. +// FILE_FLAG_DELETE_ON_CLOSE - The file is to be deleted immediately after all of its handles are +// closed, which includes the specified handle and any other open or duplicated handles. 
+// See: https://docs.microsoft.com/en-us/windows/desktop/FileIO/file-attribute-constants +// NOTE: Added here to avoid importing golang.org/x/sys/windows +const ( + FILE_ATTRIBUTE_TEMPORARY = 0x00000100 + FILE_FLAG_DELETE_ON_CLOSE = 0x04000000 +) + +func openDir(path string) (*os.File, error) { + fd, err := openDirWin(path) + if err != nil { + return nil, err + } + return os.NewFile(uintptr(fd), path), nil +} + +func openDirWin(path string) (fd syscall.Handle, err error) { + if len(path) == 0 { + return syscall.InvalidHandle, syscall.ERROR_FILE_NOT_FOUND + } + pathp, err := syscall.UTF16PtrFromString(path) + if err != nil { + return syscall.InvalidHandle, err + } + access := uint32(syscall.GENERIC_READ | syscall.GENERIC_WRITE) + sharemode := uint32(syscall.FILE_SHARE_READ | syscall.FILE_SHARE_WRITE) + createmode := uint32(syscall.OPEN_EXISTING) + fl := uint32(syscall.FILE_FLAG_BACKUP_SEMANTICS) + return syscall.CreateFile(pathp, access, sharemode, nil, createmode, fl, 0) +} + +// DirectoryLockGuard holds a lock on the directory. +type directoryLockGuard struct { + h syscall.Handle + path string +} + +// AcquireDirectoryLock acquires exclusive access to a directory. +func acquireDirectoryLock(dirPath string, pidFileName string, readOnly bool) (*directoryLockGuard, error) { + if readOnly { + return nil, ErrWindowsNotSupported + } + + // Convert to absolute path so that Release still works even if we do an unbalanced + // chdir in the meantime. + absLockFilePath, err := filepath.Abs(filepath.Join(dirPath, pidFileName)) + if err != nil { + return nil, errors.Wrap(err, "Cannot get absolute path for pid lock file") + } + + // This call creates a file handler in memory that only one process can use at a time. When + // that process ends, the file is deleted by the system. + // FILE_ATTRIBUTE_TEMPORARY is used to tell Windows to try to create the handle in memory. 
+ // FILE_FLAG_DELETE_ON_CLOSE is not specified in syscall_windows.go but tells Windows to delete + // the file when all processes holding the handler are closed. + // XXX: this works but it's a bit klunky. i'd prefer to use LockFileEx but it needs unsafe pkg. + h, err := syscall.CreateFile( + syscall.StringToUTF16Ptr(absLockFilePath), 0, 0, nil, + syscall.OPEN_ALWAYS, + uint32(FILE_ATTRIBUTE_TEMPORARY|FILE_FLAG_DELETE_ON_CLOSE), + 0) + if err != nil { + return nil, errors.Wrapf(err, + "Cannot create lock file %q. Another process is using this Badger database", + absLockFilePath) + } + + return &directoryLockGuard{h: h, path: absLockFilePath}, nil +} + +// Release removes the directory lock. +func (g *directoryLockGuard) release() error { + g.path = "" + return syscall.CloseHandle(g.h) +} + +// Windows doesn't support syncing directories to the file system. See +// https://github.com/dgraph-io/badger/issues/699#issuecomment-504133587 for more details. +func syncDir(dir string) error { return nil } diff --git a/vendor/github.com/dgraph-io/badger/doc.go b/vendor/github.com/dgraph-io/badger/doc.go new file mode 100644 index 0000000000..83dc9a28ac --- /dev/null +++ b/vendor/github.com/dgraph-io/badger/doc.go @@ -0,0 +1,28 @@ +/* +Package badger implements an embeddable, simple and fast key-value database, +written in pure Go. It is designed to be highly performant for both reads and +writes simultaneously. Badger uses Multi-Version Concurrency Control (MVCC), and +supports transactions. It runs transactions concurrently, with serializable +snapshot isolation guarantees. + +Badger uses an LSM tree along with a value log to separate keys from values, +hence reducing both write amplification and the size of the LSM tree. This +allows LSM tree to be served entirely from RAM, while the values are served +from SSD. + + +Usage + +Badger has the following main types: DB, Txn, Item and Iterator. DB contains +keys that are associated with values. 
It must be opened with the appropriate +options before it can be accessed. + +All operations happen inside a Txn. Txn represents a transaction, which can +be read-only or read-write. Read-only transactions can read values for a +given key (which are returned inside an Item), or iterate over a set of +key-value pairs using an Iterator (which are returned as Item type values as +well). Read-write transactions can also update and delete keys from the DB. + +See the examples for more usage details. +*/ +package badger diff --git a/vendor/github.com/dgraph-io/badger/errors.go b/vendor/github.com/dgraph-io/badger/errors.go new file mode 100644 index 0000000000..8d2df6833a --- /dev/null +++ b/vendor/github.com/dgraph-io/badger/errors.go @@ -0,0 +1,117 @@ +/* + * Copyright 2017 Dgraph Labs, Inc. and Contributors + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package badger + +import ( + "math" + + "github.com/pkg/errors" +) + +const ( + // ValueThresholdLimit is the maximum permissible value of opt.ValueThreshold. + ValueThresholdLimit = math.MaxUint16 - 16 + 1 +) + +var ( + // ErrValueLogSize is returned when opt.ValueLogFileSize option is not within the valid + // range. + ErrValueLogSize = errors.New("Invalid ValueLogFileSize, must be between 1MB and 2GB") + + // ErrValueThreshold is returned when ValueThreshold is set to a value close to or greater than + // uint16. 
+ ErrValueThreshold = errors.Errorf( + "Invalid ValueThreshold, must be less than %d", ValueThresholdLimit) + + // ErrKeyNotFound is returned when key isn't found on a txn.Get. + ErrKeyNotFound = errors.New("Key not found") + + // ErrTxnTooBig is returned if too many writes are fit into a single transaction. + ErrTxnTooBig = errors.New("Txn is too big to fit into one request") + + // ErrConflict is returned when a transaction conflicts with another transaction. This can + // happen if the read rows had been updated concurrently by another transaction. + ErrConflict = errors.New("Transaction Conflict. Please retry") + + // ErrReadOnlyTxn is returned if an update function is called on a read-only transaction. + ErrReadOnlyTxn = errors.New("No sets or deletes are allowed in a read-only transaction") + + // ErrDiscardedTxn is returned if a previously discarded transaction is re-used. + ErrDiscardedTxn = errors.New("This transaction has been discarded. Create a new one") + + // ErrEmptyKey is returned if an empty key is passed on an update function. + ErrEmptyKey = errors.New("Key cannot be empty") + + // ErrInvalidKey is returned if the key has a special !badger! prefix, + // reserved for internal usage. + ErrInvalidKey = errors.New("Key is using a reserved !badger! prefix") + + // ErrRetry is returned when a log file containing the value is not found. + // This usually indicates that it may have been garbage collected, and the + // operation needs to be retried. + ErrRetry = errors.New("Unable to find log file. Please retry") + + // ErrThresholdZero is returned if threshold is set to zero, and value log GC is called. + // In such a case, GC can't be run. + ErrThresholdZero = errors.New( + "Value log GC can't run because threshold is set to zero") + + // ErrNoRewrite is returned if a call for value log GC doesn't result in a log file rewrite. 
+ ErrNoRewrite = errors.New( + "Value log GC attempt didn't result in any cleanup") + + // ErrRejected is returned if a value log GC is called either while another GC is running, or + // after DB::Close has been called. + ErrRejected = errors.New("Value log GC request rejected") + + // ErrInvalidRequest is returned if the user request is invalid. + ErrInvalidRequest = errors.New("Invalid request") + + // ErrManagedTxn is returned if the user tries to use an API which isn't + // allowed due to external management of transactions, when using ManagedDB. + ErrManagedTxn = errors.New( + "Invalid API request. Not allowed to perform this action using ManagedDB") + + // ErrInvalidDump if a data dump made previously cannot be loaded into the database. + ErrInvalidDump = errors.New("Data dump cannot be read") + + // ErrZeroBandwidth is returned if the user passes in zero bandwidth for sequence. + ErrZeroBandwidth = errors.New("Bandwidth must be greater than zero") + + // ErrInvalidLoadingMode is returned when opt.ValueLogLoadingMode option is not + // within the valid range + ErrInvalidLoadingMode = errors.New("Invalid ValueLogLoadingMode, must be FileIO or MemoryMap") + + // ErrReplayNeeded is returned when opt.ReadOnly is set but the + // database requires a value log replay. + ErrReplayNeeded = errors.New("Database was not properly closed, cannot open read-only") + + // ErrWindowsNotSupported is returned when opt.ReadOnly is used on Windows + ErrWindowsNotSupported = errors.New("Read-only mode is not supported on Windows") + + // ErrTruncateNeeded is returned when the value log gets corrupt, and requires truncation of + // corrupt data to allow Badger to run properly. + ErrTruncateNeeded = errors.New( + "Value log truncate required to run DB. This might result in data loss") + + // ErrBlockedWrites is returned if the user called DropAll. During the process of dropping all + // data from Badger, we stop accepting new writes, by returning this error. 
+ ErrBlockedWrites = errors.New("Writes are blocked, possibly due to DropAll or Close") + + // ErrNilCallback is returned when subscriber's callback is nil. + ErrNilCallback = errors.New("Callback cannot be nil") +) diff --git a/vendor/github.com/dgraph-io/badger/histogram.go b/vendor/github.com/dgraph-io/badger/histogram.go new file mode 100644 index 0000000000..d8c94bb7ad --- /dev/null +++ b/vendor/github.com/dgraph-io/badger/histogram.go @@ -0,0 +1,169 @@ +/* + * Copyright 2019 Dgraph Labs, Inc. and Contributors + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package badger + +import ( + "fmt" + "math" +) + +// PrintHistogram builds and displays the key-value size histogram. 
+// When keyPrefix is set, only the keys that have prefix "keyPrefix" are
+// considered for creating the histogram.
+//
+// A nil receiver is tolerated: a notice is printed and the call returns.
+// Everything is written to standard output through the fmt package.
+func (db *DB) PrintHistogram(keyPrefix []byte) {
+	if db == nil {
+		fmt.Println("\nCannot build histogram: DB is nil.")
+		return
+	}
+	histogram := db.buildHistogram(keyPrefix)
+	fmt.Printf("Histogram of key sizes (in bytes)\n")
+	histogram.keySizeHistogram.printHistogram()
+	fmt.Printf("Histogram of value sizes (in bytes)\n")
+	histogram.valueSizeHistogram.printHistogram()
+}
+
+// histogramData stores information about a histogram.
+type histogramData struct {
+	// bins holds the bucket boundaries, as produced by createHistogramBins.
+	bins []int64
+	// countPerBin holds one counter per boundary plus one trailing counter;
+	// newSizeHistogram allocates it with len(bins)+1 entries.
+	countPerBin []int64
+	// totalCount is the number of values recorded so far.
+	totalCount int64
+	// min and max track the smallest and largest recorded value. They are
+	// seeded with math.MaxInt64 and math.MinInt64 by newSizeHistogram.
+	min int64
+	max int64
+	// sum is the running total of all recorded values.
+	sum int64
+}
+
+// sizeHistogram contains keySize histogram and valueSize histogram.
+type sizeHistogram struct {
+	keySizeHistogram, valueSizeHistogram histogramData
+}
+
+// newSizeHistogram returns a new instance of sizeHistogram with
+// properly initialized fields. Key bins use exponents 1 through 16 and value
+// bins use exponents 1 through 30.
+func newSizeHistogram() *sizeHistogram {
+	// TODO(ibrahim): find appropriate bin size.
+	keyBins := createHistogramBins(1, 16)
+	valueBins := createHistogramBins(1, 30)
+	return &sizeHistogram{
+		keySizeHistogram: histogramData{
+			bins:        keyBins,
+			countPerBin: make([]int64, len(keyBins)+1),
+			max:         math.MinInt64,
+			min:         math.MaxInt64,
+			sum:         0,
+		},
+		valueSizeHistogram: histogramData{
+			bins:        valueBins,
+			countPerBin: make([]int64, len(valueBins)+1),
+			max:         math.MinInt64,
+			min:         math.MaxInt64,
+			sum:         0,
+		},
+	}
+}
+
+// createHistogramBins creates bins for a histogram. The bin sizes are powers
+// of two of the form [2^min_exponent, ..., 2^max_exponent].
+func createHistogramBins(minExponent, maxExponent uint32) []int64 {
+	// One boundary per exponent, in ascending order. The result is nil when
+	// minExponent is greater than maxExponent.
+	var bins []int64
+	for i := minExponent; i <= maxExponent; i++ {
+		bins = append(bins, int64(1)<<i)
+	}
+	return bins
+}
+
+// Update records value in the histogram: it widens min/max when value falls
+// outside the current range, adds value to sum, increments totalCount, and
+// bumps the counter of the first bin whose boundary is greater than value.
+// Values that are not below any boundary land in the extra last counter.
+func (histogram *histogramData) Update(value int64) {
+	if value > histogram.max {
+		histogram.max = value
+	}
+	if value < histogram.min {
+		histogram.min = value
+	}
+
+	histogram.sum += value
+	histogram.totalCount++
+
+	for index := 0; index <= len(histogram.bins); index++ {
+		// Allocate value in the last buckets if we reached the end of the Bounds array.
+		if index == len(histogram.bins) {
+			histogram.countPerBin[index]++
+			break
+		}
+
+		// Check if the value should be added to the "index" bin
+		if value < int64(histogram.bins[index]) {
+			histogram.countPerBin[index]++
+			break
+		}
+	}
+}
+
+// buildHistogram builds the key-value size histogram.
+// When keyPrefix is set, only the keys that have prefix "keyPrefix" are
+// considered for creating the histogram
+//
+// It iterates inside a transaction created with NewTransaction(false) and
+// reads sizes through Item.KeySize and Item.ValueSize.
+func (db *DB) buildHistogram(keyPrefix []byte) *sizeHistogram {
+	txn := db.NewTransaction(false)
+	defer txn.Discard()
+
+	itr := txn.NewIterator(DefaultIteratorOptions)
+	defer itr.Close()
+
+	badgerHistogram := newSizeHistogram()
+
+	// Collect key and value sizes.
+	for itr.Seek(keyPrefix); itr.ValidForPrefix(keyPrefix); itr.Next() {
+		item := itr.Item()
+		badgerHistogram.keySizeHistogram.Update(item.KeySize())
+		badgerHistogram.valueSizeHistogram.Update(item.ValueSize())
+	}
+	return badgerHistogram
+}
+
+// printHistogram prints the histogram data in a human-readable format.
+func (histogram histogramData) printHistogram() { + fmt.Printf("Total count: %d\n", histogram.totalCount) + fmt.Printf("Min value: %d\n", histogram.min) + fmt.Printf("Max value: %d\n", histogram.max) + fmt.Printf("Mean: %.2f\n", float64(histogram.sum)/float64(histogram.totalCount)) + fmt.Printf("%24s %9s\n", "Range", "Count") + + numBins := len(histogram.bins) + for index, count := range histogram.countPerBin { + if count == 0 { + continue + } + + // The last bin represents the bin that contains the range from + // the last bin up to infinity so it's processed differently than the + // other bins. + if index == len(histogram.countPerBin)-1 { + lowerBound := int(histogram.bins[numBins-1]) + fmt.Printf("[%10d, %10s) %9d\n", lowerBound, "infinity", count) + continue + } + + upperBound := int(histogram.bins[index]) + lowerBound := 0 + if index > 0 { + lowerBound = int(histogram.bins[index-1]) + } + + fmt.Printf("[%10d, %10d) %9d\n", lowerBound, upperBound, count) + } + fmt.Println() +} diff --git a/vendor/github.com/dgraph-io/badger/iterator.go b/vendor/github.com/dgraph-io/badger/iterator.go new file mode 100644 index 0000000000..f4af4058d2 --- /dev/null +++ b/vendor/github.com/dgraph-io/badger/iterator.go @@ -0,0 +1,684 @@ +/* + * Copyright 2017 Dgraph Labs, Inc. and Contributors + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +package badger + +import ( + "bytes" + "fmt" + "hash/crc32" + "sync" + "sync/atomic" + "time" + + "github.com/dgraph-io/badger/options" + "github.com/dgraph-io/badger/table" + + "github.com/dgraph-io/badger/y" +) + +type prefetchStatus uint8 + +const ( + prefetched prefetchStatus = iota + 1 +) + +// Item is returned during iteration. Both the Key() and Value() output is only valid until +// iterator.Next() is called. +type Item struct { + status prefetchStatus + err error + wg sync.WaitGroup + db *DB + key []byte + vptr []byte + meta byte // We need to store meta to know about bitValuePointer. + userMeta byte + expiresAt uint64 + val []byte + slice *y.Slice // Used only during prefetching. + next *Item + version uint64 + txn *Txn +} + +// String returns a string representation of Item +func (item *Item) String() string { + return fmt.Sprintf("key=%q, version=%d, meta=%x", item.Key(), item.Version(), item.meta) +} + +// Key returns the key. +// +// Key is only valid as long as item is valid, or transaction is valid. If you need to use it +// outside its validity, please use KeyCopy. +func (item *Item) Key() []byte { + return item.key +} + +// KeyCopy returns a copy of the key of the item, writing it to dst slice. +// If nil is passed, or capacity of dst isn't sufficient, a new slice would be allocated and +// returned. +func (item *Item) KeyCopy(dst []byte) []byte { + return y.SafeCopy(dst, item.key) +} + +// Version returns the commit timestamp of the item. +func (item *Item) Version() uint64 { + return item.version +} + +// Value retrieves the value of the item from the value log. +// +// This method must be called within a transaction. Calling it outside a +// transaction is considered undefined behavior. If an iterator is being used, +// then Item.Value() is defined in the current iteration only, because items are +// reused. +// +// If you need to use a value outside a transaction, please use Item.ValueCopy +// instead, or copy it yourself. 
+// Value might change once discard or commit is called.
+// Use ValueCopy if you want to do a Set after Get.
+//
+// fn may be nil, in which case the value is fetched but not handed to anyone.
+// An error returned by fn is returned to the caller unchanged.
+func (item *Item) Value(fn func(val []byte) error) error {
+	// Wait for a prefetch goroutine, if one was started for this item.
+	item.wg.Wait()
+	if item.status == prefetched {
+		if item.err == nil && fn != nil {
+			if err := fn(item.val); err != nil {
+				return err
+			}
+		}
+		return item.err
+	}
+	buf, cb, err := item.yieldItemValue()
+	// cb is deferred so that buf is still usable while fn runs.
+	defer runCallback(cb)
+	if err != nil {
+		return err
+	}
+	if fn != nil {
+		return fn(buf)
+	}
+	return nil
+}
+
+// ValueCopy returns a copy of the value of the item from the value log, writing it to dst slice.
+// If nil is passed, or capacity of dst isn't sufficient, a new slice would be allocated and
+// returned. Tip: It might make sense to reuse the returned slice as dst argument for the next call.
+//
+// This function is useful in long running iterate/update transactions to avoid a write deadlock.
+// See Github issue: https://github.com/dgraph-io/badger/issues/315
+func (item *Item) ValueCopy(dst []byte) ([]byte, error) {
+	item.wg.Wait()
+	if item.status == prefetched {
+		return y.SafeCopy(dst, item.val), item.err
+	}
+	buf, cb, err := item.yieldItemValue()
+	defer runCallback(cb)
+	return y.SafeCopy(dst, buf), err
+}
+
+// hasValue reports whether the item carries a value; an item with zero meta
+// and a nil vptr is treated as "key not found".
+func (item *Item) hasValue() bool {
+	if item.meta == 0 && item.vptr == nil {
+		// key not found
+		return false
+	}
+	return true
+}
+
+// IsDeletedOrExpired returns true if item contains deleted or expired value.
+func (item *Item) IsDeletedOrExpired() bool {
+	return isDeletedOrExpired(item.meta, item.expiresAt)
+}
+
+// DiscardEarlierVersions returns whether the item was created with the
+// option to discard earlier versions of a key when multiple are available.
+func (item *Item) DiscardEarlierVersions() bool {
+	return item.meta&bitDiscardEarlierVersions > 0
+}
+
+// yieldItemValue resolves the item's value and returns it together with a
+// callback and an error. The callback may be nil; callers pass it to
+// runCallback once they are done with the returned slice.
+//
+// When bitValuePointer is not set in meta, vptr holds the value itself and is
+// copied into item.slice. Otherwise vptr is decoded as a valuePointer and read
+// from the value log. If that read reports ErrRetry, the lookup is repeated
+// under the badgerMove key prefix.
+func (item *Item) yieldItemValue() ([]byte, func(), error) {
+	key := item.Key() // No need to copy.
+	for {
+		if !item.hasValue() {
+			return nil, nil, nil
+		}
+
+		if item.slice == nil {
+			item.slice = new(y.Slice)
+		}
+
+		if (item.meta & bitValuePointer) == 0 {
+			val := item.slice.Resize(len(item.vptr))
+			copy(val, item.vptr)
+			return val, nil, nil
+		}
+
+		var vp valuePointer
+		vp.Decode(item.vptr)
+		result, cb, err := item.db.vlog.Read(vp, item.slice)
+		if err != ErrRetry {
+			return result, cb, err
+		}
+		if bytes.HasPrefix(key, badgerMove) {
+			// err == ErrRetry
+			// Error is retry even after checking the move keyspace. So, let's
+			// just assume that value is not present.
+			return nil, cb, nil
+		}
+
+		// The value pointer is pointing to a deleted value log. Look for the
+		// move key and read that instead.
+		runCallback(cb)
+		// Do not put badgerMove on the left in append. It seems to cause some sort of manipulation.
+		keyTs := y.KeyWithTs(item.Key(), item.Version())
+		key = make([]byte, len(badgerMove)+len(keyTs))
+		n := copy(key, badgerMove)
+		copy(key[n:], keyTs)
+		// Note that we can't set item.key to move key, because that would
+		// change the key user sees before and after this call. Also, this move
+		// logic is internal logic and should not impact the external behavior
+		// of the retrieval.
+		vs, err := item.db.get(key)
+		if err != nil {
+			return nil, nil, err
+		}
+		if vs.Version != item.Version() {
+			return nil, nil, nil
+		}
+		// Bug fix: Always copy the vs.Value into vptr here. Otherwise, when item is reused this
+		// slice gets overwritten.
+		item.vptr = y.SafeCopy(item.vptr, vs.Value)
+		item.meta &^= bitValuePointer // Clear the value pointer bit.
+		if vs.Meta&bitValuePointer > 0 {
+			item.meta |= bitValuePointer // This meta would only be about value pointer.
+		}
+	}
+}
+
+// runCallback invokes cb unless it is nil.
+func runCallback(cb func()) {
+	if cb != nil {
+		cb()
+	}
+}
+
+// prefetchValue resolves the value once and caches the outcome on the item:
+// err and status are always set, val only when a non-nil value was produced.
+func (item *Item) prefetchValue() {
+	val, cb, err := item.yieldItemValue()
+	defer runCallback(cb)
+
+	item.err = err
+	item.status = prefetched
+	if val == nil {
+		return
+	}
+	// NOTE(review): in MemoryMap mode the value is copied into item.slice,
+	// presumably because val must not be used after cb has run; confirm
+	// against the value log implementation.
+	if item.db.opt.ValueLogLoadingMode == options.MemoryMap {
+		buf := item.slice.Resize(len(val))
+		copy(buf, val)
+		item.val = buf
+	} else {
+		item.val = val
+	}
+}
+
+// EstimatedSize returns the approximate size of the key-value pair.
+//
+// This can be called while iterating through a store to quickly estimate the
+// size of a range of key-value pairs (without fetching the corresponding
+// values).
+func (item *Item) EstimatedSize() int64 {
+	if !item.hasValue() {
+		return 0
+	}
+	if (item.meta & bitValuePointer) == 0 {
+		return int64(len(item.key) + len(item.vptr))
+	}
+	var vp valuePointer
+	vp.Decode(item.vptr)
+	return int64(vp.Len) // includes key length.
+}
+
+// KeySize returns the size of the key.
+// Exact size of the key is key + 8 bytes of timestamp; the value returned
+// here is len(item.key) only and does not include those 8 bytes.
+func (item *Item) KeySize() int64 {
+	return int64(len(item.key))
+}
+
+// ValueSize returns the exact size of the value.
+//
+// This can be called to quickly estimate the size of a value without fetching
+// it.
+func (item *Item) ValueSize() int64 {
+	if !item.hasValue() {
+		return 0
+	}
+	if (item.meta & bitValuePointer) == 0 {
+		return int64(len(item.vptr))
+	}
+	var vp valuePointer
+	vp.Decode(item.vptr)
+
+	// Subtract the timestamped key, the header and the checksum from the
+	// length recorded in the value pointer.
+	klen := int64(len(item.key) + 8) // 8 bytes for timestamp.
+	return int64(vp.Len) - klen - headerBufSize - crc32.Size
+}
+
+// UserMeta returns the userMeta set by the user. Typically, this byte, optionally set by the user
+// is used to interpret the value.
+func (item *Item) UserMeta() byte {
+	return item.userMeta
+}
+
+// ExpiresAt returns a Unix time value indicating when the item will be
+// considered expired. 0 indicates that the item will never expire.
+func (item *Item) ExpiresAt() uint64 {
+	return item.expiresAt
+}
+
+// list is a singly linked FIFO queue of Items chained through Item.next.
+// TODO: Switch this to use linked list container in Go.
+type list struct {
+	head *Item
+	tail *Item
+}
+
+// push appends i to the tail of the list and clears i.next.
+func (l *list) push(i *Item) {
+	i.next = nil
+	if l.tail == nil {
+		l.head = i
+		l.tail = i
+		return
+	}
+	l.tail.next = i
+	l.tail = i
+}
+
+// pop removes and returns the head of the list, or nil when the list is empty.
+func (l *list) pop() *Item {
+	if l.head == nil {
+		return nil
+	}
+	i := l.head
+	if l.head == l.tail {
+		l.tail = nil
+		l.head = nil
+	} else {
+		l.head = i.next
+	}
+	i.next = nil
+	return i
+}
+
+// IteratorOptions is used to set options when iterating over Badger key-value
+// stores.
+//
+// This package provides DefaultIteratorOptions which contains options that
+// should work for most applications. Consider using that as a starting point
+// before customizing it for your own needs.
+type IteratorOptions struct {
+	// Indicates whether we should prefetch values during iteration and store them.
+	PrefetchValues bool
+	// How many KV pairs to prefetch while iterating. Valid only if PrefetchValues is true.
+	PrefetchSize int
+	Reverse      bool // Direction of iteration. False is forward, true is backward.
+	AllVersions  bool // Fetch all valid versions of the same key.
+
+	// The following option is used to narrow down the SSTables that iterator picks up. If
+	// Prefix is specified, only tables which could have this prefix are picked based on their range
+	// of keys.
+	Prefix      []byte // Only iterate over this given prefix.
+	prefixIsKey bool   // If set, use the prefix for bloom filter lookup.
+
+	InternalAccess bool // Used to allow internal access to badger keys.
+}
+
+// pickTable reports whether table t could contain keys with opt.Prefix. With
+// an empty prefix every table is picked. Otherwise the table's smallest and
+// biggest keys, trimmed to the prefix length, must bracket the prefix, and
+// when prefixIsKey is set the table's DoesNotHave check must not exclude it.
+func (opt *IteratorOptions) pickTable(t table.TableInterface) bool {
+	if len(opt.Prefix) == 0 {
+		return true
+	}
+	trim := func(key []byte) []byte {
+		if len(key) > len(opt.Prefix) {
+			return key[:len(opt.Prefix)]
+		}
+		return key
+	}
+	if bytes.Compare(trim(t.Smallest()), opt.Prefix) > 0 {
+		return false
+	}
+	if bytes.Compare(trim(t.Biggest()), opt.Prefix) < 0 {
+		return false
+	}
+	// Bloom filter lookup would only work if opt.Prefix does NOT have the read
+	// timestamp as part of the key.
+	if opt.prefixIsKey && t.DoesNotHave(opt.Prefix) {
+		return false
+	}
+	return true
+}
+
+// DefaultIteratorOptions contains default options when iterating over Badger key-value stores.
+var DefaultIteratorOptions = IteratorOptions{
+	PrefetchValues: true,
+	PrefetchSize:   100,
+	Reverse:        false,
+	AllVersions:    false,
+}
+
+// Iterator helps iterating over the KV pairs in a lexicographically sorted order.
+type Iterator struct {
+	iitr   *y.MergeIterator
+	txn    *Txn
+	readTs uint64
+
+	opt  IteratorOptions
+	item *Item // Current item; nil when iteration is done.
+	data list  // Items parsed ahead of the current one.
+	waste list // Items handed back by Next/Seek, reused by newItem.
+
+	lastKey []byte // Used to skip over multiple versions of the same key.
+
+	closed bool
+}
+
+// NewIterator returns a new iterator. Depending upon the options, either only keys, or both
+// key-value pairs would be fetched. The keys are returned in lexicographically sorted order.
+// Using prefetch is recommended if you're doing a long running iteration, for performance.
+//
+// Multiple Iterators:
+// For a read-only txn, multiple iterators can be running simultaneously. However, for a read-write
+// txn, only one can be running at one time to avoid race conditions, because Txn is thread-unsafe.
+//
+// NewIterator panics if the transaction has already been discarded, or if a
+// second iterator is requested on a read-write transaction.
+func (txn *Txn) NewIterator(opt IteratorOptions) *Iterator {
+	if txn.discarded {
+		panic("Transaction has already been discarded")
+	}
+	// Do not change the order of the next if. We must track the number of running iterators.
+	if atomic.AddInt32(&txn.numIterators, 1) > 1 && txn.update {
+		atomic.AddInt32(&txn.numIterators, -1)
+		panic("Only one iterator can be active at one time, for a RW txn.")
+	}
+
+	// TODO: If Prefix is set, only pick those memtables which have keys with
+	// the prefix.
+	tables, decr := txn.db.getMemTables()
+	defer decr()
+	txn.db.vlog.incrIteratorCount()
+	var iters []y.Iterator
+	if itr := txn.newPendingWritesIterator(opt.Reverse); itr != nil {
+		iters = append(iters, itr)
+	}
+	for i := 0; i < len(tables); i++ {
+		iters = append(iters, tables[i].NewUniIterator(opt.Reverse))
+	}
+	iters = txn.db.lc.appendIterators(iters, &opt) // This will increment references.
+	res := &Iterator{
+		txn:    txn,
+		iitr:   y.NewMergeIterator(iters, opt.Reverse),
+		opt:    opt,
+		readTs: txn.readTs,
+	}
+	return res
+}
+
+// NewKeyIterator is just like NewIterator, but allows the user to iterate over all versions of a
+// single key. Internally, it sets the Prefix option in provided opt, and uses that prefix to
+// additionally run bloom filter lookups before picking tables from the LSM tree.
+//
+// It panics if opt.Prefix is already set.
+func (txn *Txn) NewKeyIterator(key []byte, opt IteratorOptions) *Iterator {
+	if len(opt.Prefix) > 0 {
+		panic("opt.Prefix should be nil for NewKeyIterator.")
+	}
+	opt.Prefix = key // This key must be without the timestamp.
+	opt.prefixIsKey = true
+	return txn.NewIterator(opt)
+}
+
+// newItem returns an Item taken from the waste list, or a freshly allocated
+// one when the waste list is empty.
+func (it *Iterator) newItem() *Item {
+	item := it.waste.pop()
+	if item == nil {
+		item = &Item{slice: new(y.Slice), db: it.txn.db, txn: it.txn}
+	}
+	return item
+}
+
+// Item returns pointer to the current key-value pair.
+// This item is only valid until it.Next() gets called.
+//
+// The current key is also passed to the transaction's addReadKey.
+// NOTE(review): it.item is dereferenced without a nil check, so this is only
+// safe to call while Valid() reports true.
+func (it *Iterator) Item() *Item {
+	tx := it.txn
+	tx.addReadKey(it.item.Key())
+	return it.item
+}
+
+// Valid returns false when iteration is done.
+func (it *Iterator) Valid() bool {
+	if it.item == nil {
+		return false
+	}
+	// The current key must also still carry the prefix configured in the options.
+	return bytes.HasPrefix(it.item.key, it.opt.Prefix)
+}
+
+// ValidForPrefix returns false when iteration is done
+// or when the current key is not prefixed by the specified prefix.
+func (it *Iterator) ValidForPrefix(prefix []byte) bool {
+	return it.Valid() && bytes.HasPrefix(it.item.key, prefix)
+}
+
+// Close would close the iterator. It is important to call this when you're done with iteration.
+// Calling Close more than once is a no-op.
+func (it *Iterator) Close() {
+	if it.closed {
+		return
+	}
+	it.closed = true
+
+	it.iitr.Close()
+	// It is important to wait for the fill goroutines to finish. Otherwise, we might leave zombie
+	// goroutines behind, which are waiting to acquire file read locks after DB has been closed.
+	// waitFor receives the list by value, so it drains a copy of the head/tail pointers.
+	waitFor := func(l list) {
+		item := l.pop()
+		for item != nil {
+			item.wg.Wait()
+			item = l.pop()
+		}
+	}
+	waitFor(it.waste)
+	waitFor(it.data)
+
+	// TODO: We could handle this error.
+	_ = it.txn.db.vlog.decrIteratorCount()
+	atomic.AddInt32(&it.txn.numIterators, -1)
+}
+
+// Next would advance the iterator by one. Always check it.Valid() after a Next()
+// to ensure you have access to a valid it.Item().
+//
+// NOTE(review): it.item is dereferenced without a nil check, so Next is only
+// safe to call while the iterator has a current item.
+func (it *Iterator) Next() {
+	// Reuse current item
+	it.item.wg.Wait() // Just cleaner to wait before pushing to avoid doing ref counting.
+	it.waste.push(it.item)
+
+	// Set next item to current
+	it.item = it.data.pop()
+
+	for it.iitr.Valid() {
+		if it.parseItem() {
+			// parseItem calls one extra next.
+			// This is used to deal with the complexity of reverse iteration.
+			break
+		}
+	}
+}
+
+// isDeletedOrExpired reports whether meta has the delete bit set, or whether
+// expiresAt is non-zero and not later than the current Unix time in seconds.
+func isDeletedOrExpired(meta byte, expiresAt uint64) bool {
+	if meta&bitDelete > 0 {
+		return true
+	}
+	if expiresAt == 0 {
+		return false
+	}
+	return expiresAt <= uint64(time.Now().Unix())
+}
+
+// parseItem is a complex function because it needs to handle both forward and reverse iteration
+// implementation. We store keys such that their versions are sorted in descending order.
+// This makes forward iteration efficient, but reverse iteration complicated. This tradeoff is
+// better because forward iteration is more common than reverse.
+//
+// This function advances the iterator.
+//
+// It returns true when an item was produced (stored as the current item if
+// there is none, otherwise queued on it.data) and false when the entry under
+// the cursor was skipped.
+func (it *Iterator) parseItem() bool {
+	mi := it.iitr
+	key := mi.Key()
+
+	setItem := func(item *Item) {
+		if it.item == nil {
+			it.item = item
+		} else {
+			it.data.push(item)
+		}
+	}
+
+	// Skip badger keys.
+	if !it.opt.InternalAccess && bytes.HasPrefix(key, badgerPrefix) {
+		mi.Next()
+		return false
+	}
+
+	// Skip any versions which are beyond the readTs.
+	version := y.ParseTs(key)
+	if version > it.readTs {
+		mi.Next()
+		return false
+	}
+
+	if it.opt.AllVersions {
+		// Return deleted or expired values also, otherwise user can't figure out
+		// whether the key was deleted.
+		item := it.newItem()
+		it.fill(item)
+		setItem(item)
+		mi.Next()
+		return true
+	}
+
+	// If iterating in forward direction, then just checking the last key against current key would
+	// be sufficient.
+	if !it.opt.Reverse {
+		if y.SameKey(it.lastKey, key) {
+			mi.Next()
+			return false
+		}
+		// Only track in forward direction.
+		// We should update lastKey as soon as we find a different key in our snapshot.
+		// Consider keys: a 5, b 7 (del), b 5. When iterating, lastKey = a.
+		// Then we see b 7, which is deleted. If we don't store lastKey = b, we'll then return b 5,
+		// which is wrong. Therefore, update lastKey here.
+		it.lastKey = y.SafeCopy(it.lastKey, mi.Key())
+	}
+
+FILL:
+	// If deleted, advance and return.
+	vs := mi.Value()
+	if isDeletedOrExpired(vs.Meta, vs.ExpiresAt) {
+		mi.Next()
+		return false
+	}
+
+	item := it.newItem()
+	it.fill(item)
+	// fill item based on current cursor position. All Next calls have returned, so reaching here
+	// means no Next was called.
+
+	mi.Next()                           // Advance but no fill item yet.
+	if !it.opt.Reverse || !mi.Valid() { // Forward direction, or invalid.
+		setItem(item)
+		return true
+	}
+
+	// Reverse direction.
+	nextTs := y.ParseTs(mi.Key())
+	mik := y.ParseKey(mi.Key())
+	if nextTs <= it.readTs && bytes.Equal(mik, item.key) {
+		// This is a valid potential candidate.
+		goto FILL
+	}
+	// Ignore the next candidate. Return the current one.
+	setItem(item)
+	return true
+}
+
+// fill copies the entry under the merge iterator's cursor into item: meta,
+// user meta, expiry, version, key and value pointer. When PrefetchValues is
+// set it also starts a goroutine that resolves the value; item.wg tracks it.
+func (it *Iterator) fill(item *Item) {
+	vs := it.iitr.Value()
+	item.meta = vs.Meta
+	item.userMeta = vs.UserMeta
+	item.expiresAt = vs.ExpiresAt
+
+	item.version = y.ParseTs(it.iitr.Key())
+	item.key = y.SafeCopy(item.key, y.ParseKey(it.iitr.Key()))
+
+	item.vptr = y.SafeCopy(item.vptr, vs.Value)
+	item.val = nil
+	if it.opt.PrefetchValues {
+		item.wg.Add(1)
+		go func() {
+			// FIXME we are not handling errors here.
+			item.prefetchValue()
+			item.wg.Done()
+		}()
+	}
+}
+
+// prefetch clears the current item and parses entries until prefetchSize
+// items have been produced or the merge iterator is exhausted. The size is 2
+// unless PrefetchValues is set and PrefetchSize is greater than 1.
+func (it *Iterator) prefetch() {
+	prefetchSize := 2
+	if it.opt.PrefetchValues && it.opt.PrefetchSize > 1 {
+		prefetchSize = it.opt.PrefetchSize
+	}
+
+	i := it.iitr
+	var count int
+	it.item = nil
+	for i.Valid() {
+		if !it.parseItem() {
+			continue
+		}
+		count++
+		if count == prefetchSize {
+			break
+		}
+	}
+}
+
+// Seek would seek to the provided key if present. If absent, it would seek to the next
+// smallest key greater than the provided key if iterating in the forward direction.
+// Behavior would be reversed if iterating backwards.
+//
+// An empty key falls back to opt.Prefix; if that is empty too, the iterator
+// is rewound to its first position.
+func (it *Iterator) Seek(key []byte) {
+	// Move every queued item to the waste list, waiting for its prefetch first.
+	for i := it.data.pop(); i != nil; i = it.data.pop() {
+		i.wg.Wait()
+		it.waste.push(i)
+	}
+
+	it.lastKey = it.lastKey[:0]
+	if len(key) == 0 {
+		key = it.opt.Prefix
+	}
+	if len(key) == 0 {
+		it.iitr.Rewind()
+		it.prefetch()
+		return
+	}
+
+	// The seek key is stamped with the transaction's read timestamp when
+	// iterating forward and with timestamp 0 when iterating in reverse.
+	if !it.opt.Reverse {
+		key = y.KeyWithTs(key, it.txn.readTs)
+	} else {
+		key = y.KeyWithTs(key, 0)
+	}
+	it.iitr.Seek(key)
+	it.prefetch()
+}
+
+// Rewind would rewind the iterator cursor all the way to zero-th position, which would be the
+// smallest key if iterating forward, and largest if iterating backward. It does not keep track of
+// whether the cursor started with a Seek().
+func (it *Iterator) Rewind() {
+	it.Seek(nil)
+}
diff --git a/vendor/github.com/dgraph-io/badger/level_handler.go b/vendor/github.com/dgraph-io/badger/level_handler.go
new file mode 100644
index 0000000000..147967fb8c
--- /dev/null
+++ b/vendor/github.com/dgraph-io/badger/level_handler.go
@@ -0,0 +1,299 @@
+/*
+ * Copyright 2017 Dgraph Labs, Inc. and Contributors
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *    http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package badger
+
+import (
+	"fmt"
+	"sort"
+	"sync"
+
+	"github.com/dgraph-io/badger/table"
+	"github.com/dgraph-io/badger/y"
+	"github.com/pkg/errors"
+)
+
+type levelHandler struct {
+	// Guards tables, totalSize.
+	sync.RWMutex
+
+	// For level >= 1, tables are sorted by key ranges, which do not overlap.
+	// For level 0, tables are sorted by time.
+	// For level 0, the newest tables are at the back. Compact the oldest one first, which is at the front.
+	tables    []*table.Table
+	totalSize int64
+
+	// The following are initialized once and const.
+	level        int
+	strLevel     string
+	maxTotalSize int64
+	db           *DB
+}
+
+func (s *levelHandler) getTotalSize() int64 { // Reads totalSize under the read lock.
+	s.RLock()
+	defer s.RUnlock()
+	return s.totalSize
+}
+
+// initTables replaces s.tables with given tables and recomputes totalSize. This is done during loading.
+func (s *levelHandler) initTables(tables []*table.Table) {
+	s.Lock()
+	defer s.Unlock()
+
+	s.tables = tables
+	s.totalSize = 0
+	for _, t := range tables {
+		s.totalSize += t.Size()
+	}
+
+	if s.level == 0 {
+		// Key range will overlap. Just sort by fileID in ascending order
+		// because newer tables are at the end of level 0.
+		sort.Slice(s.tables, func(i, j int) bool {
+			return s.tables[i].ID() < s.tables[j].ID()
+		})
+	} else {
+		// Sort tables by keys.
+		sort.Slice(s.tables, func(i, j int) bool {
+			return y.CompareKeys(s.tables[i].Smallest(), s.tables[j].Smallest()) < 0
+		})
+	}
+}
+
+// deleteTables removes the given tables (matched by ID) from this level, then decrements their refs.
+func (s *levelHandler) deleteTables(toDel []*table.Table) error {
+	s.Lock() // s.Unlock() below
+
+	toDelMap := make(map[uint64]struct{})
+	for _, t := range toDel {
+		toDelMap[t.ID()] = struct{}{}
+	}
+
+	// Make a copy as iterators might be keeping a slice of tables.
+	var newTables []*table.Table
+	for _, t := range s.tables {
+		_, found := toDelMap[t.ID()]
+		if !found {
+			newTables = append(newTables, t)
+			continue
+		}
+		s.totalSize -= t.Size()
+	}
+	s.tables = newTables
+
+	s.Unlock() // Unlock s _before_ we DecrRef our tables, which can be slow.
+
+	return decrRefs(toDel)
+}
+
+// replaceTables removes toDel (matched by ID) from this level, adds toAdd (taking a ref on each), re-sorts by
+// smallest key, and finally decrements the refs of toDel. Call it _after_ writing the update to the manifest.
+func (s *levelHandler) replaceTables(toDel, toAdd []*table.Table) error {
+	// Need to re-search the range of tables in this level to be replaced as other goroutines might
+	// be changing it as well. (They can't touch our tables, but if they add/remove other tables,
+	// the indices get shifted around.)
+	s.Lock() // We s.Unlock() below.
+
+	toDelMap := make(map[uint64]struct{})
+	for _, t := range toDel {
+		toDelMap[t.ID()] = struct{}{}
+	}
+	var newTables []*table.Table
+	for _, t := range s.tables {
+		_, found := toDelMap[t.ID()]
+		if !found {
+			newTables = append(newTables, t)
+			continue
+		}
+		s.totalSize -= t.Size()
+	}
+
+	// Increase totalSize first.
+	for _, t := range toAdd {
+		s.totalSize += t.Size()
+		t.IncrRef()
+		newTables = append(newTables, t)
+	}
+
+	// Assign tables.
+	s.tables = newTables
+	sort.Slice(s.tables, func(i, j int) bool {
+		return y.CompareKeys(s.tables[i].Smallest(), s.tables[j].Smallest()) < 0
+	})
+	s.Unlock() // s.Unlock before we DecrRef tables -- that can be slow.
+	return decrRefs(toDel)
+}
+
+func decrRefs(tables []*table.Table) error { // Calls DecrRef on each table, stopping at the first error.
+	for _, table := range tables {
+		if err := table.DecrRef(); err != nil {
+			return err
+		}
+	}
+	return nil
+}
+
+func newLevelHandler(db *DB, level int) *levelHandler { // maxTotalSize is left zero here; newLevelsController assigns it.
+	return &levelHandler{
+		level:    level,
+		strLevel: fmt.Sprintf("l%d", level),
+		db:       db,
+	}
+}
+
+// tryAddLevel0Table returns true if ok and no stalling. It returns false, without adding t, once NumLevelZeroTablesStall tables are present.
+func (s *levelHandler) tryAddLevel0Table(t *table.Table) bool {
+	y.AssertTrue(s.level == 0)
+	// Need lock as we may be deleting the first table during a level 0 compaction.
+	s.Lock()
+	defer s.Unlock()
+	if len(s.tables) >= s.db.opt.NumLevelZeroTablesStall {
+		return false
+	}
+
+	s.tables = append(s.tables, t)
+	t.IncrRef()
+	s.totalSize += t.Size()
+
+	return true
+}
+
+func (s *levelHandler) numTables() int { // Reads len(s.tables) under the read lock.
+	s.RLock()
+	defer s.RUnlock()
+	return len(s.tables)
+}
+
+func (s *levelHandler) close() error { // Closes every table and reports the first close error, wrapped.
+	s.RLock()
+	defer s.RUnlock()
+	var err error
+	for _, t := range s.tables {
+		if closeErr := t.Close(); closeErr != nil && err == nil {
+			err = closeErr
+		}
+	}
+	return errors.Wrap(err, "levelHandler.close")
+}
+
+// getTableForKey acquires a read-lock to access s.tables. It returns the candidate tables (each with a ref taken) and a func that releases those refs.
+func (s *levelHandler) getTableForKey(key []byte) ([]*table.Table, func() error) {
+	s.RLock()
+	defer s.RUnlock()
+
+	if s.level == 0 {
+		// For level 0, we need to check every table. Remember to make a copy as s.tables may change
+		// once we exit this function, and we don't want to lock s.tables while seeking in tables.
+		// CAUTION: Reverse the tables.
+		out := make([]*table.Table, 0, len(s.tables))
+		for i := len(s.tables) - 1; i >= 0; i-- {
+			out = append(out, s.tables[i])
+			s.tables[i].IncrRef()
+		}
+		return out, func() error {
+			for _, t := range out {
+				if err := t.DecrRef(); err != nil {
+					return err
+				}
+			}
+			return nil
+		}
+	}
+	// For level >= 1, we can do a binary search as key range does not overlap.
+	idx := sort.Search(len(s.tables), func(i int) bool {
+		return y.CompareKeys(s.tables[i].Biggest(), key) >= 0
+	})
+	if idx >= len(s.tables) {
+		// Given key is strictly > than every element we have.
+		return nil, func() error { return nil }
+	}
+	tbl := s.tables[idx]
+	tbl.IncrRef()
+	return []*table.Table{tbl}, tbl.DecrRef
+}
+
+// get returns the entry with the highest version found for key across this level's candidate tables, or a zero y.ValueStruct if none matches.
+func (s *levelHandler) get(key []byte) (y.ValueStruct, error) {
+	tables, decr := s.getTableForKey(key)
+	keyNoTs := y.ParseKey(key)
+
+	var maxVs y.ValueStruct
+	for _, th := range tables {
+		if th.DoesNotHave(keyNoTs) {
+			y.NumLSMBloomHits.Add(s.strLevel, 1)
+			continue
+		}
+
+		it := th.NewIterator(false)
+		defer it.Close() // Deferred inside the loop: every iterator stays open until get returns.
+
+		y.NumLSMGets.Add(s.strLevel, 1)
+		it.Seek(key)
+		if !it.Valid() {
+			continue
+		}
+		if y.SameKey(key, it.Key()) {
+			if version := y.ParseTs(it.Key()); maxVs.Version < version {
+				maxVs = it.Value()
+				maxVs.Version = version
+			}
+		}
+	}
+	return maxVs, decr()
+}
+
+// appendIterators appends iterators to an array of iterators, for merging.
+// Note: This obtains references for the table handlers. Remember to close these iterators.
+func (s *levelHandler) appendIterators(iters []y.Iterator, opt *IteratorOptions) []y.Iterator {
+	s.RLock()
+	defer s.RUnlock()
+
+	tables := make([]*table.Table, 0, len(s.tables))
+	for _, t := range s.tables {
+		if opt.pickTable(t) {
+			tables = append(tables, t)
+		}
+	}
+	if len(tables) == 0 {
+		return iters
+	}
+
+	if s.level == 0 {
+		// Remember to add in reverse order!
+		// The newer table at the end of s.tables should be added first as it takes precedence.
+		return appendIteratorsReversed(iters, tables, opt.Reverse)
+	}
+	return append(iters, table.NewConcatIterator(tables, opt.Reverse))
+}
+
+type levelHandlerRLocked struct{}
+
+// overlappingTables returns the tables that intersect with key range. Returns a half-interval.
+// This function should already have acquired a read lock, and this is so important the caller must
+// pass an empty parameter declaring such. An empty kr.left or kr.right yields (0, 0).
+func (s *levelHandler) overlappingTables(_ levelHandlerRLocked, kr keyRange) (int, int) {
+	if len(kr.left) == 0 || len(kr.right) == 0 {
+		return 0, 0
+	}
+	left := sort.Search(len(s.tables), func(i int) bool {
+		return y.CompareKeys(kr.left, s.tables[i].Biggest()) <= 0
+	})
+	right := sort.Search(len(s.tables), func(i int) bool {
+		return y.CompareKeys(kr.right, s.tables[i].Smallest()) < 0
+	})
+	return left, right
+}
diff --git a/vendor/github.com/dgraph-io/badger/levels.go b/vendor/github.com/dgraph-io/badger/levels.go
new file mode 100644
index 0000000000..a4efd6624f
--- /dev/null
+++ b/vendor/github.com/dgraph-io/badger/levels.go
@@ -0,0 +1,989 @@
+/*
+ * Copyright 2017 Dgraph Labs, Inc. and Contributors
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *    http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package badger
+
+import (
+	"bytes"
+	"fmt"
+	"math"
+	"math/rand"
+	"os"
+	"sort"
+	"strings"
+	"sync"
+	"sync/atomic"
+	"time"
+
+	"golang.org/x/net/trace"
+
+	"github.com/dgraph-io/badger/pb"
+	"github.com/dgraph-io/badger/table"
+	"github.com/dgraph-io/badger/y"
+	"github.com/pkg/errors"
+)
+
+type levelsController struct {
+	nextFileID uint64 // Atomic
+	elog       trace.EventLog
+
+	// The following are initialized once and const.
+	levels []*levelHandler
+	kv     *DB
+
+	cstatus compactStatus
+}
+
+var (
+	// This is for getting timings between stalls.
+	lastUnstalled time.Time
+)
+
+// revertToManifest checks that all necessary table files exist and removes all table files not
+// referenced by the manifest. idMap is a set of table file id's that were read from the directory
+// listing.
+func revertToManifest(kv *DB, mf *Manifest, idMap map[uint64]struct{}) error {
+	// 1. Check all files in manifest exist.
+	for id := range mf.Tables {
+		if _, ok := idMap[id]; !ok {
+			return fmt.Errorf("file does not exist for table %d", id)
+		}
+	}
+
+	// 2. Delete files that shouldn't exist.
+	for id := range idMap {
+		if _, ok := mf.Tables[id]; !ok {
+			kv.elog.Printf("Table file %d not referenced in MANIFEST\n", id)
+			filename := table.NewFilename(id, kv.opt.Dir)
+			if err := os.Remove(filename); err != nil {
+				return y.Wrapf(err, "While removing table %d", id)
+			}
+		}
+	}
+
+	return nil
+}
+
+func newLevelsController(db *DB, mf *Manifest) (*levelsController, error) { // Builds the per-level handlers and opens every table listed in mf.
+	y.AssertTrue(db.opt.NumLevelZeroTablesStall > db.opt.NumLevelZeroTables)
+	s := &levelsController{
+		kv:     db,
+		elog:   db.elog,
+		levels: make([]*levelHandler, db.opt.MaxLevels),
+	}
+	s.cstatus.levels = make([]*levelCompactStatus, db.opt.MaxLevels)
+
+	for i := 0; i < db.opt.MaxLevels; i++ {
+		s.levels[i] = newLevelHandler(db, i)
+		if i == 0 {
+			// Do nothing.
+		} else if i == 1 {
+			// Level 1 probably shouldn't be too much bigger than level 0.
+			s.levels[i].maxTotalSize = db.opt.LevelOneSize
+		} else {
+			s.levels[i].maxTotalSize = s.levels[i-1].maxTotalSize * int64(db.opt.LevelSizeMultiplier)
+		}
+		s.cstatus.levels[i] = new(levelCompactStatus)
+	}
+
+	// Compare manifest against directory, check for existent/non-existent files, and remove.
+	if err := revertToManifest(db, mf, getIDMap(db.opt.Dir)); err != nil {
+		return nil, err
+	}
+
+	// Some files may be deleted. Let's reload.
+	var flags uint32 = y.Sync
+	if db.opt.ReadOnly {
+		flags |= y.ReadOnly
+	}
+
+	var mu sync.Mutex // Guards appends to tables from the loader goroutines below.
+	tables := make([][]*table.Table, db.opt.MaxLevels)
+	var maxFileID uint64
+
+	// We found that using 3 goroutines allows disk throughput to be utilized to its max.
+	// Disk utilization is the main thing we should focus on, while trying to read the data. That's
+	// the one factor that remains constant between HDD and SSD.
+	throttle := y.NewThrottle(3)
+
+	start := time.Now()
+	var numOpened int32
+	tick := time.NewTicker(3 * time.Second)
+	defer tick.Stop()
+
+	for fileID, tf := range mf.Tables {
+		fname := table.NewFilename(fileID, db.opt.Dir)
+		select {
+		case <-tick.C:
+			db.opt.Infof("%d tables out of %d opened in %s\n", atomic.LoadInt32(&numOpened),
+				len(mf.Tables), time.Since(start).Round(time.Millisecond))
+		default:
+		}
+		if err := throttle.Do(); err != nil {
+			closeAllTables(tables)
+			return nil, err
+		}
+		if fileID > maxFileID {
+			maxFileID = fileID
+		}
+		go func(fname string, tf TableManifest) {
+			var rerr error
+			defer func() {
+				throttle.Done(rerr)
+				atomic.AddInt32(&numOpened, 1)
+			}()
+			fd, err := y.OpenExistingFile(fname, flags)
+			if err != nil {
+				rerr = errors.Wrapf(err, "Opening file: %q", fname)
+				return
+			}
+
+			t, err := table.OpenTable(fd, db.opt.TableLoadingMode, tf.Checksum)
+			if err != nil {
+				if strings.HasPrefix(err.Error(), "CHECKSUM_MISMATCH:") {
+					db.opt.Errorf(err.Error())
+					db.opt.Errorf("Ignoring table %s", fd.Name())
+					// Do not set rerr. We will continue without this table.
+				} else {
+					rerr = errors.Wrapf(err, "Opening table: %q", fname)
+				}
+				return
+			}
+
+			mu.Lock()
+			tables[tf.Level] = append(tables[tf.Level], t)
+			mu.Unlock()
+		}(fname, tf)
+	}
+	if err := throttle.Finish(); err != nil {
+		closeAllTables(tables)
+		return nil, err
+	}
+	db.opt.Infof("All %d tables opened in %s\n", atomic.LoadInt32(&numOpened),
+		time.Since(start).Round(time.Millisecond))
+	s.nextFileID = maxFileID + 1
+	for i, tbls := range tables {
+		s.levels[i].initTables(tbls)
+	}
+
+	// Make sure key ranges do not overlap etc.
+	if err := s.validate(); err != nil {
+		_ = s.cleanupLevels()
+		return nil, errors.Wrap(err, "Level validation")
+	}
+
+	// Sync directory (because we have at least removed some files, or previously created the
+	// manifest file).
+	if err := syncDir(db.opt.Dir); err != nil {
+		_ = s.close()
+		return nil, err
+	}
+
+	return s, nil
+}
+
+// Closes the tables, for cleanup in newLevelsController. (We Close() instead of using DecrRef()
+// because that would delete the underlying files.) We ignore errors, which is OK because tables
+// are read-only.
+func closeAllTables(tables [][]*table.Table) {
+	for _, tableSlice := range tables {
+		for _, table := range tableSlice {
+			_ = table.Close()
+		}
+	}
+}
+
+func (s *levelsController) cleanupLevels() error { // Closes every level and returns the first error encountered.
+	var firstErr error
+	for _, l := range s.levels {
+		if err := l.close(); err != nil && firstErr == nil {
+			firstErr = err
+		}
+	}
+	return firstErr
+}
+
+// dropTree picks all tables from all levels, creates a manifest changeset,
+// applies it, and then decrements the refs of these tables, which would result
+// in their deletion. It returns the number of tables dropped.
+func (s *levelsController) dropTree() (int, error) {
+	// First pick all tables, so we can create a manifest changelog.
+	var all []*table.Table
+	for _, l := range s.levels {
+		l.RLock()
+		all = append(all, l.tables...)
+		l.RUnlock()
+	}
+	if len(all) == 0 {
+		return 0, nil
+	}
+
+	// Generate the manifest changes.
+	changes := []*pb.ManifestChange{}
+	for _, table := range all {
+		changes = append(changes, newDeleteChange(table.ID()))
+	}
+	changeSet := pb.ManifestChangeSet{Changes: changes}
+	if err := s.kv.manifest.addChanges(changeSet.Changes); err != nil {
+		return 0, err
+	}
+
+	// Now that manifest has been successfully written, we can delete the tables.
+	for _, l := range s.levels {
+		l.Lock()
+		l.totalSize = 0
+		l.tables = l.tables[:0]
+		l.Unlock()
+	}
+	for _, table := range all {
+		if err := table.DecrRef(); err != nil {
+			return 0, err
+		}
+	}
+	return len(all), nil
+}
+
+// dropPrefix runs a L0->L1 compaction, and then runs same level compaction on the rest of the
+// levels. For L0->L1 compaction, it runs compactions normally, but skips over all the keys with the
+// provided prefix. For Li->Li compactions, it picks up the tables which would have the prefix. The
+// tables who only have keys with this prefix are quickly dropped. The ones which have other keys
+// are run through MergeIterator and compacted to create new tables. All the mechanisms of
+// compactions apply, i.e. level sizes and MANIFEST are updated as in the normal flow.
+func (s *levelsController) dropPrefix(prefix []byte) error {
+	opt := s.kv.opt
+	for _, l := range s.levels {
+		l.RLock()
+		if l.level == 0 {
+			size := len(l.tables)
+			l.RUnlock()
+
+			if size > 0 {
+				cp := compactionPriority{
+					level: 0,
+					score: 1.74,
+					// A unique number greater than 1.0 does two things. Helps identify this
+					// function in logs, and forces a compaction.
+					dropPrefix: prefix,
+				}
+				if err := s.doCompact(cp); err != nil {
+					opt.Warningf("While compacting level 0: %v", err)
+					return nil // NOTE(review): the error is only logged, so the caller sees success and the other levels are skipped; the runCompactDef path below returns err. Confirm this is intended.
+				}
+			}
+			continue
+		}
+
+		var tables []*table.Table
+		for _, table := range l.tables {
+			var absent bool
+			switch { // The empty cases mean "may contain the prefix": keep absent == false.
+			case bytes.HasPrefix(table.Smallest(), prefix):
+			case bytes.HasPrefix(table.Biggest(), prefix):
+			case bytes.Compare(prefix, table.Smallest()) > 0 &&
+				bytes.Compare(prefix, table.Biggest()) < 0:
+			default:
+				absent = true
+			}
+			if !absent {
+				tables = append(tables, table)
+			}
+		}
+		l.RUnlock()
+		if len(tables) == 0 {
+			continue
+		}
+
+		cd := compactDef{
+			elog:       trace.New(fmt.Sprintf("Badger.L%d", l.level), "Compact"),
+			thisLevel:  l,
+			nextLevel:  l,
+			top:        []*table.Table{},
+			bot:        tables,
+			dropPrefix: prefix,
+		}
+		if err := s.runCompactDef(l.level, cd); err != nil {
+			opt.Warningf("While running compact def: %+v. Error: %v", cd, err)
+			return err
+		}
+	}
+	return nil
+}
+
+func (s *levelsController) startCompact(lc *y.Closer) { // Starts NumCompactors background workers, each running runWorker.
+	n := s.kv.opt.NumCompactors
+	lc.AddRunning(n - 1) // n workers call lc.Done(); presumably lc was created with one running already. TODO confirm against the caller.
+	for i := 0; i < n; i++ {
+		go s.runWorker(lc)
+	}
+}
+
+func (s *levelsController) runWorker(lc *y.Closer) { // After a random delay of up to 1s, tries a compaction every second until lc is closed.
+	defer lc.Done()
+
+	randomDelay := time.NewTimer(time.Duration(rand.Int31n(1000)) * time.Millisecond)
+	select {
+	case <-randomDelay.C:
+	case <-lc.HasBeenClosed():
+		randomDelay.Stop()
+		return
+	}
+
+	ticker := time.NewTicker(time.Second)
+	defer ticker.Stop()
+
+	for {
+		select {
+		// Can add a done channel or other stuff.
+		case <-ticker.C:
+			prios := s.pickCompactLevels()
+			for _, p := range prios {
+				if err := s.doCompact(p); err == nil {
+					break
+				} else if err == errFillTables {
+					// pass: nothing could be picked for this level, so try the next priority.
+				} else {
+					s.kv.opt.Warningf("While running doCompact: %v\n", err)
+				}
+			}
+		case <-lc.HasBeenClosed():
+			return
+		}
+	}
+}
+
+// Returns true if level zero may be compacted, without accounting for compactions that already
+// might be happening.
+func (s *levelsController) isLevel0Compactable() bool {
+	return s.levels[0].numTables() >= s.kv.opt.NumLevelZeroTables
+}
+
+// Returns true if the non-zero level may be compacted. delSize provides the size of the tables
+// which are currently being compacted so that we treat them as already having started being
+// compacted (because they have been, yet their size is already counted in getTotalSize).
+func (l *levelHandler) isCompactable(delSize int64) bool {
+	return l.getTotalSize()-delSize >= l.maxTotalSize
+}
+
+type compactionPriority struct {
+	level      int
+	score      float64
+	dropPrefix []byte
+}
+
+// pickCompactLevels determines which levels to compact, returning them sorted by descending score.
+// Based on: https://github.com/facebook/rocksdb/wiki/Leveled-Compaction
+func (s *levelsController) pickCompactLevels() (prios []compactionPriority) {
+	// This function must use identical criteria for guaranteeing compaction's progress that
+	// addLevel0Table uses.
+
+	// cstatus is checked to see if level 0's tables are already being compacted
+	if !s.cstatus.overlapsWith(0, infRange) && s.isLevel0Compactable() {
+		pri := compactionPriority{
+			level: 0,
+			score: float64(s.levels[0].numTables()) / float64(s.kv.opt.NumLevelZeroTables),
+		}
+		prios = append(prios, pri)
+	}
+
+	for i, l := range s.levels[1:] { // i is offset by one: l is level i+1.
+		// Don't consider those tables that are already being compacted right now.
+		delSize := s.cstatus.delSize(i + 1)
+
+		if l.isCompactable(delSize) {
+			pri := compactionPriority{
+				level: i + 1,
+				score: float64(l.getTotalSize()-delSize) / float64(l.maxTotalSize),
+			}
+			prios = append(prios, pri)
+		}
+	}
+	sort.Slice(prios, func(i, j int) bool {
+		return prios[i].score > prios[j].score
+	})
+	return prios
+}
+
+// compactBuildTables merge topTables and botTables to form a list of new tables.
+func (s *levelsController) compactBuildTables(
+	lev int, cd compactDef) ([]*table.Table, func() error, error) {
+	topTables := cd.top
+	botTables := cd.bot
+
+	var hasOverlap bool // Set when any level below lev has tables overlapping the key range of cd.top.
+	{
+		kr := getKeyRange(cd.top)
+		for i, lh := range s.levels {
+			if i <= lev { // Skip upper levels.
+				continue
+			}
+			lh.RLock()
+			left, right := lh.overlappingTables(levelHandlerRLocked{}, kr)
+			lh.RUnlock()
+			if right-left > 0 {
+				hasOverlap = true
+				break
+			}
+		}
+	}
+
+	// Try to collect stats so that we can inform value log about GC. That would help us find which
+	// value log file should be GCed.
+	discardStats := make(map[uint32]int64)
+	updateStats := func(vs y.ValueStruct) {
+		if vs.Meta&bitValuePointer > 0 {
+			var vp valuePointer
+			vp.Decode(vs.Value)
+			discardStats[vp.Fid] += int64(vp.Len)
+		}
+	}
+
+	// Create iterators across all the tables involved first.
+	var iters []y.Iterator
+	if lev == 0 {
+		iters = appendIteratorsReversed(iters, topTables, false)
+	} else if len(topTables) > 0 {
+		y.AssertTrue(len(topTables) == 1)
+		iters = []y.Iterator{topTables[0].NewIterator(false)}
+	}
+
+	// Next level has level>=1 and we can use ConcatIterator as key ranges do not overlap.
+	var valid []*table.Table
+	for _, table := range botTables {
+		if len(cd.dropPrefix) > 0 &&
+			bytes.HasPrefix(table.Smallest(), cd.dropPrefix) &&
+			bytes.HasPrefix(table.Biggest(), cd.dropPrefix) {
+			// All the keys in this table have the dropPrefix. So, this table does not need to be
+			// in the iterator and can be dropped immediately.
+			continue
+		}
+		valid = append(valid, table)
+	}
+	iters = append(iters, table.NewConcatIterator(valid, false))
+	it := y.NewMergeIterator(iters, false)
+	defer it.Close() // Important to close the iterator to do ref counting.
+
+	it.Rewind()
+
+	// Pick a discard ts, so we can discard versions below this ts. We should
+	// never discard any versions starting from above this timestamp, because
+	// that would affect the snapshot view guarantee provided by transactions.
+	discardTs := s.kv.orc.discardAtOrBelow()
+
+	// Start generating new tables.
+	type newTableResult struct {
+		table *table.Table
+		err   error
+	}
+	resultCh := make(chan newTableResult)
+	var numBuilds, numVersions int
+	var lastKey, skipKey []byte
+	for it.Valid() {
+		timeStart := time.Now()
+		builder := table.NewTableBuilder()
+		var numKeys, numSkips uint64
+		for ; it.Valid(); it.Next() {
+			// See if we need to skip the prefix.
+			if len(cd.dropPrefix) > 0 && bytes.HasPrefix(it.Key(), cd.dropPrefix) {
+				numSkips++
+				updateStats(it.Value())
+				continue
+			}
+
+			// See if we need to skip this key.
+			if len(skipKey) > 0 {
+				if y.SameKey(it.Key(), skipKey) {
+					numSkips++
+					updateStats(it.Value())
+					continue
+				} else {
+					skipKey = skipKey[:0]
+				}
+			}
+
+			if !y.SameKey(it.Key(), lastKey) {
+				if builder.ReachedCapacity(s.kv.opt.MaxTableSize) {
+					// Only break if we are on a different key, and have reached capacity. We want
+					// to ensure that all versions of the key are stored in the same sstable, and
+					// not divided across multiple tables at the same level.
+					break
+				}
+				lastKey = y.SafeCopy(lastKey, it.Key())
+				numVersions = 0
+			}
+
+			vs := it.Value()
+			version := y.ParseTs(it.Key())
+			// Do not discard entries inserted by merge operator. These entries will be
+			// discarded once they're merged
+			if version <= discardTs && vs.Meta&bitMergeEntry == 0 {
+				// Keep track of the number of versions encountered for this key. Only consider the
+				// versions which are below the minReadTs, otherwise, we might end up discarding the
+				// only valid version for a running transaction.
+				numVersions++
+				lastValidVersion := vs.Meta&bitDiscardEarlierVersions > 0
+				if isDeletedOrExpired(vs.Meta, vs.ExpiresAt) ||
+					numVersions > s.kv.opt.NumVersionsToKeep ||
+					lastValidVersion {
+					// If this version of the key is deleted or expired, skip all the rest of the
+					// versions. Ensure that we're only removing versions below readTs.
+					skipKey = y.SafeCopy(skipKey, it.Key())
+
+					if lastValidVersion {
+						// Add this key. We have set skipKey, so the following key versions
+						// would be skipped.
+					} else if hasOverlap {
+						// If this key range has overlap with lower levels, then keep the deletion
+						// marker with the latest version, discarding the rest. We have set skipKey,
+						// so the following key versions would be skipped.
+					} else {
+						// If no overlap, we can skip all the versions, by continuing here.
+						numSkips++
+						updateStats(vs)
+						continue // Skip adding this key.
+					}
+				}
+			}
+			numKeys++
+			y.Check(builder.Add(it.Key(), it.Value()))
+		}
+		// It was true that it.Valid() at least once in the loop above, which means we
+		// called Add() at least once, and builder is not Empty().
+		s.kv.opt.Debugf("LOG Compact. Added %d keys. Skipped %d keys. Iteration took: %v",
+			numKeys, numSkips, time.Since(timeStart))
+		if !builder.Empty() {
+			numBuilds++
+			fileID := s.reserveFileID()
+			go func(builder *table.Builder) {
+				defer builder.Close()
+
+				fd, err := y.CreateSyncedFile(table.NewFilename(fileID, s.kv.opt.Dir), true)
+				if err != nil {
+					resultCh <- newTableResult{nil, errors.Wrapf(err, "While opening new table: %d", fileID)}
+					return
+				}
+
+				if _, err := fd.Write(builder.Finish()); err != nil { // NOTE(review): fd does not appear to be closed on this error path; confirm.
+					resultCh <- newTableResult{nil, errors.Wrapf(err, "Unable to write to file: %d", fileID)}
+					return
+				}
+
+				tbl, err := table.OpenTable(fd, s.kv.opt.TableLoadingMode, nil)
+				// decrRef is added below.
+				resultCh <- newTableResult{tbl, errors.Wrapf(err, "Unable to open table: %q", fd.Name())}
+			}(builder)
+		}
+	}
+
+	newTables := make([]*table.Table, 0, 20)
+	// Wait for all table builders to finish.
+	var firstErr error
+	for x := 0; x < numBuilds; x++ {
+		res := <-resultCh
+		newTables = append(newTables, res.table)
+		if firstErr == nil {
+			firstErr = res.err
+		}
+	}
+
+	if firstErr == nil {
+		// Ensure created files' directory entries are visible. We don't mind the extra latency
+		// from not doing this ASAP after all file creation has finished because this is a
+		// background operation.
+		firstErr = syncDir(s.kv.opt.Dir)
+	}
+
+	if firstErr != nil {
+		// An error happened. Delete all the newly created table files (by calling DecrRef
+		// -- we're the only holders of a ref).
+		for j := 0; j < numBuilds; j++ {
+			if newTables[j] != nil {
+				_ = newTables[j].DecrRef()
+			}
+		}
+		errorReturn := errors.Wrapf(firstErr, "While running compaction for: %+v", cd)
+		return nil, nil, errorReturn
+	}
+
+	sort.Slice(newTables, func(i, j int) bool {
+		return y.CompareKeys(newTables[i].Biggest(), newTables[j].Biggest()) < 0
+	})
+	if err := s.kv.vlog.updateDiscardStats(discardStats); err != nil {
+		return nil, nil, errors.Wrap(err, "failed to update discard stats")
+	}
+	s.kv.opt.Debugf("Discard stats: %v", discardStats)
+	return newTables, func() error { return decrRefs(newTables) }, nil
+}
+
+func buildChangeSet(cd *compactDef, newTables []*table.Table) pb.ManifestChangeSet { // Create changes for newTables at nextLevel, delete changes for cd.top and cd.bot.
+	changes := []*pb.ManifestChange{}
+	for _, table := range newTables {
+		changes = append(changes,
+			newCreateChange(table.ID(), cd.nextLevel.level, table.Checksum))
+	}
+	for _, table := range cd.top {
+		changes = append(changes, newDeleteChange(table.ID()))
+	}
+	for _, table := range cd.bot {
+		changes = append(changes, newDeleteChange(table.ID()))
+	}
+	return pb.ManifestChangeSet{Changes: changes}
+}
+
+type compactDef struct {
+	elog trace.Trace
+
+	thisLevel *levelHandler
+	nextLevel *levelHandler
+
+	top []*table.Table
+	bot []*table.Table
+
+	thisRange keyRange
+	nextRange keyRange
+
+	thisSize int64
+
+	dropPrefix []byte
+}
+
+func (cd *compactDef) lockLevels() { // Read-locks thisLevel, then nextLevel.
+	cd.thisLevel.RLock()
+	cd.nextLevel.RLock()
+}
+
+func (cd *compactDef) unlockLevels() { // Releases the read locks in the reverse order of lockLevels.
+	cd.nextLevel.RUnlock()
+	cd.thisLevel.RUnlock()
+}
+
+func (s *levelsController) fillTablesL0(cd *compactDef) bool { // Picks every thisLevel table as top and the overlapping nextLevel tables as bot.
+	cd.lockLevels()
+	defer cd.unlockLevels()
+
+	cd.top = make([]*table.Table, len(cd.thisLevel.tables))
+	copy(cd.top, cd.thisLevel.tables)
+	if len(cd.top) == 0 {
+		return false
+	}
+	cd.thisRange = infRange
+
+	kr := getKeyRange(cd.top)
+	left, right := cd.nextLevel.overlappingTables(levelHandlerRLocked{}, kr)
+	cd.bot = make([]*table.Table, right-left)
+	copy(cd.bot, cd.nextLevel.tables[left:right])
+
+	if len(cd.bot) == 0 {
+		cd.nextRange = kr
+	} else {
+		cd.nextRange = getKeyRange(cd.bot)
+	}
+
+	if !s.cstatus.compareAndAdd(thisAndNextLevelRLocked{}, *cd) {
+		return false
+	}
+
+	return true
+}
+
+func (s *levelsController) fillTables(cd *compactDef) bool { // Picks one thisLevel table (largest first) that can be registered for compaction, plus its overlapping nextLevel tables.
+	cd.lockLevels()
+	defer cd.unlockLevels()
+
+	tbls := make([]*table.Table, len(cd.thisLevel.tables))
+	copy(tbls, cd.thisLevel.tables)
+	if len(tbls) == 0 {
+		return false
+	}
+
+	// Find the biggest table, and compact that first.
+	// TODO: Try other table picking strategies.
+	sort.Slice(tbls, func(i, j int) bool {
+		return tbls[i].Size() > tbls[j].Size()
+	})
+
+	for _, t := range tbls {
+		cd.thisSize = t.Size()
+		cd.thisRange = keyRange{
+			// We pick all the versions of the smallest and the biggest key.
+			left: y.KeyWithTs(y.ParseKey(t.Smallest()), math.MaxUint64),
+			// Note that version zero would be the rightmost key.
+			right: y.KeyWithTs(y.ParseKey(t.Biggest()), 0),
+		}
+		if s.cstatus.overlapsWith(cd.thisLevel.level, cd.thisRange) {
+			continue
+		}
+		cd.top = []*table.Table{t}
+		left, right := cd.nextLevel.overlappingTables(levelHandlerRLocked{}, cd.thisRange)
+
+		cd.bot = make([]*table.Table, right-left)
+		copy(cd.bot, cd.nextLevel.tables[left:right])
+
+		if len(cd.bot) == 0 {
+			cd.bot = []*table.Table{}
+			cd.nextRange = cd.thisRange
+			if !s.cstatus.compareAndAdd(thisAndNextLevelRLocked{}, *cd) {
+				continue
+			}
+			return true
+		}
+		cd.nextRange = getKeyRange(cd.bot)
+
+		if s.cstatus.overlapsWith(cd.nextLevel.level, cd.nextRange) {
+			continue
+		}
+		if !s.cstatus.compareAndAdd(thisAndNextLevelRLocked{}, *cd) {
+			continue
+		}
+		return true
+	}
+	return false
+}
+
+func (s *levelsController) runCompactDef(l int, cd compactDef) (err error) { // Builds the new tables, records them in the manifest, then swaps them into the levels.
+	timeStart := time.Now()
+
+	thisLevel := cd.thisLevel
+	nextLevel := cd.nextLevel
+
+	// Table should never be moved directly between levels, always be rewritten to allow discarding
+	// invalid versions.
+
+	newTables, decr, err := s.compactBuildTables(l, cd)
+	if err != nil {
+		return err
+	}
+	defer func() {
+		// Only assign to err if it is still nil, so an earlier error is not overwritten.
+		if decErr := decr(); err == nil {
+			err = decErr
+		}
+	}()
+	changeSet := buildChangeSet(&cd, newTables)
+
+	// We write to the manifest _before_ we delete files (and after we created files)
+	if err := s.kv.manifest.addChanges(changeSet.Changes); err != nil {
+		return err
+	}
+
+	// See comment earlier in this function about the ordering of these ops, and the order in which
+	// we access levels when reading.
+	if err := nextLevel.replaceTables(cd.bot, newTables); err != nil {
+		return err
+	}
+	if err := thisLevel.deleteTables(cd.top); err != nil {
+		return err
+	}
+
+	// Note: For level 0, while doCompact is running, it is possible that new tables are added.
+	// However, the tables are added only to the end, so it is ok to just delete the first table.
+
+	s.kv.opt.Infof("LOG Compact %d->%d, del %d tables, add %d tables, took %v\n",
+		thisLevel.level, nextLevel.level, len(cd.top)+len(cd.bot),
+		len(newTables), time.Since(timeStart))
+	return nil
+}
+
+var errFillTables = errors.New("Unable to fill tables")
+
+// doCompact picks some table on level l and compacts it away to the next level. It returns errFillTables when nothing could be picked.
+func (s *levelsController) doCompact(p compactionPriority) error {
+	l := p.level
+	y.AssertTrue(l+1 < s.kv.opt.MaxLevels) // Sanity check.
+
+	cd := compactDef{
+		elog:       trace.New(fmt.Sprintf("Badger.L%d", l), "Compact"),
+		thisLevel:  s.levels[l],
+		nextLevel:  s.levels[l+1],
+		dropPrefix: p.dropPrefix,
+	}
+	cd.elog.SetMaxEvents(100)
+	defer cd.elog.Finish()
+
+	s.kv.opt.Infof("Got compaction priority: %+v", p)
+
+	// While picking tables to be compacted, both levels' tables are expected to
+	// remain unchanged.
+	if l == 0 {
+		if !s.fillTablesL0(&cd) {
+			return errFillTables
+		}
+
+	} else {
+		if !s.fillTables(&cd) {
+			return errFillTables
+		}
+	}
+	defer s.cstatus.delete(cd) // Remove the ranges from compaction status.
+
+	s.kv.opt.Infof("Running for level: %d\n", cd.thisLevel.level)
+	s.cstatus.toLog(cd.elog)
+	if err := s.runCompactDef(l, cd); err != nil {
+		// This compaction couldn't be done successfully.
+		s.kv.opt.Warningf("LOG Compact FAILED with error: %+v: %+v", err, cd)
+		return err
+	}
+
+	s.cstatus.toLog(cd.elog)
+	s.kv.opt.Infof("Compaction for level: %d DONE", cd.thisLevel.level)
+	return nil
+}
+
+func (s *levelsController) addLevel0Table(t *table.Table) error { // Records t in the manifest, then adds it to level 0, stalling while that level is full.
+	// We update the manifest _before_ the table becomes part of a levelHandler, because at that
+	// point it could get used in some compaction. This ensures the manifest file gets updated in
+	// the proper order. (That means this update happens before that of some compaction which
+	// deletes the table.)
+	err := s.kv.manifest.addChanges([]*pb.ManifestChange{
+		newCreateChange(t.ID(), 0, t.Checksum),
+	})
+	if err != nil {
+		return err
+	}
+
+	for !s.levels[0].tryAddLevel0Table(t) {
+		// Stall. Make sure all levels are healthy before we unstall.
+		var timeStart time.Time
+		{
+			s.elog.Printf("STALLED STALLED STALLED: %v\n", time.Since(lastUnstalled))
+			s.cstatus.RLock()
+			for i := 0; i < s.kv.opt.MaxLevels; i++ {
+				s.elog.Printf("level=%d. Status=%s Size=%d\n",
+					i, s.cstatus.levels[i].debug(), s.levels[i].getTotalSize())
+			}
+			s.cstatus.RUnlock()
+			timeStart = time.Now()
+		}
+		// Before we unstall, we need to make sure that level 0 and 1 are healthy. Otherwise, we
+		// will very quickly fill up level 0 again and if the compaction strategy favors level 0,
+		// then level 1 is going to be super full.
+		for i := 0; ; i++ {
+			// Passing 0 for delSize to compactable means we're treating incomplete compactions as
+			// not having finished -- we wait for them to finish. Also, it's crucial this behavior
+			// replicates pickCompactLevels' behavior in computing compactability in order to
+			// guarantee progress.
+			if !s.isLevel0Compactable() && !s.levels[1].isCompactable(0) {
+				break
+			}
+			time.Sleep(10 * time.Millisecond)
+			if i%100 == 0 {
+				prios := s.pickCompactLevels()
+				s.elog.Printf("Waiting to add level 0 table. Compaction priorities: %+v\n", prios)
+				i = 0
+			}
+		}
+		{
+			s.elog.Printf("UNSTALLED UNSTALLED UNSTALLED: %v\n", time.Since(timeStart))
+			lastUnstalled = time.Now()
+		}
+	}
+
+	return nil
+}
+
+func (s *levelsController) close() error { // Closes all levels and wraps the first error, if any.
+	err := s.cleanupLevels()
+	return errors.Wrap(err, "levelsController.Close")
+}
+
+// get returns the found value if any. If not found, we return nil.
+func (s *levelsController) get(key []byte, maxVs *y.ValueStruct) (y.ValueStruct, error) {
+	// It's important that we iterate the levels from 0 on upward.
The reason is, if we iterated + // in opposite order, or in parallel (naively calling all the h.RLock() in some order) we could + // read level L's tables post-compaction and level L+1's tables pre-compaction. (If we do + // parallelize this, we will need to call the h.RLock() function by increasing order of level + // number.) + version := y.ParseTs(key) + for _, h := range s.levels { + vs, err := h.get(key) // Calls h.RLock() and h.RUnlock(). + if err != nil { + return y.ValueStruct{}, errors.Wrapf(err, "get key: %q", key) + } + if vs.Value == nil && vs.Meta == 0 { + continue + } + if maxVs == nil || vs.Version == version { + return vs, nil + } + if maxVs.Version < vs.Version { + *maxVs = vs + } + } + if maxVs != nil { + return *maxVs, nil + } + return y.ValueStruct{}, nil +} + +func appendIteratorsReversed(out []y.Iterator, th []*table.Table, reversed bool) []y.Iterator { + for i := len(th) - 1; i >= 0; i-- { + // This will increment the reference of the table handler. + out = append(out, th[i].NewIterator(reversed)) + } + return out +} + +// appendIterators appends iterators to an array of iterators, for merging. +// Note: This obtains references for the table handlers. Remember to close these iterators. +func (s *levelsController) appendIterators( + iters []y.Iterator, opt *IteratorOptions) []y.Iterator { + // Just like with get, it's important we iterate the levels from 0 on upward, to avoid missing + // data when there's a compaction. + for _, level := range s.levels { + iters = level.appendIterators(iters, opt) + } + return iters +} + +// TableInfo represents the information about a table. 
+type TableInfo struct { + ID uint64 + Level int + Left []byte + Right []byte + KeyCount uint64 // Number of keys in the table +} + +func (s *levelsController) getTableInfo(withKeysCount bool) (result []TableInfo) { + for _, l := range s.levels { + l.RLock() + for _, t := range l.tables { + var count uint64 + if withKeysCount { + it := t.NewIterator(false) + for it.Rewind(); it.Valid(); it.Next() { + count++ + } + } + + info := TableInfo{ + ID: t.ID(), + Level: l.level, + Left: t.Smallest(), + Right: t.Biggest(), + KeyCount: count, + } + result = append(result, info) + } + l.RUnlock() + } + sort.Slice(result, func(i, j int) bool { + if result[i].Level != result[j].Level { + return result[i].Level < result[j].Level + } + return result[i].ID < result[j].ID + }) + return +} diff --git a/vendor/github.com/dgraph-io/badger/logger.go b/vendor/github.com/dgraph-io/badger/logger.go new file mode 100644 index 0000000000..3a9b8a337f --- /dev/null +++ b/vendor/github.com/dgraph-io/badger/logger.go @@ -0,0 +1,85 @@ +/* + * Copyright 2018 Dgraph Labs, Inc. and Contributors + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package badger + +import ( + "log" + "os" +) + +// Logger is implemented by any logging system that is used for standard logs. 
+type Logger interface { + Errorf(string, ...interface{}) + Warningf(string, ...interface{}) + Infof(string, ...interface{}) + Debugf(string, ...interface{}) +} + +// Errorf logs an ERROR log message to the logger specified in opts or to the +// global logger if no logger is specified in opts. +func (opt *Options) Errorf(format string, v ...interface{}) { + if opt.Logger == nil { + return + } + opt.Logger.Errorf(format, v...) +} + +// Infof logs an INFO message to the logger specified in opts. +func (opt *Options) Infof(format string, v ...interface{}) { + if opt.Logger == nil { + return + } + opt.Logger.Infof(format, v...) +} + +// Warningf logs a WARNING message to the logger specified in opts. +func (opt *Options) Warningf(format string, v ...interface{}) { + if opt.Logger == nil { + return + } + opt.Logger.Warningf(format, v...) +} + +// Debugf logs a DEBUG message to the logger specified in opts. +func (opt *Options) Debugf(format string, v ...interface{}) { + if opt.Logger == nil { + return + } + opt.Logger.Debugf(format, v...) +} + +type defaultLog struct { + *log.Logger +} + +var defaultLogger = &defaultLog{Logger: log.New(os.Stderr, "badger ", log.LstdFlags)} + +func (l *defaultLog) Errorf(f string, v ...interface{}) { + l.Printf("ERROR: "+f, v...) +} + +func (l *defaultLog) Warningf(f string, v ...interface{}) { + l.Printf("WARNING: "+f, v...) +} + +func (l *defaultLog) Infof(f string, v ...interface{}) { + l.Printf("INFO: "+f, v...) +} + +func (l *defaultLog) Debugf(f string, v ...interface{}) { + l.Printf("DEBUG: "+f, v...) +} diff --git a/vendor/github.com/dgraph-io/badger/managed_db.go b/vendor/github.com/dgraph-io/badger/managed_db.go new file mode 100644 index 0000000000..4de226ae25 --- /dev/null +++ b/vendor/github.com/dgraph-io/badger/managed_db.go @@ -0,0 +1,68 @@ +/* + * Copyright 2017 Dgraph Labs, Inc. 
and Contributors + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package badger + +// OpenManaged returns a new DB, which allows more control over setting +// transaction timestamps, aka managed mode. +// +// This is only useful for databases built on top of Badger (like Dgraph), and +// can be ignored by most users. +func OpenManaged(opts Options) (*DB, error) { + opts.managedTxns = true + return Open(opts) +} + +// NewTransactionAt follows the same logic as DB.NewTransaction(), but uses the +// provided read timestamp. +// +// This is only useful for databases built on top of Badger (like Dgraph), and +// can be ignored by most users. +func (db *DB) NewTransactionAt(readTs uint64, update bool) *Txn { + if !db.opt.managedTxns { + panic("Cannot use NewTransactionAt with managedDB=false. Use NewTransaction instead.") + } + txn := db.newTransaction(update, true) + txn.readTs = readTs + return txn +} + +// CommitAt commits the transaction, following the same logic as Commit(), but +// at the given commit timestamp. This will panic if not used with managed transactions. +// +// This is only useful for databases built on top of Badger (like Dgraph), and +// can be ignored by most users. +func (txn *Txn) CommitAt(commitTs uint64, callback func(error)) error { + if !txn.db.opt.managedTxns { + panic("Cannot use CommitAt with managedDB=false. 
Use Commit instead.") + } + txn.commitTs = commitTs + if callback == nil { + return txn.Commit() + } + txn.CommitWith(callback) + return nil +} + +// SetDiscardTs sets a timestamp at or below which, any invalid or deleted +// versions can be discarded from the LSM tree, and thence from the value log to +// reclaim disk space. Can only be used with managed transactions. +func (db *DB) SetDiscardTs(ts uint64) { + if !db.opt.managedTxns { + panic("Cannot use SetDiscardTs with managedDB=false.") + } + db.orc.setDiscardTs(ts) +} diff --git a/vendor/github.com/dgraph-io/badger/manifest.go b/vendor/github.com/dgraph-io/badger/manifest.go new file mode 100644 index 0000000000..a581882947 --- /dev/null +++ b/vendor/github.com/dgraph-io/badger/manifest.go @@ -0,0 +1,440 @@ +/* + * Copyright 2017 Dgraph Labs, Inc. and Contributors + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package badger + +import ( + "bufio" + "bytes" + "encoding/binary" + "fmt" + "hash/crc32" + "io" + "os" + "path/filepath" + "sync" + + "github.com/dgraph-io/badger/pb" + "github.com/dgraph-io/badger/y" + "github.com/pkg/errors" +) + +// Manifest represents the contents of the MANIFEST file in a Badger store. +// +// The MANIFEST file describes the startup state of the db -- all LSM files and what level they're +// at. +// +// It consists of a sequence of ManifestChangeSet objects. 
Each of these is treated atomically, +// and contains a sequence of ManifestChange's (file creations/deletions) which we use to +// reconstruct the manifest at startup. +type Manifest struct { + Levels []levelManifest + Tables map[uint64]TableManifest + + // Contains total number of creation and deletion changes in the manifest -- used to compute + // whether it'd be useful to rewrite the manifest. + Creations int + Deletions int +} + +func createManifest() Manifest { + levels := make([]levelManifest, 0) + return Manifest{ + Levels: levels, + Tables: make(map[uint64]TableManifest), + } +} + +// levelManifest contains information about LSM tree levels +// in the MANIFEST file. +type levelManifest struct { + Tables map[uint64]struct{} // Set of table id's +} + +// TableManifest contains information about a specific level +// in the LSM tree. +type TableManifest struct { + Level uint8 + Checksum []byte +} + +// manifestFile holds the file pointer (and other info) about the manifest file, which is a log +// file we append to. +type manifestFile struct { + fp *os.File + directory string + // We make this configurable so that unit tests can hit rewrite() code quickly + deletionsRewriteThreshold int + + // Guards appends, which includes access to the manifest field. + appendLock sync.Mutex + + // Used to track the current state of the manifest, used when rewriting. + manifest Manifest +} + +const ( + // ManifestFilename is the filename for the manifest file. + ManifestFilename = "MANIFEST" + manifestRewriteFilename = "MANIFEST-REWRITE" + manifestDeletionsRewriteThreshold = 10000 + manifestDeletionsRatio = 10 +) + +// asChanges returns a sequence of changes that could be used to recreate the Manifest in its +// present state. 
+func (m *Manifest) asChanges() []*pb.ManifestChange { + changes := make([]*pb.ManifestChange, 0, len(m.Tables)) + for id, tm := range m.Tables { + changes = append(changes, newCreateChange(id, int(tm.Level), tm.Checksum)) + } + return changes +} + +func (m *Manifest) clone() Manifest { + changeSet := pb.ManifestChangeSet{Changes: m.asChanges()} + ret := createManifest() + y.Check(applyChangeSet(&ret, &changeSet)) + return ret +} + +// openOrCreateManifestFile opens a Badger manifest file if it exists, or creates on if +// one doesn’t. +func openOrCreateManifestFile(dir string, readOnly bool) ( + ret *manifestFile, result Manifest, err error) { + return helpOpenOrCreateManifestFile(dir, readOnly, manifestDeletionsRewriteThreshold) +} + +func helpOpenOrCreateManifestFile(dir string, readOnly bool, deletionsThreshold int) ( + ret *manifestFile, result Manifest, err error) { + + path := filepath.Join(dir, ManifestFilename) + var flags uint32 + if readOnly { + flags |= y.ReadOnly + } + fp, err := y.OpenExistingFile(path, flags) // We explicitly sync in addChanges, outside the lock. + if err != nil { + if !os.IsNotExist(err) { + return nil, Manifest{}, err + } + if readOnly { + return nil, Manifest{}, fmt.Errorf("no manifest found, required for read-only db") + } + m := createManifest() + fp, netCreations, err := helpRewrite(dir, &m) + if err != nil { + return nil, Manifest{}, err + } + y.AssertTrue(netCreations == 0) + mf := &manifestFile{ + fp: fp, + directory: dir, + manifest: m.clone(), + deletionsRewriteThreshold: deletionsThreshold, + } + return mf, m, nil + } + + manifest, truncOffset, err := ReplayManifestFile(fp) + if err != nil { + _ = fp.Close() + return nil, Manifest{}, err + } + + if !readOnly { + // Truncate file so we don't have a half-written entry at the end. 
+ if err := fp.Truncate(truncOffset); err != nil { + _ = fp.Close() + return nil, Manifest{}, err + } + } + if _, err = fp.Seek(0, io.SeekEnd); err != nil { + _ = fp.Close() + return nil, Manifest{}, err + } + + mf := &manifestFile{ + fp: fp, + directory: dir, + manifest: manifest.clone(), + deletionsRewriteThreshold: deletionsThreshold, + } + return mf, manifest, nil +} + +func (mf *manifestFile) close() error { + return mf.fp.Close() +} + +// addChanges writes a batch of changes, atomically, to the file. By "atomically" that means when +// we replay the MANIFEST file, we'll either replay all the changes or none of them. (The truth of +// this depends on the filesystem -- some might append garbage data if a system crash happens at +// the wrong time.) +func (mf *manifestFile) addChanges(changesParam []*pb.ManifestChange) error { + changes := pb.ManifestChangeSet{Changes: changesParam} + buf, err := changes.Marshal() + if err != nil { + return err + } + + // Maybe we could use O_APPEND instead (on certain file systems) + mf.appendLock.Lock() + if err := applyChangeSet(&mf.manifest, &changes); err != nil { + mf.appendLock.Unlock() + return err + } + // Rewrite manifest if it'd shrink by 1/10 and it's big enough to care + if mf.manifest.Deletions > mf.deletionsRewriteThreshold && + mf.manifest.Deletions > manifestDeletionsRatio*(mf.manifest.Creations-mf.manifest.Deletions) { + if err := mf.rewrite(); err != nil { + mf.appendLock.Unlock() + return err + } + } else { + var lenCrcBuf [8]byte + binary.BigEndian.PutUint32(lenCrcBuf[0:4], uint32(len(buf))) + binary.BigEndian.PutUint32(lenCrcBuf[4:8], crc32.Checksum(buf, y.CastagnoliCrcTable)) + buf = append(lenCrcBuf[:], buf...) + if _, err := mf.fp.Write(buf); err != nil { + mf.appendLock.Unlock() + return err + } + } + + mf.appendLock.Unlock() + return y.FileSync(mf.fp) +} + +// Has to be 4 bytes. The value can never change, ever, anyway. +var magicText = [4]byte{'B', 'd', 'g', 'r'} + +// The magic version number. 
+const magicVersion = 4 + +func helpRewrite(dir string, m *Manifest) (*os.File, int, error) { + rewritePath := filepath.Join(dir, manifestRewriteFilename) + // We explicitly sync. + fp, err := y.OpenTruncFile(rewritePath, false) + if err != nil { + return nil, 0, err + } + + buf := make([]byte, 8) + copy(buf[0:4], magicText[:]) + binary.BigEndian.PutUint32(buf[4:8], magicVersion) + + netCreations := len(m.Tables) + changes := m.asChanges() + set := pb.ManifestChangeSet{Changes: changes} + + changeBuf, err := set.Marshal() + if err != nil { + fp.Close() + return nil, 0, err + } + var lenCrcBuf [8]byte + binary.BigEndian.PutUint32(lenCrcBuf[0:4], uint32(len(changeBuf))) + binary.BigEndian.PutUint32(lenCrcBuf[4:8], crc32.Checksum(changeBuf, y.CastagnoliCrcTable)) + buf = append(buf, lenCrcBuf[:]...) + buf = append(buf, changeBuf...) + if _, err := fp.Write(buf); err != nil { + fp.Close() + return nil, 0, err + } + if err := y.FileSync(fp); err != nil { + fp.Close() + return nil, 0, err + } + + // In Windows the files should be closed before doing a Rename. + if err = fp.Close(); err != nil { + return nil, 0, err + } + manifestPath := filepath.Join(dir, ManifestFilename) + if err := os.Rename(rewritePath, manifestPath); err != nil { + return nil, 0, err + } + fp, err = y.OpenExistingFile(manifestPath, 0) + if err != nil { + return nil, 0, err + } + if _, err := fp.Seek(0, io.SeekEnd); err != nil { + fp.Close() + return nil, 0, err + } + if err := syncDir(dir); err != nil { + fp.Close() + return nil, 0, err + } + + return fp, netCreations, nil +} + +// Must be called while appendLock is held. +func (mf *manifestFile) rewrite() error { + // In Windows the files should be closed before doing a Rename. 
+ if err := mf.fp.Close(); err != nil { + return err + } + fp, netCreations, err := helpRewrite(mf.directory, &mf.manifest) + if err != nil { + return err + } + mf.fp = fp + mf.manifest.Creations = netCreations + mf.manifest.Deletions = 0 + + return nil +} + +type countingReader struct { + wrapped *bufio.Reader + count int64 +} + +func (r *countingReader) Read(p []byte) (n int, err error) { + n, err = r.wrapped.Read(p) + r.count += int64(n) + return +} + +func (r *countingReader) ReadByte() (b byte, err error) { + b, err = r.wrapped.ReadByte() + if err == nil { + r.count++ + } + return +} + +var ( + errBadMagic = errors.New("manifest has bad magic") + errBadChecksum = errors.New("manifest has checksum mismatch") +) + +// ReplayManifestFile reads the manifest file and constructs two manifest objects. (We need one +// immutable copy and one mutable copy of the manifest. Easiest way is to construct two of them.) +// Also, returns the last offset after a completely read manifest entry -- the file must be +// truncated at that point before further appends are made (if there is a partial entry after +// that). In normal conditions, truncOffset is the file size. 
+func ReplayManifestFile(fp *os.File) (ret Manifest, truncOffset int64, err error) { + r := countingReader{wrapped: bufio.NewReader(fp)} + + var magicBuf [8]byte + if _, err := io.ReadFull(&r, magicBuf[:]); err != nil { + return Manifest{}, 0, errBadMagic + } + if !bytes.Equal(magicBuf[0:4], magicText[:]) { + return Manifest{}, 0, errBadMagic + } + version := binary.BigEndian.Uint32(magicBuf[4:8]) + if version != magicVersion { + return Manifest{}, 0, + fmt.Errorf("manifest has unsupported version: %d (we support %d)", version, magicVersion) + } + + build := createManifest() + var offset int64 + for { + offset = r.count + var lenCrcBuf [8]byte + _, err := io.ReadFull(&r, lenCrcBuf[:]) + if err != nil { + if err == io.EOF || err == io.ErrUnexpectedEOF { + break + } + return Manifest{}, 0, err + } + length := binary.BigEndian.Uint32(lenCrcBuf[0:4]) + var buf = make([]byte, length) + if _, err := io.ReadFull(&r, buf); err != nil { + if err == io.EOF || err == io.ErrUnexpectedEOF { + break + } + return Manifest{}, 0, err + } + if crc32.Checksum(buf, y.CastagnoliCrcTable) != binary.BigEndian.Uint32(lenCrcBuf[4:8]) { + return Manifest{}, 0, errBadChecksum + } + + var changeSet pb.ManifestChangeSet + if err := changeSet.Unmarshal(buf); err != nil { + return Manifest{}, 0, err + } + + if err := applyChangeSet(&build, &changeSet); err != nil { + return Manifest{}, 0, err + } + } + + return build, offset, err +} + +func applyManifestChange(build *Manifest, tc *pb.ManifestChange) error { + switch tc.Op { + case pb.ManifestChange_CREATE: + if _, ok := build.Tables[tc.Id]; ok { + return fmt.Errorf("MANIFEST invalid, table %d exists", tc.Id) + } + build.Tables[tc.Id] = TableManifest{ + Level: uint8(tc.Level), + Checksum: append([]byte{}, tc.Checksum...), + } + for len(build.Levels) <= int(tc.Level) { + build.Levels = append(build.Levels, levelManifest{make(map[uint64]struct{})}) + } + build.Levels[tc.Level].Tables[tc.Id] = struct{}{} + build.Creations++ + case 
pb.ManifestChange_DELETE: + tm, ok := build.Tables[tc.Id] + if !ok { + return fmt.Errorf("MANIFEST removes non-existing table %d", tc.Id) + } + delete(build.Levels[tm.Level].Tables, tc.Id) + delete(build.Tables, tc.Id) + build.Deletions++ + default: + return fmt.Errorf("MANIFEST file has invalid manifestChange op") + } + return nil +} + +// This is not a "recoverable" error -- opening the KV store fails because the MANIFEST file is +// just plain broken. +func applyChangeSet(build *Manifest, changeSet *pb.ManifestChangeSet) error { + for _, change := range changeSet.Changes { + if err := applyManifestChange(build, change); err != nil { + return err + } + } + return nil +} + +func newCreateChange(id uint64, level int, checksum []byte) *pb.ManifestChange { + return &pb.ManifestChange{ + Id: id, + Op: pb.ManifestChange_CREATE, + Level: uint32(level), + Checksum: checksum, + } +} + +func newDeleteChange(id uint64) *pb.ManifestChange { + return &pb.ManifestChange{ + Id: id, + Op: pb.ManifestChange_DELETE, + } +} diff --git a/vendor/github.com/dgraph-io/badger/merge.go b/vendor/github.com/dgraph-io/badger/merge.go new file mode 100644 index 0000000000..02ad4bcde4 --- /dev/null +++ b/vendor/github.com/dgraph-io/badger/merge.go @@ -0,0 +1,177 @@ +/* + * Copyright 2017 Dgraph Labs, Inc. and Contributors + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +package badger + +import ( + "sync" + "time" + + "github.com/dgraph-io/badger/y" + "github.com/pkg/errors" +) + +// MergeOperator represents a Badger merge operator. +type MergeOperator struct { + sync.RWMutex + f MergeFunc + db *DB + key []byte + closer *y.Closer +} + +// MergeFunc accepts two byte slices, one representing an existing value, and +// another representing a new value that needs to be ‘merged’ into it. MergeFunc +// contains the logic to perform the ‘merge’ and return an updated value. +// MergeFunc could perform operations like integer addition, list appends etc. +// Note that the ordering of the operands is maintained. +type MergeFunc func(existingVal, newVal []byte) []byte + +// GetMergeOperator creates a new MergeOperator for a given key and returns a +// pointer to it. It also fires off a goroutine that performs a compaction using +// the merge function that runs periodically, as specified by dur. +func (db *DB) GetMergeOperator(key []byte, + f MergeFunc, dur time.Duration) *MergeOperator { + op := &MergeOperator{ + f: f, + db: db, + key: key, + closer: y.NewCloser(1), + } + + go op.runCompactions(dur) + return op +} + +var errNoMerge = errors.New("No need for merge") + +func (op *MergeOperator) iterateAndMerge() (newVal []byte, latest uint64, err error) { + txn := op.db.NewTransaction(false) + defer txn.Discard() + opt := DefaultIteratorOptions + opt.AllVersions = true + it := txn.NewKeyIterator(op.key, opt) + defer it.Close() + + var numVersions int + for it.Rewind(); it.Valid(); it.Next() { + item := it.Item() + numVersions++ + if numVersions == 1 { + // This should be the newVal, considering this is the latest version. + newVal, err = item.ValueCopy(newVal) + if err != nil { + return nil, 0, err + } + latest = item.Version() + } else { + if err := item.Value(func(oldVal []byte) error { + // The merge should always be on the newVal considering it has the merge result of + // the latest version. The value read should be the oldVal. 
+ newVal = op.f(oldVal, newVal) + return nil + }); err != nil { + return nil, 0, err + } + } + if item.DiscardEarlierVersions() { + break + } + } + if numVersions == 0 { + return nil, latest, ErrKeyNotFound + } else if numVersions == 1 { + return newVal, latest, errNoMerge + } + return newVal, latest, nil +} + +func (op *MergeOperator) compact() error { + op.Lock() + defer op.Unlock() + val, version, err := op.iterateAndMerge() + if err == ErrKeyNotFound || err == errNoMerge { + return nil + } else if err != nil { + return err + } + entries := []*Entry{ + { + Key: y.KeyWithTs(op.key, version), + Value: val, + meta: bitDiscardEarlierVersions, + }, + } + // Write value back to the DB. It is important that we do not set the bitMergeEntry bit + // here. When compaction happens, all the older merged entries will be removed. + return op.db.batchSetAsync(entries, func(err error) { + if err != nil { + op.db.opt.Errorf("failed to insert the result of merge compaction: %s", err) + } + }) +} + +func (op *MergeOperator) runCompactions(dur time.Duration) { + ticker := time.NewTicker(dur) + defer op.closer.Done() + var stop bool + for { + select { + case <-op.closer.HasBeenClosed(): + stop = true + case <-ticker.C: // wait for tick + } + if err := op.compact(); err != nil { + op.db.opt.Errorf("failure while running merge operation: %s", err) + } + if stop { + ticker.Stop() + break + } + } +} + +// Add records a value in Badger which will eventually be merged by a background +// routine into the values that were recorded by previous invocations to Add(). +func (op *MergeOperator) Add(val []byte) error { + return op.db.Update(func(txn *Txn) error { + return txn.SetEntry(NewEntry(op.key, val).withMergeBit()) + }) +} + +// Get returns the latest value for the merge operator, which is derived by +// applying the merge function to all the values added so far. +// +// If Add has not been called even once, Get will return ErrKeyNotFound. 
+func (op *MergeOperator) Get() ([]byte, error) { + op.RLock() + defer op.RUnlock() + var existing []byte + err := op.db.View(func(txn *Txn) (err error) { + existing, _, err = op.iterateAndMerge() + return err + }) + if err == errNoMerge { + return existing, nil + } + return existing, err +} + +// Stop waits for any pending merge to complete and then stops the background +// goroutine. +func (op *MergeOperator) Stop() { + op.closer.SignalAndWait() +} diff --git a/vendor/github.com/dgraph-io/badger/options.go b/vendor/github.com/dgraph-io/badger/options.go new file mode 100644 index 0000000000..b91fdc5e30 --- /dev/null +++ b/vendor/github.com/dgraph-io/badger/options.go @@ -0,0 +1,374 @@ +/* + * Copyright 2017 Dgraph Labs, Inc. and Contributors + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package badger + +import ( + "github.com/dgraph-io/badger/options" +) + +// Note: If you add a new option X make sure you also add a WithX method on Options. + +// Options are params for creating DB object. +// +// This package provides DefaultOptions which contains options that should +// work for most applications. Consider using that as a starting point before +// customizing it for your own needs. +// +// Each option X is documented on the WithX method. +type Options struct { + // Required options. + + Dir string + ValueDir string + + // Usually modified options. 
+ + SyncWrites bool + TableLoadingMode options.FileLoadingMode + ValueLogLoadingMode options.FileLoadingMode + NumVersionsToKeep int + ReadOnly bool + Truncate bool + Logger Logger + + // Fine tuning options. + + MaxTableSize int64 + LevelSizeMultiplier int + MaxLevels int + ValueThreshold int + NumMemtables int + + NumLevelZeroTables int + NumLevelZeroTablesStall int + + LevelOneSize int64 + ValueLogFileSize int64 + ValueLogMaxEntries uint32 + + NumCompactors int + CompactL0OnClose bool + LogRotatesToFlush int32 + + // Transaction start and commit timestamps are managed by end-user. + // This is only useful for databases built on top of Badger (like Dgraph). + // Not recommended for most users. + managedTxns bool + + // 4. Flags for testing purposes + // ------------------------------ + maxBatchCount int64 // max entries in batch + maxBatchSize int64 // max batch size in bytes + +} + +// DefaultOptions sets a list of recommended options for good performance. +// Feel free to modify these to suit your needs with the WithX methods. +func DefaultOptions(path string) Options { + return Options{ + Dir: path, + ValueDir: path, + LevelOneSize: 256 << 20, + LevelSizeMultiplier: 10, + TableLoadingMode: options.MemoryMap, + ValueLogLoadingMode: options.MemoryMap, + // table.MemoryMap to mmap() the tables. + // table.Nothing to not preload the tables. + MaxLevels: 7, + MaxTableSize: 64 << 20, + NumCompactors: 2, // Compactions can be expensive. Only run 2. + NumLevelZeroTables: 5, + NumLevelZeroTablesStall: 10, + NumMemtables: 5, + SyncWrites: true, + NumVersionsToKeep: 1, + CompactL0OnClose: true, + // Nothing to read/write value log using standard File I/O + // MemoryMap to mmap() the value log files + // (2^30 - 1)*2 when mmapping < 2^31 - 1, max int32. + // -1 so 2*ValueLogFileSize won't overflow on 32-bit systems. 
+ ValueLogFileSize: 1<<30 - 1, + + ValueLogMaxEntries: 1000000, + ValueThreshold: 32, + Truncate: false, + Logger: defaultLogger, + LogRotatesToFlush: 2, + } +} + +// LSMOnlyOptions follows from DefaultOptions, but sets a higher ValueThreshold +// so values would be colocated with the LSM tree, with value log largely acting +// as a write-ahead log only. These options would reduce the disk usage of value +// log, and make Badger act more like a typical LSM tree. +func LSMOnlyOptions(path string) Options { + // Max value length which fits in uint16. + // Let's not set any other options, because they can cause issues with the + // size of key-value a user can pass to Badger. For e.g., if we set + // ValueLogFileSize to 64MB, a user can't pass a value more than that. + // Setting it to ValueLogMaxEntries to 1000, can generate too many files. + // These options are better configured on a usage basis, than broadly here. + // The ValueThreshold is the most important setting a user needs to do to + // achieve a heavier usage of LSM tree. + // NOTE: If a user does not want to set 64KB as the ValueThreshold because + // of performance reasons, 1KB would be a good option too, allowing + // values smaller than 1KB to be colocated with the keys in the LSM tree. + return DefaultOptions(path).WithValueThreshold(65500) +} + +// WithDir returns a new Options value with Dir set to the given value. +// +// Dir is the path of the directory where key data will be stored in. +// If it doesn't exist, Badger will try to create it for you. +// This is set automatically to be the path given to `DefaultOptions`. +func (opt Options) WithDir(val string) Options { + opt.Dir = val + return opt +} + +// WithValueDir returns a new Options value with ValueDir set to the given value. +// +// ValueDir is the path of the directory where value data will be stored in. +// If it doesn't exist, Badger will try to create it for you. +// This is set automatically to be the path given to `DefaultOptions`. 
+func (opt Options) WithValueDir(val string) Options { + opt.ValueDir = val + return opt +} + +// WithSyncWrites returns a new Options value with SyncWrites set to the given value. +// +// When SyncWrites is true all writes are synced to disk. Setting this to false would achieve better +// performance, but may cause data loss in case of crash. +// +// The default value of SyncWrites is true. +func (opt Options) WithSyncWrites(val bool) Options { + opt.SyncWrites = val + return opt +} + +// WithTableLoadingMode returns a new Options value with TableLoadingMode set to the given value. +// +// TableLoadingMode indicates which file loading mode should be used for the LSM tree data files. +// +// The default value of TableLoadingMode is options.MemoryMap. +func (opt Options) WithTableLoadingMode(val options.FileLoadingMode) Options { + opt.TableLoadingMode = val + return opt +} + +// WithValueLogLoadingMode returns a new Options value with ValueLogLoadingMode set to the given +// value. +// +// ValueLogLoadingMode indicates which file loading mode should be used for the value log data +// files. +// +// The default value of ValueLogLoadingMode is options.MemoryMap. +func (opt Options) WithValueLogLoadingMode(val options.FileLoadingMode) Options { + opt.ValueLogLoadingMode = val + return opt +} + +// WithNumVersionsToKeep returns a new Options value with NumVersionsToKeep set to the given value. +// +// NumVersionsToKeep sets how many versions to keep per key at most. +// +// The default value of NumVersionsToKeep is 1. +func (opt Options) WithNumVersionsToKeep(val int) Options { + opt.NumVersionsToKeep = val + return opt +} + +// WithReadOnly returns a new Options value with ReadOnly set to the given value. +// +// When ReadOnly is true the DB will be opened on read-only mode. +// Multiple processes can open the same Badger DB. 
+// Note: if the DB being opened had crashed before and has vlog data to be replayed, +// ReadOnly will cause Open to fail with an appropriate message. +// +// The default value of ReadOnly is false. +func (opt Options) WithReadOnly(val bool) Options { + opt.ReadOnly = val + return opt +} + +// WithTruncate returns a new Options value with Truncate set to the given value. +// +// Truncate indicates whether value log files should be truncated to delete corrupt data, if any. +// This option is ignored when ReadOnly is true. +// +// The default value of Truncate is false. +func (opt Options) WithTruncate(val bool) Options { + opt.Truncate = val + return opt +} + +// WithLogger returns a new Options value with Logger set to the given value. +// +// Logger provides a way to configure what logger each value of badger.DB uses. +// +// The default value of Logger writes to stderr using the log package from the Go standard library. +func (opt Options) WithLogger(val Logger) Options { + opt.Logger = val + return opt +} + +// WithMaxTableSize returns a new Options value with MaxTableSize set to the given value. +// +// MaxTableSize sets the maximum size in bytes for each LSM table or file. +// +// The default value of MaxTableSize is 64MB. +func (opt Options) WithMaxTableSize(val int64) Options { + opt.MaxTableSize = val + return opt +} + +// WithLevelSizeMultiplier returns a new Options value with LevelSizeMultiplier set to the given +// value. +// +// LevelSizeMultiplier sets the ratio between the maximum sizes of contiguous levels in the LSM. +// Once a level grows to be larger than this ratio allowed, the compaction process will be +// triggered. +// +// The default value of LevelSizeMultiplier is 10. +func (opt Options) WithLevelSizeMultiplier(val int) Options { + opt.LevelSizeMultiplier = val + return opt +} + +// WithMaxLevels returns a new Options value with MaxLevels set to the given value. +// +// Maximum number of levels of compaction allowed in the LSM. 
+// +// The default value of MaxLevels is 7. +func (opt Options) WithMaxLevels(val int) Options { + opt.MaxLevels = val + return opt +} + +// WithValueThreshold returns a new Options value with ValueThreshold set to the given value. +// +// ValueThreshold sets the threshold used to decide whether a value is stored directly in the LSM +// tree or separatedly in the log value files. +// +// The default value of ValueThreshold is 32, but LSMOnlyOptions sets it to 65500. +func (opt Options) WithValueThreshold(val int) Options { + opt.ValueThreshold = val + return opt +} + +// WithNumMemtables returns a new Options value with NumMemtables set to the given value. +// +// NumMemtables sets the maximum number of tables to keep in memory before stalling. +// +// The default value of NumMemtables is 5. +func (opt Options) WithNumMemtables(val int) Options { + opt.NumMemtables = val + return opt +} + +// WithNumLevelZeroTables returns a new Options value with NumLevelZeroTables set to the given +// value. +// +// NumLevelZeroTables sets the maximum number of Level 0 tables before compaction starts. +// +// The default value of NumLevelZeroTables is 5. +func (opt Options) WithNumLevelZeroTables(val int) Options { + opt.NumLevelZeroTables = val + return opt +} + +// WithNumLevelZeroTablesStall returns a new Options value with NumLevelZeroTablesStall set to the +// given value. +// +// NumLevelZeroTablesStall sets the number of Level 0 tables that once reached causes the DB to +// stall until compaction succeeds. +// +// The default value of NumLevelZeroTablesStall is 10. +func (opt Options) WithNumLevelZeroTablesStall(val int) Options { + opt.NumLevelZeroTablesStall = val + return opt +} + +// WithLevelOneSize returns a new Options value with LevelOneSize set to the given value. +// +// LevelOneSize sets the maximum total size for Level 1. +// +// The default value of LevelOneSize is 20MB. 
+func (opt Options) WithLevelOneSize(val int64) Options { + opt.LevelOneSize = val + return opt +} + +// WithValueLogFileSize returns a new Options value with ValueLogFileSize set to the given value. +// +// ValueLogFileSize sets the maximum size of a single value log file. +// +// The default value of ValueLogFileSize is 1GB. +func (opt Options) WithValueLogFileSize(val int64) Options { + opt.ValueLogFileSize = val + return opt +} + +// WithValueLogMaxEntries returns a new Options value with ValueLogMaxEntries set to the given +// value. +// +// ValueLogMaxEntries sets the maximum number of entries a value log file can hold approximately. +// A actual size limit of a value log file is the minimum of ValueLogFileSize and +// ValueLogMaxEntries. +// +// The default value of ValueLogMaxEntries is one million (1000000). +func (opt Options) WithValueLogMaxEntries(val uint32) Options { + opt.ValueLogMaxEntries = val + return opt +} + +// WithNumCompactors returns a new Options value with NumCompactors set to the given value. +// +// NumCompactors sets the number of compaction workers to run concurrently. +// Setting this to zero stops compactions, which could eventually cause writes to block forever. +// +// The default value of NumCompactors is 2. +func (opt Options) WithNumCompactors(val int) Options { + opt.NumCompactors = val + return opt +} + +// WithCompactL0OnClose returns a new Options value with CompactL0OnClose set to the given value. +// +// CompactL0OnClose determines whether Level 0 should be compacted before closing the DB. +// This ensures that both reads and writes are efficient when the DB is opened later. +// +// The default value of CompactL0OnClose is true. +func (opt Options) WithCompactL0OnClose(val bool) Options { + opt.CompactL0OnClose = val + return opt +} + +// WithLogRotatesToFlush returns a new Options value with LogRotatesToFlush set to the given value. 
+// +// LogRotatesToFlush sets the number of value log file rotates after which the Memtables are +// flushed to disk. This is useful in write loads with fewer keys and larger values. This work load +// would fill up the value logs quickly, while not filling up the Memtables. Thus, on a crash +// and restart, the value log head could cause the replay of a good number of value log files +// which can slow things on start. +// +// The default value of LogRotatesToFlush is 2. +func (opt Options) WithLogRotatesToFlush(val int32) Options { + opt.LogRotatesToFlush = val + return opt +} diff --git a/vendor/github.com/dgraph-io/badger/options/options.go b/vendor/github.com/dgraph-io/badger/options/options.go new file mode 100644 index 0000000000..06c8b1b7f0 --- /dev/null +++ b/vendor/github.com/dgraph-io/badger/options/options.go @@ -0,0 +1,30 @@ +/* + * Copyright 2017 Dgraph Labs, Inc. and Contributors + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package options + +// FileLoadingMode specifies how data in LSM table files and value log files should +// be loaded. 
+type FileLoadingMode int + +const ( + // FileIO indicates that files must be loaded using standard I/O + FileIO FileLoadingMode = iota + // LoadToRAM indicates that file must be loaded into RAM + LoadToRAM + // MemoryMap indicates that that the file must be memory-mapped + MemoryMap +) diff --git a/vendor/github.com/dgraph-io/badger/pb/gen.sh b/vendor/github.com/dgraph-io/badger/pb/gen.sh new file mode 100644 index 0000000000..49b44ff4e0 --- /dev/null +++ b/vendor/github.com/dgraph-io/badger/pb/gen.sh @@ -0,0 +1,7 @@ +#!/bin/bash + +# You might need to go get -v github.com/gogo/protobuf/... + +protos=${GOPATH-$HOME/go}/src/github.com/dgraph-io/badger/pb +pushd $protos > /dev/null +protoc --gofast_out=plugins=grpc:. -I=. pb.proto diff --git a/vendor/github.com/dgraph-io/badger/pb/pb.pb.go b/vendor/github.com/dgraph-io/badger/pb/pb.pb.go new file mode 100644 index 0000000000..f9a2c6eeed --- /dev/null +++ b/vendor/github.com/dgraph-io/badger/pb/pb.pb.go @@ -0,0 +1,1313 @@ +// Code generated by protoc-gen-gogo. DO NOT EDIT. +// source: pb.proto + +package pb + +import ( + fmt "fmt" + proto "github.com/golang/protobuf/proto" + io "io" + math "math" +) + +// Reference imports to suppress errors if they are not otherwise used. +var _ = proto.Marshal +var _ = fmt.Errorf +var _ = math.Inf + +// This is a compile-time assertion to ensure that this generated file +// is compatible with the proto package it is being compiled against. +// A compilation error at this line likely means your copy of the +// proto package needs to be updated. 
+const _ = proto.ProtoPackageIsVersion2 // please upgrade the proto package + +type ManifestChange_Operation int32 + +const ( + ManifestChange_CREATE ManifestChange_Operation = 0 + ManifestChange_DELETE ManifestChange_Operation = 1 +) + +var ManifestChange_Operation_name = map[int32]string{ + 0: "CREATE", + 1: "DELETE", +} + +var ManifestChange_Operation_value = map[string]int32{ + "CREATE": 0, + "DELETE": 1, +} + +func (x ManifestChange_Operation) String() string { + return proto.EnumName(ManifestChange_Operation_name, int32(x)) +} + +func (ManifestChange_Operation) EnumDescriptor() ([]byte, []int) { + return fileDescriptor_f80abaa17e25ccc8, []int{3, 0} +} + +type KV struct { + Key []byte `protobuf:"bytes,1,opt,name=key,proto3" json:"key,omitempty"` + Value []byte `protobuf:"bytes,2,opt,name=value,proto3" json:"value,omitempty"` + UserMeta []byte `protobuf:"bytes,3,opt,name=user_meta,json=userMeta,proto3" json:"user_meta,omitempty"` + Version uint64 `protobuf:"varint,4,opt,name=version,proto3" json:"version,omitempty"` + ExpiresAt uint64 `protobuf:"varint,5,opt,name=expires_at,json=expiresAt,proto3" json:"expires_at,omitempty"` + Meta []byte `protobuf:"bytes,6,opt,name=meta,proto3" json:"meta,omitempty"` + // Stream id is used to identify which stream the KV came from. 
+ StreamId uint32 `protobuf:"varint,10,opt,name=stream_id,json=streamId,proto3" json:"stream_id,omitempty"` + XXX_NoUnkeyedLiteral struct{} `json:"-"` + XXX_unrecognized []byte `json:"-"` + XXX_sizecache int32 `json:"-"` +} + +func (m *KV) Reset() { *m = KV{} } +func (m *KV) String() string { return proto.CompactTextString(m) } +func (*KV) ProtoMessage() {} +func (*KV) Descriptor() ([]byte, []int) { + return fileDescriptor_f80abaa17e25ccc8, []int{0} +} +func (m *KV) XXX_Unmarshal(b []byte) error { + return m.Unmarshal(b) +} +func (m *KV) XXX_Marshal(b []byte, deterministic bool) ([]byte, error) { + if deterministic { + return xxx_messageInfo_KV.Marshal(b, m, deterministic) + } else { + b = b[:cap(b)] + n, err := m.MarshalTo(b) + if err != nil { + return nil, err + } + return b[:n], nil + } +} +func (m *KV) XXX_Merge(src proto.Message) { + xxx_messageInfo_KV.Merge(m, src) +} +func (m *KV) XXX_Size() int { + return m.Size() +} +func (m *KV) XXX_DiscardUnknown() { + xxx_messageInfo_KV.DiscardUnknown(m) +} + +var xxx_messageInfo_KV proto.InternalMessageInfo + +func (m *KV) GetKey() []byte { + if m != nil { + return m.Key + } + return nil +} + +func (m *KV) GetValue() []byte { + if m != nil { + return m.Value + } + return nil +} + +func (m *KV) GetUserMeta() []byte { + if m != nil { + return m.UserMeta + } + return nil +} + +func (m *KV) GetVersion() uint64 { + if m != nil { + return m.Version + } + return 0 +} + +func (m *KV) GetExpiresAt() uint64 { + if m != nil { + return m.ExpiresAt + } + return 0 +} + +func (m *KV) GetMeta() []byte { + if m != nil { + return m.Meta + } + return nil +} + +func (m *KV) GetStreamId() uint32 { + if m != nil { + return m.StreamId + } + return 0 +} + +type KVList struct { + Kv []*KV `protobuf:"bytes,1,rep,name=kv,proto3" json:"kv,omitempty"` + XXX_NoUnkeyedLiteral struct{} `json:"-"` + XXX_unrecognized []byte `json:"-"` + XXX_sizecache int32 `json:"-"` +} + +func (m *KVList) Reset() { *m = KVList{} } +func (m *KVList) String() string { 
return proto.CompactTextString(m) } +func (*KVList) ProtoMessage() {} +func (*KVList) Descriptor() ([]byte, []int) { + return fileDescriptor_f80abaa17e25ccc8, []int{1} +} +func (m *KVList) XXX_Unmarshal(b []byte) error { + return m.Unmarshal(b) +} +func (m *KVList) XXX_Marshal(b []byte, deterministic bool) ([]byte, error) { + if deterministic { + return xxx_messageInfo_KVList.Marshal(b, m, deterministic) + } else { + b = b[:cap(b)] + n, err := m.MarshalTo(b) + if err != nil { + return nil, err + } + return b[:n], nil + } +} +func (m *KVList) XXX_Merge(src proto.Message) { + xxx_messageInfo_KVList.Merge(m, src) +} +func (m *KVList) XXX_Size() int { + return m.Size() +} +func (m *KVList) XXX_DiscardUnknown() { + xxx_messageInfo_KVList.DiscardUnknown(m) +} + +var xxx_messageInfo_KVList proto.InternalMessageInfo + +func (m *KVList) GetKv() []*KV { + if m != nil { + return m.Kv + } + return nil +} + +type ManifestChangeSet struct { + // A set of changes that are applied atomically. + Changes []*ManifestChange `protobuf:"bytes,1,rep,name=changes,proto3" json:"changes,omitempty"` + XXX_NoUnkeyedLiteral struct{} `json:"-"` + XXX_unrecognized []byte `json:"-"` + XXX_sizecache int32 `json:"-"` +} + +func (m *ManifestChangeSet) Reset() { *m = ManifestChangeSet{} } +func (m *ManifestChangeSet) String() string { return proto.CompactTextString(m) } +func (*ManifestChangeSet) ProtoMessage() {} +func (*ManifestChangeSet) Descriptor() ([]byte, []int) { + return fileDescriptor_f80abaa17e25ccc8, []int{2} +} +func (m *ManifestChangeSet) XXX_Unmarshal(b []byte) error { + return m.Unmarshal(b) +} +func (m *ManifestChangeSet) XXX_Marshal(b []byte, deterministic bool) ([]byte, error) { + if deterministic { + return xxx_messageInfo_ManifestChangeSet.Marshal(b, m, deterministic) + } else { + b = b[:cap(b)] + n, err := m.MarshalTo(b) + if err != nil { + return nil, err + } + return b[:n], nil + } +} +func (m *ManifestChangeSet) XXX_Merge(src proto.Message) { + 
xxx_messageInfo_ManifestChangeSet.Merge(m, src) +} +func (m *ManifestChangeSet) XXX_Size() int { + return m.Size() +} +func (m *ManifestChangeSet) XXX_DiscardUnknown() { + xxx_messageInfo_ManifestChangeSet.DiscardUnknown(m) +} + +var xxx_messageInfo_ManifestChangeSet proto.InternalMessageInfo + +func (m *ManifestChangeSet) GetChanges() []*ManifestChange { + if m != nil { + return m.Changes + } + return nil +} + +type ManifestChange struct { + Id uint64 `protobuf:"varint,1,opt,name=Id,proto3" json:"Id,omitempty"` + Op ManifestChange_Operation `protobuf:"varint,2,opt,name=Op,proto3,enum=pb.ManifestChange_Operation" json:"Op,omitempty"` + Level uint32 `protobuf:"varint,3,opt,name=Level,proto3" json:"Level,omitempty"` + Checksum []byte `protobuf:"bytes,4,opt,name=Checksum,proto3" json:"Checksum,omitempty"` + XXX_NoUnkeyedLiteral struct{} `json:"-"` + XXX_unrecognized []byte `json:"-"` + XXX_sizecache int32 `json:"-"` +} + +func (m *ManifestChange) Reset() { *m = ManifestChange{} } +func (m *ManifestChange) String() string { return proto.CompactTextString(m) } +func (*ManifestChange) ProtoMessage() {} +func (*ManifestChange) Descriptor() ([]byte, []int) { + return fileDescriptor_f80abaa17e25ccc8, []int{3} +} +func (m *ManifestChange) XXX_Unmarshal(b []byte) error { + return m.Unmarshal(b) +} +func (m *ManifestChange) XXX_Marshal(b []byte, deterministic bool) ([]byte, error) { + if deterministic { + return xxx_messageInfo_ManifestChange.Marshal(b, m, deterministic) + } else { + b = b[:cap(b)] + n, err := m.MarshalTo(b) + if err != nil { + return nil, err + } + return b[:n], nil + } +} +func (m *ManifestChange) XXX_Merge(src proto.Message) { + xxx_messageInfo_ManifestChange.Merge(m, src) +} +func (m *ManifestChange) XXX_Size() int { + return m.Size() +} +func (m *ManifestChange) XXX_DiscardUnknown() { + xxx_messageInfo_ManifestChange.DiscardUnknown(m) +} + +var xxx_messageInfo_ManifestChange proto.InternalMessageInfo + +func (m *ManifestChange) GetId() uint64 { + if m != 
nil { + return m.Id + } + return 0 +} + +func (m *ManifestChange) GetOp() ManifestChange_Operation { + if m != nil { + return m.Op + } + return ManifestChange_CREATE +} + +func (m *ManifestChange) GetLevel() uint32 { + if m != nil { + return m.Level + } + return 0 +} + +func (m *ManifestChange) GetChecksum() []byte { + if m != nil { + return m.Checksum + } + return nil +} + +func init() { + proto.RegisterEnum("pb.ManifestChange_Operation", ManifestChange_Operation_name, ManifestChange_Operation_value) + proto.RegisterType((*KV)(nil), "pb.KV") + proto.RegisterType((*KVList)(nil), "pb.KVList") + proto.RegisterType((*ManifestChangeSet)(nil), "pb.ManifestChangeSet") + proto.RegisterType((*ManifestChange)(nil), "pb.ManifestChange") +} + +func init() { proto.RegisterFile("pb.proto", fileDescriptor_f80abaa17e25ccc8) } + +var fileDescriptor_f80abaa17e25ccc8 = []byte{ + // 365 bytes of a gzipped FileDescriptorProto + 0x1f, 0x8b, 0x08, 0x00, 0x00, 0x00, 0x00, 0x00, 0x02, 0xff, 0x64, 0x91, 0x4f, 0x8a, 0xdb, 0x30, + 0x14, 0xc6, 0x47, 0x8a, 0xc7, 0xe3, 0xbc, 0xce, 0x04, 0x57, 0x94, 0x22, 0xfa, 0xc7, 0x18, 0x77, + 0xe3, 0xc5, 0xe0, 0xc5, 0xf4, 0x04, 0x69, 0xea, 0x45, 0x48, 0x42, 0x40, 0x0d, 0xd9, 0x06, 0x39, + 0x7e, 0x6d, 0x8c, 0x13, 0x5b, 0x58, 0x8a, 0x69, 0x6f, 0xd2, 0x0b, 0xf4, 0x04, 0xbd, 0x44, 0x97, + 0x3d, 0x42, 0x49, 0x2f, 0x52, 0xac, 0xfc, 0x81, 0xd0, 0xdd, 0xfb, 0xbe, 0xef, 0xbd, 0x4f, 0xf0, + 0x13, 0x78, 0x2a, 0x4b, 0x54, 0x53, 0x9b, 0x9a, 0x51, 0x95, 0x45, 0x3f, 0x09, 0xd0, 0xc9, 0x92, + 0xf9, 0xd0, 0x2b, 0xf1, 0x1b, 0x27, 0x21, 0x89, 0xef, 0x45, 0x37, 0xb2, 0x17, 0x70, 0xdb, 0xca, + 0xed, 0x1e, 0x39, 0xb5, 0xde, 0x51, 0xb0, 0xd7, 0xd0, 0xdf, 0x6b, 0x6c, 0x56, 0x3b, 0x34, 0x92, + 0xf7, 0x6c, 0xe2, 0x75, 0xc6, 0x0c, 0x8d, 0x64, 0x1c, 0xee, 0x5a, 0x6c, 0x74, 0x51, 0x57, 0xdc, + 0x09, 0x49, 0xec, 0x88, 0xb3, 0x64, 0x6f, 0x01, 0xf0, 0xab, 0x2a, 0x1a, 0xd4, 0x2b, 0x69, 0xf8, + 0xad, 0x0d, 0xfb, 0x27, 0x67, 0x68, 0x18, 0x03, 0xc7, 0x16, 0xba, 0xb6, 0xd0, 0xce, 0xdd, 0x4b, + 
0xda, 0x34, 0x28, 0x77, 0xab, 0x22, 0xe7, 0x10, 0x92, 0xf8, 0x41, 0x78, 0x47, 0x63, 0x9c, 0x47, + 0x21, 0xb8, 0x93, 0xe5, 0xb4, 0xd0, 0x86, 0xbd, 0x04, 0x5a, 0xb6, 0x9c, 0x84, 0xbd, 0xf8, 0xd9, + 0x93, 0x9b, 0xa8, 0x2c, 0x99, 0x2c, 0x05, 0x2d, 0xdb, 0x68, 0x08, 0xcf, 0x67, 0xb2, 0x2a, 0x3e, + 0xa3, 0x36, 0xa3, 0x8d, 0xac, 0xbe, 0xe0, 0x27, 0x34, 0xec, 0x11, 0xee, 0xd6, 0x56, 0xe8, 0xd3, + 0x05, 0xeb, 0x2e, 0xae, 0xf7, 0xc4, 0x79, 0x25, 0xfa, 0x41, 0x60, 0x70, 0x9d, 0xb1, 0x01, 0xd0, + 0x71, 0x6e, 0x29, 0x39, 0x82, 0x8e, 0x73, 0xf6, 0x08, 0x74, 0xae, 0x2c, 0xa1, 0xc1, 0xd3, 0x9b, + 0xff, 0xbb, 0x92, 0xb9, 0xc2, 0x46, 0x9a, 0xa2, 0xae, 0x04, 0x9d, 0xab, 0x0e, 0xe9, 0x14, 0x5b, + 0xdc, 0x5a, 0x70, 0x0f, 0xe2, 0x28, 0xd8, 0x2b, 0xf0, 0x46, 0x1b, 0x5c, 0x97, 0x7a, 0xbf, 0xb3, + 0xd8, 0xee, 0xc5, 0x45, 0x47, 0xef, 0xa0, 0x7f, 0xa9, 0x60, 0x00, 0xee, 0x48, 0xa4, 0xc3, 0x45, + 0xea, 0xdf, 0x74, 0xf3, 0xc7, 0x74, 0x9a, 0x2e, 0x52, 0x9f, 0x7c, 0xf0, 0x7f, 0x1d, 0x02, 0xf2, + 0xfb, 0x10, 0x90, 0x3f, 0x87, 0x80, 0x7c, 0xff, 0x1b, 0xdc, 0x64, 0xae, 0xfd, 0xdf, 0xf7, 0xff, + 0x02, 0x00, 0x00, 0xff, 0xff, 0xeb, 0x28, 0x5d, 0xcf, 0xeb, 0x01, 0x00, 0x00, +} + +func (m *KV) Marshal() (dAtA []byte, err error) { + size := m.Size() + dAtA = make([]byte, size) + n, err := m.MarshalTo(dAtA) + if err != nil { + return nil, err + } + return dAtA[:n], nil +} + +func (m *KV) MarshalTo(dAtA []byte) (int, error) { + var i int + _ = i + var l int + _ = l + if len(m.Key) > 0 { + dAtA[i] = 0xa + i++ + i = encodeVarintPb(dAtA, i, uint64(len(m.Key))) + i += copy(dAtA[i:], m.Key) + } + if len(m.Value) > 0 { + dAtA[i] = 0x12 + i++ + i = encodeVarintPb(dAtA, i, uint64(len(m.Value))) + i += copy(dAtA[i:], m.Value) + } + if len(m.UserMeta) > 0 { + dAtA[i] = 0x1a + i++ + i = encodeVarintPb(dAtA, i, uint64(len(m.UserMeta))) + i += copy(dAtA[i:], m.UserMeta) + } + if m.Version != 0 { + dAtA[i] = 0x20 + i++ + i = encodeVarintPb(dAtA, i, uint64(m.Version)) + } + if m.ExpiresAt != 0 { + dAtA[i] = 0x28 + i++ + 
i = encodeVarintPb(dAtA, i, uint64(m.ExpiresAt)) + } + if len(m.Meta) > 0 { + dAtA[i] = 0x32 + i++ + i = encodeVarintPb(dAtA, i, uint64(len(m.Meta))) + i += copy(dAtA[i:], m.Meta) + } + if m.StreamId != 0 { + dAtA[i] = 0x50 + i++ + i = encodeVarintPb(dAtA, i, uint64(m.StreamId)) + } + if m.XXX_unrecognized != nil { + i += copy(dAtA[i:], m.XXX_unrecognized) + } + return i, nil +} + +func (m *KVList) Marshal() (dAtA []byte, err error) { + size := m.Size() + dAtA = make([]byte, size) + n, err := m.MarshalTo(dAtA) + if err != nil { + return nil, err + } + return dAtA[:n], nil +} + +func (m *KVList) MarshalTo(dAtA []byte) (int, error) { + var i int + _ = i + var l int + _ = l + if len(m.Kv) > 0 { + for _, msg := range m.Kv { + dAtA[i] = 0xa + i++ + i = encodeVarintPb(dAtA, i, uint64(msg.Size())) + n, err := msg.MarshalTo(dAtA[i:]) + if err != nil { + return 0, err + } + i += n + } + } + if m.XXX_unrecognized != nil { + i += copy(dAtA[i:], m.XXX_unrecognized) + } + return i, nil +} + +func (m *ManifestChangeSet) Marshal() (dAtA []byte, err error) { + size := m.Size() + dAtA = make([]byte, size) + n, err := m.MarshalTo(dAtA) + if err != nil { + return nil, err + } + return dAtA[:n], nil +} + +func (m *ManifestChangeSet) MarshalTo(dAtA []byte) (int, error) { + var i int + _ = i + var l int + _ = l + if len(m.Changes) > 0 { + for _, msg := range m.Changes { + dAtA[i] = 0xa + i++ + i = encodeVarintPb(dAtA, i, uint64(msg.Size())) + n, err := msg.MarshalTo(dAtA[i:]) + if err != nil { + return 0, err + } + i += n + } + } + if m.XXX_unrecognized != nil { + i += copy(dAtA[i:], m.XXX_unrecognized) + } + return i, nil +} + +func (m *ManifestChange) Marshal() (dAtA []byte, err error) { + size := m.Size() + dAtA = make([]byte, size) + n, err := m.MarshalTo(dAtA) + if err != nil { + return nil, err + } + return dAtA[:n], nil +} + +func (m *ManifestChange) MarshalTo(dAtA []byte) (int, error) { + var i int + _ = i + var l int + _ = l + if m.Id != 0 { + dAtA[i] = 0x8 + i++ + i = 
encodeVarintPb(dAtA, i, uint64(m.Id)) + } + if m.Op != 0 { + dAtA[i] = 0x10 + i++ + i = encodeVarintPb(dAtA, i, uint64(m.Op)) + } + if m.Level != 0 { + dAtA[i] = 0x18 + i++ + i = encodeVarintPb(dAtA, i, uint64(m.Level)) + } + if len(m.Checksum) > 0 { + dAtA[i] = 0x22 + i++ + i = encodeVarintPb(dAtA, i, uint64(len(m.Checksum))) + i += copy(dAtA[i:], m.Checksum) + } + if m.XXX_unrecognized != nil { + i += copy(dAtA[i:], m.XXX_unrecognized) + } + return i, nil +} + +func encodeVarintPb(dAtA []byte, offset int, v uint64) int { + for v >= 1<<7 { + dAtA[offset] = uint8(v&0x7f | 0x80) + v >>= 7 + offset++ + } + dAtA[offset] = uint8(v) + return offset + 1 +} +func (m *KV) Size() (n int) { + if m == nil { + return 0 + } + var l int + _ = l + l = len(m.Key) + if l > 0 { + n += 1 + l + sovPb(uint64(l)) + } + l = len(m.Value) + if l > 0 { + n += 1 + l + sovPb(uint64(l)) + } + l = len(m.UserMeta) + if l > 0 { + n += 1 + l + sovPb(uint64(l)) + } + if m.Version != 0 { + n += 1 + sovPb(uint64(m.Version)) + } + if m.ExpiresAt != 0 { + n += 1 + sovPb(uint64(m.ExpiresAt)) + } + l = len(m.Meta) + if l > 0 { + n += 1 + l + sovPb(uint64(l)) + } + if m.StreamId != 0 { + n += 1 + sovPb(uint64(m.StreamId)) + } + if m.XXX_unrecognized != nil { + n += len(m.XXX_unrecognized) + } + return n +} + +func (m *KVList) Size() (n int) { + if m == nil { + return 0 + } + var l int + _ = l + if len(m.Kv) > 0 { + for _, e := range m.Kv { + l = e.Size() + n += 1 + l + sovPb(uint64(l)) + } + } + if m.XXX_unrecognized != nil { + n += len(m.XXX_unrecognized) + } + return n +} + +func (m *ManifestChangeSet) Size() (n int) { + if m == nil { + return 0 + } + var l int + _ = l + if len(m.Changes) > 0 { + for _, e := range m.Changes { + l = e.Size() + n += 1 + l + sovPb(uint64(l)) + } + } + if m.XXX_unrecognized != nil { + n += len(m.XXX_unrecognized) + } + return n +} + +func (m *ManifestChange) Size() (n int) { + if m == nil { + return 0 + } + var l int + _ = l + if m.Id != 0 { + n += 1 + sovPb(uint64(m.Id)) + 
} + if m.Op != 0 { + n += 1 + sovPb(uint64(m.Op)) + } + if m.Level != 0 { + n += 1 + sovPb(uint64(m.Level)) + } + l = len(m.Checksum) + if l > 0 { + n += 1 + l + sovPb(uint64(l)) + } + if m.XXX_unrecognized != nil { + n += len(m.XXX_unrecognized) + } + return n +} + +func sovPb(x uint64) (n int) { + for { + n++ + x >>= 7 + if x == 0 { + break + } + } + return n +} +func sozPb(x uint64) (n int) { + return sovPb(uint64((x << 1) ^ uint64((int64(x) >> 63)))) +} +func (m *KV) Unmarshal(dAtA []byte) error { + l := len(dAtA) + iNdEx := 0 + for iNdEx < l { + preIndex := iNdEx + var wire uint64 + for shift := uint(0); ; shift += 7 { + if shift >= 64 { + return ErrIntOverflowPb + } + if iNdEx >= l { + return io.ErrUnexpectedEOF + } + b := dAtA[iNdEx] + iNdEx++ + wire |= uint64(b&0x7F) << shift + if b < 0x80 { + break + } + } + fieldNum := int32(wire >> 3) + wireType := int(wire & 0x7) + if wireType == 4 { + return fmt.Errorf("proto: KV: wiretype end group for non-group") + } + if fieldNum <= 0 { + return fmt.Errorf("proto: KV: illegal tag %d (wire type %d)", fieldNum, wire) + } + switch fieldNum { + case 1: + if wireType != 2 { + return fmt.Errorf("proto: wrong wireType = %d for field Key", wireType) + } + var byteLen int + for shift := uint(0); ; shift += 7 { + if shift >= 64 { + return ErrIntOverflowPb + } + if iNdEx >= l { + return io.ErrUnexpectedEOF + } + b := dAtA[iNdEx] + iNdEx++ + byteLen |= int(b&0x7F) << shift + if b < 0x80 { + break + } + } + if byteLen < 0 { + return ErrInvalidLengthPb + } + postIndex := iNdEx + byteLen + if postIndex < 0 { + return ErrInvalidLengthPb + } + if postIndex > l { + return io.ErrUnexpectedEOF + } + m.Key = append(m.Key[:0], dAtA[iNdEx:postIndex]...) 
+ if m.Key == nil { + m.Key = []byte{} + } + iNdEx = postIndex + case 2: + if wireType != 2 { + return fmt.Errorf("proto: wrong wireType = %d for field Value", wireType) + } + var byteLen int + for shift := uint(0); ; shift += 7 { + if shift >= 64 { + return ErrIntOverflowPb + } + if iNdEx >= l { + return io.ErrUnexpectedEOF + } + b := dAtA[iNdEx] + iNdEx++ + byteLen |= int(b&0x7F) << shift + if b < 0x80 { + break + } + } + if byteLen < 0 { + return ErrInvalidLengthPb + } + postIndex := iNdEx + byteLen + if postIndex < 0 { + return ErrInvalidLengthPb + } + if postIndex > l { + return io.ErrUnexpectedEOF + } + m.Value = append(m.Value[:0], dAtA[iNdEx:postIndex]...) + if m.Value == nil { + m.Value = []byte{} + } + iNdEx = postIndex + case 3: + if wireType != 2 { + return fmt.Errorf("proto: wrong wireType = %d for field UserMeta", wireType) + } + var byteLen int + for shift := uint(0); ; shift += 7 { + if shift >= 64 { + return ErrIntOverflowPb + } + if iNdEx >= l { + return io.ErrUnexpectedEOF + } + b := dAtA[iNdEx] + iNdEx++ + byteLen |= int(b&0x7F) << shift + if b < 0x80 { + break + } + } + if byteLen < 0 { + return ErrInvalidLengthPb + } + postIndex := iNdEx + byteLen + if postIndex < 0 { + return ErrInvalidLengthPb + } + if postIndex > l { + return io.ErrUnexpectedEOF + } + m.UserMeta = append(m.UserMeta[:0], dAtA[iNdEx:postIndex]...) 
+ if m.UserMeta == nil { + m.UserMeta = []byte{} + } + iNdEx = postIndex + case 4: + if wireType != 0 { + return fmt.Errorf("proto: wrong wireType = %d for field Version", wireType) + } + m.Version = 0 + for shift := uint(0); ; shift += 7 { + if shift >= 64 { + return ErrIntOverflowPb + } + if iNdEx >= l { + return io.ErrUnexpectedEOF + } + b := dAtA[iNdEx] + iNdEx++ + m.Version |= uint64(b&0x7F) << shift + if b < 0x80 { + break + } + } + case 5: + if wireType != 0 { + return fmt.Errorf("proto: wrong wireType = %d for field ExpiresAt", wireType) + } + m.ExpiresAt = 0 + for shift := uint(0); ; shift += 7 { + if shift >= 64 { + return ErrIntOverflowPb + } + if iNdEx >= l { + return io.ErrUnexpectedEOF + } + b := dAtA[iNdEx] + iNdEx++ + m.ExpiresAt |= uint64(b&0x7F) << shift + if b < 0x80 { + break + } + } + case 6: + if wireType != 2 { + return fmt.Errorf("proto: wrong wireType = %d for field Meta", wireType) + } + var byteLen int + for shift := uint(0); ; shift += 7 { + if shift >= 64 { + return ErrIntOverflowPb + } + if iNdEx >= l { + return io.ErrUnexpectedEOF + } + b := dAtA[iNdEx] + iNdEx++ + byteLen |= int(b&0x7F) << shift + if b < 0x80 { + break + } + } + if byteLen < 0 { + return ErrInvalidLengthPb + } + postIndex := iNdEx + byteLen + if postIndex < 0 { + return ErrInvalidLengthPb + } + if postIndex > l { + return io.ErrUnexpectedEOF + } + m.Meta = append(m.Meta[:0], dAtA[iNdEx:postIndex]...) 
+ if m.Meta == nil { + m.Meta = []byte{} + } + iNdEx = postIndex + case 10: + if wireType != 0 { + return fmt.Errorf("proto: wrong wireType = %d for field StreamId", wireType) + } + m.StreamId = 0 + for shift := uint(0); ; shift += 7 { + if shift >= 64 { + return ErrIntOverflowPb + } + if iNdEx >= l { + return io.ErrUnexpectedEOF + } + b := dAtA[iNdEx] + iNdEx++ + m.StreamId |= uint32(b&0x7F) << shift + if b < 0x80 { + break + } + } + default: + iNdEx = preIndex + skippy, err := skipPb(dAtA[iNdEx:]) + if err != nil { + return err + } + if skippy < 0 { + return ErrInvalidLengthPb + } + if (iNdEx + skippy) < 0 { + return ErrInvalidLengthPb + } + if (iNdEx + skippy) > l { + return io.ErrUnexpectedEOF + } + m.XXX_unrecognized = append(m.XXX_unrecognized, dAtA[iNdEx:iNdEx+skippy]...) + iNdEx += skippy + } + } + + if iNdEx > l { + return io.ErrUnexpectedEOF + } + return nil +} +func (m *KVList) Unmarshal(dAtA []byte) error { + l := len(dAtA) + iNdEx := 0 + for iNdEx < l { + preIndex := iNdEx + var wire uint64 + for shift := uint(0); ; shift += 7 { + if shift >= 64 { + return ErrIntOverflowPb + } + if iNdEx >= l { + return io.ErrUnexpectedEOF + } + b := dAtA[iNdEx] + iNdEx++ + wire |= uint64(b&0x7F) << shift + if b < 0x80 { + break + } + } + fieldNum := int32(wire >> 3) + wireType := int(wire & 0x7) + if wireType == 4 { + return fmt.Errorf("proto: KVList: wiretype end group for non-group") + } + if fieldNum <= 0 { + return fmt.Errorf("proto: KVList: illegal tag %d (wire type %d)", fieldNum, wire) + } + switch fieldNum { + case 1: + if wireType != 2 { + return fmt.Errorf("proto: wrong wireType = %d for field Kv", wireType) + } + var msglen int + for shift := uint(0); ; shift += 7 { + if shift >= 64 { + return ErrIntOverflowPb + } + if iNdEx >= l { + return io.ErrUnexpectedEOF + } + b := dAtA[iNdEx] + iNdEx++ + msglen |= int(b&0x7F) << shift + if b < 0x80 { + break + } + } + if msglen < 0 { + return ErrInvalidLengthPb + } + postIndex := iNdEx + msglen + if postIndex < 0 { + 
return ErrInvalidLengthPb + } + if postIndex > l { + return io.ErrUnexpectedEOF + } + m.Kv = append(m.Kv, &KV{}) + if err := m.Kv[len(m.Kv)-1].Unmarshal(dAtA[iNdEx:postIndex]); err != nil { + return err + } + iNdEx = postIndex + default: + iNdEx = preIndex + skippy, err := skipPb(dAtA[iNdEx:]) + if err != nil { + return err + } + if skippy < 0 { + return ErrInvalidLengthPb + } + if (iNdEx + skippy) < 0 { + return ErrInvalidLengthPb + } + if (iNdEx + skippy) > l { + return io.ErrUnexpectedEOF + } + m.XXX_unrecognized = append(m.XXX_unrecognized, dAtA[iNdEx:iNdEx+skippy]...) + iNdEx += skippy + } + } + + if iNdEx > l { + return io.ErrUnexpectedEOF + } + return nil +} +func (m *ManifestChangeSet) Unmarshal(dAtA []byte) error { + l := len(dAtA) + iNdEx := 0 + for iNdEx < l { + preIndex := iNdEx + var wire uint64 + for shift := uint(0); ; shift += 7 { + if shift >= 64 { + return ErrIntOverflowPb + } + if iNdEx >= l { + return io.ErrUnexpectedEOF + } + b := dAtA[iNdEx] + iNdEx++ + wire |= uint64(b&0x7F) << shift + if b < 0x80 { + break + } + } + fieldNum := int32(wire >> 3) + wireType := int(wire & 0x7) + if wireType == 4 { + return fmt.Errorf("proto: ManifestChangeSet: wiretype end group for non-group") + } + if fieldNum <= 0 { + return fmt.Errorf("proto: ManifestChangeSet: illegal tag %d (wire type %d)", fieldNum, wire) + } + switch fieldNum { + case 1: + if wireType != 2 { + return fmt.Errorf("proto: wrong wireType = %d for field Changes", wireType) + } + var msglen int + for shift := uint(0); ; shift += 7 { + if shift >= 64 { + return ErrIntOverflowPb + } + if iNdEx >= l { + return io.ErrUnexpectedEOF + } + b := dAtA[iNdEx] + iNdEx++ + msglen |= int(b&0x7F) << shift + if b < 0x80 { + break + } + } + if msglen < 0 { + return ErrInvalidLengthPb + } + postIndex := iNdEx + msglen + if postIndex < 0 { + return ErrInvalidLengthPb + } + if postIndex > l { + return io.ErrUnexpectedEOF + } + m.Changes = append(m.Changes, &ManifestChange{}) + if err := 
m.Changes[len(m.Changes)-1].Unmarshal(dAtA[iNdEx:postIndex]); err != nil { + return err + } + iNdEx = postIndex + default: + iNdEx = preIndex + skippy, err := skipPb(dAtA[iNdEx:]) + if err != nil { + return err + } + if skippy < 0 { + return ErrInvalidLengthPb + } + if (iNdEx + skippy) < 0 { + return ErrInvalidLengthPb + } + if (iNdEx + skippy) > l { + return io.ErrUnexpectedEOF + } + m.XXX_unrecognized = append(m.XXX_unrecognized, dAtA[iNdEx:iNdEx+skippy]...) + iNdEx += skippy + } + } + + if iNdEx > l { + return io.ErrUnexpectedEOF + } + return nil +} +func (m *ManifestChange) Unmarshal(dAtA []byte) error { + l := len(dAtA) + iNdEx := 0 + for iNdEx < l { + preIndex := iNdEx + var wire uint64 + for shift := uint(0); ; shift += 7 { + if shift >= 64 { + return ErrIntOverflowPb + } + if iNdEx >= l { + return io.ErrUnexpectedEOF + } + b := dAtA[iNdEx] + iNdEx++ + wire |= uint64(b&0x7F) << shift + if b < 0x80 { + break + } + } + fieldNum := int32(wire >> 3) + wireType := int(wire & 0x7) + if wireType == 4 { + return fmt.Errorf("proto: ManifestChange: wiretype end group for non-group") + } + if fieldNum <= 0 { + return fmt.Errorf("proto: ManifestChange: illegal tag %d (wire type %d)", fieldNum, wire) + } + switch fieldNum { + case 1: + if wireType != 0 { + return fmt.Errorf("proto: wrong wireType = %d for field Id", wireType) + } + m.Id = 0 + for shift := uint(0); ; shift += 7 { + if shift >= 64 { + return ErrIntOverflowPb + } + if iNdEx >= l { + return io.ErrUnexpectedEOF + } + b := dAtA[iNdEx] + iNdEx++ + m.Id |= uint64(b&0x7F) << shift + if b < 0x80 { + break + } + } + case 2: + if wireType != 0 { + return fmt.Errorf("proto: wrong wireType = %d for field Op", wireType) + } + m.Op = 0 + for shift := uint(0); ; shift += 7 { + if shift >= 64 { + return ErrIntOverflowPb + } + if iNdEx >= l { + return io.ErrUnexpectedEOF + } + b := dAtA[iNdEx] + iNdEx++ + m.Op |= ManifestChange_Operation(b&0x7F) << shift + if b < 0x80 { + break + } + } + case 3: + if wireType != 0 { + 
return fmt.Errorf("proto: wrong wireType = %d for field Level", wireType) + } + m.Level = 0 + for shift := uint(0); ; shift += 7 { + if shift >= 64 { + return ErrIntOverflowPb + } + if iNdEx >= l { + return io.ErrUnexpectedEOF + } + b := dAtA[iNdEx] + iNdEx++ + m.Level |= uint32(b&0x7F) << shift + if b < 0x80 { + break + } + } + case 4: + if wireType != 2 { + return fmt.Errorf("proto: wrong wireType = %d for field Checksum", wireType) + } + var byteLen int + for shift := uint(0); ; shift += 7 { + if shift >= 64 { + return ErrIntOverflowPb + } + if iNdEx >= l { + return io.ErrUnexpectedEOF + } + b := dAtA[iNdEx] + iNdEx++ + byteLen |= int(b&0x7F) << shift + if b < 0x80 { + break + } + } + if byteLen < 0 { + return ErrInvalidLengthPb + } + postIndex := iNdEx + byteLen + if postIndex < 0 { + return ErrInvalidLengthPb + } + if postIndex > l { + return io.ErrUnexpectedEOF + } + m.Checksum = append(m.Checksum[:0], dAtA[iNdEx:postIndex]...) + if m.Checksum == nil { + m.Checksum = []byte{} + } + iNdEx = postIndex + default: + iNdEx = preIndex + skippy, err := skipPb(dAtA[iNdEx:]) + if err != nil { + return err + } + if skippy < 0 { + return ErrInvalidLengthPb + } + if (iNdEx + skippy) < 0 { + return ErrInvalidLengthPb + } + if (iNdEx + skippy) > l { + return io.ErrUnexpectedEOF + } + m.XXX_unrecognized = append(m.XXX_unrecognized, dAtA[iNdEx:iNdEx+skippy]...) 
+ iNdEx += skippy + } + } + + if iNdEx > l { + return io.ErrUnexpectedEOF + } + return nil +} +func skipPb(dAtA []byte) (n int, err error) { + l := len(dAtA) + iNdEx := 0 + for iNdEx < l { + var wire uint64 + for shift := uint(0); ; shift += 7 { + if shift >= 64 { + return 0, ErrIntOverflowPb + } + if iNdEx >= l { + return 0, io.ErrUnexpectedEOF + } + b := dAtA[iNdEx] + iNdEx++ + wire |= (uint64(b) & 0x7F) << shift + if b < 0x80 { + break + } + } + wireType := int(wire & 0x7) + switch wireType { + case 0: + for shift := uint(0); ; shift += 7 { + if shift >= 64 { + return 0, ErrIntOverflowPb + } + if iNdEx >= l { + return 0, io.ErrUnexpectedEOF + } + iNdEx++ + if dAtA[iNdEx-1] < 0x80 { + break + } + } + return iNdEx, nil + case 1: + iNdEx += 8 + return iNdEx, nil + case 2: + var length int + for shift := uint(0); ; shift += 7 { + if shift >= 64 { + return 0, ErrIntOverflowPb + } + if iNdEx >= l { + return 0, io.ErrUnexpectedEOF + } + b := dAtA[iNdEx] + iNdEx++ + length |= (int(b) & 0x7F) << shift + if b < 0x80 { + break + } + } + if length < 0 { + return 0, ErrInvalidLengthPb + } + iNdEx += length + if iNdEx < 0 { + return 0, ErrInvalidLengthPb + } + return iNdEx, nil + case 3: + for { + var innerWire uint64 + var start int = iNdEx + for shift := uint(0); ; shift += 7 { + if shift >= 64 { + return 0, ErrIntOverflowPb + } + if iNdEx >= l { + return 0, io.ErrUnexpectedEOF + } + b := dAtA[iNdEx] + iNdEx++ + innerWire |= (uint64(b) & 0x7F) << shift + if b < 0x80 { + break + } + } + innerWireType := int(innerWire & 0x7) + if innerWireType == 4 { + break + } + next, err := skipPb(dAtA[start:]) + if err != nil { + return 0, err + } + iNdEx = start + next + if iNdEx < 0 { + return 0, ErrInvalidLengthPb + } + } + return iNdEx, nil + case 4: + return iNdEx, nil + case 5: + iNdEx += 4 + return iNdEx, nil + default: + return 0, fmt.Errorf("proto: illegal wireType %d", wireType) + } + } + panic("unreachable") +} + +var ( + ErrInvalidLengthPb = fmt.Errorf("proto: negative length 
found during unmarshaling") + ErrIntOverflowPb = fmt.Errorf("proto: integer overflow") +) diff --git a/vendor/github.com/dgraph-io/badger/pb/pb.proto b/vendor/github.com/dgraph-io/badger/pb/pb.proto new file mode 100644 index 0000000000..c6e7f413d7 --- /dev/null +++ b/vendor/github.com/dgraph-io/badger/pb/pb.proto @@ -0,0 +1,52 @@ +/* + * Copyright (C) 2017 Dgraph Labs, Inc. and Contributors + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +// Use protos/gen.sh to generate .pb.go files. +syntax = "proto3"; + +package pb; + +message KV { + bytes key = 1; + bytes value = 2; + bytes user_meta = 3; + uint64 version = 4; + uint64 expires_at = 5; + bytes meta = 6; + + // Stream id is used to identify which stream the KV came from. + uint32 stream_id = 10; +} + +message KVList { + repeated KV kv = 1; +} + +message ManifestChangeSet { + // A set of changes that are applied atomically. + repeated ManifestChange changes = 1; +} + +message ManifestChange { + uint64 Id = 1; + enum Operation { + CREATE = 0; + DELETE = 1; + } + Operation Op = 2; + uint32 Level = 3; // Only used for CREATE + bytes Checksum = 4; // Only used for CREATE +} diff --git a/vendor/github.com/dgraph-io/badger/publisher.go b/vendor/github.com/dgraph-io/badger/publisher.go new file mode 100644 index 0000000000..24588f5c68 --- /dev/null +++ b/vendor/github.com/dgraph-io/badger/publisher.go @@ -0,0 +1,159 @@ +/* + * Copyright 2019 Dgraph Labs, Inc. 
and Contributors + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package badger + +import ( + "bytes" + "sync" + + "github.com/dgraph-io/badger/pb" + "github.com/dgraph-io/badger/y" +) + +type subscriber struct { + prefixes [][]byte + sendCh chan<- *pb.KVList + subCloser *y.Closer +} + +type publisher struct { + sync.Mutex + pubCh chan requests + subscribers map[uint64]subscriber + nextID uint64 +} + +func newPublisher() *publisher { + return &publisher{ + pubCh: make(chan requests, 1000), + subscribers: make(map[uint64]subscriber), + nextID: 0, + } +} + +func (p *publisher) listenForUpdates(c *y.Closer) { + defer func() { + p.cleanSubscribers() + c.Done() + }() + slurp := func(batch []*request) { + for { + select { + case reqs := <-p.pubCh: + batch = append(batch, reqs...) + default: + p.publishUpdates(batch) + return + } + } + } + for { + select { + case <-c.HasBeenClosed(): + return + case reqs := <-p.pubCh: + slurp(reqs) + } + } +} + +func (p *publisher) publishUpdates(reqs requests) { + kvs := &pb.KVList{} + p.Lock() + defer func() { + p.Unlock() + // Release all the request. + reqs.DecrRef() + }() + + // TODO: Optimize this, so we can figure out key -> subscriber quickly, without iterating over + // all the prefixes. + // TODO: Use trie to find subscribers. + for _, s := range p.subscribers { + // BUG: This would send out the same entry multiple times on multiple matches for the same + // subscriber. 
+ for _, prefix := range s.prefixes { + for _, req := range reqs { + for _, e := range req.Entries { + if bytes.HasPrefix(e.Key, prefix) { + // TODO: Maybe we can optimize this by creating the KV once and sending it + // over to multiple subscribers. + k := y.SafeCopy(nil, e.Key) + kv := &pb.KV{ + Key: y.ParseKey(k), + Value: y.SafeCopy(nil, e.Value), + UserMeta: []byte{e.UserMeta}, + ExpiresAt: e.ExpiresAt, + Version: y.ParseTs(k), + } + kvs.Kv = append(kvs.Kv, kv) + } + } + } + } + if len(kvs.GetKv()) > 0 { + s.sendCh <- kvs + } + } +} + +func (p *publisher) newSubscriber(c *y.Closer, prefixes ...[]byte) (<-chan *pb.KVList, uint64) { + p.Lock() + defer p.Unlock() + ch := make(chan *pb.KVList, 1000) + id := p.nextID + // Increment next ID. + p.nextID++ + p.subscribers[id] = subscriber{ + prefixes: prefixes, + sendCh: ch, + subCloser: c, + } + return ch, id +} + +// cleanSubscribers stops all the subscribers. Ideally, It should be called while closing DB. +func (p *publisher) cleanSubscribers() { + p.Lock() + defer p.Unlock() + for id, s := range p.subscribers { + delete(p.subscribers, id) + s.subCloser.SignalAndWait() + } +} + +func (p *publisher) deleteSubscriber(id uint64) { + p.Lock() + defer p.Unlock() + if _, ok := p.subscribers[id]; !ok { + return + } + delete(p.subscribers, id) +} + +func (p *publisher) sendUpdates(reqs []*request) { + // TODO: Prefix check before pushing into pubCh. + if p.noOfSubscribers() != 0 { + p.pubCh <- reqs + } +} + +func (p *publisher) noOfSubscribers() int { + p.Lock() + defer p.Unlock() + return len(p.subscribers) +} diff --git a/vendor/github.com/dgraph-io/badger/skl/README.md b/vendor/github.com/dgraph-io/badger/skl/README.md new file mode 100644 index 0000000000..e22e4590bb --- /dev/null +++ b/vendor/github.com/dgraph-io/badger/skl/README.md @@ -0,0 +1,113 @@ +This is much better than `skiplist` and `slist`. 
+ +``` +BenchmarkReadWrite/frac_0-8 3000000 537 ns/op +BenchmarkReadWrite/frac_1-8 3000000 503 ns/op +BenchmarkReadWrite/frac_2-8 3000000 492 ns/op +BenchmarkReadWrite/frac_3-8 3000000 475 ns/op +BenchmarkReadWrite/frac_4-8 3000000 440 ns/op +BenchmarkReadWrite/frac_5-8 5000000 442 ns/op +BenchmarkReadWrite/frac_6-8 5000000 380 ns/op +BenchmarkReadWrite/frac_7-8 5000000 338 ns/op +BenchmarkReadWrite/frac_8-8 5000000 294 ns/op +BenchmarkReadWrite/frac_9-8 10000000 268 ns/op +BenchmarkReadWrite/frac_10-8 100000000 26.3 ns/op +``` + +And even better than a simple map with read-write lock: + +``` +BenchmarkReadWriteMap/frac_0-8 2000000 774 ns/op +BenchmarkReadWriteMap/frac_1-8 2000000 647 ns/op +BenchmarkReadWriteMap/frac_2-8 3000000 605 ns/op +BenchmarkReadWriteMap/frac_3-8 3000000 603 ns/op +BenchmarkReadWriteMap/frac_4-8 3000000 556 ns/op +BenchmarkReadWriteMap/frac_5-8 3000000 472 ns/op +BenchmarkReadWriteMap/frac_6-8 3000000 476 ns/op +BenchmarkReadWriteMap/frac_7-8 3000000 457 ns/op +BenchmarkReadWriteMap/frac_8-8 5000000 444 ns/op +BenchmarkReadWriteMap/frac_9-8 5000000 361 ns/op +BenchmarkReadWriteMap/frac_10-8 10000000 212 ns/op +``` + +# Node Pooling + +Command used + +``` +rm -Rf tmp && /usr/bin/time -l ./populate -keys_mil 10 +``` + +For pprof results, we run without using /usr/bin/time. There are four runs below. + +Results seem to vary quite a bit between runs. 
+ +## Before node pooling + +``` +1311.53MB of 1338.69MB total (97.97%) +Dropped 30 nodes (cum <= 6.69MB) +Showing top 10 nodes out of 37 (cum >= 12.50MB) + flat flat% sum% cum cum% + 523.04MB 39.07% 39.07% 523.04MB 39.07% github.com/dgraph-io/badger/skl.(*Skiplist).Put + 184.51MB 13.78% 52.85% 184.51MB 13.78% runtime.stringtoslicebyte + 166.01MB 12.40% 65.25% 689.04MB 51.47% github.com/dgraph-io/badger/mem.(*Table).Put + 165MB 12.33% 77.58% 165MB 12.33% runtime.convT2E + 116.92MB 8.73% 86.31% 116.92MB 8.73% bytes.makeSlice + 62.50MB 4.67% 90.98% 62.50MB 4.67% main.newValue + 34.50MB 2.58% 93.56% 34.50MB 2.58% github.com/dgraph-io/badger/table.(*BlockIterator).parseKV + 25.50MB 1.90% 95.46% 100.06MB 7.47% github.com/dgraph-io/badger/y.(*MergeIterator).Next + 21.06MB 1.57% 97.04% 21.06MB 1.57% github.com/dgraph-io/badger/table.(*Table).read + 12.50MB 0.93% 97.97% 12.50MB 0.93% github.com/dgraph-io/badger/table.header.Encode + + 128.31 real 329.37 user 17.11 sys +3355660288 maximum resident set size + 0 average shared memory size + 0 average unshared data size + 0 average unshared stack size + 2203080 page reclaims + 764 page faults + 0 swaps + 275 block input operations + 76 block output operations + 0 messages sent + 0 messages received + 0 signals received + 49173 voluntary context switches + 599922 involuntary context switches +``` + +## After node pooling + +``` +1963.13MB of 2026.09MB total (96.89%) +Dropped 29 nodes (cum <= 10.13MB) +Showing top 10 nodes out of 41 (cum >= 185.62MB) + flat flat% sum% cum cum% + 658.05MB 32.48% 32.48% 658.05MB 32.48% github.com/dgraph-io/badger/skl.glob..func1 + 297.51MB 14.68% 47.16% 297.51MB 14.68% runtime.convT2E + 257.51MB 12.71% 59.87% 257.51MB 12.71% runtime.stringtoslicebyte + 249.01MB 12.29% 72.16% 1007.06MB 49.70% github.com/dgraph-io/badger/mem.(*Table).Put + 142.43MB 7.03% 79.19% 142.43MB 7.03% bytes.makeSlice + 100MB 4.94% 84.13% 758.05MB 37.41% github.com/dgraph-io/badger/skl.newNode + 99.50MB 4.91% 89.04% 99.50MB 
4.91% main.newValue + 75MB 3.70% 92.74% 75MB 3.70% github.com/dgraph-io/badger/table.(*BlockIterator).parseKV + 44.62MB 2.20% 94.94% 44.62MB 2.20% github.com/dgraph-io/badger/table.(*Table).read + 39.50MB 1.95% 96.89% 185.62MB 9.16% github.com/dgraph-io/badger/y.(*MergeIterator).Next + + 135.58 real 374.29 user 17.65 sys +3740614656 maximum resident set size + 0 average shared memory size + 0 average unshared data size + 0 average unshared stack size + 2276566 page reclaims + 770 page faults + 0 swaps + 128 block input operations + 90 block output operations + 0 messages sent + 0 messages received + 0 signals received + 46434 voluntary context switches + 597049 involuntary context switches +``` diff --git a/vendor/github.com/dgraph-io/badger/skl/arena.go b/vendor/github.com/dgraph-io/badger/skl/arena.go new file mode 100644 index 0000000000..def550712f --- /dev/null +++ b/vendor/github.com/dgraph-io/badger/skl/arena.go @@ -0,0 +1,136 @@ +/* + * Copyright 2017 Dgraph Labs, Inc. and Contributors + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package skl + +import ( + "sync/atomic" + "unsafe" + + "github.com/dgraph-io/badger/y" +) + +const ( + offsetSize = int(unsafe.Sizeof(uint32(0))) + + // Always align nodes on 64-bit boundaries, even on 32-bit architectures, + // so that the node.value field is 64-bit aligned. This is necessary because + // node.getValueOffset uses atomic.LoadUint64, which expects its input + // pointer to be 64-bit aligned. 
+ nodeAlign = int(unsafe.Sizeof(uint64(0))) - 1 +) + +// Arena should be lock-free. +type Arena struct { + n uint32 + buf []byte +} + +// newArena returns a new arena. +func newArena(n int64) *Arena { + // Don't store data at position 0 in order to reserve offset=0 as a kind + // of nil pointer. + out := &Arena{ + n: 1, + buf: make([]byte, n), + } + return out +} + +func (s *Arena) size() int64 { + return int64(atomic.LoadUint32(&s.n)) +} + +func (s *Arena) reset() { + atomic.StoreUint32(&s.n, 0) +} + +// putNode allocates a node in the arena. The node is aligned on a pointer-sized +// boundary. The arena offset of the node is returned. +func (s *Arena) putNode(height int) uint32 { + // Compute the amount of the tower that will never be used, since the height + // is less than maxHeight. + unusedSize := (maxHeight - height) * offsetSize + + // Pad the allocation with enough bytes to ensure pointer alignment. + l := uint32(MaxNodeSize - unusedSize + nodeAlign) + n := atomic.AddUint32(&s.n, l) + y.AssertTruef(int(n) <= len(s.buf), + "Arena too small, toWrite:%d newTotal:%d limit:%d", + l, n, len(s.buf)) + + // Return the aligned offset. + m := (n - l + uint32(nodeAlign)) & ^uint32(nodeAlign) + return m +} + +// Put will *copy* val into arena. To make better use of this, reuse your input +// val buffer. Returns an offset into buf. User is responsible for remembering +// size of val. We could also store this size inside arena but the encoding and +// decoding will incur some overhead. 
+func (s *Arena) putVal(v y.ValueStruct) uint32 { + l := uint32(v.EncodedSize()) + n := atomic.AddUint32(&s.n, l) + y.AssertTruef(int(n) <= len(s.buf), + "Arena too small, toWrite:%d newTotal:%d limit:%d", + l, n, len(s.buf)) + m := n - l + v.Encode(s.buf[m:]) + return m +} + +func (s *Arena) putKey(key []byte) uint32 { + l := uint32(len(key)) + n := atomic.AddUint32(&s.n, l) + y.AssertTruef(int(n) <= len(s.buf), + "Arena too small, toWrite:%d newTotal:%d limit:%d", + l, n, len(s.buf)) + m := n - l + y.AssertTrue(len(key) == copy(s.buf[m:n], key)) + return m +} + +// getNode returns a pointer to the node located at offset. If the offset is +// zero, then the nil node pointer is returned. +func (s *Arena) getNode(offset uint32) *node { + if offset == 0 { + return nil + } + + return (*node)(unsafe.Pointer(&s.buf[offset])) +} + +// getKey returns byte slice at offset. +func (s *Arena) getKey(offset uint32, size uint16) []byte { + return s.buf[offset : offset+uint32(size)] +} + +// getVal returns byte slice at offset. The given size should be just the value +// size and should NOT include the meta bytes. +func (s *Arena) getVal(offset uint32, size uint16) (ret y.ValueStruct) { + ret.Decode(s.buf[offset : offset+uint32(size)]) + return +} + +// getNodeOffset returns the offset of node in the arena. If the node pointer is +// nil, then the zero offset is returned. +func (s *Arena) getNodeOffset(nd *node) uint32 { + if nd == nil { + return 0 + } + + return uint32(uintptr(unsafe.Pointer(nd)) - uintptr(unsafe.Pointer(&s.buf[0]))) +} diff --git a/vendor/github.com/dgraph-io/badger/skl/skl.go b/vendor/github.com/dgraph-io/badger/skl/skl.go new file mode 100644 index 0000000000..fc2eff982b --- /dev/null +++ b/vendor/github.com/dgraph-io/badger/skl/skl.go @@ -0,0 +1,517 @@ +/* + * Copyright 2017 Dgraph Labs, Inc. and Contributors + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. 
+ * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +/* +Adapted from RocksDB inline skiplist. + +Key differences: +- No optimization for sequential inserts (no "prev"). +- No custom comparator. +- Support overwrites. This requires care when we see the same key when inserting. + For RocksDB or LevelDB, overwrites are implemented as a newer sequence number in the key, so + there is no need for values. We don't intend to support versioning. In-place updates of values + would be more efficient. +- We discard all non-concurrent code. +- We do not support Splices. This simplifies the code a lot. +- No AllocateNode or other pointer arithmetic. +- We combine the findLessThan, findGreaterOrEqual, etc into one function. +*/ + +package skl + +import ( + "math" + "math/rand" + "sync/atomic" + "unsafe" + + "github.com/dgraph-io/badger/y" +) + +const ( + maxHeight = 20 + heightIncrease = math.MaxUint32 / 3 +) + +// MaxNodeSize is the memory footprint of a node of maximum height. +const MaxNodeSize = int(unsafe.Sizeof(node{})) + +type node struct { + // Multiple parts of the value are encoded as a single uint64 so that it + // can be atomically loaded and stored: + // value offset: uint32 (bits 0-31) + // value size : uint16 (bits 32-47) + value uint64 + + // A byte slice is 24 bytes. We are trying to save space here. + keyOffset uint32 // Immutable. No need to lock to access key. + keySize uint16 // Immutable. No need to lock to access key. + + // Height of the tower. 
+ height uint16 + + // Most nodes do not need to use the full height of the tower, since the + // probability of each successive level decreases exponentially. Because + // these elements are never accessed, they do not need to be allocated. + // Therefore, when a node is allocated in the arena, its memory footprint + // is deliberately truncated to not include unneeded tower elements. + // + // All accesses to elements should use CAS operations, with no need to lock. + tower [maxHeight]uint32 +} + +// Skiplist maps keys to values (in memory) +type Skiplist struct { + height int32 // Current height. 1 <= height <= kMaxHeight. CAS. + head *node + ref int32 + arena *Arena +} + +// IncrRef increases the refcount +func (s *Skiplist) IncrRef() { + atomic.AddInt32(&s.ref, 1) +} + +// DecrRef decrements the refcount, deallocating the Skiplist when done using it +func (s *Skiplist) DecrRef() { + newRef := atomic.AddInt32(&s.ref, -1) + if newRef > 0 { + return + } + + s.arena.reset() + // Indicate we are closed. Good for testing. Also, lets GC reclaim memory. Race condition + // here would suggest we are accessing skiplist when we are supposed to have no reference! + s.arena = nil + // Since the head references the arena's buf, as long as the head is kept around + // GC can't release the buf. + s.head = nil +} + +func newNode(arena *Arena, key []byte, v y.ValueStruct, height int) *node { + // The base level is already allocated in the node struct. 
+ offset := arena.putNode(height) + node := arena.getNode(offset) + node.keyOffset = arena.putKey(key) + node.keySize = uint16(len(key)) + node.height = uint16(height) + node.value = encodeValue(arena.putVal(v), v.EncodedSize()) + return node +} + +func encodeValue(valOffset uint32, valSize uint16) uint64 { + return uint64(valSize)<<32 | uint64(valOffset) +} + +func decodeValue(value uint64) (valOffset uint32, valSize uint16) { + valOffset = uint32(value) + valSize = uint16(value >> 32) + return +} + +// NewSkiplist makes a new empty skiplist, with a given arena size +func NewSkiplist(arenaSize int64) *Skiplist { + arena := newArena(arenaSize) + head := newNode(arena, nil, y.ValueStruct{}, maxHeight) + return &Skiplist{ + height: 1, + head: head, + arena: arena, + ref: 1, + } +} + +func (s *node) getValueOffset() (uint32, uint16) { + value := atomic.LoadUint64(&s.value) + return decodeValue(value) +} + +func (s *node) key(arena *Arena) []byte { + return arena.getKey(s.keyOffset, s.keySize) +} + +func (s *node) setValue(arena *Arena, v y.ValueStruct) { + valOffset := arena.putVal(v) + value := encodeValue(valOffset, v.EncodedSize()) + atomic.StoreUint64(&s.value, value) +} + +func (s *node) getNextOffset(h int) uint32 { + return atomic.LoadUint32(&s.tower[h]) +} + +func (s *node) casNextOffset(h int, old, val uint32) bool { + return atomic.CompareAndSwapUint32(&s.tower[h], old, val) +} + +// Returns true if key is strictly > n.key. +// If n is nil, this is an "end" marker and we return false. +//func (s *Skiplist) keyIsAfterNode(key []byte, n *node) bool { +// y.AssertTrue(n != s.head) +// return n != nil && y.CompareKeys(key, n.key) > 0 +//} + +func randomHeight() int { + h := 1 + for h < maxHeight && rand.Uint32() <= heightIncrease { + h++ + } + return h +} + +func (s *Skiplist) getNext(nd *node, height int) *node { + return s.arena.getNode(nd.getNextOffset(height)) +} + +// findNear finds the node near to key. 
+// If less=true, it finds rightmost node such that node.key < key (if allowEqual=false) or +// node.key <= key (if allowEqual=true). +// If less=false, it finds leftmost node such that node.key > key (if allowEqual=false) or +// node.key >= key (if allowEqual=true). +// Returns the node found. The bool returned is true if the node has key equal to given key. +func (s *Skiplist) findNear(key []byte, less bool, allowEqual bool) (*node, bool) { + x := s.head + level := int(s.getHeight() - 1) + for { + // Assume x.key < key. + next := s.getNext(x, level) + if next == nil { + // x.key < key < END OF LIST + if level > 0 { + // Can descend further to iterate closer to the end. + level-- + continue + } + // Level=0. Cannot descend further. Let's return something that makes sense. + if !less { + return nil, false + } + // Try to return x. Make sure it is not a head node. + if x == s.head { + return nil, false + } + return x, false + } + + nextKey := next.key(s.arena) + cmp := y.CompareKeys(key, nextKey) + if cmp > 0 { + // x.key < next.key < key. We can continue to move right. + x = next + continue + } + if cmp == 0 { + // x.key < key == next.key. + if allowEqual { + return next, true + } + if !less { + // We want >, so go to base level to grab the next bigger note. + return s.getNext(next, 0), false + } + // We want <. If not base level, we should go closer in the next level. + if level > 0 { + level-- + continue + } + // On base level. Return x. + if x == s.head { + return nil, false + } + return x, false + } + // cmp < 0. In other words, x.key < key < next. + if level > 0 { + level-- + continue + } + // At base level. Need to return something. + if !less { + return next, false + } + // Try to return x. Make sure it is not a head node. + if x == s.head { + return nil, false + } + return x, false + } +} + +// findSpliceForLevel returns (outBefore, outAfter) with outBefore.key <= key <= outAfter.key. +// The input "before" tells us where to start looking. 
// If we found a node with the same key, then we return outBefore = outAfter.
// Otherwise, outBefore.key < key < outAfter.key.
// outAfter is nil when the end of the list is reached on this level.
func (s *Skiplist) findSpliceForLevel(key []byte, before *node, level int) (*node, *node) {
	for {
		// Assume before.key < key.
		next := s.getNext(before, level)
		if next == nil {
			return before, next
		}
		nextKey := next.key(s.arena)
		cmp := y.CompareKeys(key, nextKey)
		if cmp == 0 {
			// Equality case.
			return next, next
		}
		if cmp < 0 {
			// before.key < key < next.key. We are done for this level.
			return before, next
		}
		before = next // Keep moving right on this level.
	}
}

// getHeight atomically loads the current height of the skiplist.
func (s *Skiplist) getHeight() int32 {
	return atomic.LoadInt32(&s.height)
}

// Put inserts the key-value pair. If a node with an equal key already exists, its value is
// overwritten in place and no new node is created.
func (s *Skiplist) Put(key []byte, v y.ValueStruct) {
	// Since we allow overwrite, we may not need to create a new node. We might not even need to
	// increase the height. Let's defer these actions.

	listHeight := s.getHeight()
	// prev[i]/next[i] hold the splice (neighbours of key) found on level i. Index listHeight is
	// a sentinel entry that seeds the search for the level below it.
	var prev [maxHeight + 1]*node
	var next [maxHeight + 1]*node
	prev[listHeight] = s.head
	next[listHeight] = nil
	for i := int(listHeight) - 1; i >= 0; i-- {
		// Use higher level to speed up for current level.
		prev[i], next[i] = s.findSpliceForLevel(key, prev[i+1], i)
		if prev[i] == next[i] {
			// Key already present: overwrite and stop.
			prev[i].setValue(s.arena, v)
			return
		}
	}

	// We do need to create a new node.
	height := randomHeight()
	x := newNode(s.arena, key, v, height)

	// Try to increase s.height via CAS.
	listHeight = s.getHeight()
	for height > int(listHeight) {
		if atomic.CompareAndSwapInt32(&s.height, listHeight, int32(height)) {
			// Successfully increased skiplist.height.
			break
		}
		// Lost the race; reload and re-check whether the height still needs raising.
		listHeight = s.getHeight()
	}

	// We always insert from the base level and up. After you add a node in base level, we cannot
	// create a node in the level above because it would have discovered the node in the base level.
	for i := 0; i < height; i++ {
		for {
			if prev[i] == nil {
				y.AssertTrue(i > 1) // This cannot happen in base level.
				// We haven't computed prev, next for this level because height exceeds old listHeight.
				// For these levels, we expect the lists to be sparse, so we can just search from head.
				prev[i], next[i] = s.findSpliceForLevel(key, s.head, i)
				// Someone adds the exact same key before we are able to do so. This can only happen on
				// the base level. But we know we are not on the base level.
				y.AssertTrue(prev[i] != next[i])
			}
			// Point x at its successor first, then publish x by swinging prev's pointer with CAS.
			nextOffset := s.arena.getNodeOffset(next[i])
			x.tower[i] = nextOffset
			if prev[i].casNextOffset(i, nextOffset, s.arena.getNodeOffset(x)) {
				// Managed to insert x between prev[i] and next[i]. Go to the next level.
				break
			}
			// CAS failed. We need to recompute prev and next.
			// It is unlikely to be helpful to try to use a different level as we redo the search,
			// because it is unlikely that lots of nodes are inserted between prev[i] and next[i].
			prev[i], next[i] = s.findSpliceForLevel(key, prev[i], i)
			if prev[i] == next[i] {
				y.AssertTruef(i == 0, "Equality can happen only on base level: %d", i)
				prev[i].setValue(s.arena, v)
				return
			}
		}
	}
}

// Empty reports whether the Skiplist has no elements.
func (s *Skiplist) Empty() bool {
	return s.findLast() == nil
}

// findLast returns the last element. If head (empty list), we return nil. All the find functions
// will NEVER return the head nodes.
func (s *Skiplist) findLast() *node {
	n := s.head
	level := int(s.getHeight()) - 1
	for {
		// Move right as far as possible on this level, then drop one level.
		next := s.getNext(n, level)
		if next != nil {
			n = next
			continue
		}
		if level == 0 {
			if n == s.head {
				return nil
			}
			return n
		}
		level--
	}
}

// Get gets the value associated with the key. It returns a valid value if it finds equal or earlier
// version of the same key. A zero y.ValueStruct is returned when no such entry exists.
func (s *Skiplist) Get(key []byte) y.ValueStruct {
	n, _ := s.findNear(key, false, true) // findGreaterOrEqual.
	if n == nil {
		return y.ValueStruct{}
	}

	nextKey := s.arena.getKey(n.keyOffset, n.keySize)
	if !y.SameKey(key, nextKey) {
		return y.ValueStruct{}
	}

	valOffset, valSize := n.getValueOffset()
	vs := s.arena.getVal(valOffset, valSize)
	// The version is taken from the key that was actually found, not from the lookup key.
	vs.Version = y.ParseTs(nextKey)
	return vs
}

// NewIterator returns a skiplist iterator. You have to Close() the iterator.
// The iterator holds a reference on the Skiplist (IncrRef) until Close is called.
func (s *Skiplist) NewIterator() *Iterator {
	s.IncrRef()
	return &Iterator{list: s}
}

// MemSize returns the size of the Skiplist in terms of how much memory is used within its internal
// arena.
func (s *Skiplist) MemSize() int64 { return s.arena.size() }

// Iterator is an iterator over skiplist object. For new objects, you just
// need to initialize Iterator.list.
type Iterator struct {
	list *Skiplist
	n    *node // current position; nil means the iterator is not valid
}

// Close frees the resources held by the iterator
func (s *Iterator) Close() error {
	s.list.DecrRef()
	return nil
}

// Valid returns true iff the iterator is positioned at a valid node.
func (s *Iterator) Valid() bool { return s.n != nil }

// Key returns the key at the current position. The iterator must be Valid.
func (s *Iterator) Key() []byte {
	return s.list.arena.getKey(s.n.keyOffset, s.n.keySize)
}

// Value returns value at the current position. The iterator must be Valid.
func (s *Iterator) Value() y.ValueStruct {
	valOffset, valSize := s.n.getValueOffset()
	return s.list.arena.getVal(valOffset, valSize)
}

// Next advances to the next position.
func (s *Iterator) Next() {
	y.AssertTrue(s.Valid())
	s.n = s.list.getNext(s.n, 0)
}

// Prev advances to the previous position. There are no back pointers, so this performs a
// fresh findNear search for the node strictly before the current key.
func (s *Iterator) Prev() {
	y.AssertTrue(s.Valid())
	s.n, _ = s.list.findNear(s.Key(), true, false) // find <. No equality allowed.
}

// Seek advances to the first entry with a key >= target.
func (s *Iterator) Seek(target []byte) {
	s.n, _ = s.list.findNear(target, false, true) // find >=.
}

// SeekForPrev finds an entry with key <= target.
+func (s *Iterator) SeekForPrev(target []byte) { + s.n, _ = s.list.findNear(target, true, true) // find <=. +} + +// SeekToFirst seeks position at the first entry in list. +// Final state of iterator is Valid() iff list is not empty. +func (s *Iterator) SeekToFirst() { + s.n = s.list.getNext(s.list.head, 0) +} + +// SeekToLast seeks position at the last entry in list. +// Final state of iterator is Valid() iff list is not empty. +func (s *Iterator) SeekToLast() { + s.n = s.list.findLast() +} + +// UniIterator is a unidirectional memtable iterator. It is a thin wrapper around +// Iterator. We like to keep Iterator as before, because it is more powerful and +// we might support bidirectional iterators in the future. +type UniIterator struct { + iter *Iterator + reversed bool +} + +// NewUniIterator returns a UniIterator. +func (s *Skiplist) NewUniIterator(reversed bool) *UniIterator { + return &UniIterator{ + iter: s.NewIterator(), + reversed: reversed, + } +} + +// Next implements y.Interface +func (s *UniIterator) Next() { + if !s.reversed { + s.iter.Next() + } else { + s.iter.Prev() + } +} + +// Rewind implements y.Interface +func (s *UniIterator) Rewind() { + if !s.reversed { + s.iter.SeekToFirst() + } else { + s.iter.SeekToLast() + } +} + +// Seek implements y.Interface +func (s *UniIterator) Seek(key []byte) { + if !s.reversed { + s.iter.Seek(key) + } else { + s.iter.SeekForPrev(key) + } +} + +// Key implements y.Interface +func (s *UniIterator) Key() []byte { return s.iter.Key() } + +// Value implements y.Interface +func (s *UniIterator) Value() y.ValueStruct { return s.iter.Value() } + +// Valid implements y.Interface +func (s *UniIterator) Valid() bool { return s.iter.Valid() } + +// Close implements y.Interface (and frees up the iter's resources) +func (s *UniIterator) Close() error { return s.iter.Close() } diff --git a/vendor/github.com/dgraph-io/badger/stream.go b/vendor/github.com/dgraph-io/badger/stream.go new file mode 100644 index 
0000000000..f0841a6a40 --- /dev/null +++ b/vendor/github.com/dgraph-io/badger/stream.go @@ -0,0 +1,385 @@ +/* + * Copyright 2018 Dgraph Labs, Inc. and Contributors + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package badger + +import ( + "bytes" + "context" + "math" + "sync" + "sync/atomic" + "time" + + "github.com/dgraph-io/badger/pb" + "github.com/dgraph-io/badger/y" + humanize "github.com/dustin/go-humanize" +) + +const pageSize = 4 << 20 // 4MB + +// Stream provides a framework to concurrently iterate over a snapshot of Badger, pick up +// key-values, batch them up and call Send. Stream does concurrent iteration over many smaller key +// ranges. It does NOT send keys in lexicographical sorted order. To get keys in sorted +// order, use Iterator. +type Stream struct { + // Prefix to only iterate over certain range of keys. If set to nil (default), Stream would + // iterate over the entire DB. + Prefix []byte + + // Number of goroutines to use for iterating over key ranges. Defaults to 16. + NumGo int + + // Badger would produce log entries in Infof to indicate the progress of Stream. LogPrefix can + // be used to help differentiate them from other activities. Default is "Badger.Stream". + LogPrefix string + + // ChooseKey is invoked each time a new key is encountered. Note that this is not called + // on every version of the value, only the first encountered version (i.e. the highest version + // of the value a key has). 
ChooseKey can be left nil to select all keys. + // + // Note: Calls to ChooseKey are concurrent. + ChooseKey func(item *Item) bool + + // KeyToList, similar to ChooseKey, is only invoked on the highest version of the value. It + // is upto the caller to iterate over the versions and generate zero, one or more KVs. It + // is expected that the user would advance the iterator to go through the versions of the + // values. However, the user MUST immediately return from this function on the first encounter + // with a mismatching key. See example usage in ToList function. Can be left nil to use ToList + // function by default. + // + // Note: Calls to KeyToList are concurrent. + KeyToList func(key []byte, itr *Iterator) (*pb.KVList, error) + + // This is the method where Stream sends the final output. All calls to Send are done by a + // single goroutine, i.e. logic within Send method can expect single threaded execution. + Send func(*pb.KVList) error + + readTs uint64 + db *DB + rangeCh chan keyRange + kvChan chan *pb.KVList + nextStreamId uint32 +} + +// ToList is a default implementation of KeyToList. It picks up all valid versions of the key, +// skipping over deleted or expired keys. +func (st *Stream) ToList(key []byte, itr *Iterator) (*pb.KVList, error) { + list := &pb.KVList{} + for ; itr.Valid(); itr.Next() { + item := itr.Item() + if item.IsDeletedOrExpired() { + break + } + if !bytes.Equal(key, item.Key()) { + // Break out on the first encounter with another key. + break + } + + valCopy, err := item.ValueCopy(nil) + if err != nil { + return nil, err + } + kv := &pb.KV{ + Key: item.KeyCopy(nil), + Value: valCopy, + UserMeta: []byte{item.UserMeta()}, + Version: item.Version(), + ExpiresAt: item.ExpiresAt(), + } + list.Kv = append(list.Kv, kv) + if st.db.opt.NumVersionsToKeep == 1 { + break + } + + if item.DiscardEarlierVersions() { + break + } + } + return list, nil +} + +// keyRange is [start, end), including start, excluding end. 
Do ensure that the start, +// end byte slices are owned by keyRange struct. +func (st *Stream) produceRanges(ctx context.Context) { + splits := st.db.KeySplits(st.Prefix) + + // We don't need to create more key ranges than NumGo goroutines. This way, we will have limited + // number of "streams" coming out, which then helps limit the memory used by SSWriter. + { + pickEvery := int(math.Floor(float64(len(splits)) / float64(st.NumGo))) + if pickEvery < 1 { + pickEvery = 1 + } + filtered := splits[:0] + for i, split := range splits { + if (i+1)%pickEvery == 0 { + filtered = append(filtered, split) + } + } + splits = filtered + } + + start := y.SafeCopy(nil, st.Prefix) + for _, key := range splits { + st.rangeCh <- keyRange{left: start, right: y.SafeCopy(nil, []byte(key))} + start = y.SafeCopy(nil, []byte(key)) + } + // Edge case: prefix is empty and no splits exist. In that case, we should have at least one + // keyRange output. + st.rangeCh <- keyRange{left: start} + close(st.rangeCh) +} + +// produceKVs picks up ranges from rangeCh, generates KV lists and sends them to kvChan. +func (st *Stream) produceKVs(ctx context.Context) error { + var size int + var txn *Txn + if st.readTs > 0 { + txn = st.db.NewTransactionAt(st.readTs, false) + } else { + txn = st.db.NewTransaction(false) + } + defer txn.Discard() + + iterate := func(kr keyRange) error { + iterOpts := DefaultIteratorOptions + iterOpts.AllVersions = true + iterOpts.Prefix = st.Prefix + iterOpts.PrefetchValues = false + itr := txn.NewIterator(iterOpts) + defer itr.Close() + + // This unique stream id is used to identify all the keys from this iteration. + streamId := atomic.AddUint32(&st.nextStreamId, 1) + + outList := new(pb.KVList) + var prevKey []byte + for itr.Seek(kr.left); itr.Valid(); { + // it.Valid would only return true for keys with the provided Prefix in iterOpts. + item := itr.Item() + if bytes.Equal(item.Key(), prevKey) { + itr.Next() + continue + } + prevKey = append(prevKey[:0], item.Key()...) 
+ + // Check if we reached the end of the key range. + if len(kr.right) > 0 && bytes.Compare(item.Key(), kr.right) >= 0 { + break + } + // Check if we should pick this key. + if st.ChooseKey != nil && !st.ChooseKey(item) { + continue + } + + // Now convert to key value. + list, err := st.KeyToList(item.KeyCopy(nil), itr) + if err != nil { + return err + } + if list == nil || len(list.Kv) == 0 { + continue + } + outList.Kv = append(outList.Kv, list.Kv...) + size += list.Size() + if size >= pageSize { + for _, kv := range outList.Kv { + kv.StreamId = streamId + } + select { + case st.kvChan <- outList: + case <-ctx.Done(): + return ctx.Err() + } + outList = new(pb.KVList) + size = 0 + } + } + if len(outList.Kv) > 0 { + for _, kv := range outList.Kv { + kv.StreamId = streamId + } + // TODO: Think of a way to indicate that a stream is over. + select { + case st.kvChan <- outList: + case <-ctx.Done(): + return ctx.Err() + } + } + return nil + } + + for { + select { + case kr, ok := <-st.rangeCh: + if !ok { + // Done with the keys. + return nil + } + if err := iterate(kr); err != nil { + return err + } + case <-ctx.Done(): + return ctx.Err() + } + } +} + +func (st *Stream) streamKVs(ctx context.Context) error { + var count int + var bytesSent uint64 + t := time.NewTicker(time.Second) + defer t.Stop() + now := time.Now() + + slurp := func(batch *pb.KVList) error { + loop: + for { + select { + case kvs, ok := <-st.kvChan: + if !ok { + break loop + } + y.AssertTrue(kvs != nil) + batch.Kv = append(batch.Kv, kvs.Kv...) 
+ default: + break loop + } + } + sz := uint64(batch.Size()) + bytesSent += sz + count += len(batch.Kv) + t := time.Now() + if err := st.Send(batch); err != nil { + return err + } + st.db.opt.Infof("%s Created batch of size: %s in %s.\n", + st.LogPrefix, humanize.Bytes(sz), time.Since(t)) + return nil + } + +outer: + for { + var batch *pb.KVList + select { + case <-ctx.Done(): + return ctx.Err() + + case <-t.C: + dur := time.Since(now) + durSec := uint64(dur.Seconds()) + if durSec == 0 { + continue + } + speed := bytesSent / durSec + st.db.opt.Infof("%s Time elapsed: %s, bytes sent: %s, speed: %s/sec\n", st.LogPrefix, + y.FixedDuration(dur), humanize.Bytes(bytesSent), humanize.Bytes(speed)) + + case kvs, ok := <-st.kvChan: + if !ok { + break outer + } + y.AssertTrue(kvs != nil) + batch = kvs + if err := slurp(batch); err != nil { + return err + } + } + } + + st.db.opt.Infof("%s Sent %d keys\n", st.LogPrefix, count) + return nil +} + +// Orchestrate runs Stream. It picks up ranges from the SSTables, then runs NumGo number of +// goroutines to iterate over these ranges and batch up KVs in lists. It concurrently runs a single +// goroutine to pick these lists, batch them up further and send to Output.Send. Orchestrate also +// spits logs out to Infof, using provided LogPrefix. Note that all calls to Output.Send +// are serial. In case any of these steps encounter an error, Orchestrate would stop execution and +// return that error. Orchestrate can be called multiple times, but in serial order. +func (st *Stream) Orchestrate(ctx context.Context) error { + st.rangeCh = make(chan keyRange, 3) // Contains keys for posting lists. + + // kvChan should only have a small capacity to ensure that we don't buffer up too much data if + // sending is slow. Page size is set to 4MB, which is used to lazily cap the size of each + // KVList. To get 128MB buffer, we can set the channel size to 32. 
+ st.kvChan = make(chan *pb.KVList, 32) + + if st.KeyToList == nil { + st.KeyToList = st.ToList + } + + // Picks up ranges from Badger, and sends them to rangeCh. + go st.produceRanges(ctx) + + errCh := make(chan error, 1) // Stores error by consumeKeys. + var wg sync.WaitGroup + for i := 0; i < st.NumGo; i++ { + wg.Add(1) + go func() { + defer wg.Done() + // Picks up ranges from rangeCh, generates KV lists, and sends them to kvChan. + if err := st.produceKVs(ctx); err != nil { + select { + case errCh <- err: + default: + } + } + }() + } + + // Pick up key-values from kvChan and send to stream. + kvErr := make(chan error, 1) + go func() { + // Picks up KV lists from kvChan, and sends them to Output. + kvErr <- st.streamKVs(ctx) + }() + wg.Wait() // Wait for produceKVs to be over. + close(st.kvChan) // Now we can close kvChan. + + select { + case err := <-errCh: // Check error from produceKVs. + return err + default: + } + + // Wait for key streaming to be over. + err := <-kvErr + return err +} + +func (db *DB) newStream() *Stream { + return &Stream{db: db, NumGo: 16, LogPrefix: "Badger.Stream"} +} + +// NewStream creates a new Stream. +func (db *DB) NewStream() *Stream { + if db.opt.managedTxns { + panic("This API can not be called in managed mode.") + } + return db.newStream() +} + +// NewStreamAt creates a new Stream at a particular timestamp. Should only be used with managed DB. +func (db *DB) NewStreamAt(readTs uint64) *Stream { + if !db.opt.managedTxns { + panic("This API can only be called in managed mode.") + } + stream := db.newStream() + stream.readTs = readTs + return stream +} diff --git a/vendor/github.com/dgraph-io/badger/stream_writer.go b/vendor/github.com/dgraph-io/badger/stream_writer.go new file mode 100644 index 0000000000..3d2a7992ef --- /dev/null +++ b/vendor/github.com/dgraph-io/badger/stream_writer.go @@ -0,0 +1,358 @@ +/* + * Copyright 2019 Dgraph Labs, Inc. 
and Contributors + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package badger + +import ( + "math" + + "github.com/dgraph-io/badger/pb" + "github.com/dgraph-io/badger/table" + "github.com/dgraph-io/badger/y" + humanize "github.com/dustin/go-humanize" + "github.com/pkg/errors" +) + +const headStreamId uint32 = math.MaxUint32 + +// StreamWriter is used to write data coming from multiple streams. The streams must not have any +// overlapping key ranges. Within each stream, the keys must be sorted. Badger Stream framework is +// capable of generating such an output. So, this StreamWriter can be used at the other end to build +// BadgerDB at a much faster pace by writing SSTables (and value logs) directly to LSM tree levels +// without causing any compactions at all. This is way faster than using batched writer or using +// transactions, but only applicable in situations where the keys are pre-sorted and the DB is being +// bootstrapped. Existing data would get deleted when using this writer. So, this is only useful +// when restoring from backup or replicating DB across servers. +// +// StreamWriter should not be called on in-use DB instances. It is designed only to bootstrap new +// DBs. +type StreamWriter struct { + db *DB + done func() + throttle *y.Throttle + maxVersion uint64 + writers map[uint32]*sortedWriter + closer *y.Closer +} + +// NewStreamWriter creates a StreamWriter. Right after creating StreamWriter, Prepare must be +// called. 
The memory usage of a StreamWriter is directly proportional to the number of streams +// possible. So, efforts must be made to keep the number of streams low. Stream framework would +// typically use 16 goroutines and hence create 16 streams. +func (db *DB) NewStreamWriter() *StreamWriter { + return &StreamWriter{ + db: db, + // throttle shouldn't make much difference. Memory consumption is based on the number of + // concurrent streams being processed. + throttle: y.NewThrottle(16), + writers: make(map[uint32]*sortedWriter), + closer: y.NewCloser(0), + } +} + +// Prepare should be called before writing any entry to StreamWriter. It deletes all data present in +// existing DB, stops compactions and any writes being done by other means. Be very careful when +// calling Prepare, because it could result in permanent data loss. Not calling Prepare would result +// in a corrupt Badger instance. +func (sw *StreamWriter) Prepare() error { + var err error + sw.done, err = sw.db.dropAll() + return err +} + +// Write writes KVList to DB. Each KV within the list contains the stream id which StreamWriter +// would use to demux the writes. Write is not thread safe and it should NOT be called concurrently. +func (sw *StreamWriter) Write(kvs *pb.KVList) error { + if len(kvs.GetKv()) == 0 { + return nil + } + streamReqs := make(map[uint32]*request) + for _, kv := range kvs.Kv { + var meta, userMeta byte + if len(kv.Meta) > 0 { + meta = kv.Meta[0] + } + if len(kv.UserMeta) > 0 { + userMeta = kv.UserMeta[0] + } + if sw.maxVersion < kv.Version { + sw.maxVersion = kv.Version + } + e := &Entry{ + Key: y.KeyWithTs(kv.Key, kv.Version), + Value: kv.Value, + UserMeta: userMeta, + ExpiresAt: kv.ExpiresAt, + meta: meta, + } + // If the value can be colocated with the key in LSM tree, we can skip + // writing the value to value log. 
+ e.skipVlog = sw.db.shouldWriteValueToLSM(*e) + req := streamReqs[kv.StreamId] + if req == nil { + req = &request{} + streamReqs[kv.StreamId] = req + } + req.Entries = append(req.Entries, e) + } + var all []*request + for _, req := range streamReqs { + all = append(all, req) + } + if err := sw.db.vlog.write(all); err != nil { + return err + } + + for streamId, req := range streamReqs { + writer, ok := sw.writers[streamId] + if !ok { + writer = sw.newWriter(streamId) + sw.writers[streamId] = writer + } + writer.reqCh <- req + } + return nil +} + +// Flush is called once we are done writing all the entries. It syncs DB directories. It also +// updates Oracle with maxVersion found in all entries (if DB is not managed). +func (sw *StreamWriter) Flush() error { + defer sw.done() + + sw.closer.SignalAndWait() + var maxHead valuePointer + for _, writer := range sw.writers { + if err := writer.Done(); err != nil { + return err + } + if maxHead.Less(writer.head) { + maxHead = writer.head + } + } + + // Encode and write the value log head into a new table. + data := make([]byte, vptrSize) + maxHead.Encode(data) + headWriter := sw.newWriter(headStreamId) + if err := headWriter.Add( + y.KeyWithTs(head, sw.maxVersion), + y.ValueStruct{Value: data}); err != nil { + return err + } + if err := headWriter.Done(); err != nil { + return err + } + + if !sw.db.opt.managedTxns { + if sw.db.orc != nil { + sw.db.orc.Stop() + } + sw.db.orc = newOracle(sw.db.opt) + sw.db.orc.nextTxnTs = sw.maxVersion + sw.db.orc.txnMark.Done(sw.maxVersion) + sw.db.orc.readMark.Done(sw.maxVersion) + sw.db.orc.incrementNextTs() + } + + // Wait for all files to be written. + if err := sw.throttle.Finish(); err != nil { + return err + } + + // Now sync the directories, so all the files are registered. 
+ if sw.db.opt.ValueDir != sw.db.opt.Dir { + if err := syncDir(sw.db.opt.ValueDir); err != nil { + return err + } + } + if err := syncDir(sw.db.opt.Dir); err != nil { + return err + } + return sw.db.lc.validate() +} + +type sortedWriter struct { + db *DB + throttle *y.Throttle + + builder *table.Builder + lastKey []byte + streamId uint32 + reqCh chan *request + head valuePointer +} + +func (sw *StreamWriter) newWriter(streamId uint32) *sortedWriter { + w := &sortedWriter{ + db: sw.db, + streamId: streamId, + throttle: sw.throttle, + builder: table.NewTableBuilder(), + reqCh: make(chan *request, 3), + } + sw.closer.AddRunning(1) + go w.handleRequests(sw.closer) + return w +} + +// ErrUnsortedKey is returned when any out of order key arrives at sortedWriter during call to Add. +var ErrUnsortedKey = errors.New("Keys not in sorted order") + +func (w *sortedWriter) handleRequests(closer *y.Closer) { + defer closer.Done() + + process := func(req *request) { + for i, e := range req.Entries { + vptr := req.Ptrs[i] + if !vptr.IsZero() { + y.AssertTrue(w.head.Less(vptr)) + w.head = vptr + } + + var vs y.ValueStruct + if e.skipVlog { + vs = y.ValueStruct{ + Value: e.Value, + Meta: e.meta, + UserMeta: e.UserMeta, + ExpiresAt: e.ExpiresAt, + } + } else { + vbuf := make([]byte, vptrSize) + vs = y.ValueStruct{ + Value: vptr.Encode(vbuf), + Meta: e.meta | bitValuePointer, + UserMeta: e.UserMeta, + ExpiresAt: e.ExpiresAt, + } + } + if err := w.Add(e.Key, vs); err != nil { + panic(err) + } + } + } + + for { + select { + case req := <-w.reqCh: + process(req) + case <-closer.HasBeenClosed(): + close(w.reqCh) + for req := range w.reqCh { + process(req) + } + return + } + } +} + +// Add adds key and vs to sortedWriter. +func (w *sortedWriter) Add(key []byte, vs y.ValueStruct) error { + if len(w.lastKey) > 0 && y.CompareKeys(key, w.lastKey) <= 0 { + return ErrUnsortedKey + } + + sameKey := y.SameKey(key, w.lastKey) + // Same keys should go into the same SSTable. 
+ if !sameKey && w.builder.ReachedCapacity(w.db.opt.MaxTableSize) { + if err := w.send(); err != nil { + return err + } + } + + w.lastKey = y.SafeCopy(w.lastKey, key) + return w.builder.Add(key, vs) +} + +func (w *sortedWriter) send() error { + if err := w.throttle.Do(); err != nil { + return err + } + go func(builder *table.Builder) { + data := builder.Finish() + err := w.createTable(data) + w.throttle.Done(err) + }(w.builder) + w.builder = table.NewTableBuilder() + return nil +} + +// Done is called once we are done writing all keys and valueStructs +// to sortedWriter. It completes writing current SST to disk. +func (w *sortedWriter) Done() error { + if w.builder.Empty() { + return nil + } + return w.send() +} + +func (w *sortedWriter) createTable(data []byte) error { + if len(data) == 0 { + return nil + } + fileID := w.db.lc.reserveFileID() + fd, err := y.CreateSyncedFile(table.NewFilename(fileID, w.db.opt.Dir), true) + if err != nil { + return err + } + if _, err := fd.Write(data); err != nil { + return err + } + tbl, err := table.OpenTable(fd, w.db.opt.TableLoadingMode, nil) + if err != nil { + return err + } + lc := w.db.lc + + var lhandler *levelHandler + // We should start the levels from 1, because we need level 0 to set the !badger!head key. We + // cannot mix up this key with other keys from the DB, otherwise we would introduce a range + // overlap violation. + y.AssertTrue(len(lc.levels) > 1) + for _, l := range lc.levels[1:] { + ratio := float64(l.getTotalSize()) / float64(l.maxTotalSize) + if ratio < 1.0 { + lhandler = l + break + } + } + if lhandler == nil { + // If we're exceeding the size of the lowest level, shove it in the lowest level. Can't do + // better than that. + lhandler = lc.levels[len(lc.levels)-1] + } + if w.streamId == headStreamId { + // This is a special !badger!head key. We should store it at level 0, separate from all the + // other keys to avoid an overlap. 
+ lhandler = lc.levels[0] + } + // Now that table can be opened successfully, let's add this to the MANIFEST. + change := &pb.ManifestChange{ + Id: tbl.ID(), + Op: pb.ManifestChange_CREATE, + Level: uint32(lhandler.level), + Checksum: tbl.Checksum, + } + if err := w.db.manifest.addChanges([]*pb.ManifestChange{change}); err != nil { + return err + } + if err := lhandler.replaceTables([]*table.Table{}, []*table.Table{tbl}); err != nil { + return err + } + w.db.opt.Infof("Table created: %d at level: %d for stream: %d. Size: %s\n", + fileID, lhandler.level, w.streamId, humanize.Bytes(uint64(tbl.Size()))) + return nil +} diff --git a/vendor/github.com/dgraph-io/badger/structs.go b/vendor/github.com/dgraph-io/badger/structs.go new file mode 100644 index 0000000000..51d16cdb2d --- /dev/null +++ b/vendor/github.com/dgraph-io/badger/structs.go @@ -0,0 +1,186 @@ +package badger + +import ( + "bytes" + "encoding/binary" + "fmt" + "hash/crc32" + "time" + + "github.com/dgraph-io/badger/y" +) + +type valuePointer struct { + Fid uint32 + Len uint32 + Offset uint32 +} + +func (p valuePointer) Less(o valuePointer) bool { + if p.Fid != o.Fid { + return p.Fid < o.Fid + } + if p.Offset != o.Offset { + return p.Offset < o.Offset + } + return p.Len < o.Len +} + +func (p valuePointer) IsZero() bool { + return p.Fid == 0 && p.Offset == 0 && p.Len == 0 +} + +const vptrSize = 12 + +// Encode encodes Pointer into byte buffer. +func (p valuePointer) Encode(b []byte) []byte { + binary.BigEndian.PutUint32(b[:4], p.Fid) + binary.BigEndian.PutUint32(b[4:8], p.Len) + binary.BigEndian.PutUint32(b[8:12], p.Offset) + return b[:vptrSize] +} + +func (p *valuePointer) Decode(b []byte) { + p.Fid = binary.BigEndian.Uint32(b[:4]) + p.Len = binary.BigEndian.Uint32(b[4:8]) + p.Offset = binary.BigEndian.Uint32(b[8:12]) +} + +// header is used in value log as a header before Entry. 
+type header struct { + klen uint32 + vlen uint32 + expiresAt uint64 + meta byte + userMeta byte +} + +const ( + headerBufSize = 18 +) + +func (h header) Encode(out []byte) { + y.AssertTrue(len(out) >= headerBufSize) + binary.BigEndian.PutUint32(out[0:4], h.klen) + binary.BigEndian.PutUint32(out[4:8], h.vlen) + binary.BigEndian.PutUint64(out[8:16], h.expiresAt) + out[16] = h.meta + out[17] = h.userMeta +} + +// Decodes h from buf. +func (h *header) Decode(buf []byte) { + h.klen = binary.BigEndian.Uint32(buf[0:4]) + h.vlen = binary.BigEndian.Uint32(buf[4:8]) + h.expiresAt = binary.BigEndian.Uint64(buf[8:16]) + h.meta = buf[16] + h.userMeta = buf[17] +} + +// Entry provides Key, Value, UserMeta and ExpiresAt. This struct can be used by +// the user to set data. +type Entry struct { + Key []byte + Value []byte + UserMeta byte + ExpiresAt uint64 // time.Unix + meta byte + + // Fields maintained internally. + offset uint32 + skipVlog bool +} + +func (e *Entry) estimateSize(threshold int) int { + if len(e.Value) < threshold { + return len(e.Key) + len(e.Value) + 2 // Meta, UserMeta + } + return len(e.Key) + 12 + 2 // 12 for ValuePointer, 2 for metas. +} + +// Encodes e to buf. Returns number of bytes written. 
+func encodeEntry(e *Entry, buf *bytes.Buffer) (int, error) { + h := header{ + klen: uint32(len(e.Key)), + vlen: uint32(len(e.Value)), + expiresAt: e.ExpiresAt, + meta: e.meta, + userMeta: e.UserMeta, + } + + var headerEnc [headerBufSize]byte + h.Encode(headerEnc[:]) + + hash := crc32.New(y.CastagnoliCrcTable) + + buf.Write(headerEnc[:]) + if _, err := hash.Write(headerEnc[:]); err != nil { + return 0, err + } + + buf.Write(e.Key) + if _, err := hash.Write(e.Key); err != nil { + return 0, err + } + + buf.Write(e.Value) + if _, err := hash.Write(e.Value); err != nil { + return 0, err + } + + var crcBuf [crc32.Size]byte + binary.BigEndian.PutUint32(crcBuf[:], hash.Sum32()) + buf.Write(crcBuf[:]) + + return len(headerEnc) + len(e.Key) + len(e.Value) + len(crcBuf), nil +} + +func (e Entry) print(prefix string) { + fmt.Printf("%s Key: %s Meta: %d UserMeta: %d Offset: %d len(val)=%d", + prefix, e.Key, e.meta, e.UserMeta, e.offset, len(e.Value)) +} + +// NewEntry creates a new entry with key and value passed in args. This newly created entry can be +// set in a transaction by calling txn.SetEntry(). All other properties of Entry can be set by +// calling WithMeta, WithDiscard, WithTTL methods on it. +// This function uses key and value reference, hence users must +// not modify key and value until the end of transaction. +func NewEntry(key, value []byte) *Entry { + return &Entry{ + Key: key, + Value: value, + } +} + +// WithMeta adds meta data to Entry e. This byte is stored alongside the key +// and can be used as an aid to interpret the value or store other contextual +// bits corresponding to the key-value pair of entry. +func (e *Entry) WithMeta(meta byte) *Entry { + e.UserMeta = meta + return e +} + +// WithDiscard adds a marker to Entry e. This means all the previous versions of the key (of the +// Entry) will be eligible for garbage collection. +// This method is only useful if you have set a higher limit for options.NumVersionsToKeep. 
The +// default setting is 1, in which case, this function doesn't add any more benefit. If however, you +// have a higher setting for NumVersionsToKeep (in Dgraph, we set it to infinity), you can use this +// method to indicate that all the older versions can be discarded and removed during compactions. +func (e *Entry) WithDiscard() *Entry { + e.meta = bitDiscardEarlierVersions + return e +} + +// WithTTL adds time to live duration to Entry e. Entry stored with a TTL would automatically expire +// after the time has elapsed, and will be eligible for garbage collection. +func (e *Entry) WithTTL(dur time.Duration) *Entry { + e.ExpiresAt = uint64(time.Now().Add(dur).Unix()) + return e +} + +// withMergeBit sets merge bit in entry's metadata. This +// function is called by MergeOperator's Add method. +func (e *Entry) withMergeBit() *Entry { + e.meta = bitMergeEntry + return e +} diff --git a/vendor/github.com/dgraph-io/badger/table/README.md b/vendor/github.com/dgraph-io/badger/table/README.md new file mode 100644 index 0000000000..a784f12680 --- /dev/null +++ b/vendor/github.com/dgraph-io/badger/table/README.md @@ -0,0 +1,69 @@ +Size of table is 122,173,606 bytes for all benchmarks. + +# BenchmarkRead +``` +$ go test -bench ^BenchmarkRead$ -run ^$ -count 3 +goos: linux +goarch: amd64 +pkg: github.com/dgraph-io/badger/table +BenchmarkRead-16 10 153281932 ns/op +BenchmarkRead-16 10 153454443 ns/op +BenchmarkRead-16 10 155349696 ns/op +PASS +ok github.com/dgraph-io/badger/table 23.549s +``` + +Size of table is 122,173,606 bytes, which is ~117MB. + +The rate is ~750MB/s using LoadToRAM (when table is in RAM). + +To read a 64MB table, this would take ~0.0853s, which is negligible. 
+ +# BenchmarkReadAndBuild +```go +$ go test -bench BenchmarkReadAndBuild -run ^$ -count 3 +goos: linux +goarch: amd64 +pkg: github.com/dgraph-io/badger/table +BenchmarkReadAndBuild-16 2 945041628 ns/op +BenchmarkReadAndBuild-16 2 947120893 ns/op +BenchmarkReadAndBuild-16 2 954909506 ns/op +PASS +ok github.com/dgraph-io/badger/table 26.856s +``` + +The rate is ~122MB/s. To build a 64MB table, this would take ~0.52s. Note that this +does NOT include the flushing of the table to disk. All we are doing above is +reading one table (which is in RAM) and write one table in memory. + +The table building takes 0.52-0.0853s ~ 0.4347s. + +# BenchmarkReadMerged +Below, we merge 5 tables. The total size remains unchanged at ~122M. + +```go +$ go test -bench ReadMerged -run ^$ -count 3 +BenchmarkReadMerged-16 2 954475788 ns/op +BenchmarkReadMerged-16 2 955252462 ns/op +BenchmarkReadMerged-16 2 956857353 ns/op +PASS +ok github.com/dgraph-io/badger/table 33.327s +``` + +The rate is ~122MB/s. To read a 64MB table using merge iterator, this would take ~0.52s. + +# BenchmarkRandomRead + +```go +go test -bench BenchmarkRandomRead$ -run ^$ -count 3 +goos: linux +goarch: amd64 +pkg: github.com/dgraph-io/badger/table +BenchmarkRandomRead-16 300000 3596 ns/op +BenchmarkRandomRead-16 300000 3621 ns/op +BenchmarkRandomRead-16 300000 3596 ns/op +PASS +ok github.com/dgraph-io/badger/table 44.727s +``` + +For random read benchmarking, we are randomly reading a key and verifying its value. diff --git a/vendor/github.com/dgraph-io/badger/table/builder.go b/vendor/github.com/dgraph-io/badger/table/builder.go new file mode 100644 index 0000000000..0657cbca18 --- /dev/null +++ b/vendor/github.com/dgraph-io/badger/table/builder.go @@ -0,0 +1,237 @@ +/* + * Copyright 2017 Dgraph Labs, Inc. and Contributors + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. 
+ * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package table + +import ( + "bytes" + "encoding/binary" + "io" + "math" + + "github.com/AndreasBriese/bbloom" + "github.com/dgraph-io/badger/y" +) + +var ( + restartInterval = 100 // Might want to change this to be based on total size instead of numKeys. +) + +func newBuffer(sz int) *bytes.Buffer { + b := new(bytes.Buffer) + b.Grow(sz) + return b +} + +type header struct { + plen uint16 // Overlap with base key. + klen uint16 // Length of the diff. + vlen uint16 // Length of value. + prev uint32 // Offset for the previous key-value pair. The offset is relative to block base offset. +} + +// Encode encodes the header. +func (h header) Encode(b []byte) { + binary.BigEndian.PutUint16(b[0:2], h.plen) + binary.BigEndian.PutUint16(b[2:4], h.klen) + binary.BigEndian.PutUint16(b[4:6], h.vlen) + binary.BigEndian.PutUint32(b[6:10], h.prev) +} + +// Decode decodes the header. +func (h *header) Decode(buf []byte) int { + h.plen = binary.BigEndian.Uint16(buf[0:2]) + h.klen = binary.BigEndian.Uint16(buf[2:4]) + h.vlen = binary.BigEndian.Uint16(buf[4:6]) + h.prev = binary.BigEndian.Uint32(buf[6:10]) + return h.Size() +} + +// Size returns size of the header. Currently it's just a constant. +func (h header) Size() int { return 10 } + +// Builder is used in building a table. +type Builder struct { + counter int // Number of keys written for the current block. + + // Typically tens or hundreds of meg. This is for one single file. + buf *bytes.Buffer + + baseKey []byte // Base key for the current block. + baseOffset uint32 // Offset for the current block. 
+ + restarts []uint32 // Base offsets of every block. + + // Tracks offset for the previous key-value pair. Offset is relative to block base offset. + prevOffset uint32 + + keyBuf *bytes.Buffer + keyCount int +} + +// NewTableBuilder makes a new TableBuilder. +func NewTableBuilder() *Builder { + return &Builder{ + keyBuf: newBuffer(1 << 20), + buf: newBuffer(1 << 20), + prevOffset: math.MaxUint32, // Used for the first element! + } +} + +// Close closes the TableBuilder. +func (b *Builder) Close() {} + +// Empty returns whether it's empty. +func (b *Builder) Empty() bool { return b.buf.Len() == 0 } + +// keyDiff returns a suffix of newKey that is different from b.baseKey. +func (b Builder) keyDiff(newKey []byte) []byte { + var i int + for i = 0; i < len(newKey) && i < len(b.baseKey); i++ { + if newKey[i] != b.baseKey[i] { + break + } + } + return newKey[i:] +} + +func (b *Builder) addHelper(key []byte, v y.ValueStruct) { + // Add key to bloom filter. + if len(key) > 0 { + var klen [2]byte + keyNoTs := y.ParseKey(key) + binary.BigEndian.PutUint16(klen[:], uint16(len(keyNoTs))) + b.keyBuf.Write(klen[:]) + b.keyBuf.Write(keyNoTs) + b.keyCount++ + } + + // diffKey stores the difference of key with baseKey. + var diffKey []byte + if len(b.baseKey) == 0 { + // Make a copy. Builder should not keep references. Otherwise, caller has to be very careful + // and will have to make copies of keys every time they add to builder, which is even worse. + b.baseKey = append(b.baseKey[:0], key...) + diffKey = key + } else { + diffKey = b.keyDiff(key) + } + + h := header{ + plen: uint16(len(key) - len(diffKey)), + klen: uint16(len(diffKey)), + vlen: uint16(v.EncodedSize()), + prev: b.prevOffset, // prevOffset is the location of the last key-value added. + } + b.prevOffset = uint32(b.buf.Len()) - b.baseOffset // Remember current offset for the next Add call. + + // Layout: header, diffKey, value. 
+ var hbuf [10]byte + h.Encode(hbuf[:]) + b.buf.Write(hbuf[:]) + b.buf.Write(diffKey) // We only need to store the key difference. + + v.EncodeTo(b.buf) + b.counter++ // Increment number of keys added for this current block. +} + +func (b *Builder) finishBlock() { + // When we are at the end of the block and Valid=false, and the user wants to do a Prev, + // we need a dummy header to tell us the offset of the previous key-value pair. + b.addHelper([]byte{}, y.ValueStruct{}) +} + +// Add adds a key-value pair to the block. +// If doNotRestart is true, we will not restart even if b.counter >= restartInterval. +func (b *Builder) Add(key []byte, value y.ValueStruct) error { + if b.counter >= restartInterval { + b.finishBlock() + // Start a new block. Initialize the block. + b.restarts = append(b.restarts, uint32(b.buf.Len())) + b.counter = 0 + b.baseKey = []byte{} + b.baseOffset = uint32(b.buf.Len()) + b.prevOffset = math.MaxUint32 // First key-value pair of block has header.prev=MaxInt. + } + b.addHelper(key, value) + return nil // Currently, there is no meaningful error. +} + +// TODO: vvv this was the comment on ReachedCapacity. +// FinalSize returns the *rough* final size of the array, counting the header which is +// not yet written. +// TODO: Look into why there is a discrepancy. I suspect it is because of Write(empty, empty) +// at the end. The diff can vary. + +// ReachedCapacity returns true if we... roughly (?) reached capacity? +func (b *Builder) ReachedCapacity(cap int64) bool { + estimateSz := b.buf.Len() + 8 /* empty header */ + 4*len(b.restarts) + + 8 /* 8 = end of buf offset + len(restarts) */ + return int64(estimateSz) > cap +} + +// blockIndex generates the block index for the table. +// It is mainly a list of all the block base offsets. +func (b *Builder) blockIndex() []byte { + // Store the end offset, so we know the length of the final block. 
+ b.restarts = append(b.restarts, uint32(b.buf.Len())) + + // Add 4 because we want to write out number of restarts at the end. + sz := 4*len(b.restarts) + 4 + out := make([]byte, sz) + buf := out + for _, r := range b.restarts { + binary.BigEndian.PutUint32(buf[:4], r) + buf = buf[4:] + } + binary.BigEndian.PutUint32(buf[:4], uint32(len(b.restarts))) + return out +} + +// Finish finishes the table by appending the index. +func (b *Builder) Finish() []byte { + bf := bbloom.New(float64(b.keyCount), 0.01) + var klen [2]byte + key := make([]byte, 1024) + for { + if _, err := b.keyBuf.Read(klen[:]); err == io.EOF { + break + } else if err != nil { + y.Check(err) + } + kl := int(binary.BigEndian.Uint16(klen[:])) + if cap(key) < kl { + key = make([]byte, 2*int(kl)) // 2 * uint16 will overflow + } + key = key[:kl] + y.Check2(b.keyBuf.Read(key)) + bf.Add(key) + } + + b.finishBlock() // This will never start a new block. + index := b.blockIndex() + b.buf.Write(index) + + // Write bloom filter. + bdata := bf.JSONMarshal() + n, err := b.buf.Write(bdata) + y.Check(err) + var buf [4]byte + binary.BigEndian.PutUint32(buf[:], uint32(n)) + b.buf.Write(buf[:]) + + return b.buf.Bytes() +} diff --git a/vendor/github.com/dgraph-io/badger/table/iterator.go b/vendor/github.com/dgraph-io/badger/table/iterator.go new file mode 100644 index 0000000000..0eb5ed01a9 --- /dev/null +++ b/vendor/github.com/dgraph-io/badger/table/iterator.go @@ -0,0 +1,539 @@ +/* + * Copyright 2017 Dgraph Labs, Inc. and Contributors + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+ * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package table + +import ( + "bytes" + "io" + "math" + "sort" + + "github.com/dgraph-io/badger/y" + "github.com/pkg/errors" +) + +type blockIterator struct { + data []byte + pos uint32 + err error + baseKey []byte + + key []byte + val []byte + init bool + + last header // The last header we saw. +} + +func (itr *blockIterator) Reset() { + itr.pos = 0 + itr.err = nil + itr.baseKey = []byte{} + itr.key = []byte{} + itr.val = []byte{} + itr.init = false + itr.last = header{} +} + +func (itr *blockIterator) Init() { + if !itr.init { + itr.Next() + } +} + +func (itr *blockIterator) Valid() bool { + return itr != nil && itr.err == nil +} + +func (itr *blockIterator) Error() error { + return itr.err +} + +func (itr *blockIterator) Close() {} + +var ( + origin = 0 + current = 1 +) + +// Seek brings us to the first block element that is >= input key. +func (itr *blockIterator) Seek(key []byte, whence int) { + itr.err = nil + + switch whence { + case origin: + itr.Reset() + case current: + } + + var done bool + for itr.Init(); itr.Valid(); itr.Next() { + k := itr.Key() + if y.CompareKeys(k, key) >= 0 { + // We are done as k is >= key. + done = true + break + } + } + if !done { + itr.err = io.EOF + } +} + +func (itr *blockIterator) SeekToFirst() { + itr.err = nil + itr.Init() +} + +// SeekToLast brings us to the last element. Valid should return true. +func (itr *blockIterator) SeekToLast() { + itr.err = nil + for itr.Init(); itr.Valid(); itr.Next() { + } + itr.Prev() +} + +// parseKV would allocate a new byte slice for key and for value. +func (itr *blockIterator) parseKV(h header) { + if cap(itr.key) < int(h.plen+h.klen) { + sz := int(h.plen) + int(h.klen) // Convert to int before adding to avoid uint16 overflow. 
+ itr.key = make([]byte, 2*sz) + } + itr.key = itr.key[:h.plen+h.klen] + copy(itr.key, itr.baseKey[:h.plen]) + copy(itr.key[h.plen:], itr.data[itr.pos:itr.pos+uint32(h.klen)]) + itr.pos += uint32(h.klen) + + if itr.pos+uint32(h.vlen) > uint32(len(itr.data)) { + itr.err = errors.Errorf("Value exceeded size of block: %d %d %d %d %v", + itr.pos, h.klen, h.vlen, len(itr.data), h) + return + } + itr.val = y.SafeCopy(itr.val, itr.data[itr.pos:itr.pos+uint32(h.vlen)]) + itr.pos += uint32(h.vlen) +} + +func (itr *blockIterator) Next() { + itr.init = true + itr.err = nil + if itr.pos >= uint32(len(itr.data)) { + itr.err = io.EOF + return + } + + var h header + itr.pos += uint32(h.Decode(itr.data[itr.pos:])) + itr.last = h // Store the last header. + + if h.klen == 0 && h.plen == 0 { + // Last entry in the table. + itr.err = io.EOF + return + } + + // Populate baseKey if it isn't set yet. This would only happen for the first Next. + if len(itr.baseKey) == 0 { + // This should be the first Next() for this block. Hence, prefix length should be zero. + y.AssertTrue(h.plen == 0) + itr.baseKey = itr.data[itr.pos : itr.pos+uint32(h.klen)] + } + itr.parseKV(h) +} + +func (itr *blockIterator) Prev() { + if !itr.init { + return + } + itr.err = nil + if itr.last.prev == math.MaxUint32 { + // This is the first element of the block! + itr.err = io.EOF + itr.pos = 0 + return + } + + // Move back using current header's prev. + itr.pos = itr.last.prev + + var h header + y.AssertTruef(itr.pos < uint32(len(itr.data)), "%d %d", itr.pos, len(itr.data)) + itr.pos += uint32(h.Decode(itr.data[itr.pos:])) + itr.parseKV(h) + itr.last = h +} + +func (itr *blockIterator) Key() []byte { + if itr.err != nil { + return nil + } + return itr.key +} + +func (itr *blockIterator) Value() []byte { + if itr.err != nil { + return nil + } + return itr.val +} + +// Iterator is an iterator for a Table. 
+type Iterator struct { + t *Table + bpos int + bi *blockIterator + err error + + // Internally, Iterator is bidirectional. However, we only expose the + // unidirectional functionality for now. + reversed bool +} + +// NewIterator returns a new iterator of the Table +func (t *Table) NewIterator(reversed bool) *Iterator { + t.IncrRef() // Important. + ti := &Iterator{t: t, reversed: reversed} + ti.next() + return ti +} + +// Close closes the iterator (and it must be called). +func (itr *Iterator) Close() error { + return itr.t.DecrRef() +} + +func (itr *Iterator) reset() { + itr.bpos = 0 + itr.err = nil +} + +// Valid follows the y.Iterator interface +func (itr *Iterator) Valid() bool { + return itr.err == nil +} + +func (itr *Iterator) seekToFirst() { + numBlocks := len(itr.t.blockIndex) + if numBlocks == 0 { + itr.err = io.EOF + return + } + itr.bpos = 0 + block, err := itr.t.block(itr.bpos) + if err != nil { + itr.err = err + return + } + itr.bi = block.NewIterator() + itr.bi.SeekToFirst() + itr.err = itr.bi.Error() +} + +func (itr *Iterator) seekToLast() { + numBlocks := len(itr.t.blockIndex) + if numBlocks == 0 { + itr.err = io.EOF + return + } + itr.bpos = numBlocks - 1 + block, err := itr.t.block(itr.bpos) + if err != nil { + itr.err = err + return + } + itr.bi = block.NewIterator() + itr.bi.SeekToLast() + itr.err = itr.bi.Error() +} + +func (itr *Iterator) seekHelper(blockIdx int, key []byte) { + itr.bpos = blockIdx + block, err := itr.t.block(blockIdx) + if err != nil { + itr.err = err + return + } + itr.bi = block.NewIterator() + itr.bi.Seek(key, origin) + itr.err = itr.bi.Error() +} + +// seekFrom brings us to a key that is >= input key. 
+func (itr *Iterator) seekFrom(key []byte, whence int) { + itr.err = nil + switch whence { + case origin: + itr.reset() + case current: + } + + idx := sort.Search(len(itr.t.blockIndex), func(idx int) bool { + ko := itr.t.blockIndex[idx] + return y.CompareKeys(ko.key, key) > 0 + }) + if idx == 0 { + // The smallest key in our table is already strictly > key. We can return that. + // This is like a SeekToFirst. + itr.seekHelper(0, key) + return + } + + // block[idx].smallest is > key. + // Since idx>0, we know block[idx-1].smallest is <= key. + // There are two cases. + // 1) Everything in block[idx-1] is strictly < key. In this case, we should go to the first + // element of block[idx]. + // 2) Some element in block[idx-1] is >= key. We should go to that element. + itr.seekHelper(idx-1, key) + if itr.err == io.EOF { + // Case 1. Need to visit block[idx]. + if idx == len(itr.t.blockIndex) { + // If idx == len(itr.t.blockIndex), then input key is greater than ANY element of table. + // There's nothing we can do. Valid() should return false as we seek to end of table. + return + } + // Since block[idx].smallest is > key. This is essentially a block[idx].SeekToFirst. + itr.seekHelper(idx, key) + } + // Case 2: No need to do anything. We already did the seek in block[idx-1]. +} + +// seek will reset iterator and seek to >= key. +func (itr *Iterator) seek(key []byte) { + itr.seekFrom(key, origin) +} + +// seekForPrev will reset iterator and seek to <= key. +func (itr *Iterator) seekForPrev(key []byte) { + // TODO: Optimize this. We shouldn't have to take a Prev step. 
+ itr.seekFrom(key, origin) + if !bytes.Equal(itr.Key(), key) { + itr.prev() + } +} + +func (itr *Iterator) next() { + itr.err = nil + + if itr.bpos >= len(itr.t.blockIndex) { + itr.err = io.EOF + return + } + + if itr.bi == nil { + block, err := itr.t.block(itr.bpos) + if err != nil { + itr.err = err + return + } + itr.bi = block.NewIterator() + itr.bi.SeekToFirst() + itr.err = itr.bi.Error() + return + } + + itr.bi.Next() + if !itr.bi.Valid() { + itr.bpos++ + itr.bi = nil + itr.next() + return + } +} + +func (itr *Iterator) prev() { + itr.err = nil + if itr.bpos < 0 { + itr.err = io.EOF + return + } + + if itr.bi == nil { + block, err := itr.t.block(itr.bpos) + if err != nil { + itr.err = err + return + } + itr.bi = block.NewIterator() + itr.bi.SeekToLast() + itr.err = itr.bi.Error() + return + } + + itr.bi.Prev() + if !itr.bi.Valid() { + itr.bpos-- + itr.bi = nil + itr.prev() + return + } +} + +// Key follows the y.Iterator interface +func (itr *Iterator) Key() []byte { + return itr.bi.Key() +} + +// Value follows the y.Iterator interface +func (itr *Iterator) Value() (ret y.ValueStruct) { + ret.Decode(itr.bi.Value()) + return +} + +// Next follows the y.Iterator interface +func (itr *Iterator) Next() { + if !itr.reversed { + itr.next() + } else { + itr.prev() + } +} + +// Rewind follows the y.Iterator interface +func (itr *Iterator) Rewind() { + if !itr.reversed { + itr.seekToFirst() + } else { + itr.seekToLast() + } +} + +// Seek follows the y.Iterator interface +func (itr *Iterator) Seek(key []byte) { + if !itr.reversed { + itr.seek(key) + } else { + itr.seekForPrev(key) + } +} + +// ConcatIterator concatenates the sequences defined by several iterators. (It only works with +// TableIterators, probably just because it's faster to not be so generic.) +type ConcatIterator struct { + idx int // Which iterator is active now. + cur *Iterator + iters []*Iterator // Corresponds to tables. + tables []*Table // Disregarding reversed, this is in ascending order. 
+ reversed bool +} + +// NewConcatIterator creates a new concatenated iterator +func NewConcatIterator(tbls []*Table, reversed bool) *ConcatIterator { + iters := make([]*Iterator, len(tbls)) + for i := 0; i < len(tbls); i++ { + iters[i] = tbls[i].NewIterator(reversed) + } + return &ConcatIterator{ + reversed: reversed, + iters: iters, + tables: tbls, + idx: -1, // Not really necessary because s.it.Valid()=false, but good to have. + } +} + +func (s *ConcatIterator) setIdx(idx int) { + s.idx = idx + if idx < 0 || idx >= len(s.iters) { + s.cur = nil + } else { + s.cur = s.iters[s.idx] + } +} + +// Rewind implements y.Interface +func (s *ConcatIterator) Rewind() { + if len(s.iters) == 0 { + return + } + if !s.reversed { + s.setIdx(0) + } else { + s.setIdx(len(s.iters) - 1) + } + s.cur.Rewind() +} + +// Valid implements y.Interface +func (s *ConcatIterator) Valid() bool { + return s.cur != nil && s.cur.Valid() +} + +// Key implements y.Interface +func (s *ConcatIterator) Key() []byte { + return s.cur.Key() +} + +// Value implements y.Interface +func (s *ConcatIterator) Value() y.ValueStruct { + return s.cur.Value() +} + +// Seek brings us to element >= key if reversed is false. Otherwise, <= key. +func (s *ConcatIterator) Seek(key []byte) { + var idx int + if !s.reversed { + idx = sort.Search(len(s.tables), func(i int) bool { + return y.CompareKeys(s.tables[i].Biggest(), key) >= 0 + }) + } else { + n := len(s.tables) + idx = n - 1 - sort.Search(n, func(i int) bool { + return y.CompareKeys(s.tables[n-1-i].Smallest(), key) <= 0 + }) + } + if idx >= len(s.tables) || idx < 0 { + s.setIdx(-1) + return + } + // For reversed=false, we know s.tables[i-1].Biggest() < key. Thus, the + // previous table cannot possibly contain key. + s.setIdx(idx) + s.cur.Seek(key) +} + +// Next advances our concat iterator. +func (s *ConcatIterator) Next() { + s.cur.Next() + if s.cur.Valid() { + // Nothing to do. Just stay with the current table. 
+ return + } + for { // In case there are empty tables. + if !s.reversed { + s.setIdx(s.idx + 1) + } else { + s.setIdx(s.idx - 1) + } + if s.cur == nil { + // End of list. Valid will become false. + return + } + s.cur.Rewind() + if s.cur.Valid() { + break + } + } +} + +// Close implements y.Interface. +func (s *ConcatIterator) Close() error { + for _, it := range s.iters { + if err := it.Close(); err != nil { + return errors.Wrap(err, "ConcatIterator") + } + } + return nil +} diff --git a/vendor/github.com/dgraph-io/badger/table/table.go b/vendor/github.com/dgraph-io/badger/table/table.go new file mode 100644 index 0000000000..0a1f42d464 --- /dev/null +++ b/vendor/github.com/dgraph-io/badger/table/table.go @@ -0,0 +1,360 @@ +/* + * Copyright 2017 Dgraph Labs, Inc. and Contributors + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package table + +import ( + "bytes" + "crypto/sha256" + "encoding/binary" + "fmt" + "io" + "os" + "path" + "path/filepath" + "strconv" + "strings" + "sync" + "sync/atomic" + + "github.com/AndreasBriese/bbloom" + "github.com/dgraph-io/badger/options" + "github.com/dgraph-io/badger/y" + "github.com/pkg/errors" +) + +const fileSuffix = ".sst" + +type keyOffset struct { + key []byte + offset int + len int +} + +// TableInterface is useful for testing. 
+type TableInterface interface { + Smallest() []byte + Biggest() []byte + DoesNotHave(key []byte) bool +} + +// Table represents a loaded table file with the info we have about it +type Table struct { + sync.Mutex + + fd *os.File // Own fd. + tableSize int // Initialized in OpenTable, using fd.Stat(). + + blockIndex []keyOffset + ref int32 // For file garbage collection. Atomic. + + loadingMode options.FileLoadingMode + mmap []byte // Memory mapped. + + // The following are initialized once and const. + smallest, biggest []byte // Smallest and largest keys. + id uint64 // file id, part of filename + + bf bbloom.Bloom + + Checksum []byte +} + +// IncrRef increments the refcount (having to do with whether the file should be deleted) +func (t *Table) IncrRef() { + atomic.AddInt32(&t.ref, 1) +} + +// DecrRef decrements the refcount and possibly deletes the table +func (t *Table) DecrRef() error { + newRef := atomic.AddInt32(&t.ref, -1) + if newRef == 0 { + // We can safely delete this file, because for all the current files, we always have + // at least one reference pointing to them. + + // It's necessary to delete windows files + if t.loadingMode == options.MemoryMap { + if err := y.Munmap(t.mmap); err != nil { + return err + } + } + if err := t.fd.Truncate(0); err != nil { + // This is very important to let the FS know that the file is deleted. + return err + } + filename := t.fd.Name() + if err := t.fd.Close(); err != nil { + return err + } + if err := os.Remove(filename); err != nil { + return err + } + } + return nil +} + +type block struct { + offset int + data []byte +} + +func (b block) NewIterator() *blockIterator { + return &blockIterator{data: b.data} +} + +// OpenTable assumes file has only one table and opens it. Takes ownership of fd upon function +// entry. Returns a table with one reference count on it (decrementing which may delete the file! +// -- consider t.Close() instead). The fd has to writeable because we call Truncate on it before +// deleting. 
+func OpenTable(fd *os.File, mode options.FileLoadingMode, cksum []byte) (*Table, error) { + fileInfo, err := fd.Stat() + if err != nil { + // It's OK to ignore fd.Close() errs in this function because we have only read + // from the file. + _ = fd.Close() + return nil, y.Wrap(err) + } + + filename := fileInfo.Name() + id, ok := ParseFileID(filename) + if !ok { + _ = fd.Close() + return nil, errors.Errorf("Invalid filename: %s", filename) + } + t := &Table{ + fd: fd, + ref: 1, // Caller is given one reference. + id: id, + loadingMode: mode, + } + + t.tableSize = int(fileInfo.Size()) + + // We first load to RAM, so we can read the index and do checksum. + if err := t.loadToRAM(); err != nil { + return nil, err + } + // Enforce checksum before we read index. Otherwise, if the file was + // truncated, we'd end up with panics in readIndex. + if len(cksum) > 0 && !bytes.Equal(t.Checksum, cksum) { + return nil, fmt.Errorf( + "CHECKSUM_MISMATCH: Table checksum does not match checksum in MANIFEST."+ + " NOT including table %s. This would lead to missing data."+ + "\n sha256 %x Expected\n sha256 %x Found\n", filename, cksum, t.Checksum) + } + if err := t.readIndex(); err != nil { + return nil, y.Wrap(err) + } + + it := t.NewIterator(false) + defer it.Close() + it.Rewind() + if it.Valid() { + t.smallest = it.Key() + } + + it2 := t.NewIterator(true) + defer it2.Close() + it2.Rewind() + if it2.Valid() { + t.biggest = it2.Key() + } + + switch mode { + case options.LoadToRAM: + // No need to do anything. t.mmap is already filled. + case options.MemoryMap: + t.mmap, err = y.Mmap(fd, false, fileInfo.Size()) + if err != nil { + _ = fd.Close() + return nil, y.Wrapf(err, "Unable to map file: %q", fileInfo.Name()) + } + case options.FileIO: + t.mmap = nil + default: + panic(fmt.Sprintf("Invalid loading mode: %v", mode)) + } + return t, nil +} + +// Close closes the open table. (Releases resources back to the OS.) 
+func (t *Table) Close() error { + if t.loadingMode == options.MemoryMap { + if err := y.Munmap(t.mmap); err != nil { + return err + } + } + + return t.fd.Close() +} + +func (t *Table) read(off, sz int) ([]byte, error) { + if len(t.mmap) > 0 { + if len(t.mmap[off:]) < sz { + return nil, y.ErrEOF + } + return t.mmap[off : off+sz], nil + } + + res := make([]byte, sz) + nbr, err := t.fd.ReadAt(res, int64(off)) + y.NumReads.Add(1) + y.NumBytesRead.Add(int64(nbr)) + return res, err +} + +func (t *Table) readNoFail(off, sz int) []byte { + res, err := t.read(off, sz) + y.Check(err) + return res +} + +func (t *Table) readIndex() error { + if len(t.mmap) != t.tableSize { + panic("Table size does not match the read bytes") + } + readPos := t.tableSize + + // Read bloom filter. + readPos -= 4 + buf := t.readNoFail(readPos, 4) + bloomLen := int(binary.BigEndian.Uint32(buf)) + readPos -= bloomLen + data := t.readNoFail(readPos, bloomLen) + t.bf = bbloom.JSONUnmarshal(data) + + readPos -= 4 + buf = t.readNoFail(readPos, 4) + restartsLen := int(binary.BigEndian.Uint32(buf)) + + readPos -= 4 * restartsLen + buf = t.readNoFail(readPos, 4*restartsLen) + + offsets := make([]int, restartsLen) + for i := 0; i < restartsLen; i++ { + offsets[i] = int(binary.BigEndian.Uint32(buf[:4])) + buf = buf[4:] + } + + // The last offset stores the end of the last block. + for i := 0; i < len(offsets); i++ { + var o int + if i == 0 { + o = 0 + } else { + o = offsets[i-1] + } + + ko := keyOffset{ + offset: o, + len: offsets[i] - o, + } + t.blockIndex = append(t.blockIndex, ko) + } + + // Execute this index read serially, because we already have table data in memory. + var h header + for idx := range t.blockIndex { + ko := &t.blockIndex[idx] + + hbuf := t.readNoFail(ko.offset, h.Size()) + h.Decode(hbuf) + y.AssertTrue(h.plen == 0) + + key := t.readNoFail(ko.offset+len(hbuf), int(h.klen)) + ko.key = append([]byte{}, key...) 
+ } + + return nil +} + +func (t *Table) block(idx int) (block, error) { + y.AssertTruef(idx >= 0, "idx=%d", idx) + if idx >= len(t.blockIndex) { + return block{}, errors.New("block out of index") + } + + ko := t.blockIndex[idx] + blk := block{ + offset: ko.offset, + } + var err error + blk.data, err = t.read(blk.offset, ko.len) + return blk, err +} + +// Size is its file size in bytes +func (t *Table) Size() int64 { return int64(t.tableSize) } + +// Smallest is its smallest key, or nil if there are none +func (t *Table) Smallest() []byte { return t.smallest } + +// Biggest is its biggest key, or nil if there are none +func (t *Table) Biggest() []byte { return t.biggest } + +// Filename is NOT the file name. Just kidding, it is. +func (t *Table) Filename() string { return t.fd.Name() } + +// ID is the table's ID number (used to make the file name). +func (t *Table) ID() uint64 { return t.id } + +// DoesNotHave returns true if (but not "only if") the table does not have the key. It does a +// bloom filter lookup. +func (t *Table) DoesNotHave(key []byte) bool { return !t.bf.Has(key) } + +// ParseFileID reads the file id out of a filename. +func ParseFileID(name string) (uint64, bool) { + name = path.Base(name) + if !strings.HasSuffix(name, fileSuffix) { + return 0, false + } + // suffix := name[len(fileSuffix):] + name = strings.TrimSuffix(name, fileSuffix) + id, err := strconv.Atoi(name) + if err != nil { + return 0, false + } + y.AssertTrue(id >= 0) + return uint64(id), true +} + +// IDToFilename does the inverse of ParseFileID +func IDToFilename(id uint64) string { + return fmt.Sprintf("%06d", id) + fileSuffix +} + +// NewFilename should be named TableFilepath -- it combines the dir with the ID to make a table +// filepath. 
+func NewFilename(id uint64, dir string) string {
+	return filepath.Join(dir, IDToFilename(id))
+}
+
+// loadToRAM reads t.tableSize bytes from the start of the table file into
+// t.mmap and stores the SHA-256 digest of those bytes in t.Checksum.
+//
+// It returns an error if seeking fails or if the file cannot supply
+// t.tableSize bytes.
+func (t *Table) loadToRAM() error {
+	if _, err := t.fd.Seek(0, io.SeekStart); err != nil {
+		return err
+	}
+	t.mmap = make([]byte, t.tableSize)
+	sum := sha256.New()
+	tee := io.TeeReader(t.fd, sum)
+	// A single Read may return fewer bytes than requested without an error.
+	// io.ReadFull keeps reading until the buffer is full and reports a short
+	// file as an error, so a partial load can never be mistaken for success.
+	read, err := io.ReadFull(tee, t.mmap)
+	if err != nil {
+		return y.Wrapf(err, "Unable to load file in memory. Table file: %s", t.Filename())
+	}
+	t.Checksum = sum.Sum(nil)
+	y.NumReads.Add(1)
+	y.NumBytesRead.Add(int64(read))
+	return nil
+}
diff --git a/vendor/github.com/dgraph-io/badger/test.sh b/vendor/github.com/dgraph-io/badger/test.sh
new file mode 100644
index 0000000000..5b14bfd8f5
--- /dev/null
+++ b/vendor/github.com/dgraph-io/badger/test.sh
@@ -0,0 +1,31 @@
+#!/bin/bash
+
+set -e
+
+# Ensure that we can compile the binary.
+pushd badger
+go build -v .
+popd
+
+# Run the memory intensive tests first.
+go test -v --manual=true -run='TestBigKeyValuePairs$'
+go test -v --manual=true -run='TestPushValueLogLimit'
+
+# Run the special Truncate test.
+rm -rf p
+go test -v --manual=true -run='TestTruncateVlogNoClose$' .
+truncate --size=4096 p/000000.vlog
+go test -v --manual=true -run='TestTruncateVlogNoClose2$' .
+go test -v --manual=true -run='TestTruncateVlogNoClose3$' .
+rm -rf p
+
+# Then the normal tests.
+echo
+echo "==> Starting tests with value log mmapped..."
+sleep 5
+go test -v --vlog_mmap=true -race ./...
+
+echo
+echo "==> Starting tests with value log not mmapped..."
+sleep 5
+go test -v --vlog_mmap=false -race ./...
diff --git a/vendor/github.com/dgraph-io/badger/txn.go b/vendor/github.com/dgraph-io/badger/txn.go
new file mode 100644
index 0000000000..67411a8f57
--- /dev/null
+++ b/vendor/github.com/dgraph-io/badger/txn.go
@@ -0,0 +1,701 @@
+/*
+ * Copyright 2017 Dgraph Labs, Inc. and Contributors
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *    http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package badger
+
+import (
+	"bytes"
+	"context"
+	"encoding/hex"
+	"math"
+	"sort"
+	"strconv"
+	"sync"
+	"sync/atomic"
+
+	"github.com/dgraph-io/badger/y"
+	farm "github.com/dgryski/go-farm"
+	"github.com/pkg/errors"
+)
+
+// oracle hands out transaction timestamps and remembers, per key fingerprint,
+// the latest commit timestamp so that conflicting commits can be detected.
+type oracle struct {
+	// A 64-bit integer must be at the top for memory alignment. See issue #311.
+	refCount  int64
+	isManaged bool // Does not change value, so no locking required.
+
+	sync.Mutex // For nextTxnTs and commits.
+	// writeChLock lock is for ensuring that transactions go to the write
+	// channel in the same order as their commit timestamps.
+	writeChLock sync.Mutex
+	nextTxnTs   uint64
+
+	// Used to block NewTransaction, so all previous commits are visible to a new read.
+	txnMark *y.WaterMark
+
+	// Either of these is used to determine which versions can be permanently
+	// discarded during compaction.
+	discardTs uint64       // Used by ManagedDB.
+	readMark  *y.WaterMark // Used by DB.
+
+	// commits stores a key fingerprint and latest commit counter for it.
+	// refCount is used to clear out commits map to avoid a memory blowup.
+	commits map[uint64]uint64
+
+	// closer is used to stop watermarks.
+	closer *y.Closer
+}
+
+// newOracle builds an oracle for the given options and initializes both
+// watermarks with a shared closer.
+func newOracle(opt Options) *oracle {
+	orc := &oracle{
+		isManaged: opt.managedTxns,
+		commits:   make(map[uint64]uint64),
+		// We're not initializing nextTxnTs and readOnlyTs. It would be done after replay in Open.
+		//
+		// WaterMarks must be 64-bit aligned for atomic package, hence we must use pointers here.
+		// See https://golang.org/pkg/sync/atomic/#pkg-note-BUG.
+		readMark: &y.WaterMark{Name: "badger.PendingReads"},
+		txnMark:  &y.WaterMark{Name: "badger.TxnTimestamp"},
+		closer:   y.NewCloser(2),
+	}
+	orc.readMark.Init(orc.closer)
+	orc.txnMark.Init(orc.closer)
+	return orc
+}
+
+// Stop signals the oracle's closer and waits on it.
+func (o *oracle) Stop() {
+	o.closer.SignalAndWait()
+}
+
+// addRef atomically increments the oracle's reference count.
+func (o *oracle) addRef() {
+	atomic.AddInt64(&o.refCount, 1)
+}
+
+// decrRef atomically decrements the reference count. When it reaches zero and
+// the commits map holds at least 1000 entries, the map is replaced with an
+// empty one.
+func (o *oracle) decrRef() {
+	if atomic.AddInt64(&o.refCount, -1) != 0 {
+		return
+	}
+
+	// Clear out commits maps to release memory.
+	o.Lock()
+	defer o.Unlock()
+	// Avoids the race where something new is added to commitsMap
+	// after we check refCount and before we take Lock.
+	if atomic.LoadInt64(&o.refCount) != 0 {
+		return
+	}
+	if len(o.commits) >= 1000 { // If the map is still small, let it slide.
+		o.commits = make(map[uint64]uint64)
+	}
+}
+
+// readTs returns nextTxnTs-1 as the read timestamp, registers it with
+// readMark, and waits on txnMark for that timestamp before returning.
+// It panics when the oracle is in managed mode.
+func (o *oracle) readTs() uint64 {
+	if o.isManaged {
+		panic("ReadTs should not be retrieved for managed DB")
+	}
+
+	var readTs uint64
+	o.Lock()
+	readTs = o.nextTxnTs - 1
+	o.readMark.Begin(readTs)
+	o.Unlock()
+
+	// Wait for all txns which have no conflicts, have been assigned a commit
+	// timestamp and are going through the write to value log and LSM tree
+	// process. Not waiting here could mean that some txns which have been
+	// committed would not be read.
+	y.Check(o.txnMark.WaitForMark(context.Background(), readTs))
+	return readTs
+}
+
+// nextTs returns the current value of nextTxnTs under the oracle lock.
+func (o *oracle) nextTs() uint64 {
+	o.Lock()
+	defer o.Unlock()
+	return o.nextTxnTs
+}
+
+// incrementNextTs increments nextTxnTs by one under the oracle lock.
+func (o *oracle) incrementNextTs() {
+	o.Lock()
+	defer o.Unlock()
+	o.nextTxnTs++
+}
+
+// Any deleted or invalid versions at or below ts would be discarded during
+// compaction to reclaim disk space in LSM tree and thence value log.
+func (o *oracle) setDiscardTs(ts uint64) {
+	o.Lock()
+	defer o.Unlock()
+	o.discardTs = ts
+}
+
+// discardAtOrBelow returns discardTs (read under the lock) in managed mode,
+// and readMark.DoneUntil() otherwise.
+func (o *oracle) discardAtOrBelow() uint64 {
+	if o.isManaged {
+		o.Lock()
+		defer o.Unlock()
+		return o.discardTs
+	}
+	return o.readMark.DoneUntil()
+}
+
+// hasConflict must be called while having a lock.
+func (o *oracle) hasConflict(txn *Txn) bool {
+	if len(txn.reads) == 0 {
+		return false
+	}
+	for _, ro := range txn.reads {
+		// A commit at the read timestamp is expected.
+		// But, any commit after the read timestamp should cause a conflict.
+		if ts, has := o.commits[ro]; has && ts > txn.readTs {
+			return true
+		}
+	}
+	return false
+}
+
+// newCommitTs returns the commit timestamp for txn, or 0 when txn conflicts
+// with an already recorded commit. On success every written key fingerprint
+// is recorded in the commits map with the returned timestamp.
+func (o *oracle) newCommitTs(txn *Txn) uint64 {
+	o.Lock()
+	defer o.Unlock()
+
+	if o.hasConflict(txn) {
+		return 0
+	}
+
+	var ts uint64
+	if !o.isManaged {
+		// This is the general case, when user doesn't specify the read and commit ts.
+		ts = o.nextTxnTs
+		o.nextTxnTs++
+		o.txnMark.Begin(ts)
+
+	} else {
+		// If commitTs is set, use it instead.
+		ts = txn.commitTs
+	}
+
+	for _, w := range txn.writes {
+		o.commits[w] = ts // Update the commitTs.
+	}
+	return ts
+}
+
+// doneCommit marks cts as done on txnMark; it is a no-op in managed mode.
+func (o *oracle) doneCommit(cts uint64) {
+	if o.isManaged {
+		// No need to update anything.
+		return
+	}
+	o.txnMark.Done(cts)
+}
+
+// Txn represents a Badger transaction.
+type Txn struct {
+	readTs   uint64
+	commitTs uint64
+
+	update bool     // update is used to conditionally keep track of reads.
+	reads  []uint64 // contains fingerprints of keys read.
+	writes []uint64 // contains fingerprints of keys written.
+
+	pendingWrites map[string]*Entry // cache stores any writes done by txn.
+
+	db        *DB
+	discarded bool
+
+	size         int64
+	count        int64
+	numIterators int32
+}
+
+// pendingWritesIterator walks a sorted snapshot of a transaction's pending
+// writes; nextIdx is the current position in entries.
+type pendingWritesIterator struct {
+	entries  []*Entry
+	nextIdx  int
+	readTs   uint64
+	reversed bool
+}
+
+func (pi *pendingWritesIterator) Next() {
+	pi.nextIdx++
+}
+
+func (pi *pendingWritesIterator) Rewind() {
+	pi.nextIdx = 0
+}
+
+// Seek positions the iterator at the first entry whose key is >= key
+// (<= key when reversed), after passing key through y.ParseKey.
+func (pi *pendingWritesIterator) Seek(key []byte) {
+	key = y.ParseKey(key)
+	pi.nextIdx = sort.Search(len(pi.entries), func(idx int) bool {
+		cmp := bytes.Compare(pi.entries[idx].Key, key)
+		if !pi.reversed {
+			return cmp >= 0
+		}
+		return cmp <= 0
+	})
+}
+
+func (pi *pendingWritesIterator) Key() []byte {
+	y.AssertTrue(pi.Valid())
+	entry := pi.entries[pi.nextIdx]
+	return y.KeyWithTs(entry.Key, pi.readTs)
+}
+
+func (pi *pendingWritesIterator) Value() y.ValueStruct {
+	y.AssertTrue(pi.Valid())
+	entry := pi.entries[pi.nextIdx]
+	return y.ValueStruct{
+		Value:     entry.Value,
+		Meta:      entry.meta,
+		UserMeta:  entry.UserMeta,
+		ExpiresAt: entry.ExpiresAt,
+		Version:   pi.readTs,
+	}
+}
+
+func (pi *pendingWritesIterator) Valid() bool {
+	return pi.nextIdx < len(pi.entries)
+}
+
+func (pi *pendingWritesIterator) Close() error {
+	return nil
+}
+
+// newPendingWritesIterator returns an iterator over the transaction's pending
+// writes sorted by key (descending when reversed). It returns nil for
+// read-only transactions and when there are no pending writes.
+func (txn *Txn) newPendingWritesIterator(reversed bool) *pendingWritesIterator {
+	if !txn.update || len(txn.pendingWrites) == 0 {
+		return nil
+	}
+	entries := make([]*Entry, 0, len(txn.pendingWrites))
+	for _, e := range txn.pendingWrites {
+		entries = append(entries, e)
+	}
+	// Number of pending writes per transaction shouldn't be too big in general.
+	sort.Slice(entries, func(i, j int) bool {
+		cmp := bytes.Compare(entries[i].Key, entries[j].Key)
+		if !reversed {
+			return cmp < 0
+		}
+		return cmp > 0
+	})
+	return &pendingWritesIterator{
+		readTs:   txn.readTs,
+		entries:  entries,
+		reversed: reversed,
+	}
+}
+
+// checkSize accounts for e in the transaction's running count and size. It
+// returns ErrTxnTooBig, leaving the counters untouched, when adding e would
+// reach the configured batch count or batch size limit.
+func (txn *Txn) checkSize(e *Entry) error {
+	count := txn.count + 1
+	// Extra bytes for version in key.
+	size := txn.size + int64(e.estimateSize(txn.db.opt.ValueThreshold)) + 10
+	if count >= txn.db.opt.maxBatchCount || size >= txn.db.opt.maxBatchSize {
+		return ErrTxnTooBig
+	}
+	txn.count, txn.size = count, size
+	return nil
+}
+
+// exceedsSize builds the error reported when a key or value (named by prefix)
+// is larger than max. The message includes a hex dump of at most the first
+// 1024 bytes of key.
+func exceedsSize(prefix string, max int64, key []byte) error {
+	// Clamp the dumped prefix to the data actually present: slicing a short
+	// input to 1<<10 would panic (or expose bytes beyond len).
+	dump := key
+	if len(dump) > 1<<10 {
+		dump = dump[:1<<10]
+	}
+	return errors.Errorf("%s with size %d exceeded %d limit. %s:\n%s",
+		prefix, len(key), max, prefix, hex.Dump(dump))
+}
+
+// modify validates e and records it as a pending write of the transaction.
+// It rejects read-only or discarded transactions, empty keys, keys carrying
+// badgerPrefix, oversized keys or values, and entries that would make the
+// transaction too big.
+func (txn *Txn) modify(e *Entry) error {
+	const maxKeySize = 65000
+
+	switch {
+	case !txn.update:
+		return ErrReadOnlyTxn
+	case txn.discarded:
+		return ErrDiscardedTxn
+	case len(e.Key) == 0:
+		return ErrEmptyKey
+	case bytes.HasPrefix(e.Key, badgerPrefix):
+		return ErrInvalidKey
+	case len(e.Key) > maxKeySize:
+		// Key length can't be more than uint16, as determined by table::header. To
+		// keep things safe and allow badger move prefix and a timestamp suffix, let's
+		// cut it down to 65000, instead of using 65536.
+		return exceedsSize("Key", maxKeySize, e.Key)
+	case int64(len(e.Value)) > txn.db.opt.ValueLogFileSize:
+		return exceedsSize("Value", txn.db.opt.ValueLogFileSize, e.Value)
+	}
+
+	if err := txn.checkSize(e); err != nil {
+		return err
+	}
+	fp := farm.Fingerprint64(e.Key) // Avoid dealing with byte arrays.
+	txn.writes = append(txn.writes, fp)
+	txn.pendingWrites[string(e.Key)] = e
+	return nil
+}
+
+// Set adds a key-value pair to the database.
+// It will return ErrReadOnlyTxn if update flag was set to false when creating the transaction.
+//
+// The current transaction keeps a reference to the key and val byte slice
+// arguments. Users must not modify key and val until the end of the transaction.
+func (txn *Txn) Set(key, val []byte) error {
+	return txn.SetEntry(NewEntry(key, val))
+}
+
+// SetEntry takes an Entry struct and adds the key-value pair in the struct,
+// along with other metadata to the database.
+//
+// The current transaction keeps a reference to the entry passed in argument.
+// Users must not modify the entry until the end of the transaction.
+func (txn *Txn) SetEntry(e *Entry) error {
+	return txn.modify(e)
+}
+
+// Delete deletes a key.
+//
+// This is done by adding a delete marker for the key at commit timestamp. Any
+// reads happening before this timestamp would be unaffected. Any reads after
+// this commit would see the deletion.
+//
+// The current transaction keeps a reference to the key byte slice argument.
+// Users must not modify the key until the end of the transaction.
+func (txn *Txn) Delete(key []byte) error {
+	e := &Entry{
+		Key:  key,
+		meta: bitDelete,
+	}
+	return txn.modify(e)
+}
+
+// Get looks for key and returns corresponding Item.
+// If key is not found, ErrKeyNotFound is returned.
+func (txn *Txn) Get(key []byte) (item *Item, rerr error) {
+	if len(key) == 0 {
+		return nil, ErrEmptyKey
+	} else if txn.discarded {
+		return nil, ErrDiscardedTxn
+	}
+
+	item = new(Item)
+	if txn.update {
+		if e, has := txn.pendingWrites[string(key)]; has && bytes.Equal(key, e.Key) {
+			if isDeletedOrExpired(e.meta, e.ExpiresAt) {
+				return nil, ErrKeyNotFound
+			}
+			// Fulfill from cache.
+			item.meta = e.meta
+			item.val = e.Value
+			item.userMeta = e.UserMeta
+			item.key = key
+			item.status = prefetched
+			item.version = txn.readTs
+			item.expiresAt = e.ExpiresAt
+			// We probably don't need to set db on item here.
+			return item, nil
+		}
+		// Only track reads if this is update txn. No need to track read if txn serviced it
+		// internally.
+		txn.addReadKey(key)
+	}
+
+	seek := y.KeyWithTs(key, txn.readTs)
+	vs, err := txn.db.get(seek)
+	if err != nil {
+		return nil, errors.Wrapf(err, "DB::Get key: %q", key)
+	}
+	if vs.Value == nil && vs.Meta == 0 {
+		return nil, ErrKeyNotFound
+	}
+	if isDeletedOrExpired(vs.Meta, vs.ExpiresAt) {
+		return nil, ErrKeyNotFound
+	}
+
+	item.key = key
+	item.version = vs.Version
+	item.meta = vs.Meta
+	item.userMeta = vs.UserMeta
+	item.db = txn.db
+	item.vptr = vs.Value // TODO: Do we need to copy this over?
+	item.txn = txn
+	item.expiresAt = vs.ExpiresAt
+	return item, nil
+}
+
+// addReadKey records the fingerprint of key in the read set of an update
+// transaction; it does nothing for read-only transactions.
+func (txn *Txn) addReadKey(key []byte) {
+	if txn.update {
+		fp := farm.Fingerprint64(key)
+		txn.reads = append(txn.reads, fp)
+	}
+}
+
+// Discard discards a created transaction. This method is very important and must be called. Commit
+// method calls this internally, however, calling this multiple times doesn't cause any issues. So,
+// this can safely be called via a defer right when transaction is created.
+//
+// NOTE: If any operations are run on a discarded transaction, ErrDiscardedTxn is returned.
+func (txn *Txn) Discard() {
+	if txn.discarded { // Avoid a re-run.
+		return
+	}
+	if atomic.LoadInt32(&txn.numIterators) > 0 {
+		panic("Unclosed iterator at time of Txn.Discard.")
+	}
+	txn.discarded = true
+	if !txn.db.orc.isManaged {
+		txn.db.orc.readMark.Done(txn.readTs)
+	}
+	if txn.update {
+		txn.db.orc.decrRef()
+	}
+}
+
+// commitAndSend obtains a commit timestamp, stamps every pending write with
+// it, appends a txn-end entry and hands the batch to the write channel. It
+// returns a function that waits for the write to finish, or ErrConflict when
+// no commit timestamp could be assigned.
+func (txn *Txn) commitAndSend() (func() error, error) {
+	orc := txn.db.orc
+	// Ensure that the order in which we get the commit timestamp is the same as
+	// the order in which we push these updates to the write channel. So, we
+	// acquire a writeChLock before getting a commit timestamp, and only release
+	// it after pushing the entries to it.
+	orc.writeChLock.Lock()
+	defer orc.writeChLock.Unlock()
+
+	commitTs := orc.newCommitTs(txn)
+	if commitTs == 0 {
+		return nil, ErrConflict
+	}
+
+	// The following debug information is what led to determining the cause of
+	// bank txn violation bug, and it took a whole bunch of effort to narrow it
+	// down to here. So, keep this around for at least a couple of months.
+	// var b strings.Builder
+	// fmt.Fprintf(&b, "Read: %d. Commit: %d. reads: %v. writes: %v. Keys: ",
+	// 	txn.readTs, commitTs, txn.reads, txn.writes)
+	entries := make([]*Entry, 0, len(txn.pendingWrites)+1)
+	for _, e := range txn.pendingWrites {
+		// fmt.Fprintf(&b, "[%q : %q], ", e.Key, e.Value)
+
+		// Suffix the keys with commit ts, so the key versions are sorted in
+		// descending order of commit timestamp.
+		e.Key = y.KeyWithTs(e.Key, commitTs)
+		e.meta |= bitTxn
+		entries = append(entries, e)
+	}
+	// log.Printf("%s\n", b.String())
+	e := &Entry{
+		Key:   y.KeyWithTs(txnKey, commitTs),
+		Value: []byte(strconv.FormatUint(commitTs, 10)),
+		meta:  bitFinTxn,
+	}
+	entries = append(entries, e)
+
+	req, err := txn.db.sendToWriteCh(entries)
+	if err != nil {
+		orc.doneCommit(commitTs)
+		return nil, err
+	}
+	ret := func() error {
+		err := req.Wait()
+		// Wait before marking commitTs as done.
+		// We can't defer doneCommit above, because it is being called from a
+		// callback here.
+		orc.doneCommit(commitTs)
+		return err
+	}
+	return ret, nil
+}
+
+// commitPrecheck panics when Commit is used on a managed DB without a commit
+// timestamp, or when the transaction has already been discarded.
+func (txn *Txn) commitPrecheck() {
+	if txn.commitTs == 0 && txn.db.opt.managedTxns {
+		panic("Commit cannot be called with managedDB=true. Use CommitAt.")
+	}
+	if txn.discarded {
+		panic("Trying to commit a discarded txn")
+	}
+}
+
+// Commit commits the transaction, following these steps:
+//
+// 1. If there are no writes, return immediately.
+//
+// 2. Check if read rows were updated since txn started. If so, return ErrConflict.
+//
+// 3. If no conflict, generate a commit timestamp and update written rows' commit ts.
+//
+// 4. Batch up all writes, write them to value log and LSM tree.
+//
+// 5. If callback is provided, Badger will return immediately after checking
+// for conflicts. Writes to the database will happen in the background. If
+// there is a conflict, an error will be returned and the callback will not
+// run. If there are no conflicts, the callback will be called in the
+// background upon successful completion of writes or any error during write.
+//
+// If error is nil, the transaction is successfully committed. In case of a non-nil error, the LSM
+// tree won't be updated, so there's no need for any rollback.
+func (txn *Txn) Commit() error {
+	txn.commitPrecheck() // Precheck before discarding txn.
+	defer txn.Discard()
+
+	if len(txn.writes) == 0 {
+		return nil // Nothing to do.
+	}
+
+	txnCb, err := txn.commitAndSend()
+	if err != nil {
+		return err
+	}
+	// If batchSet failed, LSM would not have been updated. So, no need to rollback anything.
+
+	// TODO: What if some of the txns successfully make it to value log, but others fail.
+	// Nothing gets updated to LSM, until a restart happens.
+	return txnCb()
+}
+
+// txnCb bundles what runTxnCallback needs: the optional commit function to
+// wait on, the user's callback, and an error already known at call time.
+type txnCb struct {
+	commit func() error
+	user   func(error)
+	err    error
+}
+
+// runTxnCallback invokes cb.user exactly once: with cb.err if set, otherwise
+// with the result of cb.commit if set, otherwise with nil. It panics on a nil
+// cb or a nil user callback.
+func runTxnCallback(cb *txnCb) {
+	switch {
+	case cb == nil:
+		panic("txn callback is nil")
+	case cb.user == nil:
+		panic("Must have caught a nil callback for txn.CommitWith")
+	case cb.err != nil:
+		cb.user(cb.err)
+	case cb.commit != nil:
+		err := cb.commit()
+		cb.user(err)
+	default:
+		cb.user(nil)
+	}
+}
+
+// CommitWith acts like Commit, but takes a callback, which gets run via a
+// goroutine to avoid blocking this function. The callback is guaranteed to run,
+// so it is safe to increment sync.WaitGroup before calling CommitWith, and
+// decrementing it in the callback; to block until all callbacks are run.
+func (txn *Txn) CommitWith(cb func(error)) {
+	txn.commitPrecheck() // Precheck before discarding txn.
+	defer txn.Discard()
+
+	if cb == nil {
+		panic("Nil callback provided to CommitWith")
+	}
+
+	if len(txn.writes) == 0 {
+		// Do not run these callbacks from here, because the CommitWith and the
+		// callback might be acquiring the same locks. Instead run the callback
+		// from another goroutine.
+		go runTxnCallback(&txnCb{user: cb, err: nil})
+		return
+	}
+
+	commitCb, err := txn.commitAndSend()
+	if err != nil {
+		go runTxnCallback(&txnCb{user: cb, err: err})
+		return
+	}
+
+	go runTxnCallback(&txnCb{user: cb, commit: commitCb})
+}
+
+// ReadTs returns the read timestamp of the transaction.
+func (txn *Txn) ReadTs() uint64 {
+	return txn.readTs
+}
+
+// NewTransaction creates a new transaction. Badger supports concurrent execution of transactions,
+// providing serializable snapshot isolation, avoiding write skews. Badger achieves this by tracking
+// the keys read and at Commit time, ensuring that these read keys weren't concurrently modified by
+// another transaction.
+//
+// For read-only transactions, set update to false. In this mode, we don't track the rows read for
+// any changes. Thus, any long running iterations done in this mode wouldn't pay this overhead.
+//
+// Running transactions concurrently is OK. However, a transaction itself isn't thread safe, and
+// should only be run serially. It doesn't matter if a transaction is created by one goroutine and
+// passed down to other, as long as the Txn APIs are called serially.
+//
+// When you create a new transaction, it is absolutely essential to call
+// Discard(). This should be done irrespective of what the update param is set
+// to. Commit API internally runs Discard, but running it twice wouldn't cause
+// any issues.
+//
+//  txn := db.NewTransaction(false)
+//  defer txn.Discard()
+//  // Call various APIs.
+func (db *DB) NewTransaction(update bool) *Txn {
+	return db.newTransaction(update, false)
+}
+
+// newTransaction builds a Txn bound to db. A request for an update
+// transaction on a read-only DB is downgraded to a read-only transaction.
+// Update transactions get a pending-writes map and take a reference on the
+// oracle. Unless isManaged is true, the read timestamp is fetched from the
+// oracle; otherwise txn.readTs is left at its zero value here.
+func (db *DB) newTransaction(update, isManaged bool) *Txn {
+	if db.opt.ReadOnly && update {
+		// DB is read-only, force read-only transaction.
+		update = false
+	}
+
+	txn := &Txn{
+		update: update,
+		db:     db,
+		count:  1,                       // One extra entry for BitFin.
+		size:   int64(len(txnKey) + 10), // Some buffer for the extra entry.
+	}
+	if update {
+		txn.pendingWrites = make(map[string]*Entry)
+		txn.db.orc.addRef()
+	}
+	// It is important that the oracle addRef happens BEFORE we retrieve a read
+	// timestamp. Otherwise, it is possible that the oracle commit map would
+	// become nil after we get the read timestamp.
+	// The sequence of events can be:
+	// 1. This txn gets a read timestamp.
+	// 2. Another txn working on the same keyset commits them, and decrements
+	//    the reference to oracle.
+	// 3. Oracle ref reaches zero, resetting commit map.
+	// 4. This txn increments the oracle reference.
+	// 5. Now this txn would go on to commit the keyset, and no conflicts
+	//    would be detected.
+	// See issue: https://github.com/dgraph-io/badger/issues/574
+	if !isManaged {
+		txn.readTs = db.orc.readTs()
+	}
+	return txn
+}
+
+// View executes a function creating and managing a read-only transaction for the user. Error
+// returned by the function is relayed by the View method.
+// If View is used with managed transactions, it would assume a read timestamp of MaxUint64.
+func (db *DB) View(fn func(txn *Txn) error) error {
+	var txn *Txn
+	if db.opt.managedTxns {
+		txn = db.NewTransactionAt(math.MaxUint64, false)
+	} else {
+		txn = db.NewTransaction(false)
+	}
+	// The transaction is always discarded, whatever fn returns.
+	defer txn.Discard()
+
+	return fn(txn)
+}
+
+// Update executes a function, creating and managing a read-write transaction
+// for the user. Error returned by the function is relayed by the Update method.
+// Update cannot be used with managed transactions.
+func (db *DB) Update(fn func(txn *Txn) error) error {
+	if db.opt.managedTxns {
+		panic("Update can only be used with managedDB=false.")
+	}
+	txn := db.NewTransaction(true)
+	// Discard runs on every path, including after Commit below.
+	defer txn.Discard()
+
+	// An error from fn is returned as-is and Commit is not attempted.
+	if err := fn(txn); err != nil {
+		return err
+	}
+
+	return txn.Commit()
+}
diff --git a/vendor/github.com/dgraph-io/badger/util.go b/vendor/github.com/dgraph-io/badger/util.go
new file mode 100644
index 0000000000..c5173e26cc
--- /dev/null
+++ b/vendor/github.com/dgraph-io/badger/util.go
@@ -0,0 +1,116 @@
+/*
+ * Copyright 2017 Dgraph Labs, Inc. and Contributors
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *    http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package badger
+
+import (
+	"encoding/hex"
+	"io/ioutil"
+	"math/rand"
+	"sync/atomic"
+	"time"
+
+	"github.com/dgraph-io/badger/table"
+	"github.com/dgraph-io/badger/y"
+	"github.com/pkg/errors"
+)
+
+// validate runs validate on every level in s.levels and returns the first
+// error encountered, wrapped with the "Levels Controller" prefix.
+func (s *levelsController) validate() error {
+	for _, l := range s.levels {
+		if err := l.validate(); err != nil {
+			return errors.Wrap(err, "Levels Controller")
+		}
+	}
+	return nil
+}
+
+// validate does some sanity check on one level of data or in-memory index.
+func (s *levelHandler) validate() error {
+	// Level 0 is not checked at all.
+	if s.level == 0 {
+		return nil
+	}
+
+	s.RLock()
+	defer s.RUnlock()
+	numTables := len(s.tables)
+	for j := 1; j < numTables; j++ {
+		// Guards against s.tables having shrunk since numTables was read.
+		if j >= len(s.tables) {
+			return errors.Errorf("Level %d, j=%d numTables=%d", s.level, j, numTables)
+		}
+
+		// Adjacent tables must not overlap: the previous table's biggest key
+		// has to compare strictly below this table's smallest key.
+		if y.CompareKeys(s.tables[j-1].Biggest(), s.tables[j].Smallest()) >= 0 {
+			return errors.Errorf(
+				"Inter: Biggest(j-1) \n%s\n vs Smallest(j): \n%s\n: level=%d j=%d numTables=%d",
+				hex.Dump(s.tables[j-1].Biggest()), hex.Dump(s.tables[j].Smallest()),
+				s.level, j, numTables)
+		}
+
+		// Within a table the smallest key must not compare above the biggest.
+		// NOTE(review): the loop starts at j=1, so this check never runs for
+		// s.tables[0] - confirm that is intended.
+		if y.CompareKeys(s.tables[j].Smallest(), s.tables[j].Biggest()) > 0 {
+			return errors.Errorf(
+				"Intra: %q vs %q: level=%d j=%d numTables=%d",
+				s.tables[j].Smallest(), s.tables[j].Biggest(), s.level, j, numTables)
+		}
+	}
+	return nil
+}
+
+// func (s *KV) debugPrintMore() { s.lc.debugPrintMore() }
+
+// // debugPrintMore shows key ranges of each level.
+// func (s *levelsController) debugPrintMore() {
+// 	s.Lock()
+// 	defer s.Unlock()
+// 	for i := 0; i < s.kv.opt.MaxLevels; i++ {
+// 		s.levels[i].debugPrintMore()
+// 	}
+// }
+
+// func (s *levelHandler) debugPrintMore() {
+// 	s.RLock()
+// 	defer s.RUnlock()
+// 	s.elog.Printf("Level %d:", s.level)
+// 	for _, t := range s.tables {
+// 		y.Printf(" [%s, %s]", t.Smallest(), t.Biggest())
+// 	}
+// 	y.Printf("\n")
+// }
+
+// reserveFileID reserves a unique file id.
+func (s *levelsController) reserveFileID() uint64 {
+	// AddUint64 returns the incremented value; the reserved id is the value
+	// the counter held before the increment.
+	id := atomic.AddUint64(&s.nextFileID, 1)
+	return id - 1
+}
+
+// getIDMap returns the set of file IDs found in dir. Directories are skipped,
+// as are entries whose names table.ParseFileID does not accept. An error from
+// reading the directory is handed to y.Check (presumably fatal - confirm
+// against the y package).
+func getIDMap(dir string) map[uint64]struct{} {
+	fileInfos, err := ioutil.ReadDir(dir)
+	y.Check(err)
+	idMap := make(map[uint64]struct{})
+	for _, info := range fileInfos {
+		if info.IsDir() {
+			continue
+		}
+		fileID, ok := table.ParseFileID(info.Name())
+		if !ok {
+			continue
+		}
+		idMap[fileID] = struct{}{}
+	}
+	return idMap
+}
+
+// init seeds the global math/rand source with the current time.
+func init() {
+	rand.Seed(time.Now().UnixNano())
+}
diff --git a/vendor/github.com/dgraph-io/badger/value.go b/vendor/github.com/dgraph-io/badger/value.go
new file mode 100644
index 0000000000..f57f1b3ba8
--- /dev/null
+++ b/vendor/github.com/dgraph-io/badger/value.go
@@ -0,0 +1,1455 @@
+/*
+ * Copyright 2017 Dgraph Labs, Inc. and Contributors
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *    http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package badger
+
+import (
+	"bufio"
+	"bytes"
+	"encoding/binary"
+	"encoding/json"
+	"fmt"
+	"hash/crc32"
+	"io"
+	"io/ioutil"
+	"math"
+	"math/rand"
+	"os"
+	"sort"
+	"strconv"
+	"strings"
+	"sync"
+	"sync/atomic"
+	"time"
+
+	"github.com/dgraph-io/badger/options"
+	"github.com/dgraph-io/badger/y"
+	"github.com/pkg/errors"
+	"golang.org/x/net/trace"
+)
+
+// Values have their first byte being byteData or byteDelete. This helps us distinguish between
+// a key that has never been seen and a key that has been explicitly deleted.
+const (
+	bitDelete                 byte = 1 << 0 // Set if the key has been deleted.
+	bitValuePointer           byte = 1 << 1 // Set if the value is NOT stored directly next to key.
+	bitDiscardEarlierVersions byte = 1 << 2 // Set if earlier versions can be discarded.
+	// Set if item shouldn't be discarded via compactions (used by merge operator)
+	bitMergeEntry byte = 1 << 3
+	// The MSB 2 bits are for transactions.
+	bitTxn    byte = 1 << 6 // Set if the entry is part of a txn.
+	bitFinTxn byte = 1 << 7 // Set if the entry is to indicate end of txn in value log.
+
+	// mi is 2^20, used as a size multiplier (e.g. 64*mi).
+	mi int64 = 1 << 20
+
+	// The number of updates after which discard map should be flushed into badger.
+	discardStatsFlushThreshold = 100
+)
+
+// logFile is one value log file: its path, open descriptor, numeric id, size
+// and, depending on loadingMode, its memory map.
+type logFile struct {
+	path string
+	// This is a lock on the log file. It guards the fd’s value, the file’s
+	// existence and the file’s memory map.
+	//
+	// Use shared ownership when reading/writing the file or memory map, use
+	// exclusive ownership to open/close the descriptor, unmap or remove the file.
+	lock        sync.RWMutex
+	fd          *os.File
+	fid         uint32
+	fmap        []byte
+	size        uint32
+	loadingMode options.FileLoadingMode
+}
+
+// openReadOnly assumes that we have a write lock on logFile.
+func (lf *logFile) openReadOnly() error { + var err error + lf.fd, err = os.OpenFile(lf.path, os.O_RDONLY, 0666) + if err != nil { + return errors.Wrapf(err, "Unable to open %q as RDONLY.", lf.path) + } + + fi, err := lf.fd.Stat() + if err != nil { + return errors.Wrapf(err, "Unable to check stat for %q", lf.path) + } + y.AssertTrue(fi.Size() <= math.MaxUint32) + lf.size = uint32(fi.Size()) + + if err = lf.mmap(fi.Size()); err != nil { + _ = lf.fd.Close() + return y.Wrapf(err, "Unable to map file: %q", fi.Name()) + } + + return nil +} + +func (lf *logFile) mmap(size int64) (err error) { + if lf.loadingMode != options.MemoryMap { + // Nothing to do + return nil + } + lf.fmap, err = y.Mmap(lf.fd, false, size) + if err == nil { + err = y.Madvise(lf.fmap, false) // Disable readahead + } + return err +} + +func (lf *logFile) munmap() (err error) { + if lf.loadingMode != options.MemoryMap { + // Nothing to do + return nil + } + if err := y.Munmap(lf.fmap); err != nil { + return errors.Wrapf(err, "Unable to munmap value log: %q", lf.path) + } + return nil +} + +// Acquire lock on mmap/file if you are calling this +func (lf *logFile) read(p valuePointer, s *y.Slice) (buf []byte, err error) { + var nbr int64 + offset := p.Offset + if lf.loadingMode == options.FileIO { + buf = s.Resize(int(p.Len)) + var n int + n, err = lf.fd.ReadAt(buf, int64(offset)) + nbr = int64(n) + } else { + // Do not convert size to uint32, because the lf.fmap can be of size + // 4GB, which overflows the uint32 during conversion to make the size 0, + // causing the read to fail with ErrEOF. See issue #585. + size := int64(len(lf.fmap)) + valsz := p.Len + if int64(offset) >= size || int64(offset+valsz) > size { + err = y.ErrEOF + } else { + buf = lf.fmap[offset : offset+valsz] + nbr = int64(valsz) + } + } + y.NumReads.Add(1) + y.NumBytesRead.Add(nbr) + return buf, err +} + +func (lf *logFile) doneWriting(offset uint32) error { + // Sync before acquiring lock. 
(We call this from write() and thus know we have shared access + // to the fd.) + if err := y.FileSync(lf.fd); err != nil { + return errors.Wrapf(err, "Unable to sync value log: %q", lf.path) + } + // Close and reopen the file read-only. Acquire lock because fd will become invalid for a bit. + // Acquiring the lock is bad because, while we don't hold the lock for a long time, it forces + // one batch of readers wait for the preceding batch of readers to finish. + // + // If there's a benefit to reopening the file read-only, it might be on Windows. I don't know + // what the benefit is. Consider keeping the file read-write, or use fcntl to change + // permissions. + lf.lock.Lock() + defer lf.lock.Unlock() + if err := lf.munmap(); err != nil { + return err + } + // TODO: Confirm if we need to run a file sync after truncation. + // Truncation must run after unmapping, otherwise Windows would crap itself. + if err := lf.fd.Truncate(int64(offset)); err != nil { + return errors.Wrapf(err, "Unable to truncate file: %q", lf.path) + } + if err := lf.fd.Close(); err != nil { + return errors.Wrapf(err, "Unable to close value log: %q", lf.path) + } + + return lf.openReadOnly() +} + +// You must hold lf.lock to sync() +func (lf *logFile) sync() error { + return y.FileSync(lf.fd) +} + +var errStop = errors.New("Stop iteration") +var errTruncate = errors.New("Do truncate") +var errDeleteVlogFile = errors.New("Delete vlog file") + +type logEntry func(e Entry, vp valuePointer) error + +type safeRead struct { + k []byte + v []byte + + recordOffset uint32 +} + +func (r *safeRead) Entry(reader *bufio.Reader) (*Entry, error) { + var hbuf [headerBufSize]byte + var err error + + hash := crc32.New(y.CastagnoliCrcTable) + tee := io.TeeReader(reader, hash) + if _, err = io.ReadFull(tee, hbuf[:]); err != nil { + return nil, err + } + + var h header + h.Decode(hbuf[:]) + if h.klen > uint32(1<<16) { // Key length must be below uint16. 
+ return nil, errTruncate + } + kl := int(h.klen) + if cap(r.k) < kl { + r.k = make([]byte, 2*kl) + } + vl := int(h.vlen) + if cap(r.v) < vl { + r.v = make([]byte, 2*vl) + } + + e := &Entry{} + e.offset = r.recordOffset + e.Key = r.k[:kl] + e.Value = r.v[:vl] + + if _, err = io.ReadFull(tee, e.Key); err != nil { + if err == io.EOF { + err = errTruncate + } + return nil, err + } + if _, err = io.ReadFull(tee, e.Value); err != nil { + if err == io.EOF { + err = errTruncate + } + return nil, err + } + var crcBuf [4]byte + if _, err = io.ReadFull(reader, crcBuf[:]); err != nil { + if err == io.EOF { + err = errTruncate + } + return nil, err + } + crc := binary.BigEndian.Uint32(crcBuf[:]) + if crc != hash.Sum32() { + return nil, errTruncate + } + e.meta = h.meta + e.UserMeta = h.userMeta + e.ExpiresAt = h.expiresAt + return e, nil +} + +// iterate iterates over log file. It doesn't not allocate new memory for every kv pair. +// Therefore, the kv pair is only valid for the duration of fn call. +func (vlog *valueLog) iterate(lf *logFile, offset uint32, fn logEntry) (uint32, error) { + fi, err := lf.fd.Stat() + if err != nil { + return 0, err + } + if int64(offset) == fi.Size() { + // We're at the end of the file already. No need to do anything. + return offset, nil + } + if vlog.opt.ReadOnly { + // We're not at the end of the file. We'd need to replay the entries, or + // possibly truncate the file. + return 0, ErrReplayNeeded + } + + // We're not at the end of the file. Let's Seek to the offset and start reading. 
+ if _, err := lf.fd.Seek(int64(offset), io.SeekStart); err != nil { + return 0, errFile(err, lf.path, "Unable to seek") + } + + reader := bufio.NewReader(lf.fd) + read := &safeRead{ + k: make([]byte, 10), + v: make([]byte, 10), + recordOffset: offset, + } + + var lastCommit uint64 + var validEndOffset uint32 + for { + e, err := read.Entry(reader) + if err == io.EOF { + break + } else if err == io.ErrUnexpectedEOF || err == errTruncate { + break + } else if err != nil { + return 0, err + } else if e == nil { + continue + } + + var vp valuePointer + vp.Len = uint32(headerBufSize + len(e.Key) + len(e.Value) + crc32.Size) + read.recordOffset += vp.Len + + vp.Offset = e.offset + vp.Fid = lf.fid + + if e.meta&bitTxn > 0 { + txnTs := y.ParseTs(e.Key) + if lastCommit == 0 { + lastCommit = txnTs + } + if lastCommit != txnTs { + break + } + + } else if e.meta&bitFinTxn > 0 { + txnTs, err := strconv.ParseUint(string(e.Value), 10, 64) + if err != nil || lastCommit != txnTs { + break + } + // Got the end of txn. Now we can store them. + lastCommit = 0 + validEndOffset = read.recordOffset + + } else { + if lastCommit != 0 { + // This is most likely an entry which was moved as part of GC. + // We shouldn't get this entry in the middle of a transaction. + break + } + validEndOffset = read.recordOffset + } + + if err := fn(*e, vp); err != nil { + if err == errStop { + break + } + return 0, errFile(err, lf.path, "Iteration function") + } + } + return validEndOffset, nil +} + +func (vlog *valueLog) rewrite(f *logFile, tr trace.Trace) error { + maxFid := atomic.LoadUint32(&vlog.maxFid) + y.AssertTruef(uint32(f.fid) < maxFid, "fid to move: %d. 
Current max fid: %d", f.fid, maxFid) + tr.LazyPrintf("Rewriting fid: %d", f.fid) + + wb := make([]*Entry, 0, 1000) + var size int64 + + y.AssertTrue(vlog.db != nil) + var count, moved int + fe := func(e Entry) error { + count++ + if count%100000 == 0 { + tr.LazyPrintf("Processing entry %d", count) + } + + vs, err := vlog.db.get(e.Key) + if err != nil { + return err + } + if discardEntry(e, vs) { + return nil + } + + // Value is still present in value log. + if len(vs.Value) == 0 { + return errors.Errorf("Empty value: %+v", vs) + } + var vp valuePointer + vp.Decode(vs.Value) + + if vp.Fid > f.fid { + return nil + } + if vp.Offset > e.offset { + return nil + } + if vp.Fid == f.fid && vp.Offset == e.offset { + moved++ + // This new entry only contains the key, and a pointer to the value. + ne := new(Entry) + ne.meta = 0 // Remove all bits. Different keyspace doesn't need these bits. + ne.UserMeta = e.UserMeta + + // Create a new key in a separate keyspace, prefixed by moveKey. We are not + // allowed to rewrite an older version of key in the LSM tree, because then this older + // version would be at the top of the LSM tree. To work correctly, reads expect the + // latest versions to be at the top, and the older versions at the bottom. + if bytes.HasPrefix(e.Key, badgerMove) { + ne.Key = append([]byte{}, e.Key...) + } else { + ne.Key = make([]byte, len(badgerMove)+len(e.Key)) + n := copy(ne.Key, badgerMove) + copy(ne.Key[n:], e.Key) + } + + ne.Value = append([]byte{}, e.Value...) + wb = append(wb, ne) + size += int64(e.estimateSize(vlog.opt.ValueThreshold)) + if size >= 64*mi { + tr.LazyPrintf("request has %d entries, size %d", len(wb), size) + if err := vlog.db.batchSet(wb); err != nil { + return err + } + size = 0 + wb = wb[:0] + } + } else { + vlog.db.opt.Warningf("This entry should have been caught. 
%+v\n", e) + } + return nil + } + + _, err := vlog.iterate(f, 0, func(e Entry, vp valuePointer) error { + return fe(e) + }) + if err != nil { + return err + } + + tr.LazyPrintf("request has %d entries, size %d", len(wb), size) + batchSize := 1024 + var loops int + for i := 0; i < len(wb); { + loops++ + if batchSize == 0 { + vlog.db.opt.Warningf("We shouldn't reach batch size of zero.") + return ErrNoRewrite + } + end := i + batchSize + if end > len(wb) { + end = len(wb) + } + if err := vlog.db.batchSet(wb[i:end]); err != nil { + if err == ErrTxnTooBig { + // Decrease the batch size to half. + batchSize = batchSize / 2 + tr.LazyPrintf("Dropped batch size to %d", batchSize) + continue + } + return err + } + i += batchSize + } + tr.LazyPrintf("Processed %d entries in %d loops", len(wb), loops) + tr.LazyPrintf("Total entries: %d. Moved: %d", count, moved) + tr.LazyPrintf("Removing fid: %d", f.fid) + var deleteFileNow bool + // Entries written to LSM. Remove the older file now. + { + vlog.filesLock.Lock() + // Just a sanity-check. 
+ if _, ok := vlog.filesMap[f.fid]; !ok { + vlog.filesLock.Unlock() + return errors.Errorf("Unable to find fid: %d", f.fid) + } + if vlog.iteratorCount() == 0 { + delete(vlog.filesMap, f.fid) + deleteFileNow = true + } else { + vlog.filesToBeDeleted = append(vlog.filesToBeDeleted, f.fid) + } + vlog.filesLock.Unlock() + } + + if deleteFileNow { + if err := vlog.deleteLogFile(f); err != nil { + return err + } + } + + return nil +} + +func (vlog *valueLog) deleteMoveKeysFor(fid uint32, tr trace.Trace) error { + db := vlog.db + var result []*Entry + var count, pointers uint64 + tr.LazyPrintf("Iterating over move keys to find invalids for fid: %d", fid) + err := db.View(func(txn *Txn) error { + opt := DefaultIteratorOptions + opt.InternalAccess = true + opt.PrefetchValues = false + itr := txn.NewIterator(opt) + defer itr.Close() + + for itr.Seek(badgerMove); itr.ValidForPrefix(badgerMove); itr.Next() { + count++ + item := itr.Item() + if item.meta&bitValuePointer == 0 { + continue + } + pointers++ + var vp valuePointer + vp.Decode(item.vptr) + if vp.Fid == fid { + e := &Entry{Key: y.KeyWithTs(item.Key(), item.Version()), meta: bitDelete} + result = append(result, e) + } + } + return nil + }) + if err != nil { + tr.LazyPrintf("Got error while iterating move keys: %v", err) + tr.SetError() + return err + } + tr.LazyPrintf("Num total move keys: %d. 
Num pointers: %d", count, pointers) + tr.LazyPrintf("Number of invalid move keys found: %d", len(result)) + batchSize := 10240 + for i := 0; i < len(result); { + end := i + batchSize + if end > len(result) { + end = len(result) + } + if err := db.batchSet(result[i:end]); err != nil { + if err == ErrTxnTooBig { + batchSize /= 2 + tr.LazyPrintf("Dropped batch size to %d", batchSize) + continue + } + tr.LazyPrintf("Error while doing batchSet: %v", err) + tr.SetError() + return err + } + i += batchSize + } + tr.LazyPrintf("Move keys deletion done.") + return nil +} + +func (vlog *valueLog) incrIteratorCount() { + atomic.AddInt32(&vlog.numActiveIterators, 1) +} + +func (vlog *valueLog) iteratorCount() int { + return int(atomic.LoadInt32(&vlog.numActiveIterators)) +} + +func (vlog *valueLog) decrIteratorCount() error { + num := atomic.AddInt32(&vlog.numActiveIterators, -1) + if num != 0 { + return nil + } + + vlog.filesLock.Lock() + lfs := make([]*logFile, 0, len(vlog.filesToBeDeleted)) + for _, id := range vlog.filesToBeDeleted { + lfs = append(lfs, vlog.filesMap[id]) + delete(vlog.filesMap, id) + } + vlog.filesToBeDeleted = nil + vlog.filesLock.Unlock() + + for _, lf := range lfs { + if err := vlog.deleteLogFile(lf); err != nil { + return err + } + } + return nil +} + +func (vlog *valueLog) deleteLogFile(lf *logFile) error { + if lf == nil { + return nil + } + path := vlog.fpath(lf.fid) + if err := lf.munmap(); err != nil { + _ = lf.fd.Close() + return err + } + if err := lf.fd.Close(); err != nil { + return err + } + return os.Remove(path) +} + +func (vlog *valueLog) dropAll() (int, error) { + // We don't want to block dropAll on any pending transactions. So, don't worry about iterator + // count. 
+ var count int + deleteAll := func() error { + vlog.filesLock.Lock() + defer vlog.filesLock.Unlock() + for _, lf := range vlog.filesMap { + if err := vlog.deleteLogFile(lf); err != nil { + return err + } + count++ + } + vlog.filesMap = make(map[uint32]*logFile) + return nil + } + if err := deleteAll(); err != nil { + return count, err + } + + vlog.db.opt.Infof("Value logs deleted. Creating value log file: 0") + if _, err := vlog.createVlogFile(0); err != nil { + return count, err + } + atomic.StoreUint32(&vlog.maxFid, 0) + return count, nil +} + +// lfDiscardStats keeps track of the amount of data that could be discarded for +// a given logfile. +type lfDiscardStats struct { + sync.Mutex + m map[uint32]int64 + updatesSinceFlush int +} + +type valueLog struct { + dirPath string + elog trace.EventLog + + // guards our view of which files exist, which to be deleted, how many active iterators + filesLock sync.RWMutex + filesMap map[uint32]*logFile + filesToBeDeleted []uint32 + // A refcount of iterators -- when this hits zero, we can delete the filesToBeDeleted. + numActiveIterators int32 + + db *DB + maxFid uint32 // accessed via atomics. + writableLogOffset uint32 // read by read, written by write. Must access via atomics. 
+ numEntriesWritten uint32 + opt Options + + garbageCh chan struct{} + lfDiscardStats *lfDiscardStats +} + +func vlogFilePath(dirPath string, fid uint32) string { + return fmt.Sprintf("%s%s%06d.vlog", dirPath, string(os.PathSeparator), fid) +} + +func (vlog *valueLog) fpath(fid uint32) string { + return vlogFilePath(vlog.dirPath, fid) +} + +func (vlog *valueLog) populateFilesMap() error { + vlog.filesMap = make(map[uint32]*logFile) + + files, err := ioutil.ReadDir(vlog.dirPath) + if err != nil { + return errFile(err, vlog.dirPath, "Unable to open log dir.") + } + + found := make(map[uint64]struct{}) + for _, file := range files { + if !strings.HasSuffix(file.Name(), ".vlog") { + continue + } + fsz := len(file.Name()) + fid, err := strconv.ParseUint(file.Name()[:fsz-5], 10, 32) + if err != nil { + return errFile(err, file.Name(), "Unable to parse log id.") + } + if _, ok := found[fid]; ok { + return errFile(err, file.Name(), "Duplicate file found. Please delete one.") + } + found[fid] = struct{}{} + + lf := &logFile{ + fid: uint32(fid), + path: vlog.fpath(uint32(fid)), + loadingMode: vlog.opt.ValueLogLoadingMode, + } + vlog.filesMap[uint32(fid)] = lf + if vlog.maxFid < uint32(fid) { + vlog.maxFid = uint32(fid) + } + } + return nil +} + +func (vlog *valueLog) createVlogFile(fid uint32) (*logFile, error) { + path := vlog.fpath(fid) + lf := &logFile{ + fid: fid, + path: path, + loadingMode: vlog.opt.ValueLogLoadingMode, + } + // writableLogOffset is only written by write func, by read by Read func. + // To avoid a race condition, all reads and updates to this variable must be + // done via atomics. 
+ atomic.StoreUint32(&vlog.writableLogOffset, 0) + vlog.numEntriesWritten = 0 + + var err error + if lf.fd, err = y.CreateSyncedFile(path, vlog.opt.SyncWrites); err != nil { + return nil, errFile(err, lf.path, "Create value log file") + } + if err = syncDir(vlog.dirPath); err != nil { + return nil, errFile(err, vlog.dirPath, "Sync value log dir") + } + if err = lf.mmap(2 * vlog.opt.ValueLogFileSize); err != nil { + return nil, errFile(err, lf.path, "Mmap value log file") + } + + vlog.filesLock.Lock() + vlog.filesMap[fid] = lf + vlog.filesLock.Unlock() + + return lf, nil +} + +func errFile(err error, path string, msg string) error { + return fmt.Errorf("%s. Path=%s. Error=%v", msg, path, err) +} + +func (vlog *valueLog) replayLog(lf *logFile, offset uint32, replayFn logEntry) error { + var err error + mode := os.O_RDONLY + if vlog.opt.Truncate { + // We should open the file in RW mode, so it can be truncated. + mode = os.O_RDWR + } + lf.fd, err = os.OpenFile(lf.path, mode, 0) + if err != nil { + return errFile(err, lf.path, "Open file") + } + defer lf.fd.Close() + + fi, err := lf.fd.Stat() + if err != nil { + return errFile(err, lf.path, "Unable to run file.Stat") + } + + // Alright, let's iterate now. + endOffset, err := vlog.iterate(lf, offset, replayFn) + if err != nil { + return errFile(err, lf.path, "Unable to replay logfile") + } + if int64(endOffset) == fi.Size() { + return nil + } + + // End offset is different from file size. So, we should truncate the file + // to that size. + y.AssertTrue(int64(endOffset) <= fi.Size()) + if !vlog.opt.Truncate { + return ErrTruncateNeeded + } + + // The entire file should be truncated (i.e. it should be deleted). + // If fid == maxFid then it's okay to truncate the entire file since it will be + // used for future additions. Also, it's okay if the last file has size zero. + // We mmap 2*opt.ValueLogSize for the last file. 
See vlog.Open() function + if endOffset == 0 && lf.fid != vlog.maxFid { + return errDeleteVlogFile + } + if err := lf.fd.Truncate(int64(endOffset)); err != nil { + return errFile(err, lf.path, fmt.Sprintf( + "Truncation needed at offset %d. Can be done manually as well.", endOffset)) + } + return nil +} + +func (vlog *valueLog) open(db *DB, ptr valuePointer, replayFn logEntry) error { + opt := db.opt + vlog.opt = opt + vlog.dirPath = opt.ValueDir + vlog.db = db + vlog.elog = trace.NewEventLog("Badger", "Valuelog") + vlog.garbageCh = make(chan struct{}, 1) // Only allow one GC at a time. + vlog.lfDiscardStats = &lfDiscardStats{m: make(map[uint32]int64)} + if err := vlog.populateFilesMap(); err != nil { + return err + } + // If no files are found, then create a new file. + if len(vlog.filesMap) == 0 { + _, err := vlog.createVlogFile(0) + return err + } + + fids := vlog.sortedFids() + for _, fid := range fids { + lf, ok := vlog.filesMap[fid] + y.AssertTrue(ok) + + // This file is before the value head pointer. So, we don't need to + // replay it, and can just open it in readonly mode. + if fid < ptr.Fid { + if err := lf.openReadOnly(); err != nil { + return err + } + continue + } + + var offset uint32 + if fid == ptr.Fid { + offset = ptr.Offset + ptr.Len + } + vlog.db.opt.Infof("Replaying file id: %d at offset: %d\n", fid, offset) + now := time.Now() + // Replay and possible truncation done. Now we can open the file as per + // user specified options. + if err := vlog.replayLog(lf, offset, replayFn); err != nil { + // Log file is corrupted. Delete it. 
+ if err == errDeleteVlogFile { + delete(vlog.filesMap, fid) + path := vlog.fpath(lf.fid) + if err := os.Remove(path); err != nil { + return y.Wrapf(err, "failed to delete empty value log file: %q", path) + } + continue + } + return err + } + vlog.db.opt.Infof("Replay took: %s\n", time.Since(now)) + + if fid < vlog.maxFid { + if err := lf.openReadOnly(); err != nil { + return err + } + } else { + var flags uint32 + switch { + case vlog.opt.ReadOnly: + // If we have read only, we don't need SyncWrites. + flags |= y.ReadOnly + case vlog.opt.SyncWrites: + flags |= y.Sync + } + var err error + if lf.fd, err = y.OpenExistingFile(vlog.fpath(fid), flags); err != nil { + return errFile(err, lf.path, "Open existing file") + } + } + } + + // Seek to the end to start writing. + last, ok := vlog.filesMap[vlog.maxFid] + y.AssertTrue(ok) + lastOffset, err := last.fd.Seek(0, io.SeekEnd) + if err != nil { + return errFile(err, last.path, "file.Seek to end") + } + vlog.writableLogOffset = uint32(lastOffset) + + // Update the head to point to the updated tail. Otherwise, even after doing a successful + // replay and closing the DB, the value log head does not get updated, which causes the replay + // to happen repeatedly. + vlog.db.vhead = valuePointer{Fid: vlog.maxFid, Offset: uint32(lastOffset)} + + // Map the file if needed. When we create a file, it is automatically mapped. + if err = last.mmap(2 * opt.ValueLogFileSize); err != nil { + return errFile(err, last.path, "Map log file") + } + if err := vlog.populateDiscardStats(); err != nil { + return err + } + return nil +} + +func (vlog *valueLog) Close() error { + vlog.elog.Printf("Stopping garbage collection of values.") + defer vlog.elog.Finish() + + var err error + for id, f := range vlog.filesMap { + f.lock.Lock() // We won’t release the lock. 
+ if munmapErr := f.munmap(); munmapErr != nil && err == nil { + err = munmapErr + } + + maxFid := atomic.LoadUint32(&vlog.maxFid) + if !vlog.opt.ReadOnly && id == maxFid { + // truncate writable log file to correct offset. + if truncErr := f.fd.Truncate( + int64(vlog.woffset())); truncErr != nil && err == nil { + err = truncErr + } + } + + if closeErr := f.fd.Close(); closeErr != nil && err == nil { + err = closeErr + } + } + return err +} + +// sortedFids returns the file id's not pending deletion, sorted. Assumes we have shared access to +// filesMap. +func (vlog *valueLog) sortedFids() []uint32 { + toBeDeleted := make(map[uint32]struct{}) + for _, fid := range vlog.filesToBeDeleted { + toBeDeleted[fid] = struct{}{} + } + ret := make([]uint32, 0, len(vlog.filesMap)) + for fid := range vlog.filesMap { + if _, ok := toBeDeleted[fid]; !ok { + ret = append(ret, fid) + } + } + sort.Slice(ret, func(i, j int) bool { + return ret[i] < ret[j] + }) + return ret +} + +type request struct { + // Input values + Entries []*Entry + // Output values and wait group stuff below + Ptrs []valuePointer + Wg sync.WaitGroup + Err error + ref int32 +} + +func (req *request) IncrRef() { + atomic.AddInt32(&req.ref, 1) +} + +func (req *request) DecrRef() { + nRef := atomic.AddInt32(&req.ref, -1) + if nRef > 0 { + return + } + req.Entries = nil + requestPool.Put(req) +} + +func (req *request) Wait() error { + req.Wg.Wait() + err := req.Err + req.DecrRef() // DecrRef after writing to DB. + return err +} + +type requests []*request + +func (reqs requests) DecrRef() { + for _, req := range reqs { + req.DecrRef() + } +} + +// sync function syncs content of latest value log file to disk. Syncing of value log directory is +// not required here as it happens every time a value log file rotation happens(check createVlogFile +// function). During rotation, previous value log file also gets synced to disk. It only syncs file +// if fid >= vlog.maxFid. 
In some cases such as replay(while openning db), it might be called with +// fid < vlog.maxFid. To sync irrespective of file id just call it with math.MaxUint32. +func (vlog *valueLog) sync(fid uint32) error { + if vlog.opt.SyncWrites { + return nil + } + + vlog.filesLock.RLock() + maxFid := atomic.LoadUint32(&vlog.maxFid) + // During replay it is possible to get sync call with fid less than maxFid. + // Because older file has already been synced, we can return from here. + if fid < maxFid || len(vlog.filesMap) == 0 { + vlog.filesLock.RUnlock() + return nil + } + curlf := vlog.filesMap[maxFid] + // Sometimes it is possible that vlog.maxFid has been increased but file creation + // with same id is still in progress and this function is called. In those cases + // entry for the file might not be present in vlog.filesMap. + if curlf == nil { + vlog.filesLock.RUnlock() + return nil + } + curlf.lock.RLock() + vlog.filesLock.RUnlock() + + err := curlf.sync() + curlf.lock.RUnlock() + return err +} + +func (vlog *valueLog) woffset() uint32 { + return atomic.LoadUint32(&vlog.writableLogOffset) +} + +// write is thread-unsafe by design and should not be called concurrently. 
+func (vlog *valueLog) write(reqs []*request) error { + vlog.filesLock.RLock() + maxFid := atomic.LoadUint32(&vlog.maxFid) + curlf := vlog.filesMap[maxFid] + vlog.filesLock.RUnlock() + + var buf bytes.Buffer + toDisk := func() error { + if buf.Len() == 0 { + return nil + } + vlog.elog.Printf("Flushing %d blocks of total size: %d", len(reqs), buf.Len()) + n, err := curlf.fd.Write(buf.Bytes()) + if err != nil { + return errors.Wrapf(err, "Unable to write to value log file: %q", curlf.path) + } + buf.Reset() + y.NumWrites.Add(1) + y.NumBytesWritten.Add(int64(n)) + vlog.elog.Printf("Done") + atomic.AddUint32(&vlog.writableLogOffset, uint32(n)) + + if vlog.woffset() > uint32(vlog.opt.ValueLogFileSize) || + vlog.numEntriesWritten > vlog.opt.ValueLogMaxEntries { + var err error + if err = curlf.doneWriting(vlog.woffset()); err != nil { + return err + } + + newid := atomic.AddUint32(&vlog.maxFid, 1) + y.AssertTruef(newid > 0, "newid has overflown uint32: %v", newid) + newlf, err := vlog.createVlogFile(newid) + if err != nil { + return err + } + curlf = newlf + atomic.AddInt32(&vlog.db.logRotates, 1) + } + return nil + } + + for i := range reqs { + b := reqs[i] + b.Ptrs = b.Ptrs[:0] + var written int + for j := range b.Entries { + e := b.Entries[j] + if e.skipVlog { + b.Ptrs = append(b.Ptrs, valuePointer{}) + continue + } + var p valuePointer + + p.Fid = curlf.fid + // Use the offset including buffer length so far. + p.Offset = vlog.woffset() + uint32(buf.Len()) + plen, err := encodeEntry(e, &buf) // Now encode the entry into buffer. + if err != nil { + return err + } + p.Len = uint32(plen) + b.Ptrs = append(b.Ptrs, p) + written++ + } + vlog.numEntriesWritten += uint32(written) + // We write to disk here so that all entries that are part of the same transaction are + // written to the same vlog file. 
+ writeNow := + vlog.woffset()+uint32(buf.Len()) > uint32(vlog.opt.ValueLogFileSize) || + vlog.numEntriesWritten > uint32(vlog.opt.ValueLogMaxEntries) + if writeNow { + if err := toDisk(); err != nil { + return err + } + } + } + return toDisk() +} + +// Gets the logFile and acquires and RLock() for the mmap. You must call RUnlock on the file +// (if non-nil) +func (vlog *valueLog) getFileRLocked(fid uint32) (*logFile, error) { + vlog.filesLock.RLock() + defer vlog.filesLock.RUnlock() + ret, ok := vlog.filesMap[fid] + if !ok { + // log file has gone away, will need to retry the operation. + return nil, ErrRetry + } + ret.lock.RLock() + return ret, nil +} + +// Read reads the value log at a given location. +// TODO: Make this read private. +func (vlog *valueLog) Read(vp valuePointer, s *y.Slice) ([]byte, func(), error) { + // Check for valid offset if we are reading to writable log. + maxFid := atomic.LoadUint32(&vlog.maxFid) + if vp.Fid == maxFid && vp.Offset >= vlog.woffset() { + return nil, nil, errors.Errorf( + "Invalid value pointer offset: %d greater than current offset: %d", + vp.Offset, vlog.woffset()) + } + + buf, cb, err := vlog.readValueBytes(vp, s) + if err != nil { + return nil, cb, err + } + var h header + h.Decode(buf) + n := uint32(headerBufSize) + h.klen + return buf[n : n+h.vlen], cb, nil +} + +func (vlog *valueLog) readValueBytes(vp valuePointer, s *y.Slice) ([]byte, func(), error) { + lf, err := vlog.getFileRLocked(vp.Fid) + if err != nil { + return nil, nil, err + } + + buf, err := lf.read(vp, s) + if vlog.opt.ValueLogLoadingMode == options.MemoryMap { + return buf, lf.lock.RUnlock, err + } + // If we are using File I/O we unlock the file immediately + // and return an empty function as callback. 
+ lf.lock.RUnlock() + return buf, nil, err +} + +// Test helper +func valueBytesToEntry(buf []byte) (e Entry) { + var h header + h.Decode(buf) + n := uint32(headerBufSize) + + e.Key = buf[n : n+h.klen] + n += h.klen + e.meta = h.meta + e.UserMeta = h.userMeta + e.Value = buf[n : n+h.vlen] + return +} + +func (vlog *valueLog) pickLog(head valuePointer, tr trace.Trace) (files []*logFile) { + vlog.filesLock.RLock() + defer vlog.filesLock.RUnlock() + fids := vlog.sortedFids() + if len(fids) <= 1 { + tr.LazyPrintf("Only one or less value log file.") + return nil + } else if head.Fid == 0 { + tr.LazyPrintf("Head pointer is at zero.") + return nil + } + + // Pick a candidate that contains the largest amount of discardable data + candidate := struct { + fid uint32 + discard int64 + }{math.MaxUint32, 0} + vlog.lfDiscardStats.Lock() + for _, fid := range fids { + if fid >= head.Fid { + break + } + if vlog.lfDiscardStats.m[fid] > candidate.discard { + candidate.fid = fid + candidate.discard = vlog.lfDiscardStats.m[fid] + } + } + vlog.lfDiscardStats.Unlock() + + if candidate.fid != math.MaxUint32 { // Found a candidate + tr.LazyPrintf("Found candidate via discard stats: %v", candidate) + files = append(files, vlog.filesMap[candidate.fid]) + } else { + tr.LazyPrintf("Could not find candidate via discard stats. Randomly picking one.") + } + + // Fallback to randomly picking a log file + var idxHead int + for i, fid := range fids { + if fid == head.Fid { + idxHead = i + break + } + } + if idxHead == 0 { // Not found or first file + tr.LazyPrintf("Could not find any file.") + return nil + } + idx := rand.Intn(idxHead) // Don’t include head.Fid. We pick a random file before it. + if idx > 0 { + idx = rand.Intn(idx + 1) // Another level of rand to favor smaller fids. 
+ } + tr.LazyPrintf("Randomly chose fid: %d", fids[idx]) + files = append(files, vlog.filesMap[fids[idx]]) + return files +} + +func discardEntry(e Entry, vs y.ValueStruct) bool { + if vs.Version != y.ParseTs(e.Key) { + // Version not found. Discard. + return true + } + if isDeletedOrExpired(vs.Meta, vs.ExpiresAt) { + return true + } + if (vs.Meta & bitValuePointer) == 0 { + // Key also stores the value in LSM. Discard. + return true + } + if (vs.Meta & bitFinTxn) > 0 { + // Just a txn finish entry. Discard. + return true + } + return false +} + +func (vlog *valueLog) doRunGC(lf *logFile, discardRatio float64, tr trace.Trace) (err error) { + // Update stats before exiting + defer func() { + if err == nil { + vlog.lfDiscardStats.Lock() + delete(vlog.lfDiscardStats.m, lf.fid) + vlog.lfDiscardStats.Unlock() + } + }() + + type reason struct { + total float64 + discard float64 + count int + } + + fi, err := lf.fd.Stat() + if err != nil { + tr.LazyPrintf("Error while finding file size: %v", err) + tr.SetError() + return err + } + + // Set up the sampling window sizes. + sizeWindow := float64(fi.Size()) * 0.1 // 10% of the file as window. + sizeWindowM := sizeWindow / (1 << 20) // in MBs. + countWindow := int(float64(vlog.opt.ValueLogMaxEntries) * 0.01) // 1% of num entries. + tr.LazyPrintf("Size window: %5.2f. Count window: %d.", sizeWindow, countWindow) + + // Pick a random start point for the log. + skipFirstM := float64(rand.Int63n(fi.Size())) // Pick a random starting location. + skipFirstM -= sizeWindow // Avoid hitting EOF by moving back by window. + skipFirstM /= float64(mi) // Convert to MBs. + tr.LazyPrintf("Skip first %5.2f MB of file of size: %d MB", skipFirstM, fi.Size()/mi) + var skipped float64 + + var r reason + start := time.Now() + y.AssertTrue(vlog.db != nil) + s := new(y.Slice) + var numIterations int + _, err = vlog.iterate(lf, 0, func(e Entry, vp valuePointer) error { + numIterations++ + esz := float64(vp.Len) / (1 << 20) // in MBs. 
+ if skipped < skipFirstM { + skipped += esz + return nil + } + + // Sample until we reach the window sizes or exceed 10 seconds. + if r.count > countWindow { + tr.LazyPrintf("Stopping sampling after %d entries.", countWindow) + return errStop + } + if r.total > sizeWindowM { + tr.LazyPrintf("Stopping sampling after reaching window size.") + return errStop + } + if time.Since(start) > 10*time.Second { + tr.LazyPrintf("Stopping sampling after 10 seconds.") + return errStop + } + r.total += esz + r.count++ + + vs, err := vlog.db.get(e.Key) + if err != nil { + return err + } + if discardEntry(e, vs) { + r.discard += esz + return nil + } + + // Value is still present in value log. + y.AssertTrue(len(vs.Value) > 0) + vp.Decode(vs.Value) + + if vp.Fid > lf.fid { + // Value is present in a later log. Discard. + r.discard += esz + return nil + } + if vp.Offset > e.offset { + // Value is present in a later offset, but in the same log. + r.discard += esz + return nil + } + if vp.Fid == lf.fid && vp.Offset == e.offset { + // This is still the active entry. This would need to be rewritten. + + } else { + vlog.elog.Printf("Reason=%+v\n", r) + + buf, cb, err := vlog.readValueBytes(vp, s) + if err != nil { + return errStop + } + ne := valueBytesToEntry(buf) + ne.offset = vp.Offset + ne.print("Latest Entry Header in LSM") + e.print("Latest Entry in Log") + runCallback(cb) + return errors.Errorf("This shouldn't happen. Latest Pointer:%+v. Meta:%v.", + vp, vs.Meta) + } + return nil + }) + + if err != nil { + tr.LazyPrintf("Error while iterating for RunGC: %v", err) + tr.SetError() + return err + } + tr.LazyPrintf("Fid: %d. Skipped: %5.2fMB Num iterations: %d. Data status=%+v\n", + lf.fid, skipped, numIterations, r) + + // If we couldn't sample at least a 1000 KV pairs or at least 75% of the window size, + // and what we can discard is below the threshold, we should skip the rewrite. 
+ if (r.count < countWindow && r.total < sizeWindowM*0.75) || r.discard < discardRatio*r.total { + tr.LazyPrintf("Skipping GC on fid: %d", lf.fid) + return ErrNoRewrite + } + if err = vlog.rewrite(lf, tr); err != nil { + return err + } + tr.LazyPrintf("Done rewriting.") + return nil +} + +func (vlog *valueLog) waitOnGC(lc *y.Closer) { + defer lc.Done() + + <-lc.HasBeenClosed() // Wait for lc to be closed. + + // Block any GC in progress to finish, and don't allow any more writes to runGC by filling up + // the channel of size 1. + vlog.garbageCh <- struct{}{} +} + +func (vlog *valueLog) runGC(discardRatio float64, head valuePointer) error { + select { + case vlog.garbageCh <- struct{}{}: + // Pick a log file for GC. + tr := trace.New("Badger.ValueLog", "GC") + tr.SetMaxEvents(100) + defer func() { + tr.Finish() + <-vlog.garbageCh + }() + + var err error + files := vlog.pickLog(head, tr) + if len(files) == 0 { + tr.LazyPrintf("PickLog returned zero results.") + return ErrNoRewrite + } + tried := make(map[uint32]bool) + for _, lf := range files { + if _, done := tried[lf.fid]; done { + continue + } + tried[lf.fid] = true + err = vlog.doRunGC(lf, discardRatio, tr) + if err == nil { + return vlog.deleteMoveKeysFor(lf.fid, tr) + } + } + return err + default: + return ErrRejected + } +} + +func (vlog *valueLog) updateDiscardStats(stats map[uint32]int64) error { + vlog.lfDiscardStats.Lock() + for fid, sz := range stats { + vlog.lfDiscardStats.m[fid] += sz + vlog.lfDiscardStats.updatesSinceFlush++ + } + vlog.lfDiscardStats.Unlock() + if vlog.lfDiscardStats.updatesSinceFlush > discardStatsFlushThreshold { + if err := vlog.flushDiscardStats(); err != nil { + return err + } + vlog.lfDiscardStats.updatesSinceFlush = 0 + } + return nil +} + +// flushDiscardStats inserts discard stats into badger. Returns error on failure. 
+func (vlog *valueLog) flushDiscardStats() error { + if len(vlog.lfDiscardStats.m) == 0 { + return nil + } + entries := []*Entry{{ + Key: y.KeyWithTs(lfDiscardStatsKey, 1), + Value: vlog.encodedDiscardStats(), + }} + req, err := vlog.db.sendToWriteCh(entries) + if err != nil { + return errors.Wrapf(err, "failed to push discard stats to write channel") + } + return req.Wait() +} + +// encodedDiscardStats returns []byte representation of lfDiscardStats +// This will be called while storing stats in BadgerDB +func (vlog *valueLog) encodedDiscardStats() []byte { + vlog.lfDiscardStats.Lock() + defer vlog.lfDiscardStats.Unlock() + + encodedStats, _ := json.Marshal(vlog.lfDiscardStats.m) + return encodedStats +} + +// populateDiscardStats populates vlog.lfDiscardStats +// This function will be called while initializing valueLog +func (vlog *valueLog) populateDiscardStats() error { + discardStatsKey := y.KeyWithTs(lfDiscardStatsKey, math.MaxUint64) + vs, err := vlog.db.get(discardStatsKey) + if err != nil { + return err + } + + // check if value is Empty + if vs.Value == nil || len(vs.Value) == 0 { + return nil + } + + var statsMap map[uint32]int64 + // discard map is stored in the vlog file. 
+ if vs.Meta&bitValuePointer > 0 { + var vp valuePointer + vp.Decode(vs.Value) + result, cb, err := vlog.Read(vp, new(y.Slice)) + if err != nil { + return errors.Wrapf(err, "failed to read value pointer from vlog file: %+v", vp) + } + defer runCallback(cb) + if err := json.Unmarshal(result, &statsMap); err != nil { + return errors.Wrapf(err, "failed to unmarshal discard stats") + } + } else { + if err := json.Unmarshal(vs.Value, &statsMap); err != nil { + return errors.Wrapf(err, "failed to unmarshal discard stats") + } + } + vlog.opt.Debugf("Value Log Discard stats: %v", statsMap) + vlog.lfDiscardStats = &lfDiscardStats{m: statsMap} + return nil +} diff --git a/vendor/github.com/dgraph-io/badger/y/error.go b/vendor/github.com/dgraph-io/badger/y/error.go new file mode 100644 index 0000000000..59bb283584 --- /dev/null +++ b/vendor/github.com/dgraph-io/badger/y/error.go @@ -0,0 +1,83 @@ +/* + * Copyright 2017 Dgraph Labs, Inc. and Contributors + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package y + +// This file contains some functions for error handling. Note that we are moving +// towards using x.Trace, i.e., rpc tracing using net/tracer. But for now, these +// functions are useful for simple checks logged on one machine. +// Some common use cases are: +// (1) You receive an error from external lib, and would like to check/log fatal. +// For this, use x.Check, x.Checkf. These will check for err != nil, which is +// more common in Go. 
If you want to check for boolean being true, use +// x.Assert, x.Assertf. +// (2) You receive an error from external lib, and would like to pass on with some +// stack trace information. In this case, use x.Wrap or x.Wrapf. +// (3) You want to generate a new error with stack trace info. Use x.Errorf. + +import ( + "fmt" + "log" + + "github.com/pkg/errors" +) + +var debugMode = true + +// Check logs fatal if err != nil. +func Check(err error) { + if err != nil { + log.Fatalf("%+v", Wrap(err)) + } +} + +// Check2 acts as convenience wrapper around Check, using the 2nd argument as error. +func Check2(_ interface{}, err error) { + Check(err) +} + +// AssertTrue asserts that b is true. Otherwise, it would log fatal. +func AssertTrue(b bool) { + if !b { + log.Fatalf("%+v", errors.Errorf("Assert failed")) + } +} + +// AssertTruef is AssertTrue with extra info. +func AssertTruef(b bool, format string, args ...interface{}) { + if !b { + log.Fatalf("%+v", errors.Errorf(format, args...)) + } +} + +// Wrap wraps errors from external lib. +func Wrap(err error) error { + if !debugMode { + return err + } + return errors.Wrap(err, "") +} + +// Wrapf is Wrap with extra info. +func Wrapf(err error, format string, args ...interface{}) error { + if !debugMode { + if err == nil { + return nil + } + return fmt.Errorf(format+" error: %+v", append(args, err)...) + } + return errors.Wrapf(err, format, args...) +} diff --git a/vendor/github.com/dgraph-io/badger/y/file_dsync.go b/vendor/github.com/dgraph-io/badger/y/file_dsync.go new file mode 100644 index 0000000000..3f3445e2e9 --- /dev/null +++ b/vendor/github.com/dgraph-io/badger/y/file_dsync.go @@ -0,0 +1,25 @@ +// +build !dragonfly,!freebsd,!windows + +/* + * Copyright 2017 Dgraph Labs, Inc. and Contributors + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. 
+ * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package y + +import "golang.org/x/sys/unix" + +func init() { + datasyncFileFlag = unix.O_DSYNC +} diff --git a/vendor/github.com/dgraph-io/badger/y/file_nodsync.go b/vendor/github.com/dgraph-io/badger/y/file_nodsync.go new file mode 100644 index 0000000000..b68be7ab94 --- /dev/null +++ b/vendor/github.com/dgraph-io/badger/y/file_nodsync.go @@ -0,0 +1,25 @@ +// +build dragonfly freebsd windows + +/* + * Copyright 2017 Dgraph Labs, Inc. and Contributors + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package y + +import "syscall" + +func init() { + datasyncFileFlag = syscall.O_SYNC +} diff --git a/vendor/github.com/dgraph-io/badger/y/file_sync.go b/vendor/github.com/dgraph-io/badger/y/file_sync.go new file mode 100644 index 0000000000..19016ef698 --- /dev/null +++ b/vendor/github.com/dgraph-io/badger/y/file_sync.go @@ -0,0 +1,28 @@ +// +build !darwin go1.12 + +/* + * Copyright 2019 Dgraph Labs, Inc. 
// FileSync flushes f by delegating to (*os.File).Sync.
// It is the counterpart of the Darwin-specific variant that is only needed for
// Go 1.11 on MacOS, and can be removed once that combination is unsupported.
//
// More info: https://golang.org/issue/26650.
func FileSync(f *os.File) error {
	err := f.Sync()
	return err
}
// sizeVarint counts how many 7-bit groups are needed to hold x, with a
// minimum of one (so zero still occupies a single byte).
func sizeVarint(x uint64) (n int) {
	n = 1
	for x >>= 7; x != 0; x >>= 7 {
		n++
	}
	return n
}
+func (v *ValueStruct) Decode(b []byte) { + v.Meta = b[0] + v.UserMeta = b[1] + var sz int + v.ExpiresAt, sz = binary.Uvarint(b[2:]) + v.Value = b[2+sz:] +} + +// Encode expects a slice of length at least v.EncodedSize(). +func (v *ValueStruct) Encode(b []byte) { + b[0] = v.Meta + b[1] = v.UserMeta + sz := binary.PutUvarint(b[2:], v.ExpiresAt) + copy(b[2+sz:], v.Value) +} + +// EncodeTo should be kept in sync with the Encode function above. The reason +// this function exists is to avoid creating byte arrays per key-value pair in +// table/builder.go. +func (v *ValueStruct) EncodeTo(buf *bytes.Buffer) { + buf.WriteByte(v.Meta) + buf.WriteByte(v.UserMeta) + var enc [binary.MaxVarintLen64]byte + sz := binary.PutUvarint(enc[:], v.ExpiresAt) + buf.Write(enc[:sz]) + buf.Write(v.Value) +} + +// Iterator is an interface for a basic iterator. +type Iterator interface { + Next() + Rewind() + Seek(key []byte) + Key() []byte + Value() ValueStruct + Valid() bool + + // All iterators should be closed so that file garbage collection works. + Close() error +} + +type elem struct { + itr Iterator + nice int + reversed bool +} + +type elemHeap []*elem + +func (eh elemHeap) Len() int { return len(eh) } +func (eh elemHeap) Swap(i, j int) { eh[i], eh[j] = eh[j], eh[i] } +func (eh *elemHeap) Push(x interface{}) { *eh = append(*eh, x.(*elem)) } +func (eh *elemHeap) Pop() interface{} { + // Remove the last element, because Go has already swapped 0th elem <-> last. + old := *eh + n := len(old) + x := old[n-1] + *eh = old[0 : n-1] + return x +} +func (eh elemHeap) Less(i, j int) bool { + cmp := CompareKeys(eh[i].itr.Key(), eh[j].itr.Key()) + if cmp < 0 { + return !eh[i].reversed + } + if cmp > 0 { + return eh[i].reversed + } + // The keys are equal. In this case, lower nice take precedence. This is important. + return eh[i].nice < eh[j].nice +} + +// MergeIterator merges multiple iterators. +// NOTE: MergeIterator owns the array of iterators and is responsible for closing them. 
+type MergeIterator struct { + h elemHeap + curKey []byte + reversed bool + + all []Iterator +} + +// NewMergeIterator returns a new MergeIterator from a list of Iterators. +func NewMergeIterator(iters []Iterator, reversed bool) *MergeIterator { + m := &MergeIterator{all: iters, reversed: reversed} + m.h = make(elemHeap, 0, len(iters)) + m.initHeap() + return m +} + +func (s *MergeIterator) storeKey(smallest Iterator) { + if cap(s.curKey) < len(smallest.Key()) { + s.curKey = make([]byte, 2*len(smallest.Key())) + } + s.curKey = s.curKey[:len(smallest.Key())] + copy(s.curKey, smallest.Key()) +} + +// initHeap checks all iterators and initializes our heap and array of keys. +// Whenever we reverse direction, we need to run this. +func (s *MergeIterator) initHeap() { + s.h = s.h[:0] + for idx, itr := range s.all { + if !itr.Valid() { + continue + } + e := &elem{itr: itr, nice: idx, reversed: s.reversed} + s.h = append(s.h, e) + } + heap.Init(&s.h) + for len(s.h) > 0 { + it := s.h[0].itr + if it == nil || !it.Valid() { + heap.Pop(&s.h) + continue + } + s.storeKey(s.h[0].itr) + break + } +} + +// Valid returns whether the MergeIterator is at a valid element. +func (s *MergeIterator) Valid() bool { + if s == nil { + return false + } + if len(s.h) == 0 { + return false + } + return s.h[0].itr.Valid() +} + +// Key returns the key associated with the current iterator +func (s *MergeIterator) Key() []byte { + if len(s.h) == 0 { + return nil + } + return s.h[0].itr.Key() +} + +// Value returns the value associated with the iterator. +func (s *MergeIterator) Value() ValueStruct { + if len(s.h) == 0 { + return ValueStruct{} + } + return s.h[0].itr.Value() +} + +// Next returns the next element. If it is the same as the current key, ignore it. 
+func (s *MergeIterator) Next() { + if len(s.h) == 0 { + return + } + + smallest := s.h[0].itr + smallest.Next() + + for len(s.h) > 0 { + smallest = s.h[0].itr + if !smallest.Valid() { + heap.Pop(&s.h) + continue + } + + heap.Fix(&s.h, 0) + smallest = s.h[0].itr + if smallest.Valid() { + if !bytes.Equal(smallest.Key(), s.curKey) { + break + } + smallest.Next() + } + } + if !smallest.Valid() { + return + } + s.storeKey(smallest) +} + +// Rewind seeks to first element (or last element for reverse iterator). +func (s *MergeIterator) Rewind() { + for _, itr := range s.all { + itr.Rewind() + } + s.initHeap() +} + +// Seek brings us to element with key >= given key. +func (s *MergeIterator) Seek(key []byte) { + for _, itr := range s.all { + itr.Seek(key) + } + s.initHeap() +} + +// Close implements y.Iterator +func (s *MergeIterator) Close() error { + for _, itr := range s.all { + if err := itr.Close(); err != nil { + return errors.Wrap(err, "MergeIterator") + } + } + return nil +} diff --git a/vendor/github.com/dgraph-io/badger/y/metrics.go b/vendor/github.com/dgraph-io/badger/y/metrics.go new file mode 100644 index 0000000000..2de17d1004 --- /dev/null +++ b/vendor/github.com/dgraph-io/badger/y/metrics.go @@ -0,0 +1,68 @@ +/* + * Copyright (C) 2017 Dgraph Labs, Inc. and Contributors + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
var (
	// LSMSize has size of the LSM in bytes.
	LSMSize *expvar.Map
	// VlogSize has size of the value log in bytes.
	VlogSize *expvar.Map
	// PendingWrites tracks the number of pending writes.
	PendingWrites *expvar.Map

	// These are cumulative.

	// NumReads has cumulative number of reads.
	NumReads *expvar.Int
	// NumWrites has cumulative number of writes.
	NumWrites *expvar.Int
	// NumBytesRead has cumulative number of bytes read.
	NumBytesRead *expvar.Int
	// NumBytesWritten has cumulative number of bytes written.
	NumBytesWritten *expvar.Int
	// NumLSMGets is number of LSM gets.
	NumLSMGets *expvar.Map
	// NumLSMBloomHits is number of LSM bloom hits.
	NumLSMBloomHits *expvar.Map
	// NumGets is number of gets.
	NumGets *expvar.Int
	// NumPuts is number of puts.
	NumPuts *expvar.Int
	// NumBlockedPuts is number of blocked puts.
	NumBlockedPuts *expvar.Int
	// NumMemtableGets is number of memtable gets.
	NumMemtableGets *expvar.Int
)
+func init() { + NumReads = expvar.NewInt("badger_disk_reads_total") + NumWrites = expvar.NewInt("badger_disk_writes_total") + NumBytesRead = expvar.NewInt("badger_read_bytes") + NumBytesWritten = expvar.NewInt("badger_written_bytes") + NumLSMGets = expvar.NewMap("badger_lsm_level_gets_total") + NumLSMBloomHits = expvar.NewMap("badger_lsm_bloom_hits_total") + NumGets = expvar.NewInt("badger_gets_total") + NumPuts = expvar.NewInt("badger_puts_total") + NumBlockedPuts = expvar.NewInt("badger_blocked_puts_total") + NumMemtableGets = expvar.NewInt("badger_memtable_gets_total") + LSMSize = expvar.NewMap("badger_lsm_size_bytes") + VlogSize = expvar.NewMap("badger_vlog_size_bytes") + PendingWrites = expvar.NewMap("badger_pending_writes_total") +} diff --git a/vendor/github.com/dgraph-io/badger/y/mmap_unix.go b/vendor/github.com/dgraph-io/badger/y/mmap_unix.go new file mode 100644 index 0000000000..f9203a0139 --- /dev/null +++ b/vendor/github.com/dgraph-io/badger/y/mmap_unix.go @@ -0,0 +1,63 @@ +// +build !windows + +/* + * Copyright 2017 Dgraph Labs, Inc. and Contributors + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package y + +import ( + "os" + "syscall" + "unsafe" + + "golang.org/x/sys/unix" +) + +// Mmap uses the mmap system call to memory-map a file. If writable is true, +// memory protection of the pages is set so that they may be written to as well. 
// This is required because the unix package does not support the madvise system call on OS X.
//
// madvise issues the raw madvise syscall for the memory backing b, passing
// advice through unchanged. A non-zero errno is returned as the error. An
// empty slice is treated as a no-op.
func madvise(b []byte, advice int) (err error) {
	if len(b) == 0 {
		// Nothing to advise on; taking &b[0] of an empty slice would panic.
		return nil
	}
	_, _, e1 := syscall.Syscall(syscall.SYS_MADVISE, uintptr(unsafe.Pointer(&b[0])),
		uintptr(len(b)), uintptr(advice))
	if e1 != 0 {
		return e1
	}
	return nil
}
+ */ + +package y + +import ( + "fmt" + "os" + "syscall" + "unsafe" +) + +func Mmap(fd *os.File, write bool, size int64) ([]byte, error) { + protect := syscall.PAGE_READONLY + access := syscall.FILE_MAP_READ + + if write { + protect = syscall.PAGE_READWRITE + access = syscall.FILE_MAP_WRITE + } + fi, err := fd.Stat() + if err != nil { + return nil, err + } + + // Truncate the database to the size of the mmap. + if fi.Size() < size { + if err := fd.Truncate(size); err != nil { + return nil, fmt.Errorf("truncate: %s", err) + } + } + + // Open a file mapping handle. + sizelo := uint32(size >> 32) + sizehi := uint32(size) & 0xffffffff + + handler, err := syscall.CreateFileMapping(syscall.Handle(fd.Fd()), nil, + uint32(protect), sizelo, sizehi, nil) + if err != nil { + return nil, os.NewSyscallError("CreateFileMapping", err) + } + + // Create the memory map. + addr, err := syscall.MapViewOfFile(handler, uint32(access), 0, 0, uintptr(size)) + if addr == 0 { + return nil, os.NewSyscallError("MapViewOfFile", err) + } + + // Close mapping handle. + if err := syscall.CloseHandle(syscall.Handle(handler)); err != nil { + return nil, os.NewSyscallError("CloseHandle", err) + } + + // Slice memory layout + // Copied this snippet from golang/sys package + var sl = struct { + addr uintptr + len int + cap int + }{addr, int(size), int(size)} + + // Use unsafe to turn sl into a []byte. + data := *(*[]byte)(unsafe.Pointer(&sl)) + + return data, nil +} + +func Munmap(b []byte) error { + return syscall.UnmapViewOfFile(uintptr(unsafe.Pointer(&b[0]))) +} + +func Madvise(b []byte, readahead bool) error { + // Do Nothing. We don’t care about this setting on Windows + return nil +} diff --git a/vendor/github.com/dgraph-io/badger/y/watermark.go b/vendor/github.com/dgraph-io/badger/y/watermark.go new file mode 100644 index 0000000000..10ca00e7e3 --- /dev/null +++ b/vendor/github.com/dgraph-io/badger/y/watermark.go @@ -0,0 +1,233 @@ +/* + * Copyright 2016-2018 Dgraph Labs, Inc. 
// uint64Heap is a slice of uint64 values with the methods container/heap
// needs; Less makes the smallest value sort first.
type uint64Heap []uint64

// Len reports how many values are stored.
func (h uint64Heap) Len() int { return len(h) }

// Less reports whether the value at i is smaller than the value at j.
func (h uint64Heap) Less(i, j int) bool { return h[j] > h[i] }

// Swap exchanges the values at i and j.
func (h uint64Heap) Swap(i, j int) {
	tmp := h[i]
	h[i] = h[j]
	h[j] = tmp
}

// Push appends x, which must be a uint64.
func (h *uint64Heap) Push(x interface{}) {
	v := x.(uint64)
	*h = append(*h, v)
}

// Pop removes and returns the last value of the slice.
func (h *uint64Heap) Pop() interface{} {
	last := len(*h) - 1
	v := (*h)[last]
	*h = (*h)[:last]
	return v
}
+// +// Since doneUntil and lastIndex addresses are passed to sync/atomic packages, we ensure that they +// are 64-bit aligned by putting them at the beginning of the structure. +type WaterMark struct { + doneUntil uint64 + lastIndex uint64 + Name string + markCh chan mark + elog trace.EventLog +} + +// Init initializes a WaterMark struct. MUST be called before using it. +func (w *WaterMark) Init(closer *Closer) { + w.markCh = make(chan mark, 100) + w.elog = trace.NewEventLog("Watermark", w.Name) + go w.process(closer) +} + +// Begin sets the last index to the given value. +func (w *WaterMark) Begin(index uint64) { + atomic.StoreUint64(&w.lastIndex, index) + w.markCh <- mark{index: index, done: false} +} + +// BeginMany works like Begin but accepts multiple indices. +func (w *WaterMark) BeginMany(indices []uint64) { + atomic.StoreUint64(&w.lastIndex, indices[len(indices)-1]) + w.markCh <- mark{index: 0, indices: indices, done: false} +} + +// Done sets a single index as done. +func (w *WaterMark) Done(index uint64) { + w.markCh <- mark{index: index, done: true} +} + +// DoneMany works like Done but accepts multiple indices. +func (w *WaterMark) DoneMany(indices []uint64) { + w.markCh <- mark{index: 0, indices: indices, done: true} +} + +// DoneUntil returns the maximum index that has the property that all indices +// less than or equal to it are done. +func (w *WaterMark) DoneUntil() uint64 { + return atomic.LoadUint64(&w.doneUntil) +} + +// SetDoneUntil sets the maximum index that has the property that all indices +// less than or equal to it are done. +func (w *WaterMark) SetDoneUntil(val uint64) { + atomic.StoreUint64(&w.doneUntil, val) +} + +// LastIndex returns the last index for which Begin has been called. +func (w *WaterMark) LastIndex() uint64 { + return atomic.LoadUint64(&w.lastIndex) +} + +// WaitForMark waits until the given index is marked as done. 
+func (w *WaterMark) WaitForMark(ctx context.Context, index uint64) error { + if w.DoneUntil() >= index { + return nil + } + waitCh := make(chan struct{}) + w.markCh <- mark{index: index, waiter: waitCh} + + select { + case <-ctx.Done(): + return ctx.Err() + case <-waitCh: + return nil + } +} + +// process is used to process the Mark channel. This is not thread-safe, +// so only run one goroutine for process. One is sufficient, because +// all goroutine ops use purely memory and cpu. +// Each index has to emit atleast one begin watermark in serial order otherwise waiters +// can get blocked idefinitely. Example: We had an watermark at 100 and a waiter at 101, +// if no watermark is emitted at index 101 then waiter would get stuck indefinitely as it +// can't decide whether the task at 101 has decided not to emit watermark or it didn't get +// scheduled yet. +func (w *WaterMark) process(closer *Closer) { + defer closer.Done() + + var indices uint64Heap + // pending maps raft proposal index to the number of pending mutations for this proposal. + pending := make(map[uint64]int) + waiters := make(map[uint64][]chan struct{}) + + heap.Init(&indices) + var loop uint64 + + processOne := func(index uint64, done bool) { + // If not already done, then set. Otherwise, don't undo a done entry. + prev, present := pending[index] + if !present { + heap.Push(&indices, index) + } + + delta := 1 + if done { + delta = -1 + } + pending[index] = prev + delta + + loop++ + if len(indices) > 0 && loop%10000 == 0 { + min := indices[0] + w.elog.Printf("WaterMark %s: Done entry %4d. Size: %4d Watermark: %-4d Looking for: "+ + "%-4d. Value: %d\n", w.Name, index, len(indices), w.DoneUntil(), min, pending[min]) + } + + // Update mark by going through all indices in order; and checking if they have + // been done. Stop at the first index, which isn't done. + doneUntil := w.DoneUntil() + if doneUntil > index { + AssertTruef(false, "Name: %s doneUntil: %d. 
Index: %d", w.Name, doneUntil, index) + } + + until := doneUntil + loops := 0 + + for len(indices) > 0 { + min := indices[0] + if done := pending[min]; done > 0 { + break // len(indices) will be > 0. + } + // Even if done is called multiple times causing it to become + // negative, we should still pop the index. + heap.Pop(&indices) + delete(pending, min) + until = min + loops++ + } + for i := doneUntil + 1; i <= until; i++ { + toNotify := waiters[i] + for _, ch := range toNotify { + close(ch) + } + delete(waiters, i) // Release the memory back. + } + if until != doneUntil { + AssertTrue(atomic.CompareAndSwapUint64(&w.doneUntil, doneUntil, until)) + w.elog.Printf("%s: Done until %d. Loops: %d\n", w.Name, until, loops) + } + } + + for { + select { + case <-closer.HasBeenClosed(): + return + case mark := <-w.markCh: + if mark.waiter != nil { + doneUntil := atomic.LoadUint64(&w.doneUntil) + if doneUntil >= mark.index { + close(mark.waiter) + } else { + ws, ok := waiters[mark.index] + if !ok { + waiters[mark.index] = []chan struct{}{mark.waiter} + } else { + waiters[mark.index] = append(ws, mark.waiter) + } + } + } else { + if mark.index > 0 { + processOne(mark.index, mark.done) + } + for _, index := range mark.indices { + processOne(index, mark.done) + } + } + } + } +} diff --git a/vendor/github.com/dgraph-io/badger/y/y.go b/vendor/github.com/dgraph-io/badger/y/y.go new file mode 100644 index 0000000000..4948315a9a --- /dev/null +++ b/vendor/github.com/dgraph-io/badger/y/y.go @@ -0,0 +1,302 @@ +/* + * Copyright 2017 Dgraph Labs, Inc. and Contributors + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. 
+ * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package y + +import ( + "bytes" + "encoding/binary" + "fmt" + "hash/crc32" + "math" + "os" + "sync" + "time" + + "github.com/pkg/errors" +) + +// ErrEOF indicates an end of file when trying to read from a memory mapped file +// and encountering the end of slice. +var ErrEOF = errors.New("End of mapped region") + +const ( + // Sync indicates that O_DSYNC should be set on the underlying file, + // ensuring that data writes do not return until the data is flushed + // to disk. + Sync = 1 << iota + // ReadOnly opens the underlying file on a read-only basis. + ReadOnly +) + +var ( + // This is O_DSYNC (datasync) on platforms that support it -- see file_unix.go + datasyncFileFlag = 0x0 + + // CastagnoliCrcTable is a CRC32 polynomial table + CastagnoliCrcTable = crc32.MakeTable(crc32.Castagnoli) + + // Dummy channel for nil closers. + dummyCloserChan = make(chan struct{}) +) + +// OpenExistingFile opens an existing file, errors if it doesn't exist. +func OpenExistingFile(filename string, flags uint32) (*os.File, error) { + openFlags := os.O_RDWR + if flags&ReadOnly != 0 { + openFlags = os.O_RDONLY + } + + if flags&Sync != 0 { + openFlags |= datasyncFileFlag + } + return os.OpenFile(filename, openFlags, 0) +} + +// CreateSyncedFile creates a new file (using O_EXCL), errors if it already existed. 
// KeyWithTs returns a new slice holding key followed by an 8-byte big-endian
// suffix encoding math.MaxUint64-ts. The input key is not modified.
func KeyWithTs(key []byte, ts uint64) []byte {
	n := len(key)
	out := make([]byte, n, n+8)
	copy(out, key)

	var suffix [8]byte
	binary.BigEndian.PutUint64(suffix[:], math.MaxUint64-ts)
	return append(out, suffix[:]...)
}
// FixedDuration formats d as zero-padded seconds, prefixed with minutes when d
// is at least a minute and additionally with hours when d is at least an hour.
func FixedDuration(d time.Duration) string {
	out := fmt.Sprintf("%02ds", int(d.Seconds())%60)
	if d < time.Minute {
		return out
	}
	out = fmt.Sprintf("%02dm", int(d.Minutes())%60) + out
	if d < time.Hour {
		return out
	}
	return fmt.Sprintf("%02dh", int(d.Hours())) + out
}
// Throttle allows a limited number of workers to run at a time. It also
// provides a mechanism to check for errors encountered by workers and wait for
// them to finish.
type Throttle struct {
	once      sync.Once
	wg        sync.WaitGroup
	ch        chan struct{}
	errCh     chan error
	finishErr error
}

// NewThrottle creates a new throttle with a max number of workers.
func NewThrottle(max int) *Throttle {
	t := new(Throttle)
	t.ch = make(chan struct{}, max)
	t.errCh = make(chan error, max)
	return t
}

// Do must be called by a worker before it starts. It blocks while the maximum
// number of workers is already running, and returns an error reported by an
// earlier worker if it receives one while waiting.
func (t *Throttle) Do() error {
	for {
		select {
		case err := <-t.errCh:
			if err == nil {
				continue
			}
			return err
		case t.ch <- struct{}{}:
			t.wg.Add(1)
			return nil
		}
	}
}

// Done must be called by a worker when it finishes, optionally passing the
// error status of its work. It panics if there is no matching Do.
func (t *Throttle) Done(err error) {
	if err != nil {
		t.errCh <- err
	}
	select {
	case <-t.ch:
		t.wg.Done()
	default:
		panic("Throttle Do Done mismatch")
	}
}

// Finish waits until all workers have finished and returns the first non-nil
// error passed to Done that is still queued. The wait happens only on the
// first call; later calls return the same error as the first one.
func (t *Throttle) Finish() error {
	t.once.Do(func() {
		t.wg.Wait()
		close(t.ch)
		close(t.errCh)
		for {
			err, ok := <-t.errCh
			if !ok {
				return
			}
			if err != nil {
				t.finishErr = err
				return
			}
		}
	})

	return t.finishErr
}
implementation: + +Copyright (c) 2014-2017 Damian Gryski +Copyright (c) 2016-2017 Nicola Asuni - Tecnick.com + +Permission is hereby granted, free of charge, to any person obtaining a copy +of this software and associated documentation files (the "Software"), to deal +in the Software without restriction, including without limitation the rights +to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +copies of the Software, and to permit persons to whom the Software is +furnished to do so, subject to the following conditions: + +The above copyright notice and this permission notice shall be included in +all copies or substantial portions of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN +THE SOFTWARE. + diff --git a/vendor/github.com/dgryski/go-farm/Makefile b/vendor/github.com/dgryski/go-farm/Makefile new file mode 100644 index 0000000000..c189c95dd7 --- /dev/null +++ b/vendor/github.com/dgryski/go-farm/Makefile @@ -0,0 +1,203 @@ +# MAKEFILE +# +# @author Nicola Asuni +# @link https://github.com/dgryski/go-farm +# +# This file is intended to be executed in a Linux-compatible system. 
+# It also assumes that the project has been cloned in the right path under GOPATH: +# $GOPATH/src/github.com/dgryski/go-farm +# +# ------------------------------------------------------------------------------ + +# List special make targets that are not associated with files +.PHONY: help all test format fmtcheck vet lint coverage cyclo ineffassign misspell structcheck varcheck errcheck gosimple astscan qa deps clean nuke + +# Use bash as shell (Note: Ubuntu now uses dash which doesn't support PIPESTATUS). +SHELL=/bin/bash + +# CVS path (path to the parent dir containing the project) +CVSPATH=github.com/dgryski + +# Project owner +OWNER=dgryski + +# Project vendor +VENDOR=dgryski + +# Project name +PROJECT=go-farm + +# Project version +VERSION=$(shell cat VERSION) + +# Name of RPM or DEB package +PKGNAME=${VENDOR}-${PROJECT} + +# Current directory +CURRENTDIR=$(shell pwd) + +# GO lang path +ifneq ($(GOPATH),) + ifeq ($(findstring $(GOPATH),$(CURRENTDIR)),) + # the defined GOPATH is not valid + GOPATH= + endif +endif +ifeq ($(GOPATH),) + # extract the GOPATH + GOPATH=$(firstword $(subst /src/, ,$(CURRENTDIR))) +endif + +# --- MAKE TARGETS --- + +# Display general help about this command +help: + @echo "" + @echo "$(PROJECT) Makefile." 
+ @echo "GOPATH=$(GOPATH)" + @echo "The following commands are available:" + @echo "" + @echo " make qa : Run all the tests" + @echo " make test : Run the unit tests" + @echo "" + @echo " make format : Format the source code" + @echo " make fmtcheck : Check if the source code has been formatted" + @echo " make vet : Check for suspicious constructs" + @echo " make lint : Check for style errors" + @echo " make coverage : Generate the coverage report" + @echo " make cyclo : Generate the cyclomatic complexity report" + @echo " make ineffassign : Detect ineffectual assignments" + @echo " make misspell : Detect commonly misspelled words in source files" + @echo " make structcheck : Find unused struct fields" + @echo " make varcheck : Find unused global variables and constants" + @echo " make errcheck : Check that error return values are used" + @echo " make gosimple : Suggest code simplifications" + @echo " make astscan : GO AST scanner" + @echo "" + @echo " make docs : Generate source code documentation" + @echo "" + @echo " make deps : Get the dependencies" + @echo " make clean : Remove any build artifact" + @echo " make nuke : Deletes any intermediate file" + @echo "" + + +# Alias for help target +all: help + +# Run the unit tests +test: + @mkdir -p target/test + @mkdir -p target/report + GOPATH=$(GOPATH) \ + go test \ + -covermode=atomic \ + -bench=. \ + -race \ + -cpuprofile=target/report/cpu.out \ + -memprofile=target/report/mem.out \ + -mutexprofile=target/report/mutex.out \ + -coverprofile=target/report/coverage.out \ + -v ./... | \ + tee >(PATH=$(GOPATH)/bin:$(PATH) go-junit-report > target/test/report.xml); \ + test $${PIPESTATUS[0]} -eq 0 + +# Format the source code +format: + @find . -type f -name "*.go" -exec gofmt -s -w {} \; + +# Check if the source code has been formatted +fmtcheck: + @mkdir -p target + @find . -type f -name "*.go" -exec gofmt -s -d {} \; | tee target/format.diff + @test ! 
-s target/format.diff || { echo "ERROR: the source code has not been formatted - please use 'make format' or 'gofmt'"; exit 1; } + +# Check for syntax errors +vet: + GOPATH=$(GOPATH) go vet . + +# Check for style errors +lint: + GOPATH=$(GOPATH) PATH=$(GOPATH)/bin:$(PATH) golint . + +# Generate the coverage report +coverage: + @mkdir -p target/report + GOPATH=$(GOPATH) \ + go tool cover -html=target/report/coverage.out -o target/report/coverage.html + +# Report cyclomatic complexity +cyclo: + @mkdir -p target/report + GOPATH=$(GOPATH) gocyclo -avg ./ | tee target/report/cyclo.txt ; test $${PIPESTATUS[0]} -eq 0 + +# Detect ineffectual assignments +ineffassign: + @mkdir -p target/report + GOPATH=$(GOPATH) ineffassign ./ | tee target/report/ineffassign.txt ; test $${PIPESTATUS[0]} -eq 0 + +# Detect commonly misspelled words in source files +misspell: + @mkdir -p target/report + GOPATH=$(GOPATH) misspell -error ./ | tee target/report/misspell.txt ; test $${PIPESTATUS[0]} -eq 0 + +# Find unused struct fields +structcheck: + @mkdir -p target/report + GOPATH=$(GOPATH) structcheck -a ./ | tee target/report/structcheck.txt + +# Find unused global variables and constants +varcheck: + @mkdir -p target/report + GOPATH=$(GOPATH) varcheck -e ./ | tee target/report/varcheck.txt + +# Check that error return values are used +errcheck: + @mkdir -p target/report + GOPATH=$(GOPATH) errcheck ./ | tee target/report/errcheck.txt + +# Suggest code simplifications +gosimple: + @mkdir -p target/report + GOPATH=$(GOPATH) gosimple ./ | tee target/report/gosimple.txt + +# AST scanner +astscan: + @mkdir -p target/report + GOPATH=$(GOPATH) gas .//*.go | tee target/report/astscan.txt + +# Generate source docs +docs: + @mkdir -p target/docs + nohup sh -c 'GOPATH=$(GOPATH) godoc -http=127.0.0.1:6060' > target/godoc_server.log 2>&1 & + wget --directory-prefix=target/docs/ --execute robots=off --retry-connrefused --recursive --no-parent --adjust-extension --page-requisites --convert-links 
http://127.0.0.1:6060/pkg/github.com/${VENDOR}/${PROJECT}/ ; kill -9 `lsof -ti :6060` + @echo ''${PKGNAME}' Documentation ...' > target/docs/index.html + +# Alias to run all quality-assurance checks +qa: fmtcheck test vet lint coverage cyclo ineffassign misspell structcheck varcheck errcheck gosimple astscan + +# --- INSTALL --- + +# Get the dependencies +deps: + GOPATH=$(GOPATH) go get ./... + GOPATH=$(GOPATH) go get golang.org/x/lint/golint + GOPATH=$(GOPATH) go get github.com/jstemmer/go-junit-report + GOPATH=$(GOPATH) go get github.com/axw/gocov/gocov + GOPATH=$(GOPATH) go get github.com/fzipp/gocyclo + GOPATH=$(GOPATH) go get github.com/gordonklaus/ineffassign + GOPATH=$(GOPATH) go get github.com/client9/misspell/cmd/misspell + GOPATH=$(GOPATH) go get github.com/opennota/check/cmd/structcheck + GOPATH=$(GOPATH) go get github.com/opennota/check/cmd/varcheck + GOPATH=$(GOPATH) go get github.com/kisielk/errcheck + GOPATH=$(GOPATH) go get honnef.co/go/tools/cmd/gosimple + GOPATH=$(GOPATH) go get github.com/GoASTScanner/gas + +# Remove any build artifact +clean: + GOPATH=$(GOPATH) go clean ./... + +# Deletes any intermediate file +nuke: + rm -rf ./target + GOPATH=$(GOPATH) go clean -i ./... 
diff --git a/vendor/github.com/dgryski/go-farm/README.md b/vendor/github.com/dgryski/go-farm/README.md new file mode 100644 index 0000000000..dd07d6f991 --- /dev/null +++ b/vendor/github.com/dgryski/go-farm/README.md @@ -0,0 +1,41 @@ +# go-farm + +*Google's FarmHash hash functions implemented in Go* + +[![Master Branch](https://img.shields.io/badge/-master:-gray.svg)](https://github.com/dgryski/go-farm/tree/master) +[![Master Build Status](https://secure.travis-ci.org/dgryski/go-farm.png?branch=master)](https://travis-ci.org/dgryski/go-farm?branch=master) +[![Master Coverage Status](https://coveralls.io/repos/dgryski/go-farm/badge.svg?branch=master&service=github)](https://coveralls.io/github/dgryski/go-farm?branch=master) +[![Go Report Card](https://goreportcard.com/badge/github.com/dgryski/go-farm)](https://goreportcard.com/report/github.com/dgryski/go-farm) +[![GoDoc](https://godoc.org/github.com/dgryski/go-farm?status.svg)](http://godoc.org/github.com/dgryski/go-farm) + +## Description + +FarmHash, a family of hash functions. + +This is a (mechanical) translation of the non-SSE4/non-AESNI hash functions from Google's FarmHash (https://github.com/google/farmhash). + + +FarmHash provides hash functions for strings and other data. +The functions mix the input bits thoroughly but are not suitable for cryptography. + +All members of the FarmHash family were designed with heavy reliance on previous work by Jyrki Alakuijala, Austin Appleby, Bob Jenkins, and others. + +For more information please consult https://github.com/google/farmhash + + +## Getting started + +This application is written in Go language, please refer to the guides in https://golang.org for getting started. + +This project include a Makefile that allows you to test and build the project with simple commands. 
+To see all available options: +```bash +make help +``` + +## Running all tests + +Before committing the code, please check if it passes all tests using +```bash +make qa +``` diff --git a/vendor/github.com/dgryski/go-farm/VERSION b/vendor/github.com/dgryski/go-farm/VERSION new file mode 100644 index 0000000000..38f77a65b3 --- /dev/null +++ b/vendor/github.com/dgryski/go-farm/VERSION @@ -0,0 +1 @@ +2.0.1 diff --git a/vendor/github.com/dgryski/go-farm/basics.go b/vendor/github.com/dgryski/go-farm/basics.go new file mode 100644 index 0000000000..ec7076c03b --- /dev/null +++ b/vendor/github.com/dgryski/go-farm/basics.go @@ -0,0 +1,32 @@ +package farm + +import "math/bits" + +// Some primes between 2^63 and 2^64 for various uses. +const k0 uint64 = 0xc3a5c85c97cb3127 +const k1 uint64 = 0xb492b66fbe98f273 +const k2 uint64 = 0x9ae16a3b2f90404f + +// Magic numbers for 32-bit hashing. Copied from Murmur3. +const c1 uint32 = 0xcc9e2d51 +const c2 uint32 = 0x1b873593 + +// A 32-bit to 32-bit integer hash copied from Murmur3. +func fmix(h uint32) uint32 { + h ^= h >> 16 + h *= 0x85ebca6b + h ^= h >> 13 + h *= 0xc2b2ae35 + h ^= h >> 16 + return h +} + +func mur(a, h uint32) uint32 { + // Helper from Murmur3 for combining two 32-bit values. + a *= c1 + a = bits.RotateLeft32(a, -17) + a *= c2 + h ^= a + h = bits.RotateLeft32(h, -19) + return h*5 + 0xe6546b64 +} diff --git a/vendor/github.com/dgryski/go-farm/farmhashcc.go b/vendor/github.com/dgryski/go-farm/farmhashcc.go new file mode 100644 index 0000000000..3e68ae3a3b --- /dev/null +++ b/vendor/github.com/dgryski/go-farm/farmhashcc.go @@ -0,0 +1,194 @@ +package farm + +import ( + "encoding/binary" + "math/bits" +) + +// This file provides a 32-bit hash equivalent to CityHash32 (v1.1.1) +// and a 128-bit hash equivalent to CityHash128 (v1.1.1). It also provides +// a seeded 32-bit hash function similar to CityHash32. 
+ +func hash32Len13to24Seed(s []byte, seed uint32) uint32 { + slen := len(s) + a := binary.LittleEndian.Uint32(s[-4+(slen>>1) : -4+(slen>>1)+4]) + b := binary.LittleEndian.Uint32(s[4 : 4+4]) + c := binary.LittleEndian.Uint32(s[slen-8 : slen-8+4]) + d := binary.LittleEndian.Uint32(s[(slen >> 1) : (slen>>1)+4]) + e := binary.LittleEndian.Uint32(s[0 : 0+4]) + f := binary.LittleEndian.Uint32(s[slen-4 : slen-4+4]) + h := d*c1 + uint32(slen) + seed + a = bits.RotateLeft32(a, -12) + f + h = mur(c, h) + a + a = bits.RotateLeft32(a, -3) + c + h = mur(e, h) + a + a = bits.RotateLeft32(a+f, -12) + d + h = mur(b^seed, h) + a + return fmix(h) +} + +func hash32Len0to4(s []byte, seed uint32) uint32 { + slen := len(s) + b := seed + c := uint32(9) + for i := 0; i < slen; i++ { + v := int8(s[i]) + b = (b * c1) + uint32(v) + c ^= b + } + return fmix(mur(b, mur(uint32(slen), c))) +} + +func hash128to64(x uint128) uint64 { + // Murmur-inspired hashing. + const mul uint64 = 0x9ddfea08eb382d69 + a := (x.lo ^ x.hi) * mul + a ^= (a >> 47) + b := (x.hi ^ a) * mul + b ^= (b >> 47) + b *= mul + return b +} + +type uint128 struct { + lo uint64 + hi uint64 +} + +// A subroutine for CityHash128(). Returns a decent 128-bit hash for strings +// of any length representable in signed long. Based on City and Murmur. 
+func cityMurmur(s []byte, seed uint128) uint128 { + slen := len(s) + a := seed.lo + b := seed.hi + var c uint64 + var d uint64 + l := slen - 16 + if l <= 0 { // len <= 16 + a = shiftMix(a*k1) * k1 + c = b*k1 + hashLen0to16(s) + if slen >= 8 { + d = shiftMix(a + binary.LittleEndian.Uint64(s[0:0+8])) + } else { + d = shiftMix(a + c) + } + } else { // len > 16 + c = hashLen16(binary.LittleEndian.Uint64(s[slen-8:slen-8+8])+k1, a) + d = hashLen16(b+uint64(slen), c+binary.LittleEndian.Uint64(s[slen-16:slen-16+8])) + a += d + for { + a ^= shiftMix(binary.LittleEndian.Uint64(s[0:0+8])*k1) * k1 + a *= k1 + b ^= a + c ^= shiftMix(binary.LittleEndian.Uint64(s[8:8+8])*k1) * k1 + c *= k1 + d ^= c + s = s[16:] + l -= 16 + if l <= 0 { + break + } + } + } + a = hashLen16(a, c) + b = hashLen16(d, b) + return uint128{a ^ b, hashLen16(b, a)} +} + +func cityHash128WithSeed(s []byte, seed uint128) uint128 { + slen := len(s) + if slen < 128 { + return cityMurmur(s, seed) + } + + endIdx := ((slen - 1) / 128) * 128 + lastBlockIdx := endIdx + ((slen - 1) & 127) - 127 + last := s[lastBlockIdx:] + + // We expect len >= 128 to be the common case. Keep 56 bytes of state: + // v, w, x, y, and z. + var v1, v2 uint64 + var w1, w2 uint64 + x := seed.lo + y := seed.hi + z := uint64(slen) * k1 + v1 = bits.RotateLeft64(y^k1, -49)*k1 + binary.LittleEndian.Uint64(s[0:0+8]) + v2 = bits.RotateLeft64(v1, -42)*k1 + binary.LittleEndian.Uint64(s[8:8+8]) + w1 = bits.RotateLeft64(y+z, -35)*k1 + x + w2 = bits.RotateLeft64(x+binary.LittleEndian.Uint64(s[88:88+8]), -53) * k1 + + // This is the same inner loop as CityHash64(), manually unrolled. 
+ for { + x = bits.RotateLeft64(x+y+v1+binary.LittleEndian.Uint64(s[8:8+8]), -37) * k1 + y = bits.RotateLeft64(y+v2+binary.LittleEndian.Uint64(s[48:48+8]), -42) * k1 + x ^= w2 + y += v1 + binary.LittleEndian.Uint64(s[40:40+8]) + z = bits.RotateLeft64(z+w1, -33) * k1 + v1, v2 = weakHashLen32WithSeeds(s, v2*k1, x+w1) + w1, w2 = weakHashLen32WithSeeds(s[32:], z+w2, y+binary.LittleEndian.Uint64(s[16:16+8])) + z, x = x, z + s = s[64:] + x = bits.RotateLeft64(x+y+v1+binary.LittleEndian.Uint64(s[8:8+8]), -37) * k1 + y = bits.RotateLeft64(y+v2+binary.LittleEndian.Uint64(s[48:48+8]), -42) * k1 + x ^= w2 + y += v1 + binary.LittleEndian.Uint64(s[40:40+8]) + z = bits.RotateLeft64(z+w1, -33) * k1 + v1, v2 = weakHashLen32WithSeeds(s, v2*k1, x+w1) + w1, w2 = weakHashLen32WithSeeds(s[32:], z+w2, y+binary.LittleEndian.Uint64(s[16:16+8])) + z, x = x, z + s = s[64:] + slen -= 128 + if slen < 128 { + break + } + } + x += bits.RotateLeft64(v1+z, -49) * k0 + y = y*k0 + bits.RotateLeft64(w2, -37) + z = z*k0 + bits.RotateLeft64(w1, -27) + w1 *= 9 + v1 *= k0 + // If 0 < len < 128, hash up to 4 chunks of 32 bytes each from the end of s. + for tailDone := 0; tailDone < slen; { + tailDone += 32 + y = bits.RotateLeft64(x+y, -42)*k0 + v2 + w1 += binary.LittleEndian.Uint64(last[128-tailDone+16 : 128-tailDone+16+8]) + x = x*k0 + w1 + z += w2 + binary.LittleEndian.Uint64(last[128-tailDone:128-tailDone+8]) + w2 += v1 + v1, v2 = weakHashLen32WithSeeds(last[128-tailDone:], v1+z, v2) + v1 *= k0 + } + + // At this point our 56 bytes of state should contain more than + // enough information for a strong 128-bit hash. We use two + // different 56-byte-to-8-byte hashes to get a 16-byte final result. 
+ x = hashLen16(x, v1) + y = hashLen16(y+z, w1) + return uint128{hashLen16(x+v2, w2) + y, + hashLen16(x+w2, y+v2)} +} + +func cityHash128(s []byte) uint128 { + slen := len(s) + if slen >= 16 { + return cityHash128WithSeed(s[16:], uint128{binary.LittleEndian.Uint64(s[0 : 0+8]), binary.LittleEndian.Uint64(s[8:8+8]) + k0}) + } + return cityHash128WithSeed(s, uint128{k0, k1}) +} + +// Fingerprint128 is a 128-bit fingerprint function for byte-slices +func Fingerprint128(s []byte) (lo, hi uint64) { + h := cityHash128(s) + return h.lo, h.hi +} + +// Hash128 is a 128-bit hash function for byte-slices +func Hash128(s []byte) (lo, hi uint64) { + return Fingerprint128(s) +} + +// Hash128WithSeed is a 128-bit hash function for byte-slices and a 128-bit seed +func Hash128WithSeed(s []byte, seed0, seed1 uint64) (lo, hi uint64) { + h := cityHash128WithSeed(s, uint128{seed0, seed1}) + return h.lo, h.hi +} diff --git a/vendor/github.com/dgryski/go-farm/farmhashmk.go b/vendor/github.com/dgryski/go-farm/farmhashmk.go new file mode 100644 index 0000000000..8e4c7428b5 --- /dev/null +++ b/vendor/github.com/dgryski/go-farm/farmhashmk.go @@ -0,0 +1,102 @@ +package farm + +import ( + "encoding/binary" + "math/bits" +) + +func hash32Len5to12(s []byte, seed uint32) uint32 { + slen := len(s) + a := uint32(len(s)) + b := uint32(len(s) * 5) + c := uint32(9) + d := b + seed + a += binary.LittleEndian.Uint32(s[0 : 0+4]) + b += binary.LittleEndian.Uint32(s[slen-4 : slen-4+4]) + c += binary.LittleEndian.Uint32(s[((slen >> 1) & 4) : ((slen>>1)&4)+4]) + return fmix(seed ^ mur(c, mur(b, mur(a, d)))) +} + +// Hash32 hashes a byte slice and returns a uint32 hash value +func Hash32(s []byte) uint32 { + + slen := len(s) + + if slen <= 24 { + if slen <= 12 { + if slen <= 4 { + return hash32Len0to4(s, 0) + } + return hash32Len5to12(s, 0) + } + return hash32Len13to24Seed(s, 0) + } + + // len > 24 + h := uint32(slen) + g := c1 * uint32(slen) + f := g + a0 := 
bits.RotateLeft32(binary.LittleEndian.Uint32(s[slen-4:slen-4+4])*c1, -17) * c2 + a1 := bits.RotateLeft32(binary.LittleEndian.Uint32(s[slen-8:slen-8+4])*c1, -17) * c2 + a2 := bits.RotateLeft32(binary.LittleEndian.Uint32(s[slen-16:slen-16+4])*c1, -17) * c2 + a3 := bits.RotateLeft32(binary.LittleEndian.Uint32(s[slen-12:slen-12+4])*c1, -17) * c2 + a4 := bits.RotateLeft32(binary.LittleEndian.Uint32(s[slen-20:slen-20+4])*c1, -17) * c2 + h ^= a0 + h = bits.RotateLeft32(h, -19) + h = h*5 + 0xe6546b64 + h ^= a2 + h = bits.RotateLeft32(h, -19) + h = h*5 + 0xe6546b64 + g ^= a1 + g = bits.RotateLeft32(g, -19) + g = g*5 + 0xe6546b64 + g ^= a3 + g = bits.RotateLeft32(g, -19) + g = g*5 + 0xe6546b64 + f += a4 + f = bits.RotateLeft32(f, -19) + 113 + for len(s) > 20 { + a := binary.LittleEndian.Uint32(s[0 : 0+4]) + b := binary.LittleEndian.Uint32(s[4 : 4+4]) + c := binary.LittleEndian.Uint32(s[8 : 8+4]) + d := binary.LittleEndian.Uint32(s[12 : 12+4]) + e := binary.LittleEndian.Uint32(s[16 : 16+4]) + h += a + g += b + f += c + h = mur(d, h) + e + g = mur(c, g) + a + f = mur(b+e*c1, f) + d + f += g + g += f + s = s[20:] + } + g = bits.RotateLeft32(g, -11) * c1 + g = bits.RotateLeft32(g, -17) * c1 + f = bits.RotateLeft32(f, -11) * c1 + f = bits.RotateLeft32(f, -17) * c1 + h = bits.RotateLeft32(h+g, -19) + h = h*5 + 0xe6546b64 + h = bits.RotateLeft32(h, -17) * c1 + h = bits.RotateLeft32(h+f, -19) + h = h*5 + 0xe6546b64 + h = bits.RotateLeft32(h, -17) * c1 + return h +} + +// Hash32WithSeed hashes a byte slice and a uint32 seed and returns a uint32 hash value +func Hash32WithSeed(s []byte, seed uint32) uint32 { + slen := len(s) + + if slen <= 24 { + if slen >= 13 { + return hash32Len13to24Seed(s, seed*c1) + } + if slen >= 5 { + return hash32Len5to12(s, seed) + } + return hash32Len0to4(s, seed) + } + h := hash32Len13to24Seed(s[:24], seed^uint32(slen)) + return mur(Hash32(s[24:])+seed, h) +} diff --git a/vendor/github.com/dgryski/go-farm/farmhashna.go 
b/vendor/github.com/dgryski/go-farm/farmhashna.go new file mode 100644 index 0000000000..ac62edd3bb --- /dev/null +++ b/vendor/github.com/dgryski/go-farm/farmhashna.go @@ -0,0 +1,161 @@ +package farm + +import ( + "encoding/binary" + "math/bits" +) + +func shiftMix(val uint64) uint64 { + return val ^ (val >> 47) +} + +func hashLen16(u, v uint64) uint64 { + return hash128to64(uint128{u, v}) +} + +func hashLen16Mul(u, v, mul uint64) uint64 { + // Murmur-inspired hashing. + a := (u ^ v) * mul + a ^= (a >> 47) + b := (v ^ a) * mul + b ^= (b >> 47) + b *= mul + return b +} + +func hashLen0to16(s []byte) uint64 { + slen := uint64(len(s)) + if slen >= 8 { + mul := k2 + slen*2 + a := binary.LittleEndian.Uint64(s[0:0+8]) + k2 + b := binary.LittleEndian.Uint64(s[int(slen-8) : int(slen-8)+8]) + c := bits.RotateLeft64(b, -37)*mul + a + d := (bits.RotateLeft64(a, -25) + b) * mul + return hashLen16Mul(c, d, mul) + } + + if slen >= 4 { + mul := k2 + slen*2 + a := binary.LittleEndian.Uint32(s[0 : 0+4]) + return hashLen16Mul(slen+(uint64(a)<<3), uint64(binary.LittleEndian.Uint32(s[int(slen-4):int(slen-4)+4])), mul) + } + if slen > 0 { + a := s[0] + b := s[slen>>1] + c := s[slen-1] + y := uint32(a) + (uint32(b) << 8) + z := uint32(slen) + (uint32(c) << 2) + return shiftMix(uint64(y)*k2^uint64(z)*k0) * k2 + } + return k2 +} + +// This probably works well for 16-byte strings as well, but it may be overkill +// in that case. +func hashLen17to32(s []byte) uint64 { + slen := len(s) + mul := k2 + uint64(slen*2) + a := binary.LittleEndian.Uint64(s[0:0+8]) * k1 + b := binary.LittleEndian.Uint64(s[8 : 8+8]) + c := binary.LittleEndian.Uint64(s[slen-8:slen-8+8]) * mul + d := binary.LittleEndian.Uint64(s[slen-16:slen-16+8]) * k2 + return hashLen16Mul(bits.RotateLeft64(a+b, -43)+bits.RotateLeft64(c, -30)+d, a+bits.RotateLeft64(b+k2, -18)+c, mul) +} + +// Return a 16-byte hash for 48 bytes. Quick and dirty. +// Callers do best to use "random-looking" values for a and b. 
+func weakHashLen32WithSeedsWords(w, x, y, z, a, b uint64) (uint64, uint64) { + a += w + b = bits.RotateLeft64(b+a+z, -21) + c := a + a += x + a += y + b += bits.RotateLeft64(a, -44) + return a + z, b + c +} + +// Return a 16-byte hash for s[0] ... s[31], a, and b. Quick and dirty. +func weakHashLen32WithSeeds(s []byte, a, b uint64) (uint64, uint64) { + return weakHashLen32WithSeedsWords(binary.LittleEndian.Uint64(s[0:0+8]), + binary.LittleEndian.Uint64(s[8:8+8]), + binary.LittleEndian.Uint64(s[16:16+8]), + binary.LittleEndian.Uint64(s[24:24+8]), + a, + b) +} + +// Return an 8-byte hash for 33 to 64 bytes. +func hashLen33to64(s []byte) uint64 { + slen := len(s) + mul := k2 + uint64(slen)*2 + a := binary.LittleEndian.Uint64(s[0:0+8]) * k2 + b := binary.LittleEndian.Uint64(s[8 : 8+8]) + c := binary.LittleEndian.Uint64(s[slen-8:slen-8+8]) * mul + d := binary.LittleEndian.Uint64(s[slen-16:slen-16+8]) * k2 + y := bits.RotateLeft64(a+b, -43) + bits.RotateLeft64(c, -30) + d + z := hashLen16Mul(y, a+bits.RotateLeft64(b+k2, -18)+c, mul) + e := binary.LittleEndian.Uint64(s[16:16+8]) * mul + f := binary.LittleEndian.Uint64(s[24 : 24+8]) + g := (y + binary.LittleEndian.Uint64(s[slen-32:slen-32+8])) * mul + h := (z + binary.LittleEndian.Uint64(s[slen-24:slen-24+8])) * mul + return hashLen16Mul(bits.RotateLeft64(e+f, -43)+bits.RotateLeft64(g, -30)+h, e+bits.RotateLeft64(f+a, -18)+g, mul) +} + +func naHash64(s []byte) uint64 { + slen := len(s) + var seed uint64 = 81 + if slen <= 32 { + if slen <= 16 { + return hashLen0to16(s) + } + return hashLen17to32(s) + } + if slen <= 64 { + return hashLen33to64(s) + } + // For strings over 64 bytes we loop. + // Internal state consists of 56 bytes: v, w, x, y, and z. + v := uint128{0, 0} + w := uint128{0, 0} + x := seed*k2 + binary.LittleEndian.Uint64(s[0:0+8]) + y := seed*k1 + 113 + z := shiftMix(y*k2+113) * k2 + // Set end so that after the loop we have 1 to 64 bytes left to process. 
+ endIdx := ((slen - 1) / 64) * 64 + last64Idx := endIdx + ((slen - 1) & 63) - 63 + last64 := s[last64Idx:] + for len(s) > 64 { + x = bits.RotateLeft64(x+y+v.lo+binary.LittleEndian.Uint64(s[8:8+8]), -37) * k1 + y = bits.RotateLeft64(y+v.hi+binary.LittleEndian.Uint64(s[48:48+8]), -42) * k1 + x ^= w.hi + y += v.lo + binary.LittleEndian.Uint64(s[40:40+8]) + z = bits.RotateLeft64(z+w.lo, -33) * k1 + v.lo, v.hi = weakHashLen32WithSeeds(s, v.hi*k1, x+w.lo) + w.lo, w.hi = weakHashLen32WithSeeds(s[32:], z+w.hi, y+binary.LittleEndian.Uint64(s[16:16+8])) + x, z = z, x + s = s[64:] + } + mul := k1 + ((z & 0xff) << 1) + // Make s point to the last 64 bytes of input. + s = last64 + w.lo += (uint64(slen-1) & 63) + v.lo += w.lo + w.lo += v.lo + x = bits.RotateLeft64(x+y+v.lo+binary.LittleEndian.Uint64(s[8:8+8]), -37) * mul + y = bits.RotateLeft64(y+v.hi+binary.LittleEndian.Uint64(s[48:48+8]), -42) * mul + x ^= w.hi * 9 + y += v.lo*9 + binary.LittleEndian.Uint64(s[40:40+8]) + z = bits.RotateLeft64(z+w.lo, -33) * mul + v.lo, v.hi = weakHashLen32WithSeeds(s, v.hi*mul, x+w.lo) + w.lo, w.hi = weakHashLen32WithSeeds(s[32:], z+w.hi, y+binary.LittleEndian.Uint64(s[16:16+8])) + x, z = z, x + return hashLen16Mul(hashLen16Mul(v.lo, w.lo, mul)+shiftMix(y)*k0+z, hashLen16Mul(v.hi, w.hi, mul)+x, mul) +} + +func naHash64WithSeed(s []byte, seed uint64) uint64 { + return naHash64WithSeeds(s, k2, seed) +} + +func naHash64WithSeeds(s []byte, seed0, seed1 uint64) uint64 { + return hashLen16(naHash64(s)-seed0, seed1) +} diff --git a/vendor/github.com/dgryski/go-farm/farmhashuo.go b/vendor/github.com/dgryski/go-farm/farmhashuo.go new file mode 100644 index 0000000000..474b74e059 --- /dev/null +++ b/vendor/github.com/dgryski/go-farm/farmhashuo.go @@ -0,0 +1,122 @@ +package farm + +import ( + "encoding/binary" + "math/bits" +) + +func uoH(x, y, mul uint64, r uint) uint64 { + a := (x ^ y) * mul + a ^= (a >> 47) + b := (y ^ a) * mul + return bits.RotateLeft64(b, -int(r)) * mul +} + +// Hash64WithSeeds 
hashes a byte slice and two uint64 seeds and returns a uint64 hash value +func Hash64WithSeeds(s []byte, seed0, seed1 uint64) uint64 { + slen := len(s) + if slen <= 64 { + return naHash64WithSeeds(s, seed0, seed1) + } + + // For strings over 64 bytes we loop. + // Internal state consists of 64 bytes: u, v, w, x, y, and z. + x := seed0 + y := seed1*k2 + 113 + z := shiftMix(y*k2) * k2 + v := uint128{seed0, seed1} + var w uint128 + u := x - z + x *= k2 + mul := k2 + (u & 0x82) + + // Set end so that after the loop we have 1 to 64 bytes left to process. + endIdx := ((slen - 1) / 64) * 64 + last64Idx := endIdx + ((slen - 1) & 63) - 63 + last64 := s[last64Idx:] + + for len(s) > 64 { + a0 := binary.LittleEndian.Uint64(s[0 : 0+8]) + a1 := binary.LittleEndian.Uint64(s[8 : 8+8]) + a2 := binary.LittleEndian.Uint64(s[16 : 16+8]) + a3 := binary.LittleEndian.Uint64(s[24 : 24+8]) + a4 := binary.LittleEndian.Uint64(s[32 : 32+8]) + a5 := binary.LittleEndian.Uint64(s[40 : 40+8]) + a6 := binary.LittleEndian.Uint64(s[48 : 48+8]) + a7 := binary.LittleEndian.Uint64(s[56 : 56+8]) + x += a0 + a1 + y += a2 + z += a3 + v.lo += a4 + v.hi += a5 + a1 + w.lo += a6 + w.hi += a7 + + x = bits.RotateLeft64(x, -26) + x *= 9 + y = bits.RotateLeft64(y, -29) + z *= mul + v.lo = bits.RotateLeft64(v.lo, -33) + v.hi = bits.RotateLeft64(v.hi, -30) + w.lo ^= x + w.lo *= 9 + z = bits.RotateLeft64(z, -32) + z += w.hi + w.hi += z + z *= 9 + u, y = y, u + + z += a0 + a6 + v.lo += a2 + v.hi += a3 + w.lo += a4 + w.hi += a5 + a6 + x += a1 + y += a7 + + y += v.lo + v.lo += x - y + v.hi += w.lo + w.lo += v.hi + w.hi += x - y + x += w.hi + w.hi = bits.RotateLeft64(w.hi, -34) + u, z = z, u + s = s[64:] + } + // Make s point to the last 64 bytes of input. 
+ s = last64 + u *= 9 + v.hi = bits.RotateLeft64(v.hi, -28) + v.lo = bits.RotateLeft64(v.lo, -20) + w.lo += (uint64(slen-1) & 63) + u += y + y += u + x = bits.RotateLeft64(y-x+v.lo+binary.LittleEndian.Uint64(s[8:8+8]), -37) * mul + y = bits.RotateLeft64(y^v.hi^binary.LittleEndian.Uint64(s[48:48+8]), -42) * mul + x ^= w.hi * 9 + y += v.lo + binary.LittleEndian.Uint64(s[40:40+8]) + z = bits.RotateLeft64(z+w.lo, -33) * mul + v.lo, v.hi = weakHashLen32WithSeeds(s, v.hi*mul, x+w.lo) + w.lo, w.hi = weakHashLen32WithSeeds(s[32:], z+w.hi, y+binary.LittleEndian.Uint64(s[16:16+8])) + return uoH(hashLen16Mul(v.lo+x, w.lo^y, mul)+z-u, + uoH(v.hi+y, w.hi+z, k2, 30)^x, + k2, + 31) +} + +// Hash64WithSeed hashes a byte slice and a uint64 seed and returns a uint64 hash value +func Hash64WithSeed(s []byte, seed uint64) uint64 { + if len(s) <= 64 { + return naHash64WithSeed(s, seed) + } + return Hash64WithSeeds(s, 0, seed) +} + +// Hash64 hashes a byte slice and returns a uint64 hash value +func Hash64(s []byte) uint64 { + if len(s) <= 64 { + return naHash64(s) + } + return Hash64WithSeeds(s, 81, 0) +} diff --git a/vendor/github.com/dgryski/go-farm/fp_amd64.s b/vendor/github.com/dgryski/go-farm/fp_amd64.s new file mode 100644 index 0000000000..2b8fa32473 --- /dev/null +++ b/vendor/github.com/dgryski/go-farm/fp_amd64.s @@ -0,0 +1,951 @@ +// Code generated by command: go run asm.go -out=fp_amd64.s -go111=false. DO NOT EDIT. 
+ +// +build amd64,!purego + +#include "textflag.h" + +// func Fingerprint64(s []byte) uint64 +TEXT ·Fingerprint64(SB), NOSPLIT, $0-32 + MOVQ s_base+0(FP), CX + MOVQ s_len+8(FP), AX + CMPQ AX, $0x10 + JG check32 + CMPQ AX, $0x08 + JL check4 + MOVQ (CX), DX + MOVQ AX, BX + SUBQ $0x08, BX + ADDQ CX, BX + MOVQ (BX), BX + MOVQ $0x9ae16a3b2f90404f, BP + ADDQ BP, DX + SHLQ $0x01, AX + ADDQ BP, AX + MOVQ BX, BP + RORQ $0x25, BP + IMULQ AX, BP + ADDQ DX, BP + RORQ $0x19, DX + ADDQ BX, DX + IMULQ AX, DX + XORQ DX, BP + IMULQ AX, BP + MOVQ BP, BX + SHRQ $0x2f, BX + XORQ BP, BX + XORQ BX, DX + IMULQ AX, DX + MOVQ DX, BX + SHRQ $0x2f, BX + XORQ DX, BX + IMULQ AX, BX + MOVQ BX, ret+24(FP) + RET + +check4: + CMPQ AX, $0x04 + JL check0 + MOVQ $0x9ae16a3b2f90404f, DX + MOVQ AX, BX + SHLQ $0x01, BX + ADDQ DX, BX + MOVL (CX), SI + SHLQ $0x03, SI + ADDQ AX, SI + SUBQ $0x04, AX + ADDQ AX, CX + MOVL (CX), DI + XORQ DI, SI + IMULQ BX, SI + MOVQ SI, DX + SHRQ $0x2f, DX + XORQ SI, DX + XORQ DX, DI + IMULQ BX, DI + MOVQ DI, DX + SHRQ $0x2f, DX + XORQ DI, DX + IMULQ BX, DX + MOVQ DX, ret+24(FP) + RET + +check0: + TESTQ AX, AX + JZ empty + MOVBQZX (CX), DX + MOVQ AX, BX + SHRQ $0x01, BX + ADDQ CX, BX + MOVBQZX (BX), BP + MOVQ AX, BX + SUBQ $0x01, BX + ADDQ CX, BX + MOVBQZX (BX), BX + SHLQ $0x08, BP + ADDQ BP, DX + SHLQ $0x02, BX + ADDQ BX, AX + MOVQ $0xc3a5c85c97cb3127, BX + IMULQ BX, AX + MOVQ $0x9ae16a3b2f90404f, BX + IMULQ BX, DX + XORQ DX, AX + MOVQ AX, DX + SHRQ $0x2f, DX + XORQ AX, DX + IMULQ BX, DX + MOVQ DX, ret+24(FP) + RET + +empty: + MOVQ $0x9ae16a3b2f90404f, DX + MOVQ DX, ret+24(FP) + RET + +check32: + CMPQ AX, $0x20 + JG check64 + MOVQ AX, DX + SHLQ $0x01, DX + MOVQ $0x9ae16a3b2f90404f, BX + ADDQ BX, DX + MOVQ (CX), BP + MOVQ $0xb492b66fbe98f273, SI + IMULQ SI, BP + MOVQ 8(CX), SI + MOVQ AX, DI + SUBQ $0x10, DI + ADDQ CX, DI + MOVQ 8(DI), R12 + IMULQ DX, R12 + MOVQ (DI), DI + IMULQ BX, DI + MOVQ BP, R13 + ADDQ SI, R13 + RORQ $0x2b, R13 + ADDQ DI, R13 + MOVQ R12, DI + RORQ $0x1e, 
DI + ADDQ DI, R13 + ADDQ R12, BP + ADDQ BX, SI + RORQ $0x12, SI + ADDQ SI, BP + XORQ BP, R13 + IMULQ DX, R13 + MOVQ R13, BX + SHRQ $0x2f, BX + XORQ R13, BX + XORQ BX, BP + IMULQ DX, BP + MOVQ BP, BX + SHRQ $0x2f, BX + XORQ BP, BX + IMULQ DX, BX + MOVQ BX, ret+24(FP) + RET + +check64: + CMPQ AX, $0x40 + JG long + MOVQ AX, DX + SHLQ $0x01, DX + MOVQ $0x9ae16a3b2f90404f, BX + ADDQ BX, DX + MOVQ (CX), BP + IMULQ BX, BP + MOVQ 8(CX), SI + MOVQ AX, DI + SUBQ $0x10, DI + ADDQ CX, DI + MOVQ 8(DI), R12 + IMULQ DX, R12 + MOVQ (DI), DI + IMULQ BX, DI + MOVQ BP, R13 + ADDQ SI, R13 + RORQ $0x2b, R13 + ADDQ DI, R13 + MOVQ R12, DI + RORQ $0x1e, DI + ADDQ DI, R13 + ADDQ BP, R12 + ADDQ BX, SI + RORQ $0x12, SI + ADDQ SI, R12 + MOVQ R13, BX + XORQ R12, BX + IMULQ DX, BX + MOVQ BX, SI + SHRQ $0x2f, SI + XORQ BX, SI + XORQ SI, R12 + IMULQ DX, R12 + MOVQ R12, BX + SHRQ $0x2f, BX + XORQ R12, BX + IMULQ DX, BX + MOVQ 16(CX), SI + IMULQ DX, SI + MOVQ 24(CX), DI + MOVQ AX, R12 + SUBQ $0x20, R12 + ADDQ CX, R12 + MOVQ (R12), R14 + ADDQ R13, R14 + IMULQ DX, R14 + MOVQ 8(R12), R12 + ADDQ BX, R12 + IMULQ DX, R12 + MOVQ SI, BX + ADDQ DI, BX + RORQ $0x2b, BX + ADDQ R12, BX + MOVQ R14, R12 + RORQ $0x1e, R12 + ADDQ R12, BX + ADDQ R14, SI + ADDQ BP, DI + RORQ $0x12, DI + ADDQ DI, SI + XORQ SI, BX + IMULQ DX, BX + MOVQ BX, BP + SHRQ $0x2f, BP + XORQ BX, BP + XORQ BP, SI + IMULQ DX, SI + MOVQ SI, BX + SHRQ $0x2f, BX + XORQ SI, BX + IMULQ DX, BX + MOVQ BX, ret+24(FP) + RET + +long: + XORQ R8, R8 + XORQ R9, R9 + XORQ R10, R10 + XORQ R11, R11 + MOVQ $0x01529cba0ca458ff, DX + ADDQ (CX), DX + MOVQ $0x226bb95b4e64b6d4, BX + MOVQ $0x134a747f856d0526, BP + MOVQ AX, SI + SUBQ $0x01, SI + MOVQ $0xffffffffffffffc0, DI + ANDQ DI, SI + MOVQ AX, DI + SUBQ $0x01, DI + ANDQ $0x3f, DI + SUBQ $0x3f, DI + ADDQ SI, DI + MOVQ DI, SI + ADDQ CX, SI + MOVQ AX, DI + +loop: + MOVQ $0xb492b66fbe98f273, R12 + ADDQ BX, DX + ADDQ R8, DX + ADDQ 8(CX), DX + RORQ $0x25, DX + IMULQ R12, DX + ADDQ R9, BX + ADDQ 48(CX), BX + RORQ $0x2a, 
BX + IMULQ R12, BX + XORQ R11, DX + ADDQ R8, BX + ADDQ 40(CX), BX + ADDQ R10, BP + RORQ $0x21, BP + IMULQ R12, BP + IMULQ R12, R9 + MOVQ DX, R8 + ADDQ R10, R8 + ADDQ (CX), R9 + ADDQ R9, R8 + ADDQ 24(CX), R8 + RORQ $0x15, R8 + MOVQ R9, R10 + ADDQ 8(CX), R9 + ADDQ 16(CX), R9 + MOVQ R9, R13 + RORQ $0x2c, R13 + ADDQ R13, R8 + ADDQ 24(CX), R9 + ADDQ R10, R8 + XCHGQ R9, R8 + ADDQ BP, R11 + MOVQ BX, R10 + ADDQ 16(CX), R10 + ADDQ 32(CX), R11 + ADDQ R11, R10 + ADDQ 56(CX), R10 + RORQ $0x15, R10 + MOVQ R11, R13 + ADDQ 40(CX), R11 + ADDQ 48(CX), R11 + MOVQ R11, R14 + RORQ $0x2c, R14 + ADDQ R14, R10 + ADDQ 56(CX), R11 + ADDQ R13, R10 + XCHGQ R11, R10 + XCHGQ BP, DX + ADDQ $0x40, CX + SUBQ $0x40, DI + CMPQ DI, $0x40 + JG loop + MOVQ SI, CX + MOVQ BP, DI + ANDQ $0xff, DI + SHLQ $0x01, DI + ADDQ R12, DI + MOVQ SI, CX + SUBQ $0x01, AX + ANDQ $0x3f, AX + ADDQ AX, R10 + ADDQ R10, R8 + ADDQ R8, R10 + ADDQ BX, DX + ADDQ R8, DX + ADDQ 8(CX), DX + RORQ $0x25, DX + IMULQ DI, DX + ADDQ R9, BX + ADDQ 48(CX), BX + RORQ $0x2a, BX + IMULQ DI, BX + MOVQ $0x00000009, AX + IMULQ R11, AX + XORQ AX, DX + MOVQ $0x00000009, AX + IMULQ R8, AX + ADDQ AX, BX + ADDQ 40(CX), BX + ADDQ R10, BP + RORQ $0x21, BP + IMULQ DI, BP + IMULQ DI, R9 + MOVQ DX, R8 + ADDQ R10, R8 + ADDQ (CX), R9 + ADDQ R9, R8 + ADDQ 24(CX), R8 + RORQ $0x15, R8 + MOVQ R9, AX + ADDQ 8(CX), R9 + ADDQ 16(CX), R9 + MOVQ R9, SI + RORQ $0x2c, SI + ADDQ SI, R8 + ADDQ 24(CX), R9 + ADDQ AX, R8 + XCHGQ R9, R8 + ADDQ BP, R11 + MOVQ BX, R10 + ADDQ 16(CX), R10 + ADDQ 32(CX), R11 + ADDQ R11, R10 + ADDQ 56(CX), R10 + RORQ $0x15, R10 + MOVQ R11, AX + ADDQ 40(CX), R11 + ADDQ 48(CX), R11 + MOVQ R11, SI + RORQ $0x2c, SI + ADDQ SI, R10 + ADDQ 56(CX), R11 + ADDQ AX, R10 + XCHGQ R11, R10 + XCHGQ BP, DX + XORQ R10, R8 + IMULQ DI, R8 + MOVQ R8, AX + SHRQ $0x2f, AX + XORQ R8, AX + XORQ AX, R10 + IMULQ DI, R10 + MOVQ R10, AX + SHRQ $0x2f, AX + XORQ R10, AX + IMULQ DI, AX + ADDQ BP, AX + MOVQ BX, CX + SHRQ $0x2f, CX + XORQ BX, CX + MOVQ $0xc3a5c85c97cb3127, BX 
+ IMULQ BX, CX + ADDQ CX, AX + XORQ R11, R9 + IMULQ DI, R9 + MOVQ R9, CX + SHRQ $0x2f, CX + XORQ R9, CX + XORQ CX, R11 + IMULQ DI, R11 + MOVQ R11, CX + SHRQ $0x2f, CX + XORQ R11, CX + IMULQ DI, CX + ADDQ DX, CX + XORQ CX, AX + IMULQ DI, AX + MOVQ AX, DX + SHRQ $0x2f, DX + XORQ AX, DX + XORQ DX, CX + IMULQ DI, CX + MOVQ CX, AX + SHRQ $0x2f, AX + XORQ CX, AX + IMULQ DI, AX + MOVQ AX, ret+24(FP) + RET + +// func Fingerprint32(s []byte) uint32 +TEXT ·Fingerprint32(SB), NOSPLIT, $0-28 + MOVQ s_base+0(FP), AX + MOVQ s_len+8(FP), CX + CMPQ CX, $0x18 + JG long + CMPQ CX, $0x0c + JG hash_13_24 + CMPQ CX, $0x04 + JG hash_5_12 + XORL DX, DX + MOVL $0x00000009, BX + TESTQ CX, CX + JZ done + MOVQ CX, BP + MOVL $0xcc9e2d51, DI + IMULL DI, DX + MOVBLSX (AX), SI + ADDL SI, DX + XORL DX, BX + SUBQ $0x01, BP + TESTQ BP, BP + JZ done + IMULL DI, DX + MOVBLSX 1(AX), SI + ADDL SI, DX + XORL DX, BX + SUBQ $0x01, BP + TESTQ BP, BP + JZ done + IMULL DI, DX + MOVBLSX 2(AX), SI + ADDL SI, DX + XORL DX, BX + SUBQ $0x01, BP + TESTQ BP, BP + JZ done + IMULL DI, DX + MOVBLSX 3(AX), SI + ADDL SI, DX + XORL DX, BX + SUBQ $0x01, BP + TESTQ BP, BP + JZ done + +done: + MOVL CX, BP + MOVL $0xcc9e2d51, SI + IMULL SI, BP + RORL $0x11, BP + MOVL $0x1b873593, SI + IMULL SI, BP + XORL BP, BX + RORL $0x13, BX + LEAL (BX)(BX*4), BP + LEAL 3864292196(BP), BX + MOVL $0xcc9e2d51, BP + IMULL BP, DX + RORL $0x11, DX + MOVL $0x1b873593, BP + IMULL BP, DX + XORL DX, BX + RORL $0x13, BX + LEAL (BX)(BX*4), DX + LEAL 3864292196(DX), BX + MOVL BX, DX + SHRL $0x10, DX + XORL DX, BX + MOVL $0x85ebca6b, DX + IMULL DX, BX + MOVL BX, DX + SHRL $0x0d, DX + XORL DX, BX + MOVL $0xc2b2ae35, DX + IMULL DX, BX + MOVL BX, DX + SHRL $0x10, DX + XORL DX, BX + MOVL BX, ret+24(FP) + RET + +hash_5_12: + MOVL CX, DX + MOVL DX, BX + SHLL $0x02, BX + ADDL DX, BX + MOVL $0x00000009, BP + MOVL BX, SI + ADDL (AX), DX + MOVQ CX, DI + SUBQ $0x04, DI + ADDQ AX, DI + ADDL (DI), BX + MOVQ CX, DI + SHRQ $0x01, DI + ANDQ $0x04, DI + ADDQ AX, DI + 
ADDL (DI), BP + MOVL $0xcc9e2d51, DI + IMULL DI, DX + RORL $0x11, DX + MOVL $0x1b873593, DI + IMULL DI, DX + XORL DX, SI + RORL $0x13, SI + LEAL (SI)(SI*4), DX + LEAL 3864292196(DX), SI + MOVL $0xcc9e2d51, DX + IMULL DX, BX + RORL $0x11, BX + MOVL $0x1b873593, DX + IMULL DX, BX + XORL BX, SI + RORL $0x13, SI + LEAL (SI)(SI*4), BX + LEAL 3864292196(BX), SI + MOVL $0xcc9e2d51, DX + IMULL DX, BP + RORL $0x11, BP + MOVL $0x1b873593, DX + IMULL DX, BP + XORL BP, SI + RORL $0x13, SI + LEAL (SI)(SI*4), BP + LEAL 3864292196(BP), SI + MOVL SI, DX + SHRL $0x10, DX + XORL DX, SI + MOVL $0x85ebca6b, DX + IMULL DX, SI + MOVL SI, DX + SHRL $0x0d, DX + XORL DX, SI + MOVL $0xc2b2ae35, DX + IMULL DX, SI + MOVL SI, DX + SHRL $0x10, DX + XORL DX, SI + MOVL SI, ret+24(FP) + RET + +hash_13_24: + MOVQ CX, DX + SHRQ $0x01, DX + ADDQ AX, DX + MOVL -4(DX), BX + MOVL 4(AX), BP + MOVQ CX, SI + ADDQ AX, SI + MOVL -8(SI), DI + MOVL (DX), DX + MOVL (AX), R8 + MOVL -4(SI), SI + MOVL $0xcc9e2d51, R9 + IMULL DX, R9 + ADDL CX, R9 + RORL $0x0c, BX + ADDL SI, BX + MOVL DI, R10 + MOVL $0xcc9e2d51, R11 + IMULL R11, R10 + RORL $0x11, R10 + MOVL $0x1b873593, R11 + IMULL R11, R10 + XORL R10, R9 + RORL $0x13, R9 + LEAL (R9)(R9*4), R10 + LEAL 3864292196(R10), R9 + ADDL BX, R9 + RORL $0x03, BX + ADDL DI, BX + MOVL $0xcc9e2d51, DI + IMULL DI, R8 + RORL $0x11, R8 + MOVL $0x1b873593, DI + IMULL DI, R8 + XORL R8, R9 + RORL $0x13, R9 + LEAL (R9)(R9*4), R8 + LEAL 3864292196(R8), R9 + ADDL BX, R9 + ADDL SI, BX + RORL $0x0c, BX + ADDL DX, BX + MOVL $0xcc9e2d51, DX + IMULL DX, BP + RORL $0x11, BP + MOVL $0x1b873593, DX + IMULL DX, BP + XORL BP, R9 + RORL $0x13, R9 + LEAL (R9)(R9*4), BP + LEAL 3864292196(BP), R9 + ADDL BX, R9 + MOVL R9, DX + SHRL $0x10, DX + XORL DX, R9 + MOVL $0x85ebca6b, DX + IMULL DX, R9 + MOVL R9, DX + SHRL $0x0d, DX + XORL DX, R9 + MOVL $0xc2b2ae35, DX + IMULL DX, R9 + MOVL R9, DX + SHRL $0x10, DX + XORL DX, R9 + MOVL R9, ret+24(FP) + RET + +long: + MOVL CX, DX + MOVL $0xcc9e2d51, BX + IMULL DX, 
BX + MOVL BX, BP + MOVQ CX, SI + ADDQ AX, SI + MOVL $0xcc9e2d51, DI + MOVL $0x1b873593, R8 + MOVL -4(SI), R9 + IMULL DI, R9 + RORL $0x11, R9 + IMULL R8, R9 + XORL R9, DX + RORL $0x13, DX + MOVL DX, R9 + SHLL $0x02, R9 + ADDL R9, DX + ADDL $0xe6546b64, DX + MOVL -8(SI), R9 + IMULL DI, R9 + RORL $0x11, R9 + IMULL R8, R9 + XORL R9, BX + RORL $0x13, BX + MOVL BX, R9 + SHLL $0x02, R9 + ADDL R9, BX + ADDL $0xe6546b64, BX + MOVL -16(SI), R9 + IMULL DI, R9 + RORL $0x11, R9 + IMULL R8, R9 + XORL R9, DX + RORL $0x13, DX + MOVL DX, R9 + SHLL $0x02, R9 + ADDL R9, DX + ADDL $0xe6546b64, DX + MOVL -12(SI), R9 + IMULL DI, R9 + RORL $0x11, R9 + IMULL R8, R9 + XORL R9, BX + RORL $0x13, BX + MOVL BX, R9 + SHLL $0x02, R9 + ADDL R9, BX + ADDL $0xe6546b64, BX + PREFETCHT0 (AX) + MOVL -20(SI), SI + IMULL DI, SI + RORL $0x11, SI + IMULL R8, SI + ADDL SI, BP + RORL $0x13, BP + ADDL $0x71, BP + +loop80: + CMPQ CX, $0x64 + JL loop20 + PREFETCHT0 20(AX) + MOVL (AX), SI + ADDL SI, DX + MOVL 4(AX), DI + ADDL DI, BX + MOVL 8(AX), R8 + ADDL R8, BP + MOVL 12(AX), R9 + MOVL R9, R11 + MOVL $0xcc9e2d51, R10 + IMULL R10, R11 + RORL $0x11, R11 + MOVL $0x1b873593, R10 + IMULL R10, R11 + XORL R11, DX + RORL $0x13, DX + LEAL (DX)(DX*4), R11 + LEAL 3864292196(R11), DX + MOVL 16(AX), R10 + ADDL R10, DX + MOVL R8, R11 + MOVL $0xcc9e2d51, R8 + IMULL R8, R11 + RORL $0x11, R11 + MOVL $0x1b873593, R8 + IMULL R8, R11 + XORL R11, BX + RORL $0x13, BX + LEAL (BX)(BX*4), R11 + LEAL 3864292196(R11), BX + ADDL SI, BX + MOVL $0xcc9e2d51, SI + IMULL SI, R10 + MOVL R10, R11 + ADDL DI, R11 + MOVL $0xcc9e2d51, SI + IMULL SI, R11 + RORL $0x11, R11 + MOVL $0x1b873593, SI + IMULL SI, R11 + XORL R11, BP + RORL $0x13, BP + LEAL (BP)(BP*4), R11 + LEAL 3864292196(R11), BP + ADDL R9, BP + ADDL BX, BP + ADDL BP, BX + PREFETCHT0 40(AX) + MOVL 20(AX), SI + ADDL SI, DX + MOVL 24(AX), DI + ADDL DI, BX + MOVL 28(AX), R8 + ADDL R8, BP + MOVL 32(AX), R9 + MOVL R9, R11 + MOVL $0xcc9e2d51, R10 + IMULL R10, R11 + RORL $0x11, R11 + MOVL 
$0x1b873593, R10 + IMULL R10, R11 + XORL R11, DX + RORL $0x13, DX + LEAL (DX)(DX*4), R11 + LEAL 3864292196(R11), DX + MOVL 36(AX), R10 + ADDL R10, DX + MOVL R8, R11 + MOVL $0xcc9e2d51, R8 + IMULL R8, R11 + RORL $0x11, R11 + MOVL $0x1b873593, R8 + IMULL R8, R11 + XORL R11, BX + RORL $0x13, BX + LEAL (BX)(BX*4), R11 + LEAL 3864292196(R11), BX + ADDL SI, BX + MOVL $0xcc9e2d51, SI + IMULL SI, R10 + MOVL R10, R11 + ADDL DI, R11 + MOVL $0xcc9e2d51, SI + IMULL SI, R11 + RORL $0x11, R11 + MOVL $0x1b873593, SI + IMULL SI, R11 + XORL R11, BP + RORL $0x13, BP + LEAL (BP)(BP*4), R11 + LEAL 3864292196(R11), BP + ADDL R9, BP + ADDL BX, BP + ADDL BP, BX + PREFETCHT0 60(AX) + MOVL 40(AX), SI + ADDL SI, DX + MOVL 44(AX), DI + ADDL DI, BX + MOVL 48(AX), R8 + ADDL R8, BP + MOVL 52(AX), R9 + MOVL R9, R11 + MOVL $0xcc9e2d51, R10 + IMULL R10, R11 + RORL $0x11, R11 + MOVL $0x1b873593, R10 + IMULL R10, R11 + XORL R11, DX + RORL $0x13, DX + LEAL (DX)(DX*4), R11 + LEAL 3864292196(R11), DX + MOVL 56(AX), R10 + ADDL R10, DX + MOVL R8, R11 + MOVL $0xcc9e2d51, R8 + IMULL R8, R11 + RORL $0x11, R11 + MOVL $0x1b873593, R8 + IMULL R8, R11 + XORL R11, BX + RORL $0x13, BX + LEAL (BX)(BX*4), R11 + LEAL 3864292196(R11), BX + ADDL SI, BX + MOVL $0xcc9e2d51, SI + IMULL SI, R10 + MOVL R10, R11 + ADDL DI, R11 + MOVL $0xcc9e2d51, SI + IMULL SI, R11 + RORL $0x11, R11 + MOVL $0x1b873593, SI + IMULL SI, R11 + XORL R11, BP + RORL $0x13, BP + LEAL (BP)(BP*4), R11 + LEAL 3864292196(R11), BP + ADDL R9, BP + ADDL BX, BP + ADDL BP, BX + PREFETCHT0 80(AX) + MOVL 60(AX), SI + ADDL SI, DX + MOVL 64(AX), DI + ADDL DI, BX + MOVL 68(AX), R8 + ADDL R8, BP + MOVL 72(AX), R9 + MOVL R9, R11 + MOVL $0xcc9e2d51, R10 + IMULL R10, R11 + RORL $0x11, R11 + MOVL $0x1b873593, R10 + IMULL R10, R11 + XORL R11, DX + RORL $0x13, DX + LEAL (DX)(DX*4), R11 + LEAL 3864292196(R11), DX + MOVL 76(AX), R10 + ADDL R10, DX + MOVL R8, R11 + MOVL $0xcc9e2d51, R8 + IMULL R8, R11 + RORL $0x11, R11 + MOVL $0x1b873593, R8 + IMULL R8, R11 + XORL R11, BX 
+ RORL $0x13, BX + LEAL (BX)(BX*4), R11 + LEAL 3864292196(R11), BX + ADDL SI, BX + MOVL $0xcc9e2d51, SI + IMULL SI, R10 + MOVL R10, R11 + ADDL DI, R11 + MOVL $0xcc9e2d51, SI + IMULL SI, R11 + RORL $0x11, R11 + MOVL $0x1b873593, SI + IMULL SI, R11 + XORL R11, BP + RORL $0x13, BP + LEAL (BP)(BP*4), R11 + LEAL 3864292196(R11), BP + ADDL R9, BP + ADDL BX, BP + ADDL BP, BX + ADDQ $0x50, AX + SUBQ $0x50, CX + JMP loop80 + +loop20: + CMPQ CX, $0x14 + JLE after + MOVL (AX), SI + ADDL SI, DX + MOVL 4(AX), DI + ADDL DI, BX + MOVL 8(AX), R8 + ADDL R8, BP + MOVL 12(AX), R9 + MOVL R9, R11 + MOVL $0xcc9e2d51, R10 + IMULL R10, R11 + RORL $0x11, R11 + MOVL $0x1b873593, R10 + IMULL R10, R11 + XORL R11, DX + RORL $0x13, DX + LEAL (DX)(DX*4), R11 + LEAL 3864292196(R11), DX + MOVL 16(AX), R10 + ADDL R10, DX + MOVL R8, R11 + MOVL $0xcc9e2d51, R8 + IMULL R8, R11 + RORL $0x11, R11 + MOVL $0x1b873593, R8 + IMULL R8, R11 + XORL R11, BX + RORL $0x13, BX + LEAL (BX)(BX*4), R11 + LEAL 3864292196(R11), BX + ADDL SI, BX + MOVL $0xcc9e2d51, SI + IMULL SI, R10 + MOVL R10, R11 + ADDL DI, R11 + MOVL $0xcc9e2d51, SI + IMULL SI, R11 + RORL $0x11, R11 + MOVL $0x1b873593, SI + IMULL SI, R11 + XORL R11, BP + RORL $0x13, BP + LEAL (BP)(BP*4), R11 + LEAL 3864292196(R11), BP + ADDL R9, BP + ADDL BX, BP + ADDL BP, BX + ADDQ $0x14, AX + SUBQ $0x14, CX + JMP loop20 + +after: + MOVL $0xcc9e2d51, AX + RORL $0x0b, BX + IMULL AX, BX + RORL $0x11, BX + IMULL AX, BX + RORL $0x0b, BP + IMULL AX, BP + RORL $0x11, BP + IMULL AX, BP + ADDL BX, DX + RORL $0x13, DX + MOVL DX, CX + SHLL $0x02, CX + ADDL CX, DX + ADDL $0xe6546b64, DX + RORL $0x11, DX + IMULL AX, DX + ADDL BP, DX + RORL $0x13, DX + MOVL DX, CX + SHLL $0x02, CX + ADDL CX, DX + ADDL $0xe6546b64, DX + RORL $0x11, DX + IMULL AX, DX + MOVL DX, ret+24(FP) + RET diff --git a/vendor/github.com/dgryski/go-farm/fp_generic.go b/vendor/github.com/dgryski/go-farm/fp_generic.go new file mode 100644 index 0000000000..2cfa1b9dcb --- /dev/null +++ 
b/vendor/github.com/dgryski/go-farm/fp_generic.go @@ -0,0 +1,13 @@ +// +build !amd64 purego + +package farm + +// Fingerprint64 is a 64-bit fingerprint function for byte-slices +func Fingerprint64(s []byte) uint64 { + return naHash64(s) +} + +// Fingerprint32 is a 32-bit fingerprint function for byte-slices +func Fingerprint32(s []byte) uint32 { + return Hash32(s) +} diff --git a/vendor/github.com/dgryski/go-farm/fp_stub.go b/vendor/github.com/dgryski/go-farm/fp_stub.go new file mode 100644 index 0000000000..94fff8de5a --- /dev/null +++ b/vendor/github.com/dgryski/go-farm/fp_stub.go @@ -0,0 +1,9 @@ +// Code generated by command: go run asm.go -out=fp_amd64.s -stubs=fp_stub.go. DO NOT EDIT. + +// +build amd64,!purego + +package farm + +func Fingerprint64(s []byte) uint64 + +func Fingerprint32(s []byte) uint32 diff --git a/vendor/github.com/philippgille/gokv/.gitignore b/vendor/github.com/philippgille/gokv/.gitignore new file mode 100644 index 0000000000..e65384f91d --- /dev/null +++ b/vendor/github.com/philippgille/gokv/.gitignore @@ -0,0 +1,21 @@ +# Binaries for programs and plugins +*.exe +*.exe~ +*.dll +*.so +*.dylib + +# Test binary, build with `go test -c` +*.test + +# Output of the go coverage tool, specifically when used with LiteIDE +*.out + +### Own changes + +# Test coverage file +coverage.txt +# LevelDB DB files that are created during LevelDB tests +/leveldb/leveldb + +/examples/examples diff --git a/vendor/github.com/philippgille/gokv/.golangci.yml b/vendor/github.com/philippgille/gokv/.golangci.yml new file mode 100644 index 0000000000..9ed6c49643 --- /dev/null +++ b/vendor/github.com/philippgille/gokv/.golangci.yml @@ -0,0 +1,17 @@ +# https://github.com/golangci/golangci-lint#config-file +issues: + # Excluding configuration per-path and per-linter + exclude-rules: + # Ease some staticcheck warnings. 
+ - path: (bigcache/bigcache_test\.go|bigcache\\bigcache_test\.go) + text: SA5001 + linters: + - staticcheck + - path: (mysql/mysql_test\.go|mysql\\mysql_test\.go) + text: SA5001 + linters: + - staticcheck + - path: (mysql/mysql\.go|mysql\\mysql\.go) + text: SA5001.*tempDB\.Close\(\) + linters: + - staticcheck diff --git a/vendor/github.com/philippgille/gokv/.travis.yml b/vendor/github.com/philippgille/gokv/.travis.yml new file mode 100644 index 0000000000..22ff1bded3 --- /dev/null +++ b/vendor/github.com/philippgille/gokv/.travis.yml @@ -0,0 +1,56 @@ +# Sudo is required for running Docker +sudo: required + +services: + # Docker is required for running some services that aren't provided by Travis CI, e.g. Consul + - docker + - redis-server + - mongodb + - memcached + - mysql + +git: + depth: 1 + +language: go + +go: + - "1.12" + - "1.13" + +env: + # For encrypted environment variables, use: + # travis encrypt 'PASSWORD="SECRET"' -r philippgille/gokv + global: + - GO111MODULE=on + # "DynamoDB local" accepts any credentials + - AWS_ACCESS_KEY_ID=user + - AWS_SECRET_ACCESS_KEY=secret + +before_install: + - go version + - go env + +# Overwrite the `install` phase because the default one runs `travis_install_go_dependencies`, +# which leads to errors in case some subdirectories are Go modules and others are regular Go packages +install: + - echo "Skipping default install phase" + +# Don't start Docker containers all at once. The Travis CI VM doesn't have enough memory for that. +# TODO: Implement a proper wait (e.g. 
with container health check) instead of sleeping for 10s +script: + # Build + - build/build.sh + + # Linter + - curl -sfL https://install.goreleaser.com/github.com/golangci/golangci-lint.sh | sh -s -- -b $(go env GOPATH)/bin v1.15.0 + - golangci-lint run + + # Test + - build/test.sh + +after_success: + # Combine coverage reports + - build/combine-coverage.sh + # Upload coverage data to codecov.io + - bash <(curl -s https://codecov.io/bash) diff --git a/vendor/github.com/philippgille/gokv/LICENSE b/vendor/github.com/philippgille/gokv/LICENSE new file mode 100644 index 0000000000..a612ad9813 --- /dev/null +++ b/vendor/github.com/philippgille/gokv/LICENSE @@ -0,0 +1,373 @@ +Mozilla Public License Version 2.0 +================================== + +1. Definitions +-------------- + +1.1. "Contributor" + means each individual or legal entity that creates, contributes to + the creation of, or owns Covered Software. + +1.2. "Contributor Version" + means the combination of the Contributions of others (if any) used + by a Contributor and that particular Contributor's Contribution. + +1.3. "Contribution" + means Covered Software of a particular Contributor. + +1.4. "Covered Software" + means Source Code Form to which the initial Contributor has attached + the notice in Exhibit A, the Executable Form of such Source Code + Form, and Modifications of such Source Code Form, in each case + including portions thereof. + +1.5. "Incompatible With Secondary Licenses" + means + + (a) that the initial Contributor has attached the notice described + in Exhibit B to the Covered Software; or + + (b) that the Covered Software was made available under the terms of + version 1.1 or earlier of the License, but not also under the + terms of a Secondary License. + +1.6. "Executable Form" + means any form of the work other than Source Code Form. + +1.7. "Larger Work" + means a work that combines Covered Software with other material, in + a separate file or files, that is not Covered Software. 
+ +1.8. "License" + means this document. + +1.9. "Licensable" + means having the right to grant, to the maximum extent possible, + whether at the time of the initial grant or subsequently, any and + all of the rights conveyed by this License. + +1.10. "Modifications" + means any of the following: + + (a) any file in Source Code Form that results from an addition to, + deletion from, or modification of the contents of Covered + Software; or + + (b) any new file in Source Code Form that contains any Covered + Software. + +1.11. "Patent Claims" of a Contributor + means any patent claim(s), including without limitation, method, + process, and apparatus claims, in any patent Licensable by such + Contributor that would be infringed, but for the grant of the + License, by the making, using, selling, offering for sale, having + made, import, or transfer of either its Contributions or its + Contributor Version. + +1.12. "Secondary License" + means either the GNU General Public License, Version 2.0, the GNU + Lesser General Public License, Version 2.1, the GNU Affero General + Public License, Version 3.0, or any later versions of those + licenses. + +1.13. "Source Code Form" + means the form of the work preferred for making modifications. + +1.14. "You" (or "Your") + means an individual or a legal entity exercising rights under this + License. For legal entities, "You" includes any entity that + controls, is controlled by, or is under common control with You. For + purposes of this definition, "control" means (a) the power, direct + or indirect, to cause the direction or management of such entity, + whether by contract or otherwise, or (b) ownership of more than + fifty percent (50%) of the outstanding shares or beneficial + ownership of such entity. + +2. License Grants and Conditions +-------------------------------- + +2.1. 
Grants + +Each Contributor hereby grants You a world-wide, royalty-free, +non-exclusive license: + +(a) under intellectual property rights (other than patent or trademark) + Licensable by such Contributor to use, reproduce, make available, + modify, display, perform, distribute, and otherwise exploit its + Contributions, either on an unmodified basis, with Modifications, or + as part of a Larger Work; and + +(b) under Patent Claims of such Contributor to make, use, sell, offer + for sale, have made, import, and otherwise transfer either its + Contributions or its Contributor Version. + +2.2. Effective Date + +The licenses granted in Section 2.1 with respect to any Contribution +become effective for each Contribution on the date the Contributor first +distributes such Contribution. + +2.3. Limitations on Grant Scope + +The licenses granted in this Section 2 are the only rights granted under +this License. No additional rights or licenses will be implied from the +distribution or licensing of Covered Software under this License. +Notwithstanding Section 2.1(b) above, no patent license is granted by a +Contributor: + +(a) for any code that a Contributor has removed from Covered Software; + or + +(b) for infringements caused by: (i) Your and any other third party's + modifications of Covered Software, or (ii) the combination of its + Contributions with other software (except as part of its Contributor + Version); or + +(c) under Patent Claims infringed by Covered Software in the absence of + its Contributions. + +This License does not grant any rights in the trademarks, service marks, +or logos of any Contributor (except as may be necessary to comply with +the notice requirements in Section 3.4). + +2.4. 
Subsequent Licenses + +No Contributor makes additional grants as a result of Your choice to +distribute the Covered Software under a subsequent version of this +License (see Section 10.2) or under the terms of a Secondary License (if +permitted under the terms of Section 3.3). + +2.5. Representation + +Each Contributor represents that the Contributor believes its +Contributions are its original creation(s) or it has sufficient rights +to grant the rights to its Contributions conveyed by this License. + +2.6. Fair Use + +This License is not intended to limit any rights You have under +applicable copyright doctrines of fair use, fair dealing, or other +equivalents. + +2.7. Conditions + +Sections 3.1, 3.2, 3.3, and 3.4 are conditions of the licenses granted +in Section 2.1. + +3. Responsibilities +------------------- + +3.1. Distribution of Source Form + +All distribution of Covered Software in Source Code Form, including any +Modifications that You create or to which You contribute, must be under +the terms of this License. You must inform recipients that the Source +Code Form of the Covered Software is governed by the terms of this +License, and how they can obtain a copy of this License. You may not +attempt to alter or restrict the recipients' rights in the Source Code +Form. + +3.2. 
Distribution of Executable Form + +If You distribute Covered Software in Executable Form then: + +(a) such Covered Software must also be made available in Source Code + Form, as described in Section 3.1, and You must inform recipients of + the Executable Form how they can obtain a copy of such Source Code + Form by reasonable means in a timely manner, at a charge no more + than the cost of distribution to the recipient; and + +(b) You may distribute such Executable Form under the terms of this + License, or sublicense it under different terms, provided that the + license for the Executable Form does not attempt to limit or alter + the recipients' rights in the Source Code Form under this License. + +3.3. Distribution of a Larger Work + +You may create and distribute a Larger Work under terms of Your choice, +provided that You also comply with the requirements of this License for +the Covered Software. If the Larger Work is a combination of Covered +Software with a work governed by one or more Secondary Licenses, and the +Covered Software is not Incompatible With Secondary Licenses, this +License permits You to additionally distribute such Covered Software +under the terms of such Secondary License(s), so that the recipient of +the Larger Work may, at their option, further distribute the Covered +Software under the terms of either this License or such Secondary +License(s). + +3.4. Notices + +You may not remove or alter the substance of any license notices +(including copyright notices, patent notices, disclaimers of warranty, +or limitations of liability) contained within the Source Code Form of +the Covered Software, except that You may alter any license notices to +the extent required to remedy known factual inaccuracies. + +3.5. Application of Additional Terms + +You may choose to offer, and to charge a fee for, warranty, support, +indemnity or liability obligations to one or more recipients of Covered +Software. 
However, You may do so only on Your own behalf, and not on +behalf of any Contributor. You must make it absolutely clear that any +such warranty, support, indemnity, or liability obligation is offered by +You alone, and You hereby agree to indemnify every Contributor for any +liability incurred by such Contributor as a result of warranty, support, +indemnity or liability terms You offer. You may include additional +disclaimers of warranty and limitations of liability specific to any +jurisdiction. + +4. Inability to Comply Due to Statute or Regulation +--------------------------------------------------- + +If it is impossible for You to comply with any of the terms of this +License with respect to some or all of the Covered Software due to +statute, judicial order, or regulation then You must: (a) comply with +the terms of this License to the maximum extent possible; and (b) +describe the limitations and the code they affect. Such description must +be placed in a text file included with all distributions of the Covered +Software under this License. Except to the extent prohibited by statute +or regulation, such description must be sufficiently detailed for a +recipient of ordinary skill to be able to understand it. + +5. Termination +-------------- + +5.1. The rights granted under this License will terminate automatically +if You fail to comply with any of its terms. However, if You become +compliant, then the rights granted under this License from a particular +Contributor are reinstated (a) provisionally, unless and until such +Contributor explicitly and finally terminates Your grants, and (b) on an +ongoing basis, if such Contributor fails to notify You of the +non-compliance by some reasonable means prior to 60 days after You have +come back into compliance. 
Moreover, Your grants from a particular +Contributor are reinstated on an ongoing basis if such Contributor +notifies You of the non-compliance by some reasonable means, this is the +first time You have received notice of non-compliance with this License +from such Contributor, and You become compliant prior to 30 days after +Your receipt of the notice. + +5.2. If You initiate litigation against any entity by asserting a patent +infringement claim (excluding declaratory judgment actions, +counter-claims, and cross-claims) alleging that a Contributor Version +directly or indirectly infringes any patent, then the rights granted to +You by any and all Contributors for the Covered Software under Section +2.1 of this License shall terminate. + +5.3. In the event of termination under Sections 5.1 or 5.2 above, all +end user license agreements (excluding distributors and resellers) which +have been validly granted by You or Your distributors under this License +prior to termination shall survive termination. + +************************************************************************ +* * +* 6. Disclaimer of Warranty * +* ------------------------- * +* * +* Covered Software is provided under this License on an "as is" * +* basis, without warranty of any kind, either expressed, implied, or * +* statutory, including, without limitation, warranties that the * +* Covered Software is free of defects, merchantable, fit for a * +* particular purpose or non-infringing. The entire risk as to the * +* quality and performance of the Covered Software is with You. * +* Should any Covered Software prove defective in any respect, You * +* (not any Contributor) assume the cost of any necessary servicing, * +* repair, or correction. This disclaimer of warranty constitutes an * +* essential part of this License. No use of any Covered Software is * +* authorized under this License except under this disclaimer. 
* +* * +************************************************************************ + +************************************************************************ +* * +* 7. Limitation of Liability * +* -------------------------- * +* * +* Under no circumstances and under no legal theory, whether tort * +* (including negligence), contract, or otherwise, shall any * +* Contributor, or anyone who distributes Covered Software as * +* permitted above, be liable to You for any direct, indirect, * +* special, incidental, or consequential damages of any character * +* including, without limitation, damages for lost profits, loss of * +* goodwill, work stoppage, computer failure or malfunction, or any * +* and all other commercial damages or losses, even if such party * +* shall have been informed of the possibility of such damages. This * +* limitation of liability shall not apply to liability for death or * +* personal injury resulting from such party's negligence to the * +* extent applicable law prohibits such limitation. Some * +* jurisdictions do not allow the exclusion or limitation of * +* incidental or consequential damages, so this exclusion and * +* limitation may not apply to You. * +* * +************************************************************************ + +8. Litigation +------------- + +Any litigation relating to this License may be brought only in the +courts of a jurisdiction where the defendant maintains its principal +place of business and such litigation shall be governed by laws of that +jurisdiction, without reference to its conflict-of-law provisions. +Nothing in this Section shall prevent a party's ability to bring +cross-claims or counter-claims. + +9. Miscellaneous +---------------- + +This License represents the complete agreement concerning the subject +matter hereof. If any provision of this License is held to be +unenforceable, such provision shall be reformed only to the extent +necessary to make it enforceable. 
Any law or regulation which provides +that the language of a contract shall be construed against the drafter +shall not be used to construe this License against a Contributor. + +10. Versions of the License +--------------------------- + +10.1. New Versions + +Mozilla Foundation is the license steward. Except as provided in Section +10.3, no one other than the license steward has the right to modify or +publish new versions of this License. Each version will be given a +distinguishing version number. + +10.2. Effect of New Versions + +You may distribute the Covered Software under the terms of the version +of the License under which You originally received the Covered Software, +or under the terms of any subsequent version published by the license +steward. + +10.3. Modified Versions + +If you create software not governed by this License, and you want to +create a new license for such software, you may create and use a +modified version of this License if you rename the license and remove +any references to the name of the license steward (except to note that +such modified license differs from this License). + +10.4. Distributing Source Code Form that is Incompatible With Secondary +Licenses + +If You choose to distribute Source Code Form that is Incompatible With +Secondary Licenses under the terms of this version of the License, the +notice described in Exhibit B of this License must be attached. + +Exhibit A - Source Code Form License Notice +------------------------------------------- + + This Source Code Form is subject to the terms of the Mozilla Public + License, v. 2.0. If a copy of the MPL was not distributed with this + file, You can obtain one at http://mozilla.org/MPL/2.0/. + +If it is not possible or desirable to put the notice in a particular +file, then You may include the notice in a location (such as a LICENSE +file in a relevant directory) where a recipient would be likely to look +for such a notice. 
+ +You may add additional accurate notices of copyright ownership. + +Exhibit B - "Incompatible With Secondary Licenses" Notice +--------------------------------------------------------- + + This Source Code Form is "Incompatible With Secondary Licenses", as + defined by the Mozilla Public License, v. 2.0. diff --git a/vendor/github.com/philippgille/gokv/README.md b/vendor/github.com/philippgille/gokv/README.md new file mode 100644 index 0000000000..b08beccb61 --- /dev/null +++ b/vendor/github.com/philippgille/gokv/README.md @@ -0,0 +1,305 @@ +gokv +==== + +[![GoDoc](http://www.godoc.org/github.com/philippgille/gokv?status.svg)](http://www.godoc.org/github.com/philippgille/gokv) [![Build Status](https://travis-ci.org/philippgille/gokv.svg?branch=master)](https://travis-ci.org/philippgille/gokv) [![Go Report Card](https://goreportcard.com/badge/github.com/philippgille/gokv)](https://goreportcard.com/report/github.com/philippgille/gokv) [![codecov](https://codecov.io/gh/philippgille/gokv/branch/master/graph/badge.svg)](https://codecov.io/gh/philippgille/gokv) [![GitHub Releases](https://img.shields.io/github/release/philippgille/gokv.svg)](https://github.com/philippgille/gokv/releases) [![Mentioned in Awesome Go](https://awesome.re/mentioned-badge.svg)](https://github.com/avelino/awesome-go) + +Simple key-value store abstraction and implementations for Go + +Contents +-------- + +1. [Features](#features) + 1. [Simple interface](#simple-interface) + 2. [Implementations](#implementations) + 3. [Value types](#value-types) + 4. [Marshal formats](#marshal-formats) + 5. [Roadmap](#roadmap) +2. [Usage](#usage) +3. [Project status](#project-status) +4. [Motivation](#motivation) +5. [Design decisions](#design-decisions) +6. [Related projects](#related-projects) + +Features +-------- + +### Simple interface + +> Note: The interface is not final yet! See [Project status](#project-status) for details. 
+ +```go +type Store interface { + Set(k string, v interface{}) error + Get(k string, v interface{}) (found bool, err error) + Delete(k string) error + Close() error +} +``` + +There are detailed descriptions of the methods in the [docs](https://www.godoc.org/github.com/philippgille/gokv#Store) and in the [code](https://github.com/philippgille/gokv/blob/master/store.go). You should read them if you plan to write your own `gokv.Store` implementation or if you create a Go package with a method that takes a `gokv.Store` as parameter, so you know exactly what happens in the background. + +### Implementations + +Some of the following databases aren't specifically engineered for storing key-value pairs, but if someone's running them already for other purposes and doesn't want to set up one of the proper key-value stores due to administrative overhead etc., they can of course be used as well. In those cases let's focus on a few of the most popular though. This mostly goes for the SQL, NoSQL and NewSQL categories. + +Feel free to suggest more stores by creating an [issue](https://github.com/philippgille/gokv/issues) or even add an actual implementation - [![PRs Welcome](https://img.shields.io/badge/PRs-welcome-brightgreen.svg)](http://makeapullrequest.com). + +For differences between the implementations, see [Choosing an implementation](docs/choosing-implementation.md). +For the GoDoc of specific implementations, see [https://www.godoc.org/github.com/philippgille/gokv#pkg-subdirectories](https://www.godoc.org/github.com/philippgille/gokv#pkg-subdirectories). 
+ +- Local in-memory + - [X] Go `sync.Map` + - [X] Go `map` (with `sync.RWMutex`) + - [X] [FreeCache](https://github.com/coocood/freecache) + - [X] [BigCache](https://github.com/allegro/bigcache) +- Embedded + - [X] [bbolt](https://github.com/etcd-io/bbolt) (formerly known as [Bolt / Bolt DB](https://github.com/boltdb/bolt)) + - [X] [BadgerDB](https://github.com/dgraph-io/badger) + - [X] [LevelDB / goleveldb](https://github.com/syndtr/goleveldb) + - [X] Local files (one file per key-value pair, with the key being the filename and the value being the file content) +- Distributed store + - [X] [Redis](https://github.com/antirez/redis) + - [X] [Consul](https://github.com/hashicorp/consul) + - [X] [etcd](https://github.com/etcd-io/etcd) + - [X] [Apache ZooKeeper](https://github.com/apache/zookeeper) + - [ ] [TiKV](https://github.com/tikv/tikv) +- Distributed cache (no persistence *by default*) + - [X] [Memcached](https://github.com/memcached/memcached) + - [X] [Hazelcast](https://github.com/hazelcast/hazelcast) +- Cloud + - [X] [Amazon DynamoDB](https://aws.amazon.com/dynamodb/) + - [X] [Amazon S3](https://aws.amazon.com/s3/) / [Google Cloud Storage](https://cloud.google.com/storage/) / [Alibaba Cloud Object Storage Service (OSS)](https://www.alibabacloud.com/en/product/oss) / [DigitalOcean Spaces](https://www.digitalocean.com/products/spaces/) / [Scaleway Object Storage](https://www.scaleway.com/object-storage/) / [OpenStack Swift](https://github.com/openstack/swift) / [Ceph](https://github.com/ceph/ceph) / [Minio](https://github.com/minio/minio) / ... 
+ - [ ] [Azure Cosmos DB](https://azure.microsoft.com/en-us/services/cosmos-db/) + - [X] [Azure Table Storage](https://azure.microsoft.com/en-us/services/storage/tables/) + - [X] [Google Cloud Datastore](https://cloud.google.com/datastore/) + - [ ] [Google Cloud Firestore](https://cloud.google.com/firestore/) + - [X] [Alibaba Cloud Table Store](https://www.alibabacloud.com/de/product/table-store) +- SQL + - [X] [MySQL](https://github.com/mysql/mysql-server) + - [X] [PostgreSQL](https://github.com/postgres/postgres) +- NoSQL + - [X] [MongoDB](https://github.com/mongodb/mongo) + - [ ] [Apache Cassandra](https://github.com/apache/cassandra) +- "NewSQL" + - [X] [CockroachDB](https://github.com/cockroachdb/cockroach) + - [ ] [TiDB](https://github.com/pingcap/tidb) +- Multi-model + - [X] [Apache Ignite](https://github.com/apache/ignite) + - [ ] [ArangoDB](https://github.com/arangodb/arangodb) + - [ ] [OrientDB](https://github.com/orientechnologies/orientdb) + +Again: +For differences between the implementations, see [Choosing an implementation](docs/choosing-implementation.md). +For the GoDoc of specific implementations, see [https://www.godoc.org/github.com/philippgille/gokv#pkg-subdirectories](https://www.godoc.org/github.com/philippgille/gokv#pkg-subdirectories). + +### Value types + +Most Go packages for key-value stores just accept a `[]byte` as value, which requires developers for example to marshal (and later unmarshal) their structs. `gokv` is meant to be simple and make developers' lives easier, so it accepts any type (by using `interface{}` as parameter), including structs, and automatically (un-)marshals the value. + +The kind of (un-)marshalling is left to the implementation. All implementations in this repository currently support JSON and [gob](https://blog.golang.org/gobs-of-data) by using the `encoding` subpackage in this repository, which wraps the core functionality of the standard library's `encoding/json` and `encoding/gob` packages. 
See [Marshal formats](#marshal-formats) for details. + +For unexported struct fields to be (un-)marshalled to/from JSON/gob, the respective custom (un-)marshalling methods need to be implemented as methods of the struct (e.g. `MarshalJSON() ([]byte, error)` for custom marshalling into JSON). See [Marshaler](https://godoc.org/encoding/json#Marshaler) and [Unmarshaler](https://godoc.org/encoding/json#Unmarshaler) for JSON, and [GobEncoder](https://godoc.org/encoding/gob#GobEncoder) and [GobDecoder](https://godoc.org/encoding/gob#GobDecoder) for gob. + +To improve performance you can also implement the custom (un-)marshalling methods so that no reflection is used by the `encoding/json` / `encoding/gob` packages. This is not a disadvantage of using a generic key-value store package, it's the same as if you would use a concrete key-value store package which only accepts `[]byte`, requiring you to (un-)marshal your structs. + +### Marshal formats + +This repository contains the subpackage `encoding`, which is an abstraction and wrapper for the core functionality of packages like `encoding/json` and `encoding/gob`. The currently supported marshal formats are: + +- [X] JSON +- [X] [gob](https://blog.golang.org/gobs-of-data) + +More formats will be supported in the future (e.g. XML). + +The stores use this `encoding` package to marshal and unmarshal the values when storing / retrieving them. The default format is JSON, but all `gokv.Store` implementations in this repository also support [gob](https://blog.golang.org/gobs-of-data) as alternative, configurable via their `Options`. + +The marshal format is up to the implementations though, so package creators using the `gokv.Store` interface as parameter of a function should not make any assumptions about this. If they require any specific format they should inform the package user about this in the GoDoc of the function taking the store interface as parameter. 
+ +Differences between the formats: + +- Depending on the struct, one of the formats might be faster +- Depending on the struct, one of the formats might lead to a lower storage size +- Depending on the use case, the custom (un-)marshal methods of one of the formats might be easier to implement + - JSON: [`MarshalJSON() ([]byte, error)`](https://godoc.org/encoding/json#Marshaler) and [`UnmarshalJSON([]byte) error`](https://godoc.org/encoding/json#Unmarshaler) + - gob: [`GobEncode() ([]byte, error)`](https://godoc.org/encoding/gob#GobEncoder) and [`GobDecode([]byte) error`](https://godoc.org/encoding/gob#GobDecoder) + +### Roadmap + +- Benchmarks! +- CLI: A simple command line interface tool that allows you to create, read, update and delete key-value pairs in all of the `gokv` storages +- A `combiner` package that allows you to create a `gokv.Store` which forwards its calls to multiple implementations at the same time. So for example you can use `memcached` and `s3` simultaneously to have 1) super fast access but also 2) durable redundant persistent storage. +- A way to directly configure the clients via the options of the underlying used Go package (e.g. not the `redis.Options` struct in `github.com/philippgille/gokv`, but instead the `redis.Options` struct in `github.com/go-redis/redis`) + - Will be optional and discouraged, because this will lead to compile errors in code that uses `gokv` when switching the underlying used Go package, but definitely useful for some people +- More stores (see stores in [Implementations](#implementations) list with unchecked boxes) +- Maybe rename the project from `gokv` to `SimpleKV`? +- Maybe move all implementation packages into a subdirectory, e.g. `github.com/philippgille/gokv/store/redis`? 
+ +Usage +----- + +First, download the [module](https://github.com/golang/go/wiki/Modules) you want to work with: + +- For example when you want to work with the `gokv.Store` interface: + - `go get github.com/philippgille/gokv@latest` +- For example when you want to work with the Redis implementation: + - `go get github.com/philippgille/gokv/redis@latest` + +Then you can import and use it. + +Every implementation has its own `Options` struct, but all implementations have a `NewStore()` / `NewClient()` function that returns an object of a struct that implements the `gokv.Store` interface. Let's take the implementation for Redis as example, which is the most popular distributed key-value store. + +```go +package main + +import ( + "fmt" + + "github.com/philippgille/gokv" + "github.com/philippgille/gokv/redis" +) + +type foo struct { + Bar string +} + +func main() { + options := redis.DefaultOptions // Address: "localhost:6379", Password: "", DB: 0 + + // Create client + client, err := redis.NewClient(options) + if err != nil { + panic(err) + } + defer client.Close() + + // Store, retrieve, print and delete a value + interactWithStore(client) +} + +// interactWithStore stores, retrieves, prints and deletes a value. +// It's completely independent of the store implementation. +func interactWithStore(store gokv.Store) { + // Store value + val := foo{ + Bar: "baz", + } + err := store.Set("foo123", val) + if err != nil { + panic(err) + } + + // Retrieve value + retrievedVal := new(foo) + found, err := store.Get("foo123", retrievedVal) + if err != nil { + panic(err) + } + if !found { + panic("Value not found") + } + + fmt.Printf("foo: %+v", *retrievedVal) // Prints `foo: {Bar:baz}` + + // Delete value + err = store.Delete("foo123") + if err != nil { + panic(err) + } +} +``` + +As described in the comments, that code does the following: + +1. 
Create a client for Redis + - Some implementations' stores/clients don't require to be closed, but when working with the interface (for example as function parameter) you *must* call `Close()` because you don't know which implementation is passed. Even if you work with a specific implementation you *should* always call `Close()`, so you can easily change the implementation without the risk of forgetting to add the call. +2. Call `interactWithStore()`, which requires a `gokv.Store` as parameter. This method then: + 1. Stores an object of type `foo` in the Redis server running on `localhost:6379` with the key `foo123` + 2. Retrieves the value for the key `foo123` + - The check if the value was found isn't needed in this example but is included for demonstration purposes + 3. Prints the value. It prints `foo: {Bar:baz}`, which is exactly what was stored before. + 4. Deletes the value + +Now let's say you don't want to use Redis but Consul instead. You just have to make three simple changes: + +1. Replace the import of `"github.com/philippgille/gokv/redis"` by `"github.com/philippgille/gokv/consul"` +2. Replace `redis.DefaultOptions` by `consul.DefaultOptions` +3. Replace `redis.NewClient(options)` by `consul.NewClient(options)` + +Everything else works the same way. `interactWithStore()` is completely unaffected. + +Project status +-------------- + +> Note: `gokv`'s API is not stable yet and is under active development. Upcoming releases are likely to contain breaking changes as long as the version is `v0.x.y`. You should use vendoring to prevent bad surprises. This project adheres to [Semantic Versioning](http://semver.org/spec/v2.0.0.html) and all notable changes to this project are documented in [RELEASES.md](https://github.com/philippgille/gokv/blob/master/RELEASES.md). + +Planned interface methods until `v1.0.0`: + +- `List(interface{}) error` / `GetAll(interface{}) error` or similar + +The interface might even change until `v1.0.0`. 
For example one consideration is to change `Get(string, interface{}) (bool, error)` to `Get(string, interface{}) error` (no boolean return value anymore), with the `error` being something like `gokv.ErrNotFound // "Key-value pair not found"` to fulfill the additional role of indicating that the key-value pair wasn't found. But at the moment we prefer the current method signature. + +Also, more interfaces might be added. For example so that there's a `SimpleStore` and an `AdvancedStore`, with the first one containing only the basic methods and the latter one with advanced features such as key-value pair lifetimes (deletion of key-value pairs after a given time), notification of value changes via Go channels etc. But currently the focus is simplicity, see [Design decisions](#design-decisions). + +Motivation +---------- + +When creating a package you want the package to be usable by as many developers as possible. Let's look at a specific example: You want to create a paywall middleware for the Gin web framework. You need some database to store state. You can't use a Go map, because its data is not persisted across web service restarts. You can't use an embedded DB like bbolt, BadgerDB or SQLite, because that would restrict the web service to one instance, but nowadays every web service is designed with high horizontal scalability in mind. If you use Redis, MongoDB or PostgreSQL though, you would force the package user (the developer who creates the actual web service with Gin and your middleware) to run and administrate the server, even if she might never have used it before and doesn't know how to configure them for high performance and security. + +Any decision for a specific database would limit the package's usability. + +One solution would be a custom interface where you would leave the implementation to the package user. But that would require the developer to dive into the details of the Go package of the chosen key-value store. 
And if the developer wants to switch the store, or maybe use one for local testing and another for production, she would need to write *multiple* implementations. + +`gokv` is the solution for these problems. Package *creators* use the `gokv.Store` interface as parameter and can call its methods within their code, leaving the decision which actual store to use to the package user. Package *users* pick one of the implementations, for example `github.com/philippgille/gokv/redis` for Redis and pass the `redis.Client` created by `redis.NewClient(...)` as parameter. Package users can also develop their own implementations if they need to. + +`gokv` doesn't just have to be used to satisfy some `gokv.Store` parameter. It can of course also be used by application / web service developers who just don't want to dive into the sometimes complicated usage of some key-value store packages. + +Initially it was developed as `storage` package within the project [ln-paywall](https://github.com/philippgille/ln-paywall) to provide the users of ln-paywall with multiple storage options, but at some point it made sense to turn it into a repository of its own. + +Before doing so I examined existing Go packages with a similar purpose (see [Related projects](#related-projects)), but none of them fit my needs. They either had too few implementations, or they didn't automatically marshal / unmarshal passed structs, or the interface had too many methods, making the project seem too complex to maintain and extend, proven by some that were abandoned or forked (splitting the community with it). + +Design decisions +---------------- + +- `gokv` is primarily an abstraction for **key-value stores**, not caches, so there's no need for cache eviction and timeouts. + - It's still possible to have cache eviction. In some cases you can configure it on the server, or in case of Memcached it's even the default. 
Or you can have an implementation-specific `Option` that configures the key-value store client to set a timeout on some key-value pair when storing it in the server. But this should be implementation-specific and not be part of the interface methods, which would require *every* implementation to support cache eviction. +- The package should be usable without having to write additional code, so structs should be (un-)marshalled automatically, without having to implement `MarshalJSON()` / `GobEncode()` and `UnmarshalJSON()` / `GobDecode()` first. It's still possible to implement these methods to customize the (un-)marshalling, for example to include unexported fields, or for higher performance (because the `encoding/json` / `encoding/gob` package doesn't have to use reflection). +- It should be easy to create your own store implementations, as well as to review and maintain the code of this repository, so there should be as few interface methods as possible, but still enough so that functions taking the `gokv.Store` interface as parameter can do everything that's usually required when working with a key-value store. For example, a boolean return value for the `Delete` method that indicates whether a value was actually deleted (because it was previously present) can be useful, but isn't a must-have, and also it would require some `Store` implementations to implement the check by themselves (because the existing libraries don't support it), which would unnecessarily decrease performance for those who don't need it. Or as another example, a `Watch(key string) (<-chan Notification, error)` method that sends notifications via a Go channel when the value of a given key changes is nice to have for a few use cases, but in most cases it's not required. + - > Note: In the future we might add another interface, so that there's one for the basic operations and one for advanced uses. 
+- Similar projects name the structs that are implementations of the store interface according to the backing store, for example `boltdb.BoltDB`, but this leads to so-called "stuttering" that's discouraged when writing idiomatic Go. That's why `gokv` uses for example `bbolt.Store` and `syncmap.Store`. For easier differentiation between embedded DBs and DBs that have a client and a server component though, the first ones are called `Store` and the latter ones are called `Client`, for example `redis.Client`. +- All errors are implementation-specific. We could introduce a `gokv.StoreError` type and define some constants like a `SetError` or something more specific like a `TimeoutError`, but non-specific errors don't help the package user, and specific errors would make it very hard to create and especially maintain a `gokv.Store` implementation. You would need to know exactly in which cases the package (that the implementation uses) returns errors, what the errors mean (to "translate" them) and keep up with changes and additions of errors in the package. So instead, errors are just forwarded. For example, if you use the `dynamodb` package, the returned errors will be errors from the `"github.com/aws/aws-sdk-go"` package. +- Keep the terminology of used packages. This might be controversial, because an abstraction / wrapper *unifies* the interface of the used packages. But: + 1. Naming is hard. If one used package for an embedded database uses `Path` and another `Directory`, then how should we name the option for the database directory? Maybe `Folder`, to add to the confusion? Also, some users might already have used the packages we use directly and they would wonder about the "new" variable name which has the same meaning. + Using the packages' variable names spares us the need to come up with unified, understandable variable names without alienating users who already used the packages we use directly. + 2. 
Only a few users are going to switch back and forth between `gokv.Store` implementations, so most users won't even notice the differences in variable names. +- Each `gokv` implementation is a Go module. This differs from repositories that contain a single Go module with many subpackages, but has the huge advantage that if you only want to work with the Redis client for example, the `go get` will only fetch the Redis dependencies and not the huge amount of dependencies that are used across the whole repository. + +Related projects +---------------- + +- [libkv](https://github.com/docker/libkv) + - Uses `[]byte` as value, no automatic (un-)marshalling of structs + - No support for Redis, BadgerDB, Go map, MongoDB, AWS DynamoDB, Memcached, MySQL, ... + - Not actively maintained anymore (3 direct commits + 1 merged PR in the last 10+ months, as of 2018-10-13) +- [valkeyrie](https://github.com/abronan/valkeyrie) + - Fork of libkv + - Same disadvantage: Uses `[]byte` as value, no automatic (un-)marshalling of structs + - No support for BadgerDB, Go map, MongoDB, AWS DynamoDB, Memcached, MySQL, ... +- [gokvstores](https://github.com/ulule/gokvstores) + - Only supports Redis and local in-memory cache + - Not actively maintained anymore (4 direct commits + 1 merged PR in the last 10+ months, as of 2018-10-13) + - 13 stars (as of 2018-10-13) +- [gokv](https://github.com/gokv) + - Requires a `json.Marshaler` / `json.Unmarshaler` as parameter, so you always need to explicitly implement their methods for your structs, and also you can't use gob or other formats for (un-)marshaling. + - No support for Consul, etcd, bbolt / Bolt, BadgerDB, MongoDB, AWS DynamoDB, Memcached, MySQL, ... 
+ - Separate repo for each implementation, which has advantages and disadvantages + - No releases (makes it harder to use with package managers like dep) + - 2-7 stars (depending on the repository, as of 2018-10-13) + +Others: + +- [gladkikhartem/gokv](https://github.com/gladkikhartem/gokv): No `Delete()` method, no Redis, embedded DBs etc., no Git tags / releases, no stars (as of 2018-11-28) +- [bradberger/gokv](https://github.com/bradberger/gokv): Not maintained (no commits in the last 22 months), no Redis, Consul etc., no Git tags / releases, 1 star (as of 2018-11-28) + - This package inspired me to implement something similar to its `Codec`. +- [ppacher/gokv](https://github.com/ppacher/gokv): Not maintained (no commits in the last 22 months), no Redis, embedded DBs etc., no automatic (un-)marshalling, 1 star (as of 2018-11-28) + - Nice CLI! +- [kapitan-k/gokvstore](https://github.com/kapitan-k/gokvstore): Not actively maintained (no commits in the last 10+ months), RocksDB only, requires cgo, no automatic (un-)marshalling, no Git tags/ releases, 1 star (as of 2018-11-28) diff --git a/vendor/github.com/philippgille/gokv/RELEASES.md b/vendor/github.com/philippgille/gokv/RELEASES.md new file mode 100644 index 0000000000..62648360ef --- /dev/null +++ b/vendor/github.com/philippgille/gokv/RELEASES.md @@ -0,0 +1,105 @@ +Releases +======== + +All notable changes to this project will be documented in this file. + +The format is based on [Keep a Changelog](http://keepachangelog.com/en/1.0.0/) and this project adheres to [Semantic Versioning](http://semver.org/spec/v2.0.0.html). 
+ +vNext +----- + +- Added support for [Go modules](https://github.com/golang/go/wiki/Modules) (issue [#81](https://github.com/philippgille/gokv/issues/81)) + - All `gokv.Store` implementations are now separate Go modules +- Added `gokv.Store` implementations: + - Package `hazelcast` - A `gokv.Store` implementation for [Hazelcast](https://github.com/hazelcast/hazelcast) (issue [#75](https://github.com/philippgille/gokv/issues/75)) +- Fixed: Compile error in `badgerdb` after a breaking change in BadgerDB 1.6.0 + +v0.5.0 (2019-01-12) +------------------- + +- Added: Package `encoding` - An abstraction and wrapper for the core functionality of packages like `encoding/json` and `encoding/gob` (issue [#47](https://github.com/philippgille/gokv/issues/47)) +- Added: Package `sql` - It contains shared code for SQL implementations. `mysql` and `postgres` already use it and if you want to create your own SQL implementation you can use it as well. (Useful for issue [#57](https://github.com/philippgille/gokv/issues/57).) 
+- Added `gokv.Store` implementations: + - Package `s3` - A `gokv.Store` implementation for [Amazon S3](https://aws.amazon.com/s3/) (issue [#37](https://github.com/philippgille/gokv/issues/37)) + - Also works for other S3-compatible cloud services like [DigitalOcean Spaces](https://www.digitalocean.com/products/spaces/) and [Scaleway Object Storage](https://www.scaleway.com/object-storage/), as well as for self-hosted solutions like [OpenStack Swift](https://github.com/openstack/swift), [Ceph](https://github.com/ceph/ceph) and [Minio](https://github.com/minio/minio) + - Package `tablestorage` - A `gokv.Store` implementation for [Azure Table Storage](https://azure.microsoft.com/en-us/services/storage/tables/) (issue [#42](https://github.com/philippgille/gokv/issues/42)) + - Package `datastore` - A `gokv.Store` implementation for [Google Cloud Datastore](https://cloud.google.com/datastore/) (issue [#51](https://github.com/philippgille/gokv/issues/51)) + - Package `tablestore` - A `gokv.Store` implementation for [Alibaba Cloud Table Store](https://www.alibabacloud.com/de/product/table-store) (issue [#70](https://github.com/philippgille/gokv/issues/70)) + - Package `leveldb` - A `gokv.Store` implementation for [LevelDB](https://github.com/syndtr/goleveldb) (issue [#48](https://github.com/philippgille/gokv/issues/48)) + - Package `file` - A `gokv.Store` implementation for storing key-value pairs as files (issue [#52](https://github.com/philippgille/gokv/issues/52)) + - Package `zookeeper` - A `gokv.Store` implementation for [Apache ZooKeeper](https://github.com/apache/zookeeper) (issue [#66](https://github.com/philippgille/gokv/issues/66)) + - Package `postgresql` - A `gokv.Store` implementation for [PostgreSQL](https://github.com/postgres/postgres) (issue [#57](https://github.com/philippgille/gokv/issues/57)) + - Package `cockroachdb` - A `gokv.Store` implementation for [CockroachDB](https://github.com/cockroachdb/cockroach) (issue 
[#62](https://github.com/philippgille/gokv/issues/62)) + - Package `ignite` - A `gokv.Store` implementation for [Apache Ignite](https://github.com/apache/ignite) (issue [#64](https://github.com/philippgille/gokv/issues/64)) + - Package `freecache` - A `gokv.Store` implementation for [FreeCache](https://github.com/coocood/freecache) (issue [#44](https://github.com/philippgille/gokv/issues/44)) + - Package `bigcache` - A `gokv.Store` implementation for [BigCache](https://github.com/allegro/bigcache) (issue [#45](https://github.com/philippgille/gokv/issues/45)) + +Breaking changes +---------------- + +- The `MarshalFormat` enums were removed from all packages that contained `gokv.Store` implementations. Instead the shared package `encoding` was introduced (required for issue [#47](https://github.com/philippgille/gokv/issues/47)) + +v0.4.0 (2018-12-02) +------------------- + +- Added: Method `Close() error` (issue [#36](https://github.com/philippgille/gokv/issues/36)) +- Added `gokv.Store` implementations: + - Package `mongodb` - A `gokv.Store` implementation for [MongoDB](https://github.com/mongodb/mongo) (issue [#27](https://github.com/philippgille/gokv/issues/27)) + - Package `dynamodb` - A `gokv.Store` implementation for [Amazon DynamoDB](https://aws.amazon.com/dynamodb/) (issue [#28](https://github.com/philippgille/gokv/issues/28)) + - Package `memcached` - A `gokv.Store` implementation for [Memcached](https://github.com/memcached/memcached) (issue [#31](https://github.com/philippgille/gokv/issues/31)) + - Package `mysql` - A `gokv.Store` implementation for [MySQL](https://github.com/mysql/mysql-server) (issue [#32](https://github.com/philippgille/gokv/issues/32)) +- Added: The factory function `redis.NewClient()` now checks if the connection to the Redis server works and otherwise returns an error. 
+- Added: The `test` package now has the function `func TestConcurrentInteractions(t *testing.T, goroutineCount int, store gokv.Store)` that you can use to test your `gokv.Store` implementation with concurrent interactions. +- Improved: The `etcd.Client` timeout implementation was improved. +- Fixed: The `Get()` method of the `bbolt` store ignored errors if they occurred during the retrieval of the value +- Fixed: Spelling in error message when using the etcd implementation and the etcd server is unreachable + +### Breaking changes + +- The added `Close() error` method (see above) means that previous implementations of `gokv.Store` are not compatible with the interface anymore. +- Renamed `bolt` package to `bbolt` to reflect the fact that the maintained fork is used. Also changed all other occurrences of "bolt" (e.g. in GoDoc comments etc.). +- Due to the above mentioned addition to the Redis client factory function, the function signature changed from `func NewClient(options Options) Client` to `func NewClient(options Options) (Client, error)`. 
+ +v0.3.0 (2018-11-17) +------------------- + +- Added: Method `Delete(string) error` (issue [#8](https://github.com/philippgille/gokv/issues/8)) +- Added: All `gokv.Store` implementations in this package now also support [gob](https://blog.golang.org/gobs-of-data) as marshal format as alternative to JSON (issue [#22](https://github.com/philippgille/gokv/issues/22)) + - Part of this addition are a new field in the existing `Options` structs, called `MarshalFormat`, as well as the related `MarshalFormat` enum (custom type + related `const` values) in each implementation package +- Added `gokv.Store` implementations: + - Package `badgerdb` - A `gokv.Store` implementation for [BadgerDB](https://github.com/dgraph-io/badger) (issue [#16](https://github.com/philippgille/gokv/issues/16)) + - Package `consul` - A `gokv.Store` implementation for [Consul](https://github.com/hashicorp/consul) (issue [#18](https://github.com/philippgille/gokv/issues/18)) + - Package `etcd` - A `gokv.Store` implementation for [etcd](https://github.com/etcd-io/etcd) (issue [#24](https://github.com/philippgille/gokv/issues/24)) + +### Breaking changes + +- The added `Delete(string) error` method (see above) means that previous implementations of `gokv.Store` are not compatible with the interface anymore. +- Changed: The `NewStore()` function in `gomap` and `syncmap` now has an `Option` parameter. Required for issue [#22](https://github.com/philippgille/gokv/issues/22). +- Changed: Passing an empty string as key to `Set()`, `Get()` or `Delete()` now results in an error +- Changed: Passing `nil` as value parameter to `Set()` or as pointer to `Get()` now results in an error. This change leads to a consistent behaviour across the different marshal formats (otherwise for example `encoding/json` marshals `nil` to `null` while `encoding/gob` returns an error). 
+ +v0.2.0 (2018-11-05) +------------------- + +- Added `gokv.Store` implementation: + - Package `gomap` - A `gokv.Store` implementation for a plain Go map with a `sync.RWMutex` for concurrent access (issue [#11](https://github.com/philippgille/gokv/issues/11)) +- Improved: Every `gokv.Store` implementation resides in its own package now, so when downloading the package of an implementation, for example with `go get github.com/philippgille/gokv/redis`, only the actually required dependencies are downloaded and compiled, making the process much faster. This is especially useful for example when creating Docker images, where in many cases (depending on the `Dockerfile`) the download and compilation are repeated for *each build*. (Issue [#2](https://github.com/philippgille/gokv/issues/2)) +- Improved: The performance of `bolt.Store` should be higher, because unnecessary manual locking was removed. (Issue [#1](https://github.com/philippgille/gokv/issues/1)) +- Fixed: The `gokv.Store` implementation for bbolt / Bolt DB used data from within a Bolt transaction outside of it, without copying the value, which can lead to errors (see [here](https://github.com/etcd-io/bbolt/blob/76a4670663d125b6b89d47ea3cc659a282d87c28/doc.go#L38)) (issue [#13](https://github.com/philippgille/gokv/issues/13)) + +### Breaking changes + +- All `gokv.Store` implementations were moved into their own packages and the structs that implement the interface were renamed to avoid unidiomatic "stuttering". + +v0.1.0 (2018-10-14) +------------------- + +Initial release with code from [philippgille/ln-paywall:78fd1dfbf10f549a22f4f30ac7f68c2a2735e989](https://github.com/philippgille/ln-paywall/tree/78fd1dfbf10f549a22f4f30ac7f68c2a2735e989) with only a few changes like a different default path and a bucket name as additional option for the Bolt DB implementation. 
+ +Features: + +- Interface with `Set(string, interface{}) error` and `Get(string, interface{}) (bool, error)` +- Implementations for: + - [bbolt](https://github.com/etcd-io/bbolt) (formerly known as Bolt / Bolt DB) + - Go map (`sync.Map`) + - [Redis](https://github.com/antirez/redis) diff --git a/vendor/github.com/philippgille/gokv/badgerdb/LICENSE b/vendor/github.com/philippgille/gokv/badgerdb/LICENSE new file mode 100644 index 0000000000..a612ad9813 --- /dev/null +++ b/vendor/github.com/philippgille/gokv/badgerdb/LICENSE @@ -0,0 +1,373 @@ +Mozilla Public License Version 2.0 +================================== + +1. Definitions +-------------- + +1.1. "Contributor" + means each individual or legal entity that creates, contributes to + the creation of, or owns Covered Software. + +1.2. "Contributor Version" + means the combination of the Contributions of others (if any) used + by a Contributor and that particular Contributor's Contribution. + +1.3. "Contribution" + means Covered Software of a particular Contributor. + +1.4. "Covered Software" + means Source Code Form to which the initial Contributor has attached + the notice in Exhibit A, the Executable Form of such Source Code + Form, and Modifications of such Source Code Form, in each case + including portions thereof. + +1.5. "Incompatible With Secondary Licenses" + means + + (a) that the initial Contributor has attached the notice described + in Exhibit B to the Covered Software; or + + (b) that the Covered Software was made available under the terms of + version 1.1 or earlier of the License, but not also under the + terms of a Secondary License. + +1.6. "Executable Form" + means any form of the work other than Source Code Form. + +1.7. "Larger Work" + means a work that combines Covered Software with other material, in + a separate file or files, that is not Covered Software. + +1.8. "License" + means this document. + +1.9. 
"Licensable" + means having the right to grant, to the maximum extent possible, + whether at the time of the initial grant or subsequently, any and + all of the rights conveyed by this License. + +1.10. "Modifications" + means any of the following: + + (a) any file in Source Code Form that results from an addition to, + deletion from, or modification of the contents of Covered + Software; or + + (b) any new file in Source Code Form that contains any Covered + Software. + +1.11. "Patent Claims" of a Contributor + means any patent claim(s), including without limitation, method, + process, and apparatus claims, in any patent Licensable by such + Contributor that would be infringed, but for the grant of the + License, by the making, using, selling, offering for sale, having + made, import, or transfer of either its Contributions or its + Contributor Version. + +1.12. "Secondary License" + means either the GNU General Public License, Version 2.0, the GNU + Lesser General Public License, Version 2.1, the GNU Affero General + Public License, Version 3.0, or any later versions of those + licenses. + +1.13. "Source Code Form" + means the form of the work preferred for making modifications. + +1.14. "You" (or "Your") + means an individual or a legal entity exercising rights under this + License. For legal entities, "You" includes any entity that + controls, is controlled by, or is under common control with You. For + purposes of this definition, "control" means (a) the power, direct + or indirect, to cause the direction or management of such entity, + whether by contract or otherwise, or (b) ownership of more than + fifty percent (50%) of the outstanding shares or beneficial + ownership of such entity. + +2. License Grants and Conditions +-------------------------------- + +2.1. 
Grants + +Each Contributor hereby grants You a world-wide, royalty-free, +non-exclusive license: + +(a) under intellectual property rights (other than patent or trademark) + Licensable by such Contributor to use, reproduce, make available, + modify, display, perform, distribute, and otherwise exploit its + Contributions, either on an unmodified basis, with Modifications, or + as part of a Larger Work; and + +(b) under Patent Claims of such Contributor to make, use, sell, offer + for sale, have made, import, and otherwise transfer either its + Contributions or its Contributor Version. + +2.2. Effective Date + +The licenses granted in Section 2.1 with respect to any Contribution +become effective for each Contribution on the date the Contributor first +distributes such Contribution. + +2.3. Limitations on Grant Scope + +The licenses granted in this Section 2 are the only rights granted under +this License. No additional rights or licenses will be implied from the +distribution or licensing of Covered Software under this License. +Notwithstanding Section 2.1(b) above, no patent license is granted by a +Contributor: + +(a) for any code that a Contributor has removed from Covered Software; + or + +(b) for infringements caused by: (i) Your and any other third party's + modifications of Covered Software, or (ii) the combination of its + Contributions with other software (except as part of its Contributor + Version); or + +(c) under Patent Claims infringed by Covered Software in the absence of + its Contributions. + +This License does not grant any rights in the trademarks, service marks, +or logos of any Contributor (except as may be necessary to comply with +the notice requirements in Section 3.4). + +2.4. 
Subsequent Licenses + +No Contributor makes additional grants as a result of Your choice to +distribute the Covered Software under a subsequent version of this +License (see Section 10.2) or under the terms of a Secondary License (if +permitted under the terms of Section 3.3). + +2.5. Representation + +Each Contributor represents that the Contributor believes its +Contributions are its original creation(s) or it has sufficient rights +to grant the rights to its Contributions conveyed by this License. + +2.6. Fair Use + +This License is not intended to limit any rights You have under +applicable copyright doctrines of fair use, fair dealing, or other +equivalents. + +2.7. Conditions + +Sections 3.1, 3.2, 3.3, and 3.4 are conditions of the licenses granted +in Section 2.1. + +3. Responsibilities +------------------- + +3.1. Distribution of Source Form + +All distribution of Covered Software in Source Code Form, including any +Modifications that You create or to which You contribute, must be under +the terms of this License. You must inform recipients that the Source +Code Form of the Covered Software is governed by the terms of this +License, and how they can obtain a copy of this License. You may not +attempt to alter or restrict the recipients' rights in the Source Code +Form. + +3.2. 
Distribution of Executable Form + +If You distribute Covered Software in Executable Form then: + +(a) such Covered Software must also be made available in Source Code + Form, as described in Section 3.1, and You must inform recipients of + the Executable Form how they can obtain a copy of such Source Code + Form by reasonable means in a timely manner, at a charge no more + than the cost of distribution to the recipient; and + +(b) You may distribute such Executable Form under the terms of this + License, or sublicense it under different terms, provided that the + license for the Executable Form does not attempt to limit or alter + the recipients' rights in the Source Code Form under this License. + +3.3. Distribution of a Larger Work + +You may create and distribute a Larger Work under terms of Your choice, +provided that You also comply with the requirements of this License for +the Covered Software. If the Larger Work is a combination of Covered +Software with a work governed by one or more Secondary Licenses, and the +Covered Software is not Incompatible With Secondary Licenses, this +License permits You to additionally distribute such Covered Software +under the terms of such Secondary License(s), so that the recipient of +the Larger Work may, at their option, further distribute the Covered +Software under the terms of either this License or such Secondary +License(s). + +3.4. Notices + +You may not remove or alter the substance of any license notices +(including copyright notices, patent notices, disclaimers of warranty, +or limitations of liability) contained within the Source Code Form of +the Covered Software, except that You may alter any license notices to +the extent required to remedy known factual inaccuracies. + +3.5. Application of Additional Terms + +You may choose to offer, and to charge a fee for, warranty, support, +indemnity or liability obligations to one or more recipients of Covered +Software. 
However, You may do so only on Your own behalf, and not on +behalf of any Contributor. You must make it absolutely clear that any +such warranty, support, indemnity, or liability obligation is offered by +You alone, and You hereby agree to indemnify every Contributor for any +liability incurred by such Contributor as a result of warranty, support, +indemnity or liability terms You offer. You may include additional +disclaimers of warranty and limitations of liability specific to any +jurisdiction. + +4. Inability to Comply Due to Statute or Regulation +--------------------------------------------------- + +If it is impossible for You to comply with any of the terms of this +License with respect to some or all of the Covered Software due to +statute, judicial order, or regulation then You must: (a) comply with +the terms of this License to the maximum extent possible; and (b) +describe the limitations and the code they affect. Such description must +be placed in a text file included with all distributions of the Covered +Software under this License. Except to the extent prohibited by statute +or regulation, such description must be sufficiently detailed for a +recipient of ordinary skill to be able to understand it. + +5. Termination +-------------- + +5.1. The rights granted under this License will terminate automatically +if You fail to comply with any of its terms. However, if You become +compliant, then the rights granted under this License from a particular +Contributor are reinstated (a) provisionally, unless and until such +Contributor explicitly and finally terminates Your grants, and (b) on an +ongoing basis, if such Contributor fails to notify You of the +non-compliance by some reasonable means prior to 60 days after You have +come back into compliance. 
Moreover, Your grants from a particular +Contributor are reinstated on an ongoing basis if such Contributor +notifies You of the non-compliance by some reasonable means, this is the +first time You have received notice of non-compliance with this License +from such Contributor, and You become compliant prior to 30 days after +Your receipt of the notice. + +5.2. If You initiate litigation against any entity by asserting a patent +infringement claim (excluding declaratory judgment actions, +counter-claims, and cross-claims) alleging that a Contributor Version +directly or indirectly infringes any patent, then the rights granted to +You by any and all Contributors for the Covered Software under Section +2.1 of this License shall terminate. + +5.3. In the event of termination under Sections 5.1 or 5.2 above, all +end user license agreements (excluding distributors and resellers) which +have been validly granted by You or Your distributors under this License +prior to termination shall survive termination. + +************************************************************************ +* * +* 6. Disclaimer of Warranty * +* ------------------------- * +* * +* Covered Software is provided under this License on an "as is" * +* basis, without warranty of any kind, either expressed, implied, or * +* statutory, including, without limitation, warranties that the * +* Covered Software is free of defects, merchantable, fit for a * +* particular purpose or non-infringing. The entire risk as to the * +* quality and performance of the Covered Software is with You. * +* Should any Covered Software prove defective in any respect, You * +* (not any Contributor) assume the cost of any necessary servicing, * +* repair, or correction. This disclaimer of warranty constitutes an * +* essential part of this License. No use of any Covered Software is * +* authorized under this License except under this disclaimer. 
* +* * +************************************************************************ + +************************************************************************ +* * +* 7. Limitation of Liability * +* -------------------------- * +* * +* Under no circumstances and under no legal theory, whether tort * +* (including negligence), contract, or otherwise, shall any * +* Contributor, or anyone who distributes Covered Software as * +* permitted above, be liable to You for any direct, indirect, * +* special, incidental, or consequential damages of any character * +* including, without limitation, damages for lost profits, loss of * +* goodwill, work stoppage, computer failure or malfunction, or any * +* and all other commercial damages or losses, even if such party * +* shall have been informed of the possibility of such damages. This * +* limitation of liability shall not apply to liability for death or * +* personal injury resulting from such party's negligence to the * +* extent applicable law prohibits such limitation. Some * +* jurisdictions do not allow the exclusion or limitation of * +* incidental or consequential damages, so this exclusion and * +* limitation may not apply to You. * +* * +************************************************************************ + +8. Litigation +------------- + +Any litigation relating to this License may be brought only in the +courts of a jurisdiction where the defendant maintains its principal +place of business and such litigation shall be governed by laws of that +jurisdiction, without reference to its conflict-of-law provisions. +Nothing in this Section shall prevent a party's ability to bring +cross-claims or counter-claims. + +9. Miscellaneous +---------------- + +This License represents the complete agreement concerning the subject +matter hereof. If any provision of this License is held to be +unenforceable, such provision shall be reformed only to the extent +necessary to make it enforceable. 
Any law or regulation which provides +that the language of a contract shall be construed against the drafter +shall not be used to construe this License against a Contributor. + +10. Versions of the License +--------------------------- + +10.1. New Versions + +Mozilla Foundation is the license steward. Except as provided in Section +10.3, no one other than the license steward has the right to modify or +publish new versions of this License. Each version will be given a +distinguishing version number. + +10.2. Effect of New Versions + +You may distribute the Covered Software under the terms of the version +of the License under which You originally received the Covered Software, +or under the terms of any subsequent version published by the license +steward. + +10.3. Modified Versions + +If you create software not governed by this License, and you want to +create a new license for such software, you may create and use a +modified version of this License if you rename the license and remove +any references to the name of the license steward (except to note that +such modified license differs from this License). + +10.4. Distributing Source Code Form that is Incompatible With Secondary +Licenses + +If You choose to distribute Source Code Form that is Incompatible With +Secondary Licenses under the terms of this version of the License, the +notice described in Exhibit B of this License must be attached. + +Exhibit A - Source Code Form License Notice +------------------------------------------- + + This Source Code Form is subject to the terms of the Mozilla Public + License, v. 2.0. If a copy of the MPL was not distributed with this + file, You can obtain one at http://mozilla.org/MPL/2.0/. + +If it is not possible or desirable to put the notice in a particular +file, then You may include the notice in a location (such as a LICENSE +file in a relevant directory) where a recipient would be likely to look +for such a notice. 
+ +You may add additional accurate notices of copyright ownership. + +Exhibit B - "Incompatible With Secondary Licenses" Notice +--------------------------------------------------------- + + This Source Code Form is "Incompatible With Secondary Licenses", as + defined by the Mozilla Public License, v. 2.0. diff --git a/vendor/github.com/philippgille/gokv/badgerdb/badgerdb.go b/vendor/github.com/philippgille/gokv/badgerdb/badgerdb.go new file mode 100644 index 0000000000..842f0e1823 --- /dev/null +++ b/vendor/github.com/philippgille/gokv/badgerdb/badgerdb.go @@ -0,0 +1,140 @@ +package badgerdb + +import ( + "github.com/dgraph-io/badger" + + "github.com/philippgille/gokv/encoding" + "github.com/philippgille/gokv/util" +) + +// Store is a gokv.Store implementation for BadgerDB. +type Store struct { + db *badger.DB + codec encoding.Codec +} + +// Set stores the given value for the given key. +// Values are automatically marshalled to JSON or gob (depending on the configuration). +// The key must not be "" and the value must not be nil. +func (s Store) Set(k string, v interface{}) error { + if err := util.CheckKeyAndValue(k, v); err != nil { + return err + } + + // First turn the passed object into something that BadgerDB can handle + data, err := s.codec.Marshal(v) + if err != nil { + return err + } + + err = s.db.Update(func(txn *badger.Txn) error { + return txn.Set([]byte(k), data) + }) + if err != nil { + return err + } + return nil +} + +// Get retrieves the stored value for the given key. +// You need to pass a pointer to the value, so in case of a struct +// the automatic unmarshalling can populate the fields of the object +// that v points to with the retrieved object's values. +// If no value is found it returns (false, nil). +// The key must not be "" and the pointer must not be nil.
+func (s Store) Get(k string, v interface{}) (found bool, err error) { + if err := util.CheckKeyAndValue(k, v); err != nil { + return false, err + } + + var data []byte + err = s.db.View(func(txn *badger.Txn) error { + item, err := txn.Get([]byte(k)) + if err != nil { + return err + } + // item.Value() is only valid within the transaction. + // We can either copy it ourselves or use the ValueCopy() method. + // TODO: Benchmark if it's faster to copy + close tx, + // or to keep the tx open until unmarshalling is done. + data, err = item.ValueCopy(nil) + if err != nil { + return err + } + return nil + }) + // If no value was found return false + if err == badger.ErrKeyNotFound { + return false, nil + } else if err != nil { + return false, err + } + + return true, s.codec.Unmarshal(data, v) +} + +// Delete deletes the stored value for the given key. +// Deleting a non-existing key-value pair does NOT lead to an error. +// The key must not be "". +func (s Store) Delete(k string) error { + if err := util.CheckKey(k); err != nil { + return err + } + + return s.db.Update(func(txn *badger.Txn) error { + return txn.Delete([]byte(k)) + }) +} + +// Close closes the store. +// It must be called to make sure that all pending updates make their way to disk. +func (s Store) Close() error { + return s.db.Close() +} + +// Options are the options for the BadgerDB store. +type Options struct { + // Directory for storing the DB files. + // Optional ("BadgerDB" by default). + Dir string + // Encoding format. + // Optional (encoding.JSON by default). + Codec encoding.Codec +} + +// DefaultOptions is an Options object with default values. +// Dir: "BadgerDB", Codec: encoding.JSON +var DefaultOptions = Options{ + Dir: "BadgerDB", + Codec: encoding.JSON, +} + +// NewStore creates a new BadgerDB store. +// Note: BadgerDB uses an exclusive write lock on the database directory so it cannot be shared by multiple processes. 
+// So when creating multiple clients you should always use a new database directory (by setting a different Dir in the options). +// +// You must call the Close() method on the store when you're done working with it. +func NewStore(options Options) (Store, error) { + result := Store{} + + // Set default values + if options.Dir == "" { + options.Dir = DefaultOptions.Dir + } + if options.Codec == nil { + options.Codec = DefaultOptions.Codec + } + + // Open the Badger database located in the options.Dir directory. + // It will be created if it doesn't exist. + opts := badger.DefaultOptions(options.Dir) + db, err := badger.Open(opts) + if err != nil { + return result, err + } + + result.db = db + result.codec = options.Codec + + return result, nil +} diff --git a/vendor/github.com/philippgille/gokv/badgerdb/docs.go b/vendor/github.com/philippgille/gokv/badgerdb/docs.go new file mode 100644 index 0000000000..24caf79e6e --- /dev/null +++ b/vendor/github.com/philippgille/gokv/badgerdb/docs.go @@ -0,0 +1,4 @@ +/* +Package badgerdb contains an implementation of the `gokv.Store` interface for BadgerDB. +*/ +package badgerdb diff --git a/vendor/github.com/philippgille/gokv/docs.go b/vendor/github.com/philippgille/gokv/docs.go new file mode 100644 index 0000000000..a859b2c70b --- /dev/null +++ b/vendor/github.com/philippgille/gokv/docs.go @@ -0,0 +1,69 @@ +/* +Package gokv contains a simple key-value store abstraction in the form of a Go interface. +Implementations of the gokv.Store interface can be found in the sub-packages.
+ +Usage + +Example code for using Redis: + + package main + + import ( + "fmt" + + "github.com/philippgille/gokv" + "github.com/philippgille/gokv/redis" + ) + + type foo struct { + Bar string + } + + func main() { + options := redis.DefaultOptions // Address: "localhost:6379", Password: "", DB: 0 + + // Create client + client, err := redis.NewClient(options) + if err != nil { + panic(err) + } + defer client.Close() + + // Store, retrieve, print and delete a value + interactWithStore(client) + } + + // interactWithStore stores, retrieves, prints and deletes a value. + // It's completely independent of the store implementation. + func interactWithStore(store gokv.Store) { + // Store value + val := foo{ + Bar: "baz", + } + err := store.Set("foo123", val) + if err != nil { + panic(err) + } + + // Retrieve value + retrievedVal := new(foo) + found, err := store.Get("foo123", retrievedVal) + if err != nil { + panic(err) + } + if !found { + panic("Value not found") + } + + fmt.Printf("foo: %+v", *retrievedVal) // Prints `foo: {Bar:baz}` + + // Delete value + err = store.Delete("foo123") + if err != nil { + panic(err) + } + } + +More details can be found on https://github.com/philippgille/gokv. +*/ +package gokv diff --git a/vendor/github.com/philippgille/gokv/encoding/LICENSE b/vendor/github.com/philippgille/gokv/encoding/LICENSE new file mode 100644 index 0000000000..a612ad9813 --- /dev/null +++ b/vendor/github.com/philippgille/gokv/encoding/LICENSE @@ -0,0 +1,373 @@ +Mozilla Public License Version 2.0 +================================== + +1. Definitions +-------------- + +1.1. "Contributor" + means each individual or legal entity that creates, contributes to + the creation of, or owns Covered Software. + +1.2. "Contributor Version" + means the combination of the Contributions of others (if any) used + by a Contributor and that particular Contributor's Contribution. + +1.3. "Contribution" + means Covered Software of a particular Contributor. + +1.4. 
"Covered Software" + means Source Code Form to which the initial Contributor has attached + the notice in Exhibit A, the Executable Form of such Source Code + Form, and Modifications of such Source Code Form, in each case + including portions thereof. + +1.5. "Incompatible With Secondary Licenses" + means + + (a) that the initial Contributor has attached the notice described + in Exhibit B to the Covered Software; or + + (b) that the Covered Software was made available under the terms of + version 1.1 or earlier of the License, but not also under the + terms of a Secondary License. + +1.6. "Executable Form" + means any form of the work other than Source Code Form. + +1.7. "Larger Work" + means a work that combines Covered Software with other material, in + a separate file or files, that is not Covered Software. + +1.8. "License" + means this document. + +1.9. "Licensable" + means having the right to grant, to the maximum extent possible, + whether at the time of the initial grant or subsequently, any and + all of the rights conveyed by this License. + +1.10. "Modifications" + means any of the following: + + (a) any file in Source Code Form that results from an addition to, + deletion from, or modification of the contents of Covered + Software; or + + (b) any new file in Source Code Form that contains any Covered + Software. + +1.11. "Patent Claims" of a Contributor + means any patent claim(s), including without limitation, method, + process, and apparatus claims, in any patent Licensable by such + Contributor that would be infringed, but for the grant of the + License, by the making, using, selling, offering for sale, having + made, import, or transfer of either its Contributions or its + Contributor Version. + +1.12. "Secondary License" + means either the GNU General Public License, Version 2.0, the GNU + Lesser General Public License, Version 2.1, the GNU Affero General + Public License, Version 3.0, or any later versions of those + licenses. + +1.13. 
"Source Code Form" + means the form of the work preferred for making modifications. + +1.14. "You" (or "Your") + means an individual or a legal entity exercising rights under this + License. For legal entities, "You" includes any entity that + controls, is controlled by, or is under common control with You. For + purposes of this definition, "control" means (a) the power, direct + or indirect, to cause the direction or management of such entity, + whether by contract or otherwise, or (b) ownership of more than + fifty percent (50%) of the outstanding shares or beneficial + ownership of such entity. + +2. License Grants and Conditions +-------------------------------- + +2.1. Grants + +Each Contributor hereby grants You a world-wide, royalty-free, +non-exclusive license: + +(a) under intellectual property rights (other than patent or trademark) + Licensable by such Contributor to use, reproduce, make available, + modify, display, perform, distribute, and otherwise exploit its + Contributions, either on an unmodified basis, with Modifications, or + as part of a Larger Work; and + +(b) under Patent Claims of such Contributor to make, use, sell, offer + for sale, have made, import, and otherwise transfer either its + Contributions or its Contributor Version. + +2.2. Effective Date + +The licenses granted in Section 2.1 with respect to any Contribution +become effective for each Contribution on the date the Contributor first +distributes such Contribution. + +2.3. Limitations on Grant Scope + +The licenses granted in this Section 2 are the only rights granted under +this License. No additional rights or licenses will be implied from the +distribution or licensing of Covered Software under this License. 
+Notwithstanding Section 2.1(b) above, no patent license is granted by a +Contributor: + +(a) for any code that a Contributor has removed from Covered Software; + or + +(b) for infringements caused by: (i) Your and any other third party's + modifications of Covered Software, or (ii) the combination of its + Contributions with other software (except as part of its Contributor + Version); or + +(c) under Patent Claims infringed by Covered Software in the absence of + its Contributions. + +This License does not grant any rights in the trademarks, service marks, +or logos of any Contributor (except as may be necessary to comply with +the notice requirements in Section 3.4). + +2.4. Subsequent Licenses + +No Contributor makes additional grants as a result of Your choice to +distribute the Covered Software under a subsequent version of this +License (see Section 10.2) or under the terms of a Secondary License (if +permitted under the terms of Section 3.3). + +2.5. Representation + +Each Contributor represents that the Contributor believes its +Contributions are its original creation(s) or it has sufficient rights +to grant the rights to its Contributions conveyed by this License. + +2.6. Fair Use + +This License is not intended to limit any rights You have under +applicable copyright doctrines of fair use, fair dealing, or other +equivalents. + +2.7. Conditions + +Sections 3.1, 3.2, 3.3, and 3.4 are conditions of the licenses granted +in Section 2.1. + +3. Responsibilities +------------------- + +3.1. Distribution of Source Form + +All distribution of Covered Software in Source Code Form, including any +Modifications that You create or to which You contribute, must be under +the terms of this License. You must inform recipients that the Source +Code Form of the Covered Software is governed by the terms of this +License, and how they can obtain a copy of this License. You may not +attempt to alter or restrict the recipients' rights in the Source Code +Form. + +3.2. 
Distribution of Executable Form + +If You distribute Covered Software in Executable Form then: + +(a) such Covered Software must also be made available in Source Code + Form, as described in Section 3.1, and You must inform recipients of + the Executable Form how they can obtain a copy of such Source Code + Form by reasonable means in a timely manner, at a charge no more + than the cost of distribution to the recipient; and + +(b) You may distribute such Executable Form under the terms of this + License, or sublicense it under different terms, provided that the + license for the Executable Form does not attempt to limit or alter + the recipients' rights in the Source Code Form under this License. + +3.3. Distribution of a Larger Work + +You may create and distribute a Larger Work under terms of Your choice, +provided that You also comply with the requirements of this License for +the Covered Software. If the Larger Work is a combination of Covered +Software with a work governed by one or more Secondary Licenses, and the +Covered Software is not Incompatible With Secondary Licenses, this +License permits You to additionally distribute such Covered Software +under the terms of such Secondary License(s), so that the recipient of +the Larger Work may, at their option, further distribute the Covered +Software under the terms of either this License or such Secondary +License(s). + +3.4. Notices + +You may not remove or alter the substance of any license notices +(including copyright notices, patent notices, disclaimers of warranty, +or limitations of liability) contained within the Source Code Form of +the Covered Software, except that You may alter any license notices to +the extent required to remedy known factual inaccuracies. + +3.5. Application of Additional Terms + +You may choose to offer, and to charge a fee for, warranty, support, +indemnity or liability obligations to one or more recipients of Covered +Software. 
However, You may do so only on Your own behalf, and not on +behalf of any Contributor. You must make it absolutely clear that any +such warranty, support, indemnity, or liability obligation is offered by +You alone, and You hereby agree to indemnify every Contributor for any +liability incurred by such Contributor as a result of warranty, support, +indemnity or liability terms You offer. You may include additional +disclaimers of warranty and limitations of liability specific to any +jurisdiction. + +4. Inability to Comply Due to Statute or Regulation +--------------------------------------------------- + +If it is impossible for You to comply with any of the terms of this +License with respect to some or all of the Covered Software due to +statute, judicial order, or regulation then You must: (a) comply with +the terms of this License to the maximum extent possible; and (b) +describe the limitations and the code they affect. Such description must +be placed in a text file included with all distributions of the Covered +Software under this License. Except to the extent prohibited by statute +or regulation, such description must be sufficiently detailed for a +recipient of ordinary skill to be able to understand it. + +5. Termination +-------------- + +5.1. The rights granted under this License will terminate automatically +if You fail to comply with any of its terms. However, if You become +compliant, then the rights granted under this License from a particular +Contributor are reinstated (a) provisionally, unless and until such +Contributor explicitly and finally terminates Your grants, and (b) on an +ongoing basis, if such Contributor fails to notify You of the +non-compliance by some reasonable means prior to 60 days after You have +come back into compliance. 
Moreover, Your grants from a particular +Contributor are reinstated on an ongoing basis if such Contributor +notifies You of the non-compliance by some reasonable means, this is the +first time You have received notice of non-compliance with this License +from such Contributor, and You become compliant prior to 30 days after +Your receipt of the notice. + +5.2. If You initiate litigation against any entity by asserting a patent +infringement claim (excluding declaratory judgment actions, +counter-claims, and cross-claims) alleging that a Contributor Version +directly or indirectly infringes any patent, then the rights granted to +You by any and all Contributors for the Covered Software under Section +2.1 of this License shall terminate. + +5.3. In the event of termination under Sections 5.1 or 5.2 above, all +end user license agreements (excluding distributors and resellers) which +have been validly granted by You or Your distributors under this License +prior to termination shall survive termination. + +************************************************************************ +* * +* 6. Disclaimer of Warranty * +* ------------------------- * +* * +* Covered Software is provided under this License on an "as is" * +* basis, without warranty of any kind, either expressed, implied, or * +* statutory, including, without limitation, warranties that the * +* Covered Software is free of defects, merchantable, fit for a * +* particular purpose or non-infringing. The entire risk as to the * +* quality and performance of the Covered Software is with You. * +* Should any Covered Software prove defective in any respect, You * +* (not any Contributor) assume the cost of any necessary servicing, * +* repair, or correction. This disclaimer of warranty constitutes an * +* essential part of this License. No use of any Covered Software is * +* authorized under this License except under this disclaimer. 
* +* * +************************************************************************ + +************************************************************************ +* * +* 7. Limitation of Liability * +* -------------------------- * +* * +* Under no circumstances and under no legal theory, whether tort * +* (including negligence), contract, or otherwise, shall any * +* Contributor, or anyone who distributes Covered Software as * +* permitted above, be liable to You for any direct, indirect, * +* special, incidental, or consequential damages of any character * +* including, without limitation, damages for lost profits, loss of * +* goodwill, work stoppage, computer failure or malfunction, or any * +* and all other commercial damages or losses, even if such party * +* shall have been informed of the possibility of such damages. This * +* limitation of liability shall not apply to liability for death or * +* personal injury resulting from such party's negligence to the * +* extent applicable law prohibits such limitation. Some * +* jurisdictions do not allow the exclusion or limitation of * +* incidental or consequential damages, so this exclusion and * +* limitation may not apply to You. * +* * +************************************************************************ + +8. Litigation +------------- + +Any litigation relating to this License may be brought only in the +courts of a jurisdiction where the defendant maintains its principal +place of business and such litigation shall be governed by laws of that +jurisdiction, without reference to its conflict-of-law provisions. +Nothing in this Section shall prevent a party's ability to bring +cross-claims or counter-claims. + +9. Miscellaneous +---------------- + +This License represents the complete agreement concerning the subject +matter hereof. If any provision of this License is held to be +unenforceable, such provision shall be reformed only to the extent +necessary to make it enforceable. 
Any law or regulation which provides +that the language of a contract shall be construed against the drafter +shall not be used to construe this License against a Contributor. + +10. Versions of the License +--------------------------- + +10.1. New Versions + +Mozilla Foundation is the license steward. Except as provided in Section +10.3, no one other than the license steward has the right to modify or +publish new versions of this License. Each version will be given a +distinguishing version number. + +10.2. Effect of New Versions + +You may distribute the Covered Software under the terms of the version +of the License under which You originally received the Covered Software, +or under the terms of any subsequent version published by the license +steward. + +10.3. Modified Versions + +If you create software not governed by this License, and you want to +create a new license for such software, you may create and use a +modified version of this License if you rename the license and remove +any references to the name of the license steward (except to note that +such modified license differs from this License). + +10.4. Distributing Source Code Form that is Incompatible With Secondary +Licenses + +If You choose to distribute Source Code Form that is Incompatible With +Secondary Licenses under the terms of this version of the License, the +notice described in Exhibit B of this License must be attached. + +Exhibit A - Source Code Form License Notice +------------------------------------------- + + This Source Code Form is subject to the terms of the Mozilla Public + License, v. 2.0. If a copy of the MPL was not distributed with this + file, You can obtain one at http://mozilla.org/MPL/2.0/. + +If it is not possible or desirable to put the notice in a particular +file, then You may include the notice in a location (such as a LICENSE +file in a relevant directory) where a recipient would be likely to look +for such a notice. 
+ +You may add additional accurate notices of copyright ownership. + +Exhibit B - "Incompatible With Secondary Licenses" Notice +--------------------------------------------------------- + + This Source Code Form is "Incompatible With Secondary Licenses", as + defined by the Mozilla Public License, v. 2.0. diff --git a/vendor/github.com/philippgille/gokv/encoding/codec.go b/vendor/github.com/philippgille/gokv/encoding/codec.go new file mode 100644 index 0000000000..93a816c677 --- /dev/null +++ b/vendor/github.com/philippgille/gokv/encoding/codec.go @@ -0,0 +1,17 @@ +package encoding + +// Codec encodes/decodes Go values to/from slices of bytes. +type Codec interface { + // Marshal encodes a Go value to a slice of bytes. + Marshal(v interface{}) ([]byte, error) + // Unmarshal decodes a slice of bytes into a Go value. + Unmarshal(data []byte, v interface{}) error +} + +// Convenience variables +var ( + // JSON is a JSONcodec that encodes/decodes Go values to/from JSON. + JSON = JSONcodec{} + // Gob is a GobCodec that encodes/decodes Go values to/from gob. + Gob = GobCodec{} +) diff --git a/vendor/github.com/philippgille/gokv/encoding/docs.go b/vendor/github.com/philippgille/gokv/encoding/docs.go new file mode 100644 index 0000000000..8819c109c0 --- /dev/null +++ b/vendor/github.com/philippgille/gokv/encoding/docs.go @@ -0,0 +1,7 @@ +/* +Package encoding is a wrapper for the core functionality of packages like "encoding/json" and "encoding/gob". + +It contains the Codec interface and multiple implementations for encoding Go values to other formats and decode from other formats to Go values. +Formats can be JSON, gob etc. 
+*/ +package encoding diff --git a/vendor/github.com/philippgille/gokv/encoding/gob.go b/vendor/github.com/philippgille/gokv/encoding/gob.go new file mode 100644 index 0000000000..1fa7afa06e --- /dev/null +++ b/vendor/github.com/philippgille/gokv/encoding/gob.go @@ -0,0 +1,28 @@ +package encoding + +import ( + "bytes" + "encoding/gob" +) + +// GobCodec encodes/decodes Go values to/from gob. +// You can use encoding.Gob instead of creating an instance of this struct. +type GobCodec struct{} + +// Marshal encodes a Go value to gob. +func (c GobCodec) Marshal(v interface{}) ([]byte, error) { + buffer := new(bytes.Buffer) + encoder := gob.NewEncoder(buffer) + err := encoder.Encode(v) + if err != nil { + return nil, err + } + return buffer.Bytes(), nil +} + +// Unmarshal decodes a gob value into a Go value. +func (c GobCodec) Unmarshal(data []byte, v interface{}) error { + reader := bytes.NewReader(data) + decoder := gob.NewDecoder(reader) + return decoder.Decode(v) +} diff --git a/vendor/github.com/philippgille/gokv/encoding/json.go b/vendor/github.com/philippgille/gokv/encoding/json.go new file mode 100644 index 0000000000..a25baa67f4 --- /dev/null +++ b/vendor/github.com/philippgille/gokv/encoding/json.go @@ -0,0 +1,19 @@ +package encoding + +import ( + "encoding/json" +) + +// JSONcodec encodes/decodes Go values to/from JSON. +// You can use encoding.JSON instead of creating an instance of this struct. +type JSONcodec struct{} + +// Marshal encodes a Go value to JSON. +func (c JSONcodec) Marshal(v interface{}) ([]byte, error) { + return json.Marshal(v) +} + +// Unmarshal decodes a JSON value into a Go value. 
+func (c JSONcodec) Unmarshal(data []byte, v interface{}) error { + return json.Unmarshal(data, v) +} diff --git a/vendor/github.com/philippgille/gokv/store.go b/vendor/github.com/philippgille/gokv/store.go new file mode 100644 index 0000000000..22ac087c31 --- /dev/null +++ b/vendor/github.com/philippgille/gokv/store.go @@ -0,0 +1,40 @@ +package gokv + +// Store is an abstraction for different key-value store implementations. +// A store must be able to store, retrieve and delete key-value pairs, +// with the key being a string and the value being any Go interface{}. +type Store interface { + // Set stores the given value for the given key. + // The implementation automatically marshalls the value. + // The marshalling format depends on the implementation. It can be JSON, gob etc. + // The key must not be "" and the value must not be nil. + Set(k string, v interface{}) error + // Get retrieves the value for the given key. + // The implementation automatically unmarshalls the value. + // The unmarshalling source depends on the implementation. It can be JSON, gob etc. + // The automatic unmarshalling requires a pointer to an object of the correct type + // being passed as parameter. + // In case of a struct the Get method will populate the fields of the object + // that the passed pointer points to with the values of the retrieved object's values. + // If no value is found it returns (false, nil). + // The key must not be "" and the pointer must not be nil. + Get(k string, v interface{}) (found bool, err error) + // Delete deletes the stored value for the given key. + // Deleting a non-existing key-value pair does NOT lead to an error. + // The key must not be "". + Delete(k string) error + // Close must be called when the work with the key-value store is done. + // Most (if not all) implementations are meant to be used long-lived, + // so only call Close() at the very end. 
+ // Depending on the store implementation it might do one or more of the following: + // Make sure all pending updates make their way to disk, + // finish open transactions, + // close the file handle to an embedded DB, + // close the connection to the DB server, + // release any open resources, + // etc. + // Some implementation might not need the store to be closed, + // but as long as you work with the gokv.Store interface you never know which implementation + // is passed to your method, so you should always call it. + Close() error +} diff --git a/vendor/github.com/philippgille/gokv/util/LICENSE b/vendor/github.com/philippgille/gokv/util/LICENSE new file mode 100644 index 0000000000..a612ad9813 --- /dev/null +++ b/vendor/github.com/philippgille/gokv/util/LICENSE @@ -0,0 +1,373 @@ +Mozilla Public License Version 2.0 +================================== + +1. Definitions +-------------- + +1.1. "Contributor" + means each individual or legal entity that creates, contributes to + the creation of, or owns Covered Software. + +1.2. "Contributor Version" + means the combination of the Contributions of others (if any) used + by a Contributor and that particular Contributor's Contribution. + +1.3. "Contribution" + means Covered Software of a particular Contributor. + +1.4. "Covered Software" + means Source Code Form to which the initial Contributor has attached + the notice in Exhibit A, the Executable Form of such Source Code + Form, and Modifications of such Source Code Form, in each case + including portions thereof. + +1.5. "Incompatible With Secondary Licenses" + means + + (a) that the initial Contributor has attached the notice described + in Exhibit B to the Covered Software; or + + (b) that the Covered Software was made available under the terms of + version 1.1 or earlier of the License, but not also under the + terms of a Secondary License. + +1.6. "Executable Form" + means any form of the work other than Source Code Form. + +1.7. 
"Larger Work" + means a work that combines Covered Software with other material, in + a separate file or files, that is not Covered Software. + +1.8. "License" + means this document. + +1.9. "Licensable" + means having the right to grant, to the maximum extent possible, + whether at the time of the initial grant or subsequently, any and + all of the rights conveyed by this License. + +1.10. "Modifications" + means any of the following: + + (a) any file in Source Code Form that results from an addition to, + deletion from, or modification of the contents of Covered + Software; or + + (b) any new file in Source Code Form that contains any Covered + Software. + +1.11. "Patent Claims" of a Contributor + means any patent claim(s), including without limitation, method, + process, and apparatus claims, in any patent Licensable by such + Contributor that would be infringed, but for the grant of the + License, by the making, using, selling, offering for sale, having + made, import, or transfer of either its Contributions or its + Contributor Version. + +1.12. "Secondary License" + means either the GNU General Public License, Version 2.0, the GNU + Lesser General Public License, Version 2.1, the GNU Affero General + Public License, Version 3.0, or any later versions of those + licenses. + +1.13. "Source Code Form" + means the form of the work preferred for making modifications. + +1.14. "You" (or "Your") + means an individual or a legal entity exercising rights under this + License. For legal entities, "You" includes any entity that + controls, is controlled by, or is under common control with You. For + purposes of this definition, "control" means (a) the power, direct + or indirect, to cause the direction or management of such entity, + whether by contract or otherwise, or (b) ownership of more than + fifty percent (50%) of the outstanding shares or beneficial + ownership of such entity. + +2. License Grants and Conditions +-------------------------------- + +2.1. 
Grants + +Each Contributor hereby grants You a world-wide, royalty-free, +non-exclusive license: + +(a) under intellectual property rights (other than patent or trademark) + Licensable by such Contributor to use, reproduce, make available, + modify, display, perform, distribute, and otherwise exploit its + Contributions, either on an unmodified basis, with Modifications, or + as part of a Larger Work; and + +(b) under Patent Claims of such Contributor to make, use, sell, offer + for sale, have made, import, and otherwise transfer either its + Contributions or its Contributor Version. + +2.2. Effective Date + +The licenses granted in Section 2.1 with respect to any Contribution +become effective for each Contribution on the date the Contributor first +distributes such Contribution. + +2.3. Limitations on Grant Scope + +The licenses granted in this Section 2 are the only rights granted under +this License. No additional rights or licenses will be implied from the +distribution or licensing of Covered Software under this License. +Notwithstanding Section 2.1(b) above, no patent license is granted by a +Contributor: + +(a) for any code that a Contributor has removed from Covered Software; + or + +(b) for infringements caused by: (i) Your and any other third party's + modifications of Covered Software, or (ii) the combination of its + Contributions with other software (except as part of its Contributor + Version); or + +(c) under Patent Claims infringed by Covered Software in the absence of + its Contributions. + +This License does not grant any rights in the trademarks, service marks, +or logos of any Contributor (except as may be necessary to comply with +the notice requirements in Section 3.4). + +2.4. 
Subsequent Licenses + +No Contributor makes additional grants as a result of Your choice to +distribute the Covered Software under a subsequent version of this +License (see Section 10.2) or under the terms of a Secondary License (if +permitted under the terms of Section 3.3). + +2.5. Representation + +Each Contributor represents that the Contributor believes its +Contributions are its original creation(s) or it has sufficient rights +to grant the rights to its Contributions conveyed by this License. + +2.6. Fair Use + +This License is not intended to limit any rights You have under +applicable copyright doctrines of fair use, fair dealing, or other +equivalents. + +2.7. Conditions + +Sections 3.1, 3.2, 3.3, and 3.4 are conditions of the licenses granted +in Section 2.1. + +3. Responsibilities +------------------- + +3.1. Distribution of Source Form + +All distribution of Covered Software in Source Code Form, including any +Modifications that You create or to which You contribute, must be under +the terms of this License. You must inform recipients that the Source +Code Form of the Covered Software is governed by the terms of this +License, and how they can obtain a copy of this License. You may not +attempt to alter or restrict the recipients' rights in the Source Code +Form. + +3.2. 
Distribution of Executable Form + +If You distribute Covered Software in Executable Form then: + +(a) such Covered Software must also be made available in Source Code + Form, as described in Section 3.1, and You must inform recipients of + the Executable Form how they can obtain a copy of such Source Code + Form by reasonable means in a timely manner, at a charge no more + than the cost of distribution to the recipient; and + +(b) You may distribute such Executable Form under the terms of this + License, or sublicense it under different terms, provided that the + license for the Executable Form does not attempt to limit or alter + the recipients' rights in the Source Code Form under this License. + +3.3. Distribution of a Larger Work + +You may create and distribute a Larger Work under terms of Your choice, +provided that You also comply with the requirements of this License for +the Covered Software. If the Larger Work is a combination of Covered +Software with a work governed by one or more Secondary Licenses, and the +Covered Software is not Incompatible With Secondary Licenses, this +License permits You to additionally distribute such Covered Software +under the terms of such Secondary License(s), so that the recipient of +the Larger Work may, at their option, further distribute the Covered +Software under the terms of either this License or such Secondary +License(s). + +3.4. Notices + +You may not remove or alter the substance of any license notices +(including copyright notices, patent notices, disclaimers of warranty, +or limitations of liability) contained within the Source Code Form of +the Covered Software, except that You may alter any license notices to +the extent required to remedy known factual inaccuracies. + +3.5. Application of Additional Terms + +You may choose to offer, and to charge a fee for, warranty, support, +indemnity or liability obligations to one or more recipients of Covered +Software. 
However, You may do so only on Your own behalf, and not on +behalf of any Contributor. You must make it absolutely clear that any +such warranty, support, indemnity, or liability obligation is offered by +You alone, and You hereby agree to indemnify every Contributor for any +liability incurred by such Contributor as a result of warranty, support, +indemnity or liability terms You offer. You may include additional +disclaimers of warranty and limitations of liability specific to any +jurisdiction. + +4. Inability to Comply Due to Statute or Regulation +--------------------------------------------------- + +If it is impossible for You to comply with any of the terms of this +License with respect to some or all of the Covered Software due to +statute, judicial order, or regulation then You must: (a) comply with +the terms of this License to the maximum extent possible; and (b) +describe the limitations and the code they affect. Such description must +be placed in a text file included with all distributions of the Covered +Software under this License. Except to the extent prohibited by statute +or regulation, such description must be sufficiently detailed for a +recipient of ordinary skill to be able to understand it. + +5. Termination +-------------- + +5.1. The rights granted under this License will terminate automatically +if You fail to comply with any of its terms. However, if You become +compliant, then the rights granted under this License from a particular +Contributor are reinstated (a) provisionally, unless and until such +Contributor explicitly and finally terminates Your grants, and (b) on an +ongoing basis, if such Contributor fails to notify You of the +non-compliance by some reasonable means prior to 60 days after You have +come back into compliance. 
Moreover, Your grants from a particular +Contributor are reinstated on an ongoing basis if such Contributor +notifies You of the non-compliance by some reasonable means, this is the +first time You have received notice of non-compliance with this License +from such Contributor, and You become compliant prior to 30 days after +Your receipt of the notice. + +5.2. If You initiate litigation against any entity by asserting a patent +infringement claim (excluding declaratory judgment actions, +counter-claims, and cross-claims) alleging that a Contributor Version +directly or indirectly infringes any patent, then the rights granted to +You by any and all Contributors for the Covered Software under Section +2.1 of this License shall terminate. + +5.3. In the event of termination under Sections 5.1 or 5.2 above, all +end user license agreements (excluding distributors and resellers) which +have been validly granted by You or Your distributors under this License +prior to termination shall survive termination. + +************************************************************************ +* * +* 6. Disclaimer of Warranty * +* ------------------------- * +* * +* Covered Software is provided under this License on an "as is" * +* basis, without warranty of any kind, either expressed, implied, or * +* statutory, including, without limitation, warranties that the * +* Covered Software is free of defects, merchantable, fit for a * +* particular purpose or non-infringing. The entire risk as to the * +* quality and performance of the Covered Software is with You. * +* Should any Covered Software prove defective in any respect, You * +* (not any Contributor) assume the cost of any necessary servicing, * +* repair, or correction. This disclaimer of warranty constitutes an * +* essential part of this License. No use of any Covered Software is * +* authorized under this License except under this disclaimer. 
* +* * +************************************************************************ + +************************************************************************ +* * +* 7. Limitation of Liability * +* -------------------------- * +* * +* Under no circumstances and under no legal theory, whether tort * +* (including negligence), contract, or otherwise, shall any * +* Contributor, or anyone who distributes Covered Software as * +* permitted above, be liable to You for any direct, indirect, * +* special, incidental, or consequential damages of any character * +* including, without limitation, damages for lost profits, loss of * +* goodwill, work stoppage, computer failure or malfunction, or any * +* and all other commercial damages or losses, even if such party * +* shall have been informed of the possibility of such damages. This * +* limitation of liability shall not apply to liability for death or * +* personal injury resulting from such party's negligence to the * +* extent applicable law prohibits such limitation. Some * +* jurisdictions do not allow the exclusion or limitation of * +* incidental or consequential damages, so this exclusion and * +* limitation may not apply to You. * +* * +************************************************************************ + +8. Litigation +------------- + +Any litigation relating to this License may be brought only in the +courts of a jurisdiction where the defendant maintains its principal +place of business and such litigation shall be governed by laws of that +jurisdiction, without reference to its conflict-of-law provisions. +Nothing in this Section shall prevent a party's ability to bring +cross-claims or counter-claims. + +9. Miscellaneous +---------------- + +This License represents the complete agreement concerning the subject +matter hereof. If any provision of this License is held to be +unenforceable, such provision shall be reformed only to the extent +necessary to make it enforceable. 
Any law or regulation which provides +that the language of a contract shall be construed against the drafter +shall not be used to construe this License against a Contributor. + +10. Versions of the License +--------------------------- + +10.1. New Versions + +Mozilla Foundation is the license steward. Except as provided in Section +10.3, no one other than the license steward has the right to modify or +publish new versions of this License. Each version will be given a +distinguishing version number. + +10.2. Effect of New Versions + +You may distribute the Covered Software under the terms of the version +of the License under which You originally received the Covered Software, +or under the terms of any subsequent version published by the license +steward. + +10.3. Modified Versions + +If you create software not governed by this License, and you want to +create a new license for such software, you may create and use a +modified version of this License if you rename the license and remove +any references to the name of the license steward (except to note that +such modified license differs from this License). + +10.4. Distributing Source Code Form that is Incompatible With Secondary +Licenses + +If You choose to distribute Source Code Form that is Incompatible With +Secondary Licenses under the terms of this version of the License, the +notice described in Exhibit B of this License must be attached. + +Exhibit A - Source Code Form License Notice +------------------------------------------- + + This Source Code Form is subject to the terms of the Mozilla Public + License, v. 2.0. If a copy of the MPL was not distributed with this + file, You can obtain one at http://mozilla.org/MPL/2.0/. + +If it is not possible or desirable to put the notice in a particular +file, then You may include the notice in a location (such as a LICENSE +file in a relevant directory) where a recipient would be likely to look +for such a notice. 
+ +You may add additional accurate notices of copyright ownership. + +Exhibit B - "Incompatible With Secondary Licenses" Notice +--------------------------------------------------------- + + This Source Code Form is "Incompatible With Secondary Licenses", as + defined by the Mozilla Public License, v. 2.0. diff --git a/vendor/github.com/philippgille/gokv/util/docs.go b/vendor/github.com/philippgille/gokv/util/docs.go new file mode 100644 index 0000000000..b28b9fc0c7 --- /dev/null +++ b/vendor/github.com/philippgille/gokv/util/docs.go @@ -0,0 +1,4 @@ +/* +Package util contains utility functions that are used across all `gokv.Store` implementations. +*/ +package util diff --git a/vendor/github.com/philippgille/gokv/util/util.go b/vendor/github.com/philippgille/gokv/util/util.go new file mode 100644 index 0000000000..d4f4856ff3 --- /dev/null +++ b/vendor/github.com/philippgille/gokv/util/util.go @@ -0,0 +1,29 @@ +package util + +import ( + "errors" +) + +// CheckKeyAndValue returns an error if k == "" or if v == nil +func CheckKeyAndValue(k string, v interface{}) error { + if err := CheckKey(k); err != nil { + return err + } + return CheckVal(v) +} + +// CheckKey returns an error if k == "" +func CheckKey(k string) error { + if k == "" { + return errors.New("The passed key is an empty string, which is invalid") + } + return nil +} + +// CheckVal returns an error if v == nil +func CheckVal(v interface{}) error { + if v == nil { + return errors.New("The passed value is nil, which is not allowed") + } + return nil +} diff --git a/vendor/golang.org/x/net/internal/timeseries/timeseries.go b/vendor/golang.org/x/net/internal/timeseries/timeseries.go new file mode 100644 index 0000000000..dc5225b6d4 --- /dev/null +++ b/vendor/golang.org/x/net/internal/timeseries/timeseries.go @@ -0,0 +1,525 @@ +// Copyright 2015 The Go Authors. All rights reserved. +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. 
+ +// Package timeseries implements a time series structure for stats collection. +package timeseries // import "golang.org/x/net/internal/timeseries" + +import ( + "fmt" + "log" + "time" +) + +const ( + timeSeriesNumBuckets = 64 + minuteHourSeriesNumBuckets = 60 +) + +var timeSeriesResolutions = []time.Duration{ + 1 * time.Second, + 10 * time.Second, + 1 * time.Minute, + 10 * time.Minute, + 1 * time.Hour, + 6 * time.Hour, + 24 * time.Hour, // 1 day + 7 * 24 * time.Hour, // 1 week + 4 * 7 * 24 * time.Hour, // 4 weeks + 16 * 7 * 24 * time.Hour, // 16 weeks +} + +var minuteHourSeriesResolutions = []time.Duration{ + 1 * time.Second, + 1 * time.Minute, +} + +// An Observable is a kind of data that can be aggregated in a time series. +type Observable interface { + Multiply(ratio float64) // Multiplies the data in self by a given ratio + Add(other Observable) // Adds the data from a different observation to self + Clear() // Clears the observation so it can be reused. + CopyFrom(other Observable) // Copies the contents of a given observation to self +} + +// Float attaches the methods of Observable to a float64. +type Float float64 + +// NewFloat returns a Float. +func NewFloat() Observable { + f := Float(0) + return &f +} + +// String returns the float as a string. +func (f *Float) String() string { return fmt.Sprintf("%g", f.Value()) } + +// Value returns the float's value. +func (f *Float) Value() float64 { return float64(*f) } + +func (f *Float) Multiply(ratio float64) { *f *= Float(ratio) } + +func (f *Float) Add(other Observable) { + o := other.(*Float) + *f += *o +} + +func (f *Float) Clear() { *f = 0 } + +func (f *Float) CopyFrom(other Observable) { + o := other.(*Float) + *f = *o +} + +// A Clock tells the current time. +type Clock interface { + Time() time.Time +} + +type defaultClock int + +var defaultClockInstance defaultClock + +func (defaultClock) Time() time.Time { return time.Now() } + +// Information kept per level. 
Each level consists of a circular list of +// observations. The start of the level may be derived from end and the +// len(buckets) * sizeInMillis. +type tsLevel struct { + oldest int // index to oldest bucketed Observable + newest int // index to newest bucketed Observable + end time.Time // end timestamp for this level + size time.Duration // duration of the bucketed Observable + buckets []Observable // collections of observations + provider func() Observable // used for creating new Observable +} + +func (l *tsLevel) Clear() { + l.oldest = 0 + l.newest = len(l.buckets) - 1 + l.end = time.Time{} + for i := range l.buckets { + if l.buckets[i] != nil { + l.buckets[i].Clear() + l.buckets[i] = nil + } + } +} + +func (l *tsLevel) InitLevel(size time.Duration, numBuckets int, f func() Observable) { + l.size = size + l.provider = f + l.buckets = make([]Observable, numBuckets) +} + +// Keeps a sequence of levels. Each level is responsible for storing data at +// a given resolution. For example, the first level stores data at a one +// minute resolution while the second level stores data at a one hour +// resolution. + +// Each level is represented by a sequence of buckets. Each bucket spans an +// interval equal to the resolution of the level. New observations are added +// to the last bucket. +type timeSeries struct { + provider func() Observable // make more Observable + numBuckets int // number of buckets in each level + levels []*tsLevel // levels of bucketed Observable + lastAdd time.Time // time of last Observable tracked + total Observable // convenient aggregation of all Observable + clock Clock // Clock for getting current time + pending Observable // observations not yet bucketed + pendingTime time.Time // what time are we keeping in pending + dirty bool // if there are pending observations +} + +// init initializes a level according to the supplied criteria. 
+func (ts *timeSeries) init(resolutions []time.Duration, f func() Observable, numBuckets int, clock Clock) { + ts.provider = f + ts.numBuckets = numBuckets + ts.clock = clock + ts.levels = make([]*tsLevel, len(resolutions)) + + for i := range resolutions { + if i > 0 && resolutions[i-1] >= resolutions[i] { + log.Print("timeseries: resolutions must be monotonically increasing") + break + } + newLevel := new(tsLevel) + newLevel.InitLevel(resolutions[i], ts.numBuckets, ts.provider) + ts.levels[i] = newLevel + } + + ts.Clear() +} + +// Clear removes all observations from the time series. +func (ts *timeSeries) Clear() { + ts.lastAdd = time.Time{} + ts.total = ts.resetObservation(ts.total) + ts.pending = ts.resetObservation(ts.pending) + ts.pendingTime = time.Time{} + ts.dirty = false + + for i := range ts.levels { + ts.levels[i].Clear() + } +} + +// Add records an observation at the current time. +func (ts *timeSeries) Add(observation Observable) { + ts.AddWithTime(observation, ts.clock.Time()) +} + +// AddWithTime records an observation at the specified time. +func (ts *timeSeries) AddWithTime(observation Observable, t time.Time) { + + smallBucketDuration := ts.levels[0].size + + if t.After(ts.lastAdd) { + ts.lastAdd = t + } + + if t.After(ts.pendingTime) { + ts.advance(t) + ts.mergePendingUpdates() + ts.pendingTime = ts.levels[0].end + ts.pending.CopyFrom(observation) + ts.dirty = true + } else if t.After(ts.pendingTime.Add(-1 * smallBucketDuration)) { + // The observation is close enough to go into the pending bucket. + // This compensates for clock skewing and small scheduling delays + // by letting the update stay in the fast path. + ts.pending.Add(observation) + ts.dirty = true + } else { + ts.mergeValue(observation, t) + } +} + +// mergeValue inserts the observation at the specified time in the past into all levels. 
+func (ts *timeSeries) mergeValue(observation Observable, t time.Time) { + for _, level := range ts.levels { + index := (ts.numBuckets - 1) - int(level.end.Sub(t)/level.size) + if 0 <= index && index < ts.numBuckets { + bucketNumber := (level.oldest + index) % ts.numBuckets + if level.buckets[bucketNumber] == nil { + level.buckets[bucketNumber] = level.provider() + } + level.buckets[bucketNumber].Add(observation) + } + } + ts.total.Add(observation) +} + +// mergePendingUpdates applies the pending updates into all levels. +func (ts *timeSeries) mergePendingUpdates() { + if ts.dirty { + ts.mergeValue(ts.pending, ts.pendingTime) + ts.pending = ts.resetObservation(ts.pending) + ts.dirty = false + } +} + +// advance cycles the buckets at each level until the latest bucket in +// each level can hold the time specified. +func (ts *timeSeries) advance(t time.Time) { + if !t.After(ts.levels[0].end) { + return + } + for i := 0; i < len(ts.levels); i++ { + level := ts.levels[i] + if !level.end.Before(t) { + break + } + + // If the time is sufficiently far, just clear the level and advance + // directly. + if !t.Before(level.end.Add(level.size * time.Duration(ts.numBuckets))) { + for _, b := range level.buckets { + ts.resetObservation(b) + } + level.end = time.Unix(0, (t.UnixNano()/level.size.Nanoseconds())*level.size.Nanoseconds()) + } + + for t.After(level.end) { + level.end = level.end.Add(level.size) + level.newest = level.oldest + level.oldest = (level.oldest + 1) % ts.numBuckets + ts.resetObservation(level.buckets[level.newest]) + } + + t = level.end + } +} + +// Latest returns the sum of the num latest buckets from the level. 
+func (ts *timeSeries) Latest(level, num int) Observable { + now := ts.clock.Time() + if ts.levels[0].end.Before(now) { + ts.advance(now) + } + + ts.mergePendingUpdates() + + result := ts.provider() + l := ts.levels[level] + index := l.newest + + for i := 0; i < num; i++ { + if l.buckets[index] != nil { + result.Add(l.buckets[index]) + } + if index == 0 { + index = ts.numBuckets + } + index-- + } + + return result +} + +// LatestBuckets returns a copy of the num latest buckets from level. +func (ts *timeSeries) LatestBuckets(level, num int) []Observable { + if level < 0 || level > len(ts.levels) { + log.Print("timeseries: bad level argument: ", level) + return nil + } + if num < 0 || num >= ts.numBuckets { + log.Print("timeseries: bad num argument: ", num) + return nil + } + + results := make([]Observable, num) + now := ts.clock.Time() + if ts.levels[0].end.Before(now) { + ts.advance(now) + } + + ts.mergePendingUpdates() + + l := ts.levels[level] + index := l.newest + + for i := 0; i < num; i++ { + result := ts.provider() + results[i] = result + if l.buckets[index] != nil { + result.CopyFrom(l.buckets[index]) + } + + if index == 0 { + index = ts.numBuckets + } + index -= 1 + } + return results +} + +// ScaleBy updates observations by scaling by factor. +func (ts *timeSeries) ScaleBy(factor float64) { + for _, l := range ts.levels { + for i := 0; i < ts.numBuckets; i++ { + l.buckets[i].Multiply(factor) + } + } + + ts.total.Multiply(factor) + ts.pending.Multiply(factor) +} + +// Range returns the sum of observations added over the specified time range. +// If start or finish times don't fall on bucket boundaries of the same +// level, then return values are approximate answers. +func (ts *timeSeries) Range(start, finish time.Time) Observable { + return ts.ComputeRange(start, finish, 1)[0] +} + +// Recent returns the sum of observations from the last delta. 
+func (ts *timeSeries) Recent(delta time.Duration) Observable { + now := ts.clock.Time() + return ts.Range(now.Add(-delta), now) +} + +// Total returns the total of all observations. +func (ts *timeSeries) Total() Observable { + ts.mergePendingUpdates() + return ts.total +} + +// ComputeRange computes a specified number of values into a slice using +// the observations recorded over the specified time period. The return +// values are approximate if the start or finish times don't fall on the +// bucket boundaries at the same level or if the number of buckets spanning +// the range is not an integral multiple of num. +func (ts *timeSeries) ComputeRange(start, finish time.Time, num int) []Observable { + if start.After(finish) { + log.Printf("timeseries: start > finish, %v>%v", start, finish) + return nil + } + + if num < 0 { + log.Printf("timeseries: num < 0, %v", num) + return nil + } + + results := make([]Observable, num) + + for _, l := range ts.levels { + if !start.Before(l.end.Add(-l.size * time.Duration(ts.numBuckets))) { + ts.extract(l, start, finish, num, results) + return results + } + } + + // Failed to find a level that covers the desired range. So just + // extract from the last level, even if it doesn't cover the entire + // desired range. + ts.extract(ts.levels[len(ts.levels)-1], start, finish, num, results) + + return results +} + +// RecentList returns the specified number of values in slice over the most +// recent time period of the specified range. +func (ts *timeSeries) RecentList(delta time.Duration, num int) []Observable { + if delta < 0 { + return nil + } + now := ts.clock.Time() + return ts.ComputeRange(now.Add(-delta), now, num) +} + +// extract returns a slice of specified number of observations from a given +// level over a given range. 
func (ts *timeSeries) extract(l *tsLevel, start, finish time.Time, num int, results []Observable) {
	// Flush pending observations so the level buckets are up to date.
	ts.mergePendingUpdates()

	// "src" refers to the level's buckets, "dst" to the num equal-width
	// output slots that [start, finish) is divided into.
	srcInterval := l.size
	dstInterval := finish.Sub(start) / time.Duration(num)
	dstStart := start
	// Start of the oldest bucket in this level.
	srcStart := l.end.Add(-srcInterval * time.Duration(ts.numBuckets))

	srcIndex := 0

	// Where should scanning start?
	// Skip whole source buckets that end before the requested range begins.
	if dstStart.After(srcStart) {
		advance := int(dstStart.Sub(srcStart) / srcInterval)
		srcIndex += advance
		srcStart = srcStart.Add(time.Duration(advance) * srcInterval)
	}

	// The i'th value is computed as shown below.
	// interval = (finish - start) / num
	// i'th value = sum of observation in range
	//   [ start + i       * interval,
	//     start + (i + 1) * interval )
	for i := 0; i < num; i++ {
		// Reuse the caller's slot if it already holds an Observable.
		results[i] = ts.resetObservation(results[i])
		dstEnd := dstStart.Add(dstInterval)
		for srcIndex < ts.numBuckets && srcStart.Before(dstEnd) {
			srcEnd := srcStart.Add(srcInterval)
			// Clip the source bucket at the time of the last observation.
			if srcEnd.After(ts.lastAdd) {
				srcEnd = ts.lastAdd
			}

			if !srcEnd.Before(dstStart) {
				// srcIndex counts from the oldest bucket; map it onto the ring.
				srcValue := l.buckets[(srcIndex+l.oldest)%ts.numBuckets]
				if !srcStart.Before(dstStart) && !srcEnd.After(dstEnd) {
					// dst completely contains src.
					if srcValue != nil {
						results[i].Add(srcValue)
					}
				} else {
					// dst partially overlaps src.
					// Attribute the bucket proportionally to the overlap length.
					overlapStart := maxTime(srcStart, dstStart)
					overlapEnd := minTime(srcEnd, dstEnd)
					base := srcEnd.Sub(srcStart)
					fraction := overlapEnd.Sub(overlapStart).Seconds() / base.Seconds()

					used := ts.provider()
					if srcValue != nil {
						used.CopyFrom(srcValue)
					}
					used.Multiply(fraction)
					results[i].Add(used)
				}

				// The source bucket extends past this slot: keep it as the
				// current bucket so the next slot can take its share.
				if srcEnd.After(dstEnd) {
					break
				}
			}
			srcIndex++
			srcStart = srcStart.Add(srcInterval)
		}
		dstStart = dstStart.Add(dstInterval)
	}
}

// resetObservation clears the content so the struct may be reused.
+func (ts *timeSeries) resetObservation(observation Observable) Observable { + if observation == nil { + observation = ts.provider() + } else { + observation.Clear() + } + return observation +} + +// TimeSeries tracks data at granularities from 1 second to 16 weeks. +type TimeSeries struct { + timeSeries +} + +// NewTimeSeries creates a new TimeSeries using the function provided for creating new Observable. +func NewTimeSeries(f func() Observable) *TimeSeries { + return NewTimeSeriesWithClock(f, defaultClockInstance) +} + +// NewTimeSeriesWithClock creates a new TimeSeries using the function provided for creating new Observable and the clock for +// assigning timestamps. +func NewTimeSeriesWithClock(f func() Observable, clock Clock) *TimeSeries { + ts := new(TimeSeries) + ts.timeSeries.init(timeSeriesResolutions, f, timeSeriesNumBuckets, clock) + return ts +} + +// MinuteHourSeries tracks data at granularities of 1 minute and 1 hour. +type MinuteHourSeries struct { + timeSeries +} + +// NewMinuteHourSeries creates a new MinuteHourSeries using the function provided for creating new Observable. +func NewMinuteHourSeries(f func() Observable) *MinuteHourSeries { + return NewMinuteHourSeriesWithClock(f, defaultClockInstance) +} + +// NewMinuteHourSeriesWithClock creates a new MinuteHourSeries using the function provided for creating new Observable and the clock for +// assigning timestamps. 
+func NewMinuteHourSeriesWithClock(f func() Observable, clock Clock) *MinuteHourSeries { + ts := new(MinuteHourSeries) + ts.timeSeries.init(minuteHourSeriesResolutions, f, + minuteHourSeriesNumBuckets, clock) + return ts +} + +func (ts *MinuteHourSeries) Minute() Observable { + return ts.timeSeries.Latest(0, 60) +} + +func (ts *MinuteHourSeries) Hour() Observable { + return ts.timeSeries.Latest(1, 60) +} + +func minTime(a, b time.Time) time.Time { + if a.Before(b) { + return a + } + return b +} + +func maxTime(a, b time.Time) time.Time { + if a.After(b) { + return a + } + return b +} diff --git a/vendor/golang.org/x/net/trace/events.go b/vendor/golang.org/x/net/trace/events.go new file mode 100644 index 0000000000..c646a6952e --- /dev/null +++ b/vendor/golang.org/x/net/trace/events.go @@ -0,0 +1,532 @@ +// Copyright 2015 The Go Authors. All rights reserved. +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +package trace + +import ( + "bytes" + "fmt" + "html/template" + "io" + "log" + "net/http" + "runtime" + "sort" + "strconv" + "strings" + "sync" + "sync/atomic" + "text/tabwriter" + "time" +) + +const maxEventsPerLog = 100 + +type bucket struct { + MaxErrAge time.Duration + String string +} + +var buckets = []bucket{ + {0, "total"}, + {10 * time.Second, "errs<10s"}, + {1 * time.Minute, "errs<1m"}, + {10 * time.Minute, "errs<10m"}, + {1 * time.Hour, "errs<1h"}, + {10 * time.Hour, "errs<10h"}, + {24000 * time.Hour, "errors"}, +} + +// RenderEvents renders the HTML page typically served at /debug/events. +// It does not do any auth checking. The request may be nil. +// +// Most users will use the Events handler. +func RenderEvents(w http.ResponseWriter, req *http.Request, sensitive bool) { + now := time.Now() + data := &struct { + Families []string // family names + Buckets []bucket + Counts [][]int // eventLog count per family/bucket + + // Set when a bucket has been selected. 
+ Family string + Bucket int + EventLogs eventLogs + Expanded bool + }{ + Buckets: buckets, + } + + data.Families = make([]string, 0, len(families)) + famMu.RLock() + for name := range families { + data.Families = append(data.Families, name) + } + famMu.RUnlock() + sort.Strings(data.Families) + + // Count the number of eventLogs in each family for each error age. + data.Counts = make([][]int, len(data.Families)) + for i, name := range data.Families { + // TODO(sameer): move this loop under the family lock. + f := getEventFamily(name) + data.Counts[i] = make([]int, len(data.Buckets)) + for j, b := range data.Buckets { + data.Counts[i][j] = f.Count(now, b.MaxErrAge) + } + } + + if req != nil { + var ok bool + data.Family, data.Bucket, ok = parseEventsArgs(req) + if !ok { + // No-op + } else { + data.EventLogs = getEventFamily(data.Family).Copy(now, buckets[data.Bucket].MaxErrAge) + } + if data.EventLogs != nil { + defer data.EventLogs.Free() + sort.Sort(data.EventLogs) + } + if exp, err := strconv.ParseBool(req.FormValue("exp")); err == nil { + data.Expanded = exp + } + } + + famMu.RLock() + defer famMu.RUnlock() + if err := eventsTmpl().Execute(w, data); err != nil { + log.Printf("net/trace: Failed executing template: %v", err) + } +} + +func parseEventsArgs(req *http.Request) (fam string, b int, ok bool) { + fam, bStr := req.FormValue("fam"), req.FormValue("b") + if fam == "" || bStr == "" { + return "", 0, false + } + b, err := strconv.Atoi(bStr) + if err != nil || b < 0 || b >= len(buckets) { + return "", 0, false + } + return fam, b, true +} + +// An EventLog provides a log of events associated with a specific object. +type EventLog interface { + // Printf formats its arguments with fmt.Sprintf and adds the + // result to the event log. + Printf(format string, a ...interface{}) + + // Errorf is like Printf, but it marks this event as an error. + Errorf(format string, a ...interface{}) + + // Finish declares that this event log is complete. 
+ // The event log should not be used after calling this method. + Finish() +} + +// NewEventLog returns a new EventLog with the specified family name +// and title. +func NewEventLog(family, title string) EventLog { + el := newEventLog() + el.ref() + el.Family, el.Title = family, title + el.Start = time.Now() + el.events = make([]logEntry, 0, maxEventsPerLog) + el.stack = make([]uintptr, 32) + n := runtime.Callers(2, el.stack) + el.stack = el.stack[:n] + + getEventFamily(family).add(el) + return el +} + +func (el *eventLog) Finish() { + getEventFamily(el.Family).remove(el) + el.unref() // matches ref in New +} + +var ( + famMu sync.RWMutex + families = make(map[string]*eventFamily) // family name => family +) + +func getEventFamily(fam string) *eventFamily { + famMu.Lock() + defer famMu.Unlock() + f := families[fam] + if f == nil { + f = &eventFamily{} + families[fam] = f + } + return f +} + +type eventFamily struct { + mu sync.RWMutex + eventLogs eventLogs +} + +func (f *eventFamily) add(el *eventLog) { + f.mu.Lock() + f.eventLogs = append(f.eventLogs, el) + f.mu.Unlock() +} + +func (f *eventFamily) remove(el *eventLog) { + f.mu.Lock() + defer f.mu.Unlock() + for i, el0 := range f.eventLogs { + if el == el0 { + copy(f.eventLogs[i:], f.eventLogs[i+1:]) + f.eventLogs = f.eventLogs[:len(f.eventLogs)-1] + return + } + } +} + +func (f *eventFamily) Count(now time.Time, maxErrAge time.Duration) (n int) { + f.mu.RLock() + defer f.mu.RUnlock() + for _, el := range f.eventLogs { + if el.hasRecentError(now, maxErrAge) { + n++ + } + } + return +} + +func (f *eventFamily) Copy(now time.Time, maxErrAge time.Duration) (els eventLogs) { + f.mu.RLock() + defer f.mu.RUnlock() + els = make(eventLogs, 0, len(f.eventLogs)) + for _, el := range f.eventLogs { + if el.hasRecentError(now, maxErrAge) { + el.ref() + els = append(els, el) + } + } + return +} + +type eventLogs []*eventLog + +// Free calls unref on each element of the list. 
+func (els eventLogs) Free() { + for _, el := range els { + el.unref() + } +} + +// eventLogs may be sorted in reverse chronological order. +func (els eventLogs) Len() int { return len(els) } +func (els eventLogs) Less(i, j int) bool { return els[i].Start.After(els[j].Start) } +func (els eventLogs) Swap(i, j int) { els[i], els[j] = els[j], els[i] } + +// A logEntry is a timestamped log entry in an event log. +type logEntry struct { + When time.Time + Elapsed time.Duration // since previous event in log + NewDay bool // whether this event is on a different day to the previous event + What string + IsErr bool +} + +// WhenString returns a string representation of the elapsed time of the event. +// It will include the date if midnight was crossed. +func (e logEntry) WhenString() string { + if e.NewDay { + return e.When.Format("2006/01/02 15:04:05.000000") + } + return e.When.Format("15:04:05.000000") +} + +// An eventLog represents an active event log. +type eventLog struct { + // Family is the top-level grouping of event logs to which this belongs. + Family string + + // Title is the title of this event log. + Title string + + // Timing information. + Start time.Time + + // Call stack where this event log was created. + stack []uintptr + + // Append-only sequence of events. + // + // TODO(sameer): change this to a ring buffer to avoid the array copy + // when we hit maxEventsPerLog. + mu sync.RWMutex + events []logEntry + LastErrorTime time.Time + discarded int + + refs int32 // how many buckets this is in +} + +func (el *eventLog) reset() { + // Clear all but the mutex. Mutexes may not be copied, even when unlocked. 
+ el.Family = "" + el.Title = "" + el.Start = time.Time{} + el.stack = nil + el.events = nil + el.LastErrorTime = time.Time{} + el.discarded = 0 + el.refs = 0 +} + +func (el *eventLog) hasRecentError(now time.Time, maxErrAge time.Duration) bool { + if maxErrAge == 0 { + return true + } + el.mu.RLock() + defer el.mu.RUnlock() + return now.Sub(el.LastErrorTime) < maxErrAge +} + +// delta returns the elapsed time since the last event or the log start, +// and whether it spans midnight. +// L >= el.mu +func (el *eventLog) delta(t time.Time) (time.Duration, bool) { + if len(el.events) == 0 { + return t.Sub(el.Start), false + } + prev := el.events[len(el.events)-1].When + return t.Sub(prev), prev.Day() != t.Day() + +} + +func (el *eventLog) Printf(format string, a ...interface{}) { + el.printf(false, format, a...) +} + +func (el *eventLog) Errorf(format string, a ...interface{}) { + el.printf(true, format, a...) +} + +func (el *eventLog) printf(isErr bool, format string, a ...interface{}) { + e := logEntry{When: time.Now(), IsErr: isErr, What: fmt.Sprintf(format, a...)} + el.mu.Lock() + e.Elapsed, e.NewDay = el.delta(e.When) + if len(el.events) < maxEventsPerLog { + el.events = append(el.events, e) + } else { + // Discard the oldest event. + if el.discarded == 0 { + // el.discarded starts at two to count for the event it + // is replacing, plus the next one that we are about to + // drop. + el.discarded = 2 + } else { + el.discarded++ + } + // TODO(sameer): if this causes allocations on a critical path, + // change eventLog.What to be a fmt.Stringer, as in trace.go. + el.events[0].What = fmt.Sprintf("(%d events discarded)", el.discarded) + // The timestamp of the discarded meta-event should be + // the time of the last event it is representing. 
+ el.events[0].When = el.events[1].When + copy(el.events[1:], el.events[2:]) + el.events[maxEventsPerLog-1] = e + } + if e.IsErr { + el.LastErrorTime = e.When + } + el.mu.Unlock() +} + +func (el *eventLog) ref() { + atomic.AddInt32(&el.refs, 1) +} + +func (el *eventLog) unref() { + if atomic.AddInt32(&el.refs, -1) == 0 { + freeEventLog(el) + } +} + +func (el *eventLog) When() string { + return el.Start.Format("2006/01/02 15:04:05.000000") +} + +func (el *eventLog) ElapsedTime() string { + elapsed := time.Since(el.Start) + return fmt.Sprintf("%.6f", elapsed.Seconds()) +} + +func (el *eventLog) Stack() string { + buf := new(bytes.Buffer) + tw := tabwriter.NewWriter(buf, 1, 8, 1, '\t', 0) + printStackRecord(tw, el.stack) + tw.Flush() + return buf.String() +} + +// printStackRecord prints the function + source line information +// for a single stack trace. +// Adapted from runtime/pprof/pprof.go. +func printStackRecord(w io.Writer, stk []uintptr) { + for _, pc := range stk { + f := runtime.FuncForPC(pc) + if f == nil { + continue + } + file, line := f.FileLine(pc) + name := f.Name() + // Hide runtime.goexit and any runtime functions at the beginning. + if strings.HasPrefix(name, "runtime.") { + continue + } + fmt.Fprintf(w, "# %s\t%s:%d\n", name, file, line) + } +} + +func (el *eventLog) Events() []logEntry { + el.mu.RLock() + defer el.mu.RUnlock() + return el.events +} + +// freeEventLogs is a freelist of *eventLog +var freeEventLogs = make(chan *eventLog, 1000) + +// newEventLog returns a event log ready to use. +func newEventLog() *eventLog { + select { + case el := <-freeEventLogs: + return el + default: + return new(eventLog) + } +} + +// freeEventLog adds el to freeEventLogs if there's room. +// This is non-blocking. 
+func freeEventLog(el *eventLog) { + el.reset() + select { + case freeEventLogs <- el: + default: + } +} + +var eventsTmplCache *template.Template +var eventsTmplOnce sync.Once + +func eventsTmpl() *template.Template { + eventsTmplOnce.Do(func() { + eventsTmplCache = template.Must(template.New("events").Funcs(template.FuncMap{ + "elapsed": elapsed, + "trimSpace": strings.TrimSpace, + }).Parse(eventsHTML)) + }) + return eventsTmplCache +} + +const eventsHTML = ` + + + events + + + + +

/debug/events

+ + + {{range $i, $fam := .Families}} + + + + {{range $j, $bucket := $.Buckets}} + {{$n := index $.Counts $i $j}} + + {{end}} + + {{end}} +
{{$fam}} + {{if $n}}{{end}} + [{{$n}} {{$bucket.String}}] + {{if $n}}{{end}} +
+ +{{if $.EventLogs}} +
+

Family: {{$.Family}}

+ +{{if $.Expanded}}{{end}} +[Summary]{{if $.Expanded}}{{end}} + +{{if not $.Expanded}}{{end}} +[Expanded]{{if not $.Expanded}}{{end}} + + + + {{range $el := $.EventLogs}} + + + + + {{if $.Expanded}} + + + + + + {{range $el.Events}} + + + + + + {{end}} + {{end}} + {{end}} +
WhenElapsed
{{$el.When}}{{$el.ElapsedTime}}{{$el.Title}} +
{{$el.Stack|trimSpace}}
{{.WhenString}}{{elapsed .Elapsed}}.{{if .IsErr}}E{{else}}.{{end}}. {{.What}}
+{{end}} + + +` diff --git a/vendor/golang.org/x/net/trace/histogram.go b/vendor/golang.org/x/net/trace/histogram.go new file mode 100644 index 0000000000..9bf4286c79 --- /dev/null +++ b/vendor/golang.org/x/net/trace/histogram.go @@ -0,0 +1,365 @@ +// Copyright 2015 The Go Authors. All rights reserved. +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +package trace + +// This file implements histogramming for RPC statistics collection. + +import ( + "bytes" + "fmt" + "html/template" + "log" + "math" + "sync" + + "golang.org/x/net/internal/timeseries" +) + +const ( + bucketCount = 38 +) + +// histogram keeps counts of values in buckets that are spaced +// out in powers of 2: 0-1, 2-3, 4-7... +// histogram implements timeseries.Observable +type histogram struct { + sum int64 // running total of measurements + sumOfSquares float64 // square of running total + buckets []int64 // bucketed values for histogram + value int // holds a single value as an optimization + valueCount int64 // number of values recorded for single value +} + +// AddMeasurement records a value measurement observation to the histogram. 
+func (h *histogram) addMeasurement(value int64) { + // TODO: assert invariant + h.sum += value + h.sumOfSquares += float64(value) * float64(value) + + bucketIndex := getBucket(value) + + if h.valueCount == 0 || (h.valueCount > 0 && h.value == bucketIndex) { + h.value = bucketIndex + h.valueCount++ + } else { + h.allocateBuckets() + h.buckets[bucketIndex]++ + } +} + +func (h *histogram) allocateBuckets() { + if h.buckets == nil { + h.buckets = make([]int64, bucketCount) + h.buckets[h.value] = h.valueCount + h.value = 0 + h.valueCount = -1 + } +} + +func log2(i int64) int { + n := 0 + for ; i >= 0x100; i >>= 8 { + n += 8 + } + for ; i > 0; i >>= 1 { + n += 1 + } + return n +} + +func getBucket(i int64) (index int) { + index = log2(i) - 1 + if index < 0 { + index = 0 + } + if index >= bucketCount { + index = bucketCount - 1 + } + return +} + +// Total returns the number of recorded observations. +func (h *histogram) total() (total int64) { + if h.valueCount >= 0 { + total = h.valueCount + } + for _, val := range h.buckets { + total += int64(val) + } + return +} + +// Average returns the average value of recorded observations. +func (h *histogram) average() float64 { + t := h.total() + if t == 0 { + return 0 + } + return float64(h.sum) / float64(t) +} + +// Variance returns the variance of recorded observations. +func (h *histogram) variance() float64 { + t := float64(h.total()) + if t == 0 { + return 0 + } + s := float64(h.sum) / t + return h.sumOfSquares/t - s*s +} + +// StandardDeviation returns the standard deviation of recorded observations. +func (h *histogram) standardDeviation() float64 { + return math.Sqrt(h.variance()) +} + +// PercentileBoundary estimates the value that the given fraction of recorded +// observations are less than. 
+func (h *histogram) percentileBoundary(percentile float64) int64 { + total := h.total() + + // Corner cases (make sure result is strictly less than Total()) + if total == 0 { + return 0 + } else if total == 1 { + return int64(h.average()) + } + + percentOfTotal := round(float64(total) * percentile) + var runningTotal int64 + + for i := range h.buckets { + value := h.buckets[i] + runningTotal += value + if runningTotal == percentOfTotal { + // We hit an exact bucket boundary. If the next bucket has data, it is a + // good estimate of the value. If the bucket is empty, we interpolate the + // midpoint between the next bucket's boundary and the next non-zero + // bucket. If the remaining buckets are all empty, then we use the + // boundary for the next bucket as the estimate. + j := uint8(i + 1) + min := bucketBoundary(j) + if runningTotal < total { + for h.buckets[j] == 0 { + j++ + } + } + max := bucketBoundary(j) + return min + round(float64(max-min)/2) + } else if runningTotal > percentOfTotal { + // The value is in this bucket. Interpolate the value. + delta := runningTotal - percentOfTotal + percentBucket := float64(value-delta) / float64(value) + bucketMin := bucketBoundary(uint8(i)) + nextBucketMin := bucketBoundary(uint8(i + 1)) + bucketSize := nextBucketMin - bucketMin + return bucketMin + round(percentBucket*float64(bucketSize)) + } + } + return bucketBoundary(bucketCount - 1) +} + +// Median returns the estimated median of the observed values. +func (h *histogram) median() int64 { + return h.percentileBoundary(0.5) +} + +// Add adds other to h. 
+func (h *histogram) Add(other timeseries.Observable) { + o := other.(*histogram) + if o.valueCount == 0 { + // Other histogram is empty + } else if h.valueCount >= 0 && o.valueCount > 0 && h.value == o.value { + // Both have a single bucketed value, aggregate them + h.valueCount += o.valueCount + } else { + // Two different values necessitate buckets in this histogram + h.allocateBuckets() + if o.valueCount >= 0 { + h.buckets[o.value] += o.valueCount + } else { + for i := range h.buckets { + h.buckets[i] += o.buckets[i] + } + } + } + h.sumOfSquares += o.sumOfSquares + h.sum += o.sum +} + +// Clear resets the histogram to an empty state, removing all observed values. +func (h *histogram) Clear() { + h.buckets = nil + h.value = 0 + h.valueCount = 0 + h.sum = 0 + h.sumOfSquares = 0 +} + +// CopyFrom copies from other, which must be a *histogram, into h. +func (h *histogram) CopyFrom(other timeseries.Observable) { + o := other.(*histogram) + if o.valueCount == -1 { + h.allocateBuckets() + copy(h.buckets, o.buckets) + } + h.sum = o.sum + h.sumOfSquares = o.sumOfSquares + h.value = o.value + h.valueCount = o.valueCount +} + +// Multiply scales the histogram by the specified ratio. +func (h *histogram) Multiply(ratio float64) { + if h.valueCount == -1 { + for i := range h.buckets { + h.buckets[i] = int64(float64(h.buckets[i]) * ratio) + } + } else { + h.valueCount = int64(float64(h.valueCount) * ratio) + } + h.sum = int64(float64(h.sum) * ratio) + h.sumOfSquares = h.sumOfSquares * ratio +} + +// New creates a new histogram. +func (h *histogram) New() timeseries.Observable { + r := new(histogram) + r.Clear() + return r +} + +func (h *histogram) String() string { + return fmt.Sprintf("%d, %f, %d, %d, %v", + h.sum, h.sumOfSquares, h.value, h.valueCount, h.buckets) +} + +// round returns the closest int64 to the argument +func round(in float64) int64 { + return int64(math.Floor(in + 0.5)) +} + +// bucketBoundary returns the first value in the bucket. 
+func bucketBoundary(bucket uint8) int64 { + if bucket == 0 { + return 0 + } + return 1 << bucket +} + +// bucketData holds data about a specific bucket for use in distTmpl. +type bucketData struct { + Lower, Upper int64 + N int64 + Pct, CumulativePct float64 + GraphWidth int +} + +// data holds data about a Distribution for use in distTmpl. +type data struct { + Buckets []*bucketData + Count, Median int64 + Mean, StandardDeviation float64 +} + +// maxHTMLBarWidth is the maximum width of the HTML bar for visualizing buckets. +const maxHTMLBarWidth = 350.0 + +// newData returns data representing h for use in distTmpl. +func (h *histogram) newData() *data { + // Force the allocation of buckets to simplify the rendering implementation + h.allocateBuckets() + // We scale the bars on the right so that the largest bar is + // maxHTMLBarWidth pixels in width. + maxBucket := int64(0) + for _, n := range h.buckets { + if n > maxBucket { + maxBucket = n + } + } + total := h.total() + barsizeMult := maxHTMLBarWidth / float64(maxBucket) + var pctMult float64 + if total == 0 { + pctMult = 1.0 + } else { + pctMult = 100.0 / float64(total) + } + + buckets := make([]*bucketData, len(h.buckets)) + runningTotal := int64(0) + for i, n := range h.buckets { + if n == 0 { + continue + } + runningTotal += n + var upperBound int64 + if i < bucketCount-1 { + upperBound = bucketBoundary(uint8(i + 1)) + } else { + upperBound = math.MaxInt64 + } + buckets[i] = &bucketData{ + Lower: bucketBoundary(uint8(i)), + Upper: upperBound, + N: n, + Pct: float64(n) * pctMult, + CumulativePct: float64(runningTotal) * pctMult, + GraphWidth: int(float64(n) * barsizeMult), + } + } + return &data{ + Buckets: buckets, + Count: total, + Median: h.median(), + Mean: h.average(), + StandardDeviation: h.standardDeviation(), + } +} + +func (h *histogram) html() template.HTML { + buf := new(bytes.Buffer) + if err := distTmpl().Execute(buf, h.newData()); err != nil { + buf.Reset() + log.Printf("net/trace: couldn't 
execute template: %v", err) + } + return template.HTML(buf.String()) +} + +var distTmplCache *template.Template +var distTmplOnce sync.Once + +func distTmpl() *template.Template { + distTmplOnce.Do(func() { + // Input: data + distTmplCache = template.Must(template.New("distTmpl").Parse(` + + + + + + + +
Count: {{.Count}}Mean: {{printf "%.0f" .Mean}}StdDev: {{printf "%.0f" .StandardDeviation}}Median: {{.Median}}
+
+ +{{range $b := .Buckets}} +{{if $b}} + + + + + + + + + +{{end}} +{{end}} +
[{{.Lower}},{{.Upper}}){{.N}}{{printf "%#.3f" .Pct}}%{{printf "%#.3f" .CumulativePct}}%
+`)) + }) + return distTmplCache +} diff --git a/vendor/golang.org/x/net/trace/trace.go b/vendor/golang.org/x/net/trace/trace.go new file mode 100644 index 0000000000..eae2a99f54 --- /dev/null +++ b/vendor/golang.org/x/net/trace/trace.go @@ -0,0 +1,1130 @@ +// Copyright 2015 The Go Authors. All rights reserved. +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +/* +Package trace implements tracing of requests and long-lived objects. +It exports HTTP interfaces on /debug/requests and /debug/events. + +A trace.Trace provides tracing for short-lived objects, usually requests. +A request handler might be implemented like this: + + func fooHandler(w http.ResponseWriter, req *http.Request) { + tr := trace.New("mypkg.Foo", req.URL.Path) + defer tr.Finish() + ... + tr.LazyPrintf("some event %q happened", str) + ... + if err := somethingImportant(); err != nil { + tr.LazyPrintf("somethingImportant failed: %v", err) + tr.SetError() + } + } + +The /debug/requests HTTP endpoint organizes the traces by family, +errors, and duration. It also provides histogram of request duration +for each family. + +A trace.EventLog provides tracing for long-lived objects, such as RPC +connections. + + // A Fetcher fetches URL paths for a single domain. + type Fetcher struct { + domain string + events trace.EventLog + } + + func NewFetcher(domain string) *Fetcher { + return &Fetcher{ + domain, + trace.NewEventLog("mypkg.Fetcher", domain), + } + } + + func (f *Fetcher) Fetch(path string) (string, error) { + resp, err := http.Get("http://" + f.domain + "/" + path) + if err != nil { + f.events.Errorf("Get(%q) = %v", path, err) + return "", err + } + f.events.Printf("Get(%q) = %s", path, resp.Status) + ... + } + + func (f *Fetcher) Close() error { + f.events.Finish() + return nil + } + +The /debug/events HTTP endpoint organizes the event logs by family and +by time since the last error. 
The expanded view displays recent log +entries and the log's call stack. +*/ +package trace // import "golang.org/x/net/trace" + +import ( + "bytes" + "context" + "fmt" + "html/template" + "io" + "log" + "net" + "net/http" + "net/url" + "runtime" + "sort" + "strconv" + "sync" + "sync/atomic" + "time" + + "golang.org/x/net/internal/timeseries" +) + +// DebugUseAfterFinish controls whether to debug uses of Trace values after finishing. +// FOR DEBUGGING ONLY. This will slow down the program. +var DebugUseAfterFinish = false + +// HTTP ServeMux paths. +const ( + debugRequestsPath = "/debug/requests" + debugEventsPath = "/debug/events" +) + +// AuthRequest determines whether a specific request is permitted to load the +// /debug/requests or /debug/events pages. +// +// It returns two bools; the first indicates whether the page may be viewed at all, +// and the second indicates whether sensitive events will be shown. +// +// AuthRequest may be replaced by a program to customize its authorization requirements. +// +// The default AuthRequest function returns (true, true) if and only if the request +// comes from localhost/127.0.0.1/[::1]. +var AuthRequest = func(req *http.Request) (any, sensitive bool) { + // RemoteAddr is commonly in the form "IP" or "IP:port". + // If it is in the form "IP:port", split off the port. + host, _, err := net.SplitHostPort(req.RemoteAddr) + if err != nil { + host = req.RemoteAddr + } + switch host { + case "localhost", "127.0.0.1", "::1": + return true, true + default: + return false, false + } +} + +func init() { + _, pat := http.DefaultServeMux.Handler(&http.Request{URL: &url.URL{Path: debugRequestsPath}}) + if pat == debugRequestsPath { + panic("/debug/requests is already registered. You may have two independent copies of " + + "golang.org/x/net/trace in your binary, trying to maintain separate state. This may " + + "involve a vendored copy of golang.org/x/net/trace.") + } + + // TODO(jbd): Serve Traces from /debug/traces in the future? 
+ // There is no requirement for a request to be present to have traces. + http.HandleFunc(debugRequestsPath, Traces) + http.HandleFunc(debugEventsPath, Events) +} + +// NewContext returns a copy of the parent context +// and associates it with a Trace. +func NewContext(ctx context.Context, tr Trace) context.Context { + return context.WithValue(ctx, contextKey, tr) +} + +// FromContext returns the Trace bound to the context, if any. +func FromContext(ctx context.Context) (tr Trace, ok bool) { + tr, ok = ctx.Value(contextKey).(Trace) + return +} + +// Traces responds with traces from the program. +// The package initialization registers it in http.DefaultServeMux +// at /debug/requests. +// +// It performs authorization by running AuthRequest. +func Traces(w http.ResponseWriter, req *http.Request) { + any, sensitive := AuthRequest(req) + if !any { + http.Error(w, "not allowed", http.StatusUnauthorized) + return + } + w.Header().Set("Content-Type", "text/html; charset=utf-8") + Render(w, req, sensitive) +} + +// Events responds with a page of events collected by EventLogs. +// The package initialization registers it in http.DefaultServeMux +// at /debug/events. +// +// It performs authorization by running AuthRequest. +func Events(w http.ResponseWriter, req *http.Request) { + any, sensitive := AuthRequest(req) + if !any { + http.Error(w, "not allowed", http.StatusUnauthorized) + return + } + w.Header().Set("Content-Type", "text/html; charset=utf-8") + RenderEvents(w, req, sensitive) +} + +// Render renders the HTML page typically served at /debug/requests. +// It does not do any auth checking. The request may be nil. +// +// Most users will use the Traces handler. +func Render(w io.Writer, req *http.Request, sensitive bool) { + data := &struct { + Families []string + ActiveTraceCount map[string]int + CompletedTraces map[string]*family + + // Set when a bucket has been selected. 
+ Traces traceList + Family string + Bucket int + Expanded bool + Traced bool + Active bool + ShowSensitive bool // whether to show sensitive events + + Histogram template.HTML + HistogramWindow string // e.g. "last minute", "last hour", "all time" + + // If non-zero, the set of traces is a partial set, + // and this is the total number. + Total int + }{ + CompletedTraces: completedTraces, + } + + data.ShowSensitive = sensitive + if req != nil { + // Allow show_sensitive=0 to force hiding of sensitive data for testing. + // This only goes one way; you can't use show_sensitive=1 to see things. + if req.FormValue("show_sensitive") == "0" { + data.ShowSensitive = false + } + + if exp, err := strconv.ParseBool(req.FormValue("exp")); err == nil { + data.Expanded = exp + } + if exp, err := strconv.ParseBool(req.FormValue("rtraced")); err == nil { + data.Traced = exp + } + } + + completedMu.RLock() + data.Families = make([]string, 0, len(completedTraces)) + for fam := range completedTraces { + data.Families = append(data.Families, fam) + } + completedMu.RUnlock() + sort.Strings(data.Families) + + // We are careful here to minimize the time spent locking activeMu, + // since that lock is required every time an RPC starts and finishes. 
+ data.ActiveTraceCount = make(map[string]int, len(data.Families)) + activeMu.RLock() + for fam, s := range activeTraces { + data.ActiveTraceCount[fam] = s.Len() + } + activeMu.RUnlock() + + var ok bool + data.Family, data.Bucket, ok = parseArgs(req) + switch { + case !ok: + // No-op + case data.Bucket == -1: + data.Active = true + n := data.ActiveTraceCount[data.Family] + data.Traces = getActiveTraces(data.Family) + if len(data.Traces) < n { + data.Total = n + } + case data.Bucket < bucketsPerFamily: + if b := lookupBucket(data.Family, data.Bucket); b != nil { + data.Traces = b.Copy(data.Traced) + } + default: + if f := getFamily(data.Family, false); f != nil { + var obs timeseries.Observable + f.LatencyMu.RLock() + switch o := data.Bucket - bucketsPerFamily; o { + case 0: + obs = f.Latency.Minute() + data.HistogramWindow = "last minute" + case 1: + obs = f.Latency.Hour() + data.HistogramWindow = "last hour" + case 2: + obs = f.Latency.Total() + data.HistogramWindow = "all time" + } + f.LatencyMu.RUnlock() + if obs != nil { + data.Histogram = obs.(*histogram).html() + } + } + } + + if data.Traces != nil { + defer data.Traces.Free() + sort.Sort(data.Traces) + } + + completedMu.RLock() + defer completedMu.RUnlock() + if err := pageTmpl().ExecuteTemplate(w, "Page", data); err != nil { + log.Printf("net/trace: Failed executing template: %v", err) + } +} + +func parseArgs(req *http.Request) (fam string, b int, ok bool) { + if req == nil { + return "", 0, false + } + fam, bStr := req.FormValue("fam"), req.FormValue("b") + if fam == "" || bStr == "" { + return "", 0, false + } + b, err := strconv.Atoi(bStr) + if err != nil || b < -1 { + return "", 0, false + } + + return fam, b, true +} + +func lookupBucket(fam string, b int) *traceBucket { + f := getFamily(fam, false) + if f == nil || b < 0 || b >= len(f.Buckets) { + return nil + } + return f.Buckets[b] +} + +type contextKeyT string + +var contextKey = contextKeyT("golang.org/x/net/trace.Trace") + +// Trace represents 
an active request. +type Trace interface { + // LazyLog adds x to the event log. It will be evaluated each time the + // /debug/requests page is rendered. Any memory referenced by x will be + // pinned until the trace is finished and later discarded. + LazyLog(x fmt.Stringer, sensitive bool) + + // LazyPrintf evaluates its arguments with fmt.Sprintf each time the + // /debug/requests page is rendered. Any memory referenced by a will be + // pinned until the trace is finished and later discarded. + LazyPrintf(format string, a ...interface{}) + + // SetError declares that this trace resulted in an error. + SetError() + + // SetRecycler sets a recycler for the trace. + // f will be called for each event passed to LazyLog at a time when + // it is no longer required, whether while the trace is still active + // and the event is discarded, or when a completed trace is discarded. + SetRecycler(f func(interface{})) + + // SetTraceInfo sets the trace info for the trace. + // This is currently unused. + SetTraceInfo(traceID, spanID uint64) + + // SetMaxEvents sets the maximum number of events that will be stored + // in the trace. This has no effect if any events have already been + // added to the trace. + SetMaxEvents(m int) + + // Finish declares that this trace is complete. + // The trace should not be used after calling this method. + Finish() +} + +type lazySprintf struct { + format string + a []interface{} +} + +func (l *lazySprintf) String() string { + return fmt.Sprintf(l.format, l.a...) +} + +// New returns a new Trace with the specified family and title. 
+func New(family, title string) Trace { + tr := newTrace() + tr.ref() + tr.Family, tr.Title = family, title + tr.Start = time.Now() + tr.maxEvents = maxEventsPerTrace + tr.events = tr.eventsBuf[:0] + + activeMu.RLock() + s := activeTraces[tr.Family] + activeMu.RUnlock() + if s == nil { + activeMu.Lock() + s = activeTraces[tr.Family] // check again + if s == nil { + s = new(traceSet) + activeTraces[tr.Family] = s + } + activeMu.Unlock() + } + s.Add(tr) + + // Trigger allocation of the completed trace structure for this family. + // This will cause the family to be present in the request page during + // the first trace of this family. We don't care about the return value, + // nor is there any need for this to run inline, so we execute it in its + // own goroutine, but only if the family isn't allocated yet. + completedMu.RLock() + if _, ok := completedTraces[tr.Family]; !ok { + go allocFamily(tr.Family) + } + completedMu.RUnlock() + + return tr +} + +func (tr *trace) Finish() { + elapsed := time.Since(tr.Start) + tr.mu.Lock() + tr.Elapsed = elapsed + tr.mu.Unlock() + + if DebugUseAfterFinish { + buf := make([]byte, 4<<10) // 4 KB should be enough + n := runtime.Stack(buf, false) + tr.finishStack = buf[:n] + } + + activeMu.RLock() + m := activeTraces[tr.Family] + activeMu.RUnlock() + m.Remove(tr) + + f := getFamily(tr.Family, true) + tr.mu.RLock() // protects tr fields in Cond.match calls + for _, b := range f.Buckets { + if b.Cond.match(tr) { + b.Add(tr) + } + } + tr.mu.RUnlock() + + // Add a sample of elapsed time as microseconds to the family's timeseries + h := new(histogram) + h.addMeasurement(elapsed.Nanoseconds() / 1e3) + f.LatencyMu.Lock() + f.Latency.Add(h) + f.LatencyMu.Unlock() + + tr.unref() // matches ref in New +} + +const ( + bucketsPerFamily = 9 + tracesPerBucket = 10 + maxActiveTraces = 20 // Maximum number of active traces to show. + maxEventsPerTrace = 10 + numHistogramBuckets = 38 +) + +var ( + // The active traces. 
+ activeMu sync.RWMutex + activeTraces = make(map[string]*traceSet) // family -> traces + + // Families of completed traces. + completedMu sync.RWMutex + completedTraces = make(map[string]*family) // family -> traces +) + +type traceSet struct { + mu sync.RWMutex + m map[*trace]bool + + // We could avoid the entire map scan in FirstN by having a slice of all the traces + // ordered by start time, and an index into that from the trace struct, with a periodic + // repack of the slice after enough traces finish; we could also use a skip list or similar. + // However, that would shift some of the expense from /debug/requests time to RPC time, + // which is probably the wrong trade-off. +} + +func (ts *traceSet) Len() int { + ts.mu.RLock() + defer ts.mu.RUnlock() + return len(ts.m) +} + +func (ts *traceSet) Add(tr *trace) { + ts.mu.Lock() + if ts.m == nil { + ts.m = make(map[*trace]bool) + } + ts.m[tr] = true + ts.mu.Unlock() +} + +func (ts *traceSet) Remove(tr *trace) { + ts.mu.Lock() + delete(ts.m, tr) + ts.mu.Unlock() +} + +// FirstN returns the first n traces ordered by time. +func (ts *traceSet) FirstN(n int) traceList { + ts.mu.RLock() + defer ts.mu.RUnlock() + + if n > len(ts.m) { + n = len(ts.m) + } + trl := make(traceList, 0, n) + + // Fast path for when no selectivity is needed. + if n == len(ts.m) { + for tr := range ts.m { + tr.ref() + trl = append(trl, tr) + } + sort.Sort(trl) + return trl + } + + // Pick the oldest n traces. + // This is inefficient. See the comment in the traceSet struct. + for tr := range ts.m { + // Put the first n traces into trl in the order they occur. + // When we have n, sort trl, and thereafter maintain its order. + if len(trl) < n { + tr.ref() + trl = append(trl, tr) + if len(trl) == n { + // This is guaranteed to happen exactly once during this loop. + sort.Sort(trl) + } + continue + } + if tr.Start.After(trl[n-1].Start) { + continue + } + + // Find where to insert this one. 
+ tr.ref() + i := sort.Search(n, func(i int) bool { return trl[i].Start.After(tr.Start) }) + trl[n-1].unref() + copy(trl[i+1:], trl[i:]) + trl[i] = tr + } + + return trl +} + +func getActiveTraces(fam string) traceList { + activeMu.RLock() + s := activeTraces[fam] + activeMu.RUnlock() + if s == nil { + return nil + } + return s.FirstN(maxActiveTraces) +} + +func getFamily(fam string, allocNew bool) *family { + completedMu.RLock() + f := completedTraces[fam] + completedMu.RUnlock() + if f == nil && allocNew { + f = allocFamily(fam) + } + return f +} + +func allocFamily(fam string) *family { + completedMu.Lock() + defer completedMu.Unlock() + f := completedTraces[fam] + if f == nil { + f = newFamily() + completedTraces[fam] = f + } + return f +} + +// family represents a set of trace buckets and associated latency information. +type family struct { + // traces may occur in multiple buckets. + Buckets [bucketsPerFamily]*traceBucket + + // latency time series + LatencyMu sync.RWMutex + Latency *timeseries.MinuteHourSeries +} + +func newFamily() *family { + return &family{ + Buckets: [bucketsPerFamily]*traceBucket{ + {Cond: minCond(0)}, + {Cond: minCond(50 * time.Millisecond)}, + {Cond: minCond(100 * time.Millisecond)}, + {Cond: minCond(200 * time.Millisecond)}, + {Cond: minCond(500 * time.Millisecond)}, + {Cond: minCond(1 * time.Second)}, + {Cond: minCond(10 * time.Second)}, + {Cond: minCond(100 * time.Second)}, + {Cond: errorCond{}}, + }, + Latency: timeseries.NewMinuteHourSeries(func() timeseries.Observable { return new(histogram) }), + } +} + +// traceBucket represents a size-capped bucket of historic traces, +// along with a condition for a trace to belong to the bucket. +type traceBucket struct { + Cond cond + + // Ring buffer implementation of a fixed-size FIFO queue. 
+ mu sync.RWMutex + buf [tracesPerBucket]*trace + start int // < tracesPerBucket + length int // <= tracesPerBucket +} + +func (b *traceBucket) Add(tr *trace) { + b.mu.Lock() + defer b.mu.Unlock() + + i := b.start + b.length + if i >= tracesPerBucket { + i -= tracesPerBucket + } + if b.length == tracesPerBucket { + // "Remove" an element from the bucket. + b.buf[i].unref() + b.start++ + if b.start == tracesPerBucket { + b.start = 0 + } + } + b.buf[i] = tr + if b.length < tracesPerBucket { + b.length++ + } + tr.ref() +} + +// Copy returns a copy of the traces in the bucket. +// If tracedOnly is true, only the traces with trace information will be returned. +// The logs will be ref'd before returning; the caller should call +// the Free method when it is done with them. +// TODO(dsymonds): keep track of traced requests in separate buckets. +func (b *traceBucket) Copy(tracedOnly bool) traceList { + b.mu.RLock() + defer b.mu.RUnlock() + + trl := make(traceList, 0, b.length) + for i, x := 0, b.start; i < b.length; i++ { + tr := b.buf[x] + if !tracedOnly || tr.spanID != 0 { + tr.ref() + trl = append(trl, tr) + } + x++ + if x == b.length { + x = 0 + } + } + return trl +} + +func (b *traceBucket) Empty() bool { + b.mu.RLock() + defer b.mu.RUnlock() + return b.length == 0 +} + +// cond represents a condition on a trace. +type cond interface { + match(t *trace) bool + String() string +} + +type minCond time.Duration + +func (m minCond) match(t *trace) bool { return t.Elapsed >= time.Duration(m) } +func (m minCond) String() string { return fmt.Sprintf("≥%gs", time.Duration(m).Seconds()) } + +type errorCond struct{} + +func (e errorCond) match(t *trace) bool { return t.IsError } +func (e errorCond) String() string { return "errors" } + +type traceList []*trace + +// Free calls unref on each element of the list. +func (trl traceList) Free() { + for _, t := range trl { + t.unref() + } +} + +// traceList may be sorted in reverse chronological order. 
+func (trl traceList) Len() int { return len(trl) } +func (trl traceList) Less(i, j int) bool { return trl[i].Start.After(trl[j].Start) } +func (trl traceList) Swap(i, j int) { trl[i], trl[j] = trl[j], trl[i] } + +// An event is a timestamped log entry in a trace. +type event struct { + When time.Time + Elapsed time.Duration // since previous event in trace + NewDay bool // whether this event is on a different day to the previous event + Recyclable bool // whether this event was passed via LazyLog + Sensitive bool // whether this event contains sensitive information + What interface{} // string or fmt.Stringer +} + +// WhenString returns a string representation of the elapsed time of the event. +// It will include the date if midnight was crossed. +func (e event) WhenString() string { + if e.NewDay { + return e.When.Format("2006/01/02 15:04:05.000000") + } + return e.When.Format("15:04:05.000000") +} + +// discarded represents a number of discarded events. +// It is stored as *discarded to make it easier to update in-place. +type discarded int + +func (d *discarded) String() string { + return fmt.Sprintf("(%d events discarded)", int(*d)) +} + +// trace represents an active or complete request, +// either sent or received by this program. +type trace struct { + // Family is the top-level grouping of traces to which this belongs. + Family string + + // Title is the title of this trace. + Title string + + // Start time of the this trace. + Start time.Time + + mu sync.RWMutex + events []event // Append-only sequence of events (modulo discards). + maxEvents int + recycler func(interface{}) + IsError bool // Whether this trace resulted in an error. + Elapsed time.Duration // Elapsed time for this trace, zero while active. + traceID uint64 // Trace information if non-zero. 
+ spanID uint64 + + refs int32 // how many buckets this is in + disc discarded // scratch space to avoid allocation + + finishStack []byte // where finish was called, if DebugUseAfterFinish is set + + eventsBuf [4]event // preallocated buffer in case we only log a few events +} + +func (tr *trace) reset() { + // Clear all but the mutex. Mutexes may not be copied, even when unlocked. + tr.Family = "" + tr.Title = "" + tr.Start = time.Time{} + + tr.mu.Lock() + tr.Elapsed = 0 + tr.traceID = 0 + tr.spanID = 0 + tr.IsError = false + tr.maxEvents = 0 + tr.events = nil + tr.recycler = nil + tr.mu.Unlock() + + tr.refs = 0 + tr.disc = 0 + tr.finishStack = nil + for i := range tr.eventsBuf { + tr.eventsBuf[i] = event{} + } +} + +// delta returns the elapsed time since the last event or the trace start, +// and whether it spans midnight. +// L >= tr.mu +func (tr *trace) delta(t time.Time) (time.Duration, bool) { + if len(tr.events) == 0 { + return t.Sub(tr.Start), false + } + prev := tr.events[len(tr.events)-1].When + return t.Sub(prev), prev.Day() != t.Day() +} + +func (tr *trace) addEvent(x interface{}, recyclable, sensitive bool) { + if DebugUseAfterFinish && tr.finishStack != nil { + buf := make([]byte, 4<<10) // 4 KB should be enough + n := runtime.Stack(buf, false) + log.Printf("net/trace: trace used after finish:\nFinished at:\n%s\nUsed at:\n%s", tr.finishStack, buf[:n]) + } + + /* + NOTE TO DEBUGGERS + + If you are here because your program panicked in this code, + it is almost definitely the fault of code using this package, + and very unlikely to be the fault of this code. + + The most likely scenario is that some code elsewhere is using + a trace.Trace after its Finish method is called. + You can temporarily set the DebugUseAfterFinish var + to help discover where that is; do not leave that var set, + since it makes this package much less efficient. 
+ */ + + e := event{When: time.Now(), What: x, Recyclable: recyclable, Sensitive: sensitive} + tr.mu.Lock() + e.Elapsed, e.NewDay = tr.delta(e.When) + if len(tr.events) < tr.maxEvents { + tr.events = append(tr.events, e) + } else { + // Discard the middle events. + di := int((tr.maxEvents - 1) / 2) + if d, ok := tr.events[di].What.(*discarded); ok { + (*d)++ + } else { + // disc starts at two to count for the event it is replacing, + // plus the next one that we are about to drop. + tr.disc = 2 + if tr.recycler != nil && tr.events[di].Recyclable { + go tr.recycler(tr.events[di].What) + } + tr.events[di].What = &tr.disc + } + // The timestamp of the discarded meta-event should be + // the time of the last event it is representing. + tr.events[di].When = tr.events[di+1].When + + if tr.recycler != nil && tr.events[di+1].Recyclable { + go tr.recycler(tr.events[di+1].What) + } + copy(tr.events[di+1:], tr.events[di+2:]) + tr.events[tr.maxEvents-1] = e + } + tr.mu.Unlock() +} + +func (tr *trace) LazyLog(x fmt.Stringer, sensitive bool) { + tr.addEvent(x, true, sensitive) +} + +func (tr *trace) LazyPrintf(format string, a ...interface{}) { + tr.addEvent(&lazySprintf{format, a}, false, false) +} + +func (tr *trace) SetError() { + tr.mu.Lock() + tr.IsError = true + tr.mu.Unlock() +} + +func (tr *trace) SetRecycler(f func(interface{})) { + tr.mu.Lock() + tr.recycler = f + tr.mu.Unlock() +} + +func (tr *trace) SetTraceInfo(traceID, spanID uint64) { + tr.mu.Lock() + tr.traceID, tr.spanID = traceID, spanID + tr.mu.Unlock() +} + +func (tr *trace) SetMaxEvents(m int) { + tr.mu.Lock() + // Always keep at least three events: first, discarded count, last. + if len(tr.events) == 0 && m > 3 { + tr.maxEvents = m + } + tr.mu.Unlock() +} + +func (tr *trace) ref() { + atomic.AddInt32(&tr.refs, 1) +} + +func (tr *trace) unref() { + if atomic.AddInt32(&tr.refs, -1) == 0 { + tr.mu.RLock() + if tr.recycler != nil { + // freeTrace clears tr, so we hold tr.recycler and tr.events here. 
+ go func(f func(interface{}), es []event) { + for _, e := range es { + if e.Recyclable { + f(e.What) + } + } + }(tr.recycler, tr.events) + } + tr.mu.RUnlock() + + freeTrace(tr) + } +} + +func (tr *trace) When() string { + return tr.Start.Format("2006/01/02 15:04:05.000000") +} + +func (tr *trace) ElapsedTime() string { + tr.mu.RLock() + t := tr.Elapsed + tr.mu.RUnlock() + + if t == 0 { + // Active trace. + t = time.Since(tr.Start) + } + return fmt.Sprintf("%.6f", t.Seconds()) +} + +func (tr *trace) Events() []event { + tr.mu.RLock() + defer tr.mu.RUnlock() + return tr.events +} + +var traceFreeList = make(chan *trace, 1000) // TODO(dsymonds): Use sync.Pool? + +// newTrace returns a trace ready to use. +func newTrace() *trace { + select { + case tr := <-traceFreeList: + return tr + default: + return new(trace) + } +} + +// freeTrace adds tr to traceFreeList if there's room. +// This is non-blocking. +func freeTrace(tr *trace) { + if DebugUseAfterFinish { + return // never reuse + } + tr.reset() + select { + case traceFreeList <- tr: + default: + } +} + +func elapsed(d time.Duration) string { + b := []byte(fmt.Sprintf("%.6f", d.Seconds())) + + // For subsecond durations, blank all zeros before decimal point, + // and all zeros between the decimal point and the first non-zero digit. 
+ if d < time.Second { + dot := bytes.IndexByte(b, '.') + for i := 0; i < dot; i++ { + b[i] = ' ' + } + for i := dot + 1; i < len(b); i++ { + if b[i] == '0' { + b[i] = ' ' + } else { + break + } + } + } + + return string(b) +} + +var pageTmplCache *template.Template +var pageTmplOnce sync.Once + +func pageTmpl() *template.Template { + pageTmplOnce.Do(func() { + pageTmplCache = template.Must(template.New("Page").Funcs(template.FuncMap{ + "elapsed": elapsed, + "add": func(a, b int) int { return a + b }, + }).Parse(pageHTML)) + }) + return pageTmplCache +} + +const pageHTML = ` +{{template "Prolog" .}} +{{template "StatusTable" .}} +{{template "Epilog" .}} + +{{define "Prolog"}} + + + /debug/requests + + + + +

/debug/requests

+{{end}} {{/* end of Prolog */}} + +{{define "StatusTable"}} + + {{range $fam := .Families}} + + + + {{$n := index $.ActiveTraceCount $fam}} + + + {{$f := index $.CompletedTraces $fam}} + {{range $i, $b := $f.Buckets}} + {{$empty := $b.Empty}} + + {{end}} + + {{$nb := len $f.Buckets}} + + + + + + {{end}} +
{{$fam}} + {{if $n}}{{end}} + [{{$n}} active] + {{if $n}}{{end}} + + {{if not $empty}}{{end}} + [{{.Cond}}] + {{if not $empty}}{{end}} + + [minute] + + [hour] + + [total] +
+{{end}} {{/* end of StatusTable */}} + +{{define "Epilog"}} +{{if $.Traces}} +
+

Family: {{$.Family}}

+ +{{if or $.Expanded $.Traced}} + [Normal/Summary] +{{else}} + [Normal/Summary] +{{end}} + +{{if or (not $.Expanded) $.Traced}} + [Normal/Expanded] +{{else}} + [Normal/Expanded] +{{end}} + +{{if not $.Active}} + {{if or $.Expanded (not $.Traced)}} + [Traced/Summary] + {{else}} + [Traced/Summary] + {{end}} + {{if or (not $.Expanded) (not $.Traced)}} + [Traced/Expanded] + {{else}} + [Traced/Expanded] + {{end}} +{{end}} + +{{if $.Total}} +

Showing {{len $.Traces}} of {{$.Total}} traces.

+{{end}} + + + + + {{range $tr := $.Traces}} + + + + + {{/* TODO: include traceID/spanID */}} + + {{if $.Expanded}} + {{range $tr.Events}} + + + + + + {{end}} + {{end}} + {{end}} +
+ {{if $.Active}}Active{{else}}Completed{{end}} Requests +
WhenElapsed (s)
{{$tr.When}}{{$tr.ElapsedTime}}{{$tr.Title}}
{{.WhenString}}{{elapsed .Elapsed}}{{if or $.ShowSensitive (not .Sensitive)}}... {{.What}}{{else}}[redacted]{{end}}
+{{end}} {{/* if $.Traces */}} + +{{if $.Histogram}} +

Latency (µs) of {{$.Family}} over {{$.HistogramWindow}}

+{{$.Histogram}} +{{end}} {{/* if $.Histogram */}} + + + +{{end}} {{/* end of Epilog */}} +` diff --git a/vendor/modules.txt b/vendor/modules.txt index e4f4475da6..a5d9fa1cc5 100644 --- a/vendor/modules.txt +++ b/vendor/modules.txt @@ -5,6 +5,9 @@ filippo.io/edwards25519/field # github.com/42wim/go-gitter v0.0.0-20170828205020-017310c2d557 ## explicit github.com/42wim/go-gitter +# github.com/AndreasBriese/bbloom v0.0.0-20190825152654-46b345b51c96 +## explicit +github.com/AndreasBriese/bbloom # github.com/Baozisoftware/qrcode-terminal-go v0.0.0-20170407111555-c0650d8dff0f ## explicit github.com/Baozisoftware/qrcode-terminal-go @@ -68,6 +71,17 @@ github.com/d5/tengo/v2/token # github.com/davecgh/go-spew v1.1.1 ## explicit github.com/davecgh/go-spew/spew +# github.com/dgraph-io/badger v1.6.0 +## explicit; go 1.12 +github.com/dgraph-io/badger +github.com/dgraph-io/badger/options +github.com/dgraph-io/badger/pb +github.com/dgraph-io/badger/skl +github.com/dgraph-io/badger/table +github.com/dgraph-io/badger/y +# github.com/dgryski/go-farm v0.0.0-20190423205320-6a90982ecee2 +## explicit +github.com/dgryski/go-farm # github.com/dustin/go-humanize v1.0.0 ## explicit github.com/dustin/go-humanize @@ -361,6 +375,18 @@ github.com/pelletier/go-toml/v2/unstable # github.com/philhofer/fwd v1.1.1 ## explicit github.com/philhofer/fwd +# github.com/philippgille/gokv v0.6.0 +## explicit; go 1.13 +github.com/philippgille/gokv +# github.com/philippgille/gokv/badgerdb v0.6.0 +## explicit; go 1.13 +github.com/philippgille/gokv/badgerdb +# github.com/philippgille/gokv/encoding v0.0.0-20191011213304-eb77f15b9c61 +## explicit; go 1.12 +github.com/philippgille/gokv/encoding +# github.com/philippgille/gokv/util v0.0.0-20191011213304-eb77f15b9c61 +## explicit; go 1.12 +github.com/philippgille/gokv/util # github.com/pkg/errors v0.9.1 ## explicit github.com/pkg/errors @@ -591,7 +617,9 @@ golang.org/x/net/http2 golang.org/x/net/http2/h2c golang.org/x/net/http2/hpack golang.org/x/net/idna 
+golang.org/x/net/internal/timeseries golang.org/x/net/publicsuffix +golang.org/x/net/trace golang.org/x/net/websocket # golang.org/x/oauth2 v0.6.0 ## explicit; go 1.17