Skip to content

Commit

Permalink
divider fixes
Browse files Browse the repository at this point in the history
  • Loading branch information
Jeffrey Koehler committed Oct 14, 2024
1 parent 9b940fb commit 8217899
Show file tree
Hide file tree
Showing 6 changed files with 53 additions and 83 deletions.
2 changes: 1 addition & 1 deletion internal/redisstreams/streams.go
Original file line number Diff line number Diff line change
Expand Up @@ -37,7 +37,7 @@ func (s *StreamListener) Listen() {
if errors.Is(err, context.Canceled) {
return
}
s.Logger(s.Ctx).Panic("failed to read data from stream", slog.String("err.error", err.Error()), slog.String("streamlistener.stream", s.Key))
s.Logger(s.Ctx).Panic("failed to read data from stream", err, slog.String("streamlistener.stream", s.Key))
}

//for all events, get the data and send it to the channel.
Expand Down
18 changes: 9 additions & 9 deletions internal/ticker/ticker.go
Original file line number Diff line number Diff line change
Expand Up @@ -3,7 +3,6 @@ package ticker
import (
"context"
"fmt"
"log/slog"
"time"

"github.com/streemtech/divider"
Expand Down Expand Up @@ -33,14 +32,15 @@ func (t TickerFunc) Do() error {
ticker := time.NewTicker(t.D)

go func() {
defer func() {
ticker.Stop()
if e := recover(); e != nil {
if t.Logger != nil {
t.Logger(t.C).Error("Ran into exception when running ticker function", slog.String("err.panic", fmt.Sprintf("%+v", e)))
}
}
}()
//Intentionally disabling the panic catch as the ticker will be disabled, which causes unexpected problems.
// defer func() {
// ticker.Stop()
// if e := recover(); e != nil {
// if t.Logger != nil {
// t.Logger(t.C).Error("Ran into exception when running ticker function", slog.String("err.panic", fmt.Sprintf("%+v", e)))
// }
// }
// }()
for {
select {
case <-ticker.C:
Expand Down
12 changes: 0 additions & 12 deletions redisconsistent/metrics.go
Original file line number Diff line number Diff line change
Expand Up @@ -52,13 +52,6 @@ var WorkerRectifyTime = promauto.NewHistogramVec(prometheus.HistogramOpts{
Help: "Returns the length of time that rectifying the work assigned to this node took",
Buckets: HistogramBuckets,
}, []string{"divider"})
var MasterPingTime = promauto.NewHistogramVec(prometheus.HistogramOpts{
Namespace: "streemtech",
Subsystem: "redis_work_divider",
Name: "master_ping_time",
Help: "Returns the length of time that updating the master took.",
Buckets: HistogramBuckets,
}, []string{"divider"})
var WorkerPingTime = promauto.NewHistogramVec(prometheus.HistogramOpts{
Namespace: "streemtech",
Subsystem: "redis_work_divider",
Expand Down Expand Up @@ -161,11 +154,6 @@ func InitMetrics(name string) {
panic(errors.Wrap(err, "failed to get metric"))
}

_, err = MasterPingTime.GetMetricWithLabelValues(name)
if err != nil {
panic(errors.Wrap(err, "failed to get metric"))
}

_, err = WorkerPingTime.GetMetricWithLabelValues(name)
if err != nil {
panic(errors.Wrap(err, "failed to get metric"))
Expand Down
96 changes: 40 additions & 56 deletions redisconsistent/redis.go
Original file line number Diff line number Diff line change
Expand Up @@ -7,6 +7,7 @@ import (
"log/slog"
"time"

// "github.com/k0kubun/pp/v3"
"github.com/pkg/errors"

"github.com/streemtech/divider"
Expand Down Expand Up @@ -80,18 +81,6 @@ func (d *dividerWorker) StartWorker(ctx context.Context) {
D: d.conf.compareKeys,
F: d.workerRectifyAssignedWorkFunc,
}
d.masterPing = ticker.TickerFunc{
C: d.ctx,
Logger: logger,
D: d.conf.masterPing,
F: d.masterPingFunc,
}
d.workerPing = ticker.TickerFunc{
C: d.ctx,
Logger: logger,
D: d.conf.workerPing,
F: d.workerPingFunc,
}

InitMetrics(d.conf.metricsName)

Expand All @@ -103,7 +92,7 @@ func (d *dividerWorker) StartWorker(ctx context.Context) {
//add workers to work holder
err := d.storage.UpdateTimeoutForWorkers(ctx, d.getWorkerNodeKeys())
if err != nil {
d.conf.logger(ctx).Panic("failed to update timeout for workers", slog.String("err.error", err.Error()), slog.String("divider.id", d.conf.instanceID))
d.conf.logger(ctx).Panic("failed to update timeout for workers", err, slog.String("divider.id", d.conf.instanceID))
}

//start all listeners and tickers.
Expand All @@ -114,23 +103,13 @@ func (d *dividerWorker) StartWorker(ctx context.Context) {

err = d.masterUpdateRequiredWork.Do()
if err != nil {
d.conf.logger(ctx).Panic("failed to start ticker", slog.String("err.error", err.Error()), slog.String("divider.id", d.conf.instanceID))
d.conf.logger(ctx).Panic("failed to start ticker", err, slog.String("divider.id", d.conf.instanceID))
}
err = d.workerRectifyAssignedWork.Do()
if err != nil {
d.conf.logger(ctx).Panic("failed to start ticker", slog.String("err.error", err.Error()), slog.String("divider.id", d.conf.instanceID))
}
err = d.masterPing.Do()
if err != nil {
d.conf.logger(ctx).Panic("failed to start ticker", slog.String("err.error", err.Error()), slog.String("divider.id", d.conf.instanceID))
}
err = d.workerPing.Do()
if err != nil {
d.conf.logger(ctx).Panic("failed to start ticker", slog.String("err.error", err.Error()), slog.String("divider.id", d.conf.instanceID))
d.conf.logger(ctx).Panic("failed to start ticker", err, slog.String("divider.id", d.conf.instanceID))
}

d.masterPing.F()
d.workerPing.F()
d.masterUpdateRequiredWork.F()
d.workerRectifyAssignedWork.F()

Expand Down Expand Up @@ -309,52 +288,53 @@ func (d *dividerWorker) masterUpdateRequiredWorkFunc() {
}()

d.conf.logger(d.ctx).Debug("masterUpdateRequiredWorkFunc triggered", slog.String("divider.id", d.conf.instanceID))
masterKey := fmt.Sprintf("%s:%s", d.conf.rootKey, "master")

//check the master key.
master, err := d.client.Get(d.ctx, masterKey).Result()
if err != nil {
d.conf.logger(d.ctx).Panic("Error getting current master", slog.String("err.error", err.Error()), slog.String("divider.id", d.conf.instanceID))
return
}
//if not master, dont do the work fetcher.
if master != d.conf.instanceID {
if !d.materUpdate(d.ctx) {
return
}

//Get all the newWork that needs to be assigned.
newWork, err := d.conf.workFetcher(d.ctx)
if err != nil {
d.conf.logger(d.ctx).Panic("failed to execute work fetcher", slog.String("err.error", err.Error()), slog.String("divider.id", d.conf.instanceID))
d.conf.logger(d.ctx).Panic("failed to execute work fetcher", err, slog.String("divider.id", d.conf.instanceID))
return
}

//Add the work to the list of work in the system.
err = d.storage.AddWorkToDividedWork(d.ctx, newWork)
if err != nil {
d.conf.logger(d.ctx).Panic("failed to add work to divided work", slog.String("err.error", err.Error()), slog.String("divider.id", d.conf.instanceID))
d.conf.logger(d.ctx).Panic("failed to add work to divided work", err, slog.String("divider.id", d.conf.instanceID))
return
}

//get existing work
existingWork, err := d.storage.GetAllWork(d.ctx)
if err != nil {
d.conf.logger(d.ctx).Panic("failed to get list of all work", slog.String("err.error", err.Error()), slog.String("divider.id", d.conf.instanceID))
d.conf.logger(d.ctx).Panic("failed to get list of all work", err, slog.String("divider.id", d.conf.instanceID))
return
}

//determine what work to remove
_, remove := getToRemoveToKeep(existingWork, newWork)

// pp.Println(d.conf.metricsName, existingWork, newWork, remove)
//remove all that work
err = d.storage.RemoveWorkFromDividedWork(d.ctx, remove)
if err != nil {
d.conf.logger(d.ctx).Panic("failed to remove the old work", slog.String("err.error", err.Error()), slog.String("divider.id", d.conf.instanceID))
d.conf.logger(d.ctx).Panic("failed to remove the old work", err, slog.String("divider.id", d.conf.instanceID))
return
}

for _, v := range remove {
err = d.removeWork.Publish(d.ctx, v)
if err != nil {
d.conf.logger(d.ctx).Panic("failed to publish old work removal", err, slog.String("divider.id", d.conf.instanceID))
}
}
//note: Not triggering the add work event as that event should be triggered manually by the add work call, not by this.
//The work will be picked up by the rectify call later.
//Am manually calling the remove work as that should happen as soon as rectified if needed. (This can result in a double call, but that's ok in this case.)
}

// get work assigned to this node, compare with known work, and start/stop all work as needed.
Expand All @@ -369,56 +349,56 @@ func (d *dividerWorker) workerRectifyAssignedWorkFunc() {
if err != nil {
d.conf.logger(d.ctx).Error("failed to rectify work", slog.String("err.error", err.Error()), slog.String("divider.id", d.conf.instanceID))
}

d.workerPing(d.ctx)
}

// set master as still attached
func (d *dividerWorker) masterPingFunc() {
start := time.Now()
defer func() {
ObserveDuration(MasterPingTime, d.conf.metricsName, time.Since(start))
}()
func (d *dividerWorker) materUpdate(ctx context.Context) (isMaster bool) {

// d.conf.logger(d.ctx).Debug("masterPingFunc triggered")
masterKey := fmt.Sprintf("%s:%s", d.conf.rootKey, "master")
//set the master key to this value if it does not exist.
set, err := d.client.SetNX(d.ctx, masterKey, d.conf.instanceID, d.conf.masterTimeout).Result()
set, err := d.client.SetNX(ctx, masterKey, d.conf.instanceID, d.conf.masterTimeout).Result()
if err != nil {
d.conf.logger(d.ctx).Panic("Error updating node master", slog.String("err.error", err.Error()), slog.String("divider.id", d.conf.instanceID))
d.conf.logger(ctx).Panic("Error updating node master", err, slog.String("divider.id", d.conf.instanceID))
return
}

if set {
d.conf.logger(d.ctx).Info("Master set to this node", slog.String("divider.id", d.conf.instanceID))
d.conf.logger(ctx).Info("Master set to this node", slog.String("divider.id", d.conf.instanceID))
}

//check the master key.
master, err := d.client.Get(d.ctx, masterKey).Result()
master, err := d.client.Get(ctx, masterKey).Result()
if err != nil {
d.conf.logger(d.ctx).Panic("Error getting current master", slog.String("err.error", err.Error()), slog.String("divider.id", d.conf.instanceID))
d.conf.logger(ctx).Panic("Error getting current master", err, slog.String("divider.id", d.conf.instanceID))
return
}

//if this is the master, run the update to keep the master inline.
if master == d.conf.instanceID {
_, err = d.client.Set(d.ctx, masterKey, d.conf.instanceID, d.conf.masterTimeout).Result()
_, err = d.client.Set(ctx, masterKey, d.conf.instanceID, d.conf.masterTimeout).Result()
if err != nil {
d.conf.logger(d.ctx).Panic("Error updating master timeout", slog.String("err.error", err.Error()), slog.String("divider.id", d.conf.instanceID))
d.conf.logger(ctx).Panic("Error updating master timeout", err, slog.String("divider.id", d.conf.instanceID))
return
}
return true
}
return false
}

// update nodes in storage as still attached.
func (d *dividerWorker) workerPingFunc() {
func (d *dividerWorker) workerPing(ctx context.Context) {
start := time.Now()
defer func() {
ObserveDuration(WorkerPingTime, d.conf.metricsName, time.Since(start))
}()
// d.conf.logger(d.ctx).Debug("workerPingFunc triggered")
//add workers to work holder
err := d.storage.UpdateTimeoutForWorkers(d.ctx, d.getWorkerNodeKeys())
err := d.storage.UpdateTimeoutForWorkers(ctx, d.getWorkerNodeKeys())
if err != nil {
d.conf.logger(d.ctx).Panic("failed to update timeout for workers", slog.String("err.error", err.Error()), slog.String("divider.id", d.conf.instanceID))
d.conf.logger(d.ctx).Panic("failed to update timeout for workers", err, slog.String("divider.id", d.conf.instanceID))
}
}

Expand Down Expand Up @@ -474,19 +454,23 @@ func (d *dividerWorker) rectifyWork(ctx context.Context) (err error) {

// pp.Println(d.conf.instanceID, "toRemove", toRemove.Array())
for key := range toRemove.Iterator() {
err = d.conf.stopper(ctx, key)
ctx2, can := context.WithTimeout(ctx, time.Second*5)
defer can()
err = d.conf.stopper(ctx2, key)
if err != nil {
d.conf.logger(ctx).Error("failed to execute stopper, not removing from known work", slog.String("err.error", err.Error()), slog.String("divider.id", d.conf.instanceID))
d.conf.logger(ctx2).Error("failed to execute stopper, not removing from known work", slog.String("err.error", err.Error()), slog.String("divider.id", d.conf.instanceID))
continue
}
d.knownWork.Remove(key)
}

// pp.Println(d.conf.instanceID, "toAdd", toAdd.Array())
for key := range toAdd.Iterator() {
err = d.conf.starter(ctx, key)
ctx2, can := context.WithTimeout(ctx, time.Second*5)
defer can()
err = d.conf.starter(ctx2, key)
if err != nil {
d.conf.logger(ctx).Error("failed to execute starter, not adding to known work", slog.String("err.error", err.Error()), slog.String("divider.id", d.conf.instanceID))
d.conf.logger(ctx2).Error("failed to execute starter, not adding to known work", slog.String("err.error", err.Error()), slog.String("divider.id", d.conf.instanceID))
continue
}
d.knownWork.Add(key)
Expand Down
2 changes: 0 additions & 2 deletions redisconsistent/types.go
Original file line number Diff line number Diff line change
Expand Up @@ -30,6 +30,4 @@ type dividerWorker struct {

masterUpdateRequiredWork ticker.TickerFunc
workerRectifyAssignedWork ticker.TickerFunc
masterPing ticker.TickerFunc
workerPing ticker.TickerFunc
}
6 changes: 3 additions & 3 deletions util.go
Original file line number Diff line number Diff line change
Expand Up @@ -36,7 +36,7 @@ type Logger interface {
Debug(msg string, args ...slog.Attr)
Info(msg string, args ...slog.Attr)
Error(msg string, args ...slog.Attr)
Panic(msg string, args ...slog.Attr)
Panic(msg string, err error, args ...slog.Attr)
}

func DefaultLogger(ctx context.Context) Logger {
Expand Down Expand Up @@ -71,8 +71,8 @@ func (d *defaultLogger) Error(msg string, args ...slog.Attr) {

l.ErrorContext(d.ctx, msg)
}
func (d *defaultLogger) Panic(msg string, args ...slog.Attr) {
l := slog.Default()
func (d *defaultLogger) Panic(msg string, err error, args ...slog.Attr) {
l := slog.Default().With(slog.String("err.error", err.Error()))
for _, v := range args {
l = l.With(v)
}
Expand Down

0 comments on commit 8217899

Please sign in to comment.