
pkg/causality(cdc): refactor and simplify conflict detector (#10954) (#…
ti-chi-bot authored Aug 2, 2024
1 parent 6cf7f46 commit 4ef1810
Showing 12 changed files with 197 additions and 289 deletions.
45 changes: 5 additions & 40 deletions cdc/sink/dmlsink/txn/event.go
@@ -16,7 +16,6 @@ package txn
import (
"encoding/binary"
"hash/fnv"
"sort"
"strings"
"time"

@@ -41,51 +40,17 @@ func (e *txnEvent) OnConflictResolved() {
e.conflictResolved = time.Now()
}

// GenSortedDedupKeysHash implements causality.txnEvent interface.
func (e *txnEvent) GenSortedDedupKeysHash(numSlots uint64) []uint64 {
hashes := genTxnKeys(e.TxnCallbackableEvent.Event)

// Sort and dedup hashes.
// Sort hashes by `hash % numSlots` to avoid deadlock, and then dedup
// hashes, so the same txn will not check conflicts with the same hash twice to
// prevent potential cyclic self dependency in the causality dependency
// graph.
return sortAndDedupHashes(hashes, numSlots)
}

func sortAndDedupHashes(hashes []uint64, numSlots uint64) []uint64 {
if len(hashes) == 0 {
return nil
}

// Sort hashes by `hash % numSlots` to avoid deadlock.
sort.Slice(hashes, func(i, j int) bool { return hashes[i]%numSlots < hashes[j]%numSlots })

// Dedup hashes
last := hashes[0]
j := 1
for i, hash := range hashes {
if i == 0 {
// skip the first one; start checking for duplicates from the second one
continue
}
if hash == last {
continue
}
last = hash
hashes[j] = hash
j++
}
hashes = hashes[:j]

return hashes
// ConflictKeys implements causality.txnEvent interface.
func (e *txnEvent) ConflictKeys() []uint64 {
return genTxnKeys(e.TxnCallbackableEvent.Event)
}

// genTxnKeys returns hash keys for `txn`.
// genTxnKeys returns deduplicated hash keys of a transaction.
func genTxnKeys(txn *model.SingleTableTxn) []uint64 {
if len(txn.Rows) == 0 {
return nil
}

hashRes := make(map[uint64]struct{}, len(txn.Rows))
hasher := fnv.New32a()
for _, row := range txn.Rows {
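For reference, the removed helper's behavior, dedup plus ordering by slot index, can be sketched in a few lines. genTxnKeys now covers the dedup part via the `hashRes` map above; the function name below is illustrative and not part of the package:

package main

import (
	"fmt"
	"sort"
)

// dedupAndSortBySlot keeps one copy of each key hash and orders the result by
// slot index (hash % numSlots), which is the ordering the old helper produced
// to avoid deadlocks when acquiring slots.
func dedupAndSortBySlot(hashes []uint64, numSlots uint64) []uint64 {
	seen := make(map[uint64]struct{}, len(hashes))
	out := make([]uint64, 0, len(hashes))
	for _, h := range hashes {
		if _, ok := seen[h]; ok {
			continue
		}
		seen[h] = struct{}{}
		out = append(out, h)
	}
	sort.Slice(out, func(i, j int) bool { return out[i]%numSlots < out[j]%numSlots })
	return out
}

func main() {
	fmt.Println(dedupAndSortBySlot([]uint64{4, 9, 9, 3}, 8)) // prints [9 3 4]
}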
27 changes: 0 additions & 27 deletions cdc/sink/dmlsink/txn/event_test.go
@@ -199,30 +199,3 @@ func TestGenKeys(t *testing.T) {
require.Equal(t, tc.expected, keys)
}
}

func TestSortAndDedupHash(t *testing.T) {
// If a transaction contains multiple rows, these rows may generate the same hash
// in some rare cases. We should dedup these hashes to avoid unnecessary self cyclic
// dependency in the causality dependency graph.
t.Parallel()
testCases := []struct {
hashes []uint64
expected []uint64
}{{
// No duplicate hashes
hashes: []uint64{1, 2, 3, 4, 5},
expected: []uint64{1, 2, 3, 4, 5},
}, {
// Duplicate hashes
hashes: []uint64{1, 2, 3, 4, 5, 1, 2, 3, 4, 5},
expected: []uint64{1, 2, 3, 4, 5},
}, {
// Has hash values larger than the slot count; should sort by `hash % numSlots` first.
hashes: []uint64{4, 9, 9, 3},
expected: []uint64{9, 3, 4},
}}

for _, tc := range testCases {
require.Equal(t, tc.expected, sortAndDedupHashes(tc.hashes, 8))
}
}
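Worked arithmetic for the removed third case: the test calls sortAndDedupHashes with numSlots = 8, so 4 % 8 = 4, 9 % 8 = 1, and 3 % 8 = 3. Sorting by slot index gives 9 (slot 1), 3 (slot 3), 4 (slot 4), with the duplicate 9 dropped, which is why the expected result is {9, 3, 4}.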
29 changes: 9 additions & 20 deletions pkg/causality/conflict_detector.go
@@ -34,15 +34,14 @@ type ConflictDetector[Txn txnEvent] struct {

// slots are used to find all unfinished transactions
// conflicting with an incoming transaction.
slots *internal.Slots[*internal.Node]
slots *internal.Slots
numSlots uint64

// nextWorkerID is used to dispatch transactions round-robin.
nextWorkerID atomic.Int64
// nextCacheID is used to dispatch transactions round-robin.
nextCacheID atomic.Int64

// Used to run a background goroutine to GC or notify nodes.
notifiedNodes *chann.DrainableChann[func()]
garbageNodes *chann.DrainableChann[*internal.Node]
wg sync.WaitGroup
closeCh chan struct{}
}
@@ -53,10 +52,9 @@ func NewConflictDetector[Txn txnEvent](
) *ConflictDetector[Txn] {
ret := &ConflictDetector[Txn]{
resolvedTxnCaches: make([]txnCache[Txn], opt.Count),
slots: internal.NewSlots[*internal.Node](numSlots),
slots: internal.NewSlots(numSlots),
numSlots: numSlots,
notifiedNodes: chann.NewAutoDrainChann[func()](),
garbageNodes: chann.NewAutoDrainChann[*internal.Node](),
closeCh: make(chan struct{}),
}
for i := 0; i < opt.Count; i++ {
@@ -75,28 +73,24 @@
// Add pushes a transaction to the ConflictDetector.
//
// NOTE: if multiple threads access this concurrently,
// Txn.GenSortedDedupKeysHash must be sorted by the slot index.
// Txn.ConflictKeys must be sorted by the slot index.
func (d *ConflictDetector[Txn]) Add(txn Txn) {
sortedKeysHash := txn.GenSortedDedupKeysHash(d.numSlots)
node := internal.NewNode(sortedKeysHash)
hashes := txn.ConflictKeys()
node := d.slots.AllocNode(hashes)
txnWithNotifier := TxnWithNotifier[Txn]{
TxnEvent: txn,
PostTxnExecuted: func() {
// After this transaction is executed, we can remove the node from the graph,
// and resolve related dependencies for the transactions that depend on this
// executed transaction.
node.Remove()

// Send this node to garbageNodes to GC it from the slots if this node still
// occupies related slots.
d.garbageNodes.In() <- node
d.slots.Remove(node)
},
}
node.TrySendToTxnCache = func(cacheID int64) bool {
// Try sending this txn to the related cache as soon as all dependencies are resolved.
return d.sendToCache(txnWithNotifier, cacheID)
}
node.RandCacheID = func() int64 { return d.nextWorkerID.Add(1) % int64(len(d.resolvedTxnCaches)) }
node.RandCacheID = func() int64 { return d.nextCacheID.Add(1) % int64(len(d.resolvedTxnCaches)) }
node.OnNotified = func(callback func()) { d.notifiedNodes.In() <- callback }
d.slots.Add(node)
}
@@ -110,7 +104,6 @@ func (d *ConflictDetector[Txn]) Close() {
func (d *ConflictDetector[Txn]) runBackgroundTasks() {
defer func() {
d.notifiedNodes.CloseAndDrain()
d.garbageNodes.CloseAndDrain()
}()
for {
select {
Expand All @@ -120,10 +113,6 @@ func (d *ConflictDetector[Txn]) runBackgroundTasks() {
if notifyCallback != nil {
notifyCallback()
}
case node := <-d.garbageNodes.Out():
if node != nil {
d.slots.Free(node)
}
}
}
}
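When a transaction can go to any cache, the detector picks one round-robin by bumping an atomic counter and taking it modulo the cache count (RandCacheID above). A self-contained sketch of that dispatch pattern, using illustrative names rather than the package's types:

package main

import (
	"fmt"
	"sync/atomic"
)

// roundRobin mimics the detector's cache selection: an atomic counter taken
// modulo the number of caches, safe to call from many goroutines at once.
type roundRobin struct {
	next   atomic.Int64
	caches int64
}

func (r *roundRobin) pick() int64 {
	return r.next.Add(1) % r.caches
}

func main() {
	r := &roundRobin{caches: 4}
	for i := 0; i < 8; i++ {
		fmt.Print(r.pick(), " ") // 1 2 3 0 1 2 3 0
	}
	fmt.Println()
}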
24 changes: 24 additions & 0 deletions pkg/causality/internal/main_test.go
@@ -0,0 +1,24 @@
// Copyright 2024 PingCAP, Inc.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// See the License for the specific language governing permissions and
// limitations under the License.

package internal

import (
"testing"

"github.com/pingcap/tiflow/pkg/leakutil"
)

func TestMain(m *testing.M) {
leakutil.SetUpLeakTest(m)
}
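leakutil.SetUpLeakTest appears to wrap uber-go/goleak; that is an assumption here, not something stated in the diff. A minimal goleak-only equivalent of this TestMain would look like:

package internal

import (
	"testing"

	"go.uber.org/goleak"
)

// TestMain fails the package's tests if any goroutine is still running
// after all tests have finished, which catches leaked background workers.
func TestMain(m *testing.M) {
	// Assumes the leakutil helper delegates to goleak; options such as
	// goleak.IgnoreTopFunction can be added for known long-lived goroutines.
	goleak.VerifyTestMain(m)
}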
83 changes: 6 additions & 77 deletions pkg/causality/internal/node.go
@@ -29,8 +29,6 @@ type (
const (
unassigned = cacheID(-2)
assignedToAny = cacheID(-1)

invalidNodeID = int64(-1)
)

var (
@@ -60,8 +58,8 @@ type Node struct {

// Following fields are used for notifying a node's dependers lock-free.
totalDependencies int32
resolvedDependencies int32
removedDependencies int32
resolvedDependencies int32
resolvedList []int64

// Following fields are protected by `mu`.
@@ -84,52 +82,14 @@
dependers *btree.BTreeG[*Node]
}

// NewNode creates a new node.
func NewNode(sortedDedupKeysHash []uint64) (ret *Node) {
defer func() {
ret.id = genNextNodeID()
ret.sortedDedupKeysHash = sortedDedupKeysHash
ret.TrySendToTxnCache = nil
ret.RandCacheID = nil
ret.totalDependencies = 0
ret.resolvedDependencies = 0
ret.removedDependencies = 0
ret.resolvedList = nil
ret.assignedTo = unassigned
ret.removed = false
}()

ret = new(Node)
return
}

// NodeID implements interface internal.SlotNode.
func (n *Node) NodeID() int64 {
func (n *Node) nodeID() int64 {
return n.id
}

// Hashes implements interface internal.SlotNode.
func (n *Node) Hashes() []uint64 {
return n.sortedDedupKeysHash
}

// DependOn implements interface internal.SlotNode.
func (n *Node) DependOn(dependencyNodes map[int64]*Node, noDependencyKeyCnt int) {
func (n *Node) dependOn(dependencyNodes map[int64]*Node) {
resolvedDependencies := int32(0)

depend := func(target *Node) {
if target == nil {
// For a given Node, every dependency corresponds to a target.
// If target is nil it means the dependency doesn't conflict
// with any other nodes. However it's still necessary to track
// it because Node.tryResolve needs to count the number of
// resolved dependencies.
resolvedDependencies = atomic.AddInt32(&n.resolvedDependencies, 1)
atomic.StoreInt64(&n.resolvedList[resolvedDependencies-1], assignedToAny)
atomic.AddInt32(&n.removedDependencies, 1)
return
}

if target.id == n.id {
log.Panic("node cannot depend on itself")
}
@@ -157,13 +117,8 @@ func (n *Node) DependOn(dependencyNodes map[int64]*Node, noDependencyKeyCnt int)
}
}

// Re-allocate ID in `DependOn` instead of creating the node, because the node can be
// pending in slots after it's created.
// ?: why gen new ID here?
n.id = genNextNodeID()

// `totalDependencies` and `resolvedList` must be initialized before depending on any targets.
n.totalDependencies = int32(len(dependencyNodes) + noDependencyKeyCnt)
n.totalDependencies = int32(len(dependencyNodes))
n.resolvedList = make([]int64, 0, n.totalDependencies)
for i := 0; i < int(n.totalDependencies); i++ {
n.resolvedList = append(n.resolvedList, unassigned)
@@ -172,15 +127,11 @@
for _, node := range dependencyNodes {
depend(node)
}
for i := 0; i < noDependencyKeyCnt; i++ {
depend(nil)
}

n.maybeResolve()
}

// Remove implements interface internal.SlotNode.
func (n *Node) Remove() {
func (n *Node) remove() {
n.mu.Lock()
defer n.mu.Unlock()

@@ -197,25 +148,6 @@
}
}

// Free implements interface internal.SlotNode.
// It must be called if a node is no longer used.
// We are using sync.Pool to lessen the burden of GC.
func (n *Node) Free() {
n.mu.Lock()
defer n.mu.Unlock()
if n.id == invalidNodeID {
log.Panic("double free")
}

n.id = invalidNodeID
n.TrySendToTxnCache = nil

// TODO: reuse node if necessary. Currently it's impossible if async-notify is used.
// The reason is a node can step through functions `assignTo`, `Remove`, `Free`, then `assignTo`
// again. In the last `assignTo`, it can never know whether the node has been reused
// or not.
}

// tryAssignTo assigns a node to a cache. Returns `true` on success.
func (n *Node) tryAssignTo(cacheID int64) bool {
n.mu.Lock()
@@ -254,7 +186,7 @@ func (n *Node) maybeResolve() {
log.Panic("invalid cache ID", zap.Uint64("cacheID", uint64(cacheID)))
}

if cacheID >= 0 {
if cacheID != assignedToAny {
n.tryAssignTo(cacheID)
return
}
@@ -289,9 +221,6 @@ func (n *Node) tryResolve() (int64, bool) {
hasDiffDep := false
for i := 1; i < int(n.totalDependencies); i++ {
curr := atomic.LoadInt64(&n.resolvedList[i])
// Todo: simplify the assign-to logic; only resolve dependency nodes after
// the corresponding transactions are executed.
//
// In DependOn, depend(nil) sets resolvedList[i] to assignedToAny
// for these no-dependency keys.
if curr == assignedToAny {
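Much of node.go rests on counting dependencies with atomics rather than a shared lock. A stripped-down sketch of that counting pattern, in which whichever goroutine sees the counter reach the total triggers the resolution exactly once (illustrative names, not the package's types):

package main

import (
	"fmt"
	"sync"
	"sync/atomic"
)

// waiter mimics the node's lock-free counting: every finished dependency bumps
// an atomic counter, and the goroutine that sees the counter hit the total
// runs the resolution step. AddInt32 returns a distinct value to each caller,
// so exactly one of them triggers resolve.
type waiter struct {
	total    int32
	finished int32
	resolve  func()
}

func (w *waiter) dependencyDone() {
	if atomic.AddInt32(&w.finished, 1) == w.total {
		w.resolve()
	}
}

func main() {
	var wg sync.WaitGroup
	w := &waiter{total: 3, resolve: func() { fmt.Println("all dependencies resolved") }}
	for i := 0; i < 3; i++ {
		wg.Add(1)
		go func() {
			defer wg.Done()
			w.dependencyDone()
		}()
	}
	wg.Wait()
}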
(Diffs for the remaining 7 changed files are not shown here.)
