diff --git a/velox/common/memory/AllocationPool.h b/velox/common/memory/AllocationPool.h
index ff3fee4442f0..1449be6b9af9 100644
--- a/velox/common/memory/AllocationPool.h
+++ b/velox/common/memory/AllocationPool.h
@@ -18,13 +18,12 @@
 #include "velox/common/memory/Memory.h"
 
 namespace facebook::velox::memory {
-// A set of Allocations holding the fixed width payload
-// rows. The Runs are filled to the end except for the last one. This
-// is used for iterating over the payload for rehashing, returning
-// results etc. This is used via HashStringAllocator for variable length
-// allocation for backing ByteStreams for complex objects. In that case, there
-// is a current run that is appended to and when this is exhausted a new run is
-// started.
+/// A set of Allocations holding the fixed width payload rows. The Runs are
+/// filled to the end except for the last one. This is used for iterating over
+/// the payload for rehashing, returning results etc. This is used via
+/// HashStringAllocator for variable length allocation for backing ByteStreams
+/// for complex objects. In that case, there is a current run that is appended
+/// to and when this is exhausted a new run is started.
 class AllocationPool {
  public:
   static constexpr int32_t kMinPages = 16;
diff --git a/velox/common/memory/HashStringAllocator.cpp b/velox/common/memory/HashStringAllocator.cpp
index 1a938a0dd571..57987fce5e0f 100644
--- a/velox/common/memory/HashStringAllocator.cpp
+++ b/velox/common/memory/HashStringAllocator.cpp
@@ -386,7 +386,11 @@ int32_t HashStringAllocator::freeListIndex(int size) {
 }
 
 void HashStringAllocator::removeFromFreeList(Header* header) {
-  VELOX_CHECK(header->isFree());
+  if (!header->isFree()) {
+    LOG(ERROR) << "Removing non-free header " << header << " from free list";
+    VELOX_FAIL("Removing non-free header from free list");
+  }
   header->clearFree();
   const auto index = freeListIndex(header->size());
   reinterpret_cast<CompactDoubleList*>(header->begin())->remove();
@@ -448,6 +452,9 @@ HashStringAllocator::Header* HashStringAllocator::allocateFromFreeList(
     return nullptr;
   }
   auto* found = headerOf(item);
+  if (!(found->isFree() && (!mustHaveSize || found->size() >= preferredSize))) {
+    LOG(ERROR) << "Bad free list entry " << found << ": free "
+               << found->isFree() << ", size " << found->size()
+               << ", preferredSize " << preferredSize << ", mustHaveSize "
+               << mustHaveSize;
+  }
   VELOX_CHECK(
       found->isFree() && (!mustHaveSize || found->size() >= preferredSize));
   --state_.numFree();
diff --git a/velox/common/memory/HashStringAllocator.h b/velox/common/memory/HashStringAllocator.h
index 4604b7565c4f..3d0bea802414 100644
--- a/velox/common/memory/HashStringAllocator.h
+++ b/velox/common/memory/HashStringAllocator.h
@@ -186,7 +186,8 @@
   // header is below the first byte of the StringView's data. StringViews
   // written by this are to be read with contiguousString(). This is nearly
   // always zero copy but will accommodate the odd extra large string.
-  void copyMultipart(const StringView& str, char* group, int32_t offset) {
+  virtual void
+  copyMultipart(const StringView& str, char* group, int32_t offset) {
     if (str.isInline()) {
       *reinterpret_cast<StringView*>(group + offset) = str;
       return;
@@ -201,7 +202,7 @@
   /// Allocates 'size' contiguous bytes preceded by a Header. Returns the
   /// address of Header.
-  Header* allocate(int32_t size) {
+  virtual Header* allocate(int32_t size) {
     VELOX_CHECK_NULL(
         state_.currentHeader(),
         "Do not call allocate() when a write is in progress");
@@ -211,11 +212,11 @@
   /// Allocates a block that is independently freeable but is freed on
   /// destruction of 'this'. The block has no header and must be freed by
   /// freeToPool() if to be freed before destruction of 'this'.
-  void* allocateFromPool(size_t size);
+  virtual void* allocateFromPool(size_t size);
 
   /// Frees a block allocated with allocateFromPool(). The pointer and size must
   /// match.
-  void freeToPool(void* ptr, size_t size);
+  virtual void freeToPool(void* ptr, size_t size);
 
   /// Returns the header immediately below 'data'.
   static Header* headerOf(const void* data) {
@@ -228,11 +229,6 @@
         const_cast<char*>(reinterpret_cast<const char*>(data)));
   }
 
-  /// Returns the byte size of block pointed by 'header'.
-  inline size_t blockBytes(const Header* header) const {
-    return header->size() + kHeaderSize;
-  }
-
   /// Returns ByteInputStream over the data in the range of 'header' and
   /// possible continuation ranges.
   /// @param maxBytes If provided, the returned stream will cover at most that
@@ -256,27 +252,27 @@
   /// without allocating more space. 'position' can be changed but will
   /// logically point at the same data. Data to the right of 'position is not
   /// preserved.
-  void ensureAvailable(int32_t bytes, Position& position);
+  virtual void ensureAvailable(int32_t bytes, Position& position);
 
   /// Sets stream to write to this pool. The write can span multiple
   /// non-contiguous runs. Each contiguous run will have at least kMinContiguous
   /// bytes of contiguous space. finishWrite finalizes the allocation
   /// information after the write is done. Returns the position at the start of
   /// the allocated block.
-  Position newWrite(
+  virtual Position newWrite(
       ByteOutputStream& stream,
       int32_t preferredSize = kMinContiguous);
 
   // Sets 'stream' to write starting at 'position'. If new ranges have to
   // be allocated when writing, headers will be updated accordingly.
-  void extendWrite(Position position, ByteOutputStream& stream);
+  virtual void extendWrite(Position position, ByteOutputStream& stream);
 
   /// Completes a write prepared with newWrite or extendWrite. Up to
   /// 'numReserveBytes' unused bytes, if available, are left after the end of
   /// the write to accommodate another write. Returns a pair of positions: (1)
   /// position at the start of this 'write', (2) position immediately after the
   /// last written byte.
-  std::pair<Position, Position> finishWrite(
+  virtual std::pair<Position, Position> finishWrite(
      ByteOutputStream& stream,
      int32_t numReserveBytes);
 
@@ -291,30 +287,26 @@
   /// ranges will not overwrite the next pointer.
   ///
   /// May allocate less than 'bytes'.
-  void newRange(int32_t bytes, ByteRange* lastRange, ByteRange* range) override;
+  virtual void newRange(int32_t bytes, ByteRange* lastRange, ByteRange* range)
+      override;
 
   /// Allocates a new range of at least 'bytes' size.
-  void newContiguousRange(int32_t bytes, ByteRange* range);
-
-  void newTinyRange(int32_t bytes, ByteRange* lastRange, ByteRange* range)
-      override {
-    newRange(bytes, lastRange, range);
-  }
+  virtual void newContiguousRange(int32_t bytes, ByteRange* range);
 
   /// Returns the total memory footprint of 'this'.
-  int64_t retainedSize() const {
+  virtual int64_t retainedSize() const {
     return state_.pool().allocatedBytes() + state_.sizeFromPool();
   }
 
   /// Adds the allocation of 'header' and any extensions (if header has
   /// kContinued set) to the free list.
-  void free(Header* header);
+  virtual void free(Header* header);
 
   /// Returns a lower bound on bytes available without growing 'this'. This is
   /// the sum of free block sizes minus size of pointer for each. We subtract
   /// the pointer because in the worst case we would have one allocation that
   /// chains many small free blocks together via kContinued.
-  uint64_t freeSpace() const {
+  virtual uint64_t freeSpace() const {
     const int64_t minFree = state_.freeBytes() -
         state_.numFree() * (kHeaderSize + Header::kContinuedPtrSize);
     VELOX_CHECK_GE(minFree, 0, "Guaranteed free space cannot be negative");
@@ -328,26 +320,26 @@
     return state_.pool().pool();
   }
 
-  uint64_t currentBytes() const {
+  virtual uint64_t currentBytes() const {
     return state_.currentBytes();
   }
 
   /// Checks the free space accounting and consistency of Headers. Throws when
   /// detects corruption. Returns the number of allocated payload bytes,
   /// excluding headers, continue links and other overhead.
-  int64_t checkConsistency() const;
+  virtual int64_t checkConsistency() const;
 
   /// Returns 'true' if this is empty. The implementation includes a call to
   /// checkConsistency() which makes it slow. Do not use in hot paths.
-  bool isEmpty() const;
+  virtual bool isEmpty() const;
 
   /// Throws if 'this' is not empty. Checks consistency of
   /// 'this'. This is a fast check for RowContainer users freeing the
   /// variable length data they store. Can be used in non-debug
   /// builds.
-  void checkEmpty() const;
+  virtual void checkEmpty() const;
 
-  std::string toString() const;
+  virtual std::string toString() const;
 
   /// Effectively makes this immutable while executing f, any attempt to access
   /// state_ in a mutable way while f is executing will cause an exception to be
@@ -363,12 +355,18 @@
     f();
   }
 
- private:
+ protected:
   static constexpr int32_t kUnitSize = 16 * memory::AllocationTraits::kPageSize;
   static constexpr int32_t kMinContiguous = 48;
   static constexpr int32_t kNumFreeLists = kMaxAlloc - kMinAlloc + 2;
   static constexpr uint32_t kHeaderSize = sizeof(Header);
 
+ private:
+  // Returns the byte size of block pointed by 'header'.
+  inline size_t blockBytes(const Header* header) const {
+    return header->size() + kHeaderSize;
+  }
+
   void newRange(
       int32_t bytes,
       ByteRange* lastRange,
@@ -603,6 +601,126 @@ struct StlAllocator {
   HashStringAllocator* allocator_;
 };
 
+class ThreadSafeHashStringAllocator : public HashStringAllocator {
+ public:
+  explicit ThreadSafeHashStringAllocator(HashStringAllocator* allocator)
+      : HashStringAllocator(allocator->pool()), allocator_(allocator) {
+    VELOX_CHECK_NOT_NULL(allocator_);
+  }
+
+  void copyMultipart(const StringView& str, char* group, int32_t offset)
+      override {
+    std::lock_guard<std::mutex> l(mu_);
+    allocator_->copyMultipart(str, group, offset);
+  }
+
+  Header* allocate(int32_t size) override {
+    std::lock_guard<std::mutex> l(mu_);
+    return allocator_->allocate(size);
+  }
+
+  void* allocateFromPool(size_t size) override {
+    std::lock_guard<std::mutex> l(mu_);
+    return allocator_->allocateFromPool(size);
+  }
+
+  void freeToPool(void* ptr, size_t size) override {
+    std::lock_guard<std::mutex> l(mu_);
+    allocator_->freeToPool(ptr, size);
+  }
+
+  void ensureAvailable(int32_t bytes, Position& position) override {
+    std::lock_guard<std::mutex> l(mu_);
+    allocator_->ensureAvailable(bytes, position);
+  }
+
+  Position newWrite(
+      ByteOutputStream& stream,
+      int32_t preferredSize = kMinContiguous) override {
+    std::lock_guard<std::mutex> l(mu_);
+    return allocator_->newWrite(stream, preferredSize);
+  }
+
+  void extendWrite(Position position, ByteOutputStream& stream) override {
+    std::lock_guard<std::mutex> l(mu_);
+    allocator_->extendWrite(position, stream);
+  }
+
+  std::pair<Position, Position> finishWrite(
+      ByteOutputStream& stream,
+      int32_t numReserveBytes) override {
+    std::lock_guard<std::mutex> l(mu_);
+    return allocator_->finishWrite(stream, numReserveBytes);
+  }
+
+  void newRange(int32_t bytes, ByteRange* lastRange, ByteRange* range)
+      override {
+    std::lock_guard<std::mutex> l(mu_);
+    allocator_->newRange(bytes, lastRange, range);
+  }
+
+  void newContiguousRange(int32_t bytes, ByteRange* range) override {
+    std::lock_guard<std::mutex> l(mu_);
+    allocator_->newContiguousRange(bytes, range);
+  }
+
+  void free(Header* header) override {
+    std::lock_guard<std::mutex> l(mu_);
+    allocator_->free(header);
+  }
+
+  int64_t retainedSize() const override {
+    std::lock_guard<std::mutex> l(mu_);
+    return allocator_->retainedSize();
+  }
+
+  uint64_t freeSpace() const override {
+    std::lock_guard<std::mutex> l(mu_);
+    return allocator_->freeSpace();
+  }
+
+  void clear() override {
+    std::lock_guard<std::mutex> l(mu_);
+    allocator_->clear();
+  }
+
+  uint64_t currentBytes() const override {
+    std::lock_guard<std::mutex> l(mu_);
+    return allocator_->currentBytes();
+  }
+
+  int64_t checkConsistency() const override {
+    std::lock_guard<std::mutex> l(mu_);
+    return allocator_->checkConsistency();
+  }
+
+  void checkEmpty() const override {
+    std::lock_guard<std::mutex> l(mu_);
+    allocator_->checkEmpty();
+  }
+
+  std::string toString() const override {
+    std::lock_guard<std::mutex> l(mu_);
+    return allocator_->toString();
+  }
+
+ private:
+  HashStringAllocator* const allocator_;
+  mutable std::mutex mu_;
+};
+
 /// An allocator backed by HashStringAllocator that guaratees a configurable
 /// alignment. The alignment must be a power of 2 and not be 0. This allocator
 /// can be used with folly F14 containers that requires 16-bytes alignment.
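Note (not part of the patch): ThreadSafeHashStringAllocator serializes every virtual entry point of HashStringAllocator behind a single mutex and forwards to the wrapped allocator; it keeps no allocation state of its own. A minimal usage sketch, assuming the patch above is applied and that the class lives in namespace facebook::velox alongside HashStringAllocator; the function name and the thread/iteration counts are illustrative, not taken from this diff:

```cpp
#include <thread>
#include <vector>

#include "velox/common/memory/HashStringAllocator.h"

using namespace facebook::velox;

// Wraps an existing single-threaded HashStringAllocator and exercises it from
// several threads. Each allocate()/free() call below takes the wrapper's
// mutex before delegating to 'base', so no external locking is needed.
void allocateConcurrently(HashStringAllocator* base) {
  ThreadSafeHashStringAllocator safe(base);
  std::vector<std::thread> threads;
  for (int t = 0; t < 4; ++t) {
    threads.emplace_back([&safe]() {
      for (int i = 0; i < 1000; ++i) {
        auto* header = safe.allocate(64);
        safe.free(header);
      }
    });
  }
  for (auto& thread : threads) {
    thread.join();
  }
}
```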
diff --git a/velox/common/memory/MemoryPool.cpp b/velox/common/memory/MemoryPool.cpp
index 1687ec867b19..9cedf012e940 100644
--- a/velox/common/memory/MemoryPool.cpp
+++ b/velox/common/memory/MemoryPool.cpp
@@ -456,7 +456,7 @@ MemoryPoolImpl::~MemoryPoolImpl() {
           kMetricMemoryPoolReservationLeakBytes, minReservationBytes_);
     }
   }
-  VELOX_DCHECK(
+  VELOX_CHECK(
      (usedReservationBytes_ == 0) && (reservationBytes_ == 0) &&
          (minReservationBytes_ == 0),
      "Bad memory usage track state: {}",
diff --git a/velox/exec/HashTable.cpp b/velox/exec/HashTable.cpp
index 3a15da01614e..f92115bd4bc1 100644
--- a/velox/exec/HashTable.cpp
+++ b/velox/exec/HashTable.cpp
@@ -954,11 +954,18 @@ void HashTable<ignoreNullKeys>::parallelJoinBuild() {
   }
 
   // The parallel table building step.
+  std::unique_ptr<HashStringAllocator> threadSafeAllocator(
+      new ThreadSafeHashStringAllocator(&rows_->stringAllocator()));
   std::vector<std::vector<char*>> overflowPerPartition(numPartitions);
   for (auto i = 0; i < numPartitions; ++i) {
     buildSteps.push_back(std::make_shared<AsyncSource<bool>>(
-        [this, i, &overflowPerPartition, &rowPartitions]() {
-          buildJoinPartition(i, rowPartitions, overflowPerPartition[i]);
+        [this,
+         i,
+         &overflowPerPartition,
+         &rowPartitions,
+         allocator = threadSafeAllocator.get()]() {
+          buildJoinPartition(
+              i, rowPartitions, overflowPerPartition[i], allocator);
           return std::make_unique<bool>(true);
         }));
     VELOX_CHECK(!buildSteps.empty());
@@ -984,7 +991,8 @@
       overflows.data(),
       hashes.data(),
       overflows.size(),
-      nullptr);
+      nullptr,
+      &rows_->stringAllocator());
   VELOX_CHECK_EQ(table->rows()->numRows(), table->numParallelBuildRows_);
 }
@@ -1043,7 +1051,8 @@
 template <bool ignoreNullKeys>
 void HashTable<ignoreNullKeys>::buildJoinPartition(
     uint8_t partition,
     const std::vector<std::unique_ptr<RowPartitions>>& rowPartitions,
-    std::vector<char*>& overflow) {
+    std::vector<char*>& overflow,
+    HashStringAllocator* allocator) {
   constexpr int32_t kBatch = 1024;
   raw_vector<char*> rows(kBatch);
   raw_vector<uint64_t> hashes(kBatch);
@@ -1052,8 +1061,6 @@ void HashTable<ignoreNullKeys>::buildJoinPartition(
       buildPartitionBounds_[partition],
       buildPartitionBounds_[partition + 1],
       overflow};
-  auto* rowContainer =
-      (partition == 0 ? this : otherTables_[partition - 1].get())->rows();
   for (auto i = 0; i < numPartitions; ++i) {
     auto* table = i == 0 ? this : otherTables_[i - 1].get();
     RowContainerIterator iter;
@@ -1061,7 +1068,12 @@
         iter, partition, kBatch, *rowPartitions[i], rows.data())) {
       hashRows(folly::Range<char**>(rows.data(), numRows), false, hashes);
       insertForJoin(
-          rowContainer, rows.data(), hashes.data(), numRows, &partitionInfo);
+          table->rows_.get(),
+          rows.data(),
+          hashes.data(),
+          numRows,
+          &partitionInfo,
+          allocator);
       table->numParallelBuildRows_ += numRows;
     }
   }
@@ -1077,7 +1089,13 @@ bool HashTable<ignoreNullKeys>::insertBatch(
     return false;
   }
   if (isJoinBuild_) {
-    insertForJoin(rows(), groups, hashes.data(), numGroups);
+    insertForJoin(
+        rows(),
+        groups,
+        hashes.data(),
+        numGroups,
+        nullptr,
+        &rows_->stringAllocator());
   } else {
     insertForGroupBy(groups, hashes.data(), numGroups);
   }
@@ -1137,12 +1155,16 @@ void HashTable<ignoreNullKeys>::insertForGroupBy(
 }
 
 template <bool ignoreNullKeys>
-bool HashTable<ignoreNullKeys>::arrayPushRow(char* row, int32_t index) {
+bool HashTable<ignoreNullKeys>::arrayPushRow(
+    RowContainer* rows,
+    char* row,
+    int32_t index,
+    HashStringAllocator* allocator) {
   auto* existingRow = table_[index];
   if (existingRow != nullptr) {
     if (nextOffset_ > 0) {
       hasDuplicates_ = true;
-      rows_->appendNextRow(existingRow, row);
+      rows->appendNextRow(existingRow, row, allocator);
     }
     return false;
   }
@@ -1154,10 +1176,11 @@
 template <bool ignoreNullKeys>
 void HashTable<ignoreNullKeys>::pushNext(
     RowContainer* rows,
     char* row,
-    char* next) {
+    char* next,
+    HashStringAllocator* allocator) {
   VELOX_CHECK_GT(nextOffset_, 0);
   hasDuplicates_ = true;
-  rows->appendNextRow(row, next);
+  rows->appendNextRow(row, next, allocator);
 }
@@ -1168,7 +1191,8 @@ FOLLY_ALWAYS_INLINE void HashTable<ignoreNullKeys>::buildFullProbe(
     uint64_t hash,
     char* inserted,
     bool extraCheck,
-    TableInsertPartitionInfo* partitionInfo) {
+    TableInsertPartitionInfo* partitionInfo,
+    HashStringAllocator* allocator) {
   constexpr int32_t kKeyOffset =
       -static_cast<int32_t>(sizeof(normalized_key_t));
   auto insertFn = [&](int32_t /*row*/, PartitionBoundIndexType index) {
@@ -1187,7 +1211,7 @@
       if (RowContainer::normalizedKey(group) ==
           RowContainer::normalizedKey(inserted)) {
         if (nextOffset_ > 0) {
-          pushNext(rows, group, inserted);
+          pushNext(rows, group, inserted, allocator);
         }
         return true;
       }
@@ -1204,7 +1228,7 @@
       [&](char* group, int32_t /*row*/) {
         if (compareKeys(group, inserted)) {
           if (nextOffset_ > 0) {
-            pushNext(rows, group, inserted);
+            pushNext(rows, group, inserted, allocator);
           }
           return true;
         }
@@ -1224,7 +1248,8 @@ FOLLY_ALWAYS_INLINE void HashTable<ignoreNullKeys>::insertForJoinWithPrefetch(
     char** groups,
     uint64_t* hashes,
     int32_t numGroups,
-    TableInsertPartitionInfo* partitionInfo) {
+    TableInsertPartitionInfo* partitionInfo,
+    HashStringAllocator* allocator) {
   auto i = 0;
   ProbeState states[kPrefetchSize];
   constexpr int32_t kKeyOffset =
@@ -1244,14 +1269,20 @@
     for (int32_t j = 0; j < kPrefetchSize; ++j) {
       auto index = i + j;
       buildFullProbe<isNormalizedKeyMode>(
-          rows, states[j], hashes[index], groups[index], j != 0, partitionInfo);
+          rows,
+          states[j],
+          hashes[index],
+          groups[index],
+          j != 0,
+          partitionInfo,
+          allocator);
     }
   }
   for (; i < numGroups; ++i) {
     states[0].preProbe(*this, hashes[i], i);
     states[0].firstProbe(*this, keyOffset);
     buildFullProbe<isNormalizedKeyMode>(
-        rows, states[0], hashes[i], groups[i], false, partitionInfo);
+        rows, states[0], hashes[i], groups[i], false, partitionInfo, allocator);
   }
 }
@@ -1261,23 +1292,24 @@ void HashTable<ignoreNullKeys>::insertForJoin(
     char** groups,
     uint64_t* hashes,
     int32_t numGroups,
-    TableInsertPartitionInfo* partitionInfo) {
+    TableInsertPartitionInfo* partitionInfo,
+    HashStringAllocator* allocator) {
   // The insertable rows are in the table, all get put in the hash table or
   // array.
   if (hashMode_ == HashMode::kArray) {
     for (auto i = 0; i < numGroups; ++i) {
       auto index = hashes[i];
       VELOX_CHECK_LT(index, capacity_);
-      arrayPushRow(groups[i], index);
+      arrayPushRow(rows, groups[i], index, allocator);
     }
     return;
   }
   if (hashMode_ == HashMode::kNormalizedKey) {
     insertForJoinWithPrefetch<true>(
-        rows, groups, hashes, numGroups, partitionInfo);
+        rows, groups, hashes, numGroups, partitionInfo, allocator);
   } else {
     insertForJoinWithPrefetch<false>(
-        rows, groups, hashes, numGroups, partitionInfo);
+        rows, groups, hashes, numGroups, partitionInfo, allocator);
   }
 }
diff --git a/velox/exec/HashTable.h b/velox/exec/HashTable.h
index 4726960a2b6f..0b9a82483675 100644
--- a/velox/exec/HashTable.h
+++ b/velox/exec/HashTable.h
@@ -824,7 +824,8 @@
       char** groups,
       uint64_t* hashes,
       int32_t numGroups,
-      TableInsertPartitionInfo* = nullptr);
+      TableInsertPartitionInfo* partitionInfo,
+      HashStringAllocator* allocator);
 
   // Inserts 'numGroups' entries into 'this'. 'groups' point to
   // contents in a RowContainer owned by 'this'. 'hashes' are the hash
@@ -857,7 +858,8 @@
   void buildJoinPartition(
       uint8_t partition,
       const std::vector<std::unique_ptr<RowPartitions>>& rowPartitions,
-      std::vector<char*>& overflow);
+      std::vector<char*>& overflow,
+      HashStringAllocator* allocator);
 
   // Assigns a partition to each row of 'subtable' in RowPartitions of
   // subtable's RowContainer. If 'hashMode_' is kNormalizedKeys, records the
@@ -906,13 +908,17 @@
   // Adds a row to a hash join table in kArray hash mode. Returns true
   // if a new entry was made and false if the row was added to an
   // existing set of rows with the same key.
-  bool arrayPushRow(char* row, int32_t index);
+  bool arrayPushRow(
+      RowContainer* rows,
+      char* row,
+      int32_t index,
+      HashStringAllocator* allocator);
 
   // Adds a row to a hash join build side entry with multiple rows
   // with the same key.
   // 'rows' should be the same as the one in hash table except for
   // 'parallelJoinBuild'.
-  void pushNext(RowContainer* rows, char* row, char* next);
+  void pushNext(
+      RowContainer* rows,
+      char* row,
+      char* next,
+      HashStringAllocator* allocator);
 
   // Finishes inserting an entry into a join hash table. If 'partitionInfo' is
   // not null and the insert falls out-side of the partition range, then insert
@@ -924,7 +930,8 @@
       uint64_t hash,
       char* row,
      bool extraCheck,
-      TableInsertPartitionInfo* partitionInfo);
+      TableInsertPartitionInfo* partitionInfo,
+      HashStringAllocator* allocator);
 
   template <bool isNormalizedKeyMode>
   void insertForJoinWithPrefetch(
       RowContainer* rows,
       char** groups,
       uint64_t* hashes,
       int32_t numGroups,
-      TableInsertPartitionInfo* partitionInfo);
+      TableInsertPartitionInfo* partitionInfo,
+      HashStringAllocator* allocator);
 
   // Updates 'hashers_' to correspond to the keys in the
   // content. Returns true if all hashers offer a mapping to value ids
diff --git a/velox/exec/RowContainer.cpp b/velox/exec/RowContainer.cpp
index cb7e438cba07..4cb18c09a92c 100644
--- a/velox/exec/RowContainer.cpp
+++ b/velox/exec/RowContainer.cpp
@@ -144,12 +144,12 @@ RowContainer::RowContainer(
     : keyTypes_(keyTypes),
       nullableKeys_(nullableKeys),
       isJoinBuild_(isJoinBuild),
-      accumulators_(accumulators),
       hasNormalizedKeys_(hasNormalizedKeys),
-      rows_(pool),
       stringAllocator_(
           stringAllocator ? stringAllocator
-                          : std::make_shared<HashStringAllocator>(pool)) {
+                          : std::make_shared<HashStringAllocator>(pool)),
+      accumulators_(accumulators),
+      rows_(pool) {
   // Compute the layout of the payload row. The row has keys, null flags,
   // accumulators, dependent fields. All fields are fixed width. If variable
   // width data is referenced, this is done with StringView(for VARCHAR) and
@@ -392,13 +392,16 @@ int32_t RowContainer::findRows(folly::Range<char*> rows, char** result) {
   return numRows;
 }
 
-void RowContainer::appendNextRow(char* current, char* nextRow) {
+void RowContainer::appendNextRow(
+    char* current,
+    char* nextRow,
+    HashStringAllocator* allocator) {
   VELOX_CHECK(getNextRowVector(nextRow) == nullptr);
   NextRowVector*& nextRowArrayPtr = getNextRowVector(current);
   if (nextRowArrayPtr == nullptr) {
-    nextRowArrayPtr =
-        new (stringAllocator_->allocate(kNextRowVectorSize)->begin())
-            NextRowVector(StlAllocator<char*>(stringAllocator_.get()));
+    nextRowArrayPtr = new (allocator->allocate(kNextRowVectorSize)->begin())
+        NextRowVector(StlAllocator<char*>(allocator));
     hasDuplicateRows_ = true;
     nextRowArrayPtr->emplace_back(current);
   }
@@ -459,6 +462,8 @@ void RowContainer::freeAggregates(folly::Range<char**> rows) {
 }
 
 void RowContainer::freeNextRowVectors(folly::Range<char**> rows, bool clear) {
+  // Fast check if the entire row container has duplicate rows or not.
   if (!nextOffset_ || !hasDuplicateRows_) {
     return;
   }
diff --git a/velox/exec/RowContainer.h b/velox/exec/RowContainer.h
index 722dfbdc8997..147cf966825d 100644
--- a/velox/exec/RowContainer.h
+++ b/velox/exec/RowContainer.h
@@ -680,7 +680,8 @@ class RowContainer {
   /// Creates a next-row-vector if it doesn't exist. Appends the row address to
   /// the next-row-vector, and store the address of the next-row-vector in the
   /// 'nextOffset_' slot for all duplicate rows.
-  void appendNextRow(char* current, char* nextRow);
+  void
+  appendNextRow(char* current, char* nextRow, HashStringAllocator* allocator);
 
   NextRowVector*& getNextRowVector(char* row) const {
     return *reinterpret_cast<NextRowVector**>(row + nextOffset_);
   }
@@ -1306,6 +1307,9 @@ class RowContainer {
   const std::vector<TypePtr> keyTypes_;
   const bool nullableKeys_;
   const bool isJoinBuild_;
+  // True if normalized keys are enabled in initial state.
+  const bool hasNormalizedKeys_;
+  const std::shared_ptr<HashStringAllocator> stringAllocator_;
 
   std::vector<bool> columnHasNulls_;
@@ -1342,8 +1346,6 @@ class RowContainer {
   int32_t fixedRowSize_;
   // How many bytes do the flags (null, probed, free) occupy.
   int32_t flagBytes_;
-  // True if normalized keys are enabled in initial state.
-  const bool hasNormalizedKeys_;
   // The count of entries that have an extra normalized_key_t before the
   // start.
   int64_t numRowsWithNormalizedKey_ = 0;
@@ -1362,7 +1364,6 @@
   uint64_t numFreeRows_ = 0;
 
   memory::AllocationPool rows_;
-  std::shared_ptr<HashStringAllocator> stringAllocator_;
 
   int alignment_ = 1;
 };
diff --git a/velox/exec/benchmarks/HashJoinListResultBenchmark.cpp b/velox/exec/benchmarks/HashJoinListResultBenchmark.cpp
index aba2a254068b..f0478f350e7f 100644
--- a/velox/exec/benchmarks/HashJoinListResultBenchmark.cpp
+++ b/velox/exec/benchmarks/HashJoinListResultBenchmark.cpp
@@ -373,14 +373,15 @@ class HashTableListJoinResultBenchmark : public VectorTestBase {
       }
     }
     SelectivityInfo buildClocks;
+    uint64_t time{0};
     {
-      SelectivityTimer timer(buildClocks, 0);
+      ClockTimer timer(time);
       topTable_->prepareJoinTable(
          std::move(otherTables),
          BaseHashTable::kNoSpillInputStartPartitionBit,
          executor_.get());
     }
-    buildTime_ = buildClocks.timeToDropValue();
+    buildTime_ = time;
   }
diff --git a/velox/exec/tests/RowContainerTest.cpp b/velox/exec/tests/RowContainerTest.cpp
index 96b8dc01be8b..857de03cad90 100644
--- a/velox/exec/tests/RowContainerTest.cpp
+++ b/velox/exec/tests/RowContainerTest.cpp
@@ -1980,7 +1980,7 @@ TEST_F(RowContainerTest, nextRowVector) {
     EXPECT_EQ(rows, rowsFromContainer);
 
     for (int i = 0; i + 2 <= numRows; i += 2) {
-      data->appendNextRow(rows[i], rows[i + 1]);
+      data->appendNextRow(rows[i], rows[i + 1], &data->stringAllocator());
     }
     validateNextRowVector();
   };
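Usage note (not part of the patch): with this change RowContainer::appendNextRow() no longer reaches for the container's own stringAllocator_; the caller chooses which HashStringAllocator backs the next-row vectors. A sketch of the two call patterns, assuming the patch is applied; the function names and the way the shared wrapper is obtained are illustrative assumptions:

```cpp
#include "velox/common/memory/HashStringAllocator.h"
#include "velox/exec/RowContainer.h"

using facebook::velox::ThreadSafeHashStringAllocator;
using facebook::velox::exec::RowContainer;

// Serial insert path: preserve the old behavior by passing the container's
// own string allocator explicitly, as the updated RowContainerTest does.
void appendSerial(RowContainer& container, char* existingRow, char* newRow) {
  container.appendNextRow(existingRow, newRow, &container.stringAllocator());
}

// Parallel join-build path: all build threads share one mutex-protected
// allocator, so concurrent appendNextRow() calls do not race on the free
// lists of the underlying HashStringAllocator.
void appendFromBuildThread(
    RowContainer& container,
    ThreadSafeHashStringAllocator& sharedAllocator,
    char* existingRow,
    char* newRow) {
  container.appendNextRow(existingRow, newRow, &sharedAllocator);
}
```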