diff --git a/common/cacheline.h b/common/cacheline.h
index dd54c64f99..2212516b1f 100644
--- a/common/cacheline.h
+++ b/common/cacheline.h
@@ -7,7 +7,11 @@
 
 #include <stdlib.h>
 
+#if defined(__ARM_ARCH_7A__) || defined(__aarch64__)
+#define KDB_CACHELINE_SIZE 128
+#else
 #define KDB_CACHELINE_SIZE 64
+#endif // __ARM_ARCH_7A__ || __aarch64__
 #define KDB_CACHELINE_ALIGNED __attribute__((aligned(KDB_CACHELINE_SIZE)))
 
 #endif // KDB_COMMON_CACHELINE_H
diff --git a/compiler/scheduler/scheduler-base.cpp b/compiler/scheduler/scheduler-base.cpp
index 1ab866cba5..ecec42fb21 100644
--- a/compiler/scheduler/scheduler-base.cpp
+++ b/compiler/scheduler/scheduler-base.cpp
@@ -6,7 +6,7 @@
 
 #include <cassert>
 
-volatile int tasks_before_sync_node;
+std::atomic<int> tasks_before_sync_node;
 
 static SchedulerBase *scheduler;
 
diff --git a/compiler/scheduler/scheduler-base.h b/compiler/scheduler/scheduler-base.h
index f5d7c3fbc9..875b4a15b2 100644
--- a/compiler/scheduler/scheduler-base.h
+++ b/compiler/scheduler/scheduler-base.h
@@ -4,6 +4,8 @@
 
 #pragma once
 
+#include <atomic>
+
 class Node;
 class Task;
 
@@ -22,7 +24,7 @@ SchedulerBase *get_scheduler();
 void set_scheduler(SchedulerBase *new_scheduler);
 void unset_scheduler(SchedulerBase *old_scheduler);
 
-extern volatile int tasks_before_sync_node;
+extern std::atomic<int> tasks_before_sync_node;
 
 inline void register_async_task(Task *task) {
   get_scheduler()->add_task(task);
diff --git a/compiler/scheduler/scheduler.cpp b/compiler/scheduler/scheduler.cpp
index 633db5dea0..59cc0eb7ab 100644
--- a/compiler/scheduler/scheduler.cpp
+++ b/compiler/scheduler/scheduler.cpp
@@ -67,7 +67,7 @@ void Scheduler::execute() {
   }
 
   while (true) {
-    if (tasks_before_sync_node > 0) {
+    if (tasks_before_sync_node.load(std::memory_order_acquire) > 0) {
       usleep(250);
       continue;
     }
@@ -101,7 +101,7 @@ bool Scheduler::thread_process_node(Node *node) {
   }
   task->execute();
   delete task;
-  __sync_fetch_and_sub(&tasks_before_sync_node, 1);
+  tasks_before_sync_node.fetch_sub(1, std::memory_order_release);
   return true;
 }
 
diff --git a/compiler/stage.cpp b/compiler/stage.cpp
index 3d47150271..d3094aca35 100644
--- a/compiler/stage.cpp
+++ b/compiler/stage.cpp
@@ -2,6 +2,8 @@
 // Copyright (c) 2020 LLC «V Kontakte»
 // Distributed under the GPL v3 License, see LICENSE.notice.txt
 
+#include <atomic>
+
 #include "compiler/stage.h"
 
 #include "common/termformat/termformat.h"
@@ -31,7 +33,7 @@ const char *get_assert_level_desc(AssertLevelT assert_level) {
   }
 }
 
-volatile int ce_locker;
+std::atomic<int> ce_locker;
 
 namespace {
 FILE *warning_file{nullptr};
@@ -44,7 +46,7 @@ void stage::set_warning_file(FILE *file) noexcept {
 }
 
 void on_compilation_error(const char *description __attribute__((unused)), const char *file_name, int line_number, const char *full_description, AssertLevelT assert_level) {
-  AutoLocker<volatile int *> locker(&ce_locker);
+  AutoLocker<std::atomic<int> *> locker(&ce_locker);
   FILE *file = stdout;
   if (assert_level == WRN_ASSERT_LEVEL && warning_file) {
     file = warning_file;
diff --git a/compiler/threading/data-stream.h b/compiler/threading/data-stream.h
index 03e285c184..8023ea9a2d 100644
--- a/compiler/threading/data-stream.h
+++ b/compiler/threading/data-stream.h
@@ -38,7 +38,7 @@ class DataStream {
 
   void operator<<(DataType input) {
     if (!is_sink_mode_) {
-      __sync_fetch_and_add(&tasks_before_sync_node, 1);
+      tasks_before_sync_node.fetch_add(1, std::memory_order_release);
     }
     std::lock_guard<std::mutex> lock{mutex_};
     queue_.push_front(std::move(input));
@@ -60,7 +60,6 @@
   const bool is_sink_mode_;
 };
 
-
 struct EmptyStream {
   template<size_t stream_id>
   using NthDataType = EmptyStream;
diff --git a/compiler/threading/hash-table.h b/compiler/threading/hash-table.h
index 4fa9189d4b..0c6d054505 100644
--- a/compiler/threading/hash-table.h
+++ b/compiler/threading/hash-table.h
@@ -13,7 +13,7 @@ template<class T, int N = 1000000>
 class TSHashTable {
 public:
   struct HTNode : Lockable {
-    unsigned long long hash;
+    std::atomic<unsigned long long> hash;
     T data;
 
     HTNode() :
@@ -24,7 +24,8 @@
 
 private:
   HTNode *nodes;
-  int used_size;
+  std::atomic<int> used_size;
+
 public:
   TSHashTable() :
     nodes(new HTNode[N]),
@@ -34,14 +35,15 @@ HTNode *at(unsigned long long hash) {
     int i = (unsigned)hash % (unsigned)N;
     while (true) {
-      while (nodes[i].hash != 0 && nodes[i].hash != hash) {
+      while (nodes[i].hash.load(std::memory_order_acquire) != 0 && nodes[i].hash.load(std::memory_order_relaxed) != hash) {
         i++;
         if (i == N) {
           i = 0;
         }
       }
 
-      if (nodes[i].hash == 0 && !__sync_bool_compare_and_swap(&nodes[i].hash, 0, hash)) {
-        int id = __sync_fetch_and_add(&used_size, 1);
+      unsigned long long expected = 0;
+      if (nodes[i].hash.load(std::memory_order_acquire) == 0 && !nodes[i].hash.compare_exchange_strong(expected, hash, std::memory_order_acq_rel)) {
+        int id = used_size.fetch_add(1, std::memory_order_relaxed);
         assert(id * 2 < N);
         continue;
       }
@@ -52,20 +54,20 @@
 
   const T *find(unsigned long long hash) {
     int i = (unsigned)hash % (unsigned)N;
-    while (nodes[i].hash != 0 && nodes[i].hash != hash) {
+    while (nodes[i].hash.load(std::memory_order_acquire) != 0 && nodes[i].hash.load(std::memory_order_relaxed) != hash) {
       i++;
       if (i == N) {
         i = 0;
       }
     }
 
-    return nodes[i].hash == hash ? &nodes[i].data : nullptr;
+    return nodes[i].hash.load(std::memory_order_acquire) == hash ? &nodes[i].data : nullptr;
   }
 
   std::vector<T> get_all() {
     std::vector<T> res;
     for (int i = 0; i < N; i++) {
-      if (nodes[i].hash != 0) {
+      if (nodes[i].hash.load(std::memory_order_acquire) != 0) {
         res.push_back(nodes[i].data);
       }
     }
@@ -76,7 +78,7 @@
   std::vector<T> get_all_if(const CondF &callbackF) {
     std::vector<T> res;
     for (int i = 0; i < N; i++) {
-      if (nodes[i].hash != 0 && callbackF(nodes[i].data)) {
+      if (nodes[i].hash.load(std::memory_order_acquire) != 0 && callbackF(nodes[i].data)) {
         res.push_back(nodes[i].data);
       }
     }
diff --git a/compiler/threading/locks.h b/compiler/threading/locks.h
index e90fb041fe..c831920e87 100644
--- a/compiler/threading/locks.h
+++ b/compiler/threading/locks.h
@@ -4,9 +4,14 @@
 
 #pragma once
 
+#include <atomic>
 #include <cassert>
 #include <unistd.h>
 
+#include "common/cacheline.h"
+
+enum { LOCKED = 1, UNLOCKED = 0 };
+
 template<class T>
 bool try_lock(T);
 
@@ -20,28 +25,43 @@ void unlock(T locker) {
   locker->unlock();
 }
 
-inline bool try_lock(volatile int *locker) {
-  return __sync_lock_test_and_set(locker, 1) == 0;
+inline bool try_lock(std::atomic<int> *locker) {
+  int expected = UNLOCKED;
+  return locker->compare_exchange_weak(expected, LOCKED, std::memory_order_acq_rel);
 }
 
-inline void lock(volatile int *locker) {
+inline void lock(std::atomic<int> *locker) {
   while (!try_lock(locker)) {
     usleep(250);
   }
 }
 
-inline void unlock(volatile int *locker) {
-  assert(*locker == 1);
-  __sync_lock_release(locker);
+inline void unlock(std::atomic<int> *locker) {
+  assert(locker->load(std::memory_order_relaxed) == LOCKED);
+  locker->store(UNLOCKED, std::memory_order_release);
 }
 
-class Lockable {
+class KDB_CACHELINE_ALIGNED Lockable {
 private:
-  volatile int x;
+  std::atomic<int> x;
+
 public:
   Lockable() :
     x(0) {}
 
+  Lockable(const Lockable &other) noexcept :
+    x{other.x.load(std::memory_order_relaxed)} {}
+  Lockable(Lockable &&other) noexcept :
+    x{other.x.load(std::memory_order_relaxed)} {}
+  Lockable &operator=(const Lockable &other) noexcept {
+    x = other.x.load(std::memory_order_relaxed);
+    return *this;
+  }
+  Lockable &operator=(Lockable &&other) noexcept {
+    x = other.x.load(std::memory_order_relaxed);
+    return *this;
+  }
+  virtual ~Lockable() = default;
 
   void lock() {
     lock(&x);
diff --git a/compiler/threading/thread-id.cpp b/compiler/threading/thread-id.cpp
index a7d30b07db..f03235e294 100644
--- a/compiler/threading/thread-id.cpp
+++ b/compiler/threading/thread-id.cpp
@@ -4,7 +4,7 @@
 
 #include "compiler/threading/thread-id.h"
 
-static __thread int bicycle_thread_id;
+static thread_local int bicycle_thread_id;
 
 int get_thread_id() {
   return bicycle_thread_id;
diff --git a/compiler/threading/tls.h b/compiler/threading/tls.h
index 0b0f2a83f8..2be42a6e91 100644
--- a/compiler/threading/tls.h
+++ b/compiler/threading/tls.h
@@ -8,6 +8,8 @@
 #include <cassert>
 #include <vector>
 
+#include "common/cacheline.h"
+
 #include "compiler/threading/locks.h"
 #include "compiler/threading/thread-id.h"
 
@@ -23,10 +25,8 @@ inline uint32_t get_default_threads_count() noexcept {
 template<class T>
 struct TLS {
 private:
-  struct TLSRaw {
+  struct KDB_CACHELINE_ALIGNED TLSRaw {
     T data{};
-    volatile int locker = 0;
-    char dummy[4096];
   };
 
   // The thread with thread_id = 0 is the main thread in which the scheduler's master code is executed.
@@ -49,7 +49,6 @@
   arr() {
   }
 
-
   T &get() {
     return get_raw()->data;
   }
@@ -69,19 +68,6 @@
   int size() {
     return MAX_THREADS_COUNT + 1;
   }
-
-  T *lock_get() {
-    TLSRaw *raw = get_raw();
-    bool ok = try_lock(&raw->locker);
-    assert(ok);
-    return &raw->data;
-  }
-
-  void unlock_get(T *ptr) {
-    TLSRaw *raw = get_raw();
-    assert(&raw->data == ptr);
-    unlock(&raw->locker);
-  }
 };
 
 #pragma GCC diagnostic pop
diff --git a/runtime/critical_section.cpp b/runtime/critical_section.cpp
index a7097a1503..7c93a12e0c 100644
--- a/runtime/critical_section.cpp
+++ b/runtime/critical_section.cpp
@@ -16,8 +16,8 @@ void check_stack_overflow() {
 }
 
 namespace dl {
-volatile int in_critical_section = 0;
-volatile long long pending_signals = 0;
+volatile int in_critical_section;
+volatile long long pending_signals;
 
 void enter_critical_section() noexcept {
   check_stack_overflow();