-
Notifications
You must be signed in to change notification settings - Fork 3
Cache Bypassing For Scalar Get Set
Sometimes an access pattern has a close-to-zero hit rate on the (LRU) cache. To get better performance in such cases, there are uncached versions of set/get:
virtualArray.setUncached(random_index,my_object);
auto val = virtualArray.getUncached(random_index);
For every block of uncached get/set operations, streaming mode needs to be enabled and disabled:
arr.streamStart();
arr.setUncached(..);
arr.getUncached(..);
arr.streamStop();
The streamStart command flushes all edited active pages to VRAM; the streamStop command updates all active pages with the new values from VRAM. If no cached set/get/write/read/map command is used, then streamStart/streamStop are not required.
The advantage of the uncached set/get methods is that they skip the LRU algorithm, which takes some CPU cycles. Because of this, writes can be twice as fast, while read latency can improve by up to 50% for single-threaded access and by up to 10% for multithreaded access.
Example benchmark:
#include "GraphicsCardSupplyDepot.h"
#include "VirtualMultiArray.h"
#include "PcieBandwidthBenchmarker.h"
#include "CpuBenchmarker.h"
// testing
#include <random>
#include <iostream>
#include <stdexcept>
#include <string>
#include "omp.h"
// Benchmark mode selection. testType picks the payload size of Object and,
// in main(), the number of elements in the virtual array.
constexpr bool TEST_BANDWIDTH=true;
constexpr bool TEST_LATENCY=false;
constexpr bool testType = TEST_LATENCY;

// Element type stored in the virtual array during the benchmark.
// In bandwidth mode the object is padded to 512 KiB in total; in latency mode
// it is kept as small as possible (one int of padding plus the id).
class Object
{
public:
    // Default-constructed objects carry the sentinel id -1.
    Object():id(-1){}

    // Tags the object with the given id (the benchmark uses the element index).
    Object(int p):id(p){}

    // Returns the id this object was constructed with.
    // (Returned by plain value: a top-level const on a by-value int has no effect.)
    int getId() const {return id;}

private:
    // Padding only; never read or written. The size is expressed with
    // sizeof(int) instead of a literal 4 so that padding + id is exactly
    // 512 KiB in bandwidth mode regardless of the platform's int width.
    char data[testType?(1024*512 - sizeof(int)):(sizeof(int))];
    int id;
};

static_assert(!testType || sizeof(Object)==1024*512,
              "Object must be exactly 512 KiB in bandwidth mode");
// Runs `op` on uniformly random indices in [0, numElements-1] from
// `numThreads` OpenMP threads, `opsPerThread` calls per thread.
//
// `op` is called as op(long long index) and must return true on success.
// Returns true only if every call returned true.
//
// Failures are folded into a reduction variable instead of being thrown from
// inside the loop: an exception must not leave an OpenMP parallel region.
// Every call is still executed after a failure so the amount of timed work
// does not depend on the verification result.
template<class Operation>
static bool runRandomAccesses(const int numThreads, const int opsPerThread, const long long numElements, Operation op)
{
    bool allOk = true;
    #pragma omp parallel for num_threads(numThreads) reduction(&&:allOk)
    for(int t=0;t<numThreads;t++)
    {
        // One independently seeded generator per worker iteration.
        std::random_device rd;
        std::mt19937 rng(rd());
        // Integer distribution: exact for any index and includes the last element.
        std::uniform_int_distribution<long long> pick(0,numElements-1);
        for(int k=0;k<opsPerThread;k++)
        {
            const long long index = pick(rng);
            // op() is the left operand so it is always evaluated.
            allOk = op(index) && allOk;
        }
    }
    return allOk;
}

// Benchmark: compares cached vs. uncached scalar set/get on a VirtualMultiArray
// for 1..64 threads using random access (worst case for the LRU cache).
int main()
{
    const long long pageSize = 1;
    const long long n = pageSize*(testType?4000:100000);
    const int numTestsPerThread = 25;

    // NOTE(review): the literal 3 is presumably the number of active (cached)
    // pages per channel - confirm against the VirtualMultiArray constructor.
    VirtualMultiArray<Object> test(n,GraphicsCardSupplyDepot().requestGpus(),pageSize,3,PcieBandwidthBenchmarker().bestBandwidth(10));

    // Initialize every element so that element j holds id j; the get phases
    // below verify this invariant.
    #pragma omp parallel for
    for(long long j=0;j<n;j++)
    {
        test.set(j,Object(static_cast<int>(j)));
    }

    for(int i=1;i<=64;i++)
    {
        const std::string threadLabel = std::to_string(i)+std::string("threads");
        const auto bytesPerPhase = i*numTestsPerThread*sizeof(Object);
        const int opsPerPhase = i*numTestsPerThread;

        // Each phase lives in its own scope so the CpuBenchmarker covers exactly
        // that phase (it presumably reports when it goes out of scope).
        {
            CpuBenchmarker bench(bytesPerPhase,std::string("scalar cached set, ")+threadLabel,opsPerPhase);
            runRandomAccesses(i,numTestsPerThread,n,[&](const long long index) -> bool
            {
                test.set(index,Object(static_cast<int>(index)));
                return true;
            });
        }

        bool cachedGetOk = true;
        {
            CpuBenchmarker bench(bytesPerPhase,std::string("scalar cached get, ")+threadLabel,opsPerPhase);
            cachedGetOk = runRandomAccesses(i,numTestsPerThread,n,[&](const long long index) -> bool
            {
                const auto obj = test.get(index);
                return obj.getId()==index;
            });
        }
        if(!cachedGetOk)
        {
            throw std::invalid_argument("Error: set/get");
        }

        // Uncached accesses must be bracketed by streamStart()/streamStop():
        // streamStart flushes edited active pages to VRAM, streamStop refreshes
        // the active pages from VRAM afterwards.
        test.streamStart();
        {
            CpuBenchmarker bench(bytesPerPhase,std::string("scalar uncached set, ")+threadLabel,opsPerPhase);
            runRandomAccesses(i,numTestsPerThread,n,[&](const long long index) -> bool
            {
                test.setUncached(index,Object(static_cast<int>(index)));
                return true;
            });
        }

        bool uncachedGetOk = true;
        {
            CpuBenchmarker bench(bytesPerPhase,std::string("scalar uncached get, ")+threadLabel,opsPerPhase);
            uncachedGetOk = runRandomAccesses(i,numTestsPerThread,n,[&](const long long index) -> bool
            {
                const Object obj = test.getUncached(index);
                return obj.getId()==index;
            });
        }
        // Leave streaming mode before reporting a verification failure.
        test.streamStop();
        if(!uncachedGetOk)
        {
            throw std::invalid_argument("Error: set/get");
        }

        std::cout<<"==================================================================="<<std::endl;
    }
    return 0;
}
output:
scalar cached set, 1threads: 631240 nanoseconds (bandwidth = 0.32 MB/s) (throughput = 25249.60 nanoseconds per iteration)
scalar cached get, 1threads: 576859 nanoseconds (bandwidth = 0.35 MB/s) (throughput = 23074.36 nanoseconds per iteration)
scalar uncached set, 1threads: 364622 nanoseconds (bandwidth = 0.55 MB/s) (throughput = 14584.88 nanoseconds per iteration)
scalar uncached get, 1threads: 358113 nanoseconds (bandwidth = 0.56 MB/s) (throughput = 14324.52 nanoseconds per iteration)
===================================================================
scalar cached set, 2threads: 894248 nanoseconds (bandwidth = 0.45 MB/s) (throughput = 17884.96 nanoseconds per iteration)
scalar cached get, 2threads: 792666 nanoseconds (bandwidth = 0.50 MB/s) (throughput = 15853.32 nanoseconds per iteration)
scalar uncached set, 2threads: 681664 nanoseconds (bandwidth = 0.59 MB/s) (throughput = 13633.28 nanoseconds per iteration)
scalar uncached get, 2threads: 741334 nanoseconds (bandwidth = 0.54 MB/s) (throughput = 14826.68 nanoseconds per iteration)
===================================================================
scalar cached set, 3threads: 1287174 nanoseconds (bandwidth = 0.47 MB/s) (throughput = 17162.32 nanoseconds per iteration)
scalar cached get, 3threads: 1750979 nanoseconds (bandwidth = 0.34 MB/s) (throughput = 23346.39 nanoseconds per iteration)
scalar uncached set, 3threads: 1120729 nanoseconds (bandwidth = 0.54 MB/s) (throughput = 14943.05 nanoseconds per iteration)
scalar uncached get, 3threads: 1197230 nanoseconds (bandwidth = 0.50 MB/s) (throughput = 15963.07 nanoseconds per iteration)
===================================================================
scalar cached set, 4threads: 5423048 nanoseconds (bandwidth = 0.15 MB/s) (throughput = 54230.48 nanoseconds per iteration)
scalar cached get, 4threads: 2207157 nanoseconds (bandwidth = 0.36 MB/s) (throughput = 22071.57 nanoseconds per iteration)
scalar uncached set, 4threads: 1470098 nanoseconds (bandwidth = 0.54 MB/s) (throughput = 14700.98 nanoseconds per iteration)
scalar uncached get, 4threads: 1657056 nanoseconds (bandwidth = 0.48 MB/s) (throughput = 16570.56 nanoseconds per iteration)
===================================================================
scalar cached set, 5threads: 2052123 nanoseconds (bandwidth = 0.49 MB/s) (throughput = 16416.98 nanoseconds per iteration)
scalar cached get, 5threads: 2622677 nanoseconds (bandwidth = 0.38 MB/s) (throughput = 20981.42 nanoseconds per iteration)
scalar uncached set, 5threads: 1789924 nanoseconds (bandwidth = 0.56 MB/s) (throughput = 14319.39 nanoseconds per iteration)
scalar uncached get, 5threads: 1960566 nanoseconds (bandwidth = 0.51 MB/s) (throughput = 15684.53 nanoseconds per iteration)
===================================================================
scalar cached set, 6threads: 2440860 nanoseconds (bandwidth = 0.49 MB/s) (throughput = 16272.40 nanoseconds per iteration)
scalar cached get, 6threads: 2705441 nanoseconds (bandwidth = 0.44 MB/s) (throughput = 18036.27 nanoseconds per iteration)
scalar uncached set, 6threads: 1933212 nanoseconds (bandwidth = 0.62 MB/s) (throughput = 12888.08 nanoseconds per iteration)
scalar uncached get, 6threads: 1889944 nanoseconds (bandwidth = 0.63 MB/s) (throughput = 12599.63 nanoseconds per iteration)
===================================================================
scalar cached set, 7threads: 3117703 nanoseconds (bandwidth = 0.45 MB/s) (throughput = 17815.45 nanoseconds per iteration)
scalar cached get, 7threads: 3345039 nanoseconds (bandwidth = 0.42 MB/s) (throughput = 19114.51 nanoseconds per iteration)
scalar uncached set, 7threads: 2559306 nanoseconds (bandwidth = 0.55 MB/s) (throughput = 14624.61 nanoseconds per iteration)
scalar uncached get, 7threads: 2280535 nanoseconds (bandwidth = 0.61 MB/s) (throughput = 13031.63 nanoseconds per iteration)
===================================================================
scalar cached set, 8threads: 4999430 nanoseconds (bandwidth = 0.32 MB/s) (throughput = 24997.15 nanoseconds per iteration)
scalar cached get, 8threads: 5446593 nanoseconds (bandwidth = 0.29 MB/s) (throughput = 27232.97 nanoseconds per iteration)
scalar uncached set, 8threads: 3353092 nanoseconds (bandwidth = 0.48 MB/s) (throughput = 16765.46 nanoseconds per iteration)
scalar uncached get, 8threads: 4212719 nanoseconds (bandwidth = 0.38 MB/s) (throughput = 21063.60 nanoseconds per iteration)
===================================================================
scalar cached set, 9threads: 3972187 nanoseconds (bandwidth = 0.45 MB/s) (throughput = 17654.16 nanoseconds per iteration)
scalar cached get, 9threads: 4490111 nanoseconds (bandwidth = 0.40 MB/s) (throughput = 19956.05 nanoseconds per iteration)
scalar uncached set, 9threads: 3231230 nanoseconds (bandwidth = 0.56 MB/s) (throughput = 14361.02 nanoseconds per iteration)
scalar uncached get, 9threads: 3323277 nanoseconds (bandwidth = 0.54 MB/s) (throughput = 14770.12 nanoseconds per iteration)
===================================================================
scalar cached set, 10threads: 4460103 nanoseconds (bandwidth = 0.45 MB/s) (throughput = 17840.41 nanoseconds per iteration)
scalar cached get, 10threads: 4644328 nanoseconds (bandwidth = 0.43 MB/s) (throughput = 18577.31 nanoseconds per iteration)
scalar uncached set, 10threads: 3418475 nanoseconds (bandwidth = 0.59 MB/s) (throughput = 13673.90 nanoseconds per iteration)
scalar uncached get, 10threads: 3685433 nanoseconds (bandwidth = 0.54 MB/s) (throughput = 14741.73 nanoseconds per iteration)
===================================================================
scalar cached set, 11threads: 5440126 nanoseconds (bandwidth = 0.40 MB/s) (throughput = 19782.28 nanoseconds per iteration)
scalar cached get, 11threads: 6129922 nanoseconds (bandwidth = 0.36 MB/s) (throughput = 22290.63 nanoseconds per iteration)
scalar uncached set, 11threads: 3613842 nanoseconds (bandwidth = 0.61 MB/s) (throughput = 13141.24 nanoseconds per iteration)
scalar uncached get, 11threads: 4318797 nanoseconds (bandwidth = 0.51 MB/s) (throughput = 15704.72 nanoseconds per iteration)
===================================================================
scalar cached set, 12threads: 5873667 nanoseconds (bandwidth = 0.41 MB/s) (throughput = 19578.89 nanoseconds per iteration)
scalar cached get, 12threads: 5354420 nanoseconds (bandwidth = 0.45 MB/s) (throughput = 17848.07 nanoseconds per iteration)
scalar uncached set, 12threads: 5315196 nanoseconds (bandwidth = 0.45 MB/s) (throughput = 17717.32 nanoseconds per iteration)
scalar uncached get, 12threads: 4449370 nanoseconds (bandwidth = 0.54 MB/s) (throughput = 14831.23 nanoseconds per iteration)
===================================================================
scalar cached set, 13threads: 6271409 nanoseconds (bandwidth = 0.41 MB/s) (throughput = 19296.64 nanoseconds per iteration)
scalar cached get, 13threads: 5806685 nanoseconds (bandwidth = 0.45 MB/s) (throughput = 17866.72 nanoseconds per iteration)
scalar uncached set, 13threads: 5036642 nanoseconds (bandwidth = 0.52 MB/s) (throughput = 15497.36 nanoseconds per iteration)
scalar uncached get, 13threads: 4718302 nanoseconds (bandwidth = 0.55 MB/s) (throughput = 14517.85 nanoseconds per iteration)
===================================================================
scalar cached set, 14threads: 6791267 nanoseconds (bandwidth = 0.41 MB/s) (throughput = 19403.62 nanoseconds per iteration)
scalar cached get, 14threads: 6379612 nanoseconds (bandwidth = 0.44 MB/s) (throughput = 18227.46 nanoseconds per iteration)
scalar uncached set, 14threads: 4914211 nanoseconds (bandwidth = 0.57 MB/s) (throughput = 14040.60 nanoseconds per iteration)
scalar uncached get, 14threads: 5296954 nanoseconds (bandwidth = 0.53 MB/s) (throughput = 15134.15 nanoseconds per iteration)
===================================================================
scalar cached set, 15threads: 6751554 nanoseconds (bandwidth = 0.44 MB/s) (throughput = 18004.14 nanoseconds per iteration)
scalar cached get, 15threads: 6185170 nanoseconds (bandwidth = 0.49 MB/s) (throughput = 16493.79 nanoseconds per iteration)
scalar uncached set, 15threads: 5220272 nanoseconds (bandwidth = 0.57 MB/s) (throughput = 13920.73 nanoseconds per iteration)
scalar uncached get, 15threads: 5348351 nanoseconds (bandwidth = 0.56 MB/s) (throughput = 14262.27 nanoseconds per iteration)
===================================================================
scalar cached set, 16threads: 8088210 nanoseconds (bandwidth = 0.40 MB/s) (throughput = 20220.53 nanoseconds per iteration)
scalar cached get, 16threads: 6965195 nanoseconds (bandwidth = 0.46 MB/s) (throughput = 17412.99 nanoseconds per iteration)
scalar uncached set, 16threads: 6133680 nanoseconds (bandwidth = 0.52 MB/s) (throughput = 15334.20 nanoseconds per iteration)
scalar uncached get, 16threads: 6204725 nanoseconds (bandwidth = 0.52 MB/s) (throughput = 15511.81 nanoseconds per iteration)
===================================================================
scalar cached set, 17threads: 8222609 nanoseconds (bandwidth = 0.41 MB/s) (throughput = 19347.32 nanoseconds per iteration)
scalar cached get, 17threads: 6807366 nanoseconds (bandwidth = 0.50 MB/s) (throughput = 16017.33 nanoseconds per iteration)
scalar uncached set, 17threads: 5985567 nanoseconds (bandwidth = 0.57 MB/s) (throughput = 14083.69 nanoseconds per iteration)
scalar uncached get, 17threads: 5893776 nanoseconds (bandwidth = 0.58 MB/s) (throughput = 13867.71 nanoseconds per iteration)
===================================================================
scalar cached set, 18threads: 8488874 nanoseconds (bandwidth = 0.42 MB/s) (throughput = 18864.16 nanoseconds per iteration)
scalar cached get, 18threads: 7212841 nanoseconds (bandwidth = 0.50 MB/s) (throughput = 16028.54 nanoseconds per iteration)
scalar uncached set, 18threads: 6487665 nanoseconds (bandwidth = 0.55 MB/s) (throughput = 14417.03 nanoseconds per iteration)
scalar uncached get, 18threads: 6459853 nanoseconds (bandwidth = 0.56 MB/s) (throughput = 14355.23 nanoseconds per iteration)
===================================================================
scalar cached set, 19threads: 8987782 nanoseconds (bandwidth = 0.42 MB/s) (throughput = 18921.65 nanoseconds per iteration)
scalar cached get, 19threads: 7666428 nanoseconds (bandwidth = 0.50 MB/s) (throughput = 16139.85 nanoseconds per iteration)
scalar uncached set, 19threads: 6830533 nanoseconds (bandwidth = 0.56 MB/s) (throughput = 14380.07 nanoseconds per iteration)
scalar uncached get, 19threads: 6777907 nanoseconds (bandwidth = 0.56 MB/s) (throughput = 14269.28 nanoseconds per iteration)
===================================================================
scalar cached set, 20threads: 9572720 nanoseconds (bandwidth = 0.42 MB/s) (throughput = 19145.44 nanoseconds per iteration)
scalar cached get, 20threads: 7909843 nanoseconds (bandwidth = 0.51 MB/s) (throughput = 15819.69 nanoseconds per iteration)
scalar uncached set, 20threads: 7494471 nanoseconds (bandwidth = 0.53 MB/s) (throughput = 14988.94 nanoseconds per iteration)
scalar uncached get, 20threads: 7263069 nanoseconds (bandwidth = 0.55 MB/s) (throughput = 14526.14 nanoseconds per iteration)
===================================================================
scalar cached set, 21threads: 9898320 nanoseconds (bandwidth = 0.42 MB/s) (throughput = 18853.94 nanoseconds per iteration)
scalar cached get, 21threads: 8974454 nanoseconds (bandwidth = 0.47 MB/s) (throughput = 17094.20 nanoseconds per iteration)
scalar uncached set, 21threads: 8149802 nanoseconds (bandwidth = 0.52 MB/s) (throughput = 15523.43 nanoseconds per iteration)
scalar uncached get, 21threads: 7169336 nanoseconds (bandwidth = 0.59 MB/s) (throughput = 13655.88 nanoseconds per iteration)
===================================================================
scalar cached set, 22threads: 10535488 nanoseconds (bandwidth = 0.42 MB/s) (throughput = 19155.43 nanoseconds per iteration)
scalar cached get, 22threads: 9645580 nanoseconds (bandwidth = 0.46 MB/s) (throughput = 17537.42 nanoseconds per iteration)
scalar uncached set, 22threads: 7670474 nanoseconds (bandwidth = 0.57 MB/s) (throughput = 13946.32 nanoseconds per iteration)
scalar uncached get, 22threads: 7733832 nanoseconds (bandwidth = 0.57 MB/s) (throughput = 14061.51 nanoseconds per iteration)
===================================================================
scalar cached set, 23threads: 13323303 nanoseconds (bandwidth = 0.35 MB/s) (throughput = 23170.96 nanoseconds per iteration)
scalar cached get, 23threads: 8989479 nanoseconds (bandwidth = 0.51 MB/s) (throughput = 15633.88 nanoseconds per iteration)
scalar uncached set, 23threads: 7944639 nanoseconds (bandwidth = 0.58 MB/s) (throughput = 13816.76 nanoseconds per iteration)
scalar uncached get, 23threads: 7547908 nanoseconds (bandwidth = 0.61 MB/s) (throughput = 13126.80 nanoseconds per iteration)
===================================================================
scalar cached set, 24threads: 11855103 nanoseconds (bandwidth = 0.40 MB/s) (throughput = 19758.51 nanoseconds per iteration)
scalar cached get, 24threads: 9004947 nanoseconds (bandwidth = 0.53 MB/s) (throughput = 15008.25 nanoseconds per iteration)
scalar uncached set, 24threads: 8094817 nanoseconds (bandwidth = 0.59 MB/s) (throughput = 13491.36 nanoseconds per iteration)
scalar uncached get, 24threads: 8564948 nanoseconds (bandwidth = 0.56 MB/s) (throughput = 14274.91 nanoseconds per iteration)
===================================================================
scalar cached set, 25threads: 12061635 nanoseconds (bandwidth = 0.41 MB/s) (throughput = 19298.62 nanoseconds per iteration)
scalar cached get, 25threads: 9686548 nanoseconds (bandwidth = 0.52 MB/s) (throughput = 15498.48 nanoseconds per iteration)
scalar uncached set, 25threads: 9095523 nanoseconds (bandwidth = 0.55 MB/s) (throughput = 14552.84 nanoseconds per iteration)
scalar uncached get, 25threads: 9610507 nanoseconds (bandwidth = 0.52 MB/s) (throughput = 15376.81 nanoseconds per iteration)
===================================================================
scalar cached set, 26threads: 13630227 nanoseconds (bandwidth = 0.38 MB/s) (throughput = 20969.58 nanoseconds per iteration)
scalar cached get, 26threads: 10239935 nanoseconds (bandwidth = 0.51 MB/s) (throughput = 15753.75 nanoseconds per iteration)
scalar uncached set, 26threads: 8953193 nanoseconds (bandwidth = 0.58 MB/s) (throughput = 13774.14 nanoseconds per iteration)
scalar uncached get, 26threads: 9001990 nanoseconds (bandwidth = 0.58 MB/s) (throughput = 13849.22 nanoseconds per iteration)
===================================================================
scalar cached set, 27threads: 13299100 nanoseconds (bandwidth = 0.41 MB/s) (throughput = 19702.37 nanoseconds per iteration)
scalar cached get, 27threads: 10246967 nanoseconds (bandwidth = 0.53 MB/s) (throughput = 15180.69 nanoseconds per iteration)
scalar uncached set, 27threads: 9830727 nanoseconds (bandwidth = 0.55 MB/s) (throughput = 14564.04 nanoseconds per iteration)
scalar uncached get, 27threads: 9365766 nanoseconds (bandwidth = 0.58 MB/s) (throughput = 13875.21 nanoseconds per iteration)
===================================================================
scalar cached set, 28threads: 16082237 nanoseconds (bandwidth = 0.35 MB/s) (throughput = 22974.62 nanoseconds per iteration)
scalar cached get, 28threads: 12099787 nanoseconds (bandwidth = 0.46 MB/s) (throughput = 17285.41 nanoseconds per iteration)
scalar uncached set, 28threads: 9231972 nanoseconds (bandwidth = 0.61 MB/s) (throughput = 13188.53 nanoseconds per iteration)
scalar uncached get, 28threads: 10501294 nanoseconds (bandwidth = 0.53 MB/s) (throughput = 15001.85 nanoseconds per iteration)
===================================================================
scalar cached set, 29threads: 14578584 nanoseconds (bandwidth = 0.40 MB/s) (throughput = 20108.39 nanoseconds per iteration)
scalar cached get, 29threads: 11485306 nanoseconds (bandwidth = 0.50 MB/s) (throughput = 15841.80 nanoseconds per iteration)
scalar uncached set, 29threads: 9836257 nanoseconds (bandwidth = 0.59 MB/s) (throughput = 13567.25 nanoseconds per iteration)
scalar uncached get, 29threads: 9866161 nanoseconds (bandwidth = 0.59 MB/s) (throughput = 13608.50 nanoseconds per iteration)
===================================================================
scalar cached set, 30threads: 16116942 nanoseconds (bandwidth = 0.37 MB/s) (throughput = 21489.26 nanoseconds per iteration)
scalar cached get, 30threads: 11621775 nanoseconds (bandwidth = 0.52 MB/s) (throughput = 15495.70 nanoseconds per iteration)
scalar uncached set, 30threads: 12603637 nanoseconds (bandwidth = 0.48 MB/s) (throughput = 16804.85 nanoseconds per iteration)
scalar uncached get, 30threads: 10352982 nanoseconds (bandwidth = 0.58 MB/s) (throughput = 13803.98 nanoseconds per iteration)
===================================================================
scalar cached set, 31threads: 16498009 nanoseconds (bandwidth = 0.38 MB/s) (throughput = 21287.75 nanoseconds per iteration)
scalar cached get, 31threads: 13051952 nanoseconds (bandwidth = 0.48 MB/s) (throughput = 16841.23 nanoseconds per iteration)
scalar uncached set, 31threads: 10985775 nanoseconds (bandwidth = 0.56 MB/s) (throughput = 14175.19 nanoseconds per iteration)
scalar uncached get, 31threads: 10882004 nanoseconds (bandwidth = 0.57 MB/s) (throughput = 14041.30 nanoseconds per iteration)
===================================================================
scalar cached set, 32threads: 16279916 nanoseconds (bandwidth = 0.39 MB/s) (throughput = 20349.90 nanoseconds per iteration)
scalar cached get, 32threads: 13159572 nanoseconds (bandwidth = 0.49 MB/s) (throughput = 16449.47 nanoseconds per iteration)
scalar uncached set, 32threads: 10965009 nanoseconds (bandwidth = 0.58 MB/s) (throughput = 13706.26 nanoseconds per iteration)
scalar uncached get, 32threads: 10918201 nanoseconds (bandwidth = 0.59 MB/s) (throughput = 13647.75 nanoseconds per iteration)
===================================================================
scalar cached set, 33threads: 17150738 nanoseconds (bandwidth = 0.38 MB/s) (throughput = 20788.77 nanoseconds per iteration)
scalar cached get, 33threads: 13000754 nanoseconds (bandwidth = 0.51 MB/s) (throughput = 15758.49 nanoseconds per iteration)
scalar uncached set, 33threads: 11747109 nanoseconds (bandwidth = 0.56 MB/s) (throughput = 14238.92 nanoseconds per iteration)
scalar uncached get, 33threads: 11385147 nanoseconds (bandwidth = 0.58 MB/s) (throughput = 13800.18 nanoseconds per iteration)
===================================================================
scalar cached set, 34threads: 17229417 nanoseconds (bandwidth = 0.39 MB/s) (throughput = 20269.90 nanoseconds per iteration)
scalar cached get, 34threads: 12889588 nanoseconds (bandwidth = 0.53 MB/s) (throughput = 15164.22 nanoseconds per iteration)
scalar uncached set, 34threads: 13012646 nanoseconds (bandwidth = 0.52 MB/s) (throughput = 15309.00 nanoseconds per iteration)
scalar uncached get, 34threads: 11917856 nanoseconds (bandwidth = 0.57 MB/s) (throughput = 14021.01 nanoseconds per iteration)
===================================================================
scalar cached set, 35threads: 18147680 nanoseconds (bandwidth = 0.39 MB/s) (throughput = 20740.21 nanoseconds per iteration)
scalar cached get, 35threads: 13632704 nanoseconds (bandwidth = 0.51 MB/s) (throughput = 15580.23 nanoseconds per iteration)
scalar uncached set, 35threads: 12170241 nanoseconds (bandwidth = 0.58 MB/s) (throughput = 13908.85 nanoseconds per iteration)
scalar uncached get, 35threads: 12036331 nanoseconds (bandwidth = 0.58 MB/s) (throughput = 13755.81 nanoseconds per iteration)
===================================================================
scalar cached set, 36threads: 18008514 nanoseconds (bandwidth = 0.40 MB/s) (throughput = 20009.46 nanoseconds per iteration)
scalar cached get, 36threads: 14345471 nanoseconds (bandwidth = 0.50 MB/s) (throughput = 15939.41 nanoseconds per iteration)
scalar uncached set, 36threads: 12387248 nanoseconds (bandwidth = 0.58 MB/s) (throughput = 13763.61 nanoseconds per iteration)
scalar uncached get, 36threads: 13976227 nanoseconds (bandwidth = 0.52 MB/s) (throughput = 15529.14 nanoseconds per iteration)
===================================================================
scalar cached set, 37threads: 19024116 nanoseconds (bandwidth = 0.39 MB/s) (throughput = 20566.61 nanoseconds per iteration)
scalar cached get, 37threads: 16157028 nanoseconds (bandwidth = 0.46 MB/s) (throughput = 17467.06 nanoseconds per iteration)
scalar uncached set, 37threads: 13193369 nanoseconds (bandwidth = 0.56 MB/s) (throughput = 14263.10 nanoseconds per iteration)
scalar uncached get, 37threads: 12624440 nanoseconds (bandwidth = 0.59 MB/s) (throughput = 13648.04 nanoseconds per iteration)
===================================================================
scalar cached set, 38threads: 20109377 nanoseconds (bandwidth = 0.38 MB/s) (throughput = 21167.77 nanoseconds per iteration)
scalar cached get, 38threads: 13941088 nanoseconds (bandwidth = 0.55 MB/s) (throughput = 14674.83 nanoseconds per iteration)
scalar uncached set, 38threads: 13069416 nanoseconds (bandwidth = 0.58 MB/s) (throughput = 13757.28 nanoseconds per iteration)
scalar uncached get, 38threads: 12866192 nanoseconds (bandwidth = 0.59 MB/s) (throughput = 13543.36 nanoseconds per iteration)
===================================================================
scalar cached set, 39threads: 20849560 nanoseconds (bandwidth = 0.37 MB/s) (throughput = 21384.16 nanoseconds per iteration)
scalar cached get, 39threads: 14980244 nanoseconds (bandwidth = 0.52 MB/s) (throughput = 15364.35 nanoseconds per iteration)
scalar uncached set, 39threads: 15513204 nanoseconds (bandwidth = 0.50 MB/s) (throughput = 15910.98 nanoseconds per iteration)
scalar uncached get, 39threads: 13542077 nanoseconds (bandwidth = 0.58 MB/s) (throughput = 13889.31 nanoseconds per iteration)
===================================================================
scalar cached set, 40threads: 21111675 nanoseconds (bandwidth = 0.38 MB/s) (throughput = 21111.67 nanoseconds per iteration)
scalar cached get, 40threads: 14929343 nanoseconds (bandwidth = 0.54 MB/s) (throughput = 14929.34 nanoseconds per iteration)
scalar uncached set, 40threads: 14720261 nanoseconds (bandwidth = 0.54 MB/s) (throughput = 14720.26 nanoseconds per iteration)
scalar uncached get, 40threads: 13883596 nanoseconds (bandwidth = 0.58 MB/s) (throughput = 13883.60 nanoseconds per iteration)
===================================================================
scalar cached set, 41threads: 21837940 nanoseconds (bandwidth = 0.38 MB/s) (throughput = 21305.31 nanoseconds per iteration)
scalar cached get, 41threads: 15687758 nanoseconds (bandwidth = 0.52 MB/s) (throughput = 15305.13 nanoseconds per iteration)
scalar uncached set, 41threads: 14450850 nanoseconds (bandwidth = 0.57 MB/s) (throughput = 14098.39 nanoseconds per iteration)
scalar uncached get, 41threads: 13731421 nanoseconds (bandwidth = 0.60 MB/s) (throughput = 13396.51 nanoseconds per iteration)
===================================================================
scalar cached set, 42threads: 23986726 nanoseconds (bandwidth = 0.35 MB/s) (throughput = 22844.50 nanoseconds per iteration)
scalar cached get, 42threads: 15625414 nanoseconds (bandwidth = 0.54 MB/s) (throughput = 14881.35 nanoseconds per iteration)
scalar uncached set, 42threads: 14865864 nanoseconds (bandwidth = 0.57 MB/s) (throughput = 14157.97 nanoseconds per iteration)
scalar uncached get, 42threads: 14199577 nanoseconds (bandwidth = 0.59 MB/s) (throughput = 13523.41 nanoseconds per iteration)
===================================================================
scalar cached set, 43threads: 21767998 nanoseconds (bandwidth = 0.40 MB/s) (throughput = 20249.30 nanoseconds per iteration)
scalar cached get, 43threads: 17182061 nanoseconds (bandwidth = 0.50 MB/s) (throughput = 15983.31 nanoseconds per iteration)
scalar uncached set, 43threads: 15288939 nanoseconds (bandwidth = 0.56 MB/s) (throughput = 14222.27 nanoseconds per iteration)
scalar uncached get, 43threads: 15517985 nanoseconds (bandwidth = 0.55 MB/s) (throughput = 14435.33 nanoseconds per iteration)
===================================================================
scalar cached set, 44threads: 24529425 nanoseconds (bandwidth = 0.36 MB/s) (throughput = 22299.48 nanoseconds per iteration)
scalar cached get, 44threads: 16159428 nanoseconds (bandwidth = 0.54 MB/s) (throughput = 14690.39 nanoseconds per iteration)
scalar uncached set, 44threads: 15097062 nanoseconds (bandwidth = 0.58 MB/s) (throughput = 13724.60 nanoseconds per iteration)
scalar uncached get, 44threads: 15051665 nanoseconds (bandwidth = 0.58 MB/s) (throughput = 13683.33 nanoseconds per iteration)
===================================================================
scalar cached set, 45threads: 23483874 nanoseconds (bandwidth = 0.38 MB/s) (throughput = 20874.55 nanoseconds per iteration)
scalar cached get, 45threads: 17451121 nanoseconds (bandwidth = 0.52 MB/s) (throughput = 15512.11 nanoseconds per iteration)
scalar uncached set, 45threads: 15815197 nanoseconds (bandwidth = 0.57 MB/s) (throughput = 14057.95 nanoseconds per iteration)
scalar uncached get, 45threads: 16966793 nanoseconds (bandwidth = 0.53 MB/s) (throughput = 15081.59 nanoseconds per iteration)
===================================================================
scalar cached set, 46threads: 24450423 nanoseconds (bandwidth = 0.38 MB/s) (throughput = 21261.24 nanoseconds per iteration)
scalar cached get, 46threads: 17295807 nanoseconds (bandwidth = 0.53 MB/s) (throughput = 15039.83 nanoseconds per iteration)
scalar uncached set, 46threads: 16172473 nanoseconds (bandwidth = 0.57 MB/s) (throughput = 14063.02 nanoseconds per iteration)
scalar uncached get, 46threads: 15699103 nanoseconds (bandwidth = 0.59 MB/s) (throughput = 13651.39 nanoseconds per iteration)
===================================================================
scalar cached set, 47threads: 25891378 nanoseconds (bandwidth = 0.36 MB/s) (throughput = 22035.22 nanoseconds per iteration)
scalar cached get, 47threads: 17453082 nanoseconds (bandwidth = 0.54 MB/s) (throughput = 14853.69 nanoseconds per iteration)
scalar uncached set, 47threads: 16498013 nanoseconds (bandwidth = 0.57 MB/s) (throughput = 14040.86 nanoseconds per iteration)
scalar uncached get, 47threads: 16871537 nanoseconds (bandwidth = 0.56 MB/s) (throughput = 14358.75 nanoseconds per iteration)
===================================================================
scalar cached set, 48threads: 24534954 nanoseconds (bandwidth = 0.39 MB/s) (throughput = 20445.79 nanoseconds per iteration)
scalar cached get, 48threads: 18841680 nanoseconds (bandwidth = 0.51 MB/s) (throughput = 15701.40 nanoseconds per iteration)
scalar uncached set, 48threads: 17275508 nanoseconds (bandwidth = 0.56 MB/s) (throughput = 14396.26 nanoseconds per iteration)
scalar uncached get, 48threads: 17215483 nanoseconds (bandwidth = 0.56 MB/s) (throughput = 14346.24 nanoseconds per iteration)
===================================================================
scalar cached set, 49threads: 26304889 nanoseconds (bandwidth = 0.37 MB/s) (throughput = 21473.38 nanoseconds per iteration)
scalar cached get, 49threads: 18372957 nanoseconds (bandwidth = 0.53 MB/s) (throughput = 14998.33 nanoseconds per iteration)
scalar uncached set, 49threads: 16757019 nanoseconds (bandwidth = 0.58 MB/s) (throughput = 13679.20 nanoseconds per iteration)
scalar uncached get, 49threads: 16774562 nanoseconds (bandwidth = 0.58 MB/s) (throughput = 13693.52 nanoseconds per iteration)
===================================================================
scalar cached set, 50threads: 25646477 nanoseconds (bandwidth = 0.39 MB/s) (throughput = 20517.18 nanoseconds per iteration)
scalar cached get, 50threads: 18091327 nanoseconds (bandwidth = 0.55 MB/s) (throughput = 14473.06 nanoseconds per iteration)
scalar uncached set, 50threads: 17601779 nanoseconds (bandwidth = 0.57 MB/s) (throughput = 14081.42 nanoseconds per iteration)
scalar uncached get, 50threads: 17188814 nanoseconds (bandwidth = 0.58 MB/s) (throughput = 13751.05 nanoseconds per iteration)
===================================================================
scalar cached set, 51threads: 27657388 nanoseconds (bandwidth = 0.37 MB/s) (throughput = 21692.07 nanoseconds per iteration)
scalar cached get, 51threads: 18902716 nanoseconds (bandwidth = 0.54 MB/s) (throughput = 14825.66 nanoseconds per iteration)
scalar uncached set, 51threads: 18594709 nanoseconds (bandwidth = 0.55 MB/s) (throughput = 14584.09 nanoseconds per iteration)
scalar uncached get, 51threads: 17971386 nanoseconds (bandwidth = 0.57 MB/s) (throughput = 14095.20 nanoseconds per iteration)
===================================================================
scalar cached set, 52threads: 26522986 nanoseconds (bandwidth = 0.39 MB/s) (throughput = 20402.30 nanoseconds per iteration)
scalar cached get, 52threads: 19028640 nanoseconds (bandwidth = 0.55 MB/s) (throughput = 14637.42 nanoseconds per iteration)
scalar uncached set, 52threads: 18223474 nanoseconds (bandwidth = 0.57 MB/s) (throughput = 14018.06 nanoseconds per iteration)
scalar uncached get, 52threads: 18593858 nanoseconds (bandwidth = 0.56 MB/s) (throughput = 14302.97 nanoseconds per iteration)
===================================================================
scalar cached set, 53threads: 27520628 nanoseconds (bandwidth = 0.39 MB/s) (throughput = 20770.29 nanoseconds per iteration)
scalar cached get, 53threads: 19534563 nanoseconds (bandwidth = 0.54 MB/s) (throughput = 14743.07 nanoseconds per iteration)
scalar uncached set, 53threads: 18610927 nanoseconds (bandwidth = 0.57 MB/s) (throughput = 14045.98 nanoseconds per iteration)
scalar uncached get, 53threads: 18526625 nanoseconds (bandwidth = 0.57 MB/s) (throughput = 13982.36 nanoseconds per iteration)
===================================================================
scalar cached set, 54threads: 29403799 nanoseconds (bandwidth = 0.37 MB/s) (throughput = 21780.59 nanoseconds per iteration)
scalar cached get, 54threads: 19451844 nanoseconds (bandwidth = 0.56 MB/s) (throughput = 14408.77 nanoseconds per iteration)
scalar uncached set, 54threads: 20031465 nanoseconds (bandwidth = 0.54 MB/s) (throughput = 14838.12 nanoseconds per iteration)
scalar uncached get, 54threads: 18825519 nanoseconds (bandwidth = 0.57 MB/s) (throughput = 13944.83 nanoseconds per iteration)
===================================================================
scalar cached set, 55threads: 30705178 nanoseconds (bandwidth = 0.36 MB/s) (throughput = 22331.04 nanoseconds per iteration)
scalar cached get, 55threads: 20161574 nanoseconds (bandwidth = 0.55 MB/s) (throughput = 14662.96 nanoseconds per iteration)
scalar uncached set, 55threads: 19487317 nanoseconds (bandwidth = 0.56 MB/s) (throughput = 14172.59 nanoseconds per iteration)
scalar uncached get, 55threads: 18261559 nanoseconds (bandwidth = 0.60 MB/s) (throughput = 13281.13 nanoseconds per iteration)
===================================================================
scalar cached set, 56threads: 29273199 nanoseconds (bandwidth = 0.38 MB/s) (throughput = 20909.43 nanoseconds per iteration)
scalar cached get, 56threads: 20994229 nanoseconds (bandwidth = 0.53 MB/s) (throughput = 14995.88 nanoseconds per iteration)
scalar uncached set, 56threads: 19503792 nanoseconds (bandwidth = 0.57 MB/s) (throughput = 13931.28 nanoseconds per iteration)
scalar uncached get, 56threads: 20116147 nanoseconds (bandwidth = 0.56 MB/s) (throughput = 14368.68 nanoseconds per iteration)
===================================================================
scalar cached set, 57threads: 30938862 nanoseconds (bandwidth = 0.37 MB/s) (throughput = 21711.48 nanoseconds per iteration)
scalar cached get, 57threads: 21731334 nanoseconds (bandwidth = 0.52 MB/s) (throughput = 15250.06 nanoseconds per iteration)
scalar uncached set, 57threads: 20020230 nanoseconds (bandwidth = 0.57 MB/s) (throughput = 14049.28 nanoseconds per iteration)
scalar uncached get, 57threads: 19637263 nanoseconds (bandwidth = 0.58 MB/s) (throughput = 13780.54 nanoseconds per iteration)
===================================================================
scalar cached set, 58threads: 30140114 nanoseconds (bandwidth = 0.38 MB/s) (throughput = 20786.29 nanoseconds per iteration)
scalar cached get, 58threads: 21495490 nanoseconds (bandwidth = 0.54 MB/s) (throughput = 14824.48 nanoseconds per iteration)
scalar uncached set, 58threads: 20984226 nanoseconds (bandwidth = 0.55 MB/s) (throughput = 14471.88 nanoseconds per iteration)
scalar uncached get, 58threads: 20498463 nanoseconds (bandwidth = 0.57 MB/s) (throughput = 14136.87 nanoseconds per iteration)
===================================================================
scalar cached set, 59threads: 31254005 nanoseconds (bandwidth = 0.38 MB/s) (throughput = 21189.16 nanoseconds per iteration)
scalar cached get, 59threads: 21990602 nanoseconds (bandwidth = 0.54 MB/s) (throughput = 14908.88 nanoseconds per iteration)
scalar uncached set, 59threads: 19834334 nanoseconds (bandwidth = 0.59 MB/s) (throughput = 13447.01 nanoseconds per iteration)
scalar uncached get, 59threads: 20997916 nanoseconds (bandwidth = 0.56 MB/s) (throughput = 14235.88 nanoseconds per iteration)
===================================================================
scalar cached set, 60threads: 32833679 nanoseconds (bandwidth = 0.37 MB/s) (throughput = 21889.12 nanoseconds per iteration)
scalar cached get, 60threads: 22257266 nanoseconds (bandwidth = 0.54 MB/s) (throughput = 14838.18 nanoseconds per iteration)
scalar uncached set, 60threads: 21026281 nanoseconds (bandwidth = 0.57 MB/s) (throughput = 14017.52 nanoseconds per iteration)
scalar uncached get, 60threads: 20949798 nanoseconds (bandwidth = 0.57 MB/s) (throughput = 13966.53 nanoseconds per iteration)
===================================================================
scalar cached set, 61threads: 33410776 nanoseconds (bandwidth = 0.37 MB/s) (throughput = 21908.71 nanoseconds per iteration)
scalar cached get, 61threads: 22535059 nanoseconds (bandwidth = 0.54 MB/s) (throughput = 14777.09 nanoseconds per iteration)
scalar uncached set, 61threads: 22172723 nanoseconds (bandwidth = 0.55 MB/s) (throughput = 14539.49 nanoseconds per iteration)
scalar uncached get, 61threads: 20779790 nanoseconds (bandwidth = 0.59 MB/s) (throughput = 13626.09 nanoseconds per iteration)
===================================================================
scalar cached set, 62threads: 32335236 nanoseconds (bandwidth = 0.38 MB/s) (throughput = 20861.44 nanoseconds per iteration)
scalar cached get, 62threads: 23383845 nanoseconds (bandwidth = 0.53 MB/s) (throughput = 15086.35 nanoseconds per iteration)
scalar uncached set, 62threads: 21831015 nanoseconds (bandwidth = 0.57 MB/s) (throughput = 14084.53 nanoseconds per iteration)
scalar uncached get, 62threads: 21472829 nanoseconds (bandwidth = 0.58 MB/s) (throughput = 13853.44 nanoseconds per iteration)
===================================================================
scalar cached set, 63threads: 34048310 nanoseconds (bandwidth = 0.37 MB/s) (throughput = 21617.97 nanoseconds per iteration)
scalar cached get, 63threads: 23474419 nanoseconds (bandwidth = 0.54 MB/s) (throughput = 14904.39 nanoseconds per iteration)
scalar uncached set, 63threads: 21282604 nanoseconds (bandwidth = 0.59 MB/s) (throughput = 13512.76 nanoseconds per iteration)
scalar uncached get, 63threads: 21757277 nanoseconds (bandwidth = 0.58 MB/s) (throughput = 13814.14 nanoseconds per iteration)
===================================================================
scalar cached set, 64threads: 35219700 nanoseconds (bandwidth = 0.36 MB/s) (throughput = 22012.31 nanoseconds per iteration)
scalar cached get, 64threads: 22812182 nanoseconds (bandwidth = 0.56 MB/s) (throughput = 14257.61 nanoseconds per iteration)
scalar uncached set, 64threads: 21755992 nanoseconds (bandwidth = 0.59 MB/s) (throughput = 13597.50 nanoseconds per iteration)
scalar uncached get, 64threads: 22108579 nanoseconds (bandwidth = 0.58 MB/s) (throughput = 13817.86 nanoseconds per iteration)
===================================================================