Scroll to navigation

std::hardware_destructive_interference_size,std::hardware_constructive_interference_size(3) C++ Standard Library std::hardware_destructive_interference_size,std::hardware_constructive_interference_size(3)

NAME

std::hardware_destructive_interference_size,std::hardware_constructive_interference_size - minimum offset to avoid false sharing, maximum size to promote true sharing

Synopsis


Defined in header <new>
inline constexpr std::size_t
hardware_destructive_interference_size = /*implementation-defined*/;     (1) (since C++17)
inline constexpr std::size_t
hardware_constructive_interference_size = /*implementation-defined*/;    (2) (since C++17)


1) Minimum offset between two objects to avoid false sharing. Guaranteed to be at
least alignof(std::max_align_t).


// Illustrates use (1): each member is aligned to the destructive-interference
// size, so cat and dog are placed at least that many bytes apart and writes to
// one are not expected to cause false sharing with the other.
struct keep_apart {
alignas(std::hardware_destructive_interference_size) std::atomic<int> cat;
alignas(std::hardware_destructive_interference_size) std::atomic<int> dog;
};


2) Maximum size of contiguous memory to promote true sharing. Guaranteed to be at
least alignof(std::max_align_t).


// Illustrates use (2): dog and puppy are grouped into one small object that is
// checked (below) to be no larger than the constructive-interference size.
struct together {
std::atomic<int> dog;
int puppy;
};
// Example owner type that embeds a `together` among other members.
struct kennel {
// Other data members...
// pack is aligned to its own size. alignas needs a power-of-two value, so this
// assumes sizeof(together) is one.
alignas(sizeof(together)) together pack;
// Other data members...
};
// Compile-time check that `together` does not exceed the constructive-interference size.
static_assert(sizeof(together) <= std::hardware_constructive_interference_size);

Notes


These constants provide a portable way to access the L1 data cache line size.


Feature-test macro: __cpp_lib_hardware_interference_size

Example


The program uses two threads that write (atomically) to the data members of the
given global objects. The first object fits in one cache line, so the two threads
contend for that line ("hardware interference", i.e. false sharing). The second
object keeps its data members on separate cache lines, so the "cache
synchronization" otherwise needed after each thread's writes is avoided.

// Run this code


#include <atomic>
#include <chrono>
#include <cstddef>
#include <iomanip>
#include <iostream>
#include <mutex>
#include <new>
#include <thread>


#ifndef __cpp_lib_hardware_interference_size
// The library does not advertise the interference-size constants, so define
// local stand-ins. 64 bytes is the cache-line size on x86-64 (compare
// L1_CACHE_BYTES, L1_CACHE_SHIFT, __cacheline_aligned, ...).
constexpr std::size_t hardware_destructive_interference_size = 64;
constexpr std::size_t hardware_constructive_interference_size = 64;
#else
// The library provides the constants: make them usable without qualification.
using std::hardware_destructive_interference_size;
using std::hardware_constructive_interference_size;
#endif


// Guards std::cout so that the worker threads' report lines do not interleave.
std::mutex cout_mutex;

// Number of atomic increments each worker performs; adjust to tune how long
// the benchmark runs.
constexpr int max_write_iterations = 10'000'000;


struct alignas(hardware_constructive_interference_size)
OneCacheLiner { // occupies one cache line
std::atomic_uint64_t x{};
std::atomic_uint64_t y{};
} oneCacheLiner;


// Occupies two cache lines: each 64-bit atomic counter is individually aligned
// to the destructive-interference size, so x and y do not share a line.
struct TwoCacheLiner
{
    alignas(hardware_destructive_interference_size) std::atomic_uint64_t x{};
    alignas(hardware_destructive_interference_size) std::atomic_uint64_t y{};
};

// Global instance shared by the twoCacheLinerThread workers.
TwoCacheLiner twoCacheLiner;


// Returns the current time point of std::chrono::high_resolution_clock.
inline auto now() noexcept
{
    using clock = std::chrono::high_resolution_clock;
    return clock::now();
}


// Benchmark worker for the "both counters in one cache line" case.
//
// Performs max_write_iterations relaxed atomic increments on one member of the
// global oneCacheLiner (x when xy is true, y otherwise) and prints the elapsed
// time in milliseconds while holding cout_mutex. It then overwrites that same
// member with the elapsed time so that the caller can read it back after
// joining the thread.
template<bool xy>
void oneCacheLinerThread() {
    // xy is a template constant, so the member is fixed per instantiation.
    auto& target = xy ? oneCacheLiner.x : oneCacheLiner.y;

    const auto start { now() };

    // The counter has the same type as max_write_iterations: this avoids a
    // signed/unsigned comparison and does not rely on an unqualified uint64_t
    // being visible without <cstdint>.
    for (int count{}; count != max_write_iterations; ++count)
        target.fetch_add(1, std::memory_order_relaxed);

    const std::chrono::duration<double, std::milli> elapsed { now() - start };
    std::lock_guard lk{cout_mutex};
    std::cout << "oneCacheLinerThread() spent " << elapsed.count() << " ms\n";
    // Implicit double -> unsigned integer conversion: the fractional part of
    // the millisecond count is dropped.
    target = elapsed.count();
}


// Benchmark worker for the "counters on separate cache lines" case.
//
// Performs max_write_iterations relaxed atomic increments on one member of the
// global twoCacheLiner (x when xy is true, y otherwise) and prints the elapsed
// time in milliseconds while holding cout_mutex. It then overwrites that same
// member with the elapsed time so that the caller can read it back after
// joining the thread.
template<bool xy>
void twoCacheLinerThread() {
    // xy is a template constant, so the member is fixed per instantiation.
    auto& target = xy ? twoCacheLiner.x : twoCacheLiner.y;

    const auto start { now() };

    // The counter has the same type as max_write_iterations: this avoids a
    // signed/unsigned comparison and does not rely on an unqualified uint64_t
    // being visible without <cstdint>.
    for (int count{}; count != max_write_iterations; ++count)
        target.fetch_add(1, std::memory_order_relaxed);

    const std::chrono::duration<double, std::milli> elapsed { now() - start };
    std::lock_guard lk{cout_mutex};
    std::cout << "twoCacheLinerThread() spent " << elapsed.count() << " ms\n";
    // Implicit double -> unsigned integer conversion: the fractional part of
    // the millisecond count is dropped.
    target = elapsed.count();
}


int main() {
std::cout << "__cpp_lib_hardware_interference_size "
# ifdef __cpp_lib_hardware_interference_size
" = " << __cpp_lib_hardware_interference_size << '\n';
# else
"is not defined, use " << hardware_destructive_interference_size << " as fallback\n";
# endif


std::cout
<< "hardware_destructive_interference_size == "
<< hardware_destructive_interference_size << '\n'
<< "hardware_constructive_interference_size == "
<< hardware_constructive_interference_size << "\n\n";


std::cout
<< std::fixed << std::setprecision(2)
<< "sizeof( OneCacheLiner ) == " << sizeof( OneCacheLiner ) << '\n'
<< "sizeof( TwoCacheLiner ) == " << sizeof( TwoCacheLiner ) << "\n\n";


constexpr int max_runs{4};


int oneCacheLiner_average{0};
for (auto i{0}; i != max_runs; ++i) {
std::thread th1{oneCacheLinerThread<0>};
std::thread th2{oneCacheLinerThread<1>};
th1.join(); th2.join();
oneCacheLiner_average += oneCacheLiner.x + oneCacheLiner.y;
}
std::cout << "Average T1 time: " << (oneCacheLiner_average / max_runs / 2) << " ms\n\n";


int twoCacheLiner_average{0};
for (auto i{0}; i != max_runs; ++i) {
std::thread th1{twoCacheLinerThread<0>};
std::thread th2{twoCacheLinerThread<1>};
th1.join(); th2.join();
twoCacheLiner_average += twoCacheLiner.x + twoCacheLiner.y;
}
std::cout << "Average T2 time: " << (twoCacheLiner_average / max_runs / 2) << " ms\n\n";


std::cout << "Ratio T1/T2:~ " << 1.*oneCacheLiner_average/twoCacheLiner_average << '\n';
}

Possible output:


__cpp_lib_hardware_interference_size is not defined, use 64 as fallback
hardware_destructive_interference_size == 64
hardware_constructive_interference_size == 64


sizeof( OneCacheLiner ) == 64
sizeof( TwoCacheLiner ) == 128


oneCacheLinerThread() spent 634.25 ms
oneCacheLinerThread() spent 651.55 ms
oneCacheLinerThread() spent 990.23 ms
oneCacheLinerThread() spent 1033.94 ms
oneCacheLinerThread() spent 838.14 ms
oneCacheLinerThread() spent 883.25 ms
oneCacheLinerThread() spent 873.02 ms
oneCacheLinerThread() spent 914.26 ms
Average T1 time: 852 ms


twoCacheLinerThread() spent 119.22 ms
twoCacheLinerThread() spent 127.91 ms
twoCacheLinerThread() spent 114.17 ms
twoCacheLinerThread() spent 126.41 ms
twoCacheLinerThread() spent 125.17 ms
twoCacheLinerThread() spent 126.06 ms
twoCacheLinerThread() spent 117.94 ms
twoCacheLinerThread() spent 129.03 ms
Average T2 time: 122 ms


Ratio T1/T2:~ 6.98

See also


hardware_concurrency [static]   returns the number of concurrent threads supported by the implementation
                                (public static member function of std::thread)
hardware_concurrency [static]   returns the number of concurrent threads supported by the implementation
                                (public static member function of std::jthread)

2022.07.31 http://cppreference.com