work_contract/src/executable/benchmark/main.cpp at 4732eccf9806d354bc68a1d686f5738f4b8a52ca · buildingcpp/work_contract · GitHub

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
#include <include/jthread.h>
#include <cstddef>
#include <iostream>
#include <memory>
#include <chrono>
#include <condition_variable>
#include <mutex>
#include <cstdint>
#include <atomic>
#include <vector>
#include <cmath>
#include <iomanip>
#include <span>
#include <fmt/format.h>

using namespace std::chrono;


// it might look a bit odd to hard code the cpus to use in the benchmark
// but one of my test machines has a blend of different cpus and I can't seem
// to disable hyperthreading on that machine via the bios nor the terminal.
// This made ensuring that the benchmark is running on the preferred physical cores
// a bit difficult so I just did it this way until I have time to write a tool
// to dynamically figure out the optimal cores to use.
//
// cores[i] is the logical cpu that worker thread i is pinned to (see
// create_worker_threads). The commented out lines are alternative layouts;
// exactly one definition of 'cores' should be active at a time.
//int cores[] = {0,2,4,6,8,10,12,14};
//int cores[] = {0,2,4,6,8,10,12,14,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31};
int cores[] = {16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31};
//int cores[] = {16,20,24,28,17,21,25,29,18,22,26,30,19,23,27,31};
// logical cpu that the main (coordinating) thread is pinned to
int mainCpu = 0;

// how long the main thread sleeps while the worker threads process tasks
static auto constexpr test_duration = 1s;
// number of tasks added to each test harness; also the size of the per task counters
static auto constexpr max_tasks = (1 << 13);
// number of entries in 'cores'; upper bound of the thread count sweep in main
static auto constexpr max_threads = std::extent_v<decltype(cores)>;

// containers for gathering stats during test
// per task and per thread totals; worker threads add their thread local counts
// into these when a test run ends and print_stats resets them to zero
std::array<std::atomic<std::size_t>, max_tasks> taskExecutionCount;
std::array<std::atomic<std::size_t>, max_threads> threadExecutionCount;
// per thread, per task counters; zeroed by each worker thread on start up.
// NOTE(review): nothing in this file increments these - presumably done by
// code in test_harness.h; confirm there.
thread_local std::array<std::size_t, max_tasks> tlsExecutionCount;

// index of the current worker thread (assigned in create_worker_threads)
std::size_t thread_local tlsThreadIndex;
// NOTE(review): not referenced in this file - presumably used by test_harness.h
std::size_t thread_local tlsCurrentTaskId;

// start/stop signals which the worker threads spin on
std::atomic<bool> startTest = false;
std::atomic<bool> endTest = false;
// the worker threads of the test run which is currently executing
std::vector<bcpp::detail::jthread> testThreads;

#include "./test_harness.h"


//==============================================================================
// Restricts the calling thread to the single logical cpu 'value'.
//
// returns true if the affinity was applied. On platforms other than linux
// no attempt is made and true is returned unconditionally.
bool set_cpu_affinity
(
    int value
)
{
#ifdef __linux__
    // build a mask which contains only the requested cpu
    cpu_set_t affinityMask;
    CPU_ZERO(&affinityMask);
    CPU_SET(value, &affinityMask);
    // pthread_setaffinity_np reports success as zero
    auto const status = pthread_setaffinity_np(pthread_self(), sizeof(affinityMask), &affinityMask);
    return (status == 0);
#else
    // no affinity support implemented for this platform (on macOS it would
    // have to go through thread_policy_set, which is more involved)
    static_cast<void>(value);
    return true;
#endif
}


//=============================================================================
auto gather_stats
(
    auto const input
) -> std::tuple<std::size_t, long double, long double, long double>
{
    std::size_t total = 0;
    for (auto const & v : input)
        total += v;
    long double mean = ((long double)total / input.size());
    long double k = 0;
    for (auto const & v : input)
        k += ((v - mean) * (v - mean));
    k /= (input.size() - 1);
    auto sd = std::sqrt(k);
    return {total, mean, sd, sd / mean};
}


//=============================================================================
void print_stats
(
    auto numThreads,
    auto testDurationInSeconds
)
{
    auto [taskTotal, taskMean, taskSd, taskCv] = gather_stats(std::span(taskExecutionCount.data(), taskExecutionCount.size()));
    auto [threadTotal, threadMean, threadSd, threadCv] = gather_stats(std::span(threadExecutionCount.begin(), numThreads));
    std::cout <<fmt::format("{:<15}{:<20}{:<25}{:<10.4f}{:<10.4f}\n", numThreads, taskTotal, (int)((taskTotal / testDurationInSeconds) / numThreads), taskCv, threadCv);

    for (auto & _ : taskExecutionCount)
        _ = 0;
    for (auto & _ : threadExecutionCount)
        _ = 0;
}


//=============================================================================
// Synthetic benchmark task: hashes a fixed string N times.
//
// Each iteration builds a std::string from the literal and hashes it. The
// result is multiplied into a volatile accumulator so that every iteration
// performs a real read and write of it. The accumulator starts at zero, so
// the value returned is always zero; only the time spent matters.
template <std::size_t N>
auto hash_task()
{
    static auto constexpr payload = "guess what? chicken butt!";
    auto volatile sink = 0;
    for (auto remaining = N; remaining != 0; --remaining)
        sink = sink * std::hash<std::string>()(payload);
    return sink;
}


//=============================================================================
// Creates numWorkerThreads worker threads and returns once every one of them
// has finished its set up and is spinning, waiting for startTest.
//
// numWorkerThreads: number of threads to create. It is used to index 'cores'
//                   and 'threadExecutionCount' without a bounds check, so it
//                   must not exceed max_threads.
// work:             callable which each thread invokes repeatedly between
//                   startTest and endTest. Each thread gets its own copy.
auto create_worker_threads
(
    // prep the test, create the worker threads, wait until all are ready
    std::size_t numWorkerThreads,
    std::invocable auto && work
)
{
    // make sure that the new threads neither start nor stop prematurely
    startTest = false;
    endTest = false;

    testThreads.resize(numWorkerThreads);
    // captured by reference below; this is safe because this function does not
    // return until every thread has incremented it, and the threads do not
    // touch it again afterwards
    std::atomic<std::size_t> readyThreadCount = 0;
    auto index = 0;
    for (auto & thread : testThreads)
    {
        thread = bcpp::detail::jthread([&readyThreadCount, work, threadId = index]
                (
                ) mutable
                {
                    // pin this thread to its designated cpu (the result is ignored)
                    set_cpu_affinity(cores[threadId]);
                    tlsThreadIndex = threadId;
                    // start with clean thread local counters
                    for (auto & _ : tlsExecutionCount)
                        _ = 0;
                    readyThreadCount++;
                    // spin until the main thread starts the test
                    while (!startTest)
                        ;
                    // do the work until the main thread ends the test
                    while (!endTest)
                        work();
                    // copy tls stats to global
                    for (auto i = 0; i < max_tasks; ++i)
                    {
                        taskExecutionCount[i] += tlsExecutionCount[i];
                        threadExecutionCount[threadId] += tlsExecutionCount[i];
                    }
                });
        ++index;
    }

    // spin until all of the threads have reported that they are ready
    while (readyThreadCount != testThreads.size())
        ;
}


//=============================================================================
// Runs one timed test: creates the worker threads, lets them invoke
// threadFunction repeatedly for test_duration, stops and joins them and then
// prints the statistics for the run.
//
// numWorkerThreads: number of worker threads to use for this run
// threadFunction:   callable which each worker thread invokes in a loop
auto execute_test
(
    std::size_t numWorkerThreads,
    auto && threadFunction
)
{
    create_worker_threads(numWorkerThreads, threadFunction);

    // the interval is measured with a monotonic clock so that adjustments to
    // the wall clock during the run can not distort the measured duration
    using clock_type = std::chrono::steady_clock;

    // start test
    auto const startTime = clock_type::now();
    startTest = true;
    // wait for duration of test
    std::this_thread::sleep_for(test_duration);
    endTest = true;
    auto const stopTime = clock_type::now();
    // stop worker threads
    for (auto & testThread : testThreads)
    {
        testThread.request_stop();
        testThread.join();
    }

    // test completed
    // gather timing (as fractional seconds)
    auto const testDurationInSeconds = std::chrono::duration<double>(stopTime - startTime).count();
    print_stats(testThreads.size(), testDurationInSeconds);
}


//=============================================================================
template <algorithm T>
auto test_algorithm
(
    std::size_t numWorkerThreads,
    std::invocable auto && task
)
{
    test_harness<T, std::decay_t<decltype(task)>> testHarness(max_tasks);
    for (auto i = 0; i < max_tasks; ++i)
        testHarness.add_task(task);
    execute_test(numWorkerThreads, [&](){testHarness.process_next_task();});
}


//=============================================================================
// Measures the average time, in nanoseconds, which a single invocation of
// 'task' takes when it is run in a tight loop on one thread for one second.
//
// task:    the callable to measure; its return value is accumulated so that
//          the call has an observable use
// returns: elapsed nanoseconds divided by the number of invocations completed
auto get_task_duration
(
    // this function tests the actual time it takes to execute a task without the multithreaded frameworks
    std::invocable auto && task
)
{
    std::size_t counter = 0;
    std::size_t total = 0;
    // these flags are shared between threads, so they are atomics
    // (volatile provides no inter-thread synchronization)
    std::atomic<bool> start = false;
    std::atomic<bool> end = false;
    std::atomic<bool> ready = false;
    bcpp::detail::jthread thread([&]()
            {
                ready = true;
                while (!start)
                    ;
                while (!end)
                {
                    counter++;
                    total += task();
                }
            });

    while (!ready)
        ;
    // monotonic clock: immune to adjustments of the wall clock during the run
    auto const startTime = std::chrono::steady_clock::now();
    start = true;
    std::this_thread::sleep_for(std::chrono::seconds(1));
    end = true;
    auto const endTime = std::chrono::steady_clock::now();
    // the worker must have finished before 'counter' is read, otherwise the
    // read races with the worker's final increments
    thread.join();
    auto const elapsedNanoseconds = std::chrono::duration_cast<std::chrono::nanoseconds>(endTime - startTime).count();
    return (static_cast<double>(elapsedNanoseconds) / counter);
}


//=============================================================================
// Benchmark entry point. Pins the main thread to mainCpu and then, for each
// task size, prints the stand-alone duration of the task followed by one
// results table per algorithm. Each table sweeps the number of worker threads
// from 2 through max_threads.
int main
(
    int,
    char const **
)
{
    set_cpu_affinity(mainCpu);

    auto run_test = []<typename T>
    (
        T task,
        std::string title
    )
    {
        // terminal escape sequences: bold on / reset attributes
        std::string const emphasis = "\033[1m";
        std::string const reset = "\033[0m";
        std::string const rule = "==================================================================================\n";

        std::cout << fmt::format("\n\nTask {}, average task duration is {:.2f} ns\n", title, get_task_duration(task));
        auto const header = fmt::format("{:<15}{:<20}{:<25}{:<10}{:<10}\n", "Thread Count:", "Tasks per Second:", "Tasks per Thread/sec:", "Task cv:", "Thread cv:");

        // prints the banner which precedes the results table of one algorithm
        auto print_banner = [&](char const * algorithmName)
        {
            std::cout << "\n" << emphasis << rule << algorithmName << header << rule << reset;
        };

        print_banner("TBB concurrent_queue:\n");
        for (auto threadCount = 2ull; threadCount <= max_threads; ++threadCount)
            test_algorithm<algorithm::tbb>(threadCount, task);

        print_banner("Strauss MPMC queue:\n");
        for (auto threadCount = 2ull; threadCount <= max_threads; ++threadCount)
            test_algorithm<algorithm::es>(threadCount, task);

        print_banner("MoodyCamel ConcurrentQueue:\n");
        for (auto threadCount = 2ull; threadCount <= max_threads; ++threadCount)
            test_algorithm<algorithm::moody_camel>(threadCount, task);

        print_banner("Work Contract:\n");
        for (auto threadCount = 2ull; threadCount <= max_threads; ++threadCount)
            test_algorithm<algorithm::work_contract>(threadCount, task);

        print_banner("Blocking Work Contract:\n");
        for (auto threadCount = 2ull; threadCount <= max_threads; ++threadCount)
            test_algorithm<algorithm::blocking_work_contract>(threadCount, task);
    };

    // the shorter the task, the more often the threads contend for the next one
    run_test(hash_task<0>, "maximum contention"); // approx 1.5ns
    run_test(hash_task<1>, "high contention"); // approx 17ns
    run_test(hash_task<64>, "medium contention"); // ~1100ns
    run_test(hash_task<256>, "low contention"); // ~4100ns

    return 0;
}