diff --git a/emper/Emper.cpp b/emper/Emper.cpp index f61c3bc9dd8649492412fca8bce5f05acb0a993c..5c250d5ccf30c5ab0f8c264908683d0fbd5dbaf1 100644 --- a/emper/Emper.cpp +++ b/emper/Emper.cpp @@ -2,6 +2,8 @@ // Copyright © 2020-2022 Florian Schmaus #include "Emper.hpp" +#include + #include #include #include @@ -204,4 +206,11 @@ auto nanosleep(const struct timespec* rqtp) -> bool { return _sleep(ts); } +auto nanosleep(std::uint64_t ns) -> bool { + auto tv_sec = static_cast<__kernel_time64_t>(ns / 1'000'000'0000); + auto tv_nsec = static_cast(ns % 1'000'000'0000); + emper::io::AlarmFuture::Timespec ts = {.tv_sec = tv_sec, .tv_nsec = tv_nsec}; + return _sleep(ts); +} + } // namespace emper diff --git a/emper/include/emper.hpp b/emper/include/emper.hpp index b3823f6728bcf9d4c1c852543cff3f9975ed658e..28941085d04237d0bc5ce32a586d70676e2e498e 100644 --- a/emper/include/emper.hpp +++ b/emper/include/emper.hpp @@ -1,8 +1,9 @@ // SPDX-License-Identifier: LGPL-3.0-or-later -// Copyright © 2020-2021 Florian Schmaus +// Copyright © 2020-2022 Florian Schmaus #pragma once #include +#include #include #include @@ -57,5 +58,6 @@ void yield(); auto sleep(unsigned int seconds) -> bool; auto nanosleep(const struct timespec* rqtp) -> bool; +auto nanosleep(std::uint64_t ns) -> bool; } // namespace emper diff --git a/eval/BFP9000.cpp b/eval/BFP9000.cpp new file mode 100644 index 0000000000000000000000000000000000000000..cab489fdd64bb377281556671ec50815d049764a --- /dev/null +++ b/eval/BFP9000.cpp @@ -0,0 +1,306 @@ +// SPDX-License-Identifier: LGPL-3.0-or-later +// Copyright © 2022 Florian Schmaus +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include // IWYU pragma: keep +#include +#include + +#include "CountingPrivateSemaphore.hpp" +#include "Fibril.hpp" +#include "Runtime.hpp" +#include "emper-common.h" +#include "emper.hpp" +#include "lib/math.hpp" + +namespace chrn = std::chrono; +namespace fs = std::filesystem; +namespace math = emper::lib::math; +namespace po = boost::program_options; + +using Clock = std::chrono::high_resolution_clock; + +static double utilization = 0.8; +static std::uint64_t pulse_target_work_ns = 233'000; +static unsigned int pulse_concurrency_level = 4; +static std::uint_fast64_t seed = 20141007; +static unsigned int duration_ms = 10'000; +static workerid_t worker_count = Runtime::getDefaultWorkerCount(); +static bool trace = false; +static boost::optional result_dir; + +// Derived values from the user supplied configuration values. +static std::uint64_t duration_ns; +static std::uint64_t work_duration_ns; +static std::uint64_t sleep_duration_ns; +static std::uint64_t number_of_pulses; +static std::uint64_t pulse_target_sleep_ns; + +struct PulseTrace { + chrn::time_point sleep_finish; + std::uint64_t work_ns; + chrn::time_point work_start; + chrn::time_point work_finish; + std::uint64_t sleep_ns; +}; + +class Pulser { + private: + std::mt19937_64 rng; + + std::uint64_t remaining_work_ns; + std::uint64_t remaining_sleep_ns; + + public: + std::vector traces; + + private: + static void busyWait(std::uint64_t work_ns) { + const auto deadline = Clock::now() + std::chrono::nanoseconds(work_ns); + while (Clock::now() < deadline) { + } + } + + // NOLINTNEXTLINE(clang-diagnostic-unknown-attributes) + emper_fibril static void work(std::uint64_t work_ns) { + if (pulse_concurrency_level) { + std::uint64_t work_per_concurrency_level_ns = work_ns / pulse_concurrency_level; + + Fibril fibril; + for (unsigned int i = 0; i < pulse_concurrency_level; ++i) { + fibril.spawn(&Pulser::busyWait, work_per_concurrency_level_ns); + } + } else { + busyWait(work_ns); + } + } + + auto getNanos(std::binomial_distribution& dist, std::uint64_t remaining_ns, + std::uint64_t target_ns, std::uint64_t pulse_number) -> std::uint64_t { + assert(remaining_ns); + + std::uint64_t nanos = dist(rng); + + if (nanos > remaining_ns) { + // If the random distribution returned a value higher than the + // remaining value, then we need to cap the value. + nanos = remaining_ns; + + if (nanos > target_ns) { + std::uint64_t remaining_pulses = number_of_pulses - pulse_number - 1; + nanos = nanos / remaining_pulses; + } + } + + return nanos; + } + + public: + Pulser(std::uint_fast64_t seed) + : rng(seed), remaining_work_ns(work_duration_ns), remaining_sleep_ns(sleep_duration_ns){}; + + void pulse() { + std::binomial_distribution work_dist(2 * pulse_target_work_ns, 0.5); + std::binomial_distribution sleep_dist(2 * pulse_target_sleep_ns, 0.5); + + if (trace) { + traces = std::vector(number_of_pulses); + traces[0].sleep_finish = Clock::now(); + } + + for (std::uint64_t i = 0; i < number_of_pulses; ++i) { + const auto work_ns = getNanos(work_dist, remaining_work_ns, pulse_target_work_ns, i); + if (trace) { + traces[i].work_start = Clock::now(); + traces[i].work_ns = work_ns; + } + work(work_ns); + if (trace) traces[i].work_finish = Clock::now(); + remaining_work_ns -= work_ns; + + const auto sleep_ns = getNanos(sleep_dist, remaining_sleep_ns, pulse_target_sleep_ns, i); + if (trace) { + traces[i].sleep_ns = sleep_ns; + auto next_i = i + 1; + if (next_i < number_of_pulses) { + traces[next_i].sleep_finish = Clock::now() + chrn::nanoseconds(sleep_ns); + } + } + emper::nanosleep(sleep_ns); + remaining_sleep_ns -= sleep_ns; + } + } +}; + +static void bfp9000() { + duration_ns = duration_ms * 1'000'000L; + work_duration_ns = static_cast(static_cast(duration_ns) * utilization); + sleep_duration_ns = duration_ns - work_duration_ns; + number_of_pulses = work_duration_ns / pulse_target_work_ns; + pulse_target_sleep_ns = sleep_duration_ns / number_of_pulses; + + std::mt19937_64 rng(seed); + std::vector pulsers; + pulsers.reserve(worker_count); + for (auto i = 0; i < worker_count; ++i) { + pulsers.emplace_back(rng()); + } + + // clang-format off + std::cout << std::boolalpha + << " *** BFP 9K ***" << std::endl + << " The Big Fancy Pulser 9000" << std::endl + << std::endl + << "duration-ns: " << duration_ns << std::endl + << "work-duation-ns: " << work_duration_ns << std::endl + << "sleep-duration-ns: " << sleep_duration_ns << std::endl + << "pulse-target-work-ns: " << pulse_target_work_ns << std::endl + << "pulse-target-sleep-ns: " << pulse_target_sleep_ns << std::endl + << "number-of-pulses: " << number_of_pulses << std::endl + << "pulse-concurrency-level: " << pulse_concurrency_level << std::endl + << "worker-count: " << worker_count << std::endl + << "trace: " << trace << std::endl + << "seed: " << seed << std::endl + ; + // clang-format on + + const auto start = Clock::now(); + { + Runtime runtime(worker_count); + runtime.executeAndWait([&pulsers] { + CPS cps; + for (auto& pulser : pulsers) { + spawn([&pulser] { pulser.pulse(); }, cps); + } + cps.wait(); + }); + } + const auto end = Clock::now(); + const auto actual_duration_us = chrn::duration_cast(end - start); + + std::cout << actual_duration_us.count() << " us" << std::endl; + + if (!trace) return; + + std::uint64_t actual_total_work_ns = 0; + std::uint64_t actual_total_sleep_ns = 0; + std::vector latencies; + latencies.reserve(pulsers.size() * number_of_pulses); + + std::optional latencies_file_strm; + std::optional work_file_strm; + std::optional sleep_file_strm; + if (result_dir) { + auto latencies_file = *result_dir / "latencies"; + latencies_file_strm = std::ofstream(latencies_file); + + auto work_file = *result_dir / "work"; + work_file_strm = std::ofstream(work_file); + + auto sleep_file = *result_dir / "sleep"; + sleep_file_strm = std::ofstream(sleep_file); + } + + for (auto& pulser : pulsers) { + std::uint64_t pulser_work_ns = 0; + std::uint64_t pulser_sleep_ns = 0; + for (auto& trace : pulser.traces) { + pulser_work_ns += trace.work_ns; + pulser_sleep_ns += trace.sleep_ns; + + auto pulse_latency = + chrn::duration_cast(trace.work_start - trace.sleep_finish); + + assert(pulse_latency.count()); + latencies.push_back(pulse_latency); + + if (!result_dir) continue; + + *latencies_file_strm << pulse_latency.count() << std::endl; + *work_file_strm << trace.work_ns << std::endl; + *sleep_file_strm << trace.sleep_ns << std::endl; + } + + actual_total_work_ns += pulser_work_ns; + actual_total_sleep_ns += pulser_sleep_ns; + } + + std::sort(latencies.begin(), latencies.end()); + + const double median_latency = [&latencies] { + std::size_t latencies_size = latencies.size(); + std::size_t middle_or_right_index = latencies_size / 2; + auto middle_or_right_value = static_cast(latencies[middle_or_right_index].count()); + if (latencies_size % 2) { + return middle_or_right_value; + } + + auto left = static_cast(latencies[middle_or_right_index - 1].count()); + return (left + middle_or_right_value) / 2.0; + }(); + auto min_latency = latencies.front(); + auto max_latency = latencies.back(); + auto accumulated_latency = + std::accumulate(latencies.begin(), latencies.end(), chrn::nanoseconds(0)); + double average_latency = + static_cast(accumulated_latency.count()) / static_cast(latencies.size()); + + // clang-format off + std::cout << "actual-total-work-ns: " << actual_total_work_ns << std::endl + << "actual-total-sleep-ns: " << actual_total_sleep_ns << std::endl + << "median-latency: " << median_latency << std::endl + << "average-latency: " << average_latency << std::endl + << "min-latency: " << min_latency.count() << std::endl + << "max-latency: " << max_latency.count() << std::endl + ; + // clang-format on +} + +auto main(int argc, char* argv[]) -> int { + po::options_description desc("Big Fancy Pulser 9000 - Options"); + // clang-format off + desc.add_options() + ("utilization,u", po::value(&utilization)->default_value(utilization), "The utilization wihtin the range (0, 1.0]") + ("pulse-target-work-ns,p", po::value(&pulse_target_work_ns)->default_value(pulse_target_work_ns), "The target duration in nanoseconds of a single pulse, i.e. a work unit") + ("pulse-concurrency-level,c", po::value(&pulse_concurrency_level)->default_value(pulse_concurrency_level), "The concurrency level of a single pulse") + ("seed,s", po::value(&seed)->default_value(seed), "The seed of the random number generator") + ("duration-ms,d", po::value(&duration_ms)->default_value(duration_ms), "The duration of the benchmark in milliseconds") + ("worker-count,w", po::value(&worker_count)->default_value(worker_count), "The number of worker threads") + ("trace,t", po::bool_switch(&trace)->default_value(trace), "Trace the execution") + ("result-dir,r", po::value(&result_dir), "A directory where to write the evaluation results to") + ("help,h", "Show help") + ; + // clang-format on + + auto parse_result = po::command_line_parser(argc, argv).options(desc).run(); + + po::variables_map vm; + po::store(parse_result, vm); + po::notify(vm); + + if (vm.count("help")) { + std::cout << desc << std::endl; + return EXIT_SUCCESS; + } + + try { + bfp9000(); + } catch (std::exception& e) { + std::cerr << e.what() << std::endl; + return EXIT_FAILURE; + } +} diff --git a/eval/meson.build b/eval/meson.build index 9d2593bb956b123fbf1d68cc763d1543bdfa1a62..57e6b9f31b22637ee335ef7df2ddd84f69624e37 100644 --- a/eval/meson.build +++ b/eval/meson.build @@ -22,6 +22,12 @@ if cpp_can_link_with_boost_program_options 'Pulse.cpp', dependencies: [emper_dep, boost_program_options_dep], ) + + executable( + 'bfp9k', + 'BFP9000.cpp', + dependencies: [emper_dep, boost_program_options_dep], + ) endif std_atomic_wait_notify_code = '''\ diff --git a/iwyu-mappings.imp b/iwyu-mappings.imp index a3161cf2e57eaf67e3555bc74870b5fff0d1dbba..d363fabea071b8ee4edc3b266606db4a874f6eae 100644 --- a/iwyu-mappings.imp +++ b/iwyu-mappings.imp @@ -20,6 +20,7 @@ { include: ["", "private", "", "public"], }, { include: ["", "private", "", "public"], }, { include: ["", "private", "", "public"], }, + { include: ["", "private", "", "public"], }, { include: ["", "private", "", "public"], }, { symbol: ["__kernel_timespec", "private", "", "public" ] },