From 9f545ba08fad48cba0cfbdfaa4777d17669e2dc7 Mon Sep 17 00:00:00 2001
From: Florian Fischer <florian.fischer@muhq.space>
Date: Tue, 21 Sep 2021 14:16:29 +0200
Subject: [PATCH] [IoContext] replace fancy CountingTryLock with simple CQ
 emptiness check

---
 emper/io/IoContext.cpp                     | 61 ++++++--------------
 emper/io/IoContext.hpp                     | 22 +------
 emper/lib/sync/CountingTryLock.hpp         | 67 ----------------------
 emper/lib/sync/PseudoCountingTryLock.hpp   | 25 --------
 emper/sleep_strategy/PipeSleepStrategy.cpp |  1 +
 meson.build                                |  3 -
 meson_options.txt                          |  7 ---
 7 files changed, 20 insertions(+), 166 deletions(-)
 delete mode 100644 emper/lib/sync/CountingTryLock.hpp
 delete mode 100644 emper/lib/sync/PseudoCountingTryLock.hpp

diff --git a/emper/io/IoContext.cpp b/emper/io/IoContext.cpp
index 6a867a05..bc5a01cb 100644
--- a/emper/io/IoContext.cpp
+++ b/emper/io/IoContext.cpp
@@ -220,37 +220,30 @@ template void IoContext::submitAndWait<CallerEnvironment::ANYWHERE>(Future &futu
 template <CallerEnvironment callerEnvironment>
 auto IoContext::reapCompletions(ContinuationBuffer &continuationFibers) -> unsigned {
 	unsigned reReapCount = 0;
-	uint32_t maxRaceFreeCompleterAttempts = 1;
 
 	using Completion = std::pair<int32_t, void *>;
 	// vector to store seen cqes to make the critical section
 	// where cq_lock is held as small as possible
 	std::array<Completion, CQE_BATCH_COUNT> reapedCompletions;
 
-	// this label is not used for callerEnvironment::ANYWHERE and thus has to be
-	// annotated with ATTR_UNUSED
-reap_cqes:
-	ATTR_UNUSED;
-
 	// never reap completions on the global IoContext
 	assert(this != runtime.globalIo);
 
 	LOGD("Reaping completions for worker " << std::to_string(worker->getWorkerId()));
+// this label is not used for callerEnvironment::ANYWHERE and thus has to be
+// annotated with ATTR_UNUSED
+reap_cqes:
+	ATTR_UNUSED;
 	std::array<struct io_uring_cqe *, CQE_BATCH_COUNT> cqes;
 
 	if constexpr (needsCqLock) {
 		// Someone else is currently reaping completions
-		if constexpr (callerEnvironment == CallerEnvironment::EMPER) {
-			if (unlikely(!cq_lock.try_lock())) {
-				LOGD("worker unsuccessful try_lock");
-				return 0;
-			}
-		} else {
-			if (!cq_lock.try_lock_or_increment()) {
-				LOGD("Global completer unsuccessful try_lock_or_increment");
-				return 0;
-			}
+		if (unlikely(!cq_lock.try_lock())) {
+			LOGD("unsuccessful try_lock");
+			return 0;
+		}
 
+		if constexpr (callerEnvironment == CallerEnvironment::ANYWHERE) {
 			// We have to check the waitInflight flag with the cq_lock held to
 			// ensure we observe an update by the worker holding the lock.
 			// Otherwise this could happen:
@@ -288,9 +281,8 @@ reap_cqes:
 
 	io_uring_cq_advance(&ring, count);
 
-	uint32_t globalCompleterAttempts;
 	if constexpr (needsCqLock) {
-		globalCompleterAttempts = cq_lock.unlock();
+		cq_lock.unlock();
 	}
 
 	LOGD("got " << count << " cqes from worker " << worker->getWorkerId() << "'s io_uring");
@@ -307,18 +299,8 @@ reap_cqes:
 	// from reaping the additional completions we have a lost wakeup possibly leading
 	// to a completely sleeping runtime with runnable completions in a worker's IoContext.
 
-	// To prevent this race the cq_lock counts the unsuccessful tries from
-	// the globalCompleter.
-	// If a worker observes that the globalCompleter tried to reapCompletions
-	// more than twice we know that a lost wakeup could have occurred and we try to
-	// reap again.
-
-	// In the case a lost wakeup was possible we schedule our reaped cqes
-	// and try again.
-
-	// On all cq iteration after the first we expect no globalCompleterAttempt
-	// or in other words a single globalCompleterAttempt attempt means
-	// additional completions arrive and lost wakeup was possible again.
+	// To prevent this race we check the CQ again for new cqes after we already
+	// dropped the CQ lock and possibly reap again.
 
 	stats.record_reaps<callerEnvironment>(count);
 
@@ -375,26 +357,17 @@ reap_cqes:
 		}
 	}
 
-	// check if lost wakeup was possible
-	if constexpr (needsCqLock && callerEnvironment == CallerEnvironment::EMPER) {
-		bool reReap = false;
-		// TODO: How sure are we that this is unlikely?
-		if (unlikely(globalCompleterAttempts > maxRaceFreeCompleterAttempts)) {
-			// In all CQ iteration after the first we expect no further globalCompleter attempts
-			maxRaceFreeCompleterAttempts = 0;
-			reReap = true;
-		} else if (count == CQE_BATCH_COUNT) {
-			// We reaped a full batch, this means there could be potentially
-			// more CQEs in the completion queue.
-			reReap = true;
-		}
-
+	// check if we missed new cqes
+	// TODO: should only the worker recheck?
+	if constexpr (callerEnvironment == CallerEnvironment::EMPER) {
+		bool reReap = io_uring_cq_ready(&ring) != 0;
 		if (reReap) {
 			// schedule all already collected continuation fibers
 			runtime.schedule(continuationFibers.data(), posInBuf);
 
 			reReapCount++;
 
+			LOGD("Re-Reaping completions");
 			goto reap_cqes;
 		}
 
diff --git a/emper/io/IoContext.hpp b/emper/io/IoContext.hpp
index ec9c8eaf..95684fd1 100644
--- a/emper/io/IoContext.hpp
+++ b/emper/io/IoContext.hpp
@@ -11,6 +11,7 @@
 #include <cstdint>		 // for uint64_t
 #include <functional>	 // for less
 #include <iterator>
+#include <mutex>
 #include <vector>
 
 #include "CallerEnvironment.hpp"	// for CallerEnvironment, EMPER
@@ -28,25 +29,6 @@
 class AbstractWorkStealingScheduler;
 class Fiber;
 
-#ifdef EMPER_IO_CQ_LOCK_COUNTING_TRY_LOCK
-#include "lib/sync/CountingTryLock.hpp"
-using CqLock = emper::lib::sync::CountingTryLock;
-
-#elif defined EMPER_IO_CQ_LOCK_MUTEX
-#include <mutex>
-
-#include "lib/sync/PseudoCountingTryLock.hpp"
-using CqLock = emper::lib::sync::PseudoCountingTryLock<std::mutex>;
-
-#elif defined EMPER_IO_CQ_LOCK_SPIN_LOCK
-#include "lib/sync/PseudoCountingTryLock.hpp"
-#include "lib/sync/SpinLock.hpp"
-using CqLock = emper::lib::sync::PseudoCountingTryLock<emper::lib::sync::SpinLock>;
-
-#else
-#error Uknown cq lock implementation
-#endif
-
 namespace emper::sleep_strategy {
 class PipeSleepStrategy;
 }
@@ -73,7 +55,7 @@ class IoContext : public Logger<LogSubsystem::IO> {
 	static constexpr bool needsCqLock =
 			emper::IO_COMPLETER_BEHAVIOR != emper::IoCompleterBehavior::none;
 	// TryLock protecting the completion queue of ring.
-	ALIGN_TO_CACHE_LINE CqLock cq_lock;
+	CACHE_LINE_EXCLUSIVE(std::mutex, cq_lock);
 	struct io_uring ring;
 
 	// In a worker's IoContext This eventfd is registered with the io_uring to get completion
diff --git a/emper/lib/sync/CountingTryLock.hpp b/emper/lib/sync/CountingTryLock.hpp
deleted file mode 100644
index cc67ec1b..00000000
--- a/emper/lib/sync/CountingTryLock.hpp
+++ /dev/null
@@ -1,67 +0,0 @@
-// SPDX-License-Identifier: LGPL-3.0-or-later
-// Copyright Â© 2021 Florian Fischer
-#pragma once
-
-#include <atomic>
-#include <climits>
-
-#include "Common.hpp"
-
-namespace emper::lib::sync {
-
-class CountingTryLock {
- private:
-	// The lower 32-bit are used as the actual lock
-	// The higher 32-bit are used as a counter which is incremented if
-	// try_lock_or_increment fails to acquire the lock.
-
-	// Layout of our lock union:
-	// |--------------  countingLock  --------------|
-	//                     |------- counter  -------|
-	// | lock |
-	// 0    sizeof(std::atomic<bool>)              64
-	// 0                COUNTER_SHIFT              64
-	union {
-		std::atomic<uint64_t> countingLock;
-		std::atomic<bool> lock;
-	};
-
-	static const int COUNTER_SHIFT = 32;
-	static_assert(sizeof(std::atomic<bool>) * CHAR_BIT < COUNTER_SHIFT);
-
-	static const uint64_t LOCKED = 1;
-	static const uint64_t UNLOCKED = 0;
-
- public:
-	CountingTryLock(bool locked = false) : countingLock(locked ? LOCKED : UNLOCKED) {}
-
-	[[nodiscard]] auto try_lock() -> bool { return !lock.exchange(true, std::memory_order_acquire); }
-
-	[[nodiscard]] auto try_lock_or_increment() -> bool {
-		uint64_t oldVal, newVal;
-		oldVal = countingLock.load(std::memory_order_relaxed);
-		for (;;) {
-			// currently unlocked -> try to lock
-			if (oldVal == UNLOCKED) {
-				newVal = LOCKED;
-				// currently locked -> increment the counter
-			} else {
-				newVal = oldVal + (1L << COUNTER_SHIFT);
-			}
-
-			if (countingLock.compare_exchange_weak(oldVal, newVal, std::memory_order_acquire,
-																						 std::memory_order_relaxed)) {
-				break;
-			}
-		}
-
-		return oldVal == UNLOCKED;
-	}
-
-	// release the lock, zero and return the counter
-	auto unlock() -> uint32_t {
-		uint64_t oldVal = countingLock.exchange(UNLOCKED, std::memory_order_release);
-		return static_cast<uint32_t>(oldVal >> COUNTER_SHIFT);
-	}
-};
-}	 // namespace emper::lib::sync
diff --git a/emper/lib/sync/PseudoCountingTryLock.hpp b/emper/lib/sync/PseudoCountingTryLock.hpp
deleted file mode 100644
index 2b278bad..00000000
--- a/emper/lib/sync/PseudoCountingTryLock.hpp
+++ /dev/null
@@ -1,25 +0,0 @@
-// SPDX-License-Identifier: LGPL-3.0-or-later
-// Copyright Â© 2021 Florian Fischer
-#pragma once
-
-namespace emper::lib::sync {
-
-template <class Lockable>
-class PseudoCountingTryLock {
- private:
-	Lockable lock;
-
- public:
-	[[nodiscard]] auto try_lock() -> bool {
-		lock.lock();
-		return true;
-	}
-
-	[[nodiscard]] auto try_lock_or_increment() -> bool { return try_lock(); }
-
-	auto unlock() -> uint32_t {
-		lock.unlock();
-		return 0;
-	}
-};
-}	 // namespace emper::lib::sync
diff --git a/emper/sleep_strategy/PipeSleepStrategy.cpp b/emper/sleep_strategy/PipeSleepStrategy.cpp
index 3c02694e..51ba1713 100644
--- a/emper/sleep_strategy/PipeSleepStrategy.cpp
+++ b/emper/sleep_strategy/PipeSleepStrategy.cpp
@@ -6,6 +6,7 @@
 
 #include <atomic>
 #include <cassert>
+#include <mutex>
 
 #include "CallerEnvironment.hpp"
 #include "Emper.hpp"
diff --git a/meson.build b/meson.build
index d6f9cbfd..ce0f95be 100644
--- a/meson.build
+++ b/meson.build
@@ -111,9 +111,6 @@ foreach option : io_raw_options
 	conf_data.set('EMPER_IO_' + option.to_upper(), get_option('io_' + option))
 endforeach
 
-io_cq_lock_impl = get_option('io_cq_lock_implementation')
-conf_data.set('EMPER_IO_CQ_LOCK_' + io_cq_lock_impl.to_upper(), true)
-
 io_completer_behavior = get_option('io_completer_behavior')
 if io_completer_behavior == 'maybe_wakeup'
 	if get_option('worker_sleep')
diff --git a/meson_options.txt b/meson_options.txt
index 1cd5d81a..b98b9f8e 100644
--- a/meson_options.txt
+++ b/meson_options.txt
@@ -140,13 +140,6 @@ option(
   value: false,
   description: 'Share a common async backend between all io_urings'
 )
-option(
-  'io_cq_lock_implementation',
-  type: 'combo',
-  description: 'The lock implementation used to protect a worker IoContext CQ',
-  choices: ['spin_lock', 'counting_try_lock', 'mutex'],
-  value: 'counting_try_lock',
-)
 option(
   'io_completer_behavior',
   type: 'combo',
-- 
GitLab