diff --git a/emper/io/IoContext.cpp b/emper/io/IoContext.cpp
index a3319071a63fec5ccf8c74ec765c5d5270867359..5dbeb52df42b21a824b1e2e01d9e9bcb3fd97e05 100644
--- a/emper/io/IoContext.cpp
+++ b/emper/io/IoContext.cpp
@@ -322,14 +322,36 @@ reap_cqes:
 	// Number of actual continuation fibers resulting from the reaped CQEs
 	unsigned continuationsCount = 0;
 
+	// TODO: Is using a try lock and the waitInflight flag here even sound?
+	// Couldn't it be possible to have a lost wakeup with an unconsumed new work
+	// notification cqe in our CQ?
+	//
+	// State: only a single worker does work involving IO and another (completer,
+	// io-stealing worker) accesses its CQ.
+
+	// Other          Owner
+	//                | submit IO
+	//                | lock
+	//                | prepare to sleep
+	//                | set flag
+	//                | unlock
+	//                | sleep
+	// lock           |
+	//                | try lock unsuccessful
+	//                | sleep again
+	// check flag     |
+	// unlock         |
 	if constexpr (needsCqLock) {
-		// Someone else is currently reaping completions
-		if (unlikely(!cq_lock.try_lock())) {
-			LOGD("unsuccessful try_lock from " << callerEnvironment);
-			return 0;
-		}
-
-		if constexpr (callerEnvironment != CallerEnvironment::OWNER) {
+		// The Owner always takes the lock to reap all completions, especially
+		// new work notifications, and to prevent the problem described above.
+		if constexpr (callerEnvironment == CallerEnvironment::OWNER) {
+			cq_lock.lock();
+		} else {
+			// Someone else is currently reaping completions
+			if (unlikely(!cq_lock.try_lock())) {
+				LOGD("unsuccessful try_lock from " << callerEnvironment);
+				return 0;
+			}
 			// We have to check the waitInflight flag with the cq_lock held to
 			// ensure we observe an update by the worker holding the lock.
 			// Otherwise this could happen:
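
Note: the following standalone sketch (not part of the patch, and not EMPER
code) illustrates the lock discipline this hunk introduces: the CQ owner
blocks on cq_lock so it can never miss a new-work notification CQE, while
foreign reapers (completer, io-stealing workers) back off with try_lock and
honor the waitInflight flag under the lock. All names here (CqState,
reapCompletions, pendingCqes) are illustrative stand-ins, not the real API.

// Build with: c++ -std=c++17 sketch.cpp
#include <atomic>
#include <iostream>
#include <mutex>

enum class CallerEnvironment { OWNER, COMPLETER };

struct CqState {
	std::mutex cq_lock;
	std::atomic<bool> waitInflight{false};
	unsigned pendingCqes = 0;  // stand-in for unconsumed CQEs
};

template <CallerEnvironment callerEnvironment>
auto reapCompletions(CqState& cq) -> unsigned {
	if constexpr (callerEnvironment == CallerEnvironment::OWNER) {
		// The owner must eventually consume its new-work notification,
		// so it waits for the lock instead of giving up.
		cq.cq_lock.lock();
	} else {
		// Foreign reapers can simply retry later.
		if (!cq.cq_lock.try_lock()) return 0;
		// Check waitInflight only with cq_lock held: the lock orders this
		// load against the owner's updates (see the interleaving above).
		if (cq.waitInflight.load(std::memory_order_relaxed)) {
			cq.cq_lock.unlock();
			return 0;
		}
	}
	unsigned reaped = cq.pendingCqes;  // reap everything currently in the CQ
	cq.pendingCqes = 0;
	cq.cq_lock.unlock();
	return reaped;
}

int main() {
	CqState cq;
	cq.pendingCqes = 3;
	std::cout << reapCompletions<CallerEnvironment::OWNER>(cq) << '\n';      // prints 3
	std::cout << reapCompletions<CallerEnvironment::COMPLETER>(cq) << '\n';  // prints 0
}

The key design point mirrored from the diff: only the owner's reap path uses a
blocking lock(), because returning 0 from the owner (as the old try_lock code
could) risks leaving a new-work notification CQE unconsumed while the owner
goes back to sleep.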