Fix memory ordering/atomic access
A few memory accesses don't seem correct to me, and a few seem slightly
stricter than necessary.

Here are the rules in userspace, I think:

  sq.ktail is read by the kernel
  sq.khead is written by the kernel

  cq.ktail is written by the kernel
  cq.khead is read by the kernel

Locations that are written concurrently by the kernel must be loaded
atomically; if they specify (to the user) that some memory is now
available, they must be loaded with acquire semantics before the memory
is read.

Similarly, locations that are read concurrently by the kernel must be
written atomically; if they specify (to the kernel) that some memory
is now available, they must be stored with release semantics after the
memory is written to.
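
For illustration, the completion-queue side of these rules could look roughly like the sketch below. This is a hedged sketch, not code from this commit: the khead/ktail fields and the io_uring_smp_load_acquire/io_uring_smp_store_release helpers follow liburing's layout, but the two function names are invented.

/* Hypothetical CQ-side sketch of the rules above. */
static unsigned cq_ready_sketch(const struct io_uring *ring)
{
	/*
	 * cq.ktail is written concurrently by the kernel: load it atomically
	 * and with acquire, so the CQEs it publishes are visible before we
	 * read them.
	 */
	return io_uring_smp_load_acquire(ring->cq.ktail) - *ring->cq.khead;
}

static void cq_advance_sketch(struct io_uring *ring, unsigned nr)
{
	/*
	 * cq.khead is read concurrently by the kernel: store it atomically
	 * and with release, after we have finished reading the CQEs, so the
	 * kernel cannot recycle those slots too early.
	 */
	io_uring_smp_store_release(ring->cq.khead, *ring->cq.khead + nr);
}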

Note that without SQPOLL, sq is only accessed by the kernel in response
to a system call, so no synchronization is necessary.
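
As a concrete (and hedged) example of that non-SQPOLL case: in a single-threaded flow like the one below, the kernel only touches the SQ ring from inside the io_uring_enter() syscall issued by io_uring_submit(), so there is nothing to race with. The function name is invented; the liburing calls are the standard ones.

#include <errno.h>
#include <liburing.h>

/* Illustrative only; error handling trimmed to the essentials. */
static int submit_one_nop(void)
{
	struct io_uring ring;
	struct io_uring_sqe *sqe;
	struct io_uring_cqe *cqe;
	int ret;

	/*
	 * No IORING_SETUP_SQPOLL: the kernel reads the SQ ring only during
	 * our own io_uring_enter() call, never concurrently.
	 */
	ret = io_uring_queue_init(8, &ring, 0);
	if (ret < 0)
		return ret;

	sqe = io_uring_get_sqe(&ring);	/* reads sq.khead with a plain load */
	if (!sqe) {
		io_uring_queue_exit(&ring);
		return -EAGAIN;
	}
	io_uring_prep_nop(sqe);

	ret = io_uring_submit(&ring);	/* plain store to sq.ktail, then syscall */
	if (ret >= 0) {
		ret = io_uring_wait_cqe(&ring, &cqe);
		if (!ret)
			io_uring_cqe_seen(&ring, cqe);
	}
	io_uring_queue_exit(&ring);
	return ret;
}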

Note that in practice, on most CPU architectures, a regular aligned
load/store is already atomic; in that situation READ_ONCE/WRITE_ONCE is
primarily necessary for the compiler, which would otherwise be allowed
to do invalid things like load the lower half of an integer and then the
upper half.
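
For illustration, macros in that style are typically just volatile casts. This is a generic sketch (GNU-style __typeof__), not necessarily liburing's exact IO_URING_READ_ONCE/IO_URING_WRITE_ONCE definitions:

/* Force a single, untorn access; provides no ordering by itself. */
#define READ_ONCE_SKETCH(var)					\
	(*(const volatile __typeof__(var) *)&(var))

#define WRITE_ONCE_SKETCH(var, val)				\
	do {							\
		*(volatile __typeof__(var) *)&(var) = (val);	\
	} while (0)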

Signed-off-by: Shachaf Ben-Kiki <[email protected]>
shachaf committed Mar 3, 2024
1 parent 2b353a8 commit f03becf
Showing 3 changed files with 20 additions and 26 deletions.
6 changes: 4 additions & 2 deletions src/include/liburing.h
@@ -1217,7 +1217,7 @@ IOURINGINLINE void io_uring_prep_ftruncate(struct io_uring_sqe *sqe,
  */
 IOURINGINLINE unsigned io_uring_sq_ready(const struct io_uring *ring)
 {
-	unsigned khead = *ring->sq.khead;
+	unsigned khead;
 
 	/*
 	 * Without a barrier, we could miss an update and think the SQ wasn't
@@ -1226,6 +1226,8 @@ IOURINGINLINE unsigned io_uring_sq_ready(const struct io_uring *ring)
 	 */
 	if (ring->flags & IORING_SETUP_SQPOLL)
 		khead = io_uring_smp_load_acquire(ring->sq.khead);
+	else
+		khead = *ring->sq.khead;
 
 	/* always use real head, to avoid losing sync for short submit */
 	return ring->sq.sqe_tail - khead;
@@ -1412,7 +1414,7 @@ IOURINGINLINE struct io_uring_sqe *_io_uring_get_sqe(struct io_uring *ring)
 	if (ring->flags & IORING_SETUP_SQE128)
 		shift = 1;
 	if (!(ring->flags & IORING_SETUP_SQPOLL))
-		head = IO_URING_READ_ONCE(*sq->khead);
+		head = *sq->khead;
 	else
 		head = io_uring_smp_load_acquire(sq->khead);
 
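
By contrast, with IORING_SETUP_SQPOLL the kernel thread consumes SQEs concurrently, and the load-acquire path kept above is what a caller of io_uring_sq_ready() relies on. A hypothetical (and deliberately naive) sketch:

/*
 * Hypothetical helper: spin until the SQPOLL thread has drained the SQ ring.
 * io_uring_sq_ready() load-acquires sq.khead in this mode, so the kernel's
 * progress becomes visible here; a real caller would back off or sleep.
 */
static void wait_sq_drained_sketch(struct io_uring *ring)
{
	while (io_uring_sq_ready(ring) > 0)
		;
}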
20 changes: 8 additions & 12 deletions src/queue.c
@@ -212,22 +212,18 @@ static unsigned __io_uring_flush_sq(struct io_uring *ring)
 		 * Ensure kernel sees the SQE updates before the tail update.
 		 */
 		if (!(ring->flags & IORING_SETUP_SQPOLL))
-			IO_URING_WRITE_ONCE(*sq->ktail, tail);
+			*sq->ktail = tail;
 		else
 			io_uring_smp_store_release(sq->ktail, tail);
 	}
 	/*
-	 * This _may_ look problematic, as we're not supposed to be reading
-	 * SQ->head without acquire semantics. When we're in SQPOLL mode, the
-	 * kernel submitter could be updating this right now. For non-SQPOLL,
-	 * task itself does it, and there's no potential race. But even for
-	 * SQPOLL, the load is going to be potentially out-of-date the very
-	 * instant it's done, regardless or whether or not it's done
-	 * atomically. Worst case, we're going to be over-estimating what
-	 * we can submit. The point is, we need to be able to deal with this
-	 * situation regardless of any perceived atomicity.
-	 */
-	return tail - *sq->khead;
+	 * This load needs to be atomic, since sq->khead is written concurrently
+	 * by the kernel, but it doesn't need to be load_acquire, since the
+	 * kernel doesn't store to the submission queue; it advances khead just
+	 * to indicate that it's finished reading the submission queue entries
+	 * so they're available for us to write to.
+	 */
+	return tail - IO_URING_READ_ONCE(*sq->khead);
 }
 
 /*
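
The reasoning in the replacement comment can also be written down with C11 atomics. The sketch below is a hypothetical analogue of the SQ ring protocol, not liburing code (the struct and function names are invented):

#include <stdatomic.h>

struct sq_ring_sketch {
	_Atomic unsigned *khead;	/* advanced by the kernel */
	_Atomic unsigned *ktail;	/* advanced by userspace, read by the kernel */
	unsigned sqe_tail;		/* userspace's private copy of the tail */
};

/*
 * Publish new SQEs: the release store orders all prior writes to the SQE
 * array before the new tail becomes visible to the kernel.
 */
static void sq_publish_sketch(struct sq_ring_sketch *sq)
{
	atomic_store_explicit(sq->ktail, sq->sqe_tail, memory_order_release);
}

/*
 * How many published SQEs has the kernel not consumed yet? khead only says
 * which entries the kernel has finished *reading*, so an atomic but relaxed
 * load is enough; no acquire is needed because we read no data the kernel
 * wrote.
 */
static unsigned sq_pending_sketch(struct sq_ring_sketch *sq)
{
	return sq->sqe_tail - atomic_load_explicit(sq->khead,
						   memory_order_relaxed);
}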
20 changes: 8 additions & 12 deletions test/helpers.c
@@ -286,22 +286,18 @@ unsigned __io_uring_flush_sq(struct io_uring *ring)
 		 * Ensure kernel sees the SQE updates before the tail update.
 		 */
 		if (!(ring->flags & IORING_SETUP_SQPOLL))
-			IO_URING_WRITE_ONCE(*sq->ktail, tail);
+			*sq->ktail = tail;
 		else
 			io_uring_smp_store_release(sq->ktail, tail);
 	}
 	/*
-	 * This _may_ look problematic, as we're not supposed to be reading
-	 * SQ->head without acquire semantics. When we're in SQPOLL mode, the
-	 * kernel submitter could be updating this right now. For non-SQPOLL,
-	 * task itself does it, and there's no potential race. But even for
-	 * SQPOLL, the load is going to be potentially out-of-date the very
-	 * instant it's done, regardless or whether or not it's done
-	 * atomically. Worst case, we're going to be over-estimating what
-	 * we can submit. The point is, we need to be able to deal with this
-	 * situation regardless of any perceived atomicity.
-	 */
-	return tail - *sq->khead;
+	 * This load needs to be atomic, since sq->khead is written concurrently
+	 * by the kernel, but it doesn't need to be load_acquire, since the
+	 * kernel doesn't store to the submission queue; it advances khead just
+	 * to indicate that it's finished reading the submission queue entries
+	 * so they're available for us to write to.
+	 */
+	return tail - IO_URING_READ_ONCE(*sq->khead);
 }
 
 /*
