diff options
author | Olivier Houchard <cognet@FreeBSD.org> | 2021-10-29 17:01:15 +0000 |
---|---|---|
committer | Olivier Houchard <cognet@FreeBSD.org> | 2021-10-29 17:01:15 +0000 |
commit | ce929fe84f9c453263af379f3b255ff8eca01d48 (patch) | |
tree | e67b03511733872a7fea5ab703fee973ee18a6d4 | |
parent | 48289654a0ac91e3c29f25461373b3f1b7c82095 (diff) |
Import CK as of commit 2265c7846f4ce667f5216456afe2779b23c3e5f7.vendor/ck/2021029vendor/ck
-rw-r--r-- | include/ck_backoff.h | 2 | ||||
-rw-r--r-- | include/ck_cc.h | 2 | ||||
-rw-r--r-- | include/ck_ec.h | 945 | ||||
-rw-r--r-- | include/ck_fifo.h | 2 | ||||
-rw-r--r-- | include/ck_hs.h | 8 | ||||
-rw-r--r-- | include/ck_pr.h | 15 | ||||
-rw-r--r-- | include/ck_queue.h | 20 | ||||
-rw-r--r-- | include/ck_ring.h | 672 | ||||
-rw-r--r-- | include/gcc/aarch64/ck_pr.h | 12 | ||||
-rw-r--r-- | include/gcc/aarch64/ck_pr_llsc.h | 106 | ||||
-rw-r--r-- | include/gcc/aarch64/ck_pr_lse.h | 37 | ||||
-rw-r--r-- | include/gcc/ck_cc.h | 9 | ||||
-rw-r--r-- | include/gcc/x86/ck_pr.h | 109 | ||||
-rw-r--r-- | include/gcc/x86_64/ck_pr.h | 113 | ||||
-rw-r--r-- | include/spinlock/fas.h | 9 | ||||
-rw-r--r-- | src/ck_ec.c | 425 | ||||
-rw-r--r-- | src/ck_ec_timeutil.h | 150 | ||||
-rw-r--r-- | src/ck_hs.c | 7 | ||||
-rw-r--r-- | src/ck_ht.c | 3 |
19 files changed, 2285 insertions, 361 deletions
diff --git a/include/ck_backoff.h b/include/ck_backoff.h index 82a4f2152e3c..a1f7616a55db 100644 --- a/include/ck_backoff.h +++ b/include/ck_backoff.h @@ -50,7 +50,7 @@ ck_backoff_eb(unsigned int *c) for (i = 0; i < ceiling; i++) ck_pr_barrier(); - *c = ceiling <<= ceiling < CK_BACKOFF_CEILING; + *c = ceiling << (ceiling < CK_BACKOFF_CEILING); return; } diff --git a/include/ck_cc.h b/include/ck_cc.h index 9a152a3cddab..1b4ff4635fa6 100644 --- a/include/ck_cc.h +++ b/include/ck_cc.h @@ -50,6 +50,7 @@ * Container function. * This relies on (compiler) implementation-defined behavior. */ +#ifndef CK_CC_CONTAINER #define CK_CC_CONTAINER(F, T, M, N) \ CK_CC_INLINE static T * \ N(F *p) \ @@ -57,6 +58,7 @@ F *n = p; \ return (T *)(void *)(((char *)n) - ((size_t)&((T *)0)->M)); \ } +#endif #define CK_CC_PAD(x) union { char pad[x]; } diff --git a/include/ck_ec.h b/include/ck_ec.h new file mode 100644 index 000000000000..cd2a36813a79 --- /dev/null +++ b/include/ck_ec.h @@ -0,0 +1,945 @@ +/* + * Copyright 2018 Paul Khuong, Google LLC. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + */ + +/* + * Overview + * ======== + * + * ck_ec implements 32- and 64- bit event counts. Event counts let us + * easily integrate OS-level blocking (e.g., futexes) in lock-free + * protocols. Waiters block conditionally, if the event count's value + * is still equal to some old value. + * + * Event counts come in four variants: 32 and 64 bit (with one bit + * stolen for internal signaling, so 31 and 63 bit counters), and + * single or multiple producers (wakers). Waiters are always multiple + * consumers. The 32 bit variants are smaller, and more efficient, + * especially in single producer mode. The 64 bit variants are larger, + * but practically invulnerable to ABA. + * + * The 32 bit variant is always available. The 64 bit variant is only + * available if CK supports 64-bit atomic operations. Currently, + * specialization for single producer is only implemented for x86 and + * x86-64, on compilers that support GCC extended inline assembly; + * other platforms fall back to the multiple producer code path. + * + * A typical usage pattern is: + * + * 1. On the producer side: + * + * - Make changes to some shared data structure, without involving + * the event count at all. + * - After each change, call ck_ec_inc on the event count. The call + * acts as a write-write barrier, and wakes up any consumer blocked + * on the event count (waiting for new changes). + * + * 2. On the consumer side: + * + * - Snapshot ck_ec_value of the event count. The call acts as a + * read barrier. + * - Read and process the shared data structure. + * - Wait for new changes by calling ck_ec_wait with the snapshot value. + * + * Some data structures may opt for tighter integration with their + * event count. For example, an SPMC ring buffer or disruptor might + * use the event count's value as the write pointer. If the buffer is + * regularly full, it might also make sense to store the read pointer + * in an MP event count. + * + * This event count implementation supports tighter integration in two + * ways. + * + * Producers may opt to increment by an arbitrary value (less than + * INT32_MAX / INT64_MAX), in order to encode, e.g., byte + * offsets. Larger increment values make wraparound more likely, so + * the increments should still be relatively small. + * + * Consumers may pass a predicate to ck_ec_wait_pred. This predicate + * can make `ck_ec_wait_pred` return early, before the event count's + * value changes, and can override the deadline passed to futex_wait. + * This lets consumer block on one eventcount, while optimistically + * looking at other waking conditions. + * + * API Reference + * ============= + * + * When compiled as C11 or later, this header defines type-generic + * macros for ck_ec32 and ck_ec64; the reference describes this + * type-generic API. + * + * ck_ec needs additional OS primitives to determine the current time, + * to wait on an address, and to wake all threads waiting on a given + * address. These are defined with fields in a struct ck_ec_ops. Each + * ck_ec_ops may additionally define the number of spin loop + * iterations in the slow path, as well as the initial wait time in + * the internal exponential backoff, the exponential scale factor, and + * the right shift count (< 32). + * + * The ops, in addition to the single/multiple producer flag, are + * encapsulated in a struct ck_ec_mode, passed to most ck_ec + * operations. + * + * ec is a struct ck_ec32 *, or a struct ck_ec64 *. + * + * value is an uint32_t for ck_ec32, and an uint64_t for ck_ec64. It + * never exceeds INT32_MAX and INT64_MAX respectively. + * + * mode is a struct ck_ec_mode *. + * + * deadline is either NULL, or a `const struct timespec *` that will + * be treated as an absolute deadline. + * + * `void ck_ec_init(ec, value)`: initializes the event count to value. + * + * `value ck_ec_value(ec)`: returns the current value of the event + * counter. This read acts as a read (acquire) barrier. + * + * `bool ck_ec_has_waiters(ec)`: returns whether some thread has + * marked the event count as requiring an OS wakeup. + * + * `void ck_ec_inc(ec, mode)`: increments the value of the event + * counter by one. This writes acts as a write barrier. Wakes up + * any waiting thread. + * + * `value ck_ec_add(ec, mode, value)`: increments the event counter by + * `value`, and returns the event counter's previous value. This + * write acts as a write barrier. Wakes up any waiting thread. + * + * `int ck_ec_deadline(struct timespec *new_deadline, + * mode, + * const struct timespec *timeout)`: + * computes a deadline `timeout` away from the current time. If + * timeout is NULL, computes a deadline in the infinite future. The + * resulting deadline is written to `new_deadline`. Returns 0 on + * success, and -1 if ops->gettime failed (without touching errno). + * + * `int ck_ec_wait(ec, mode, value, deadline)`: waits until the event + * counter's value differs from `value`, or, if `deadline` is + * provided and non-NULL, until the current time is after that + * deadline. Use a deadline with tv_sec = 0 for a non-blocking + * execution. Returns 0 if the event counter has changed, and -1 on + * timeout. This function acts as a read (acquire) barrier. + * + * `int ck_ec_wait_pred(ec, mode, value, pred, data, deadline)`: waits + * until the event counter's value differs from `value`, or until + * `pred` returns non-zero, or, if `deadline` is provided and + * non-NULL, until the current time is after that deadline. Use a + * deadline with tv_sec = 0 for a non-blocking execution. Returns 0 if + * the event counter has changed, `pred`'s return value if non-zero, + * and -1 on timeout. This function acts as a read (acquire) barrier. + * + * `pred` is always called as `pred(data, iteration_deadline, now)`, + * where `iteration_deadline` is a timespec of the deadline for this + * exponential backoff iteration, and `now` is the current time. If + * `pred` returns a non-zero value, that value is immediately returned + * to the waiter. Otherwise, `pred` is free to modify + * `iteration_deadline` (moving it further in the future is a bad + * idea). + * + * Implementation notes + * ==================== + * + * The multiple producer implementation is a regular locked event + * count, with a single flag bit to denote the need to wake up waiting + * threads. + * + * The single producer specialization is heavily tied to + * [x86-TSO](https://www.cl.cam.ac.uk/~pes20/weakmemory/cacm.pdf), and + * to non-atomic read-modify-write instructions (e.g., `inc mem`); + * these non-atomic RMW let us write to the same memory locations with + * atomic and non-atomic instructions, without suffering from process + * scheduling stalls. + * + * The reason we can mix atomic and non-atomic writes to the `counter` + * word is that every non-atomic write obviates the need for the + * atomically flipped flag bit: we only use non-atomic writes to + * update the event count, and the atomic flag only informs the + * producer that we would like a futex_wake, because of the update. + * We only require the non-atomic RMW counter update to prevent + * preemption from introducing arbitrarily long worst case delays. + * + * Correctness does not rely on the usual ordering argument: in the + * absence of fences, there is no strict ordering between atomic and + * non-atomic writes. The key is instead x86-TSO's guarantee that a + * read is satisfied from the most recent buffered write in the local + * store queue if there is one, or from memory if there is no write to + * that address in the store queue. + * + * x86-TSO's constraint on reads suffices to guarantee that the + * producer will never forget about a counter update. If the last + * update is still queued, the new update will be based on the queued + * value. Otherwise, the new update will be based on the value in + * memory, which may or may not have had its flag flipped. In either + * case, the value of the counter (modulo flag) is correct. + * + * When the producer forwards the counter's value from its store + * queue, the new update might not preserve a flag flip. Any waiter + * thus has to check from time to time to determine if it wasn't + * woken up because the flag bit was silently cleared. + * + * In reality, the store queue in x86-TSO stands for in-flight + * instructions in the chip's out-of-order backend. In the vast + * majority of cases, instructions will only remain in flight for a + * few hundred or thousand of cycles. That's why ck_ec_wait spins on + * the `counter` word for ~100 iterations after flipping its flag bit: + * if the counter hasn't changed after that many iterations, it is + * very likely that the producer's next counter update will observe + * the flag flip. + * + * That's still not a hard guarantee of correctness. Conservatively, + * we can expect that no instruction will remain in flight for more + * than 1 second... if only because some interrupt will have forced + * the chip to store its architectural state in memory, at which point + * an instruction is either fully retired or rolled back. Interrupts, + * particularly the pre-emption timer, are why single-producer updates + * must happen in a single non-atomic read-modify-write instruction. + * Having a single instruction as the critical section means we only + * have to consider the worst-case execution time for that + * instruction. That's easier than doing the same for a pair of + * instructions, which an unlucky pre-emption could delay for + * arbitrarily long. + * + * Thus, after a short spin loop, ck_ec_wait enters an exponential + * backoff loop, where each "sleep" is instead a futex_wait. The + * backoff is only necessary to handle rare cases where the flag flip + * was overwritten after the spin loop. Eventually, more than one + * second will have elapsed since the flag flip, and the sleep timeout + * becomes infinite: since the flag bit has been set for much longer + * than the time for which an instruction may remain in flight, the + * flag will definitely be observed at the next counter update. + * + * The 64 bit ck_ec_wait pulls another trick: futexes only handle 32 + * bit ints, so we must treat the 64 bit counter's low 32 bits as an + * int in futex_wait. That's a bit dodgy, but fine in practice, given + * that the OS's futex code will always read whatever value is + * currently in memory: even if the producer thread were to wait on + * its own event count, the syscall and ring transition would empty + * the store queue (the out-of-order execution backend). + * + * Finally, what happens when the producer is migrated to another core + * or otherwise pre-empted? Migration must already incur a barrier, so + * that thread always sees its own writes, so that's safe. As for + * pre-emption, that requires storing the architectural state, which + * means every instruction must either be executed fully or not at + * all when pre-emption happens. + */ + +#ifndef CK_EC_H +#define CK_EC_H +#include <ck_cc.h> +#include <ck_pr.h> +#include <ck_stdbool.h> +#include <ck_stdint.h> +#include <ck_stddef.h> +#include <sys/time.h> + +/* + * If we have ck_pr_faa_64 (and, presumably, ck_pr_load_64), we + * support 63 bit counters. + */ +#ifdef CK_F_PR_FAA_64 +#define CK_F_EC64 +#endif /* CK_F_PR_FAA_64 */ + +/* + * GCC inline assembly lets us exploit non-atomic read-modify-write + * instructions on x86/x86_64 for a fast single-producer mode. + * + * If we CK_F_EC_SP is not defined, CK_EC always uses the slower + * multiple producer code. + */ +#if defined(__GNUC__) && (defined(__i386__) || defined(__x86_64__)) +#define CK_F_EC_SP +#endif /* GNUC && (__i386__ || __x86_64__) */ + +struct ck_ec_ops; + +struct ck_ec_wait_state { + struct timespec start; /* Time when we entered ck_ec_wait. */ + struct timespec now; /* Time now. */ + const struct ck_ec_ops *ops; + void *data; /* Opaque pointer for the predicate's internal state. */ + +}; + +/* + * ck_ec_ops define system-specific functions to get the current time, + * atomically wait on an address if it still has some expected value, + * and to wake all threads waiting on an address. + * + * Each platform is expected to have few (one) opaque pointer to a + * const ops struct, and reuse it for all ck_ec_mode structs. + */ +struct ck_ec_ops { + /* Populates out with the current time. Returns non-zero on failure. */ + int (*gettime)(const struct ck_ec_ops *, struct timespec *out); + + /* + * Waits on address if its value is still `expected`. If + * deadline is non-NULL, stops waiting once that deadline is + * reached. May return early for any reason. + */ + void (*wait32)(const struct ck_ec_wait_state *, const uint32_t *, + uint32_t expected, const struct timespec *deadline); + + /* + * Same as wait32, but for a 64 bit counter. Only used if + * CK_F_EC64 is defined. + * + * If underlying blocking primitive only supports 32 bit + * control words, it should be safe to block on the least + * significant half of the 64 bit address. + */ + void (*wait64)(const struct ck_ec_wait_state *, const uint64_t *, + uint64_t expected, const struct timespec *deadline); + + /* Wakes up all threads waiting on address. */ + void (*wake32)(const struct ck_ec_ops *, const uint32_t *address); + + /* + * Same as wake32, but for a 64 bit counter. Only used if + * CK_F_EC64 is defined. + * + * When wait64 truncates the control word at address to `only` + * consider its least significant half, wake64 should perform + * any necessary fixup (e.g., on big endian platforms). + */ + void (*wake64)(const struct ck_ec_ops *, const uint64_t *address); + + /* + * Number of iterations for the initial busy wait. 0 defaults + * to 100 (not ABI stable). + */ + uint32_t busy_loop_iter; + + /* + * Delay in nanoseconds for the first iteration of the + * exponential backoff. 0 defaults to 2 ms (not ABI stable). + */ + uint32_t initial_wait_ns; + + /* + * Scale factor for the exponential backoff. 0 defaults to 8x + * (not ABI stable). + */ + uint32_t wait_scale_factor; + + /* + * Right shift count for the exponential backoff. The update + * after each iteration is + * wait_ns = (wait_ns * wait_scale_factor) >> wait_shift_count, + * until one second has elapsed. After that, the deadline goes + * to infinity. + */ + uint32_t wait_shift_count; +}; + +/* + * ck_ec_mode wraps the ops table, and informs the fast path whether + * it should attempt to specialize for single producer mode. + * + * mode structs are expected to be exposed by value, e.g., + * + * extern const struct ck_ec_ops system_ec_ops; + * + * static const struct ck_ec_mode ec_sp = { + * .ops = &system_ec_ops, + * .single_producer = true + * }; + * + * static const struct ck_ec_mode ec_mp = { + * .ops = &system_ec_ops, + * .single_producer = false + * }; + * + * ck_ec_mode structs are only passed to inline functions defined in + * this header, and never escape to their slow paths, so they should + * not result in any object file size increase. + */ +struct ck_ec_mode { + const struct ck_ec_ops *ops; + /* + * If single_producer is true, the event count has a unique + * incrementer. The implementation will specialize ck_ec_inc + * and ck_ec_add if possible (if CK_F_EC_SP is defined). + */ + bool single_producer; +}; + +struct ck_ec32 { + /* Flag is "sign" bit, value in bits 0:30. */ + uint32_t counter; +}; + +typedef struct ck_ec32 ck_ec32_t; + +#ifdef CK_F_EC64 +struct ck_ec64 { + /* + * Flag is bottom bit, value in bits 1:63. Eventcount only + * works on x86-64 (i.e., little endian), so the futex int + * lies in the first 4 (bottom) bytes. + */ + uint64_t counter; +}; + +typedef struct ck_ec64 ck_ec64_t; +#endif /* CK_F_EC64 */ + +#define CK_EC_INITIALIZER { .counter = 0 } + +/* + * Initializes the event count to `value`. The value must not + * exceed INT32_MAX. + */ +static void ck_ec32_init(struct ck_ec32 *ec, uint32_t value); + +#ifndef CK_F_EC64 +#define ck_ec_init ck_ec32_init +#else +/* + * Initializes the event count to `value`. The value must not + * exceed INT64_MAX. + */ +static void ck_ec64_init(struct ck_ec64 *ec, uint64_t value); + +#if __STDC_VERSION__ >= 201112L +#define ck_ec_init(EC, VALUE) \ + (_Generic(*(EC), \ + struct ck_ec32 : ck_ec32_init, \ + struct ck_ec64 : ck_ec64_init)((EC), (VALUE))) +#endif /* __STDC_VERSION__ */ +#endif /* CK_F_EC64 */ + +/* + * Returns the counter value in the event count. The value is at most + * INT32_MAX. + */ +static uint32_t ck_ec32_value(const struct ck_ec32* ec); + +#ifndef CK_F_EC64 +#define ck_ec_value ck_ec32_value +#else +/* + * Returns the counter value in the event count. The value is at most + * INT64_MAX. + */ +static uint64_t ck_ec64_value(const struct ck_ec64* ec); + +#if __STDC_VERSION__ >= 201112L +#define ck_ec_value(EC) \ + (_Generic(*(EC), \ + struct ck_ec32 : ck_ec32_value, \ + struct ck_ec64 : ck_ec64_value)((EC))) +#endif /* __STDC_VERSION__ */ +#endif /* CK_F_EC64 */ + +/* + * Returns whether there may be slow pathed waiters that need an + * explicit OS wakeup for this event count. + */ +static bool ck_ec32_has_waiters(const struct ck_ec32 *ec); + +#ifndef CK_F_EC64 +#define ck_ec_has_waiters ck_ec32_has_waiters +#else +static bool ck_ec64_has_waiters(const struct ck_ec64 *ec); + +#if __STDC_VERSION__ >= 201112L +#define ck_ec_has_waiters(EC) \ + (_Generic(*(EC), \ + struct ck_ec32 : ck_ec32_has_waiters, \ + struct ck_ec64 : ck_ec64_has_waiters)((EC))) +#endif /* __STDC_VERSION__ */ +#endif /* CK_F_EC64 */ + +/* + * Increments the counter value in the event count by one, and wakes + * up any waiter. + */ +static void ck_ec32_inc(struct ck_ec32 *ec, const struct ck_ec_mode *mode); + +#ifndef CK_F_EC64 +#define ck_ec_inc ck_ec32_inc +#else +static void ck_ec64_inc(struct ck_ec64 *ec, const struct ck_ec_mode *mode); + +#if __STDC_VERSION__ >= 201112L +#define ck_ec_inc(EC, MODE) \ + (_Generic(*(EC), \ + struct ck_ec32 : ck_ec32_inc, \ + struct ck_ec64 : ck_ec64_inc)((EC), (MODE))) +#endif /* __STDC_VERSION__ */ +#endif /* CK_F_EC64 */ + +/* + * Increments the counter value in the event count by delta, wakes + * up any waiter, and returns the previous counter value. + */ +static uint32_t ck_ec32_add(struct ck_ec32 *ec, + const struct ck_ec_mode *mode, + uint32_t delta); + +#ifndef CK_F_EC64 +#define ck_ec_add ck_ec32_add +#else +static uint64_t ck_ec64_add(struct ck_ec64 *ec, + const struct ck_ec_mode *mode, + uint64_t delta); + +#if __STDC_VERSION__ >= 201112L +#define ck_ec_add(EC, MODE, DELTA) \ + (_Generic(*(EC), \ + struct ck_ec32 : ck_ec32_add, \ + struct ck_ec64 : ck_ec64_add)((EC), (MODE), (DELTA))) +#endif /* __STDC_VERSION__ */ +#endif /* CK_F_EC64 */ + +/* + * Populates `new_deadline` with a deadline `timeout` in the future. + * Returns 0 on success, and -1 if clock_gettime failed, in which + * case errno is left as is. + */ +static int ck_ec_deadline(struct timespec *new_deadline, + const struct ck_ec_mode *mode, + const struct timespec *timeout); + +/* + * Waits until the counter value in the event count differs from + * old_value, or, if deadline is non-NULL, until CLOCK_MONOTONIC is + * past the deadline. + * + * Returns 0 on success, and -1 on timeout. + */ +static int ck_ec32_wait(struct ck_ec32 *ec, + const struct ck_ec_mode *mode, + uint32_t old_value, + const struct timespec *deadline); + +#ifndef CK_F_EC64 +#define ck_ec_wait ck_ec32_wait +#else +static int ck_ec64_wait(struct ck_ec64 *ec, + const struct ck_ec_mode *mode, + uint64_t old_value, + const struct timespec *deadline); + +#if __STDC_VERSION__ >= 201112L +#define ck_ec_wait(EC, MODE, OLD_VALUE, DEADLINE) \ + (_Generic(*(EC), \ + struct ck_ec32 : ck_ec32_wait, \ + struct ck_ec64 : ck_ec64_wait)((EC), (MODE), \ + (OLD_VALUE), (DEADLINE))) + +#endif /* __STDC_VERSION__ */ +#endif /* CK_F_EC64 */ + +/* + * Waits until the counter value in the event count differs from + * old_value, pred returns non-zero, or, if deadline is non-NULL, + * until CLOCK_MONOTONIC is past the deadline. + * + * Returns 0 on success, -1 on timeout, and the return value of pred + * if it returns non-zero. + * + * A NULL pred represents a function that always returns 0. + */ +static int ck_ec32_wait_pred(struct ck_ec32 *ec, + const struct ck_ec_mode *mode, + uint32_t old_value, + int (*pred)(const struct ck_ec_wait_state *, + struct timespec *deadline), + void *data, + const struct timespec *deadline); + +#ifndef CK_F_EC64 +#define ck_ec_wait_pred ck_ec32_wait_pred +#else +static int ck_ec64_wait_pred(struct ck_ec64 *ec, + const struct ck_ec_mode *mode, + uint64_t old_value, + int (*pred)(const struct ck_ec_wait_state *, + struct timespec *deadline), + void *data, + const struct timespec *deadline); + +#if __STDC_VERSION__ >= 201112L +#define ck_ec_wait_pred(EC, MODE, OLD_VALUE, PRED, DATA, DEADLINE) \ + (_Generic(*(EC), \ + struct ck_ec32 : ck_ec32_wait_pred, \ + struct ck_ec64 : ck_ec64_wait_pred) \ + ((EC), (MODE), (OLD_VALUE), (PRED), (DATA), (DEADLINE))) +#endif /* __STDC_VERSION__ */ +#endif /* CK_F_EC64 */ + +/* + * Inline implementation details. 32 bit first, then 64 bit + * conditionally. + */ +CK_CC_FORCE_INLINE void ck_ec32_init(struct ck_ec32 *ec, uint32_t value) +{ + ec->counter = value & ~(1UL << 31); + return; +} + +CK_CC_FORCE_INLINE uint32_t ck_ec32_value(const struct ck_ec32 *ec) +{ + uint32_t ret = ck_pr_load_32(&ec->counter) & ~(1UL << 31); + + ck_pr_fence_acquire(); + return ret; +} + +CK_CC_FORCE_INLINE bool ck_ec32_has_waiters(const struct ck_ec32 *ec) +{ + return ck_pr_load_32(&ec->counter) & (1UL << 31); +} + +/* Slow path for ck_ec{32,64}_{inc,add} */ +void ck_ec32_wake(struct ck_ec32 *ec, const struct ck_ec_ops *ops); + +CK_CC_FORCE_INLINE void ck_ec32_inc(struct ck_ec32 *ec, + const struct ck_ec_mode *mode) +{ +#if !defined(CK_F_EC_SP) + /* Nothing to specialize if we don't have EC_SP. */ + ck_ec32_add(ec, mode, 1); + return; +#else + char flagged; + +#if __GNUC__ >= 6 + /* + * We don't want to wake if the sign bit is 0. We do want to + * wake if the sign bit just flipped from 1 to 0. We don't + * care what happens when our increment caused the sign bit to + * flip from 0 to 1 (that's once per 2^31 increment). + * + * This leaves us with four cases: + * + * old sign bit | new sign bit | SF | OF | ZF + * ------------------------------------------- + * 0 | 0 | 0 | 0 | ? + * 0 | 1 | 1 | 0 | ? + * 1 | 1 | 1 | 0 | ? + * 1 | 0 | 0 | 0 | 1 + * + * In the first case, we don't want to hit ck_ec32_wake. In + * the last two cases, we do want to call ck_ec32_wake. In the + * second case, we don't care, so we arbitrarily choose to + * call ck_ec32_wake. + * + * The "le" condition checks if SF != OF, or ZF == 1, which + * meets our requirements. + */ +#define CK_EC32_INC_ASM(PREFIX) \ + __asm__ volatile(PREFIX " incl %0" \ + : "+m"(ec->counter), "=@ccle"(flagged) \ + :: "cc", "memory") +#else +#define CK_EC32_INC_ASM(PREFIX) \ + __asm__ volatile(PREFIX " incl %0; setle %1" \ + : "+m"(ec->counter), "=r"(flagged) \ + :: "cc", "memory") +#endif /* __GNUC__ */ + + if (mode->single_producer == true) { + ck_pr_fence_store(); + CK_EC32_INC_ASM(""); + } else { + ck_pr_fence_store_atomic(); + CK_EC32_INC_ASM("lock"); + } +#undef CK_EC32_INC_ASM + + if (CK_CC_UNLIKELY(flagged)) { + ck_ec32_wake(ec, mode->ops); + } + + return; +#endif /* CK_F_EC_SP */ +} + +CK_CC_FORCE_INLINE uint32_t ck_ec32_add_epilogue(struct ck_ec32 *ec, + const struct ck_ec_mode *mode, + uint32_t old) +{ + const uint32_t flag_mask = 1U << 31; + uint32_t ret; + + ret = old & ~flag_mask; + /* These two only differ if the flag bit is set. */ + if (CK_CC_UNLIKELY(old != ret)) { + ck_ec32_wake(ec, mode->ops); + } + + return ret; +} + +static CK_CC_INLINE uint32_t ck_ec32_add_mp(struct ck_ec32 *ec, + const struct ck_ec_mode *mode, + uint32_t delta) +{ + uint32_t old; + + ck_pr_fence_store_atomic(); + old = ck_pr_faa_32(&ec->counter, delta); + return ck_ec32_add_epilogue(ec, mode, old); +} + +#ifdef CK_F_EC_SP +static CK_CC_INLINE uint32_t ck_ec32_add_sp(struct ck_ec32 *ec, + const struct ck_ec_mode *mode, + uint32_t delta) +{ + uint32_t old; + + /* + * Correctness of this racy write depends on actually + * having an update to write. Exit here if the update + * is a no-op. + */ + if (CK_CC_UNLIKELY(delta == 0)) { + return ck_ec32_value(ec); + } + + ck_pr_fence_store(); + old = delta; + __asm__ volatile("xaddl %1, %0" + : "+m"(ec->counter), "+r"(old) + :: "cc", "memory"); + return ck_ec32_add_epilogue(ec, mode, old); +} +#endif /* CK_F_EC_SP */ + +CK_CC_FORCE_INLINE uint32_t ck_ec32_add(struct ck_ec32 *ec, + const struct ck_ec_mode *mode, + uint32_t delta) +{ +#ifdef CK_F_EC_SP + if (mode->single_producer == true) { + return ck_ec32_add_sp(ec, mode, delta); + } +#endif + + return ck_ec32_add_mp(ec, mode, delta); +} + +int ck_ec_deadline_impl(struct timespec *new_deadline, + const struct ck_ec_ops *ops, + const struct timespec *timeout); + +CK_CC_FORCE_INLINE int ck_ec_deadline(struct timespec *new_deadline, + const struct ck_ec_mode *mode, + const struct timespec *timeout) +{ + return ck_ec_deadline_impl(new_deadline, mode->ops, timeout); +} + + +int ck_ec32_wait_slow(struct ck_ec32 *ec, + const struct ck_ec_ops *ops, + uint32_t old_value, + const struct timespec *deadline); + +CK_CC_FORCE_INLINE int ck_ec32_wait(struct ck_ec32 *ec, + const struct ck_ec_mode *mode, + uint32_t old_value, + const struct timespec *deadline) +{ + if (ck_ec32_value(ec) != old_value) { + return 0; + } + + return ck_ec32_wait_slow(ec, mode->ops, old_value, deadline); +} + +int ck_ec32_wait_pred_slow(struct ck_ec32 *ec, + const struct ck_ec_ops *ops, + uint32_t old_value, + int (*pred)(const struct ck_ec_wait_state *state, + struct timespec *deadline), + void *data, + const struct timespec *deadline); + +CK_CC_FORCE_INLINE int +ck_ec32_wait_pred(struct ck_ec32 *ec, + const struct ck_ec_mode *mode, + uint32_t old_value, + int (*pred)(const struct ck_ec_wait_state *state, + struct timespec *deadline), + void *data, + const struct timespec *deadline) +{ + if (ck_ec32_value(ec) != old_value) { + return 0; + } + + return ck_ec32_wait_pred_slow(ec, mode->ops, old_value, + pred, data, deadline); +} + +#ifdef CK_F_EC64 +CK_CC_FORCE_INLINE void ck_ec64_init(struct ck_ec64 *ec, uint64_t value) +{ + ec->counter = value << 1; + return; +} + +CK_CC_FORCE_INLINE uint64_t ck_ec64_value(const struct ck_ec64 *ec) +{ + uint64_t ret = ck_pr_load_64(&ec->counter) >> 1; + + ck_pr_fence_acquire(); + return ret; +} + +CK_CC_FORCE_INLINE bool ck_ec64_has_waiters(const struct ck_ec64 *ec) +{ + return ck_pr_load_64(&ec->counter) & 1; +} + +void ck_ec64_wake(struct ck_ec64 *ec, const struct ck_ec_ops *ops); + +CK_CC_FORCE_INLINE void ck_ec64_inc(struct ck_ec64 *ec, + const struct ck_ec_mode *mode) +{ + /* We always xadd, so there's no special optimization here. */ + (void)ck_ec64_add(ec, mode, 1); + return; +} + +CK_CC_FORCE_INLINE uint64_t ck_ec_add64_epilogue(struct ck_ec64 *ec, + const struct ck_ec_mode *mode, + uint64_t old) +{ + uint64_t ret = old >> 1; + + if (CK_CC_UNLIKELY(old & 1)) { + ck_ec64_wake(ec, mode->ops); + } + + return ret; +} + +static CK_CC_INLINE uint64_t ck_ec64_add_mp(struct ck_ec64 *ec, + const struct ck_ec_mode *mode, + uint64_t delta) +{ + uint64_t inc = 2 * delta; /* The low bit is the flag bit. */ + + ck_pr_fence_store_atomic(); + return ck_ec_add64_epilogue(ec, mode, ck_pr_faa_64(&ec->counter, inc)); +} + +#ifdef CK_F_EC_SP +/* Single-producer specialisation. */ +static CK_CC_INLINE uint64_t ck_ec64_add_sp(struct ck_ec64 *ec, + const struct ck_ec_mode *mode, + uint64_t delta) +{ + uint64_t old; + + /* + * Correctness of this racy write depends on actually + * having an update to write. Exit here if the update + * is a no-op. + */ + if (CK_CC_UNLIKELY(delta == 0)) { + return ck_ec64_value(ec); + } + + ck_pr_fence_store(); + old = 2 * delta; /* The low bit is the flag bit. */ + __asm__ volatile("xaddq %1, %0" + : "+m"(ec->counter), "+r"(old) + :: "cc", "memory"); + return ck_ec_add64_epilogue(ec, mode, old); +} +#endif /* CK_F_EC_SP */ + +/* + * Dispatch on mode->single_producer in this FORCE_INLINE function: + * the end result is always small, but not all compilers have enough + * foresight to inline and get the reduction. + */ +CK_CC_FORCE_INLINE uint64_t ck_ec64_add(struct ck_ec64 *ec, + const struct ck_ec_mode *mode, + uint64_t delta) +{ +#ifdef CK_F_EC_SP + if (mode->single_producer == true) { + return ck_ec64_add_sp(ec, mode, delta); + } +#endif + + return ck_ec64_add_mp(ec, mode, delta); +} + +int ck_ec64_wait_slow(struct ck_ec64 *ec, + const struct ck_ec_ops *ops, + uint64_t old_value, + const struct timespec *deadline); + +CK_CC_FORCE_INLINE int ck_ec64_wait(struct ck_ec64 *ec, + const struct ck_ec_mode *mode, + uint64_t old_value, + const struct timespec *deadline) +{ + if (ck_ec64_value(ec) != old_value) { + return 0; + } + + return ck_ec64_wait_slow(ec, mode->ops, old_value, deadline); +} + +int ck_ec64_wait_pred_slow(struct ck_ec64 *ec, + const struct ck_ec_ops *ops, + uint64_t old_value, + int (*pred)(const struct ck_ec_wait_state *state, + struct timespec *deadline), + void *data, + const struct timespec *deadline); + + +CK_CC_FORCE_INLINE int +ck_ec64_wait_pred(struct ck_ec64 *ec, + const struct ck_ec_mode *mode, + uint64_t old_value, + int (*pred)(const struct ck_ec_wait_state *state, + struct timespec *deadline), + void *data, + const struct timespec *deadline) +{ + if (ck_ec64_value(ec) != old_value) { + return 0; + } + + return ck_ec64_wait_pred_slow(ec, mode->ops, old_value, + pred, data, deadline); +} +#endif /* CK_F_EC64 */ +#endif /* !CK_EC_H */ diff --git a/include/ck_fifo.h b/include/ck_fifo.h index 6d500708c445..c9a6f3d9a87d 100644 --- a/include/ck_fifo.h +++ b/include/ck_fifo.h @@ -115,7 +115,7 @@ CK_CC_INLINE static void ck_fifo_spsc_deinit(struct ck_fifo_spsc *fifo, struct ck_fifo_spsc_entry **garbage) { - *garbage = fifo->head; + *garbage = fifo->garbage; fifo->head = fifo->tail = NULL; return; } diff --git a/include/ck_hs.h b/include/ck_hs.h index 3c12b6e602a7..cd3e5dac87aa 100644 --- a/include/ck_hs.h +++ b/include/ck_hs.h @@ -109,6 +109,14 @@ typedef struct ck_hs_iterator ck_hs_iterator_t; /* Convenience wrapper to table hash function. */ #define CK_HS_HASH(T, F, K) F((K), (T)->seed) +/* Computes the hash of n bytes of k for the specified hash map. */ +static inline unsigned long +ck_hs_hash(const struct ck_hs *hs, const void *k) +{ + + return hs->hf(k, hs->seed); +} + typedef void *ck_hs_apply_fn_t(void *, void *); bool ck_hs_apply(ck_hs_t *, unsigned long, const void *, ck_hs_apply_fn_t *, void *); void ck_hs_iterator_init(ck_hs_iterator_t *); diff --git a/include/ck_pr.h b/include/ck_pr.h index 2de6e13ec3c9..8ebf855692dd 100644 --- a/include/ck_pr.h +++ b/include/ck_pr.h @@ -34,7 +34,20 @@ #include <ck_stdint.h> #include <ck_stdbool.h> -#ifndef CK_USE_CC_BUILTINS +/* + * Default to using builtins for clang analyzer, coverity, and sparse: + * inline assembly is often too opaque for useful analysis. Override + * the defaults by defining CK_USE_CC_BUILTINS=0 or 1. + */ +#if !defined(CK_USE_CC_BUILTINS) +#if defined(__clang_analyzer__) || defined(__COVERITY__) || defined(__CHECKER__) +#define CK_USE_CC_BUILTINS 1 +#else +#define CK_USE_CC_BUILTINS 0 +#endif +#endif + +#if !CK_USE_CC_BUILTINS #if defined(__x86_64__) #include "gcc/x86_64/ck_pr.h" #elif defined(__x86__) diff --git a/include/ck_queue.h b/include/ck_queue.h index 3f503aa6c3e5..fd38d8a583fa 100644 --- a/include/ck_queue.h +++ b/include/ck_queue.h @@ -53,7 +53,7 @@ * SUCH DAMAGE. * * @(#)queue.h 8.5 (Berkeley) 8/20/94 - * $FreeBSD$ + * $FreeBSD: release/9.0.0/sys/sys/queue.h 221843 2011-05-13 15:49:23Z mdf $ */ #ifndef CK_QUEUE_H @@ -150,17 +150,17 @@ struct { \ #define CK_SLIST_FOREACH(var, head, field) \ for ((var) = CK_SLIST_FIRST((head)); \ - (var) && (ck_pr_fence_load(), 1); \ + (var); \ (var) = CK_SLIST_NEXT((var), field)) -#define CK_SLIST_FOREACH_SAFE(var, head, field, tvar) \ - for ((var) = CK_SLIST_FIRST(head); \ - (var) && (ck_pr_fence_load(), (tvar) = CK_SLIST_NEXT(var, field), 1);\ +#define CK_SLIST_FOREACH_SAFE(var, head, field, tvar) \ + for ((var) = CK_SLIST_FIRST(head); \ + (var) && ((tvar) = CK_SLIST_NEXT(var, field), 1); \ (var) = (tvar)) #define CK_SLIST_FOREACH_PREVPTR(var, varp, head, field) \ for ((varp) = &(head)->cslh_first; \ - ((var) = ck_pr_load_ptr(varp)) != NULL && (ck_pr_fence_load(), 1); \ + ((var) = ck_pr_load_ptr(varp)) != NULL; \ (varp) = &(var)->field.csle_next) #define CK_SLIST_INIT(head) do { \ @@ -259,12 +259,12 @@ struct { \ #define CK_STAILQ_FOREACH(var, head, field) \ for((var) = CK_STAILQ_FIRST((head)); \ - (var) && (ck_pr_fence_load(), 1); \ + (var); \ (var) = CK_STAILQ_NEXT((var), field)) #define CK_STAILQ_FOREACH_SAFE(var, head, field, tvar) \ for ((var) = CK_STAILQ_FIRST((head)); \ - (var) && (ck_pr_fence_load(), (tvar) = \ + (var) && ((tvar) = \ CK_STAILQ_NEXT((var), field), 1); \ (var) = (tvar)) @@ -371,12 +371,12 @@ struct { \ #define CK_LIST_FOREACH(var, head, field) \ for ((var) = CK_LIST_FIRST((head)); \ - (var) && (ck_pr_fence_load(), 1); \ + (var); \ (var) = CK_LIST_NEXT((var), field)) #define CK_LIST_FOREACH_SAFE(var, head, field, tvar) \ for ((var) = CK_LIST_FIRST((head)); \ - (var) && (ck_pr_fence_load(), (tvar) = CK_LIST_NEXT((var), field), 1);\ + (var) && ((tvar) = CK_LIST_NEXT((var), field), 1); \ (var) = (tvar)) #define CK_LIST_INIT(head) do { \ diff --git a/include/ck_ring.h b/include/ck_ring.h index e5f0712ef7cf..9f6754e0cd24 100644 --- a/include/ck_ring.h +++ b/include/ck_ring.h @@ -66,9 +66,56 @@ ck_ring_size(const struct ck_ring *ring) CK_CC_INLINE static unsigned int ck_ring_capacity(const struct ck_ring *ring) { + return ring->size; } +/* + * This function is only safe to call when there are no concurrent operations + * on the ring. This is primarily meant for persistent ck_ring use-cases. The + * function returns true if any mutations were performed on the ring. + */ +CK_CC_INLINE static bool +ck_ring_repair(struct ck_ring *ring) +{ + bool r = false; + + if (ring->p_tail != ring->p_head) { + ring->p_tail = ring->p_head; + r = true; + } + + return r; +} + +/* + * This can be called when no concurrent updates are occurring on the ring + * structure to check for consistency. This is primarily meant to be used for + * persistent storage of the ring. If this functions returns false, the ring + * is in an inconsistent state. + */ +CK_CC_INLINE static bool +ck_ring_valid(const struct ck_ring *ring) +{ + unsigned int size = ring->size; + unsigned int c_head = ring->c_head; + unsigned int p_head = ring->p_head; + + /* The ring must be a power of 2. */ + if (size & (size - 1)) + return false; + + /* The consumer counter must always be smaller than the producer. */ + if (c_head > p_head) + return false; + + /* The producer may only be up to size slots ahead of consumer. */ + if (p_head - c_head >= size) + return false; + + return true; +} + CK_CC_INLINE static void ck_ring_init(struct ck_ring *ring, unsigned int size) { @@ -84,6 +131,45 @@ ck_ring_init(struct ck_ring *ring, unsigned int size) /* * The _ck_ring_* namespace is internal only and must not used externally. */ + +/* + * This function will return a region of memory to write for the next value + * for a single producer. + */ +CK_CC_FORCE_INLINE static void * +_ck_ring_enqueue_reserve_sp(struct ck_ring *ring, + void *CK_CC_RESTRICT buffer, + unsigned int ts, + unsigned int *size) +{ + const unsigned int mask = ring->mask; + unsigned int consumer, producer, delta; + + consumer = ck_pr_load_uint(&ring->c_head); + producer = ring->p_tail; + delta = producer + 1; + if (size != NULL) + *size = (producer - consumer) & mask; + + if (CK_CC_UNLIKELY((delta & mask) == (consumer & mask))) + return NULL; + + return (char *)buffer + ts * (producer & mask); +} + +/* + * This is to be called to commit and make visible a region of previously + * reserved with reverse_sp. + */ +CK_CC_FORCE_INLINE static void +_ck_ring_enqueue_commit_sp(struct ck_ring *ring) +{ + + ck_pr_fence_store(); + ck_pr_store_uint(&ring->p_tail, ring->p_tail + 1); + return; +} + CK_CC_FORCE_INLINE static bool _ck_ring_enqueue_sp(struct ck_ring *ring, void *CK_CC_RESTRICT buffer, @@ -163,6 +249,65 @@ _ck_ring_dequeue_sc(struct ck_ring *ring, return true; } +CK_CC_FORCE_INLINE static void * +_ck_ring_enqueue_reserve_mp(struct ck_ring *ring, + void *buffer, + unsigned int ts, + unsigned int *ticket, + unsigned int *size) +{ + const unsigned int mask = ring->mask; + unsigned int producer, consumer, delta; + + producer = ck_pr_load_uint(&ring->p_head); + + for (;;) { + ck_pr_fence_load(); + consumer = ck_pr_load_uint(&ring->c_head); + + delta = producer + 1; + + if (CK_CC_LIKELY((producer - consumer) < mask)) { + if (ck_pr_cas_uint_value(&ring->p_head, + producer, delta, &producer) == true) { + break; + } + } else { + unsigned int new_producer; + + ck_pr_fence_load(); + new_producer = ck_pr_load_uint(&ring->p_head); + + if (producer == new_producer) { + if (size != NULL) + *size = (producer - consumer) & mask; + + return false; + } + + producer = new_producer; + } + } + + *ticket = producer; + if (size != NULL) + *size = (producer - consumer) & mask; + + return (char *)buffer + ts * (producer & mask); +} + +CK_CC_FORCE_INLINE static void +_ck_ring_enqueue_commit_mp(struct ck_ring *ring, unsigned int producer) +{ + + while (ck_pr_load_uint(&ring->p_tail) != producer) + ck_pr_stall(); + + ck_pr_fence_store(); + ck_pr_store_uint(&ring->p_tail, producer + 1); + return; +} + CK_CC_FORCE_INLINE static bool _ck_ring_enqueue_mp(struct ck_ring *ring, void *buffer, @@ -354,6 +499,33 @@ ck_ring_enqueue_spsc(struct ck_ring *ring, &entry, sizeof(entry), NULL); } +CK_CC_INLINE static void * +ck_ring_enqueue_reserve_spsc_size(struct ck_ring *ring, + struct ck_ring_buffer *buffer, + unsigned int *size) +{ + + return _ck_ring_enqueue_reserve_sp(ring, buffer, sizeof(void *), + size); +} + +CK_CC_INLINE static void * +ck_ring_enqueue_reserve_spsc(struct ck_ring *ring, + struct ck_ring_buffer *buffer) +{ + + return _ck_ring_enqueue_reserve_sp(ring, buffer, sizeof(void *), + NULL); +} + +CK_CC_INLINE static void +ck_ring_enqueue_commit_spsc(struct ck_ring *ring) +{ + + _ck_ring_enqueue_commit_sp(ring); + return; +} + CK_CC_INLINE static bool ck_ring_dequeue_spsc(struct ck_ring *ring, const struct ck_ring_buffer *buffer, @@ -375,8 +547,7 @@ ck_ring_enqueue_mpmc(struct ck_ring *ring, const void *entry) { - return _ck_ring_enqueue_mp(ring, buffer, &entry, - sizeof(entry), NULL); + return _ck_ring_enqueue_mp(ring, buffer, &entry, sizeof(entry), NULL); } CK_CC_INLINE static bool @@ -386,8 +557,37 @@ ck_ring_enqueue_mpmc_size(struct ck_ring *ring, unsigned int *size) { - return _ck_ring_enqueue_mp_size(ring, buffer, &entry, - sizeof(entry), size); + return _ck_ring_enqueue_mp_size(ring, buffer, &entry, sizeof(entry), + size); +} + +CK_CC_INLINE static void * +ck_ring_enqueue_reserve_mpmc(struct ck_ring *ring, + struct ck_ring_buffer *buffer, + unsigned int *ticket) +{ + + return _ck_ring_enqueue_reserve_mp(ring, buffer, sizeof(void *), + ticket, NULL); +} + +CK_CC_INLINE static void * +ck_ring_enqueue_reserve_mpmc_size(struct ck_ring *ring, + struct ck_ring_buffer *buffer, + unsigned int *ticket, + unsigned int *size) +{ + + return _ck_ring_enqueue_reserve_mp(ring, buffer, sizeof(void *), + ticket, size); +} + +CK_CC_INLINE static void +ck_ring_enqueue_commit_mpmc(struct ck_ring *ring, unsigned int ticket) +{ + + _ck_ring_enqueue_commit_mp(ring, ticket); + return; } CK_CC_INLINE static bool @@ -415,6 +615,31 @@ ck_ring_dequeue_mpmc(struct ck_ring *ring, * ring buffer containing pointers. Correctness is provided for any number of * consumers with up to one concurrent producer. */ +CK_CC_INLINE static void * +ck_ring_enqueue_reserve_spmc_size(struct ck_ring *ring, + struct ck_ring_buffer *buffer, + unsigned int *size) +{ + + return _ck_ring_enqueue_reserve_sp(ring, buffer, sizeof(void *), size); +} + +CK_CC_INLINE static void * +ck_ring_enqueue_reserve_spmc(struct ck_ring *ring, + struct ck_ring_buffer *buffer) +{ + + return _ck_ring_enqueue_reserve_sp(ring, buffer, sizeof(void *), NULL); +} + +CK_CC_INLINE static void +ck_ring_enqueue_commit_spmc(struct ck_ring *ring) +{ + + _ck_ring_enqueue_commit_sp(ring); + return; +} + CK_CC_INLINE static bool ck_ring_enqueue_spmc_size(struct ck_ring *ring, struct ck_ring_buffer *buffer, @@ -459,6 +684,35 @@ ck_ring_dequeue_spmc(struct ck_ring *ring, * ring buffer containing pointers. Correctness is provided for any number of * producers with up to one concurrent consumers. */ +CK_CC_INLINE static void * +ck_ring_enqueue_reserve_mpsc(struct ck_ring *ring, + struct ck_ring_buffer *buffer, + unsigned int *ticket) +{ + + return _ck_ring_enqueue_reserve_mp(ring, buffer, sizeof(void *), + ticket, NULL); +} + +CK_CC_INLINE static void * +ck_ring_enqueue_reserve_mpsc_size(struct ck_ring *ring, + struct ck_ring_buffer *buffer, + unsigned int *ticket, + unsigned int *size) +{ + + return _ck_ring_enqueue_reserve_mp(ring, buffer, sizeof(void *), + ticket, size); +} + +CK_CC_INLINE static void +ck_ring_enqueue_commit_mpsc(struct ck_ring *ring, unsigned int ticket) +{ + + _ck_ring_enqueue_commit_mp(ring, ticket); + return; +} + CK_CC_INLINE static bool ck_ring_enqueue_mpsc(struct ck_ring *ring, struct ck_ring_buffer *buffer, @@ -494,194 +748,290 @@ ck_ring_dequeue_mpsc(struct ck_ring *ring, * CK_RING_PROTOTYPE is used to define a type-safe interface for inlining * values of a particular type in the ring the buffer. */ -#define CK_RING_PROTOTYPE(name, type) \ -CK_CC_INLINE static bool \ -ck_ring_enqueue_spsc_size_##name(struct ck_ring *a, \ - struct type *b, \ - struct type *c, \ - unsigned int *d) \ -{ \ - \ - return _ck_ring_enqueue_sp_size(a, b, c, \ - sizeof(struct type), d); \ -} \ - \ -CK_CC_INLINE static bool \ -ck_ring_enqueue_spsc_##name(struct ck_ring *a, \ - struct type *b, \ - struct type *c) \ -{ \ - \ - return _ck_ring_enqueue_sp(a, b, c, \ - sizeof(struct type), NULL); \ -} \ - \ -CK_CC_INLINE static bool \ -ck_ring_dequeue_spsc_##name(struct ck_ring *a, \ - struct type *b, \ - struct type *c) \ -{ \ - \ - return _ck_ring_dequeue_sc(a, b, c, \ - sizeof(struct type)); \ -} \ - \ -CK_CC_INLINE static bool \ -ck_ring_enqueue_spmc_size_##name(struct ck_ring *a, \ - struct type *b, \ - struct type *c, \ - unsigned int *d) \ -{ \ - \ - return _ck_ring_enqueue_sp_size(a, b, c, \ - sizeof(struct type), d); \ -} \ - \ -CK_CC_INLINE static bool \ -ck_ring_enqueue_spmc_##name(struct ck_ring *a, \ - struct type *b, \ - struct type *c) \ -{ \ - \ - return _ck_ring_enqueue_sp(a, b, c, \ - sizeof(struct type), NULL); \ -} \ - \ -CK_CC_INLINE static bool \ -ck_ring_trydequeue_spmc_##name(struct ck_ring *a, \ - struct type *b, \ - struct type *c) \ -{ \ - \ - return _ck_ring_trydequeue_mc(a, \ - b, c, sizeof(struct type)); \ -} \ - \ -CK_CC_INLINE static bool \ -ck_ring_dequeue_spmc_##name(struct ck_ring *a, \ - struct type *b, \ - struct type *c) \ -{ \ - \ - return _ck_ring_dequeue_mc(a, b, c, \ - sizeof(struct type)); \ -} \ - \ -CK_CC_INLINE static bool \ -ck_ring_enqueue_mpsc_##name(struct ck_ring *a, \ - struct type *b, \ - struct type *c) \ -{ \ - \ - return _ck_ring_enqueue_mp(a, b, c, \ - sizeof(struct type), NULL); \ -} \ - \ -CK_CC_INLINE static bool \ -ck_ring_enqueue_mpsc_size_##name(struct ck_ring *a, \ - struct type *b, \ - struct type *c, \ - unsigned int *d) \ -{ \ - \ - return _ck_ring_enqueue_mp_size(a, b, c, \ - sizeof(struct type), d); \ -} \ - \ -CK_CC_INLINE static bool \ -ck_ring_dequeue_mpsc_##name(struct ck_ring *a, \ - struct type *b, \ - struct type *c) \ -{ \ - \ - return _ck_ring_dequeue_sc(a, b, c, \ - sizeof(struct type)); \ -} \ - \ -CK_CC_INLINE static bool \ -ck_ring_enqueue_mpmc_size_##name(struct ck_ring *a, \ - struct type *b, \ - struct type *c, \ - unsigned int *d) \ -{ \ - \ - return _ck_ring_enqueue_mp_size(a, b, c, \ - sizeof(struct type), d); \ -} \ - \ -CK_CC_INLINE static bool \ -ck_ring_enqueue_mpmc_##name(struct ck_ring *a, \ - struct type *b, \ - struct type *c) \ -{ \ - \ - return _ck_ring_enqueue_mp(a, b, c, \ - sizeof(struct type), NULL); \ -} \ - \ -CK_CC_INLINE static bool \ -ck_ring_trydequeue_mpmc_##name(struct ck_ring *a, \ - struct type *b, \ - struct type *c) \ -{ \ - \ - return _ck_ring_trydequeue_mc(a, \ - b, c, sizeof(struct type)); \ -} \ - \ -CK_CC_INLINE static bool \ -ck_ring_dequeue_mpmc_##name(struct ck_ring *a, \ - struct type *b, \ - struct type *c) \ -{ \ - \ - return _ck_ring_dequeue_mc(a, b, c, \ - sizeof(struct type)); \ +#define CK_RING_PROTOTYPE(name, type) \ +CK_CC_INLINE static struct type * \ +ck_ring_enqueue_reserve_spsc_##name(struct ck_ring *a, \ + struct type *b) \ +{ \ + \ + return _ck_ring_enqueue_reserve_sp(a, b, \ + sizeof(struct type), NULL); \ +} \ + \ +CK_CC_INLINE static struct type * \ +ck_ring_enqueue_reserve_spsc_size_##name(struct ck_ring *a, \ + struct type *b, \ + unsigned int *c) \ +{ \ + \ + return _ck_ring_enqueue_reserve_sp(a, b, \ + sizeof(struct type), c); \ +} \ + \ +CK_CC_INLINE static bool \ +ck_ring_enqueue_spsc_size_##name(struct ck_ring *a, \ + struct type *b, \ + struct type *c, \ + unsigned int *d) \ +{ \ + \ + return _ck_ring_enqueue_sp_size(a, b, c, \ + sizeof(struct type), d); \ +} \ + \ +CK_CC_INLINE static bool \ +ck_ring_enqueue_spsc_##name(struct ck_ring *a, \ + struct type *b, \ + struct type *c) \ +{ \ + \ + return _ck_ring_enqueue_sp(a, b, c, \ + sizeof(struct type), NULL); \ +} \ + \ +CK_CC_INLINE static bool \ +ck_ring_dequeue_spsc_##name(struct ck_ring *a, \ + struct type *b, \ + struct type *c) \ +{ \ + \ + return _ck_ring_dequeue_sc(a, b, c, \ + sizeof(struct type)); \ +} \ + \ +CK_CC_INLINE static struct type * \ +ck_ring_enqueue_reserve_spmc_##name(struct ck_ring *a, \ + struct type *b) \ +{ \ + \ + return _ck_ring_enqueue_reserve_sp(a, b, \ + sizeof(struct type), NULL); \ +} \ + \ +CK_CC_INLINE static struct type * \ +ck_ring_enqueue_reserve_spmc_size_##name(struct ck_ring *a, \ + struct type *b, \ + unsigned int *c) \ +{ \ + \ + return _ck_ring_enqueue_reserve_sp(a, b, \ + sizeof(struct type), c); \ +} \ + \ +CK_CC_INLINE static bool \ +ck_ring_enqueue_spmc_size_##name(struct ck_ring *a, \ + struct type *b, \ + struct type *c, \ + unsigned int *d) \ +{ \ + \ + return _ck_ring_enqueue_sp_size(a, b, c, \ + sizeof(struct type), d); \ +} \ + \ +CK_CC_INLINE static bool \ +ck_ring_enqueue_spmc_##name(struct ck_ring *a, \ + struct type *b, \ + struct type *c) \ +{ \ + \ + return _ck_ring_enqueue_sp(a, b, c, \ + sizeof(struct type), NULL); \ +} \ + \ +CK_CC_INLINE static bool \ +ck_ring_trydequeue_spmc_##name(struct ck_ring *a, \ + struct type *b, \ + struct type *c) \ +{ \ + \ + return _ck_ring_trydequeue_mc(a, \ + b, c, sizeof(struct type)); \ +} \ + \ +CK_CC_INLINE static bool \ +ck_ring_dequeue_spmc_##name(struct ck_ring *a, \ + struct type *b, \ + struct type *c) \ +{ \ + \ + return _ck_ring_dequeue_mc(a, b, c, \ + sizeof(struct type)); \ +} \ + \ +CK_CC_INLINE static struct type * \ +ck_ring_enqueue_reserve_mpsc_##name(struct ck_ring *a, \ + struct type *b, \ + unsigned int *c) \ +{ \ + \ + return _ck_ring_enqueue_reserve_mp(a, b, \ + sizeof(struct type), c, NULL); \ +} \ + \ +CK_CC_INLINE static struct type * \ +ck_ring_enqueue_reserve_mpsc_size_##name(struct ck_ring *a, \ + struct type *b, \ + unsigned int *c, \ + unsigned int *d) \ +{ \ + \ + return _ck_ring_enqueue_reserve_mp(a, b, \ + sizeof(struct type), c, d); \ +} \ + \ +CK_CC_INLINE static bool \ +ck_ring_enqueue_mpsc_##name(struct ck_ring *a, \ + struct type *b, \ + struct type *c) \ +{ \ + \ + return _ck_ring_enqueue_mp(a, b, c, \ + sizeof(struct type), NULL); \ +} \ + \ +CK_CC_INLINE static bool \ +ck_ring_enqueue_mpsc_size_##name(struct ck_ring *a, \ + struct type *b, \ + struct type *c, \ + unsigned int *d) \ +{ \ + \ + return _ck_ring_enqueue_mp_size(a, b, c, \ + sizeof(struct type), d); \ +} \ + \ +CK_CC_INLINE static bool \ +ck_ring_dequeue_mpsc_##name(struct ck_ring *a, \ + struct type *b, \ + struct type *c) \ +{ \ + \ + return _ck_ring_dequeue_sc(a, b, c, \ + sizeof(struct type)); \ +} \ + \ +CK_CC_INLINE static struct type * \ +ck_ring_enqueue_reserve_mpmc_##name(struct ck_ring *a, \ + struct type *b, \ + unsigned int *c) \ +{ \ + \ + return _ck_ring_enqueue_reserve_mp(a, b, \ + sizeof(struct type), c, NULL); \ +} \ + \ +CK_CC_INLINE static struct type * \ +ck_ring_enqueue_reserve_mpmc_size_##name(struct ck_ring *a, \ + struct type *b, \ + unsigned int *c, \ + unsigned int *d) \ +{ \ + \ + return _ck_ring_enqueue_reserve_mp(a, b, \ + sizeof(struct type), c, d); \ +} \ + \ +CK_CC_INLINE static bool \ +ck_ring_enqueue_mpmc_size_##name(struct ck_ring *a, \ + struct type *b, \ + struct type *c, \ + unsigned int *d) \ +{ \ + \ + return _ck_ring_enqueue_mp_size(a, b, c, \ + sizeof(struct type), d); \ +} \ + \ +CK_CC_INLINE static bool \ +ck_ring_enqueue_mpmc_##name(struct ck_ring *a, \ + struct type *b, \ + struct type *c) \ +{ \ + \ + return _ck_ring_enqueue_mp(a, b, c, \ + sizeof(struct type), NULL); \ +} \ + \ +CK_CC_INLINE static bool \ +ck_ring_trydequeue_mpmc_##name(struct ck_ring *a, \ + struct type *b, \ + struct type *c) \ +{ \ + \ + return _ck_ring_trydequeue_mc(a, \ + b, c, sizeof(struct type)); \ +} \ + \ +CK_CC_INLINE static bool \ +ck_ring_dequeue_mpmc_##name(struct ck_ring *a, \ + struct type *b, \ + struct type *c) \ +{ \ + \ + return _ck_ring_dequeue_mc(a, b, c, \ + sizeof(struct type)); \ } /* * A single producer with one concurrent consumer. */ -#define CK_RING_ENQUEUE_SPSC(name, a, b, c) \ +#define CK_RING_ENQUEUE_SPSC(name, a, b, c) \ ck_ring_enqueue_spsc_##name(a, b, c) -#define CK_RING_ENQUEUE_SPSC_SIZE(name, a, b, c, d) \ +#define CK_RING_ENQUEUE_SPSC_SIZE(name, a, b, c, d) \ ck_ring_enqueue_spsc_size_##name(a, b, c, d) -#define CK_RING_DEQUEUE_SPSC(name, a, b, c) \ +#define CK_RING_ENQUEUE_RESERVE_SPSC(name, a, b, c) \ + ck_ring_enqueue_reserve_spsc_##name(a, b, c) +#define CK_RING_ENQUEUE_RESERVE_SPSC_SIZE(name, a, b, c, d) \ + ck_ring_enqueue_reserve_spsc_size_##name(a, b, c, d) +#define CK_RING_DEQUEUE_SPSC(name, a, b, c) \ ck_ring_dequeue_spsc_##name(a, b, c) /* * A single producer with any number of concurrent consumers. */ -#define CK_RING_ENQUEUE_SPMC(name, a, b, c) \ +#define CK_RING_ENQUEUE_SPMC(name, a, b, c) \ ck_ring_enqueue_spmc_##name(a, b, c) -#define CK_RING_ENQUEUE_SPMC_SIZE(name, a, b, c, d) \ +#define CK_RING_ENQUEUE_SPMC_SIZE(name, a, b, c, d) \ ck_ring_enqueue_spmc_size_##name(a, b, c, d) -#define CK_RING_TRYDEQUEUE_SPMC(name, a, b, c) \ +#define CK_RING_ENQUEUE_RESERVE_SPMC(name, a, b, c) \ + ck_ring_enqueue_reserve_spmc_##name(a, b, c) +#define CK_RING_ENQUEUE_RESERVE_SPMC_SIZE(name, a, b, c, d) \ + ck_ring_enqueue_reserve_spmc_size_##name(a, b, c, d) +#define CK_RING_TRYDEQUEUE_SPMC(name, a, b, c) \ ck_ring_trydequeue_spmc_##name(a, b, c) -#define CK_RING_DEQUEUE_SPMC(name, a, b, c) \ +#define CK_RING_DEQUEUE_SPMC(name, a, b, c) \ ck_ring_dequeue_spmc_##name(a, b, c) /* * Any number of concurrent producers with up to one * concurrent consumer. */ -#define CK_RING_ENQUEUE_MPSC(name, a, b, c) \ +#define CK_RING_ENQUEUE_MPSC(name, a, b, c) \ ck_ring_enqueue_mpsc_##name(a, b, c) -#define CK_RING_ENQUEUE_MPSC_SIZE(name, a, b, c, d) \ +#define CK_RING_ENQUEUE_MPSC_SIZE(name, a, b, c, d) \ ck_ring_enqueue_mpsc_size_##name(a, b, c, d) -#define CK_RING_DEQUEUE_MPSC(name, a, b, c) \ +#define CK_RING_ENQUEUE_RESERVE_MPSC(name, a, b, c) \ + ck_ring_enqueue_reserve_mpsc_##name(a, b, c) +#define CK_RING_ENQUEUE_RESERVE_MPSC_SIZE(name, a, b, c, d) \ + ck_ring_enqueue_reserve_mpsc_size_##name(a, b, c, d) +#define CK_RING_DEQUEUE_MPSC(name, a, b, c) \ ck_ring_dequeue_mpsc_##name(a, b, c) /* * Any number of concurrent producers and consumers. */ -#define CK_RING_ENQUEUE_MPMC(name, a, b, c) \ +#define CK_RING_ENQUEUE_MPMC(name, a, b, c) \ ck_ring_enqueue_mpmc_##name(a, b, c) -#define CK_RING_ENQUEUE_MPMC_SIZE(name, a, b, c, d) \ +#define CK_RING_ENQUEUE_MPMC_SIZE(name, a, b, c, d) \ ck_ring_enqueue_mpmc_size_##name(a, b, c, d) -#define CK_RING_TRYDEQUEUE_MPMC(name, a, b, c) \ +#define CK_RING_ENQUEUE_RESERVE_MPMC(name, a, b, c) \ + ck_ring_enqueue_reserve_mpmc_##name(a, b, c) +#define CK_RING_ENQUEUE_RESERVE_MPMC_SIZE(name, a, b, c, d) \ + ck_ring_enqueue_reserve_mpmc_size_##name(a, b, c, d) +#define CK_RING_TRYDEQUEUE_MPMC(name, a, b, c) \ ck_ring_trydequeue_mpmc_##name(a, b, c) -#define CK_RING_DEQUEUE_MPMC(name, a, b, c) \ +#define CK_RING_DEQUEUE_MPMC(name, a, b, c) \ ck_ring_dequeue_mpmc_##name(a, b, c) #endif /* CK_RING_H */ diff --git a/include/gcc/aarch64/ck_pr.h b/include/gcc/aarch64/ck_pr.h index e739c4d5b18e..0a473072fffd 100644 --- a/include/gcc/aarch64/ck_pr.h +++ b/include/gcc/aarch64/ck_pr.h @@ -92,7 +92,7 @@ CK_PR_FENCE(unlock, CK_DMB_SY) ck_pr_md_load_##S(const M *target) \ { \ long r = 0; \ - __asm__ __volatile__(I " %w0, [%1];" \ + __asm__ __volatile__(I " %w0, [%1]\n" \ : "=r" (r) \ : "r" (target) \ : "memory"); \ @@ -103,7 +103,7 @@ CK_PR_FENCE(unlock, CK_DMB_SY) ck_pr_md_load_##S(const M *target) \ { \ long r = 0; \ - __asm__ __volatile__(I " %0, [%1];" \ + __asm__ __volatile__(I " %0, [%1]\n" \ : "=r" (r) \ : "r" (target) \ : "memory"); \ @@ -195,10 +195,10 @@ CK_PR_STORE_S_64(double, double, "str") T previous = 0; \ T tmp = 0; \ __asm__ __volatile__("1:" \ - "ldxr" W " %" R "0, [%2];" \ - "neg %" R "0, %" R "0;" \ - "stxr" W " %w1, %" R "0, [%2];" \ - "cbnz %w1, 1b;" \ + "ldxr" W " %" R "0, [%2]\n"\ + "neg %" R "0, %" R "0\n" \ + "stxr" W " %w1, %" R "0, [%2]\n" \ + "cbnz %w1, 1b\n" \ : "=&r" (previous), \ "=&r" (tmp) \ : "r" (target) \ diff --git a/include/gcc/aarch64/ck_pr_llsc.h b/include/gcc/aarch64/ck_pr_llsc.h index aa4e3090fa3a..6500d9661c08 100644 --- a/include/gcc/aarch64/ck_pr_llsc.h +++ b/include/gcc/aarch64/ck_pr_llsc.h @@ -38,17 +38,17 @@ ck_pr_cas_64_2_value(uint64_t target[2], uint64_t compare[2], uint64_t set[2], u uint64_t tmp1, tmp2; __asm__ __volatile__("1:" - "ldxp %0, %1, [%4];" - "mov %2, %0;" - "mov %3, %1;" - "eor %0, %0, %5;" - "eor %1, %1, %6;" - "orr %1, %0, %1;" - "mov %w0, #0;" - "cbnz %1, 2f;" - "stxp %w0, %7, %8, [%4];" - "cbnz %w0, 1b;" - "mov %w0, #1;" + "ldxp %0, %1, [%4]\n" + "mov %2, %0\n" + "mov %3, %1\n" + "eor %0, %0, %5\n" + "eor %1, %1, %6\n" + "orr %1, %0, %1\n" + "mov %w0, #0\n" + "cbnz %1, 2f\n" + "stxp %w0, %7, %8, [%4]\n" + "cbnz %w0, 1b\n" + "mov %w0, #1\n" "2:" : "=&r" (tmp1), "=&r" (tmp2), "=&r" (value[0]), "=&r" (value[1]) : "r" (target), "r" (compare[0]), "r" (compare[1]), "r" (set[0]), "r" (set[1]) @@ -72,15 +72,15 @@ ck_pr_cas_64_2(uint64_t target[2], uint64_t compare[2], uint64_t set[2]) uint64_t tmp1, tmp2; __asm__ __volatile__("1:" - "ldxp %0, %1, [%2];" - "eor %0, %0, %3;" - "eor %1, %1, %4;" - "orr %1, %0, %1;" - "mov %w0, #0;" - "cbnz %1, 2f;" - "stxp %w0, %5, %6, [%2];" - "cbnz %w0, 1b;" - "mov %w0, #1;" + "ldxp %0, %1, [%2]\n" + "eor %0, %0, %3\n" + "eor %1, %1, %4\n" + "orr %1, %0, %1\n" + "mov %w0, #0\n" + "cbnz %1, 2f\n" + "stxp %w0, %5, %6, [%2]\n" + "cbnz %w0, 1b\n" + "mov %w0, #1\n" "2:" : "=&r" (tmp1), "=&r" (tmp2) : "r" (target), "r" (compare[0]), "r" (compare[1]), "r" (set[0]), "r" (set[1]) @@ -103,12 +103,12 @@ ck_pr_cas_ptr_2(void *target, void *compare, void *set) { \ T previous; \ T tmp; \ - __asm__ __volatile__("1:" \ - "ldxr" W " %" R "0, [%2];" \ - "cmp %" R "0, %" R "4;" \ - "b.ne 2f;" \ - "stxr" W " %w1, %" R "3, [%2];" \ - "cbnz %w1, 1b;" \ + __asm__ __volatile__("1:\n" \ + "ldxr" W " %" R "0, [%2]\n" \ + "cmp %" R "0, %" R "4\n" \ + "b.ne 2f\n" \ + "stxr" W " %w1, %" R "3, [%2]\n" \ + "cbnz %w1, 1b\n" \ "2:" \ : "=&r" (previous), \ "=&r" (tmp) \ @@ -126,11 +126,11 @@ ck_pr_cas_ptr_2(void *target, void *compare, void *set) T tmp; \ __asm__ __volatile__( \ "1:" \ - "ldxr" W " %" R "0, [%2];" \ - "cmp %" R "0, %" R "4;" \ - "b.ne 2f;" \ - "stxr" W " %w1, %" R "3, [%2];" \ - "cbnz %w1, 1b;" \ + "ldxr" W " %" R "0, [%2]\n" \ + "cmp %" R "0, %" R "4\n" \ + "b.ne 2f\n" \ + "stxr" W " %w1, %" R "3, [%2]\n" \ + "cbnz %w1, 1b\n" \ "2:" \ : "=&r" (previous), \ "=&r" (tmp) \ @@ -167,9 +167,9 @@ CK_PR_CAS_S(char, char, "b", "w") T previous; \ T tmp; \ __asm__ __volatile__("1:" \ - "ldxr" W " %" R "0, [%2];" \ - "stxr" W " %w1, %" R "3, [%2];"\ - "cbnz %w1, 1b;" \ + "ldxr" W " %" R "0, [%2]\n"\ + "stxr" W " %w1, %" R "3, [%2]\n"\ + "cbnz %w1, 1b\n" \ : "=&r" (previous), \ "=&r" (tmp) \ : "r" (target), \ @@ -198,10 +198,10 @@ CK_PR_FAS(char, char, char, "b", "w") T previous = 0; \ T tmp = 0; \ __asm__ __volatile__("1:" \ - "ldxr" W " %" R "0, [%2];" \ - I ";" \ - "stxr" W " %w1, %" R "0, [%2];" \ - "cbnz %w1, 1b;" \ + "ldxr" W " %" R "0, [%2]\n"\ + I "\n" \ + "stxr" W " %w1, %" R "0, [%2]\n" \ + "cbnz %w1, 1b\n" \ : "=&r" (previous), \ "=&r" (tmp) \ : "r" (target) \ @@ -239,10 +239,10 @@ CK_PR_UNARY_S(char, char, "b") T previous; \ T tmp; \ __asm__ __volatile__("1:" \ - "ldxr" W " %" R "0, [%2];"\ - I " %" R "0, %" R "0, %" R "3;" \ - "stxr" W " %w1, %" R "0, [%2];" \ - "cbnz %w1, 1b;" \ + "ldxr" W " %" R "0, [%2]\n"\ + I " %" R "0, %" R "0, %" R "3\n" \ + "stxr" W " %w1, %" R "0, [%2]\n" \ + "cbnz %w1, 1b\n" \ : "=&r" (previous), \ "=&r" (tmp) \ : "r" (target), \ @@ -286,10 +286,10 @@ ck_pr_faa_ptr(void *target, uintptr_t delta) uintptr_t previous, r, tmp; __asm__ __volatile__("1:" - "ldxr %0, [%3];" - "add %1, %4, %0;" - "stxr %w2, %1, [%3];" - "cbnz %w2, 1b;" + "ldxr %0, [%3]\n" + "add %1, %4, %0\n" + "stxr %w2, %1, [%3]\n" + "cbnz %w2, 1b\n" : "=&r" (previous), "=&r" (r), "=&r" (tmp) @@ -306,9 +306,9 @@ ck_pr_faa_64(uint64_t *target, uint64_t delta) uint64_t previous, r, tmp; __asm__ __volatile__("1:" - "ldxr %0, [%3];" - "add %1, %4, %0;" - "stxr %w2, %1, [%3];" + "ldxr %0, [%3]\n" + "add %1, %4, %0\n" + "stxr %w2, %1, [%3]\n" "cbnz %w2, 1b;" : "=&r" (previous), "=&r" (r), @@ -326,10 +326,10 @@ ck_pr_faa_64(uint64_t *target, uint64_t delta) { \ T previous, r, tmp; \ __asm__ __volatile__("1:" \ - "ldxr" W " %w0, [%3];" \ - "add %w1, %w4, %w0;" \ - "stxr" W " %w2, %w1, [%3];" \ - "cbnz %w2, 1b;" \ + "ldxr" W " %w0, [%3]\n" \ + "add %w1, %w4, %w0\n" \ + "stxr" W " %w2, %w1, [%3]\n" \ + "cbnz %w2, 1b\n" \ : "=&r" (previous), \ "=&r" (r), \ "=&r" (tmp) \ diff --git a/include/gcc/aarch64/ck_pr_lse.h b/include/gcc/aarch64/ck_pr_lse.h index e2c9554c8b4a..e450e72d60ec 100644 --- a/include/gcc/aarch64/ck_pr_lse.h +++ b/include/gcc/aarch64/ck_pr_lse.h @@ -29,6 +29,7 @@ #ifndef CK_PR_AARCH64_LSE_H #define CK_PR_AARCH64_LSE_H +#error bite #ifndef CK_PR_H #error Do not include this file directly, use ck_pr.h #endif @@ -43,10 +44,10 @@ ck_pr_cas_64_2_value(uint64_t target[2], uint64_t compare[2], uint64_t set[2], u register uint64_t x2 __asm__ ("x2") = set[0]; register uint64_t x3 __asm__ ("x3") = set[1]; - __asm__ __volatile__("casp %0, %1, %4, %5, [%6];" - "eor %2, %0, %7;" - "eor %3, %1, %8;" - "orr %2, %2, %3;" + __asm__ __volatile__("casp %0, %1, %4, %5, [%6]\n" + "eor %2, %0, %7\n" + "eor %3, %1, %8\n" + "orr %2, %2, %3\n" : "+&r" (x0), "+&r" (x1), "=&r" (tmp1), "=&r" (tmp2) : "r" (x2), "r" (x3), "r" (target), "r" (compare[0]), "r" (compare[1]) : "memory"); @@ -74,10 +75,10 @@ ck_pr_cas_64_2(uint64_t target[2], uint64_t compare[2], uint64_t set[2]) register uint64_t x2 __asm__ ("x2") = set[0]; register uint64_t x3 __asm__ ("x3") = set[1]; - __asm__ __volatile__("casp %0, %1, %2, %3, [%4];" - "eor %0, %0, %5;" - "eor %1, %1, %6;" - "orr %0, %0, %1;" + __asm__ __volatile__("casp %0, %1, %2, %3, [%4]\n" + "eor %0, %0, %5\n" + "eor %1, %1, %6\n" + "orr %0, %0, %1\n" : "+&r" (x0), "+&r" (x1) : "r" (x2), "r" (x3), "r" (target), "r" (compare[0]), "r" (compare[1]) : "memory"); @@ -99,7 +100,7 @@ ck_pr_cas_ptr_2(void *target, void *compare, void *set) { \ *(T *)value = compare; \ __asm__ __volatile__( \ - "cas" W " %" R "0, %" R "2, [%1];" \ + "cas" W " %" R "0, %" R "2, [%1]\n"\ : "+&r" (*(T *)value) \ : "r" (target), \ "r" (set) \ @@ -111,7 +112,7 @@ ck_pr_cas_ptr_2(void *target, void *compare, void *set) { \ T previous = compare; \ __asm__ __volatile__( \ - "cas" W " %" R "0, %" R "2, [%1];" \ + "cas" W " %" R "0, %" R "2, [%1]\n"\ : "+&r" (previous) \ : "r" (target), \ "r" (set) \ @@ -144,7 +145,7 @@ CK_PR_CAS_S(char, char, "b", "w") { \ T previous; \ __asm__ __volatile__( \ - "swp" W " %" R "2, %" R "0, [%1];" \ + "swp" W " %" R "2, %" R "0, [%1]\n"\ : "=&r" (previous) \ : "r" (target), \ "r" (v) \ @@ -169,8 +170,8 @@ CK_PR_FAS(char, char, char, "b", "w") CK_CC_INLINE static void \ ck_pr_##O##_##N(M *target) \ { \ - __asm__ __volatile__(I ";" \ - "st" S W " " R "0, [%0];" \ + __asm__ __volatile__(I "\n" \ + "st" S W " " R "0, [%0]\n" \ : \ : "r" (target) \ : "x0", "memory"); \ @@ -204,8 +205,8 @@ CK_PR_UNARY_S(char, char, "b") CK_CC_INLINE static void \ ck_pr_##O##_##N(M *target, T delta) \ { \ - __asm__ __volatile__(I ";" \ - "st" S W " %" R "0, [%1];" \ + __asm__ __volatile__(I "\n" \ + "st" S W " %" R "0, [%1]\n"\ : "+&r" (delta) \ : "r" (target) \ : "memory"); \ @@ -247,7 +248,7 @@ ck_pr_faa_ptr(void *target, uintptr_t delta) uintptr_t previous; __asm__ __volatile__( - "ldadd %2, %0, [%1];" + "ldadd %2, %0, [%1]\n" : "=r" (previous) : "r" (target), "r" (delta) @@ -262,7 +263,7 @@ ck_pr_faa_64(uint64_t *target, uint64_t delta) uint64_t previous; __asm__ __volatile__( - "ldadd %2, %0, [%1];" + "ldadd %2, %0, [%1]\n" : "=r" (previous) : "r" (target), "r" (delta) @@ -277,7 +278,7 @@ ck_pr_faa_64(uint64_t *target, uint64_t delta) { \ T previous; \ __asm__ __volatile__( \ - "ldadd" W " %w2, %w0, [%1];" \ + "ldadd" W " %w2, %w0, [%1]\n" \ : "=r" (previous) \ : "r" (target), \ "r" (delta) \ diff --git a/include/gcc/ck_cc.h b/include/gcc/ck_cc.h index 6ebc59cb5921..0a6d17b93569 100644 --- a/include/gcc/ck_cc.h +++ b/include/gcc/ck_cc.h @@ -39,6 +39,15 @@ #define CK_CC_UNUSED __attribute__((unused)) #define CK_CC_USED __attribute__((used)) #define CK_CC_IMM "i" + +#define CK_CC_CONTAINER(F, T, M, N) \ + CK_CC_INLINE static T * \ + N(F *p) \ + { \ + \ + return (T *)(void *)((char *)p - __builtin_offsetof(T, M)); \ + } + #if defined(__x86_64__) || defined(__x86__) #define CK_CC_IMM_U32 "Z" #define CK_CC_IMM_S32 "e" diff --git a/include/gcc/x86/ck_pr.h b/include/gcc/x86/ck_pr.h index e678e830e0b4..12291c830dfd 100644 --- a/include/gcc/x86/ck_pr.h +++ b/include/gcc/x86/ck_pr.h @@ -120,7 +120,7 @@ CK_PR_FENCE(unlock, CK_MD_X86_MFENCE) return v; \ } -CK_PR_FAS(ptr, void, void *, char, "xchgl") +CK_PR_FAS(ptr, void, void *, uint32_t, "xchgl") #define CK_PR_FAS_S(S, T, I) CK_PR_FAS(S, T, T, T, I) @@ -146,7 +146,7 @@ CK_PR_FAS_S(8, uint8_t, "xchgb") return (r); \ } -CK_PR_LOAD(ptr, void, void *, char, "movl") +CK_PR_LOAD(ptr, void, void *, uint32_t, "movl") #define CK_PR_LOAD_S(S, T, I) CK_PR_LOAD(S, T, T, T, I) @@ -171,7 +171,7 @@ CK_PR_LOAD_S(8, uint8_t, "movb") return; \ } -CK_PR_STORE(ptr, void, const void *, char, "movl") +CK_PR_STORE(ptr, void, const void *, uint32_t, "movl") #define CK_PR_STORE_S(S, T, I) CK_PR_STORE(S, T, T, T, I) @@ -200,7 +200,7 @@ CK_PR_STORE_S(8, uint8_t, "movb") return (d); \ } -CK_PR_FAA(ptr, void, uintptr_t, char, "xaddl") +CK_PR_FAA(ptr, void, uintptr_t, uint32_t, "xaddl") #define CK_PR_FAA_S(S, T, I) CK_PR_FAA(S, T, T, T, I) @@ -239,7 +239,7 @@ CK_PR_FAA_S(8, uint8_t, "xaddb") bool ret; \ __asm__ __volatile__(CK_PR_LOCK_PREFIX I " %0; setz %1" \ : "+m" (*(C *)target), \ - "=rm" (ret) \ + "=qm" (ret) \ : \ : "memory", "cc"); \ return ret; \ @@ -248,7 +248,7 @@ CK_PR_FAA_S(8, uint8_t, "xaddb") #define CK_PR_UNARY_S(K, S, T, I) CK_PR_UNARY(K, S, T, T, I) #define CK_PR_GENERATE(K) \ - CK_PR_UNARY(K, ptr, void, char, #K "l") \ + CK_PR_UNARY(K, ptr, void, uint32_t, #K "l") \ CK_PR_UNARY_S(K, char, char, #K "b") \ CK_PR_UNARY_S(K, int, int, #K "l") \ CK_PR_UNARY_S(K, uint, unsigned int, #K "l") \ @@ -288,7 +288,7 @@ CK_PR_GENERATE(not) #define CK_PR_BINARY_S(K, S, T, I) CK_PR_BINARY(K, S, T, T, T, I) #define CK_PR_GENERATE(K) \ - CK_PR_BINARY(K, ptr, void, uintptr_t, char, #K "l") \ + CK_PR_BINARY(K, ptr, void, uintptr_t, uint32_t, #K "l") \ CK_PR_BINARY_S(K, char, char, #K "b") \ CK_PR_BINARY_S(K, int, int, #K "l") \ CK_PR_BINARY_S(K, uint, unsigned int, #K "l") \ @@ -307,8 +307,38 @@ CK_PR_GENERATE(xor) #undef CK_PR_BINARY /* - * Atomic compare and swap. + * Atomic compare and swap, with a variant that sets *v to the old value of target. */ +#ifdef __GCC_ASM_FLAG_OUTPUTS__ +#define CK_PR_CAS(S, M, T, C, I) \ + CK_CC_INLINE static bool \ + ck_pr_cas_##S(M *target, T compare, T set) \ + { \ + bool z; \ + __asm__ __volatile__(CK_PR_LOCK_PREFIX I " %3, %0" \ + : "+m" (*(C *)target), \ + "=@ccz" (z), \ + /* RAX is clobbered by cmpxchg. */ \ + "+a" (compare) \ + : "q" (set) \ + : "memory", "cc"); \ + return z; \ + } \ + \ + CK_CC_INLINE static bool \ + ck_pr_cas_##S##_value(M *target, T compare, T set, M *v) \ + { \ + bool z; \ + __asm__ __volatile__(CK_PR_LOCK_PREFIX I " %3, %0;" \ + : "+m" (*(C *)target), \ + "=@ccz" (z), \ + "+a" (compare) \ + : "q" (set) \ + : "memory", "cc"); \ + *(T *)v = compare; \ + return z; \ + } +#else #define CK_PR_CAS(S, M, T, C, I) \ CK_CC_INLINE static bool \ ck_pr_cas_##S(M *target, T compare, T set) \ @@ -321,9 +351,25 @@ CK_PR_GENERATE(xor) "a" (compare) \ : "memory", "cc"); \ return z; \ + } \ + \ + CK_CC_INLINE static bool \ + ck_pr_cas_##S##_value(M *target, T compare, T set, M *v) \ + { \ + bool z; \ + __asm__ __volatile__(CK_PR_LOCK_PREFIX I " %3, %0;" \ + "setz %1;" \ + : "+m" (*(C *)target), \ + "=q" (z), \ + "+a" (compare) \ + : "q" (set) \ + : "memory", "cc"); \ + *(T *)v = compare; \ + return z; \ } +#endif -CK_PR_CAS(ptr, void, void *, char, "cmpxchgl") +CK_PR_CAS(ptr, void, void *, uint32_t, "cmpxchgl") #define CK_PR_CAS_S(S, T, I) CK_PR_CAS(S, T, T, T, I) @@ -338,41 +384,6 @@ CK_PR_CAS_S(8, uint8_t, "cmpxchgb") #undef CK_PR_CAS /* - * Compare and swap, set *v to old value of target. - */ -#define CK_PR_CAS_O(S, M, T, C, I, R) \ - CK_CC_INLINE static bool \ - ck_pr_cas_##S##_value(M *target, T compare, T set, M *v) \ - { \ - bool z; \ - __asm__ __volatile__(CK_PR_LOCK_PREFIX "cmpxchg" I " %3, %0;" \ - "mov %% " R ", %2;" \ - "setz %1;" \ - : "+m" (*(C *)target), \ - "=a" (z), \ - "=m" (*(C *)v) \ - : "q" (set), \ - "a" (compare) \ - : "memory", "cc"); \ - return (bool)z; \ - } - -CK_PR_CAS_O(ptr, void, void *, char, "l", "eax") - -#define CK_PR_CAS_O_S(S, T, I, R) \ - CK_PR_CAS_O(S, T, T, T, I, R) - -CK_PR_CAS_O_S(char, char, "b", "al") -CK_PR_CAS_O_S(int, int, "l", "eax") -CK_PR_CAS_O_S(uint, unsigned int, "l", "eax") -CK_PR_CAS_O_S(32, uint32_t, "l", "eax") -CK_PR_CAS_O_S(16, uint16_t, "w", "ax") -CK_PR_CAS_O_S(8, uint8_t, "b", "al") - -#undef CK_PR_CAS_O_S -#undef CK_PR_CAS_O - -/* * Atomic bit test operations. */ #define CK_PR_BT(K, S, T, P, C, I) \ @@ -390,11 +401,11 @@ CK_PR_CAS_O_S(8, uint8_t, "b", "al") #define CK_PR_BT_S(K, S, T, I) CK_PR_BT(K, S, T, T, T, I) -#define CK_PR_GENERATE(K) \ - CK_PR_BT(K, ptr, void, uint32_t, char, #K "l %2, %0") \ - CK_PR_BT_S(K, uint, unsigned int, #K "l %2, %0") \ - CK_PR_BT_S(K, int, int, #K "l %2, %0") \ - CK_PR_BT_S(K, 32, uint32_t, #K "l %2, %0") \ +#define CK_PR_GENERATE(K) \ + CK_PR_BT(K, ptr, void, uint32_t, uint32_t, #K "l %2, %0") \ + CK_PR_BT_S(K, uint, unsigned int, #K "l %2, %0") \ + CK_PR_BT_S(K, int, int, #K "l %2, %0") \ + CK_PR_BT_S(K, 32, uint32_t, #K "l %2, %0") \ CK_PR_BT_S(K, 16, uint16_t, #K "w %w2, %0") CK_PR_GENERATE(btc) diff --git a/include/gcc/x86_64/ck_pr.h b/include/gcc/x86_64/ck_pr.h index fb2804e8d8e5..37678b12b44a 100644 --- a/include/gcc/x86_64/ck_pr.h +++ b/include/gcc/x86_64/ck_pr.h @@ -149,7 +149,7 @@ ck_pr_rfo(const void *m) return v; \ } -CK_PR_FAS(ptr, void, void *, char, "xchgq") +CK_PR_FAS(ptr, void, void *, uint64_t, "xchgq") #define CK_PR_FAS_S(S, T, I) CK_PR_FAS(S, T, T, T, I) @@ -182,7 +182,7 @@ CK_PR_FAS_S(8, uint8_t, "xchgb") return (r); \ } -CK_PR_LOAD(ptr, void, void *, char, "movq") +CK_PR_LOAD(ptr, void, void *, uint64_t, "movq") #define CK_PR_LOAD_S(S, T, I) CK_PR_LOAD(S, T, T, T, I) @@ -264,7 +264,7 @@ CK_PR_LOAD_2(8, 16, uint8_t) return; \ } -CK_PR_STORE_IMM(ptr, void, const void *, char, "movq", CK_CC_IMM_U32) +CK_PR_STORE_IMM(ptr, void, const void *, uint64_t, "movq", CK_CC_IMM_U32) #ifndef CK_PR_DISABLE_DOUBLE CK_PR_STORE(double, double, double, double, "movq") #endif @@ -298,7 +298,7 @@ CK_PR_STORE_S(8, uint8_t, "movb", CK_CC_IMM_U32) return (d); \ } -CK_PR_FAA(ptr, void, uintptr_t, char, "xaddq") +CK_PR_FAA(ptr, void, uintptr_t, uint64_t, "xaddq") #define CK_PR_FAA_S(S, T, I) CK_PR_FAA(S, T, T, T, I) @@ -347,7 +347,7 @@ CK_PR_FAA_S(8, uint8_t, "xaddb") #define CK_PR_UNARY_S(K, S, T, I) CK_PR_UNARY(K, S, T, T, I) #define CK_PR_GENERATE(K) \ - CK_PR_UNARY(K, ptr, void, char, #K "q") \ + CK_PR_UNARY(K, ptr, void, uint64_t, #K "q") \ CK_PR_UNARY_S(K, char, char, #K "b") \ CK_PR_UNARY_S(K, int, int, #K "l") \ CK_PR_UNARY_S(K, uint, unsigned int, #K "l") \ @@ -388,7 +388,7 @@ CK_PR_GENERATE(not) #define CK_PR_BINARY_S(K, S, T, I, O) CK_PR_BINARY(K, S, T, T, T, I, O) #define CK_PR_GENERATE(K) \ - CK_PR_BINARY(K, ptr, void, uintptr_t, char, #K "q", CK_CC_IMM_U32) \ + CK_PR_BINARY(K, ptr, void, uintptr_t, uint64_t, #K "q", CK_CC_IMM_U32) \ CK_PR_BINARY_S(K, char, char, #K "b", CK_CC_IMM_S32) \ CK_PR_BINARY_S(K, int, int, #K "l", CK_CC_IMM_S32) \ CK_PR_BINARY_S(K, uint, unsigned int, #K "l", CK_CC_IMM_U32) \ @@ -408,8 +408,38 @@ CK_PR_GENERATE(xor) #undef CK_PR_BINARY /* - * Atomic compare and swap. + * Atomic compare and swap, with a variant that sets *v to the old value of target. */ +#ifdef __GCC_ASM_FLAG_OUTPUTS__ +#define CK_PR_CAS(S, M, T, C, I) \ + CK_CC_INLINE static bool \ + ck_pr_cas_##S(M *target, T compare, T set) \ + { \ + bool z; \ + __asm__ __volatile__(CK_PR_LOCK_PREFIX I " %3, %0" \ + : "+m" (*(C *)target), \ + "=@ccz" (z), \ + /* RAX is clobbered by cmpxchg. */ \ + "+a" (compare) \ + : "q" (set) \ + : "memory", "cc"); \ + return z; \ + } \ + \ + CK_CC_INLINE static bool \ + ck_pr_cas_##S##_value(M *target, T compare, T set, M *v) \ + { \ + bool z; \ + __asm__ __volatile__(CK_PR_LOCK_PREFIX I " %3, %0;" \ + : "+m" (*(C *)target), \ + "=@ccz" (z), \ + "+a" (compare) \ + : "q" (set) \ + : "memory", "cc"); \ + *(T *)v = compare; \ + return z; \ + } +#else #define CK_PR_CAS(S, M, T, C, I) \ CK_CC_INLINE static bool \ ck_pr_cas_##S(M *target, T compare, T set) \ @@ -422,9 +452,25 @@ CK_PR_GENERATE(xor) "a" (compare) \ : "memory", "cc"); \ return z; \ + } \ + \ + CK_CC_INLINE static bool \ + ck_pr_cas_##S##_value(M *target, T compare, T set, M *v) \ + { \ + bool z; \ + __asm__ __volatile__(CK_PR_LOCK_PREFIX I " %3, %0;" \ + "setz %1;" \ + : "+m" (*(C *)target), \ + "=q" (z), \ + "+a" (compare) \ + : "q" (set) \ + : "memory", "cc"); \ + *(T *)v = compare; \ + return z; \ } +#endif -CK_PR_CAS(ptr, void, void *, char, "cmpxchgq") +CK_PR_CAS(ptr, void, void *, uint64_t, "cmpxchgq") #define CK_PR_CAS_S(S, T, I) CK_PR_CAS(S, T, T, T, I) @@ -443,45 +489,6 @@ CK_PR_CAS_S(8, uint8_t, "cmpxchgb") #undef CK_PR_CAS /* - * Compare and swap, set *v to old value of target. - */ -#define CK_PR_CAS_O(S, M, T, C, I, R) \ - CK_CC_INLINE static bool \ - ck_pr_cas_##S##_value(M *target, T compare, T set, M *v) \ - { \ - bool z; \ - __asm__ __volatile__(CK_PR_LOCK_PREFIX "cmpxchg" I " %3, %0;" \ - "mov %% " R ", %2;" \ - "setz %1;" \ - : "+m" (*(C *)target), \ - "=a" (z), \ - "=m" (*(C *)v) \ - : "q" (set), \ - "a" (compare) \ - : "memory", "cc"); \ - return z; \ - } - -CK_PR_CAS_O(ptr, void, void *, char, "q", "rax") - -#define CK_PR_CAS_O_S(S, T, I, R) \ - CK_PR_CAS_O(S, T, T, T, I, R) - -CK_PR_CAS_O_S(char, char, "b", "al") -CK_PR_CAS_O_S(int, int, "l", "eax") -CK_PR_CAS_O_S(uint, unsigned int, "l", "eax") -#ifndef CK_PR_DISABLE_DOUBLE -CK_PR_CAS_O_S(double, double, "q", "rax") -#endif -CK_PR_CAS_O_S(64, uint64_t, "q", "rax") -CK_PR_CAS_O_S(32, uint32_t, "l", "eax") -CK_PR_CAS_O_S(16, uint16_t, "w", "ax") -CK_PR_CAS_O_S(8, uint8_t, "b", "al") - -#undef CK_PR_CAS_O_S -#undef CK_PR_CAS_O - -/* * Contrary to C-interface, alignment requirements are that of uint64_t[2]. */ CK_CC_INLINE static bool @@ -587,12 +594,12 @@ CK_PR_CAS_V(8, 16, uint8_t) #define CK_PR_BT_S(K, S, T, I) CK_PR_BT(K, S, T, T, T, I) -#define CK_PR_GENERATE(K) \ - CK_PR_BT(K, ptr, void, uint64_t, char, #K "q %2, %0") \ - CK_PR_BT_S(K, uint, unsigned int, #K "l %2, %0") \ - CK_PR_BT_S(K, int, int, #K "l %2, %0") \ - CK_PR_BT_S(K, 64, uint64_t, #K "q %2, %0") \ - CK_PR_BT_S(K, 32, uint32_t, #K "l %2, %0") \ +#define CK_PR_GENERATE(K) \ + CK_PR_BT(K, ptr, void, uint64_t, uint64_t, #K "q %2, %0") \ + CK_PR_BT_S(K, uint, unsigned int, #K "l %2, %0") \ + CK_PR_BT_S(K, int, int, #K "l %2, %0") \ + CK_PR_BT_S(K, 64, uint64_t, #K "q %2, %0") \ + CK_PR_BT_S(K, 32, uint32_t, #K "l %2, %0") \ CK_PR_BT_S(K, 16, uint16_t, #K "w %w2, %0") CK_PR_GENERATE(btc) diff --git a/include/spinlock/fas.h b/include/spinlock/fas.h index 4e6c1230eaf1..bfe91fed2f9f 100644 --- a/include/spinlock/fas.h +++ b/include/spinlock/fas.h @@ -77,10 +77,11 @@ CK_CC_INLINE static void ck_spinlock_fas_lock(struct ck_spinlock_fas *lock) { - while (ck_pr_fas_uint(&lock->value, true) == true) { - while (ck_pr_load_uint(&lock->value) == true) - ck_pr_stall(); - } + while (CK_CC_UNLIKELY(ck_pr_fas_uint(&lock->value, true) == true)) { + do { + ck_pr_stall(); + } while (ck_pr_load_uint(&lock->value) == true); + } ck_pr_fence_lock(); return; diff --git a/src/ck_ec.c b/src/ck_ec.c new file mode 100644 index 000000000000..9b24e762947c --- /dev/null +++ b/src/ck_ec.c @@ -0,0 +1,425 @@ +#include <ck_ec.h> +#include <ck_limits.h> + +#include "ck_ec_timeutil.h" + +#define DEFAULT_BUSY_LOOP_ITER 100U + +/* + * The 2ms, 8x/iter default parameter hit 1.024 seconds after 3 + * iterations. + */ +#define DEFAULT_INITIAL_WAIT_NS 2000000L /* Start at 2 ms */ +/* Grow the wait time 8x/iteration. */ +#define DEFAULT_WAIT_SCALE_FACTOR 8 +#define DEFAULT_WAIT_SHIFT_COUNT 0 + +struct ck_ec32_slow_path_state { + struct ck_ec32 *ec; + uint32_t flagged_word; +}; + +#ifdef CK_F_EC64 +struct ck_ec64_slow_path_state { + struct ck_ec64 *ec; + uint64_t flagged_word; +}; +#endif + +/* Once we've waited for >= 1 sec, go for the full deadline. */ +static const struct timespec final_wait_time = { + .tv_sec = 1 +}; + +void +ck_ec32_wake(struct ck_ec32 *ec, const struct ck_ec_ops *ops) +{ + /* Spurious wake-ups are OK. Clear the flag before futexing. */ + ck_pr_and_32(&ec->counter, (1U << 31) - 1); + ops->wake32(ops, &ec->counter); + return; +} + +int +ck_ec32_wait_slow(struct ck_ec32 *ec, + const struct ck_ec_ops *ops, + uint32_t old_value, + const struct timespec *deadline) +{ + return ck_ec32_wait_pred_slow(ec, ops, old_value, + NULL, NULL, deadline); +} + +#ifdef CK_F_EC64 +void +ck_ec64_wake(struct ck_ec64 *ec, const struct ck_ec_ops *ops) +{ + ck_pr_and_64(&ec->counter, ~1); + ops->wake64(ops, &ec->counter); + return; +} + +int +ck_ec64_wait_slow(struct ck_ec64 *ec, + const struct ck_ec_ops *ops, + uint64_t old_value, + const struct timespec *deadline) +{ + return ck_ec64_wait_pred_slow(ec, ops, old_value, + NULL, NULL, deadline); +} +#endif + +int +ck_ec_deadline_impl(struct timespec *new_deadline, + const struct ck_ec_ops *ops, + const struct timespec *timeout) +{ + struct timespec now; + int r; + + if (timeout == NULL) { + new_deadline->tv_sec = TIME_MAX; + new_deadline->tv_nsec = NSEC_MAX; + return 0; + } + + r = ops->gettime(ops, &now); + if (r != 0) { + return -1; + } + + *new_deadline = timespec_add(now, *timeout); + return 0; +} + +/* The rest of the file implements wait_pred_slow. */ + +/* + * Returns a timespec value for deadline_ptr. If deadline_ptr is NULL, + * returns a timespec far in the future. + */ +static struct timespec +canonical_deadline(const struct timespec *deadline_ptr) +{ + + if (deadline_ptr == NULL) { + return (struct timespec) { .tv_sec = TIME_MAX }; + } + + return *deadline_ptr; +} + +/* + * Really slow (sleeping) path for ck_ec_wait. Drives the exponential + * backoff scheme to sleep for longer and longer periods of time, + * until either the sleep function returns true (the eventcount's + * value has changed), or the predicate returns non-0 (something else + * has changed). + * + * If deadline is ever reached, returns -1 (timeout). + * + * TODO: add some form of randomisation to the intermediate timeout + * values. + */ +static int +exponential_backoff(struct ck_ec_wait_state *wait_state, + bool (*sleep)(const void *sleep_state, + const struct ck_ec_wait_state *wait_state, + const struct timespec *partial_deadline), + const void *sleep_state, + int (*pred)(const struct ck_ec_wait_state *state, + struct timespec *deadline), + const struct timespec *deadline) +{ + struct timespec begin; + struct timespec stop_backoff; + const struct ck_ec_ops *ops = wait_state->ops; + const uint32_t scale_factor = (ops->wait_scale_factor != 0) + ? ops->wait_scale_factor + : DEFAULT_WAIT_SCALE_FACTOR; + const uint32_t shift_count = (ops->wait_shift_count != 0) + ? ops->wait_shift_count + : DEFAULT_WAIT_SHIFT_COUNT; + uint32_t wait_ns = (ops->initial_wait_ns != 0) + ? ops->initial_wait_ns + : DEFAULT_INITIAL_WAIT_NS; + bool first = true; + + for (;;) { + struct timespec now; + struct timespec partial_deadline; + + if (check_deadline(&now, ops, *deadline) == true) { + /* Timeout. Bail out. */ + return -1; + } + + if (first) { + begin = now; + wait_state->start = begin; + stop_backoff = timespec_add(begin, final_wait_time); + first = false; + } + + wait_state->now = now; + if (timespec_cmp(now, stop_backoff) >= 0) { + partial_deadline = *deadline; + } else { + do { + partial_deadline = + timespec_add_ns(begin, wait_ns); + wait_ns = + wait_time_scale(wait_ns, + scale_factor, + shift_count); + } while (timespec_cmp(partial_deadline, now) <= 0); + } + + if (pred != NULL) { + int r = pred(wait_state, &partial_deadline); + if (r != 0) { + return r; + } + } + + /* Canonicalize deadlines in the far future to NULL. */ + if (sleep(sleep_state, wait_state, + ((partial_deadline.tv_sec == TIME_MAX) + ? NULL : &partial_deadline)) == true) { + return 0; + } + } +} + +/* + * Loops up to BUSY_LOOP_ITER times, or until ec's counter value + * (including the flag) differs from old_value. + * + * Returns the new value in ec. + */ +#define DEF_WAIT_EASY(W) \ + static uint##W##_t ck_ec##W##_wait_easy(struct ck_ec##W* ec, \ + const struct ck_ec_ops *ops, \ + uint##W##_t expected) \ + { \ + uint##W##_t current = ck_pr_load_##W(&ec->counter); \ + size_t n = (ops->busy_loop_iter != 0) \ + ? ops->busy_loop_iter \ + : DEFAULT_BUSY_LOOP_ITER; \ + \ + for (size_t i = 0; \ + i < n && current == expected; \ + i++) { \ + ck_pr_stall(); \ + current = ck_pr_load_##W(&ec->counter); \ + } \ + \ + return current; \ + } + +DEF_WAIT_EASY(32) +#ifdef CK_F_EC64 +DEF_WAIT_EASY(64) +#endif +#undef DEF_WAIT_EASY +/* + * Attempts to upgrade ec->counter from unflagged to flagged. + * + * Returns true if the event count has changed. Otherwise, ec's + * counter word is equal to flagged on return, or has been at some + * time before the return. + */ +#define DEF_UPGRADE(W) \ + static bool ck_ec##W##_upgrade(struct ck_ec##W* ec, \ + uint##W##_t current, \ + uint##W##_t unflagged, \ + uint##W##_t flagged) \ + { \ + uint##W##_t old_word; \ + \ + if (current == flagged) { \ + /* Nothing to do, no change. */ \ + return false; \ + } \ + \ + if (current != unflagged) { \ + /* We have a different counter value! */ \ + return true; \ + } \ + \ + /* \ + * Flag the counter value. The CAS only fails if the \ + * counter is already flagged, or has a new value. \ + */ \ + return (ck_pr_cas_##W##_value(&ec->counter, \ + unflagged, flagged, \ + &old_word) == false && \ + old_word != flagged); \ + } + +DEF_UPGRADE(32) +#ifdef CK_F_EC64 +DEF_UPGRADE(64) +#endif +#undef DEF_UPGRADE + +/* + * Blocks until partial_deadline on the ck_ec. Returns true if the + * eventcount's value has changed. If partial_deadline is NULL, wait + * forever. + */ +static bool +ck_ec32_wait_slow_once(const void *vstate, + const struct ck_ec_wait_state *wait_state, + const struct timespec *partial_deadline) +{ + const struct ck_ec32_slow_path_state *state = vstate; + const struct ck_ec32 *ec = state->ec; + const uint32_t flagged_word = state->flagged_word; + + wait_state->ops->wait32(wait_state, &ec->counter, + flagged_word, partial_deadline); + return ck_pr_load_32(&ec->counter) != flagged_word; +} + +#ifdef CK_F_EC64 +static bool +ck_ec64_wait_slow_once(const void *vstate, + const struct ck_ec_wait_state *wait_state, + const struct timespec *partial_deadline) +{ + const struct ck_ec64_slow_path_state *state = vstate; + const struct ck_ec64 *ec = state->ec; + const uint64_t flagged_word = state->flagged_word; + + /* futex_wait will only compare the low 32 bits. Perform a + * full comparison here to maximise the changes of catching an + * ABA in the low 32 bits. + */ + if (ck_pr_load_64(&ec->counter) != flagged_word) { + return true; + } + + wait_state->ops->wait64(wait_state, &ec->counter, + flagged_word, partial_deadline); + return ck_pr_load_64(&ec->counter) != flagged_word; +} +#endif + +/* + * The full wait logic is a lot of code (> 1KB). Encourage the + * compiler to lay this all out linearly with LIKELY annotations on + * every early exit. + */ +#define WAIT_SLOW_BODY(W, ec, ops, pred, data, deadline_ptr, \ + old_value, unflagged, flagged) \ + do { \ + struct ck_ec_wait_state wait_state = { \ + .ops = ops, \ + .data = data \ + }; \ + const struct ck_ec##W##_slow_path_state state = { \ + .ec = ec, \ + .flagged_word = flagged \ + }; \ + const struct timespec deadline = \ + canonical_deadline(deadline_ptr); \ + \ + /* Detect infinite past deadlines. */ \ + if (CK_CC_LIKELY(deadline.tv_sec <= 0)) { \ + return -1; \ + } \ + \ + for (;;) { \ + uint##W##_t current; \ + int r; \ + \ + current = ck_ec##W##_wait_easy(ec, ops, unflagged); \ + \ + /* \ + * We're about to wait harder (i.e., \ + * potentially with futex). Make sure the \ + * counter word is flagged. \ + */ \ + if (CK_CC_LIKELY( \ + ck_ec##W##_upgrade(ec, current, \ + unflagged, flagged) == true)) { \ + ck_pr_fence_acquire(); \ + return 0; \ + } \ + \ + /* \ + * By now, ec->counter == flagged_word (at \ + * some point in the past). Spin some more to \ + * heuristically let any in-flight SP inc/add \ + * to retire. This does not affect \ + * correctness, but practically eliminates \ + * lost wake-ups. \ + */ \ + current = ck_ec##W##_wait_easy(ec, ops, flagged); \ + if (CK_CC_LIKELY(current != flagged_word)) { \ + ck_pr_fence_acquire(); \ + return 0; \ + } \ + \ + r = exponential_backoff(&wait_state, \ + ck_ec##W##_wait_slow_once, \ + &state, \ + pred, &deadline); \ + if (r != 0) { \ + return r; \ + } \ + \ + if (ck_ec##W##_value(ec) != old_value) { \ + ck_pr_fence_acquire(); \ + return 0; \ + } \ + \ + /* Spurious wake-up. Redo the slow path. */ \ + } \ + } while (0) + +int +ck_ec32_wait_pred_slow(struct ck_ec32 *ec, + const struct ck_ec_ops *ops, + uint32_t old_value, + int (*pred)(const struct ck_ec_wait_state *state, + struct timespec *deadline), + void *data, + const struct timespec *deadline_ptr) +{ + const uint32_t unflagged_word = old_value; + const uint32_t flagged_word = old_value | (1UL << 31); + + if (CK_CC_UNLIKELY(ck_ec32_value(ec) != old_value)) { + return 0; + } + + WAIT_SLOW_BODY(32, ec, ops, pred, data, deadline_ptr, + old_value, unflagged_word, flagged_word); +} + +#ifdef CK_F_EC64 +int +ck_ec64_wait_pred_slow(struct ck_ec64 *ec, + const struct ck_ec_ops *ops, + uint64_t old_value, + int (*pred)(const struct ck_ec_wait_state *state, + struct timespec *deadline), + void *data, + const struct timespec *deadline_ptr) +{ + const uint64_t unflagged_word = old_value << 1; + const uint64_t flagged_word = unflagged_word | 1; + + if (CK_CC_UNLIKELY(ck_ec64_value(ec) != old_value)) { + return 0; + } + + WAIT_SLOW_BODY(64, ec, ops, pred, data, deadline_ptr, + old_value, unflagged_word, flagged_word); +} +#endif + +#undef WAIT_SLOW_BODY diff --git a/src/ck_ec_timeutil.h b/src/ck_ec_timeutil.h new file mode 100644 index 000000000000..50cfb67bf4a4 --- /dev/null +++ b/src/ck_ec_timeutil.h @@ -0,0 +1,150 @@ +#ifndef CK_EC_TIMEUTIL_H +#define CK_EC_TIMEUTIL_H +#include <ck_cc.h> +#include <ck_ec.h> +#include <ck_limits.h> +#include <ck_stdint.h> +#include <sys/time.h> + +#define TIME_MAX ((time_t)((1ULL << ((sizeof(time_t) * CHAR_BIT) - 1)) - 1)) +#define NSEC_MAX ((1000L * 1000 * 1000) - 1) + +/* + * Approximates (nsec * multiplier) >> shift. Clamps to UINT32_MAX on + * overflow. + */ +CK_CC_UNUSED static uint32_t +wait_time_scale(uint32_t nsec, + uint32_t multiplier, + unsigned int shift) +{ + uint64_t temp = (uint64_t)nsec * multiplier; + uint64_t max = (uint64_t)UINT32_MAX << shift; + + if (temp >= max) { + return UINT32_MAX; + } + + return temp >> shift; +} + + +/* + * Returns ts + ns. ns is clamped to at most 1 second. Clamps the + * return value to TIME_MAX, NSEC_MAX on overflow. + * + */ +CK_CC_UNUSED static struct timespec timespec_add_ns(const struct timespec ts, + uint32_t ns) +{ + struct timespec ret = { + .tv_sec = TIME_MAX, + .tv_nsec = NSEC_MAX + }; + time_t sec; + uint32_t sum_ns; + + if (ns > (uint32_t)NSEC_MAX) { + if (ts.tv_sec >= TIME_MAX) { + return ret; + } + + ret.tv_sec = ts.tv_sec + 1; + ret.tv_nsec = ts.tv_nsec; + return ret; + } + + sec = ts.tv_sec; + sum_ns = ns + ts.tv_nsec; + if (sum_ns > NSEC_MAX) { + if (sec >= TIME_MAX) { + return ret; + } + + sec++; + sum_ns -= (NSEC_MAX + 1); + } + + ret.tv_sec = sec; + ret.tv_nsec = sum_ns; + return ret; +} + + +/* + * Returns ts + inc. If inc is negative, it is normalized to 0. + * Clamps the return value to TIME_MAX, NSEC_MAX on overflow. + */ +CK_CC_UNUSED static struct timespec timespec_add(const struct timespec ts, + const struct timespec inc) +{ + /* Initial return value is clamped to infinite future. */ + struct timespec ret = { + .tv_sec = TIME_MAX, + .tv_nsec = NSEC_MAX + }; + time_t sec; + unsigned long nsec; + + /* Non-positive delta is a no-op. Invalid nsec is another no-op. */ + if (inc.tv_sec < 0 || inc.tv_nsec < 0 || inc.tv_nsec > NSEC_MAX) { + return ts; + } + + /* Detect overflow early. */ + if (inc.tv_sec > TIME_MAX - ts.tv_sec) { + return ret; + } + + sec = ts.tv_sec + inc.tv_sec; + /* This sum can't overflow if the inputs are valid.*/ + nsec = (unsigned long)ts.tv_nsec + inc.tv_nsec; + + if (nsec > NSEC_MAX) { + if (sec >= TIME_MAX) { + return ret; + } + + sec++; + nsec -= (NSEC_MAX + 1); + } + + ret.tv_sec = sec; + ret.tv_nsec = nsec; + return ret; +} + +/* Compares two timespecs. Returns -1 if x < y, 0 if x == y, and 1 if x > y. */ +CK_CC_UNUSED static int timespec_cmp(const struct timespec x, + const struct timespec y) +{ + if (x.tv_sec != y.tv_sec) { + return (x.tv_sec < y.tv_sec) ? -1 : 1; + } + + if (x.tv_nsec != y.tv_nsec) { + return (x.tv_nsec < y.tv_nsec) ? -1 : 1; + } + + return 0; +} + +/* + * Overwrites now with the current CLOCK_MONOTONIC time, and returns + * true if the current time is greater than or equal to the deadline, + * or the clock is somehow broken. + */ +CK_CC_UNUSED static bool check_deadline(struct timespec *now, + const struct ck_ec_ops *ops, + const struct timespec deadline) +{ + int r; + + r = ops->gettime(ops, now); + if (r != 0) { + return true; + } + + return timespec_cmp(*now, deadline) >= 0; +} +#endif /* !CK_EC_TIMEUTIL_H */ diff --git a/src/ck_hs.c b/src/ck_hs.c index a7e15eaddbeb..246bceb2a0ab 100644 --- a/src/ck_hs.c +++ b/src/ck_hs.c @@ -106,9 +106,11 @@ ck_hs_map_signal(struct ck_hs_map *map, unsigned long h) } static bool -_ck_hs_next(struct ck_hs *hs, struct ck_hs_map *map, struct ck_hs_iterator *i, void **key) +_ck_hs_next(struct ck_hs *hs, struct ck_hs_map *map, + struct ck_hs_iterator *i, void **key) { void *value; + if (i->offset >= map->capacity) return false; @@ -143,6 +145,7 @@ ck_hs_iterator_init(struct ck_hs_iterator *iterator) bool ck_hs_next(struct ck_hs *hs, struct ck_hs_iterator *i, void **key) { + return _ck_hs_next(hs, hs->map, i, key); } @@ -150,9 +153,11 @@ bool ck_hs_next_spmc(struct ck_hs *hs, struct ck_hs_iterator *i, void **key) { struct ck_hs_map *m = i->map; + if (m == NULL) { m = i->map = ck_pr_load_ptr(&hs->map); } + return _ck_hs_next(hs, m, i, key); } diff --git a/src/ck_ht.c b/src/ck_ht.c index 48b04c9678d9..66c7315038c1 100644 --- a/src/ck_ht.c +++ b/src/ck_ht.c @@ -30,9 +30,6 @@ /* * This implementation borrows several techniques from Josh Dybnis's * nbds library which can be found at http://code.google.com/p/nbds - * - * This release currently only includes support for 64-bit platforms. - * We can address 32-bit platforms in a future release. */ #include <ck_cc.h> #include <ck_md.h> |