urcu-qsbr's read site's quiescence is much longer than normal urcu ==>
synchronize_rcu() is much slower ==>
rcu_gp_ctr is updated much less ==>
the whole urcu-qsbr will not be slowed down by false sharing of rcu_gp_ctr.
But this patch makes sense to keep the code of urcu-qsbr like normal urcu,
better readability and maintenance.
Test: (4*6 CPUs)
Before patch:
[root@localhost userspace-rcu]# ./tests/test_urcu_qsbr 20 1 20
SUMMARY ./tests/test_urcu_qsbr testdur 20 nr_readers 20 rdur 0 wdur 0 nr_writers 1 wdelay 0 nr_reads
65498297587 nr_writes
2000665 nr_ops
65500298252
[root@localhost userspace-rcu]# ./tests/test_urcu_qsbr 20 1 20
SUMMARY ./tests/test_urcu_qsbr testdur 20 nr_readers 20 rdur 0 wdur 0 nr_writers 1 wdelay 0 nr_reads
67218079467 nr_writes
1981593 nr_ops
67220061060
After patch
./tests/test_urcu_qsbr 20 1 20
SUMMARY ./tests/test_urcu_qsbr testdur 20 nr_readers 20 rdur 0 wdur 0 nr_writers 1 wdelay 0 nr_reads
67473798999 nr_writes
1999151 nr_ops
67475798150
[root@localhost userspace-rcu]# ./tests/test_urcu_qsbr 20 1 20
SUMMARY ./tests/test_urcu_qsbr testdur 20 nr_readers 20 rdur 0 wdur 0 nr_writers 1 wdelay 0 nr_reads
67065521397 nr_writes
1993956 nr_ops
67067515353
Signed-off-by: Lai Jiangshan <laijs@cn.fujitsu.com>
Signed-off-by: Mathieu Desnoyers <mathieu.desnoyers@efficios.com>
void __attribute__((destructor)) rcu_exit(void);
static pthread_mutex_t rcu_gp_lock = PTHREAD_MUTEX_INITIALIZER;
void __attribute__((destructor)) rcu_exit(void);
static pthread_mutex_t rcu_gp_lock = PTHREAD_MUTEX_INITIALIZER;
-
-int32_t rcu_gp_futex;
-
-/*
- * Global grace period counter.
- */
-unsigned long rcu_gp_ctr = RCU_GP_ONLINE;
+struct urcu_gp rcu_gp = { .ctr = RCU_GP_ONLINE };
/*
* Active attempts to check for reader Q.S. before calling futex().
/*
* Active attempts to check for reader Q.S. before calling futex().
{
/* Read reader_gp before read futex */
cmm_smp_rmb();
{
/* Read reader_gp before read futex */
cmm_smp_rmb();
- if (uatomic_read(&rcu_gp_futex) == -1)
- futex_noasync(&rcu_gp_futex, FUTEX_WAIT, -1,
+ if (uatomic_read(&rcu_gp.futex) == -1)
+ futex_noasync(&rcu_gp.futex, FUTEX_WAIT, -1,
/*
* Wait for each thread URCU_TLS(rcu_reader).ctr to either
* indicate quiescence (offline), or for them to observe the
/*
* Wait for each thread URCU_TLS(rcu_reader).ctr to either
* indicate quiescence (offline), or for them to observe the
- * current rcu_gp_ctr value.
+ * current rcu_gp.ctr value.
*/
for (;;) {
wait_loops++;
if (wait_loops >= RCU_QS_ACTIVE_ATTEMPTS) {
*/
for (;;) {
wait_loops++;
if (wait_loops >= RCU_QS_ACTIVE_ATTEMPTS) {
- uatomic_set(&rcu_gp_futex, -1);
+ uatomic_set(&rcu_gp.futex, -1);
/*
* Write futex before write waiting (the other side
* reads them in the opposite order).
/*
* Write futex before write waiting (the other side
* reads them in the opposite order).
if (wait_loops >= RCU_QS_ACTIVE_ATTEMPTS) {
/* Read reader_gp before write futex */
cmm_smp_mb();
if (wait_loops >= RCU_QS_ACTIVE_ATTEMPTS) {
/* Read reader_gp before write futex */
cmm_smp_mb();
- uatomic_set(&rcu_gp_futex, 0);
+ uatomic_set(&rcu_gp.futex, 0);
/*
* Must finish waiting for quiescent state for original parity
/*
* Must finish waiting for quiescent state for original parity
- * before committing next rcu_gp_ctr update to memory. Failure
+ * before committing next rcu_gp.ctr update to memory. Failure
* to do so could result in the writer waiting forever while new
* readers are always accessing data (no progress). Enforce
* compiler-order of load URCU_TLS(rcu_reader).ctr before store
* to do so could result in the writer waiting forever while new
* readers are always accessing data (no progress). Enforce
* compiler-order of load URCU_TLS(rcu_reader).ctr before store
cmm_smp_mb();
/* Switch parity: 0 -> 1, 1 -> 0 */
cmm_smp_mb();
/* Switch parity: 0 -> 1, 1 -> 0 */
- CMM_STORE_SHARED(rcu_gp_ctr, rcu_gp_ctr ^ RCU_GP_CTR);
+ CMM_STORE_SHARED(rcu_gp.ctr, rcu_gp.ctr ^ RCU_GP_CTR);
- * Must commit rcu_gp_ctr update to memory before waiting for
+ * Must commit rcu_gp.ctr update to memory before waiting for
* quiescent state. Failure to do so could result in the writer
* waiting forever while new readers are always accessing data
* quiescent state. Failure to do so could result in the writer
* waiting forever while new readers are always accessing data
- * (no progress). Enforce compiler-order of store to rcu_gp_ctr
+ * (no progress). Enforce compiler-order of store to rcu_gp.ctr
* before load URCU_TLS(rcu_reader).ctr.
*/
cmm_barrier();
* before load URCU_TLS(rcu_reader).ctr.
*/
cmm_barrier();
goto out;
/* Increment current G.P. */
goto out;
/* Increment current G.P. */
- CMM_STORE_SHARED(rcu_gp_ctr, rcu_gp_ctr + RCU_GP_CTR);
+ CMM_STORE_SHARED(rcu_gp.ctr, rcu_gp.ctr + RCU_GP_CTR);
- * Must commit rcu_gp_ctr update to memory before waiting for
+ * Must commit rcu_gp.ctr update to memory before waiting for
* quiescent state. Failure to do so could result in the writer
* waiting forever while new readers are always accessing data
* quiescent state. Failure to do so could result in the writer
* waiting forever while new readers are always accessing data
- * (no progress). Enforce compiler-order of store to rcu_gp_ctr
+ * (no progress). Enforce compiler-order of store to rcu_gp.ctr
* before load URCU_TLS(rcu_reader).ctr.
*/
cmm_barrier();
* before load URCU_TLS(rcu_reader).ctr.
*/
cmm_barrier();
#define RCU_GP_ONLINE (1UL << 0)
#define RCU_GP_CTR (1UL << 1)
#define RCU_GP_ONLINE (1UL << 0)
#define RCU_GP_CTR (1UL << 1)
-/*
- * Global quiescent period counter with low-order bits unused.
- * Using a int rather than a char to eliminate false register dependencies
- * causing stalls on some architectures.
- */
-extern unsigned long rcu_gp_ctr;
+struct urcu_gp {
+ /*
+ * Global quiescent period counter with low-order bits unused.
+ * Using a int rather than a char to eliminate false register
+ * dependencies causing stalls on some architectures.
+ */
+ unsigned long ctr;
+
+ int32_t futex;
+} __attribute__((aligned(CAA_CACHE_LINE_SIZE)));
+
+extern struct urcu_gp rcu_gp;
struct rcu_reader {
/* Data used by both reader and synchronize_rcu() */
struct rcu_reader {
/* Data used by both reader and synchronize_rcu() */
extern DECLARE_URCU_TLS(struct rcu_reader, rcu_reader);
extern DECLARE_URCU_TLS(struct rcu_reader, rcu_reader);
-extern int32_t rcu_gp_futex;
-
/*
* Wake-up waiting synchronize_rcu(). Called from many concurrent threads.
*/
/*
* Wake-up waiting synchronize_rcu(). Called from many concurrent threads.
*/
if (caa_unlikely(_CMM_LOAD_SHARED(URCU_TLS(rcu_reader).waiting))) {
_CMM_STORE_SHARED(URCU_TLS(rcu_reader).waiting, 0);
cmm_smp_mb();
if (caa_unlikely(_CMM_LOAD_SHARED(URCU_TLS(rcu_reader).waiting))) {
_CMM_STORE_SHARED(URCU_TLS(rcu_reader).waiting, 0);
cmm_smp_mb();
- if (uatomic_read(&rcu_gp_futex) != -1)
+ if (uatomic_read(&rcu_gp.futex) != -1)
- uatomic_set(&rcu_gp_futex, 0);
- futex_noasync(&rcu_gp_futex, FUTEX_WAKE, 1,
+ uatomic_set(&rcu_gp.futex, 0);
+ futex_noasync(&rcu_gp.futex, FUTEX_WAKE, 1,
v = CMM_LOAD_SHARED(*ctr);
if (!v)
return RCU_READER_INACTIVE;
v = CMM_LOAD_SHARED(*ctr);
if (!v)
return RCU_READER_INACTIVE;
return RCU_READER_ACTIVE_CURRENT;
return RCU_READER_ACTIVE_OLD;
}
return RCU_READER_ACTIVE_CURRENT;
return RCU_READER_ACTIVE_OLD;
}
* to be invoked directly from non-LGPL code.
*
* We skip the memory barriers and gp store if our local ctr already
* to be invoked directly from non-LGPL code.
*
* We skip the memory barriers and gp store if our local ctr already
- * matches the global rcu_gp_ctr value: this is OK because a prior
+ * matches the global rcu_gp.ctr value: this is OK because a prior
* _rcu_quiescent_state() or _rcu_thread_online() already updated it
* within our thread, so we have no quiescent state to report.
*/
* _rcu_quiescent_state() or _rcu_thread_online() already updated it
* within our thread, so we have no quiescent state to report.
*/
- if ((gp_ctr = CMM_LOAD_SHARED(rcu_gp_ctr)) == URCU_TLS(rcu_reader).ctr)
+ if ((gp_ctr = CMM_LOAD_SHARED(rcu_gp.ctr)) == URCU_TLS(rcu_reader).ctr)
return;
_rcu_quiescent_state_update_and_wakeup(gp_ctr);
}
return;
_rcu_quiescent_state_update_and_wakeup(gp_ctr);
}
static inline void _rcu_thread_online(void)
{
cmm_barrier(); /* Ensure the compiler does not reorder us with mutex */
static inline void _rcu_thread_online(void)
{
cmm_barrier(); /* Ensure the compiler does not reorder us with mutex */
- _CMM_STORE_SHARED(URCU_TLS(rcu_reader).ctr, CMM_LOAD_SHARED(rcu_gp_ctr));
+ _CMM_STORE_SHARED(URCU_TLS(rcu_reader).ctr, CMM_LOAD_SHARED(rcu_gp.ctr));