 * output exchanged. Therefore, i post-dominating j ensures that every path
 * passing through j will pass through i before reaching the output.
*
+ * Prefetch and speculative execution
+ *
+ * If an instruction depends on the result of a previous branch, but it does
+ * not have side-effects, it can be executed speculatively, before the branch
+ * result is known. However, it must be restarted if a core-synchronizing
+ * instruction is issued. Note that instructions which have side-effects and
+ * depend on the result of a speculative instruction must depend on the
+ * branch completion in addition to the speculatively executed instruction.
+ *
* Other considerations
*
 * Note about the "volatile" keyword dependency: The compiler will order volatile
* Nested calls are not supported.
*/
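+/*
+ * Illustrative sketch only (the TOK_* token names are hypothetical): with
+ * the CONSUME_TOKENS()/PRODUCE_TOKENS() macros used throughout this model,
+ * a side-effect-free read may be speculated as soon as its input token is
+ * available, while a dependent write with globally observable side-effects
+ * must additionally consume the branch-completion token:
+ *
+ *	:: CONSUME_TOKENS(proc, TOK_INPUT, TOK_SPEC_READ) ->
+ *		tmp = READ_CACHED_VAR(x);	(no side-effect: may speculate)
+ *		PRODUCE_TOKENS(proc, TOK_SPEC_READ);
+ *	:: CONSUME_TOKENS(proc, TOK_SPEC_READ | TOK_BRANCH_DONE, TOK_WRITE) ->
+ *		WRITE_CACHED_VAR(y, tmp);	(side-effect: waits for branch)
+ *		PRODUCE_TOKENS(proc, TOK_WRITE);
+ */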
+/*
+ * Only Alpha has out-of-order cache bank loads. Other architectures (Intel,
+ * PowerPC, ARM) ensure that dependent reads won't be reordered; cf.
+ * http://www.linuxjournal.com/article/8212
+ */
+#ifdef ARCH_ALPHA
+#define HAVE_OOO_CACHE_READ
+#endif
+
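+/*
+ * Sketch of how this flag is consumed (mirroring the reader code further
+ * down): when out-of-order cache reads are possible, dependent reads are
+ * modeled with a randomized cache fetch; otherwise an explicit read
+ * barrier suffices:
+ *
+ *	#ifdef HAVE_OOO_CACHE_READ
+ *		RANDOM_CACHE_READ_FROM_MEM(urcu_gp_ctr, get_pid());
+ *	#else
+ *		smp_rmb(i);
+ *	#endif
+ */
+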
/*
 * Each process has its own data in cache. Caches are randomly updated.
 * smp_wmb and smp_rmb force cache updates (write and read); smp_mb forces
};
#define DECLARE_CACHED_VAR(type, x) \
- type mem_##x; \
- per_proc_##type cached_##x; \
- per_proc_bitfield cache_dirty_##x;
-
-#define INIT_CACHED_VAR(x, v, j) \
- mem_##x = v; \
- cache_dirty_##x.bitfield = 0; \
- j = 0; \
- do \
- :: j < NR_PROCS -> \
- cached_##x.val[j] = v; \
- j++ \
- :: j >= NR_PROCS -> break \
- od;
+ type mem_##x;
+
+#define DECLARE_PROC_CACHED_VAR(type, x)\
+ type cached_##x; \
+ bit cache_dirty_##x;
+
+#define INIT_CACHED_VAR(x, v) \
+ mem_##x = v;
+
+#define INIT_PROC_CACHED_VAR(x, v) \
+ cache_dirty_##x = 0; \
+ cached_##x = v;
-#define IS_CACHE_DIRTY(x, id) (cache_dirty_##x.bitfield & (1 << id))
+#define IS_CACHE_DIRTY(x, id) (cache_dirty_##x)
-#define READ_CACHED_VAR(x) (cached_##x.val[get_pid()])
+#define READ_CACHED_VAR(x) (cached_##x)
#define WRITE_CACHED_VAR(x, v) \
atomic { \
- cached_##x.val[get_pid()] = v; \
- cache_dirty_##x.bitfield = \
- cache_dirty_##x.bitfield | (1 << get_pid()); \
+ cached_##x = v; \
+ cache_dirty_##x = 1; \
}
#define CACHE_WRITE_TO_MEM(x, id) \
if \
:: IS_CACHE_DIRTY(x, id) -> \
- mem_##x = cached_##x.val[id]; \
- cache_dirty_##x.bitfield = \
- cache_dirty_##x.bitfield & (~(1 << id)); \
+ mem_##x = cached_##x; \
+ cache_dirty_##x = 0; \
:: else -> \
skip \
fi;
#define CACHE_READ_FROM_MEM(x, id) \
if \
:: !IS_CACHE_DIRTY(x, id) -> \
- cached_##x.val[id] = mem_##x;\
+ cached_##x = mem_##x; \
:: else -> \
skip \
fi;
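+/*
+ * Illustrative usage sketch only (hypothetical variable "foo"): the shared
+ * memory copy is declared once globally, the cached copy once per process,
+ * and data moves between them explicitly:
+ *
+ *	DECLARE_CACHED_VAR(byte, foo);		// global: mem_foo
+ *
+ *	// inside each proctype:
+ *	DECLARE_PROC_CACHED_VAR(byte, foo);	// cached_foo + dirty bit
+ *	INIT_PROC_CACHED_VAR(foo, 0);
+ *	WRITE_CACHED_VAR(foo, 1);		// sets cache_dirty_foo
+ *	CACHE_WRITE_TO_MEM(foo, get_pid());	// flush to mem_foo if dirty
+ *	CACHE_READ_FROM_MEM(foo, get_pid());	// refresh only if not dirty
+ */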
fi;
/* Must consume all prior read tokens. All subsequent reads depend on it. */
-inline smp_rmb(i, j)
+inline smp_rmb(i)
{
atomic {
CACHE_READ_FROM_MEM(urcu_gp_ctr, get_pid());
}
/* Must consume all prior write tokens. All subsequent writes depend on it. */
-inline smp_wmb(i, j)
+inline smp_wmb(i)
{
atomic {
CACHE_WRITE_TO_MEM(urcu_gp_ctr, get_pid());
/* Synchronization point. Must consume all prior read and write tokens. All
* subsequent reads and writes depend on it. */
-inline smp_mb(i, j)
+inline smp_mb(i)
{
atomic {
- smp_wmb(i, j);
- smp_rmb(i, j);
+ smp_wmb(i);
+ smp_rmb(i);
}
}
{
do
:: (reader_barrier[get_readerid()] == 1) ->
- smp_mb(i, j);
+	/*
+	 * We choose to ignore cycles in which the writer busy-loops
+	 * waiting for the reader, repeatedly sending barrier requests
+	 * that the reader keeps servicing without otherwise making
+	 * progress.
+	 */
+progress_ignoring_mb1:
+ smp_mb(i);
reader_barrier[get_readerid()] = 0;
:: 1 ->
- /* We choose to ignore writer's non-progress caused from the
- * reader ignoring the writer's mb() requests */
-#ifdef WRITER_PROGRESS
-progress_writer_from_reader:
-#endif
+	/*
+	 * We choose to ignore the writer's non-progress caused by the
+	 * reader ignoring the writer's mb() requests.
+	 */
+progress_ignoring_mb2:
break;
od;
}
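+/*
+ * Sketch of the handshake modeled above: the writer raises
+ * reader_barrier[id] and busy-waits for it to be cleared (see smp_mb_send
+ * below), while the reader either services the request by issuing smp_mb()
+ * and clearing the flag, or ignores it entirely (the ":: 1 -> break"
+ * branch), modeling a reader that never reaches a barrier-servicing point.
+ */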
-#ifdef WRITER_PROGRESS
-//#define PROGRESS_LABEL(progressid) progress_writer_progid_##progressid:
-#define PROGRESS_LABEL(progressid)
-#else
-#define PROGRESS_LABEL(progressid)
-#endif
+#define PROGRESS_LABEL(progressid) progress_writer_progid_##progressid:
#define smp_mb_send(i, j, progressid) \
{ \
- smp_mb(i, j); \
+ smp_mb(i); \
i = 0; \
do \
:: i < NR_READERS -> \
* interest, given the reader has the ability to totally ignore \
* barrier requests. \
*/ \
-PROGRESS_LABEL(progressid) \
do \
- :: (reader_barrier[i] == 1) -> skip; \
+ :: (reader_barrier[i] == 1) -> \
+PROGRESS_LABEL(progressid) \
+ skip; \
:: (reader_barrier[i] == 0) -> break; \
od; \
i++; \
:: i >= NR_READERS -> \
break \
od; \
- smp_mb(i, j); \
+ smp_mb(i); \
}
#else
-#define smp_mb_send(i, j, progressid) smp_mb(i, j)
-#define smp_mb_reader smp_mb
+#define smp_mb_send(i, j, progressid) smp_mb(i)
+#define smp_mb_reader(i, j) smp_mb(i)
#define smp_mb_recv(i, j)
#endif
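+/*
+ * Design note: in the non-instrumented configuration above, the sender-side
+ * barrier degenerates to a plain smp_mb() and the receiver side to a no-op,
+ * modeling a scheme where the writer's barriers do not require any
+ * cooperation from the readers.
+ */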
i++
:: i >= SLAB_SIZE -> break
od;
+#ifdef HAVE_OOO_CACHE_READ
RANDOM_CACHE_READ_FROM_MEM(urcu_gp_ctr, get_pid());
i = 0;
do
i++
:: i >= SLAB_SIZE -> break
od;
+#else
+ smp_rmb(i);
+#endif /* HAVE_OOO_CACHE_READ */
}
}
#define READ_PROD_B_IF_FALSE (1 << 2)
#define READ_PROD_C_IF_TRUE_READ (1 << 3)
-#define PROCEDURE_READ_LOCK(base, consumetoken, producetoken) \
- :: CONSUME_TOKENS(proc_urcu_reader, consumetoken, READ_PROD_A_READ << base) -> \
+#define PROCEDURE_READ_LOCK(base, consumetoken, consumetoken2, producetoken) \
+ :: CONSUME_TOKENS(proc_urcu_reader, (consumetoken | consumetoken2), READ_PROD_A_READ << base) -> \
ooo_mem(i); \
tmp = READ_CACHED_VAR(urcu_active_readers[get_readerid()]); \
PRODUCE_TOKENS(proc_urcu_reader, READ_PROD_A_READ << base); \
PRODUCE_TOKENS(proc_urcu_reader, READ_PROD_B_IF_FALSE << base); \
fi; \
/* IF TRUE */ \
- :: CONSUME_TOKENS(proc_urcu_reader, READ_PROD_B_IF_TRUE << base, \
+ :: CONSUME_TOKENS(proc_urcu_reader, consumetoken, /* prefetch */ \
READ_PROD_C_IF_TRUE_READ << base) -> \
ooo_mem(i); \
tmp2 = READ_CACHED_VAR(urcu_gp_ctr); \
PRODUCE_TOKENS(proc_urcu_reader, READ_PROD_C_IF_TRUE_READ << base); \
:: CONSUME_TOKENS(proc_urcu_reader, \
- (READ_PROD_C_IF_TRUE_READ /* pre-dominant */ \
+ (READ_PROD_B_IF_TRUE \
+ | READ_PROD_C_IF_TRUE_READ /* pre-dominant */ \
| READ_PROD_A_READ) << base, /* WAR */ \
producetoken) -> \
ooo_mem(i); \
consumetoken, \
READ_PROC_READ_UNLOCK << base) -> \
ooo_mem(i); \
- tmp2 = READ_CACHED_VAR(urcu_active_readers[get_readerid()]); \
+ tmp = READ_CACHED_VAR(urcu_active_readers[get_readerid()]); \
PRODUCE_TOKENS(proc_urcu_reader, READ_PROC_READ_UNLOCK << base); \
:: CONSUME_TOKENS(proc_urcu_reader, \
consumetoken \
| (READ_PROC_READ_UNLOCK << base), /* WAR */ \
producetoken) -> \
ooo_mem(i); \
- WRITE_CACHED_VAR(urcu_active_readers[get_readerid()], tmp2 - 1); \
+ WRITE_CACHED_VAR(urcu_active_readers[get_readerid()], tmp - 1); \
PRODUCE_TOKENS(proc_urcu_reader, producetoken); \
skip
skip;
fi;
}
- :: 1 -> skip;
fi;
goto non_atomic3_skip;
atomic {
if
- PROCEDURE_READ_LOCK(READ_LOCK_BASE, READ_PROD_NONE, READ_LOCK_OUT);
+ PROCEDURE_READ_LOCK(READ_LOCK_BASE, READ_PROD_NONE, 0, READ_LOCK_OUT);
:: CONSUME_TOKENS(proc_urcu_reader,
READ_LOCK_OUT, /* post-dominant */
smp_mb_reader(i, j);
PRODUCE_TOKENS(proc_urcu_reader, READ_PROC_FIRST_MB);
- PROCEDURE_READ_LOCK(READ_LOCK_NESTED_BASE, READ_PROC_FIRST_MB | READ_LOCK_OUT,
+ PROCEDURE_READ_LOCK(READ_LOCK_NESTED_BASE, READ_PROC_FIRST_MB, READ_LOCK_OUT,
READ_LOCK_NESTED_OUT);
:: CONSUME_TOKENS(proc_urcu_reader,
/* reading urcu_active_readers, which have been written by
* READ_UNLOCK_OUT : RAW */
PROCEDURE_READ_LOCK(READ_LOCK_UNROLL_BASE,
- READ_UNLOCK_OUT /* RAW */
- | READ_PROC_SECOND_MB /* mb() orders reads */
- | READ_PROC_FIRST_MB /* mb() orders reads */
- | READ_LOCK_NESTED_OUT /* RAW */
+ READ_PROC_SECOND_MB /* mb() orders reads */
+ | READ_PROC_FIRST_MB, /* mb() orders reads */
+ READ_LOCK_NESTED_OUT /* RAW */
| READ_LOCK_OUT /* RAW */
- | READ_UNLOCK_NESTED_OUT, /* RAW */
+ | READ_UNLOCK_NESTED_OUT /* RAW */
+ | READ_UNLOCK_OUT, /* RAW */
READ_LOCK_OUT_UNROLL);
goto end;
rmb1:
#ifndef NO_RMB
- smp_rmb(i, j);
+ smp_rmb(i);
#else
ooo_mem(i);
#endif
goto rmb1_end;
rmb2:
#ifndef NO_RMB
- smp_rmb(i, j);
+ smp_rmb(i);
#else
ooo_mem(i);
#endif
byte i, j, nest_i;
byte tmp, tmp2;
+ /* Keep in sync manually with smp_rmb, smp_wmb, ooo_mem and init() */
+ DECLARE_PROC_CACHED_VAR(byte, urcu_gp_ctr);
+	/* Note! Currently only one reader. */
+ DECLARE_PROC_CACHED_VAR(byte, urcu_active_readers[NR_READERS]);
+ /* RCU data */
+ DECLARE_PROC_CACHED_VAR(bit, rcu_data[SLAB_SIZE]);
+
+	/* RCU pointer: a single bit suffices when the slab has only 2 entries */
+#if (SLAB_SIZE == 2)
+ DECLARE_PROC_CACHED_VAR(bit, rcu_ptr);
+#else
+ DECLARE_PROC_CACHED_VAR(byte, rcu_ptr);
+#endif
+
+ atomic {
+ INIT_PROC_CACHED_VAR(urcu_gp_ctr, 1);
+ INIT_PROC_CACHED_VAR(rcu_ptr, 0);
+
+ i = 0;
+ do
+ :: i < NR_READERS ->
+ INIT_PROC_CACHED_VAR(urcu_active_readers[i], 0);
+ i++;
+ :: i >= NR_READERS -> break
+ od;
+ INIT_PROC_CACHED_VAR(rcu_data[0], WINE);
+ i = 1;
+ do
+ :: i < SLAB_SIZE ->
+ INIT_PROC_CACHED_VAR(rcu_data[i], POISON);
+ i++
+ :: i >= SLAB_SIZE -> break
+ od;
+ }
+
wait_init_done();
assert(get_pid() < NR_PROCS);
* GP update. Needed to test single flip case.
*/
+ /* Keep in sync manually with smp_rmb, smp_wmb, ooo_mem and init() */
+ DECLARE_PROC_CACHED_VAR(byte, urcu_gp_ctr);
+	/* Note! Currently only one reader. */
+ DECLARE_PROC_CACHED_VAR(byte, urcu_active_readers[NR_READERS]);
+ /* RCU data */
+ DECLARE_PROC_CACHED_VAR(bit, rcu_data[SLAB_SIZE]);
+
+	/* RCU pointer: a single bit suffices when the slab has only 2 entries */
+#if (SLAB_SIZE == 2)
+ DECLARE_PROC_CACHED_VAR(bit, rcu_ptr);
+#else
+ DECLARE_PROC_CACHED_VAR(byte, rcu_ptr);
+#endif
+
+ atomic {
+ INIT_PROC_CACHED_VAR(urcu_gp_ctr, 1);
+ INIT_PROC_CACHED_VAR(rcu_ptr, 0);
+
+ i = 0;
+ do
+ :: i < NR_READERS ->
+ INIT_PROC_CACHED_VAR(urcu_active_readers[i], 0);
+ i++;
+ :: i >= NR_READERS -> break
+ od;
+ INIT_PROC_CACHED_VAR(rcu_data[0], WINE);
+ i = 1;
+ do
+ :: i < SLAB_SIZE ->
+ INIT_PROC_CACHED_VAR(rcu_data[i], POISON);
+ i++
+ :: i >= SLAB_SIZE -> break
+ od;
+ }
+
wait_init_done();
assert(get_pid() < NR_PROCS);
do
- :: (loop_nr < 4) ->
+ :: (loop_nr < 3) ->
#ifdef WRITER_PROGRESS
progress_writer1:
#endif
:: CONSUME_TOKENS(proc_urcu_writer,
WRITE_DATA,
WRITE_PROC_WMB) ->
- smp_wmb(i, j);
+ smp_wmb(i);
PRODUCE_TOKENS(proc_urcu_writer, WRITE_PROC_WMB);
:: CONSUME_TOKENS(proc_urcu_writer,
PRODUCE_TOKENS(proc_urcu_writer, WRITE_PROC_FIRST_WRITE_GP);
:: CONSUME_TOKENS(proc_urcu_writer,
- //WRITE_PROC_FIRST_WRITE_GP /* TEST ADDING SYNC CORE */
+ //WRITE_PROC_FIRST_WRITE_GP | /* TEST ADDING SYNC CORE */
WRITE_PROC_FIRST_MB, /* can be reordered before/after flips */
WRITE_PROC_FIRST_WAIT | WRITE_PROC_FIRST_WAIT_LOOP) ->
ooo_mem(i);
+ //smp_mb(i); /* TEST */
/* ONLY WAITING FOR READER 0 */
tmp2 = READ_CACHED_VAR(urcu_active_readers[0]);
#ifndef SINGLE_FLIP
#ifndef GEN_ERROR_WRITER_PROGRESS
goto smp_mb_send2;
smp_mb_send2_end:
+			/* The memory barrier will invalidate the
+			 * second read, which was performed as a
+			 * prefetch. Note that all instructions with
+			 * side-effects depending on
+			 * WRITE_PROC_SECOND_READ_GP should also depend
+			 * on completion of this busy-waiting loop. */
+ CLEAR_TOKENS(proc_urcu_writer, WRITE_PROC_SECOND_READ_GP);
#else
ooo_mem(i);
#endif
/* second flip */
:: CONSUME_TOKENS(proc_urcu_writer,
- WRITE_PROC_FIRST_WAIT /* Control dependency : need to branch out of
- * the loop to execute the next flip (CHECK) */
- | WRITE_PROC_FIRST_WRITE_GP
+ //WRITE_PROC_FIRST_WAIT | //test /* no dependency. Could pre-fetch, no side-effect. */
+ WRITE_PROC_FIRST_WRITE_GP
| WRITE_PROC_FIRST_READ_GP
| WRITE_PROC_FIRST_MB,
WRITE_PROC_SECOND_READ_GP) ->
ooo_mem(i);
+ //smp_mb(i); /* TEST */
tmpa = READ_CACHED_VAR(urcu_gp_ctr);
PRODUCE_TOKENS(proc_urcu_writer, WRITE_PROC_SECOND_READ_GP);
:: CONSUME_TOKENS(proc_urcu_writer,
- WRITE_PROC_FIRST_MB
+ WRITE_PROC_FIRST_WAIT /* dependency on first wait, because this
+ * instruction has globally observable
+ * side-effects.
+ */
+ | WRITE_PROC_FIRST_MB
| WRITE_PROC_WMB
| WRITE_PROC_FIRST_READ_GP
| WRITE_PROC_FIRST_WRITE_GP
PRODUCE_TOKENS(proc_urcu_writer, WRITE_PROC_SECOND_WRITE_GP);
:: CONSUME_TOKENS(proc_urcu_writer,
- //WRITE_PROC_FIRST_WRITE_GP /* TEST ADDING SYNC CORE */
+ //WRITE_PROC_FIRST_WRITE_GP | /* TEST ADDING SYNC CORE */
WRITE_PROC_FIRST_WAIT
| WRITE_PROC_FIRST_MB, /* can be reordered before/after flips */
WRITE_PROC_SECOND_WAIT | WRITE_PROC_SECOND_WAIT_LOOP) ->
ooo_mem(i);
+ //smp_mb(i); /* TEST */
/* ONLY WAITING FOR READER 0 */
tmp2 = READ_CACHED_VAR(urcu_active_readers[0]);
if
fi;
:: CONSUME_TOKENS(proc_urcu_writer,
- //WRITE_PROC_FIRST_WRITE_GP /* TEST ADDING SYNC CORE */
+ //WRITE_PROC_FIRST_WRITE_GP | /* TEST ADDING SYNC CORE */
WRITE_PROC_SECOND_WRITE_GP
| WRITE_PROC_FIRST_WRITE_GP
| WRITE_PROC_SECOND_READ_GP
:: 1 ->
#ifdef WRITER_PROGRESS
progress_writer2:
+#endif
+#ifdef READER_PROGRESS
+ /*
+ * Make sure we don't block the reader's progress.
+ */
+ smp_mb_send(i, j, 5);
#endif
skip;
od;
byte i, j;
atomic {
- INIT_CACHED_VAR(urcu_gp_ctr, 1, j);
- INIT_CACHED_VAR(rcu_ptr, 0, j);
+ INIT_CACHED_VAR(urcu_gp_ctr, 1);
+ INIT_CACHED_VAR(rcu_ptr, 0);
i = 0;
do
:: i < NR_READERS ->
- INIT_CACHED_VAR(urcu_active_readers[i], 0, j);
+ INIT_CACHED_VAR(urcu_active_readers[i], 0);
ptr_read_first[i] = 1;
ptr_read_second[i] = 1;
data_read_first[i] = WINE;
i++;
:: i >= NR_READERS -> break
od;
- INIT_CACHED_VAR(rcu_data[0], WINE, j);
+ INIT_CACHED_VAR(rcu_data[0], WINE);
i = 1;
do
:: i < SLAB_SIZE ->
- INIT_CACHED_VAR(rcu_data[i], POISON, j);
+ INIT_CACHED_VAR(rcu_data[i], POISON);
i++
:: i >= SLAB_SIZE -> break
od;