#include <assert.h>
#include <string.h>
#include <unistd.h>
+#include <errno.h>
#include <urcu/rcurbtree.h>
#include <urcu-pointer.h>
#include <urcu-call-rcu.h>
#include <urcu/compiler.h>
+/*
+ * Explanation of next/prev walk coherency and search coherency when
+ * performed concurrently with updates.
+ *
+ * next/prev walk coherency with respect to concurrent updates:
+ *
+ * We need to model and validate 3 scenarios:
+ * rotation, transplant and "teleportation" (the latter being a remote
+ * transplant performed in the non-nil removal case).
+ *
+ * - rotation left (right is symmetric)
+ * xl and yr point to the same parent nodes before/after left
+ * rotation. yll and ylr also point to the same parent node
+ * before/after left rotation.
+ * As we are copying x, y and yl (the 3 nodes whose parent/child
+ * relationships are changed) to a "new" version of this node cluster,
+ * all external references to the cluster either point to the old
+ * cluster or the new one. If we take this cluster as a "black box"
+ * from the point of view of next/prev traversal, all we have to
+ * ensure is that the old and the new cluster behave in the exact same
+ * way with respect to traversal order.
+ *
+ * - transplant
+ * In this operation, we transplant a copy of "v" into its parent
+ * location (u), thus replacing it. The children of "v", vl and vr,
+ * still point to v (new version) after the transplant, so it does not
+ * change the behavior of next/prev traversal. Copying "v" to a new
+ * version ensures that the parent pointer of v points to its new
+ * parent (the parent of u) before it is published to readers (by
+ * setting the child pointer of u's parent to the new copy of v).
+ *
+ * - teleportation
+ * This one is probably the trickiest, and requires some ASCII art to
+ * explain.
+ *
+ * We want to remove z from this tree:
+ *
+ * zp
+ * \
+ * z
+ * / \
+ * zl zr
+ * /
+ * a
+ * / \
+ * b ar
+ * / \
+ * y br
+ * \
+ * yr
+ * / \
+ * yrl yrr
+ *
+ * What we are going to do is to "teleport" y into z's location,
+ * reparenting yr to b. We are taking care to create a new cluster
+ * copy that is isolated from any reader. We will represent the new
+ * members of the cluster with capital letters.
+ *
+ * zp
+ * \
+ * Y
+ * / \
+ * zl ZR
+ * /
+ * A
+ * / \
+ * B ar
+ * / \
+ * YR br
+ * / \
+ * yrl yrr
+ *
+ * In this transient state, we notice that the pointers within the
+ * cluster all point to the new cluster nodes, and they point to the
+ * correct external nodes. However, no external pointer points to the
+ * cluster (yet). The first pointer to point to this cluster will be
+ * "zp->right". It will therefore make the cluster visible for search.
+ *
+ * In this intermediate state, we can walk through the new cluster
+ * when coming from the top (in a next/prev traversal), but may come
+ * back to the old cluster when going back up from the children nodes.
+ * All we have to ensure is that the two clusters, taken as a black
+ * box from a next/prev traversal perspective, yield exactly the same
+ * result.
+ *
+ * Search coherency with concurrent updates:
+ *
+ * Simple "search" (only going down the tree) is also handled by this
+ * cluster scheme. The explanation is a subset of the prev/next
+ * explanation: we don't have to care about the intermediate stages
+ * where the children point to the old cluster, because we only ever
+ * use the top-level pointers to go down into the children nodes; we
+ * never go back up. So by simply making sure that all the cluster's
+ * internal node pointers are set up correctly before making the
+ * cluster visible to readers (by setting the parent's child pointer
+ * to the topmost new node in the cluster), we are sure that readers
+ * will see a coherent view of the cluster at all times.
+ */
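+
+/*
+ * Reader-side sketch (illustration only): the coherency argument above
+ * assumes readers hold the RCU read-side lock across a search and any
+ * following next/prev calls. Exact prototypes may differ; see the public
+ * header for the actual API.
+ *
+ *	rcu_read_lock();
+ *	node = rcu_rbtree_search(rbtree, rbtree->root, key);
+ *	if (!rcu_rbtree_is_nil(rbtree, node))
+ *		node = rcu_rbtree_next(rbtree, node);
+ *	rcu_read_unlock();
+ */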
+
#ifdef DEBUG
#define dbg_printf(args...) printf(args)
#define dbg_usleep(usecs) usleep(usecs)
/*
* Undefine this to enable the non-RCU rotate and transplant functions
- * (for debugging).
+ * (for debugging). Note that these versions don't support the tree
+ * max_end updates, so lookups must be performed with
+ * rcu_rbtree_search_begin_key when using this debug facility.
*/
#define RBTREE_RCU_SUPPORT_ROTATE_LEFT
#define RBTREE_RCU_SUPPORT_ROTATE_RIGHT
#define RBTREE_RCU_SUPPORT_TRANSPLANT
+#define RBTREE_RCU_SUPPORT
+
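+/*
+ * The c_*() and C_*() wrappers below map to the RCU accessors and memory
+ * barriers when RBTREE_RCU_SUPPORT is defined, and turn into plain accesses
+ * otherwise, so the update code can also be built without RCU protection
+ * (e.g. for debugging).
+ */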
+#ifdef RBTREE_RCU_SUPPORT
+#define c_rcu_dereference(x) rcu_dereference(x)
+#define c_cmm_smp_wmb() cmm_smp_wmb()
+#define c_cmm_smp_wmc() cmm_smp_wmc()
+#define C_CMM_STORE_SHARED(x, v) CMM_STORE_SHARED(x, v)
+#define C__CMM_STORE_SHARED(x, v) _CMM_STORE_SHARED(x, v)
+#else
+#define c_rcu_dereference(x) (x)
+#define c_cmm_smp_wmb()
+#define c_cmm_smp_wmc()
+#define C_CMM_STORE_SHARED(x, v) ((x) = (v))
+#define C__CMM_STORE_SHARED(x, v) ((x) = (v))
+#endif
-#ifdef EXTRA_DEBUG
+/*
+ * Add internal mutex locking within the RBTree, for debugging. Enable this
+ * define and add mutexes to RCU readers to debug races with rotation or
+ * transplant.
+ */
+/* #define RBTREE_INTERNAL_LOCKING */
+
+#ifdef RBTREE_INTERNAL_LOCKING
static pthread_mutex_t test_mutex = PTHREAD_MUTEX_INITIALIZER;
static pthread_mutex_t outer_mutex = PTHREAD_MUTEX_INITIALIZER;
{
pthread_mutex_unlock(&test_mutex);
}
+#else
+static
+void lock_outer_mutex(void)
+{
+}
+
+static
+void unlock_outer_mutex(void)
+{
+}
+
+static
+void lock_test_mutex(void)
+{
+}
+
+static
+void unlock_test_mutex(void)
+{
+}
#endif
static
struct rcu_rbtree_node *parent,
unsigned int pos)
{
- _CMM_STORE_SHARED(node->parent, ((unsigned long) parent) | pos);
+ C__CMM_STORE_SHARED(node->parent, ((unsigned long) parent) | pos);
}
static
struct rcu_rbtree_node *get_parent_and_pos(struct rcu_rbtree_node *node,
unsigned int *pos)
{
- unsigned long parent_pos = rcu_dereference(node->parent);
+ unsigned long parent_pos = c_rcu_dereference(node->parent);
*pos = (unsigned int) (parent_pos & 1UL);
return (struct rcu_rbtree_node *) (parent_pos & ~1UL);
return x->decay_next;
}
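+
+/*
+ * Node allocation and reclaim go through the user-provided rballoc/rbfree
+ * callbacks; _rcu_rbtree_free_node() is meant to run as a call_rcu callback
+ * after a grace period.
+ */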
+static
+struct rcu_rbtree_node *_rcu_rbtree_alloc_node(struct rcu_rbtree *rbtree)
+{
+ return rbtree->rballoc(sizeof(struct rcu_rbtree_node));
+}
+
+static
+void _rcu_rbtree_free_node(struct rcu_head *head)
+{
+ struct rcu_rbtree_node *node =
+ caa_container_of(head, struct rcu_rbtree_node, head);
+ node->rbtree->rbfree(node);
+}
+
+#ifdef RBTREE_RCU_SUPPORT
+
static
struct rcu_rbtree_node *dup_decay_node(struct rcu_rbtree *rbtree,
struct rcu_rbtree_node *x)
if (rcu_rbtree_is_nil(rbtree, x))
return x;
- xc = rbtree->rballoc();
- memcpy(xc, x, sizeof(struct rcu_rbtree_node));
+ xc = _rcu_rbtree_alloc_node(rbtree);
+ memcpy(xc, x, sizeof(*xc));
xc->decay_next = NULL;
set_decay(x, xc);
- call_rcu(&x->head, rbtree->rbfree);
+ rbtree->call_rcu(&x->head, _rcu_rbtree_free_node);
return xc;
}
+#else /* RBTREE_RCU_SUPPORT */
+
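+/*
+ * Without RCU support, no copy is made: the node is updated in place and
+ * dup_decay_node() simply returns it.
+ */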
+static
+struct rcu_rbtree_node *dup_decay_node(struct rcu_rbtree *rbtree,
+ struct rcu_rbtree_node *x)
+{
+ return x;
+}
+
+#endif
+
/*
* Info for range lookups:
* Range lookup information is only valid when used when searching for
* Deal with memory allocation errors.
* Can be ensured by reserving a pool of memory entries before doing the
* insertion, which will have to be function of number of
- * transplantations/rotations required for the operation.
+ * transplantations/rotations required for the operation (which is a
+ * multiple of the tree height).
*/
#ifdef DEBUG
struct rcu_rbtree_node *xl;
dbg_printf("searching point 0x%lx\n", (unsigned long) point);
- x = rcu_dereference(x);
+ x = c_rcu_dereference(x);
while (!rcu_rbtree_is_nil(rbtree, x)) {
dbg_usleep(10);
- xl = rcu_dereference(x->_left);
+ xl = c_rcu_dereference(x->_left);
dbg_printf("search x %lx x_end %lx x_max_end %lx\n", (unsigned long) x->begin,
(unsigned long) x->end, (unsigned long) x->max_end);
dbg_printf("search xl %lx xl_end %lx xl_max_end %lx\n", (unsigned long) xl->begin,
&& (rbtree->comp(xl->max_end, point) > 0)) {
dbg_printf("go left\n");
x = xl;
- } else if (rbtree->comp(x->begin, point) >= 0
+ } else if (rbtree->comp(x->begin, point) <= 0
&& rbtree->comp(point, x->end) < 0) {
dbg_printf("got it!\n");
break;
} else if (rbtree->comp(point, x->begin) > 0) {
dbg_printf("go right\n");
- x = rcu_dereference(x->_right);
+ x = c_rcu_dereference(x->_right);
} else {
dbg_printf("not found!\n");
x = make_nil(rbtree);
struct rcu_rbtree_node *x,
void *k)
{
- x = rcu_dereference(x);
+ x = c_rcu_dereference(x);
int comp;
while (!rcu_rbtree_is_nil(rbtree, x) && (comp = rbtree->comp(k, x->begin)) != 0) {
dbg_usleep(10);
if (comp < 0)
- x = rcu_dereference(x->_left);
+ x = c_rcu_dereference(x->_left);
else
- x = rcu_dereference(x->_right);
+ x = c_rcu_dereference(x->_right);
}
return x;
}
{
struct rcu_rbtree_node *xl;
- x = rcu_dereference(x);
+ x = c_rcu_dereference(x);
if (rcu_rbtree_is_nil(rbtree, x)) {
*zr = x;
} else
*zr = x = dup_decay_node(rbtree, x);
- while (!rcu_rbtree_is_nil(rbtree, xl = rcu_dereference(x->_left))) {
+ while (!rcu_rbtree_is_nil(rbtree, xl = c_rcu_dereference(x->_left))) {
x = dup_decay_node(rbtree, xl);
set_parent(x, get_decay(get_parent(x)), get_pos(x));
get_parent(x)->_left = get_decay(get_parent(x)->_left);
{
struct rcu_rbtree_node *xl;
- x = rcu_dereference(x);
+ x = c_rcu_dereference(x);
if (rcu_rbtree_is_nil(rbtree, x))
return x;
get_pos(x->_left));
}
- while (!rcu_rbtree_is_nil(rbtree, xl = rcu_dereference(x->_left))) {
+ while (!rcu_rbtree_is_nil(rbtree, xl = c_rcu_dereference(x->_left))) {
x = xl;
set_parent(x->_right, get_decay(get_parent(x->_right)),
get_pos(x->_right));
{
struct rcu_rbtree_node *xl;
- x = rcu_dereference(x);
+ x = c_rcu_dereference(x);
if (rcu_rbtree_is_nil(rbtree, x))
return x;
- while (!rcu_rbtree_is_nil(rbtree, xl = rcu_dereference(x->_left)))
+ while (!rcu_rbtree_is_nil(rbtree, xl = c_rcu_dereference(x->_left)))
x = xl;
return x;
}
{
struct rcu_rbtree_node *xr;
- x = rcu_dereference(x);
+ x = c_rcu_dereference(x);
if (rcu_rbtree_is_nil(rbtree, x))
return x;
- while (!rcu_rbtree_is_nil(rbtree, xr = rcu_dereference(x->_right)))
+ while (!rcu_rbtree_is_nil(rbtree, xr = c_rcu_dereference(x->_right)))
x = xr;
return x;
}
-/*
- * FIXME:
- * Updates should wait for a grace period between update of the
- * redirect pointer and update of the parent child pointer. This is to make sure
- * that no reference to the old entry exist.
- */
-
/*
* RCU read lock must be held across the next/prev calls to ensure validity of
* the returned node.
struct rcu_rbtree_node *xr, *y;
unsigned int x_pos;
- x = rcu_dereference(x);
+ x = c_rcu_dereference(x);
- if (!rcu_rbtree_is_nil(rbtree, xr = rcu_dereference(x->_right)))
+ if (!rcu_rbtree_is_nil(rbtree, xr = c_rcu_dereference(x->_right)))
return rcu_rbtree_min(rbtree, xr);
y = get_parent_and_pos(x, &x_pos);
while (!rcu_rbtree_is_nil(rbtree, y) && x_pos == IS_RIGHT) {
struct rcu_rbtree_node *xl, *y;
unsigned int x_pos;
- x = rcu_dereference(x);
+ x = c_rcu_dereference(x);
- if (!rcu_rbtree_is_nil(rbtree, xl = rcu_dereference(x->_left)))
+ if (!rcu_rbtree_is_nil(rbtree, xl = c_rcu_dereference(x->_left)))
return rcu_rbtree_max(rbtree, xl);
y = get_parent_and_pos(x, &x_pos);
while (!rcu_rbtree_is_nil(rbtree, y) && x_pos == IS_LEFT) {
node->max_end = max_end;
} else {
top = get_parent(node);
- cmm_smp_wmb(); /* write into node before publish */
+ c_cmm_smp_wmb(); /* write into node before publish */
/* make new branch visible to readers */
if (rcu_rbtree_is_nil(rbtree, top))
- _CMM_STORE_SHARED(rbtree->root, node);
+ C__CMM_STORE_SHARED(rbtree->root, node);
if (get_pos(node) == IS_LEFT)
- _CMM_STORE_SHARED(top->_left, node);
+ C__CMM_STORE_SHARED(top->_left, node);
else
- _CMM_STORE_SHARED(top->_right, node);
+ C__CMM_STORE_SHARED(top->_right, node);
goto end;
}
} while (!rcu_rbtree_is_nil(rbtree, node));
top = node; /* nil */
- cmm_smp_wmb(); /* write into node before publish */
+ c_cmm_smp_wmb(); /* write into node before publish */
/* make new branch visible to readers */
- _CMM_STORE_SHARED(rbtree->root, prev);
+ C__CMM_STORE_SHARED(rbtree->root, prev);
end:
if (!copy_parents)
x->max_end = calculate_node_max_end(rbtree, x);
y->max_end = calculate_node_max_end(rbtree, y);
- cmm_smp_wmb(); /* write into node before publish */
+ c_cmm_smp_wmb(); /* write into node before publish */
/* External references update (visible by readers) */
if (rcu_rbtree_is_nil(rbtree, get_parent(y)))
- _CMM_STORE_SHARED(rbtree->root, y);
+ C__CMM_STORE_SHARED(rbtree->root, y);
else if (get_pos(y) == IS_LEFT)
- _CMM_STORE_SHARED(get_parent(y)->_left, y);
+ C__CMM_STORE_SHARED(get_parent(y)->_left, y);
else
- _CMM_STORE_SHARED(get_parent(y)->_right, y);
+ C__CMM_STORE_SHARED(get_parent(y)->_right, y);
/* Point children to new copy (parent only used by updates/next/prev) */
set_parent(x->_left, get_decay(get_parent(x->_left)),
}
y->_left = x;
set_parent(x, y, IS_LEFT);
+
+ /*
+	 * We only changed the relative position of x and y with respect to
+	 * their children, and reparented y; the same nodes stay in place, so
+	 * y's parent does not need its max_end value recalculated.
+ */
+ x->max_end = calculate_node_max_end(rbtree, x);
+ y->max_end = calculate_node_max_end(rbtree, y);
+
unlock_test_mutex();
}
x->max_end = calculate_node_max_end(rbtree, x);
y->max_end = calculate_node_max_end(rbtree, y);
- cmm_smp_wmb(); /* write into node before publish */
+ c_cmm_smp_wmb(); /* write into node before publish */
/* External references update (visible by readers) */
if (rcu_rbtree_is_nil(rbtree, get_parent(y)))
- _CMM_STORE_SHARED(rbtree->root, y);
+ C__CMM_STORE_SHARED(rbtree->root, y);
else if (get_pos(y) == IS_RIGHT)
- _CMM_STORE_SHARED(get_parent(y)->_right, y);
+ C__CMM_STORE_SHARED(get_parent(y)->_right, y);
else
- _CMM_STORE_SHARED(get_parent(y)->_left, y);
+ C__CMM_STORE_SHARED(get_parent(y)->_left, y);
/* Point children to new copy (parent only used by updates/next/prev) */
set_parent(x->_right, get_decay(get_parent(x->_right)),
}
y->_right = x;
set_parent(x, y, IS_RIGHT);
+
+ /*
+	 * We only changed the relative position of x and y with respect to
+	 * their children, and reparented y; the same nodes stay in place, so
+	 * y's parent does not need its max_end value recalculated.
+ */
+ x->max_end = calculate_node_max_end(rbtree, x);
+ y->max_end = calculate_node_max_end(rbtree, y);
+
unlock_test_mutex();
}
* Returns 0 on success, or < 0 on error.
*/
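+/*
+ * Caller-side sketch (hypothetical usage, not taken from the test code):
+ * with this change the tree allocates the node itself, so a caller only
+ * passes the key range and checks for -ENOMEM:
+ *
+ *	if (rcu_rbtree_insert(rbtree, begin, end) < 0)
+ *		return -ENOMEM;
+ */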
int rcu_rbtree_insert(struct rcu_rbtree *rbtree,
- struct rcu_rbtree_node *z)
+ void *begin, void *end)
{
- struct rcu_rbtree_node *x, *y;
+ struct rcu_rbtree_node *x, *y, *z;
+
+ z = _rcu_rbtree_alloc_node(rbtree);
+ if (!z)
+ return -ENOMEM;
+ z->begin = begin;
+ z->end = end;
dbg_printf("insert %p\n", z->begin);
assert(!is_decay(rbtree->root));
z->color = COLOR_RED;
z->decay_next = NULL;
z->max_end = z->end;
+ z->rbtree = rbtree;
if (rcu_rbtree_is_nil(rbtree, y)) {
set_parent(z, y, IS_RIGHT); /* pos arbitrary for root node */
* Order stores to z (children/parents) before stores
* that will make it visible to the rest of the tree.
*/
- cmm_smp_wmb();
- _CMM_STORE_SHARED(rbtree->root, z);
+ c_cmm_smp_wmb();
+ C__CMM_STORE_SHARED(rbtree->root, z);
} else if (rbtree->comp(z->begin, y->begin) < 0) {
y = dup_decay_node(rbtree, y);
set_parent(z, y, IS_LEFT);
if (get_pos(z) == IS_LEFT)
- _CMM_STORE_SHARED(y->_left, z);
+ C__CMM_STORE_SHARED(y->_left, z);
else
- _CMM_STORE_SHARED(y->_right, z);
+ C__CMM_STORE_SHARED(y->_right, z);
populate_node_end(rbtree, y, 1, NULL);
} else {
y = dup_decay_node(rbtree, y);
set_parent(z, y, IS_RIGHT);
if (get_pos(z) == IS_LEFT)
- _CMM_STORE_SHARED(y->_left, z);
+ C__CMM_STORE_SHARED(y->_left, z);
else
- _CMM_STORE_SHARED(y->_right, z);
+ C__CMM_STORE_SHARED(y->_right, z);
populate_node_end(rbtree, y, 1, NULL);
}
rcu_rbtree_insert_fixup(rbtree, z);
/*
- * Make sure to commit all _CMM_STORE_SHARED() for non-coherent caches.
+ * Make sure to commit all C__CMM_STORE_SHARED() for non-coherent caches.
*/
- cmm_smp_wmc();
+ c_cmm_smp_wmc();
show_tree(rbtree);
check_max_end(rbtree, z);
check_max_end(rbtree, y);
if (rcu_rbtree_is_nil(rbtree, get_parent(u))) {
/* pos is arbitrary for root node */
set_parent(v, get_parent(u), IS_RIGHT);
- cmm_smp_wmb(); /* write into node before publish */
- _CMM_STORE_SHARED(rbtree->root, v);
+ c_cmm_smp_wmb(); /* write into node before publish */
+ C__CMM_STORE_SHARED(rbtree->root, v);
} else {
struct rcu_rbtree_node *vp;
vp = dup_decay_node(rbtree, vp);
set_parent(v, vp, get_pos(u));
if (get_pos(v) == IS_LEFT)
- _CMM_STORE_SHARED(vp->_left, v);
+ C__CMM_STORE_SHARED(vp->_left, v);
else
- _CMM_STORE_SHARED(vp->_right, v);
+ C__CMM_STORE_SHARED(vp->_right, v);
populate_node_end(rbtree, vp, copy_parents, stop);
check_max_end(rbtree, vp);
}
dbg_printf("transplant %p\n", v->begin);
lock_test_mutex();
- if (rcu_rbtree_is_nil(rbtree, get_parent(u)))
+ if (rcu_rbtree_is_nil(rbtree, get_parent(u))) {
rbtree->root = v;
- else if (u == get_parent(u)->_left)
- get_parent(u)->_left = v;
- else
- get_parent(u)->_right = v;
+ } else {
+ if (u == get_parent(u)->_left)
+ get_parent(u)->_left = v;
+ else
+ get_parent(u)->_right = v;
+ populate_node_end(rbtree, get_parent(u), copy_parents, stop);
+ }
set_parent(v, get_parent(u), get_pos(u));
unlock_test_mutex();
}
check_max_end(rbtree, x);
check_max_end(rbtree, get_decay(y));
/*
- * Commit all _CMM_STORE_SHARED().
+ * Commit all C__CMM_STORE_SHARED().
*/
- cmm_smp_wmc();
+ c_cmm_smp_wmc();
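+	/*
+	 * Reclaim the removed node's memory: deferred until after a grace
+	 * period when RCU support is enabled, freed immediately otherwise.
+	 */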
+#ifdef RBTREE_RCU_SUPPORT
+ rbtree->call_rcu(&z->head, _rcu_rbtree_free_node);
+#else
+ _rcu_rbtree_free_node(&z->head);
+#endif
return 0;
}