Fix rcuja: fallback when adding element to full pool
[userspace-rcu.git] / rcuja / rcuja.c
index 407c101be65eeaecb98a45b70dad90a360bb81d8..fac1eb75d56a09210e87296a35b14664a09724ec 100644 (file)
@@ -277,6 +277,8 @@ static
 struct cds_ja_inode_flag *ja_linear_node_get_nth(const struct cds_ja_type *type,
                struct cds_ja_inode *node,
                struct cds_ja_inode_flag ***child_node_flag_ptr,
+               struct cds_ja_inode_flag **child_node_flag_v,
+               struct cds_ja_inode_flag ***node_flag_ptr,
                uint8_t n)
 {
        uint8_t nr_child;
@@ -297,12 +299,19 @@ struct cds_ja_inode_flag *ja_linear_node_get_nth(const struct cds_ja_type *type,
                if (CMM_LOAD_SHARED(values[i]) == n)
                        break;
        }
-       if (i >= nr_child)
+       if (i >= nr_child) {
+               if (caa_unlikely(node_flag_ptr))
+                       *node_flag_ptr = NULL;
                return NULL;
+       }
        pointers = (struct cds_ja_inode_flag **) align_ptr_size(&values[type->max_linear_child]);
        ptr = rcu_dereference(pointers[i]);
        if (caa_unlikely(child_node_flag_ptr) && ptr)
                *child_node_flag_ptr = &pointers[i];
+       if (caa_unlikely(child_node_flag_v) && ptr)
+               *child_node_flag_v = ptr;
+       if (caa_unlikely(node_flag_ptr))
+               *node_flag_ptr = &pointers[i];
        return ptr;
 }
 
@@ -329,6 +338,8 @@ static
 struct cds_ja_inode_flag *ja_pool_node_get_nth(const struct cds_ja_type *type,
                struct cds_ja_inode *node,
                struct cds_ja_inode_flag ***child_node_flag_ptr,
+               struct cds_ja_inode_flag **child_node_flag_v,
+               struct cds_ja_inode_flag ***node_flag_ptr,
                uint8_t n)
 {
        struct cds_ja_inode *linear;
@@ -340,7 +351,8 @@ struct cds_ja_inode_flag *ja_pool_node_get_nth(const struct cds_ja_type *type,
         */
        linear = (struct cds_ja_inode *)
                &node->u.data[((unsigned long) n >> (CHAR_BIT - type->nr_pool_order)) << type->pool_size_order];
-       return ja_linear_node_get_nth(type, linear, child_node_flag_ptr, n);
+       return ja_linear_node_get_nth(type, linear, child_node_flag_ptr,
+               child_node_flag_v, node_flag_ptr, n);
 }
 
 static
@@ -357,17 +369,25 @@ static
 struct cds_ja_inode_flag *ja_pigeon_node_get_nth(const struct cds_ja_type *type,
                struct cds_ja_inode *node,
                struct cds_ja_inode_flag ***child_node_flag_ptr,
+               struct cds_ja_inode_flag **child_node_flag_v,
+               struct cds_ja_inode_flag ***node_flag_ptr,
                uint8_t n)
 {
        struct cds_ja_inode_flag **child_node_flag;
+       struct cds_ja_inode_flag *child_node_flag_read;
 
        assert(type->type_class == RCU_JA_PIGEON);
        child_node_flag = &((struct cds_ja_inode_flag **) node->u.data)[n];
+       child_node_flag_read = rcu_dereference(*child_node_flag);
        dbg_printf("ja_pigeon_node_get_nth child_node_flag_ptr %p\n",
                child_node_flag);
-       if (caa_unlikely(child_node_flag_ptr) && *child_node_flag)
+       if (caa_unlikely(child_node_flag_ptr) && child_node_flag_read)
                *child_node_flag_ptr = child_node_flag;
-       return rcu_dereference(*child_node_flag);
+       if (caa_unlikely(child_node_flag_v) && child_node_flag_read)
+               *child_node_flag_v = child_node_flag_read;
+       if (caa_unlikely(node_flag_ptr))
+               *node_flag_ptr = child_node_flag;
+       return child_node_flag_read;
 }
 
 static
@@ -375,7 +395,7 @@ struct cds_ja_inode_flag *ja_pigeon_node_get_ith_pos(const struct cds_ja_type *t
                struct cds_ja_inode *node,
                uint8_t i)
 {
-       return ja_pigeon_node_get_nth(type, node, NULL, i);
+       return ja_pigeon_node_get_nth(type, node, NULL, NULL, NULL, i);
 }
 
 /*
@@ -383,8 +403,10 @@ struct cds_ja_inode_flag *ja_pigeon_node_get_ith_pos(const struct cds_ja_type *t
  * node_flag is already rcu_dereference'd.
  */
 static
-struct cds_ja_inode_flag * ja_node_get_nth(struct cds_ja_inode_flag *node_flag,
+struct cds_ja_inode_flag *ja_node_get_nth(struct cds_ja_inode_flag *node_flag,
                struct cds_ja_inode_flag ***child_node_flag_ptr,
+               struct cds_ja_inode_flag **child_node_flag,
+               struct cds_ja_inode_flag ***node_flag_ptr,
                uint8_t n)
 {
        unsigned int type_index;
@@ -399,13 +421,16 @@ struct cds_ja_inode_flag * ja_node_get_nth(struct cds_ja_inode_flag *node_flag,
        switch (type->type_class) {
        case RCU_JA_LINEAR:
                return ja_linear_node_get_nth(type, node,
-                               child_node_flag_ptr, n);
+                               child_node_flag_ptr, child_node_flag,
+                               node_flag_ptr, n);
        case RCU_JA_POOL:
                return ja_pool_node_get_nth(type, node,
-                               child_node_flag_ptr, n);
+                               child_node_flag_ptr, child_node_flag,
+                               node_flag_ptr, n);
        case RCU_JA_PIGEON:
                return ja_pigeon_node_get_nth(type, node,
-                               child_node_flag_ptr, n);
+                               child_node_flag_ptr, child_node_flag,
+                               node_flag_ptr, n);
        default:
                assert(0);
                return (void *) -1UL;
@@ -810,6 +835,9 @@ skip_copy:
                ret = _ja_node_set_nth(new_type, new_node,
                                new_shadow_node,
                                n, child_node_flag);
+               if (new_type->type_class == RCU_JA_POOL && ret) {
+                       goto fallback_toosmall;
+               }
                assert(!ret);
        }
        /* Return pointer to new recompacted node through old_node_flag_ptr */
@@ -938,7 +966,7 @@ struct cds_hlist_head cds_ja_lookup(struct cds_ja *ja, uint64_t key)
                uint8_t iter_key;
 
                iter_key = (uint8_t) (key >> (JA_BITS_PER_BYTE * (tree_depth - i - 1)));
-               node_flag = ja_node_get_nth(node_flag, NULL,
+               node_flag = ja_node_get_nth(node_flag, NULL, NULL, NULL,
                        iter_key);
                dbg_printf("cds_ja_lookup iter key lookup %u finds node_flag %p\n",
                                (unsigned int) iter_key, node_flag);
@@ -965,6 +993,8 @@ struct cds_hlist_head cds_ja_lookup(struct cds_ja *ja, uint64_t key)
  */
 static
 int ja_attach_node(struct cds_ja *ja,
+               struct cds_ja_inode_flag **attach_node_flag_ptr,
+               struct cds_ja_inode_flag *attach_node_flag,
                struct cds_ja_inode_flag **node_flag_ptr,
                struct cds_ja_inode_flag *node_flag,
                struct cds_ja_inode_flag *parent_node_flag,
@@ -1000,6 +1030,27 @@ int ja_attach_node(struct cds_ja *ja,
                }
        }
 
+       if (node_flag_ptr && ja_node_ptr(*node_flag_ptr)) {
+               /*
+                * The slot for this key no longer holds a NULL node
+                * pointer: it has been populated between RCU lookup and
+                * lock acquisition. We need to re-try lookup and attach.
+                */
+               ret = -EAGAIN;
+               goto unlock_parent;
+       }
+
+       if (attach_node_flag_ptr && ja_node_ptr(*attach_node_flag_ptr) !=
+                       ja_node_ptr(attach_node_flag)) {
+               /*
+                * Target node has been updated between RCU lookup and
+                * lock acquisition. We need to re-try lookup and
+                * attach.
+                */
+               ret = -EAGAIN;
+               goto unlock_parent;
+       }
+
        /* Create new branch, starting from bottom */
        CDS_INIT_HLIST_HEAD(&head);
        cds_hlist_add_head_rcu(&child_node->list, &head);
@@ -1040,8 +1091,8 @@ int ja_attach_node(struct cds_ja *ja,
 
        /* Publish new branch */
        dbg_printf("Publish branch %p, replacing %p\n",
-               iter_node_flag, *node_flag_ptr);
-       rcu_assign_pointer(*node_flag_ptr, iter_node_flag);
+               iter_node_flag, *attach_node_flag_ptr);
+       rcu_assign_pointer(*attach_node_flag_ptr, iter_node_flag);
 
        /* Success */
        ret = 0;
@@ -1062,6 +1113,7 @@ check_error:
                        assert(!tmpret);
                }
        }
+unlock_parent:
        if (parent_shadow_node)
                rcuja_shadow_unlock(parent_shadow_node);
 unlock_shadow:
@@ -1080,31 +1132,43 @@ end:
 static
 int ja_chain_node(struct cds_ja *ja,
                struct cds_ja_inode_flag *parent_node_flag,
+               struct cds_ja_inode_flag **node_flag_ptr,
+               struct cds_ja_inode_flag *node_flag,
                struct cds_hlist_head *head,
                struct cds_ja_node *node)
 {
        struct cds_ja_shadow_node *shadow_node;
+       int ret = 0;
 
        shadow_node = rcuja_shadow_lookup_lock(ja->ht, parent_node_flag);
-       if (!shadow_node)
+       if (!shadow_node) {
                return -EAGAIN;
+       }
+       if (ja_node_ptr(*node_flag_ptr) != ja_node_ptr(node_flag)) {
+               ret = -EAGAIN;
+               goto end;
+       }
        cds_hlist_add_head_rcu(&node->list, head);
+end:
        rcuja_shadow_unlock(shadow_node);
-       return 0;
+       return ret;
 }
 
 int cds_ja_add(struct cds_ja *ja, uint64_t key,
                struct cds_ja_node *new_node)
 {
        unsigned int tree_depth, i;
-       struct cds_ja_inode_flag **node_flag_ptr;       /* in parent */
+       struct cds_ja_inode_flag **attach_node_flag_ptr,
+               **node_flag_ptr;
        struct cds_ja_inode_flag *node_flag,
                *parent_node_flag,
-               *parent2_node_flag;
+               *parent2_node_flag,
+               *attach_node_flag;
        int ret;
 
-       if (caa_unlikely(key > ja->key_max))
+       if (caa_unlikely(key > ja->key_max)) {
                return -EINVAL;
+       }
        tree_depth = ja->tree_depth;
 
 retry:
@@ -1113,6 +1177,8 @@ retry:
        parent2_node_flag = NULL;
        parent_node_flag =
                (struct cds_ja_inode_flag *) &ja->root; /* Use root ptr address as key for mutex */
+       attach_node_flag_ptr = &ja->root;
+       attach_node_flag = rcu_dereference(ja->root);
        node_flag_ptr = &ja->root;
        node_flag = rcu_dereference(ja->root);
 
@@ -1120,11 +1186,14 @@ retry:
        for (i = 1; i < tree_depth; i++) {
                uint8_t iter_key;
 
-               dbg_printf("cds_ja_add iter node_flag_ptr %p node_flag %p\n",
-                               *node_flag_ptr, node_flag);
+               dbg_printf("cds_ja_add iter attach_node_flag_ptr %p node_flag_ptr %p node_flag %p\n",
+                               attach_node_flag_ptr, node_flag_ptr, node_flag);
                if (!ja_node_ptr(node_flag)) {
-                       ret = ja_attach_node(ja, node_flag_ptr,
-                                       parent_node_flag, parent2_node_flag,
+                       ret = ja_attach_node(ja, attach_node_flag_ptr,
+                                       attach_node_flag,
+                                       node_flag_ptr,
+                                       parent_node_flag,
+                                       parent2_node_flag,
                                        key, i, new_node);
                        if (ret == -EAGAIN || ret == -EEXIST)
                                goto retry;
@@ -1135,10 +1204,14 @@ retry:
                parent2_node_flag = parent_node_flag;
                parent_node_flag = node_flag;
                node_flag = ja_node_get_nth(node_flag,
+                       &attach_node_flag_ptr,
+                       &attach_node_flag,
                        &node_flag_ptr,
                        iter_key);
-               dbg_printf("cds_ja_add iter key lookup %u finds node_flag %p node_flag_ptr %p\n",
-                               (unsigned int) iter_key, node_flag, *node_flag_ptr);
+               dbg_printf("cds_ja_add iter key lookup %u finds node_flag %p attach_node_flag_ptr %p node_flag_ptr %p\n",
+                               (unsigned int) iter_key, node_flag,
+                               attach_node_flag_ptr,
+                               node_flag_ptr);
        }
 
        /*
@@ -1146,17 +1219,21 @@ retry:
         * level, or chain it if key is already present.
         */
        if (!ja_node_ptr(node_flag)) {
-               dbg_printf("cds_ja_add last node_flag_ptr %p node_flag %p\n",
-                               *node_flag_ptr, node_flag);
-               ret = ja_attach_node(ja, node_flag_ptr, parent_node_flag,
+               dbg_printf("cds_ja_add attach_node_flag_ptr %p node_flag_ptr %p node_flag %p\n",
+                               attach_node_flag_ptr, node_flag_ptr, node_flag);
+               ret = ja_attach_node(ja, attach_node_flag_ptr,
+                               attach_node_flag,
+                               node_flag_ptr, parent_node_flag,
                                parent2_node_flag, key, i, new_node);
        } else {
                ret = ja_chain_node(ja,
                        parent_node_flag,
-                       (struct cds_hlist_head *) node_flag_ptr,
+                       node_flag_ptr,
+                       node_flag,
+                       (struct cds_hlist_head *) attach_node_flag_ptr,
                        new_node);
        }
-       if (ret == -EAGAIN)
+       if (ret == -EAGAIN || ret == -EEXIST)
                goto retry;
 end:
        return ret;
@@ -1169,6 +1246,9 @@ end:
  * ensure that when a match value -> pointer is found in a node, it is
  * _NEVER_ changed for that node without recompaction, and recompaction
  * reallocates the node.
+ * However, when a child is removed from "linear" nodes, its pointer
+ * is set to NULL. We therefore check, while holding the locks, if this
+ * pointer is NULL, and return -ENOENT to the caller if it is the case.
  */
 static
 int ja_detach_node(struct cds_ja *ja,
@@ -1205,9 +1285,21 @@ int ja_detach_node(struct cds_ja *ja,
                        ret = -EAGAIN;
                        goto end;
                }
-               assert(shadow_node->nr_child > 0);
                shadow_nodes[nr_shadow++] = shadow_node;
-               if (shadow_node->nr_child == 1)
+
+               /*
+                * Check if node has been removed between RCU
+                * lookup and lock acquisition.
+                */
+               assert(snapshot_ptr[i + 1]);
+               if (ja_node_ptr(*snapshot_ptr[i + 1])
+                               != ja_node_ptr(snapshot[i + 1])) {
+                       ret = -ENOENT;
+                       goto end;
+               }
+
+               assert(shadow_node->nr_child > 0);
+               if (shadow_node->nr_child == 1 && i > 1)
                        nr_clear++;
                nr_branch++;
                if (shadow_node->nr_child > 1 || i == 1) {
@@ -1219,10 +1311,23 @@ int ja_detach_node(struct cds_ja *ja,
                                goto end;
                        }
                        shadow_nodes[nr_shadow++] = shadow_node;
+
+                       /*
+                        * Check if node has been removed between RCU
+                        * lookup and lock acquisition.
+                        */
+                       assert(snapshot_ptr[i]);
+                       if (ja_node_ptr(*snapshot_ptr[i])
+                                       != ja_node_ptr(snapshot[i])) {
+                               ret = -ENOENT;
+                               goto end;
+                       }
+
                        node_flag_ptr = snapshot_ptr[i + 1];
                        n = snapshot_n[i + 1];
                        parent_node_flag_ptr = snapshot_ptr[i];
                        parent_node_flag = snapshot[i];
+
                        if (i > 1) {
                                /*
                                 * Lock parent's parent, in case we need
@@ -1235,7 +1340,19 @@ int ja_detach_node(struct cds_ja *ja,
                                        goto end;
                                }
                                shadow_nodes[nr_shadow++] = shadow_node;
+
+                               /*
+                                * Check if node has been removed between RCU
+                                * lookup and lock acquisition.
+                                */
+                               assert(snapshot_ptr[i - 1]);
+                               if (ja_node_ptr(*snapshot_ptr[i - 1])
+                                               != ja_node_ptr(snapshot[i - 1])) {
+                                       ret = -ENOENT;
+                                       goto end;
+                               }
                        }
+
                        break;
                }
        }
@@ -1244,8 +1361,8 @@ int ja_detach_node(struct cds_ja *ja,
         * At this point, we want to delete all nodes that are about to
         * be removed from shadow_nodes (except the last one, which is
         * either the root or the parent of the upmost node with 1
-        * child). OK to as to free lock here, because RCU read lock is
-        * held, and free only performed in call_rcu.
+        * child). OK to free lock here, because RCU read lock is held,
+        * and free only performed in call_rcu.
         */
 
        for (i = 0; i < nr_clear; i++) {
@@ -1264,6 +1381,8 @@ int ja_detach_node(struct cds_ja *ja,
                &iter_node_flag,        /* Old new parent ptr in its parent */
                shadow_nodes[nr_branch - 1],    /* of parent */
                n);
+       if (ret)
+               goto end;
 
        dbg_printf("ja_detach_node: publish %p instead of %p\n",
                iter_node_flag, *parent_node_flag_ptr);
@@ -1279,29 +1398,48 @@ end:
 static
 int ja_unchain_node(struct cds_ja *ja,
                struct cds_ja_inode_flag *parent_node_flag,
-               struct cds_hlist_head *head,
+               struct cds_ja_inode_flag **node_flag_ptr,
+               struct cds_ja_inode_flag *node_flag,
                struct cds_ja_node *node)
 {
        struct cds_ja_shadow_node *shadow_node;
        struct cds_hlist_node *hlist_node;
-       int ret = 0, count = 0;
+       struct cds_hlist_head hlist_head;
+       int ret = 0, count = 0, found = 0;
 
        shadow_node = rcuja_shadow_lookup_lock(ja->ht, parent_node_flag);
        if (!shadow_node)
                return -EAGAIN;
+       if (ja_node_ptr(*node_flag_ptr) != ja_node_ptr(node_flag)) {
+               ret = -EAGAIN;
+               goto end;
+       }
+       hlist_head.next = (struct cds_hlist_node *) ja_node_ptr(node_flag);
        /*
         * Retry if another thread removed all but one of duplicates
-        * since check (that was performed without lock).
+        * since check (this check was performed without lock).
+        * Ensure that the node we are about to remove is still in the
+        * list (while holding lock).
         */
-       cds_hlist_for_each_rcu(hlist_node, head, list) {
+       cds_hlist_for_each_rcu(hlist_node, &hlist_head) {
+               if (count == 0) {
+                       /* FIXME: currently a work-around */
+                       hlist_node->prev = (struct cds_hlist_node *) node_flag_ptr;
+               }
                count++;
+               if (hlist_node == &node->list)
+                       found++;
        }
-
-       if (count == 1) {
+       assert(found <= 1);
+       if (!found || count == 1) {
                ret = -EAGAIN;
                goto end;
        }
        cds_hlist_del_rcu(&node->list);
+       /*
+        * Validate that we indeed removed the node from linked list.
+        */
+       assert(ja_node_ptr(*node_flag_ptr) != (struct cds_ja_inode *) node);
 end:
        rcuja_shadow_unlock(shadow_node);
        return ret;
@@ -1318,7 +1456,8 @@ int cds_ja_del(struct cds_ja *ja, uint64_t key,
        struct cds_ja_inode_flag **snapshot_ptr[JA_MAX_DEPTH];
        uint8_t snapshot_n[JA_MAX_DEPTH];
        struct cds_ja_inode_flag *node_flag;
-       struct cds_ja_inode_flag **prev_node_flag_ptr;
+       struct cds_ja_inode_flag **prev_node_flag_ptr,
+               **node_flag_ptr;
        int nr_snapshot;
        int ret;
 
@@ -1338,6 +1477,7 @@ retry:
        snapshot[nr_snapshot++] = (struct cds_ja_inode_flag *) &ja->root;
        node_flag = rcu_dereference(ja->root);
        prev_node_flag_ptr = &ja->root;
+       node_flag_ptr = &ja->root;
 
        /* Iterate on all internal levels */
        for (i = 1; i < tree_depth; i++) {
@@ -1354,12 +1494,13 @@ retry:
                snapshot[nr_snapshot++] = node_flag;
                node_flag = ja_node_get_nth(node_flag,
                        &prev_node_flag_ptr,
+                       NULL,
+                       &node_flag_ptr,
                        iter_key);
                dbg_printf("cds_ja_del iter key lookup %u finds node_flag %p, prev_node_flag_ptr %p\n",
                                (unsigned int) iter_key, node_flag,
                                prev_node_flag_ptr);
        }
-
        /*
         * We reached bottom of tree, try to find the node we are trying
         * to remove. Fail if we cannot find it.
@@ -1401,10 +1542,15 @@ retry:
                                        snapshot_n, nr_snapshot, key, node);
                } else {
                        ret = ja_unchain_node(ja, snapshot[nr_snapshot - 1],
-                               &hlist_head, match);
+                               node_flag_ptr, node_flag, match);
                }
        }
-       if (ret == -EAGAIN)
+       /*
+        * -ENOENT is returned when a concurrent delete removed the node
+        * between the RCU lookup and the actual removal. In that case,
+        * we need to re-do the lookup and the removal attempt.
+        */
+       if (ret == -EAGAIN || ret == -ENOENT)
                goto retry;
        return ret;
 }
@@ -1422,13 +1568,13 @@ struct cds_ja *_cds_ja_new(unsigned int key_bits,
 
        switch (key_bits) {
        case 8:
-               ja->key_max = UINT8_MAX;
-               break;
        case 16:
-               ja->key_max = UINT16_MAX;
-               break;
+       case 24:
        case 32:
-               ja->key_max = UINT32_MAX;
+       case 40:
+       case 48:
+       case 56:
+               ja->key_max = (1ULL << key_bits) - 1;
                break;
        case 64:
                ja->key_max = UINT64_MAX;
This page took 0.031097 seconds and 4 git commands to generate.