+
+/*
+ * _cds_ja_add: insert @node under @key in judy array @ja.
+ *
+ * Returns 0 on success, -EINVAL if @key exceeds the array's key space.
+ * When @unique_node_ret is non-NULL, duplicates are refused: -EEXIST
+ * is returned and the pre-existing node is stored in *@unique_node_ret.
+ * NOTE(review): presumably called with the RCU read-side lock held,
+ * like cds_ja_del — confirm against the public header.
+ */
+static
+int _cds_ja_add(struct cds_ja *ja, uint64_t key,
+		struct cds_ja_node *node,
+		struct cds_ja_node **unique_node_ret)
+{
+	unsigned int tree_depth, i;
+	struct cds_ja_inode_flag *attach_node_flag,
+		*parent_node_flag,
+		*parent2_node_flag,
+		*node_flag,
+		*parent_attach_node_flag;
+	struct cds_ja_inode_flag **attach_node_flag_ptr,
+		**parent_node_flag_ptr,
+		**node_flag_ptr;
+	int ret;
+
+	if (caa_unlikely(key > ja->key_max)) {
+		return -EINVAL;
+	}
+	tree_depth = ja->tree_depth;
+
+retry:
+	dbg_printf("cds_ja_add attempt: key %" PRIu64 ", node %p\n",
+		key, node);
+	parent2_node_flag = NULL;
+	parent_node_flag =
+		(struct cds_ja_inode_flag *) &ja->root;	/* Use root ptr address as key for mutex */
+	parent_node_flag_ptr = NULL;
+	node_flag = rcu_dereference(ja->root);
+	node_flag_ptr = &ja->root;
+
+	/* Iterate on all internal levels */
+	for (i = 1; i < tree_depth; i++) {
+		uint8_t iter_key;
+
+		/* Stop at the first NULL child: attach point found. */
+		if (!ja_node_ptr(node_flag))
+			break;
+		dbg_printf("cds_ja_add iter parent2_node_flag %p parent_node_flag %p node_flag_ptr %p node_flag %p\n",
+			parent2_node_flag, parent_node_flag, node_flag_ptr, node_flag);
+		/* Key byte selecting the child at this level (MSB first). */
+		iter_key = (uint8_t) (key >> (JA_BITS_PER_BYTE * (tree_depth - i - 1)));
+		/* Remember grandparent/parent for the attach/chain step. */
+		parent2_node_flag = parent_node_flag;
+		parent_node_flag = node_flag;
+		parent_node_flag_ptr = node_flag_ptr;
+		node_flag = ja_node_get_nth(node_flag,
+			&node_flag_ptr,
+			iter_key);
+	}
+
+	/*
+	 * We reached either bottom of tree or internal NULL node,
+	 * simply add node to last internal level, or chain it if key is
+	 * already present.
+	 */
+	if (!ja_node_ptr(node_flag)) {
+		dbg_printf("cds_ja_add NULL parent2_node_flag %p parent_node_flag %p node_flag_ptr %p node_flag %p\n",
+			parent2_node_flag, parent_node_flag, node_flag_ptr, node_flag);
+
+		/* Key absent: attach a new branch at level i. */
+		attach_node_flag = parent_node_flag;
+		attach_node_flag_ptr = parent_node_flag_ptr;
+		parent_attach_node_flag = parent2_node_flag;
+
+		ret = ja_attach_node(ja, attach_node_flag_ptr,
+				attach_node_flag,
+				parent_attach_node_flag,
+				node_flag_ptr,
+				node_flag,
+				key, i, node);
+	} else {
+		/* Key already present: report or chain as a duplicate. */
+		if (unique_node_ret) {
+			*unique_node_ret = (struct cds_ja_node *) ja_node_ptr(node_flag);
+			return -EEXIST;
+		}
+
+		dbg_printf("cds_ja_add duplicate parent2_node_flag %p parent_node_flag %p node_flag_ptr %p node_flag %p\n",
+			parent2_node_flag, parent_node_flag, node_flag_ptr, node_flag);
+
+		attach_node_flag = node_flag;
+		attach_node_flag_ptr = node_flag_ptr;
+		parent_attach_node_flag = parent_node_flag;
+
+		ret = ja_chain_node(ja,
+			parent_attach_node_flag,
+			attach_node_flag_ptr,
+			attach_node_flag,
+			node);
+	}
+	/*
+	 * Retry on concurrent modification (-EAGAIN) or when the key
+	 * appeared/disappeared between lookup and locked insert
+	 * (-EEXIST from the attach/chain helpers).
+	 */
+	if (ret == -EAGAIN || ret == -EEXIST)
+		goto retry;
+
+	return ret;
+}
+
+/*
+ * cds_ja_add: public insertion entry point. Duplicates for an
+ * existing key are chained rather than refused.
+ */
+int cds_ja_add(struct cds_ja *ja, uint64_t key,
+		struct cds_ja_node *node)
+{
+	int ret;
+
+	ret = _cds_ja_add(ja, key, node, NULL);
+	return ret;
+}
+
+/*
+ * cds_ja_add_unique: insert @node under @key only if the key is not
+ * already present. Returns @node when the insertion succeeded (or
+ * failed for a reason other than key collision), or the already
+ * present node when the key exists.
+ */
+struct cds_ja_node *cds_ja_add_unique(struct cds_ja *ja, uint64_t key,
+		struct cds_ja_node *node)
+{
+	struct cds_ja_node *existing;
+
+	if (_cds_ja_add(ja, key, node, &existing) == -EEXIST)
+		return existing;
+	return node;
+}
+
+/*
+ * Note: there is no need to lookup the pointer address associated with
+ * each node's nth item after taking the lock: it's already been done by
+ * cds_ja_del while holding the rcu read-side lock, and our node rules
+ * ensure that when a match value -> pointer is found in a node, it is
+ * _NEVER_ changed for that node without recompaction, and recompaction
+ * reallocates the node.
+ * However, when a child is removed from "linear" nodes, its pointer
+ * is set to NULL. We therefore check, while holding the locks, if this
+ * pointer is NULL, and return -ENOENT to the caller if it is the case.
+ *
+ * ja_detach_node() ensures that a lookup will _never_ see a branch that
+ * leads to a dead-end: when removing a branch, it makes sure to perform
+ * the "cut" at the highest node that has only one child, effectively
+ * replacing it with a NULL pointer.
+ */
+/*
+ * ja_detach_node: remove the branch leading to @key by cutting it at
+ * the highest node that has only one child.
+ *
+ * @snapshot, @snapshot_ptr, @snapshot_n: per-level node, pointer
+ * location and key byte recorded by cds_ja_del during its RCU lookup.
+ * @nr_snapshot: number of recorded levels (ja->tree_depth + 1).
+ *
+ * Returns 0 on success, -EAGAIN or -ENOENT when a concurrent update
+ * invalidated the snapshot (caller re-does the lookup and retries).
+ */
+static
+int ja_detach_node(struct cds_ja *ja,
+		struct cds_ja_inode_flag **snapshot,
+		struct cds_ja_inode_flag ***snapshot_ptr,
+		uint8_t *snapshot_n,
+		int nr_snapshot,
+		uint64_t key,
+		struct cds_ja_node *node)
+{
+	struct cds_ja_shadow_node *shadow_nodes[JA_MAX_DEPTH];
+	struct cds_ja_inode_flag **node_flag_ptr = NULL,
+		*parent_node_flag = NULL,
+		**parent_node_flag_ptr = NULL;
+	struct cds_ja_inode_flag *iter_node_flag;
+	int ret, i, nr_shadow = 0, nr_clear = 0, nr_branch = 0;
+	uint8_t n = 0;
+
+	assert(nr_snapshot == ja->tree_depth + 1);
+
+	/*
+	 * From the last internal level node going up, get the node
+	 * lock, check if the node has only one child left. If it is the
+	 * case, we continue iterating upward. When we reach a node
+	 * which has more than one child left, we lock the parent, and
+	 * proceed to the node deletion (removing its children too).
+	 */
+	for (i = nr_snapshot - 2; i >= 1; i--) {
+		struct cds_ja_shadow_node *shadow_node;
+
+		shadow_node = rcuja_shadow_lookup_lock(ja->ht,
+				snapshot[i]);
+		if (!shadow_node) {
+			/* Node vanished before we could lock it: retry. */
+			ret = -EAGAIN;
+			goto end;
+		}
+		shadow_nodes[nr_shadow++] = shadow_node;
+
+		/*
+		 * Check if node has been removed between RCU
+		 * lookup and lock acquisition.
+		 */
+		assert(snapshot_ptr[i + 1]);
+		if (ja_node_ptr(*snapshot_ptr[i + 1])
+				!= ja_node_ptr(snapshot[i + 1])) {
+			ret = -ENOENT;
+			goto end;
+		}
+
+		assert(shadow_node->nr_child > 0);
+		/* Single-child nodes (except level 1) get freed below. */
+		if (shadow_node->nr_child == 1 && i > 1)
+			nr_clear++;
+		nr_branch++;
+		if (shadow_node->nr_child > 1 || i == 1) {
+			/* Lock parent and break */
+			shadow_node = rcuja_shadow_lookup_lock(ja->ht,
+					snapshot[i - 1]);
+			if (!shadow_node) {
+				ret = -EAGAIN;
+				goto end;
+			}
+			shadow_nodes[nr_shadow++] = shadow_node;
+
+			/*
+			 * Check if node has been removed between RCU
+			 * lookup and lock acquisition.
+			 */
+			assert(snapshot_ptr[i]);
+			if (ja_node_ptr(*snapshot_ptr[i])
+					!= ja_node_ptr(snapshot[i])) {
+				ret = -ENOENT;
+				goto end;
+			}
+
+			/* Cut point: child slot @n of this parent. */
+			node_flag_ptr = snapshot_ptr[i + 1];
+			n = snapshot_n[i + 1];
+			parent_node_flag_ptr = snapshot_ptr[i];
+			parent_node_flag = snapshot[i];
+
+			if (i > 1) {
+				/*
+				 * Lock parent's parent, in case we need
+				 * to recompact parent.
+				 */
+				shadow_node = rcuja_shadow_lookup_lock(ja->ht,
+						snapshot[i - 2]);
+				if (!shadow_node) {
+					ret = -EAGAIN;
+					goto end;
+				}
+				shadow_nodes[nr_shadow++] = shadow_node;
+
+				/*
+				 * Check if node has been removed between RCU
+				 * lookup and lock acquisition.
+				 */
+				assert(snapshot_ptr[i - 1]);
+				if (ja_node_ptr(*snapshot_ptr[i - 1])
+						!= ja_node_ptr(snapshot[i - 1])) {
+					ret = -ENOENT;
+					goto end;
+				}
+			}
+
+			break;
+		}
+	}
+
+	/*
+	 * At this point, we want to delete all nodes that are about to
+	 * be removed from shadow_nodes (except the last one, which is
+	 * either the root or the parent of the upmost node with 1
+	 * child). OK to free lock here, because RCU read lock is held,
+	 * and free only performed in call_rcu.
+	 */
+
+	for (i = 0; i < nr_clear; i++) {
+		ret = rcuja_shadow_clear(ja->ht,
+				shadow_nodes[i]->node_flag,
+				shadow_nodes[i],
+				RCUJA_SHADOW_CLEAR_FREE_NODE
+				| RCUJA_SHADOW_CLEAR_FREE_LOCK);
+		assert(!ret);
+	}
+
+	iter_node_flag = parent_node_flag;
+	/* Remove from parent */
+	ret = ja_node_clear_ptr(ja,
+		node_flag_ptr,		/* Pointer to location to nullify */
+		&iter_node_flag,	/* In: parent ptr; out: value to publish in its parent */
+		shadow_nodes[nr_branch - 1],	/* of parent */
+		n, nr_branch - 1);
+	if (ret)
+		goto end;
+
+	dbg_printf("ja_detach_node: publish %p instead of %p\n",
+		iter_node_flag, *parent_node_flag_ptr);
+	/* Update address of parent ptr in its parent */
+	rcu_assign_pointer(*parent_node_flag_ptr, iter_node_flag);
+
+end:
+	/* Release every lock taken above, on all paths. */
+	for (i = 0; i < nr_shadow; i++)
+		rcuja_shadow_unlock(shadow_nodes[i]);
+	return ret;
+}
+
+/*
+ * ja_unchain_node: remove @node from the duplicate list rooted at
+ * *@node_flag_ptr, under the lock of @parent_node_flag's shadow node.
+ *
+ * Returns 0 on success, -EAGAIN when the list changed concurrently
+ * since the lockless check — including when @node turned out to be
+ * the last duplicate, which must instead go through ja_detach_node
+ * (caller retries its lookup).
+ */
+static
+int ja_unchain_node(struct cds_ja *ja,
+		struct cds_ja_inode_flag *parent_node_flag,
+		struct cds_ja_inode_flag **node_flag_ptr,
+		struct cds_ja_inode_flag *node_flag,
+		struct cds_ja_node *node)
+{
+	struct cds_ja_shadow_node *shadow_node;
+	struct cds_ja_node *iter_node, **iter_node_ptr, **prev_node_ptr = NULL;
+	int ret = 0, count = 0, found = 0;
+
+	shadow_node = rcuja_shadow_lookup_lock(ja->ht, parent_node_flag);
+	if (!shadow_node)
+		return -EAGAIN;
+	/* Leaf pointer changed since RCU lookup: retry. */
+	if (ja_node_ptr(*node_flag_ptr) != ja_node_ptr(node_flag)) {
+		ret = -EAGAIN;
+		goto end;
+	}
+	/*
+	 * Find the previous node's next pointer pointing to our node,
+	 * so we can update it. Retry if another thread removed all but
+	 * one of duplicates since check (this check was performed
+	 * without lock). Ensure that the node we are about to remove is
+	 * still in the list (while holding lock). No need for RCU
+	 * traversal here since we hold the lock on the parent.
+	 */
+	iter_node_ptr = (struct cds_ja_node **) node_flag_ptr;
+	iter_node = (struct cds_ja_node *) ja_node_ptr(node_flag);
+	cds_ja_for_each_duplicate(iter_node) {
+		count++;
+		if (iter_node == node) {
+			prev_node_ptr = iter_node_ptr;
+			found++;
+		}
+		iter_node_ptr = &iter_node->next;
+	}
+	/* @node appears at most once in the duplicate list. */
+	assert(found <= 1);
+	if (!found || count == 1) {
+		ret = -EAGAIN;
+		goto end;
+	}
+	/* Unlink: make the predecessor's next skip over @node. */
+	CMM_STORE_SHARED(*prev_node_ptr, node->next);
+	/*
+	 * Validate that we indeed removed the node from linked list.
+	 */
+	assert(ja_node_ptr(*node_flag_ptr) != (struct cds_ja_inode *) node);
+end:
+	rcuja_shadow_unlock(shadow_node);
+	return ret;
+}
+
+/*
+ * cds_ja_del: remove @node, expected under @key, from judy array @ja.
+ * Called with RCU read lock held.
+ *
+ * Returns 0 on success, -EINVAL if @key is out of range, -ENOENT if
+ * the key or the exact @node cannot be found. Internally retries on
+ * -EAGAIN/-ENOENT reported by the locked removal helpers (concurrent
+ * modification between RCU lookup and lock acquisition).
+ */
+int cds_ja_del(struct cds_ja *ja, uint64_t key,
+		struct cds_ja_node *node)
+{
+	unsigned int tree_depth, i;
+	struct cds_ja_inode_flag *snapshot[JA_MAX_DEPTH];
+	struct cds_ja_inode_flag **snapshot_ptr[JA_MAX_DEPTH];
+	uint8_t snapshot_n[JA_MAX_DEPTH];
+	struct cds_ja_inode_flag *node_flag;
+	struct cds_ja_inode_flag **prev_node_flag_ptr,
+		**node_flag_ptr;
+	int nr_snapshot;
+	int ret;
+
+	if (caa_unlikely(key > ja->key_max))
+		return -EINVAL;
+	tree_depth = ja->tree_depth;
+
+retry:
+	nr_snapshot = 0;
+	dbg_printf("cds_ja_del attempt: key %" PRIu64 ", node %p\n",
+		key, node);
+
+	/* snapshot for level 0 is only for shadow node lookup */
+	snapshot_n[0] = 0;
+	snapshot_n[1] = 0;
+	snapshot_ptr[nr_snapshot] = NULL;
+	snapshot[nr_snapshot++] = (struct cds_ja_inode_flag *) &ja->root;
+	node_flag = rcu_dereference(ja->root);
+	prev_node_flag_ptr = &ja->root;
+	node_flag_ptr = &ja->root;
+
+	/* Iterate on all internal levels */
+	for (i = 1; i < tree_depth; i++) {
+		uint8_t iter_key;
+
+		dbg_printf("cds_ja_del iter node_flag %p\n",
+			node_flag);
+		if (!ja_node_ptr(node_flag)) {
+			return -ENOENT;
+		}
+		/* Key byte selecting the child at this level (MSB first). */
+		iter_key = (uint8_t) (key >> (JA_BITS_PER_BYTE * (tree_depth - i - 1)));
+		/* Record the branch for ja_detach_node's re-validation. */
+		snapshot_n[nr_snapshot + 1] = iter_key;
+		snapshot_ptr[nr_snapshot] = prev_node_flag_ptr;
+		snapshot[nr_snapshot++] = node_flag;
+		node_flag = ja_node_get_nth(node_flag,
+			&node_flag_ptr,
+			iter_key);
+		if (node_flag)
+			prev_node_flag_ptr = node_flag_ptr;
+		dbg_printf("cds_ja_del iter key lookup %u finds node_flag %p, prev_node_flag_ptr %p\n",
+			(unsigned int) iter_key, node_flag,
+			prev_node_flag_ptr);
+	}
+	/*
+	 * We reached bottom of tree, try to find the node we are trying
+	 * to remove. Fail if we cannot find it.
+	 */
+	if (!ja_node_ptr(node_flag)) {
+		dbg_printf("cds_ja_del: no node found for key %" PRIu64 "\n",
+			key);
+		return -ENOENT;
+	} else {
+		struct cds_ja_node *iter_node, *match = NULL;
+		int count = 0;
+
+		/* Search @node within the key's duplicate list. */
+		iter_node = (struct cds_ja_node *) ja_node_ptr(node_flag);
+		cds_ja_for_each_duplicate_rcu(iter_node) {
+			dbg_printf("cds_ja_del: compare %p with iter_node %p\n", node, iter_node);
+			if (iter_node == node)
+				match = iter_node;
+			count++;
+		}
+
+		if (!match) {
+			dbg_printf("cds_ja_del: no node match for node %p key %" PRIu64 "\n", node, key);
+			return -ENOENT;
+		}
+		assert(count > 0);
+		if (count == 1) {
+			/*
+			 * Removing last of duplicates. Last snapshot
+			 * does not have a shadow node (external leafs).
+			 */
+			snapshot_ptr[nr_snapshot] = prev_node_flag_ptr;
+			snapshot[nr_snapshot++] = node_flag;
+			ret = ja_detach_node(ja, snapshot, snapshot_ptr,
+					snapshot_n, nr_snapshot, key, node);
+		} else {
+			/* Other duplicates remain: just unlink @node. */
+			ret = ja_unchain_node(ja, snapshot[nr_snapshot - 1],
+					node_flag_ptr, node_flag, match);
+		}
+	}
+	/*
+	 * Explanation of -ENOENT handling: caused by concurrent delete
+	 * between RCU lookup and actual removal. Need to re-do the
+	 * lookup and removal attempt.
+	 */
+	if (ret == -EAGAIN || ret == -ENOENT)
+		goto retry;
+	return ret;
+}
+
+/*
+ * _cds_ja_new: allocate and initialize a judy array handling keys of
+ * @key_bits bits (any multiple of 8, up to 64), using RCU flavor
+ * @flavor for its shadow-node hash table.
+ *
+ * Returns the new judy array, or NULL on error (allocation failure or
+ * unsupported @key_bits).
+ */
+struct cds_ja *_cds_ja_new(unsigned int key_bits,
+		const struct rcu_flavor_struct *flavor)
+{
+	struct cds_ja *ja;
+	int ret;
+	struct cds_ja_shadow_node *root_shadow_node;
+
+	/* calloc takes (nmemb, size): one zero-initialized struct. */
+	ja = calloc(1, sizeof(*ja));
+	if (!ja)
+		goto ja_error;
+
+	switch (key_bits) {
+	case 8:
+	case 16:
+	case 24:
+	case 32:
+	case 40:
+	case 48:
+	case 56:
+		ja->key_max = (1ULL << key_bits) - 1;
+		break;
+	case 64:
+		/* 1ULL << 64 would be undefined behavior: special-case. */
+		ja->key_max = UINT64_MAX;
+		break;
+	default:
+		goto check_error;
+	}
+
+	/* ja->root is NULL (zeroed by calloc). */
+	/* tree_depth 0 is for pointer to root node */
+	ja->tree_depth = (key_bits >> JA_LOG2_BITS_PER_BYTE) + 1;
+	assert(ja->tree_depth <= JA_MAX_DEPTH);
+	ja->ht = rcuja_create_ht(flavor);
+	if (!ja->ht)
+		goto ht_error;
+
+	/*
+	 * Register a shadow node for the root pointer, providing its
+	 * lock. Note: we should not free this node until judy array
+	 * destroy.
+	 */
+	root_shadow_node = rcuja_shadow_set(ja->ht,
+			(struct cds_ja_inode_flag *) &ja->root,
+			NULL, ja, 0);
+	if (!root_shadow_node)
+		goto ht_node_error;
+
+	return ja;
+
+ht_node_error:
+	ret = rcuja_delete_ht(ja->ht);
+	assert(!ret);
+ht_error:
+check_error:
+	free(ja);
+ja_error:
+	return NULL;
+}
+
+/*
+ * print_debug_fallback_distribution: dump, on stderr, the histogram
+ * of fallback node usage accumulated in
+ * ja->node_fallback_count_distribution, skipping empty buckets.
+ * NOTE(review): %4lu assumes the distribution counters are
+ * unsigned long — confirm against the struct cds_ja declaration.
+ */
+static
+void print_debug_fallback_distribution(struct cds_ja *ja)
+{
+	int i;
+
+	fprintf(stderr, "Fallback node distribution:\n");
+	for (i = 0; i < JA_ENTRY_PER_NODE; i++) {
+		if (!ja->node_fallback_count_distribution[i])
+			continue;
+		/* %3d matches the signed loop index (was %3u). */
+		fprintf(stderr, "  %3d: %4lu\n",
+			i, ja->node_fallback_count_distribution[i]);
+	}
+}
+
+/*
+ * ja_final_checks: report statistics at destroy time: warn about
+ * fallback node usage, and flag node leaks (allocated != freed).
+ * Returns 0 when no leak is detected, -1 otherwise.
+ */
+static
+int ja_final_checks(struct cds_ja *ja)
+{
+	double fallback_ratio;
+	unsigned long na, nf, nr_fallback;
+	int ret = 0;
+
+	nr_fallback = uatomic_read(&ja->nr_fallback);
+	na = uatomic_read(&ja->nr_nodes_allocated);
+	nf = uatomic_read(&ja->nr_nodes_freed);
+
+	if (nr_fallback) {
+		/*
+		 * Use the values read above so count and ratio are
+		 * consistent (na != 0 here: fallback nodes are
+		 * accounted as allocated nodes).
+		 */
+		fallback_ratio = (double) nr_fallback / (double) na;
+		fprintf(stderr,
+			"[warning] RCU Judy Array used %lu fallback node(s) (ratio: %g)\n",
+			nr_fallback, fallback_ratio);
+	}
+
+	dbg_printf("Nodes allocated: %lu, Nodes freed: %lu.\n", na, nf);
+	if (nr_fallback)
+		print_debug_fallback_distribution(ja);
+
+	if (na != nf) {
+		/*
+		 * Cast the difference, not just na: (long) na - nf
+		 * would be evaluated as unsigned long, mismatching the
+		 * %ld conversion specifier.
+		 */
+		fprintf(stderr, "[error] Judy array leaked %ld nodes. Allocated: %lu, freed: %lu.\n",
+			(long) (na - nf), na, nf);
+		ret = -1;
+	}
+	return ret;
+}
+
+/*
+ * There should be no more concurrent add, delete, nor look-up performed
+ * on the Judy array while it is being destroyed (ensured by the
+ * caller).
+ *
+ * Frees all remaining nodes and the array itself. Returns 0 on
+ * success, a negative error from hash table deletion, or -1 if a node
+ * leak was detected.
+ */
+int cds_ja_destroy(struct cds_ja *ja)
+{
+	const struct rcu_flavor_struct *flavor;
+	int ret;
+
+	flavor = cds_lfht_rcu_flavor(ja->ht);
+	/* Free every remaining node (and its lock) through the shadow table. */
+	rcuja_shadow_prune(ja->ht,
+		RCUJA_SHADOW_CLEAR_FREE_NODE | RCUJA_SHADOW_CLEAR_FREE_LOCK);
+	/*
+	 * Go offline so this thread does not block grace periods while
+	 * the hash table teardown waits for them.
+	 */
+	flavor->thread_offline();
+	ret = rcuja_delete_ht(ja->ht);
+	if (ret)
+		return ret;
+
+	/* Wait for in-flight call_rcu free to complete. */
+	flavor->barrier();
+
+	flavor->thread_online();
+	/* Check allocation/free balance before releasing the array. */
+	ret = ja_final_checks(ja);
+	free(ja);
+	return ret;
+}