* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
*/
+/*
+ * Based on the following articles:
+ * - Ori Shalev and Nir Shavit. Split-ordered lists: Lock-free
+ * extensible hash tables. J. ACM 53, 3 (May 2006), 379-405.
+ * - Maged M. Michael. High performance dynamic lock-free hash tables
+ * and list-based sets. In Proceedings of the fourteenth annual ACM
+ * symposium on Parallel Algorithms and Architectures, ACM Press,
+ * (2002), 73-82.
+ *
+ * Specific characteristics of this Lock-Free Expandable RCU Hash Table
+ * implementation:
+ *
+ * - An RCU read-side critical section allows readers to perform hash
+ * table lookups and use the returned objects safely, by delaying
+ * memory reclamation for a grace period after removal.
+ * - Add and remove operations are lock-free, and do not need to
+ * allocate memory. They need to be executed within an RCU read-side
+ * critical section to ensure the objects they read are valid and to
+ * deal with the cmpxchg ABA problem.
+ * - add and add_unique operations are supported. add_unique checks
+ * whether the node's key already exists in the hash table, ensuring
+ * that no duplicate key exists.
+ * - The resize operation executes concurrently with add/remove/lookup.
+ * - Hash table nodes are contained within a split-ordered list. This
+ * list is ordered by increasing reversed-bits-hash value (see the
+ * bit-reversal sketch following this comment).
+ * - An index of dummy nodes is kept. These dummy nodes are the hash
+ * table "buckets", and they are also chained together in the
+ * split-ordered list, which allows recursive expansion.
+ * - The resize operation only allows expanding the hash table.
+ * It is triggered either through an API call or automatically by
+ * detecting long chains in the add operation.
+ * - A resize operation initiated by long-chain detection is executed
+ * by a call_rcu worker thread, which preserves the lock-freedom of
+ * add and remove.
+ * - Resize operations are serialized by a mutex.
+ * - The removal operation is split in two parts: first, a "removed"
+ * flag is set in the next pointer of the node to remove. Then, a
+ * "garbage collection" is performed in the bucket containing the
+ * removed node (from the start of the bucket up to the removed node).
+ * All encountered nodes with the "removed" flag set in their next
+ * pointers are unlinked from the linked-list. If the cmpxchg used for
+ * the removal fails (due to a concurrent garbage collection or a
+ * concurrent add), we retry from the beginning of the bucket. This
+ * ensures that the node with the "removed" flag set is removed from
+ * the hash table (not visible to lookups anymore) before the RCU
+ * read-side critical section held across the removal ends. It further
+ * ensures that the node with the "removed" flag set is unlinked from
+ * the linked-list before its memory is reclaimed. Only the thread
+ * that successfully set the "removed" flag (with a cmpxchg) in a
+ * node's next pointer is considered to have succeeded in removing it
+ * (and thus owns the node to reclaim). Because we garbage-collect
+ * starting from an invariant node (the start-of-bucket dummy node) up
+ * to the "removed" node (or until we find a higher reverse-hash), we
+ * are sure that a successful traversal of the chain yields a chain
+ * that is present in the linked-list (the start node is never
+ * removed) and that no longer contains the "removed" node, even if
+ * concurrent delete/add operations are changing the structure of the
+ * list concurrently. See the flag-manipulation sketch following this
+ * comment.
+ * - The add operation performs garbage collection of buckets if it
+ * encounters nodes with the "removed" flag set in the bucket where it
+ * wants to add its new node. This ensures lock-freedom of the add
+ * operation by helping the remover unlink nodes from the list rather
+ * than waiting for it to do so.
+ * - An RCU "order table" indexed by log2(hash index) is copied and
+ * expanded by the resize operation. This order table allows finding
+ * the "dummy node" tables.
+ * - There is one dummy node table per hash index order. The table for
+ * order i holds the 2^(i-1) dummy nodes for hash indexes 2^(i-1) to
+ * 2^i - 1, which is half the hash table size reached at that order
+ * (order 0 holds the single dummy node for hash index 0).
+ * - call_rcu is used to garbage-collect the old order table.
+ * - The per-order dummy node tables contain a compact version of the
+ * hash table nodes. These tables are invariant after they are
+ * populated into the hash table.
+ */
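+
+/*
+ * To illustrate the split ordering, here is a minimal standalone
+ * sketch (not part of this file; bit_reverse_u8() is a hypothetical
+ * 8-bit stand-in for bit_reverse_ulong()). Sorting nodes by
+ * reversed-bits hash interleaves the buckets, so growing the table
+ * only inserts new dummy nodes between existing list nodes, without
+ * moving any node:
+ *
+ *	#include <stdio.h>
+ *
+ *	static unsigned char bit_reverse_u8(unsigned char v)
+ *	{
+ *		unsigned char r = 0;
+ *		int i;
+ *
+ *		for (i = 0; i < 8; i++)
+ *			r |= ((v >> i) & 1) << (7 - i);
+ *		return r;
+ *	}
+ *
+ *	int main(void)
+ *	{
+ *		unsigned int h;
+ *
+ *		// Prints 0, 128, 64, 192, 32, 160, 96, 224: hash indexes
+ *		// 0, 4, 2, 6, 1, 5, 3, 7 land in this list order, so
+ *		// bucket 5 (101b) splits out of bucket 1 (001b) in place
+ *		// when the table grows from 4 to 8 buckets.
+ *		for (h = 0; h < 8; h++)
+ *			printf("hash %u -> key %u\n", h, bit_reverse_u8(h));
+ *		return 0;
+ *	}
+ */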
+
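+/*
+ * About the "removed" flag mentioned above: it is stored in the
+ * low-order bit of a node's next pointer, which is always zero for
+ * pointer-aligned nodes. A minimal sketch of the flag manipulation
+ * (hypothetical helper names, mirroring the is_removed()/clear_flag()
+ * helpers used in this file):
+ *
+ *	#include <stdint.h>
+ *
+ *	struct node;
+ *
+ *	static inline struct node *flag_removed(struct node *p)
+ *	{
+ *		return (struct node *) ((uintptr_t) p | 1UL);
+ *	}
+ *
+ *	static inline struct node *clear_flag(struct node *p)
+ *	{
+ *		return (struct node *) ((uintptr_t) p & ~1UL);
+ *	}
+ *
+ *	static inline int is_removed(struct node *p)
+ *	{
+ *		return (int) ((uintptr_t) p & 1UL);
+ *	}
+ *
+ * With these, logical removal is a single
+ * uatomic_cmpxchg(&node->next, next, flag_removed(next)); the thread
+ * whose cmpxchg succeeds owns the node, and garbage collection later
+ * swings the predecessor's next pointer past the flagged node.
+ */
+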
#define _LGPL_SOURCE
#include <stdlib.h>
#include <errno.h>
#include <stdio.h>
#include <pthread.h>
-//#define DEBUG /* Test */
-
#ifdef DEBUG
-#define dbg_printf(args...) printf(args)
+#define dbg_printf(fmt, args...) printf("[debug rculfhash] " fmt, ## args)
#else
-#define dbg_printf(args...)
+#define dbg_printf(fmt, args...)
#endif
-#define CHAIN_LEN_TARGET 1
-#define CHAIN_LEN_RESIZE_THRESHOLD 2
+#define CHAIN_LEN_TARGET 4
+#define CHAIN_LEN_RESIZE_THRESHOLD 8
#ifndef max
#define max(a, b) ((a) > (b) ? (a) : (b))
ht_compare_fct compare_fct;
unsigned long hash_seed;
pthread_mutex_t resize_mutex; /* resize mutex: add/del mutex */
- unsigned int in_progress_resize;
+ unsigned int in_progress_resize, in_progress_destroy;
void (*ht_call_rcu)(struct rcu_head *head,
void (*func)(struct rcu_head *head));
};
uint32_t chain_len)
{
if (chain_len > 100)
- dbg_printf("rculfhash: WARNING: large chain length: %u.\n",
+ dbg_printf("WARNING: large chain length: %u.\n",
chain_len);
if (chain_len >= CHAIN_LEN_RESIZE_THRESHOLD)
ht_resize_lazy(ht, t,
- get_count_order_u32(chain_len - CHAIN_LEN_TARGET + 1));
+ get_count_order_u32(chain_len - (CHAIN_LEN_TARGET - 1)));
}
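
/*
 * Worked example for the resize trigger above, assuming
 * get_count_order_u32() returns ceil(log2(x)): with
 * CHAIN_LEN_TARGET = 4 and CHAIN_LEN_RESIZE_THRESHOLD = 8, a chain of
 * 8 nodes requests a growth of get_count_order_u32(8 - 3) = 3 orders
 * (an 8x larger table), and a chain of 16 nodes requests
 * get_count_order_u32(13) = 4 orders (a 16x larger table).
 */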
static
for (;;) {
if (unlikely(!clear_flag(iter)))
return;
- if (clear_flag(iter)->p.reverse_hash > node->p.reverse_hash)
+ if (likely(clear_flag(iter)->p.reverse_hash > node->p.reverse_hash))
return;
next = rcu_dereference(clear_flag(iter)->p.next);
- if (is_removed(next))
+ if (likely(is_removed(next)))
break;
iter_prev = clear_flag(iter);
iter = next;
for (;;) {
if (unlikely(!clear_flag(iter)))
goto insert;
- if (clear_flag(iter)->p.reverse_hash > node->p.reverse_hash)
+ if (likely(clear_flag(iter)->p.reverse_hash > node->p.reverse_hash))
goto insert;
next = rcu_dereference(clear_flag(iter)->p.next);
- if (is_removed(next))
+ if (unlikely(is_removed(next)))
goto gc_node;
if (unique
&& !is_dummy(next)
old = rcu_dereference(node->p.next);
do {
next = old;
- if (is_removed(next))
+ if (unlikely(is_removed(next)))
goto end;
assert(!is_dummy(next));
old = uatomic_cmpxchg(&node->p.next, next,
{
unsigned long i, end_order;
- dbg_printf("rculfhash: init table: first_order %lu end_order %lu\n",
+ dbg_printf("init table: first_order %lu end_order %lu\n",
first_order, first_order + len_order);
end_order = first_order + len_order;
t->size = !first_order ? 0 : (1UL << (first_order - 1));
unsigned long j, len;
len = !i ? 1 : 1UL << (i - 1);
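		/*
		 * Order 0 has a single dummy node (hash index 0); order i > 0
		 * holds the 2^(i-1) dummy nodes for hash indexes 2^(i-1) to
		 * 2^i - 1, matching the len computed above.
		 */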
- dbg_printf("rculfhash: init order %lu len: %lu\n", i, len);
+ dbg_printf("init order %lu len: %lu\n", i, len);
t->tbl[i] = calloc(len, sizeof(struct _rcu_ht_node));
for (j = 0; j < len; j++) {
- dbg_printf("rculfhash: init entry: i %lu j %lu hash %lu\n",
+ dbg_printf("init entry: i %lu j %lu hash %lu\n",
i, j, !i ? 0 : (1UL << (i - 1)) + j);
struct rcu_ht_node *new_node =
(struct rcu_ht_node *) &t->tbl[i][j];
new_node->p.reverse_hash =
bit_reverse_ulong(!i ? 0 : (1UL << (i - 1)) + j);
(void) _ht_add(ht, t, new_node, 0, 1);
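+			/* Stop populating the table early if it is being destroyed. */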
+ if (CMM_LOAD_SHARED(ht->in_progress_destroy))
+ break;
}
/* Update table size */
t->size = !i ? 1 : (1UL << i);
- dbg_printf("rculfhash: init new size: %lu\n", t->size);
+ dbg_printf("init new size: %lu\n", t->size);
+ if (CMM_LOAD_SHARED(ht->in_progress_destroy))
+ break;
}
t->resize_target = t->size;
t->resize_initiated = 0;
index = hash & (t->size - 1);
order = get_count_order_ulong(index + 1);
lookup = &t->tbl[order][index & ((1UL << (order - 1)) - 1)];
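	/*
	 * Worked example, assuming t->size = 8: hash index 5 yields
	 * order = get_count_order_ulong(6) = 3, so the dummy node is
	 * t->tbl[3][5 & 3] = t->tbl[3][1]: entry 1 of the 4-entry table
	 * for order 3, which covers hash indexes 4 to 7.
	 */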
- dbg_printf("rculfhash: lookup hash %lu index %lu order %lu aridx %lu\n",
+ dbg_printf("lookup hash %lu index %lu order %lu aridx %lu\n",
hash, index, order, index & ((1UL << (order - 1)) - 1));
node = (struct rcu_ht_node *) lookup;
for (;;) {
len = !order ? 1 : 1UL << (order - 1);
for (i = 0; i < len; i++) {
- dbg_printf("rculfhash: delete order %lu i %lu hash %lu\n",
+ dbg_printf("delete order %lu i %lu hash %lu\n",
order, i,
bit_reverse_ulong(t->tbl[order][i].reverse_hash));
assert(is_dummy(t->tbl[order][i].next));
int ret;
/* Wait for in-flight resize operations to complete */
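+	/* Tell ongoing lazy resize operations to stop early. */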
+ CMM_STORE_SHARED(ht->in_progress_destroy, 1);
while (uatomic_read(&ht->in_progress_resize))
poll(NULL, 0, 100); /* wait for 100ms */
ret = ht_delete_dummy(ht);
(nr_dummy)++;
node = clear_flag(next);
} while (node);
- dbg_printf("rculfhash: number of dummy nodes: %lu\n", nr_dummy);
+ dbg_printf("number of dummy nodes: %lu\n", nr_dummy);
}
static
if (old_size == new_size)
return;
new_order = get_count_order_ulong(new_size) + 1;
- dbg_printf("rculfhash: resize from %lu (order %lu) to %lu (order %lu) buckets\n",
- old_size, old_order, new_size, new_order);
+ dbg_printf("resize from %lu (order %lu) to %lu (order %lu) buckets\n",
+ old_size, old_order, new_size, new_order);
new_t = malloc(sizeof(struct rcu_table)
+ (new_order * sizeof(struct _rcu_ht_node *)));
assert(new_size > old_size);
struct rcu_table *t = rcu_dereference(ht->t);
unsigned long target_size;
+ if (growth < 0) {
+ /*
+	 * Silently refuse to shrink the hash table (shrinking is
+	 * not supported).
+ */
+ dbg_printf("shrinking hash table not supported.\n");
+ return;
+ }
+
target_size = resize_target_update(t, growth);
if (t->size < target_size) {
CMM_STORE_SHARED(t->resize_initiated, 1);