#include "config.h"
#include <urcu.h>
#include <urcu-call-rcu.h>
+#include <urcu-flavor.h>
#include <urcu/arch.h>
#include <urcu/uatomic.h>
#include <urcu/compiler.h>
#include <urcu/rculfhash.h>
+#include <rculfhash-internal.h>
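+/*
+ * rculfhash-internal.h is shared with the memory management plugin
+ * implementations; it is expected to provide the definitions dropped
+ * from this file below (dbg_printf(), poison_free(), MAX_TABLE_ORDER,
+ * struct cds_lfht, ...).
+ */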
#include <stdio.h>
#include <pthread.h>
-#ifdef DEBUG
-#define dbg_printf(fmt, args...) printf("[debug rculfhash] " fmt, ## args)
-#else
-#define dbg_printf(fmt, args...)
-#endif
-
/*
 * Split-counters lazily update the global counter every 1024
 * additions/removals. They automatically keep track of when a resize
 * is required.
#define MIN_TABLE_ORDER 0
#define MIN_TABLE_SIZE (1UL << MIN_TABLE_ORDER)
-#if (CAA_BITS_PER_LONG == 32)
-#define MAX_TABLE_ORDER 32
-#else
-#define MAX_TABLE_ORDER 64
-#endif
-
/*
* Minimum number of bucket nodes to touch per thread to parallelize grow/shrink.
*/
#define MIN_PARTITION_PER_THREAD_ORDER 12
#define MIN_PARTITION_PER_THREAD (1UL << MIN_PARTITION_PER_THREAD_ORDER)
-#ifndef min
-#define min(a, b) ((a) < (b) ? (a) : (b))
-#endif
-
-#ifndef max
-#define max(a, b) ((a) > (b) ? (a) : (b))
-#endif
-
/*
* The removed flag needs to be updated atomically with the pointer.
* It indicates that no node must attach to the node scheduled for
*/
#define REMOVED_FLAG (1UL << 0)
#define BUCKET_FLAG (1UL << 1)
-#define FLAGS_MASK ((1UL << 2) - 1)
+#define REMOVAL_OWNER_FLAG (1UL << 2)
+#define FLAGS_MASK ((1UL << 3) - 1)
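+/*
+ * The flags above live in the low-order bits of a node's ->next
+ * pointer and describe the node that carries them:
+ * - REMOVED_FLAG: the node is logically removed and awaits unlinking;
+ * - BUCKET_FLAG: the node is a bucket (dummy) node;
+ * - REMOVAL_OWNER_FLAG: set by the deleting thread that wins the
+ *   removal race; only that thread reports a successful delete.
+ * Packing 3 flag bits assumes struct cds_lfht_node is aligned on at
+ * least 8 bytes.
+ */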
/* Value of the end pointer. Should not interact with flags. */
#define END_VALUE NULL
unsigned long add, del;
} __attribute__((aligned(CAA_CACHE_LINE_SIZE)));
-/*
- * rcu_table: Contains the size and desired new size if a resize
- * operation is in progress, as well as the statically-sized array of
- * bucket table pointers.
- */
-struct rcu_table {
- unsigned long size; /* always a power of 2, shared (RCU) */
- unsigned long resize_target;
- int resize_initiated;
-
- /*
- * Contains the per order-index-level bucket node table. The size
- * of each bucket node table is half the number of hashes contained
- * in this order (except for order 0). The minimum allocation size
- * parameter allows combining the bucket node arrays of the lowermost
- * levels to improve cache locality for small index orders.
- */
- struct cds_lfht_node *tbl[MAX_TABLE_ORDER];
-};
-
-/*
- * cds_lfht: Top-level data structure representing a lock-free hash
- * table. Defined in the implementation file to make it be an opaque
- * cookie to users.
- */
-struct cds_lfht {
- struct rcu_table t;
- unsigned long min_alloc_order;
- unsigned long min_alloc_size;
- int flags;
- /*
- * We need to put the work threads offline (QSBR) when taking this
- * mutex, because we use synchronize_rcu within this mutex critical
- * section, which waits on read-side critical sections, and could
- * therefore cause grace-period deadlock if we hold off RCU G.P.
- * completion.
- */
- pthread_mutex_t resize_mutex; /* resize mutex: add/del mutex */
- unsigned int in_progress_resize, in_progress_destroy;
- void (*cds_lfht_call_rcu)(struct rcu_head *head,
- void (*func)(struct rcu_head *head));
- void (*cds_lfht_synchronize_rcu)(void);
- void (*cds_lfht_rcu_read_lock)(void);
- void (*cds_lfht_rcu_read_unlock)(void);
- void (*cds_lfht_rcu_thread_offline)(void);
- void (*cds_lfht_rcu_thread_online)(void);
- void (*cds_lfht_rcu_register_thread)(void);
- void (*cds_lfht_rcu_unregister_thread)(void);
- pthread_attr_t *resize_attr; /* Resize threads attributes */
- long count; /* global approximate item count */
- struct ht_items_count *split_count; /* split item count */
-};
-
/*
* rcu_resize_work: Contains arguments passed to RCU worker thread
* responsible for performing lazy resize.
unsigned long start, unsigned long len);
};
-static
-void _cds_lfht_add(struct cds_lfht *ht,
- cds_lfht_match_fct match,
- const void *key,
- unsigned long size,
- struct cds_lfht_node *node,
- struct cds_lfht_iter *unique_ret,
- int bucket);
-
/*
* Algorithm to reverse bits in a word by lookup table, extended to
* 64-bit words.
}
#endif
-unsigned int fls_ulong(unsigned long x)
+unsigned int cds_lfht_fls_ulong(unsigned long x)
{
#if (CAA_BITS_PER_LONG == 32)
return fls_u32(x);
* Return the minimum order for which x <= (1UL << order).
* Return -1 if x is 0.
*/
-int get_count_order_u32(uint32_t x)
+int cds_lfht_get_count_order_u32(uint32_t x)
{
if (!x)
return -1;
* Return the minimum order for which x <= (1UL << order).
* Return -1 if x is 0.
*/
-int get_count_order_ulong(unsigned long x)
+int cds_lfht_get_count_order_ulong(unsigned long x)
{
if (!x)
return -1;
- return fls_ulong(x - 1);
+ return cds_lfht_fls_ulong(x - 1);
}
-#ifdef POISON_FREE
-#define poison_free(ptr) \
- do { \
- if (ptr) { \
- memset(ptr, 0x42, sizeof(*(ptr))); \
- free(ptr); \
- } \
- } while (0)
-#else
-#define poison_free(ptr) free(ptr)
-#endif
-
static
void cds_lfht_resize_lazy_grow(struct cds_lfht *ht, unsigned long size, int growth);
* round up number of CPUs to next power of two, so we
* can use & for modulo.
*/
- maxcpus = 1UL << get_count_order_ulong(maxcpus);
+ maxcpus = 1UL << cds_lfht_get_count_order_ulong(maxcpus);
nr_cpus_mask = maxcpus - 1;
}
#else /* #if defined(HAVE_SYSCONF) */
{
unsigned long split_count;
int index;
+ long count;
if (caa_unlikely(!ht->split_count))
return;
index = ht_get_split_count_index(hash);
split_count = uatomic_add_return(&ht->split_count[index].add, 1);
- if (caa_unlikely(!(split_count & ((1UL << COUNT_COMMIT_ORDER) - 1)))) {
- long count;
-
- dbg_printf("add split count %lu\n", split_count);
- count = uatomic_add_return(&ht->count,
- 1UL << COUNT_COMMIT_ORDER);
- /* If power of 2 */
- if (!(count & (count - 1))) {
- if ((count >> CHAIN_LEN_RESIZE_THRESHOLD) < size)
- return;
- dbg_printf("add set global %ld\n", count);
- cds_lfht_resize_lazy_count(ht, size,
- count >> (CHAIN_LEN_TARGET - 1));
- }
- }
+ if (caa_likely(split_count & ((1UL << COUNT_COMMIT_ORDER) - 1)))
+ return;
+	/* Only if the number of adds is a multiple of 1UL << COUNT_COMMIT_ORDER */
+
+ dbg_printf("add split count %lu\n", split_count);
+ count = uatomic_add_return(&ht->count,
+ 1UL << COUNT_COMMIT_ORDER);
+ if (caa_likely(count & (count - 1)))
+ return;
+	/* Only if the global count is a power of 2 */
+
+ if ((count >> CHAIN_LEN_RESIZE_THRESHOLD) < size)
+ return;
+ dbg_printf("add set global %ld\n", count);
+ cds_lfht_resize_lazy_count(ht, size,
+ count >> (CHAIN_LEN_TARGET - 1));
}
static
{
unsigned long split_count;
int index;
+ long count;
if (caa_unlikely(!ht->split_count))
return;
index = ht_get_split_count_index(hash);
split_count = uatomic_add_return(&ht->split_count[index].del, 1);
- if (caa_unlikely(!(split_count & ((1UL << COUNT_COMMIT_ORDER) - 1)))) {
- long count;
-
- dbg_printf("del split count %lu\n", split_count);
- count = uatomic_add_return(&ht->count,
- -(1UL << COUNT_COMMIT_ORDER));
- /* If power of 2 */
- if (!(count & (count - 1))) {
- if ((count >> CHAIN_LEN_RESIZE_THRESHOLD) >= size)
- return;
- dbg_printf("del set global %ld\n", count);
- /*
- * Don't shrink table if the number of nodes is below a
- * certain threshold.
- */
- if (count < (1UL << COUNT_COMMIT_ORDER) * (split_count_mask + 1))
- return;
- cds_lfht_resize_lazy_count(ht, size,
- count >> (CHAIN_LEN_TARGET - 1));
- }
- }
+ if (caa_likely(split_count & ((1UL << COUNT_COMMIT_ORDER) - 1)))
+ return;
+	/* Only if the number of deletes is a multiple of 1UL << COUNT_COMMIT_ORDER */
+
+ dbg_printf("del split count %lu\n", split_count);
+ count = uatomic_add_return(&ht->count,
+ -(1UL << COUNT_COMMIT_ORDER));
+ if (caa_likely(count & (count - 1)))
+ return;
+	/* Only if the global count is a power of 2 */
+
+ if ((count >> CHAIN_LEN_RESIZE_THRESHOLD) >= size)
+ return;
+ dbg_printf("del set global %ld\n", count);
+ /*
+ * Don't shrink table if the number of nodes is below a
+ * certain threshold.
+ */
+ if (count < (1UL << COUNT_COMMIT_ORDER) * (split_count_mask + 1))
+ return;
+ cds_lfht_resize_lazy_count(ht, size,
+ count >> (CHAIN_LEN_TARGET - 1));
}
static
chain_len);
if (chain_len >= CHAIN_LEN_RESIZE_THRESHOLD)
cds_lfht_resize_lazy_grow(ht, size,
- get_count_order_u32(chain_len - (CHAIN_LEN_TARGET - 1)));
+ cds_lfht_get_count_order_u32(chain_len - (CHAIN_LEN_TARGET - 1)));
}
static
return (struct cds_lfht_node *) (((unsigned long) node) | BUCKET_FLAG);
}
+static
+int is_removal_owner(struct cds_lfht_node *node)
+{
+ return ((unsigned long) node) & REMOVAL_OWNER_FLAG;
+}
+
+static
+struct cds_lfht_node *flag_removal_owner(struct cds_lfht_node *node)
+{
+ return (struct cds_lfht_node *) (((unsigned long) node) | REMOVAL_OWNER_FLAG);
+}
+
static
struct cds_lfht_node *get_end(void)
{
static
void cds_lfht_alloc_bucket_table(struct cds_lfht *ht, unsigned long order)
{
- if (order == 0) {
- ht->t.tbl[0] = calloc(ht->min_alloc_size,
- sizeof(struct cds_lfht_node));
- assert(ht->t.tbl[0]);
- } else if (order > ht->min_alloc_order) {
- ht->t.tbl[order] = calloc(1UL << (order -1),
- sizeof(struct cds_lfht_node));
- assert(ht->t.tbl[order]);
- }
- /* Nothing to do for 0 < order && order <= ht->min_alloc_order */
+ return ht->mm->alloc_bucket_table(ht, order);
}
/*
static
void cds_lfht_free_bucket_table(struct cds_lfht *ht, unsigned long order)
{
- if (order == 0)
- poison_free(ht->t.tbl[0]);
- else if (order > ht->min_alloc_order)
- poison_free(ht->t.tbl[order]);
- /* Nothing to do for 0 < order && order <= ht->min_alloc_order */
+ return ht->mm->free_bucket_table(ht, order);
}
static inline
struct cds_lfht_node *bucket_at(struct cds_lfht *ht, unsigned long index)
{
- unsigned long order;
-
- if ((__builtin_constant_p(index) && index == 0)
- || index < ht->min_alloc_size) {
- dbg_printf("bucket index %lu order 0 aridx 0\n", index);
- return &ht->t.tbl[0][index];
- }
- /*
- * equivalent to get_count_order_ulong(index + 1), but optimizes
- * away the non-existing 0 special-case for
- * get_count_order_ulong.
- */
- order = fls_ulong(index);
- dbg_printf("bucket index %lu order %lu aridx %lu\n",
- index, order, index & ((1UL << (order - 1)) - 1));
- return &ht->t.tbl[order][index & ((1UL << (order - 1)) - 1)];
+ return ht->bucket_at(ht, index);
}
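+/*
+ * Bucket table memory is now handled through ht->mm, a memory
+ * management plugin (e.g. cds_lfht_mm_order or cds_lfht_mm_mmap).
+ * Sketch of the operations this file relies on (see
+ * rculfhash-internal.h for the authoritative struct cds_lfht_mm_type):
+ * - alloc_cds_lfht(min_nr_alloc_buckets, max_nr_buckets): allocate and
+ *   pre-initialize the struct cds_lfht;
+ * - alloc_bucket_table(ht, order) / free_bucket_table(ht, order):
+ *   allocate/free the bucket nodes of the given order;
+ * - bucket_at(ht, index): return the bucket node at the given index.
+ */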
static inline
new_next = clear_flag(next);
(void) uatomic_cmpxchg(&iter_prev->next, iter, new_next);
}
- return;
}
static
*/
return -ENOENT;
}
- assert(!is_bucket(old_next));
- assert(new_node != clear_flag(old_next));
- new_node->next = clear_flag(old_next);
+ assert(old_next == clear_flag(old_next));
+ assert(new_node != old_next);
+ new_node->next = old_next;
/*
* Here is the whole trick for lock-free replace: we add
* the replacement node _after_ the node we want to
* next pointer, they will either skip the old node due
* to the removal flag and see the new node, or use
* the old node, but will not see the new one.
+ * This is a replacement of a node with another node
+ * that has the same value: we are therefore not
+ * removing a value from the hash table.
*/
ret_next = uatomic_cmpxchg(&old_node->next,
old_next, flag_removed(new_node));
static
int _cds_lfht_del(struct cds_lfht *ht, unsigned long size,
- struct cds_lfht_node *node,
- int bucket_removal)
+ struct cds_lfht_node *node)
{
- struct cds_lfht_node *bucket, *next, *old;
+ struct cds_lfht_node *bucket, *next;
if (!node) /* Return -ENOENT if asked to delete NULL node */
return -ENOENT;
/* logically delete the node */
assert(!is_bucket(node));
assert(!is_removed(node));
- old = rcu_dereference(node->next);
- do {
- struct cds_lfht_node *new_next;
+ assert(!is_removal_owner(node));
- next = old;
- if (caa_unlikely(is_removed(next)))
- return -ENOENT;
- if (bucket_removal)
- assert(is_bucket(next));
- else
- assert(!is_bucket(next));
- new_next = flag_removed(next);
- old = uatomic_cmpxchg(&node->next, next, new_next);
- } while (old != next);
+ /*
+ * We are first checking if the node had previously been
+ * logically removed (this check is not atomic with setting the
+ * logical removal flag). Return -ENOENT if the node had
+ * previously been removed.
+ */
+ next = rcu_dereference(node->next);
+ if (caa_unlikely(is_removed(next)))
+ return -ENOENT;
+ assert(!is_bucket(next));
+ /*
+ * We set the REMOVED_FLAG unconditionally. Note that there may
+ * be more than one concurrent thread setting this flag.
+	 * Which thread wins the race will only be known after the
+	 * garbage collection phase, stay tuned!
+ */
+ uatomic_or(&node->next, REMOVED_FLAG);
/* We performed the (logical) deletion. */
/*
_cds_lfht_gc_bucket(bucket, node);
assert(is_removed(rcu_dereference(node->next)));
- return 0;
+ /*
+ * Last phase: atomically exchange node->next with a version
+ * having "REMOVAL_OWNER_FLAG" set. If the returned node->next
+ * pointer did _not_ have "REMOVAL_OWNER_FLAG" set, we now own
+ * the node and win the removal race.
+	 * It is interesting to note that all "add" paths are forbidden
+	 * to change the next pointer starting from the point where the
+	 * REMOVED_FLAG is set, so here a plain read followed by an
+	 * xchg() suffices to guarantee that the xchg() will only ever
+	 * set the "REMOVAL_OWNER_FLAG" (or change nothing if the flag
+	 * was already set).
+ */
+ if (!is_removal_owner(uatomic_xchg(&node->next,
+ flag_removal_owner(node->next))))
+ return 0;
+ else
+ return -ENOENT;
}
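+/*
+ * Descriptive note on the removal protocol implemented above:
+ * (1) uatomic_or() sets REMOVED_FLAG in node->next (logical removal),
+ * (2) _cds_lfht_gc_bucket() unlinks the node from its chain,
+ * (3) the xchg() setting REMOVAL_OWNER_FLAG elects a single winner
+ *     among concurrent deleters, so exactly one cds_lfht_del() caller
+ *     gets 0 (and may free the node after a grace period); the others
+ *     get -ENOENT.
+ */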
static
{
struct partition_resize_work *work = arg;
- work->ht->cds_lfht_rcu_register_thread();
+ work->ht->flavor->register_thread();
work->fct(work->ht, work->i, work->start, work->len);
- work->ht->cds_lfht_rcu_unregister_thread();
+ work->ht->flavor->unregister_thread();
return NULL;
}
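+/*
+ * All RCU operations now go through ht->flavor, a struct
+ * rcu_flavor_struct (urcu-flavor.h) describing the urcu flavor the
+ * table was created with. The members used in this file are
+ * register_thread(), unregister_thread(), read_lock(), read_unlock(),
+ * thread_online(), thread_offline(), update_synchronize_rcu() and
+ * update_call_rcu(); the flavor may provide more.
+ */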
} else {
nr_threads = 1;
}
- partition_len = len >> get_count_order_ulong(nr_threads);
+ partition_len = len >> cds_lfht_get_count_order_ulong(nr_threads);
work = calloc(nr_threads, sizeof(*work));
assert(work);
for (thread = 0; thread < nr_threads; thread++) {
unsigned long j, size = 1UL << (i - 1);
assert(i > MIN_TABLE_ORDER);
- ht->cds_lfht_rcu_read_lock();
+ ht->flavor->read_lock();
for (j = size + start; j < size + start + len; j++) {
struct cds_lfht_node *new_node = bucket_at(ht, j);
new_node->reverse_hash = bit_reverse_ulong(j);
_cds_lfht_add(ht, NULL, NULL, size, new_node, NULL, 1);
}
- ht->cds_lfht_rcu_read_unlock();
+ ht->flavor->read_unlock();
}
static
{
assert(nr_cpus_mask != -1);
if (nr_cpus_mask < 0 || len < 2 * MIN_PARTITION_PER_THREAD) {
- ht->cds_lfht_rcu_thread_online();
+ ht->flavor->thread_online();
init_table_populate_partition(ht, i, 0, len);
- ht->cds_lfht_rcu_thread_offline();
+ ht->flavor->thread_offline();
return;
}
partition_resize_helper(ht, i, len, init_table_populate_partition);
dbg_printf("init order %lu len: %lu\n", i, len);
/* Stop expand if the resize target changes under us */
- if (CMM_LOAD_SHARED(ht->t.resize_target) < (1UL << i))
+ if (CMM_LOAD_SHARED(ht->resize_target) < (1UL << i))
break;
cds_lfht_alloc_bucket_table(ht, i);
* Update table size.
*/
cmm_smp_wmb(); /* populate data before RCU size */
- CMM_STORE_SHARED(ht->t.size, 1UL << i);
+ CMM_STORE_SHARED(ht->size, 1UL << i);
dbg_printf("init new size: %lu\n", 1UL << i);
if (CMM_LOAD_SHARED(ht->in_progress_destroy))
unsigned long j, size = 1UL << (i - 1);
assert(i > MIN_TABLE_ORDER);
- ht->cds_lfht_rcu_read_lock();
+ ht->flavor->read_lock();
for (j = size + start; j < size + start + len; j++) {
- struct cds_lfht_node *fini_node = bucket_at(ht, j);
+ struct cds_lfht_node *fini_bucket = bucket_at(ht, j);
+ struct cds_lfht_node *parent_bucket = bucket_at(ht, j - size);
assert(j >= size && j < (size << 1));
dbg_printf("remove entry: order %lu index %lu hash %lu\n",
i, j, j);
- fini_node->reverse_hash = bit_reverse_ulong(j);
- (void) _cds_lfht_del(ht, size, fini_node, 1);
+		/* Set the REMOVED_FLAG to freeze the ->next for gc;
+		 * bucket nodes no longer go through _cds_lfht_del(). */
+ uatomic_or(&fini_bucket->next, REMOVED_FLAG);
+ _cds_lfht_gc_bucket(parent_bucket, fini_bucket);
}
- ht->cds_lfht_rcu_read_unlock();
+ ht->flavor->read_unlock();
}
static
assert(nr_cpus_mask != -1);
if (nr_cpus_mask < 0 || len < 2 * MIN_PARTITION_PER_THREAD) {
- ht->cds_lfht_rcu_thread_online();
+ ht->flavor->thread_online();
remove_table_partition(ht, i, 0, len);
- ht->cds_lfht_rcu_thread_offline();
+ ht->flavor->thread_offline();
return;
}
partition_resize_helper(ht, i, len, remove_table_partition);
}
+/*
+ * fini_table() is never called for first_order == 0, which is why
+ * free_by_rcu_order == 0 can be used as a criterion to know whether
+ * the bucket table free must be performed.
+ */
static
void fini_table(struct cds_lfht *ht,
unsigned long first_order, unsigned long last_order)
dbg_printf("fini order %lu len: %lu\n", i, len);
/* Stop shrink if the resize target changes under us */
- if (CMM_LOAD_SHARED(ht->t.resize_target) > (1UL << (i - 1)))
+ if (CMM_LOAD_SHARED(ht->resize_target) > (1UL << (i - 1)))
break;
cmm_smp_wmb(); /* populate data before RCU size */
- CMM_STORE_SHARED(ht->t.size, 1UL << (i - 1));
+ CMM_STORE_SHARED(ht->size, 1UL << (i - 1));
/*
* We need to wait for all add operations to reach Q.S. (and
* releasing the old bucket nodes. Otherwise their lookup will
* return a logically removed node as insert position.
*/
- ht->cds_lfht_synchronize_rcu();
+ ht->flavor->update_synchronize_rcu();
if (free_by_rcu_order)
cds_lfht_free_bucket_table(ht, free_by_rcu_order);
}
if (free_by_rcu_order) {
- ht->cds_lfht_synchronize_rcu();
+ ht->flavor->update_synchronize_rcu();
cds_lfht_free_bucket_table(ht, free_by_rcu_order);
}
}
node->next = flag_bucket(get_end());
node->reverse_hash = 0;
- for (order = 1; order < get_count_order_ulong(size) + 1; order++) {
+ for (order = 1; order < cds_lfht_get_count_order_ulong(size) + 1; order++) {
len = 1UL << (order - 1);
cds_lfht_alloc_bucket_table(ht, order);
}
struct cds_lfht *_cds_lfht_new(unsigned long init_size,
- unsigned long min_alloc_size,
+ unsigned long min_nr_alloc_buckets,
+ unsigned long max_nr_buckets,
int flags,
- void (*cds_lfht_call_rcu)(struct rcu_head *head,
- void (*func)(struct rcu_head *head)),
- void (*cds_lfht_synchronize_rcu)(void),
- void (*cds_lfht_rcu_read_lock)(void),
- void (*cds_lfht_rcu_read_unlock)(void),
- void (*cds_lfht_rcu_thread_offline)(void),
- void (*cds_lfht_rcu_thread_online)(void),
- void (*cds_lfht_rcu_register_thread)(void),
- void (*cds_lfht_rcu_unregister_thread)(void),
+ const struct cds_lfht_mm_type *mm,
+ const struct rcu_flavor_struct *flavor,
pthread_attr_t *attr)
{
struct cds_lfht *ht;
unsigned long order;
- /* min_alloc_size must be power of two */
- if (!min_alloc_size || (min_alloc_size & (min_alloc_size - 1)))
+ /* min_nr_alloc_buckets must be power of two */
+ if (!min_nr_alloc_buckets || (min_nr_alloc_buckets & (min_nr_alloc_buckets - 1)))
return NULL;
+
/* init_size must be power of two */
if (!init_size || (init_size & (init_size - 1)))
return NULL;
- min_alloc_size = max(min_alloc_size, MIN_TABLE_SIZE);
+
+ /*
+ * Memory management plugin default.
+ */
+ if (!mm) {
+ if (CAA_BITS_PER_LONG > 32
+ && max_nr_buckets
+ && max_nr_buckets <= (1ULL << 32)) {
+ /*
+			 * For 64-bit architectures, when the maximum
+			 * number of buckets is small enough not to use
+			 * the entire 64-bit memory mapping space (while
+			 * still allowing a fair number of hash table
+			 * instances), use the mmap allocator, which is
+			 * faster than the order allocator.
+ */
+ mm = &cds_lfht_mm_mmap;
+ } else {
+ /*
+ * The fallback is to use the order allocator.
+ */
+ mm = &cds_lfht_mm_order;
+ }
+ }
+
+	/* max_nr_buckets == 0 with the order-based mm means "unbounded" (use the maximum table order) */
+ if (mm == &cds_lfht_mm_order && !max_nr_buckets)
+ max_nr_buckets = 1UL << (MAX_TABLE_ORDER - 1);
+
+ /* max_nr_buckets must be power of two */
+ if (!max_nr_buckets || (max_nr_buckets & (max_nr_buckets - 1)))
+ return NULL;
+
+ min_nr_alloc_buckets = max(min_nr_alloc_buckets, MIN_TABLE_SIZE);
init_size = max(init_size, MIN_TABLE_SIZE);
- ht = calloc(1, sizeof(struct cds_lfht));
+ max_nr_buckets = max(max_nr_buckets, min_nr_alloc_buckets);
+ init_size = min(init_size, max_nr_buckets);
+
+ ht = mm->alloc_cds_lfht(min_nr_alloc_buckets, max_nr_buckets);
assert(ht);
+ assert(ht->mm == mm);
+ assert(ht->bucket_at == mm->bucket_at);
+
ht->flags = flags;
- ht->cds_lfht_call_rcu = cds_lfht_call_rcu;
- ht->cds_lfht_synchronize_rcu = cds_lfht_synchronize_rcu;
- ht->cds_lfht_rcu_read_lock = cds_lfht_rcu_read_lock;
- ht->cds_lfht_rcu_read_unlock = cds_lfht_rcu_read_unlock;
- ht->cds_lfht_rcu_thread_offline = cds_lfht_rcu_thread_offline;
- ht->cds_lfht_rcu_thread_online = cds_lfht_rcu_thread_online;
- ht->cds_lfht_rcu_register_thread = cds_lfht_rcu_register_thread;
- ht->cds_lfht_rcu_unregister_thread = cds_lfht_rcu_unregister_thread;
+ ht->flavor = flavor;
ht->resize_attr = attr;
alloc_split_items_count(ht);
/* this mutex should not nest in read-side C.S. */
pthread_mutex_init(&ht->resize_mutex, NULL);
- order = get_count_order_ulong(init_size);
- ht->t.resize_target = 1UL << order;
- ht->min_alloc_size = min_alloc_size;
- ht->min_alloc_order = get_count_order_ulong(min_alloc_size);
+ order = cds_lfht_get_count_order_ulong(init_size);
+ ht->resize_target = 1UL << order;
cds_lfht_create_bucket(ht, 1UL << order);
- ht->t.size = 1UL << order;
+ ht->size = 1UL << order;
return ht;
}
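+/*
+ * Caller-side sketch (illustration only, not part of this patch): the
+ * flavor-specific public wrapper passes mm = NULL, so the plugin is
+ * selected automatically as above, e.g.:
+ *
+ *	ht = cds_lfht_new(1, 1, 0,
+ *		CDS_LFHT_AUTO_RESIZE | CDS_LFHT_ACCOUNTING, NULL);
+ */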
reverse_hash = bit_reverse_ulong(hash);
- size = rcu_dereference(ht->t.size);
+ size = rcu_dereference(ht->size);
bucket = lookup_bucket(ht, size, hash);
/* We can always skip the bucket node initially */
node = rcu_dereference(bucket->next);
unsigned long size;
node->reverse_hash = bit_reverse_ulong((unsigned long) hash);
- size = rcu_dereference(ht->t.size);
+ size = rcu_dereference(ht->size);
_cds_lfht_add(ht, NULL, NULL, size, node, NULL, 0);
ht_count_add(ht, size, hash);
}
struct cds_lfht_iter iter;
node->reverse_hash = bit_reverse_ulong((unsigned long) hash);
- size = rcu_dereference(ht->t.size);
+ size = rcu_dereference(ht->size);
_cds_lfht_add(ht, match, key, size, node, &iter, 0);
if (iter.node == node)
ht_count_add(ht, size, hash);
struct cds_lfht_iter iter;
node->reverse_hash = bit_reverse_ulong((unsigned long) hash);
- size = rcu_dereference(ht->t.size);
+ size = rcu_dereference(ht->size);
for (;;) {
_cds_lfht_add(ht, match, key, size, node, &iter, 0);
if (iter.node == node) {
}
}
-int cds_lfht_replace(struct cds_lfht *ht, struct cds_lfht_iter *old_iter,
+int cds_lfht_replace(struct cds_lfht *ht,
+ struct cds_lfht_iter *old_iter,
+ unsigned long hash,
+ cds_lfht_match_fct match,
+ const void *key,
struct cds_lfht_node *new_node)
{
unsigned long size;
- size = rcu_dereference(ht->t.size);
+ new_node->reverse_hash = bit_reverse_ulong((unsigned long) hash);
+ if (!old_iter->node)
+ return -ENOENT;
+ if (caa_unlikely(old_iter->node->reverse_hash != new_node->reverse_hash))
+ return -EINVAL;
+ if (caa_unlikely(!match(old_iter->node, key)))
+ return -EINVAL;
+ size = rcu_dereference(ht->size);
return _cds_lfht_replace(ht, size, old_iter->node, old_iter->next,
new_node);
}
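+/*
+ * Caller-side sketch (illustration only): the added hash/match/key
+ * arguments let cds_lfht_replace() check that old_iter still points
+ * at a node with the expected key before attempting the replacement:
+ *
+ *	cds_lfht_lookup(ht, hash, match, key, &iter);
+ *	ret = cds_lfht_replace(ht, &iter, hash, match, key, &new_node);
+ */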
-int cds_lfht_del(struct cds_lfht *ht, struct cds_lfht_iter *iter)
+int cds_lfht_del(struct cds_lfht *ht, struct cds_lfht_node *node)
{
unsigned long size, hash;
int ret;
- size = rcu_dereference(ht->t.size);
- ret = _cds_lfht_del(ht, size, iter->node, 0);
+ size = rcu_dereference(ht->size);
+ ret = _cds_lfht_del(ht, size, node);
if (!ret) {
- hash = bit_reverse_ulong(iter->node->reverse_hash);
+ hash = bit_reverse_ulong(node->reverse_hash);
ht_count_del(ht, size, hash);
}
return ret;
* size accessed without rcu_dereference because hash table is
* being destroyed.
*/
- size = ht->t.size;
+ size = ht->size;
/* Internal sanity check: all nodes left should be bucket */
for (i = 0; i < size; i++) {
node = bucket_at(ht, i);
assert(is_bucket(node->next));
}
- for (order = get_count_order_ulong(size); (long)order >= 0; order--)
+ for (order = cds_lfht_get_count_order_ulong(size); (long)order >= 0; order--)
cds_lfht_free_bucket_table(ht, order);
return 0;
void cds_lfht_count_nodes(struct cds_lfht *ht,
long *approx_before,
unsigned long *count,
- unsigned long *removed,
long *approx_after)
{
struct cds_lfht_node *node, *next;
- unsigned long nr_bucket = 0;
+ unsigned long nr_bucket = 0, nr_removed = 0;
*approx_before = 0;
if (ht->split_count) {
}
*count = 0;
- *removed = 0;
/* Count non-bucket nodes in the table */
node = bucket_at(ht, 0);
next = rcu_dereference(node->next);
if (is_removed(next)) {
if (!is_bucket(next))
- (*removed)++;
+ (nr_removed)++;
else
(nr_bucket)++;
} else if (!is_bucket(next))
(nr_bucket)++;
node = clear_flag(next);
} while (!is_end(node));
+ dbg_printf("number of logically removed nodes: %lu\n", nr_removed);
dbg_printf("number of bucket nodes: %lu\n", nr_bucket);
*approx_after = 0;
if (ht->split_count) {
{
unsigned long old_order, new_order;
- old_order = get_count_order_ulong(old_size);
- new_order = get_count_order_ulong(new_size);
+ old_order = cds_lfht_get_count_order_ulong(old_size);
+ new_order = cds_lfht_get_count_order_ulong(new_size);
dbg_printf("resize from %lu (order %lu) to %lu (order %lu) buckets\n",
old_size, old_order, new_size, new_order);
assert(new_size > old_size);
unsigned long old_order, new_order;
new_size = max(new_size, MIN_TABLE_SIZE);
- old_order = get_count_order_ulong(old_size);
- new_order = get_count_order_ulong(new_size);
+ old_order = cds_lfht_get_count_order_ulong(old_size);
+ new_order = cds_lfht_get_count_order_ulong(new_size);
dbg_printf("resize from %lu (order %lu) to %lu (order %lu) buckets\n",
old_size, old_order, new_size, new_order);
assert(new_size < old_size);
assert(uatomic_read(&ht->in_progress_resize));
if (CMM_LOAD_SHARED(ht->in_progress_destroy))
break;
- ht->t.resize_initiated = 1;
- old_size = ht->t.size;
- new_size = CMM_LOAD_SHARED(ht->t.resize_target);
+ ht->resize_initiated = 1;
+ old_size = ht->size;
+ new_size = CMM_LOAD_SHARED(ht->resize_target);
if (old_size < new_size)
_do_cds_lfht_grow(ht, old_size, new_size);
else if (old_size > new_size)
_do_cds_lfht_shrink(ht, old_size, new_size);
- ht->t.resize_initiated = 0;
+ ht->resize_initiated = 0;
/* write resize_initiated before read resize_target */
cmm_smp_mb();
- } while (ht->t.size != CMM_LOAD_SHARED(ht->t.resize_target));
+ } while (ht->size != CMM_LOAD_SHARED(ht->resize_target));
}
static
unsigned long resize_target_grow(struct cds_lfht *ht, unsigned long new_size)
{
- return _uatomic_xchg_monotonic_increase(&ht->t.resize_target, new_size);
+ return _uatomic_xchg_monotonic_increase(&ht->resize_target, new_size);
}
static
unsigned long count)
{
count = max(count, MIN_TABLE_SIZE);
- uatomic_set(&ht->t.resize_target, count);
+ count = min(count, ht->max_nr_buckets);
+ uatomic_set(&ht->resize_target, count);
}
void cds_lfht_resize(struct cds_lfht *ht, unsigned long new_size)
{
resize_target_update_count(ht, new_size);
- CMM_STORE_SHARED(ht->t.resize_initiated, 1);
- ht->cds_lfht_rcu_thread_offline();
+ CMM_STORE_SHARED(ht->resize_initiated, 1);
+ ht->flavor->thread_offline();
pthread_mutex_lock(&ht->resize_mutex);
_do_cds_lfht_resize(ht);
pthread_mutex_unlock(&ht->resize_mutex);
- ht->cds_lfht_rcu_thread_online();
+ ht->flavor->thread_online();
}
static
caa_container_of(head, struct rcu_resize_work, head);
struct cds_lfht *ht = work->ht;
- ht->cds_lfht_rcu_thread_offline();
+ ht->flavor->thread_offline();
pthread_mutex_lock(&ht->resize_mutex);
_do_cds_lfht_resize(ht);
pthread_mutex_unlock(&ht->resize_mutex);
- ht->cds_lfht_rcu_thread_online();
+ ht->flavor->thread_online();
poison_free(work);
cmm_smp_mb(); /* finish resize before decrement */
uatomic_dec(&ht->in_progress_resize);
/* Store resize_target before read resize_initiated */
cmm_smp_mb();
- if (!CMM_LOAD_SHARED(ht->t.resize_initiated)) {
+ if (!CMM_LOAD_SHARED(ht->resize_initiated)) {
uatomic_inc(&ht->in_progress_resize);
cmm_smp_mb(); /* increment resize count before load destroy */
if (CMM_LOAD_SHARED(ht->in_progress_destroy)) {
}
work = malloc(sizeof(*work));
work->ht = ht;
- ht->cds_lfht_call_rcu(&work->head, do_resize_cb);
- CMM_STORE_SHARED(ht->t.resize_initiated, 1);
+ ht->flavor->update_call_rcu(&work->head, do_resize_cb);
+ CMM_STORE_SHARED(ht->resize_initiated, 1);
}
}
{
unsigned long target_size = size << growth;
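+	/* Clamp so the lazy resize target never exceeds max_nr_buckets. */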
+ target_size = min(target_size, ht->max_nr_buckets);
if (resize_target_grow(ht, target_size) >= target_size)
return;
if (!(ht->flags & CDS_LFHT_AUTO_RESIZE))
return;
count = max(count, MIN_TABLE_SIZE);
+ count = min(count, ht->max_nr_buckets);
if (count == size)
return; /* Already the right size, no resize needed */
if (count > size) { /* lazy grow */
for (;;) {
unsigned long s;
- s = uatomic_cmpxchg(&ht->t.resize_target, size, count);
+ s = uatomic_cmpxchg(&ht->resize_target, size, count);
if (s == size)
break; /* no resize needed */
if (s > size)