rculfhash.c

   1 /*
   2  * rculfhash.c
   3  *
   4  * Userspace RCU library - Lock-Free Resizable RCU Hash Table
   5  *
   6  * Copyright 2010-2011 - Mathieu Desnoyers <mathieu.desnoyers@efficios.com>
   7  *
   8  * This library is free software; you can redistribute it and/or
   9  * modify it under the terms of the GNU Lesser General Public
  10  * License as published by the Free Software Foundation; either
  11  * version 2.1 of the License, or (at your option) any later version.
  12  *
  13  * This library is distributed in the hope that it will be useful,
  14  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  15  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  16  * Lesser General Public License for more details.
  17  *
  18  * You should have received a copy of the GNU Lesser General Public
  19  * License along with this library; if not, write to the Free Software
  20  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  21  */
  22
  23 /*
  24  * Based on the following articles:
  25  * - Ori Shalev and Nir Shavit. Split-ordered lists: Lock-free
  26  *   extensible hash tables. J. ACM 53, 3 (May 2006), 379-405.
  27  * - Michael, M. M. High performance dynamic lock-free hash tables
  28  *   and list-based sets. In Proceedings of the fourteenth annual ACM
  29  *   symposium on Parallel algorithms and architectures, ACM Press,
  30  *   (2002), 73-82.
  31  *
  32  * Some specificities of this Lock-Free Resizable RCU Hash Table
  33  * implementation:
  34  *
  35  * - RCU read-side critical section allows readers to perform hash
  36  *   table lookups and use the returned objects safely by delaying
  37  *   memory reclaim of a grace period.
  38  * - Add and remove operations are lock-free, and do not need to
  39  *   allocate memory. They need to be executed within RCU read-side
  40  *   critical section to ensure the objects they read are valid and to
  41  *   deal with the cmpxchg ABA problem.
  42  * - add and add_unique operations are supported. add_unique checks if
  43  *   the node key already exists in the hash table. It ensures no key
  44  *   duplicate exists.
  45  * - The resize operation executes concurrently with add/remove/lookup.
  46  * - Hash table nodes are contained within a split-ordered list. This
  47  *   list is ordered by incrementing reversed-bits-hash value.
  48  * - An index of dummy nodes is kept. These dummy nodes are the hash
  49  *   table "buckets", and they are also chained together in the
  50  *   split-ordered list, which allows recursive expansion.
  51  * - The resize operation for small tables only allows expanding the hash table.
  52  *   It is triggered automatically by detecting long chains in the add
  53  *   operation.
  54  * - The resize operation for larger tables (and available through an
  55  *   API) allows both expanding and shrinking the hash table.
  56  * - Per-CPU Split-counters are used to keep track of the number of
  57  *   nodes within the hash table for automatic resize triggering.
  58  * - Resize operation initiated by long chain detection is executed by a
  59  *   call_rcu thread, which keeps lock-freedom of add and remove.
  60  * - Resize operations are protected by a mutex.
  61  * - The removal operation is split in two parts: first, a "removed"
  62  *   flag is set in the next pointer within the node to remove. Then,
  63  *   a "garbage collection" is performed in the bucket containing the
  64  *   removed node (from the start of the bucket up to the removed node).
  65  *   All encountered nodes with "removed" flag set in their next
  66  *   pointers are removed from the linked-list. If the cmpxchg used for
  67  *   removal fails (due to concurrent garbage-collection or concurrent
  68  *   add), we retry from the beginning of the bucket. This ensures that
  69  *   the node with "removed" flag set is removed from the hash table
  70  *   (not visible to lookups anymore) before the RCU read-side critical
  71  *   section held across removal ends. Furthermore, this ensures that
  72  *   the node with "removed" flag set is removed from the linked-list
  73  *   before its memory is reclaimed. Only the thread which removal
  74  *   successfully set the "removed" flag (with a cmpxchg) into a node's
  75  *   next pointer is considered to have succeeded its removal (and thus
  76  *   owns the node to reclaim). Because we garbage-collect starting from
  77  *   an invariant node (the start-of-bucket dummy node) up to the
  78  *   "removed" node (or find a reverse-hash that is higher), we are sure
  79  *   that a successful traversal of the chain leads to a chain that is
  80  *   present in the linked-list (the start node is never removed) and
  81  *   that it does not contain the "removed" node anymore, even if
  82  *   concurrent delete/add operations are changing the structure of the
  83  *   list concurrently.
  84  * - The add operation performs garbage collection of buckets if it
  85  *   encounters nodes with removed flag set in the bucket where it wants
  86  *   to add its new node. This ensures lock-freedom of add operation by
  87  *   helping the remover unlink nodes from the list rather than to wait
  88  *   for it to do so.
  89  * - A RCU "order table" indexed by log2(hash index) is copied and
  90  *   expanded by the resize operation. This order table allows finding
  91  *   the "dummy node" tables.
  92  * - There is one dummy node table per hash index order. The size of
  93  *   each dummy node table is half the number of hashes contained in
  94  *   this order.
  95  * - call_rcu is used to garbage-collect the old order table.
  96  * - The per-order dummy node tables contain a compact version of the
  97  *   hash table nodes. These tables are invariant after they are
  98  *   populated into the hash table.
  99  *
 100  * A bit of ascii art explanation:
 101  *
 102  * The order index is off by one compared to the actual power of 2,
 103  * because index 0 is used to deal with the 0 special-case.
 104  *
 105  * This shows the nodes for a small table ordered by reversed bits:
 106  *
 107  *    bits   reverse
 108  * 0  000        000
 109  * 4  100        001
 110  * 2  010        010
 111  * 6  110        011
 112  * 1  001        100
 113  * 5  101        101
 114  * 3  011        110
 115  * 7  111        111
 116  *
 117  * This shows the nodes in order of non-reversed bits, linked by
 118  * reversed-bit order.
 119  *
 120  * order              bits       reverse
 121  * 0               0  000        000
 122  *                 |
 123  * 1               |  1  001        100       <-    <-
 124  *                 |  |                        |     |
 125  * 2               |  |  2  010        010     |     |
 126  *                 |  |  |  3  011        110  | <-  |
 127  *                 |  |  |  |                  |  |  |
 128  * 3               -> |  |  |  4  100        001  |  |
 129  *                    -> |  |     5  101        101  |
 130  *                       -> |        6  110        011
 131  *                          ->          7  111        111
 132  */
 133
 134 #define _LGPL_SOURCE
 135 #include <stdlib.h>
 136 #include <errno.h>
 137 #include <assert.h>
 138 #include <stdio.h>
 139 #include <stdint.h>
 140 #include <string.h>
 141
 142 #include "config.h"
 143 #include <urcu.h>
 144 #include <urcu-call-rcu.h>
 145 #include <urcu/arch.h>
 146 #include <urcu/uatomic.h>
 147 #include <urcu/jhash.h>
 148 #include <urcu/compiler.h>
 149 #include <urcu/rculfhash.h>
 150 #include <stdio.h>
 151 #include <pthread.h>
 152
 153 #ifdef DEBUG
 154 #define dbg_printf(fmt, args...)     printf("[debug rculfhash] " fmt, ## args)
 155 #else
 156 #define dbg_printf(fmt, args...)
 157 #endif
 158
 159 /*
 160  * Per-CPU split-counters lazily update the global counter every 1024
 161  * additions/removals (1 << COUNT_COMMIT_ORDER), which also keeps track
 162  * of whether a resize is required. The bucket chain length is used as the
 163  * expand indicator for small tables and machines lacking per-cpu data support.
 164  */
 165 #define COUNT_COMMIT_ORDER              10
 166 #define CHAIN_LEN_TARGET                1
 167 #define CHAIN_LEN_RESIZE_THRESHOLD      3
 168
 169 /*
 170  * Define the minimum table size.
 171  */
 172 #define MIN_TABLE_SIZE                  1
 173
     /*
      * Number of entries of the per-order table array (see struct rcu_table):
      * one per bit of an unsigned long.
      */
 174 #if (CAA_BITS_PER_LONG == 32)
 175 #define MAX_TABLE_ORDER                 32
 176 #else
 177 #define MAX_TABLE_ORDER                 64
 178 #endif
 179
 180 /*
 181  * Minimum number of dummy nodes to touch per thread to parallelize grow/shrink.
 182  */
 183 #define MIN_PARTITION_PER_THREAD_ORDER  12
 184 #define MIN_PARTITION_PER_THREAD        (1UL << MIN_PARTITION_PER_THREAD_ORDER)
 185
 186 #ifndef min
 187 #define min(a, b)       ((a) < (b) ? (a) : (b))
 188 #endif
 189
 190 #ifndef max
 191 #define max(a, b)       ((a) > (b) ? (a) : (b))
 192 #endif
 193
 194 /*
 195  * The removed flag needs to be updated atomically with the pointer.
 196  * It indicates that no node must attach to the node scheduled for
 197  * removal, and that node garbage collection must be performed.
 198  * The dummy flag does not require to be updated atomically with the
 199  * pointer, but it is added as a pointer low bit flag to save space.
 200  */
 201 #define REMOVED_FLAG            (1UL << 0)
 202 #define DUMMY_FLAG              (1UL << 1)
 203 #define FLAGS_MASK              ((1UL << 2) - 1)
 204
 205 /* Value of the end pointer. Should not interact with flags. */
 206 #define END_VALUE               NULL
 207
 208 struct ht_items_count {
 209         unsigned long add, del;
 210 } __attribute__((aligned(CAA_CACHE_LINE_SIZE)));
 211
 212 struct rcu_level {
 213         struct rcu_head head;
 214         struct _cds_lfht_node nodes[0];
 215 };
 216
 217 struct rcu_table {
     /*
      * Table descriptor: current size, resize request state, and one
      * array of dummy (bucket) nodes per order.
      */
 218         unsigned long size;     /* always a power of 2, shared (RCU) */
 219         unsigned long resize_target;    /* NOTE(review): presumably the size requested by a pending resize - confirm */
 220         int resize_initiated;   /* NOTE(review): presumably set while a lazy resize is queued - confirm */
 221         struct rcu_level *tbl[MAX_TABLE_ORDER]; /* per-order levels, indexed by order */
 222 };
 223
 224 struct cds_lfht {
 225         struct rcu_table t;
 226         cds_lfht_hash_fct hash_fct;
 227         cds_lfht_compare_fct compare_fct;
 228         unsigned long hash_seed;
 229         int flags;
 230         /*
 231          * We need to put the work threads offline (QSBR) when taking this
 232          * mutex, because we use synchronize_rcu within this mutex critical
 233          * section, which waits on read-side critical sections, and could
 234          * therefore cause grace-period deadlock if we hold off RCU G.P.
 235          * completion.
 236          */
 237         pthread_mutex_t resize_mutex;   /* resize mutex: add/del mutex */
 238         unsigned int in_progress_resize, in_progress_destroy;
 239         void (*cds_lfht_call_rcu)(struct rcu_head *head,
 240                       void (*func)(struct rcu_head *head));
 241         void (*cds_lfht_synchronize_rcu)(void);
 242         void (*cds_lfht_rcu_read_lock)(void);
 243         void (*cds_lfht_rcu_read_unlock)(void);
 244         void (*cds_lfht_rcu_thread_offline)(void);
 245         void (*cds_lfht_rcu_thread_online)(void);
 246         void (*cds_lfht_rcu_register_thread)(void);
 247         void (*cds_lfht_rcu_unregister_thread)(void);
 248         pthread_attr_t *resize_attr;    /* Resize threads attributes */
 249         long count;                     /* global approximate item count */
 250         struct ht_items_count *percpu_count;    /* per-cpu item count */
 251 };
 252
 253 struct rcu_resize_work {        /* resize work item carrying an RCU head and the target table */
 254         struct rcu_head head;   /* NOTE(review): presumably queued through call_rcu - confirm */
 255         struct cds_lfht *ht;    /* hash table the work applies to */
 256 };
 257
 258 struct partition_resize_work {
 259         struct rcu_head head;
 260         struct cds_lfht *ht;
 261         unsigned long i, start, len;
 262         void (*fct)(struct cds_lfht *ht, unsigned long i,
 263                     unsigned long start, unsigned long len);
 264 };
 265
 266 enum add_mode {
 267         ADD_DEFAULT = 0,
 268         ADD_UNIQUE = 1,
 269         ADD_REPLACE = 2,
 270 };
 271
 272 static  /* forward declaration; the definition comes later in this file */
 273 struct cds_lfht_node *_cds_lfht_add(struct cds_lfht *ht,
 274                                 unsigned long size,
 275                                 struct cds_lfht_node *node,
 276                                 enum add_mode mode, int dummy);
 277
 278 /*
 279  * Algorithm to reverse bits in a word by lookup table, extended to
 280  * 64-bit words.
 281  * Source:
 282  * http://graphics.stanford.edu/~seander/bithacks.html#BitReverseTable
 283  * Originally from Public Domain.
 284  */
 285
 286 static const uint8_t BitReverseTable256[256] =
 287 {
 288 #define R2(n) (n),   (n) + 2*64,     (n) + 1*64,     (n) + 3*64
 289 #define R4(n) R2(n), R2((n) + 2*16), R2((n) + 1*16), R2((n) + 3*16)
 290 #define R6(n) R4(n), R4((n) + 2*4 ), R4((n) + 1*4 ), R4((n) + 3*4 )
 291         R6(0), R6(2), R6(1), R6(3)
 292 };
 293 #undef R2
 294 #undef R4
 295 #undef R6
 296
 297 static
 298 uint8_t bit_reverse_u8(uint8_t v)
 299 {
 300         return BitReverseTable256[v];
 301 }
 302
 303 static __attribute__((unused))
 304 uint32_t bit_reverse_u32(uint32_t v)
 305 {
 306         return ((uint32_t) bit_reverse_u8(v) << 24) |
 307                 ((uint32_t) bit_reverse_u8(v >> 8) << 16) |
 308                 ((uint32_t) bit_reverse_u8(v >> 16) << 8) |
 309                 ((uint32_t) bit_reverse_u8(v >> 24));
 310 }
 311
 312 static __attribute__((unused))
 313 uint64_t bit_reverse_u64(uint64_t v)
 314 {
 315         return ((uint64_t) bit_reverse_u8(v) << 56) |
 316                 ((uint64_t) bit_reverse_u8(v >> 8)  << 48) |
 317                 ((uint64_t) bit_reverse_u8(v >> 16) << 40) |
 318                 ((uint64_t) bit_reverse_u8(v >> 24) << 32) |
 319                 ((uint64_t) bit_reverse_u8(v >> 32) << 24) |
 320                 ((uint64_t) bit_reverse_u8(v >> 40) << 16) |
 321                 ((uint64_t) bit_reverse_u8(v >> 48) << 8) |
 322                 ((uint64_t) bit_reverse_u8(v >> 56));
 323 }
 324
 325 static
 326 unsigned long bit_reverse_ulong(unsigned long v)
 327 {
 328 #if (CAA_BITS_PER_LONG == 32)
 329         return bit_reverse_u32(v);
 330 #else
 331         return bit_reverse_u64(v);
 332 #endif
 333 }
 334
 335 /*
 336  * fls: returns the position of the most significant bit.
 337  * Returns 0 if no bit is set, else returns the position of the most
 338  * significant bit (from 1 to 32 on 32-bit, from 1 to 64 on 64-bit).
 339  */
 340 #if defined(__i386) || defined(__x86_64)
 341 static inline
 342 unsigned int fls_u32(uint32_t x)
 343 {
 344         int r;
 345
 346         asm("bsrl %1,%0\n\t"
 347             "jnz 1f\n\t"
 348             "movl $-1,%0\n\t"
 349             "1:\n\t"
 350             : "=r" (r) : "rm" (x));
 351         return r + 1;
 352 }
 353 #define HAS_FLS_U32
 354 #endif
 355
 356 #if defined(__x86_64)
 357 static inline
 358 unsigned int fls_u64(uint64_t x)
 359 {
 360         long r;
 361
 362         asm("bsrq %1,%0\n\t"
 363             "jnz 1f\n\t"
 364             "movq $-1,%0\n\t"
 365             "1:\n\t"
 366             : "=r" (r) : "rm" (x));
 367         return r + 1;
 368 }
 369 #define HAS_FLS_U64
 370 #endif
 371
 372 #ifndef HAS_FLS_U64
 373 static __attribute__((unused))
 374 unsigned int fls_u64(uint64_t x)
 375 {
 376         unsigned int r = 64;
 377
 378         if (!x)
 379                 return 0;
 380
 381         if (!(x & 0xFFFFFFFF00000000ULL)) {
 382                 x <<= 32;
 383                 r -= 32;
 384         }
 385         if (!(x & 0xFFFF000000000000ULL)) {
 386                 x <<= 16;
 387                 r -= 16;
 388         }
 389         if (!(x & 0xFF00000000000000ULL)) {
 390                 x <<= 8;
 391                 r -= 8;
 392         }
 393         if (!(x & 0xF000000000000000ULL)) {
 394                 x <<= 4;
 395                 r -= 4;
 396         }
 397         if (!(x & 0xC000000000000000ULL)) {
 398                 x <<= 2;
 399                 r -= 2;
 400         }
 401         if (!(x & 0x8000000000000000ULL)) {
 402                 x <<= 1;
 403                 r -= 1;
 404         }
 405         return r;
 406 }
 407 #endif
 408
 409 #ifndef HAS_FLS_U32
 410 static __attribute__((unused))
 411 unsigned int fls_u32(uint32_t x)
 412 {
 413         unsigned int r = 32;
 414
 415         if (!x)
 416                 return 0;
 417         if (!(x & 0xFFFF0000U)) {
 418                 x <<= 16;
 419                 r -= 16;
 420         }
 421         if (!(x & 0xFF000000U)) {
 422                 x <<= 8;
 423                 r -= 8;
 424         }
 425         if (!(x & 0xF0000000U)) {
 426                 x <<= 4;
 427                 r -= 4;
 428         }
 429         if (!(x & 0xC0000000U)) {
 430                 x <<= 2;
 431                 r -= 2;
 432         }
 433         if (!(x & 0x80000000U)) {
 434                 x <<= 1;
 435                 r -= 1;
 436         }
 437         return r;
 438 }
 439 #endif
 440
 441 unsigned int fls_ulong(unsigned long x)
 442 {
 443 #if (CAA_BITS_PER_lONG == 32)
 444         return fls_u32(x);
 445 #else
 446         return fls_u64(x);
 447 #endif
 448 }
 449
 450 int get_count_order_u32(uint32_t x)
 451 {
 452         int order;
 453
 454         order = fls_u32(x) - 1;
 455         if (x & (x - 1))
 456                 order++;
 457         return order;
 458 }
 459
 460 int get_count_order_ulong(unsigned long x)
 461 {
 462         int order;
 463
 464         order = fls_ulong(x) - 1;
 465         if (x & (x - 1))
 466                 order++;
 467         return order;
 468 }
 469
 470 #ifdef POISON_FREE
 471 #define poison_free(ptr)                                \
 472         do {                                            \
 473                 memset(ptr, 0x42, sizeof(*(ptr)));      \
 474                 free(ptr);                              \
 475         } while (0)
 476 #else
 477 #define poison_free(ptr)        free(ptr)
 478 #endif
 479
 480 static  /* forward declaration; check_resize() calls it when a bucket chain gets too long */
 481 void cds_lfht_resize_lazy(struct cds_lfht *ht, unsigned long size, int growth);
 482
 483 /*
 484  * If the sched_getcpu() and sysconf(_SC_NPROCESSORS_CONF) calls are
 485  * available, then we support hash table item accounting.
 486  * In the unfortunate event the number of CPUs reported would be
 487  * inaccurate, we use modulo arithmetic on the number of CPUs we got.
 488  */
 489 #if defined(HAVE_SCHED_GETCPU) && defined(HAVE_SYSCONF)
 490
 491 static  /* forward declaration; called by ht_count_add() and ht_count_del() */
 492 void cds_lfht_resize_lazy_count(struct cds_lfht *ht, unsigned long size,
 493                                 unsigned long count);
 494
     /*
      * State of the per-CPU counter support, set up by
      * alloc_per_cpu_items_count():
      *   -1: not initialized yet;
      *   -2: sysconf() reported no usable CPU count, accounting disabled;
      *  >=0: (number of CPUs rounded up to a power of two) - 1, used as a
      *       mask on the CPU number.
      */
 495 static long nr_cpus_mask = -1;
 496
 497 static
 498 struct ht_items_count *alloc_per_cpu_items_count(void)
 499 {
 500         struct ht_items_count *count;
 501
 502         switch (nr_cpus_mask) {
 503         case -2:
 504                 return NULL;
 505         case -1:
 506         {
 507                 long maxcpus;
 508
 509                 maxcpus = sysconf(_SC_NPROCESSORS_CONF);
 510                 if (maxcpus <= 0) {
 511                         nr_cpus_mask = -2;
 512                         return NULL;
 513                 }
 514                 /*
 515                  * round up number of CPUs to next power of two, so we
 516                  * can use & for modulo.
 517                  */
 518                 maxcpus = 1UL << get_count_order_ulong(maxcpus);
 519                 nr_cpus_mask = maxcpus - 1;
 520         }
 521                 /* Fall-through */
 522         default:
 523                 return calloc(nr_cpus_mask + 1, sizeof(*count));
 524         }
 525 }
 526
 527 static
 528 void free_per_cpu_items_count(struct ht_items_count *count)
 529 {
             /*
              * Release the per-CPU counter array. When POISON_FREE is defined,
              * poison_free() overwrites sizeof(*count) bytes, i.e. only the
              * first element of the array, before freeing it.
              */
 530         poison_free(count);
 531 }
 532
 533 static
 534 int ht_get_cpu(void)
 535 {
 536         int cpu;
 537
 538         assert(nr_cpus_mask >= 0);
 539         cpu = sched_getcpu();
 540         if (unlikely(cpu < 0))
 541                 return cpu;
 542         else
 543                 return cpu & nr_cpus_mask;
 544 }
 545
 546 static
 547 void ht_count_add(struct cds_lfht *ht, unsigned long size)
 548 {
 549         unsigned long percpu_count;
 550         int cpu;
 551
 552         if (unlikely(!ht->percpu_count))
 553                 return;
 554         cpu = ht_get_cpu();
 555         if (unlikely(cpu < 0))
 556                 return;
 557         percpu_count = uatomic_add_return(&ht->percpu_count[cpu].add, 1);
 558         if (unlikely(!(percpu_count & ((1UL << COUNT_COMMIT_ORDER) - 1)))) {
 559                 long count;
 560
 561                 dbg_printf("add percpu %lu\n", percpu_count);
 562                 count = uatomic_add_return(&ht->count,
 563                                            1UL << COUNT_COMMIT_ORDER);
 564                 /* If power of 2 */
 565                 if (!(count & (count - 1))) {
 566                         if ((count >> CHAIN_LEN_RESIZE_THRESHOLD) < size)
 567                                 return;
 568                         dbg_printf("add set global %ld\n", count);
 569                         cds_lfht_resize_lazy_count(ht, size,
 570                                 count >> (CHAIN_LEN_TARGET - 1));
 571                 }
 572         }
 573 }
 574
 575 static
 576 void ht_count_del(struct cds_lfht *ht, unsigned long size)
 577 {
 578         unsigned long percpu_count;
 579         int cpu;
 580
 581         if (unlikely(!ht->percpu_count))
 582                 return;
 583         cpu = ht_get_cpu();
 584         if (unlikely(cpu < 0))
 585                 return;
 586         percpu_count = uatomic_add_return(&ht->percpu_count[cpu].del, 1);
 587         if (unlikely(!(percpu_count & ((1UL << COUNT_COMMIT_ORDER) - 1)))) {
 588                 long count;
 589
 590                 dbg_printf("del percpu %lu\n", percpu_count);
 591                 count = uatomic_add_return(&ht->count,
 592                                            -(1UL << COUNT_COMMIT_ORDER));
 593                 /* If power of 2 */
 594                 if (!(count & (count - 1))) {
 595                         if ((count >> CHAIN_LEN_RESIZE_THRESHOLD) >= size)
 596                                 return;
 597                         dbg_printf("del set global %ld\n", count);
 598                         /*
 599                          * Don't shrink table if the number of nodes is below a
 600                          * certain threshold.
 601                          */
 602                         if (count < (1UL << COUNT_COMMIT_ORDER) * (nr_cpus_mask + 1))
 603                                 return;
 604                         cds_lfht_resize_lazy_count(ht, size,
 605                                 count >> (CHAIN_LEN_TARGET - 1));
 606                 }
 607         }
 608 }
 609
 610 #else /* #if defined(HAVE_SCHED_GETCPU) && defined(HAVE_SYSCONF) */
 611
 612 static const long nr_cpus_mask = -1;    /* fallback build: per-CPU counters never initialized */
 613
     /* Fallback: no per-CPU counters are allocated, always returns NULL. */
 614 static
 615 struct ht_items_count *alloc_per_cpu_items_count(void)
 616 {
 617         return NULL;
 618 }
 619
     /* Fallback: nothing was allocated, so there is nothing to free. */
 620 static
 621 void free_per_cpu_items_count(struct ht_items_count *count)
 622 {
 623 }
 624
     /* Fallback: item additions are not accounted. */
 625 static
 626 void ht_count_add(struct cds_lfht *ht, unsigned long size)
 627 {
 628 }
 629
     /* Fallback: item removals are not accounted. */
 630 static
 631 void ht_count_del(struct cds_lfht *ht, unsigned long size)
 632 {
 633 }
 634
 635 #endif /* #else #if defined(HAVE_SCHED_GETCPU) && defined(HAVE_SYSCONF) */
 636
 637
 638 static
 639 void check_resize(struct cds_lfht *ht, unsigned long size, uint32_t chain_len)
 640 {
 641         unsigned long count;
 642
 643         if (!(ht->flags & CDS_LFHT_AUTO_RESIZE))
 644                 return;
 645         count = uatomic_read(&ht->count);
 646         /*
 647          * Use bucket-local length for small table expand and for
 648          * environments lacking per-cpu data support.
 649          */
 650         if (count >= (1UL << COUNT_COMMIT_ORDER))
 651                 return;
 652         if (chain_len > 100)
 653                 dbg_printf("WARNING: large chain length: %u.\n",
 654                            chain_len);
 655         if (chain_len >= CHAIN_LEN_RESIZE_THRESHOLD)
 656                 cds_lfht_resize_lazy(ht, size,
 657                         get_count_order_u32(chain_len - (CHAIN_LEN_TARGET - 1)));
 658 }
 659
 660 static
 661 struct cds_lfht_node *clear_flag(struct cds_lfht_node *node)
 662 {
 663         return (struct cds_lfht_node *) (((unsigned long) node) & ~FLAGS_MASK);
 664 }
 665
 666 static
 667 int is_removed(struct cds_lfht_node *node)
 668 {
 669         return ((unsigned long) node) & REMOVED_FLAG;
 670 }
 671
 672 static
 673 struct cds_lfht_node *flag_removed(struct cds_lfht_node *node)
 674 {
 675         return (struct cds_lfht_node *) (((unsigned long) node) | REMOVED_FLAG);
 676 }
 677
 678 static
 679 int is_dummy(struct cds_lfht_node *node)
 680 {
 681         return ((unsigned long) node) & DUMMY_FLAG;
 682 }
 683
 684 static
 685 struct cds_lfht_node *flag_dummy(struct cds_lfht_node *node)
 686 {
 687         return (struct cds_lfht_node *) (((unsigned long) node) | DUMMY_FLAG);
 688 }
 689
 690 static
 691 struct cds_lfht_node *get_end(void)
 692 {
 693         return (struct cds_lfht_node *) END_VALUE;
 694 }
 695
 696 static
 697 int is_end(struct cds_lfht_node *node)
 698 {
 699         return clear_flag(node) == (struct cds_lfht_node *) END_VALUE;
 700 }
 701
 702 static
 703 unsigned long _uatomic_max(unsigned long *ptr, unsigned long v)
 704 {
 705         unsigned long old1, old2;
 706
 707         old1 = uatomic_read(ptr);
 708         do {
 709                 old2 = old1;
 710                 if (old2 >= v)
 711                         return old2;
 712         } while ((old1 = uatomic_cmpxchg(ptr, old2, v)) != old2);
 713         return v;
 714 }
 715
 716 static
 717 void cds_lfht_free_level(struct rcu_head *head)
 718 {
 719         struct rcu_level *l =
 720                 caa_container_of(head, struct rcu_level, head);
 721         poison_free(l);
 722 }
 723
 724 /*
 725  * Remove all logically deleted nodes from a bucket up to a certain node key.
 726  */
 727 static
 728 void _cds_lfht_gc_bucket(struct cds_lfht_node *dummy, struct cds_lfht_node *node)
 729 {
 730         struct cds_lfht_node *iter_prev, *iter, *next, *new_next;
 731
 732         assert(!is_dummy(dummy));
 733         assert(!is_removed(dummy));
 734         assert(!is_dummy(node));
 735         assert(!is_removed(node));
 736         for (;;) {
 737                 iter_prev = dummy;
 738                 /* We can always skip the dummy node initially */
 739                 iter = rcu_dereference(iter_prev->p.next);
 740                 assert(iter_prev->p.reverse_hash <= node->p.reverse_hash);
 741                 /*
 742                  * We should never be called with dummy (start of chain)
 743                  * and logically removed node (end of path compression
 744                  * marker) being the actual same node. This would be a
 745                  * bug in the algorithm implementation.
 746                  */
 747                 assert(dummy != node);
 748                 for (;;) {
 749                         if (unlikely(is_end(iter)))
 750                                 return;
 751                         if (likely(clear_flag(iter)->p.reverse_hash > node->p.reverse_hash))
 752                                 return;
 753                         next = rcu_dereference(clear_flag(iter)->p.next);
 754                         if (likely(is_removed(next)))
 755                                 break;
 756                         iter_prev = clear_flag(iter);
 757                         iter = next;
 758                 }
 759                 assert(!is_removed(iter));
 760                 if (is_dummy(iter))
 761                         new_next = flag_dummy(clear_flag(next));
 762                 else
 763                         new_next = clear_flag(next);
 764                 if (is_removed(iter))
 765                         new_next = flag_removed(new_next);
 766                 (void) uatomic_cmpxchg(&iter_prev->p.next, iter, new_next);
 767         }
 768         return;
 769 }
 770
 771 static
 772 int _cds_lfht_replace(struct cds_lfht *ht, unsigned long size,
 773                 struct cds_lfht_node *old_node,
 774                 struct cds_lfht_node *ret_next,
 775                 struct cds_lfht_node *new_node)
 776 {
 777         struct cds_lfht_node *dummy, *old_next;
 778         struct _cds_lfht_node *lookup;
 779         int flagged = 0;
 780         unsigned long hash, index, order;
 781
 782         if (!old_node)  /* Return -ENOENT if asked to replace NULL node */
 783                 goto end;
 784
 785         assert(!is_removed(old_node));
 786         assert(!is_dummy(old_node));
 787         assert(!is_removed(new_node));
 788         assert(!is_dummy(new_node));
 789         assert(new_node != old_node);
 790         do {
 791                 /* Insert after node to be replaced */
 792                 old_next = ret_next;
 793                 if (is_removed(old_next)) {
 794                         /*
 795                          * Too late, the old node has been removed under us
 796                          * between lookup and replace. Fail.
 797                          */
 798                         goto end;
 799                 }
 800                 assert(!is_dummy(old_next));
 801                 assert(new_node != clear_flag(old_next));
 802                 new_node->p.next = clear_flag(old_next);
 803                 /*
 804                  * Here is the whole trick for lock-free replace: we add
 805                  * the replacement node _after_ the node we want to
 806                  * replace by atomically setting its next pointer at the
 807                  * same time we set its removal flag. Given that
 808                  * the lookups/get next use an iterator aware of the
 809                  * next pointer, they will either skip the old node due
 810                  * to the removal flag and see the new node, or use
 811                  * the old node, but will not see the new one.
 812                  */
 813                 ret_next = uatomic_cmpxchg(&old_node->p.next,
 814                               old_next, flag_removed(new_node));
 815         } while (ret_next != old_next);
 816
 817         /* We performed the replacement. */
 818         flagged = 1;
 819
 820         /*
 821          * Ensure that the old node is not visible to readers anymore:
 822          * lookup for the node, and remove it (along with any other
 823          * logically removed node) if found.
 824          */
 825         hash = bit_reverse_ulong(old_node->p.reverse_hash);
 826         assert(size > 0);
 827         index = hash & (size - 1);
 828         order = get_count_order_ulong(index + 1);
 829         lookup = &ht->t.tbl[order]->nodes[index & (!order ? 0 : ((1UL << (order - 1)) - 1))];
 830         dummy = (struct cds_lfht_node *) lookup;
 831         _cds_lfht_gc_bucket(dummy, new_node);
 832 end:
 833         /*
 834          * Only the flagging action indicated that we (and no other)
 835          * replaced the node from the hash table.
 836          */
 837         if (flagged) {
 838                 assert(is_removed(rcu_dereference(old_node->p.next)));
 839                 return 0;
 840         } else {
 841                 return -ENOENT;
 842         }
 843 }
 844
/*
 * Link "node" into the bucket chain selected by its reverse_hash.
 *
 * ht:    hash table to insert into.
 * size:  table size used to select the bucket (hash & (size - 1)). A
 *        size of 0 is only expected for a dummy node (asserted), which
 *        is then initialised as the list head without any traversal.
 * node:  node to insert; node->p.reverse_hash must already be set. The
 *        pointer must carry neither the dummy nor the removed flag
 *        (asserted).
 * mode:  ADD_DEFAULT always inserts. ADD_UNIQUE and ADD_REPLACE first
 *        look for a non-dummy node whose key matches according to
 *        ht->compare_fct (a return value of 0 is treated as a match).
 * dummy: non-zero when "node" is a dummy node; its next pointer is then
 *        stored with the dummy flag.
 *
 * Returns:
 * - ADD_DEFAULT: "node".
 * - ADD_UNIQUE: "node" if it was inserted, else the already-present
 *   matching node (nothing is inserted in that case).
 * - ADD_REPLACE: NULL if no matching node was found and "node" was
 *   inserted, else the node that was replaced by "node".
 *
 * The bucket traversal is restarted from the bucket's dummy node
 * whenever the insertion cmpxchg fails, a replace attempt fails, or a
 * logically removed node is encountered.
 */
static
struct cds_lfht_node *_cds_lfht_add(struct cds_lfht *ht,
                                unsigned long size,
                                struct cds_lfht_node *node,
                                enum add_mode mode, int dummy)
{
        struct cds_lfht_node *iter_prev, *iter, *next, *new_node, *new_next,
                        *dummy_node, *return_node;
        struct _cds_lfht_node *lookup;
        unsigned long hash, index, order;

        assert(!is_dummy(node));
        assert(!is_removed(node));
        if (!size) {
                assert(dummy);
                node->p.next = flag_dummy(get_end());
                return node;    /* Initial first add (head) */
        }
        hash = bit_reverse_ulong(node->p.reverse_hash);
        for (;;) {
                /* Restarted from zero on each traversal of the bucket. */
                uint32_t chain_len = 0;

                /*
                 * iter_prev points to the non-removed node prior to the
                 * insert location.
                 */
                index = hash & (size - 1);
                order = get_count_order_ulong(index + 1);
                lookup = &ht->t.tbl[order]->nodes[index & ((!order ? 0 : (1UL << (order - 1))) - 1)];
                iter_prev = (struct cds_lfht_node *) lookup;
                /* We can always skip the dummy node initially */
                iter = rcu_dereference(iter_prev->p.next);
                assert(iter_prev->p.reverse_hash <= node->p.reverse_hash);
                for (;;) {
                        /* End of list: insert after iter_prev. */
                        if (unlikely(is_end(iter)))
                                goto insert;
                        /* First node ordered after "node": insert before it. */
                        if (likely(clear_flag(iter)->p.reverse_hash > node->p.reverse_hash))
                                goto insert;
                        next = rcu_dereference(clear_flag(iter)->p.next);
                        /* iter is logically removed: try to unlink it, then retry. */
                        if (unlikely(is_removed(next)))
                                goto gc_node;
                        if ((mode == ADD_UNIQUE || mode == ADD_REPLACE)
                            && !is_dummy(next)
                            && !ht->compare_fct(node->key, node->key_len,
                                                clear_flag(iter)->key,
                                                clear_flag(iter)->key_len)) {
                                if (mode == ADD_UNIQUE)
                                        return clear_flag(iter);
                                else /* mode == ADD_REPLACE */
                                        goto replace;
                        }
                        /* Only account for identical reverse hash once */
                        if (iter_prev->p.reverse_hash != clear_flag(iter)->p.reverse_hash
                            && !is_dummy(next))
                                check_resize(ht, size, ++chain_len);
                        iter_prev = clear_flag(iter);
                        iter = next;
                }

        insert:
                assert(node != clear_flag(iter));
                assert(!is_removed(iter_prev));
                assert(!is_removed(iter));
                assert(iter_prev != node);
                /* The flag stored in node->p.next describes "node" itself. */
                if (!dummy)
                        node->p.next = clear_flag(iter);
                else
                        node->p.next = flag_dummy(clear_flag(iter));
                /*
                 * iter was read from iter_prev->p.next: keep its dummy
                 * flag in the value published there.
                 */
                if (is_dummy(iter))
                        new_node = flag_dummy(node);
                else
                        new_node = node;
                if (uatomic_cmpxchg(&iter_prev->p.next, iter,
                                    new_node) != iter) {
                        continue;       /* retry */
                } else {
                        if (mode == ADD_REPLACE)
                                return_node = NULL;
                        else    /* ADD_DEFAULT and ADD_UNIQUE */
                                return_node = node;
                        goto gc_end;
                }

        replace:

                /* _cds_lfht_replace() returning 0 means the replacement was done. */
                if (!_cds_lfht_replace(ht, size, clear_flag(iter), next,
                                    node)) {
                        return_node = clear_flag(iter);
                        goto end;       /* gc already done */
                } else {
                        continue;       /* retry */
                }

        gc_node:
                assert(!is_removed(iter));
                /* Unlink iter by pointing iter_prev past it, keeping iter_prev's dummy flag. */
                if (is_dummy(iter))
                        new_next = flag_dummy(clear_flag(next));
                else
                        new_next = clear_flag(next);
                /* Result ignored: the traversal is restarted either way. */
                (void) uatomic_cmpxchg(&iter_prev->p.next, iter, new_next);
                /* retry */
        }
gc_end:
        /* Garbage collect logically removed nodes in the bucket */
        index = hash & (size - 1);
        order = get_count_order_ulong(index + 1);
        lookup = &ht->t.tbl[order]->nodes[index & (!order ? 0 : ((1UL << (order - 1)) - 1))];
        dummy_node = (struct cds_lfht_node *) lookup;
        _cds_lfht_gc_bucket(dummy_node, node);
end:
        return return_node;
}
 957
/*
 * Logically delete "node" by setting the removed flag on its next
 * pointer, then garbage-collect its bucket so the node gets unlinked.
 *
 * size is used to select the bucket (hash & (size - 1)) and must be
 * non-zero (asserted). dummy_removal states whether "node" is expected
 * to be a dummy node; that expectation is only checked with assert().
 *
 * Returns 0 if this call set the removed flag, or -ENOENT if "node" is
 * NULL or was already flagged as removed.
 */
static
int _cds_lfht_del(struct cds_lfht *ht, unsigned long size,
                struct cds_lfht_node *node,
                int dummy_removal)
{
        struct cds_lfht_node *dummy, *next, *old;
        struct _cds_lfht_node *lookup;
        int flagged = 0;
        unsigned long hash, index, order;

        if (!node)      /* Return -ENOENT if asked to delete NULL node */
                goto end;

        /* logically delete the node */
        assert(!is_dummy(node));
        assert(!is_removed(node));
        old = rcu_dereference(node->p.next);
        do {
                struct cds_lfht_node *new_next;

                next = old;
                /* Already flagged as removed: report -ENOENT. */
                if (unlikely(is_removed(next)))
                        goto end;
                if (dummy_removal)
                        assert(is_dummy(next));
                else
                        assert(!is_dummy(next));
                new_next = flag_removed(next);
                /* Retry with the freshly observed value if the pointer changed. */
                old = uatomic_cmpxchg(&node->p.next, next, new_next);
        } while (old != next);

        /* We performed the (logical) deletion. */
        flagged = 1;

        /*
         * Ensure that the node is not visible to readers anymore: lookup for
         * the node, and remove it (along with any other logically removed node)
         * if found.
         */
        hash = bit_reverse_ulong(node->p.reverse_hash);
        assert(size > 0);
        index = hash & (size - 1);
        order = get_count_order_ulong(index + 1);
        lookup = &ht->t.tbl[order]->nodes[index & (!order ? 0 : ((1UL << (order - 1)) - 1))];
        dummy = (struct cds_lfht_node *) lookup;
        _cds_lfht_gc_bucket(dummy, node);
end:
        /*
         * Only the flagging action indicated that we (and no other)
         * removed the node from the hash.
         */
        if (flagged) {
                assert(is_removed(rcu_dereference(node->p.next)));
                return 0;
        } else {
                return -ENOENT;
        }
}
1016
1017 static
1018 void *partition_resize_thread(void *arg)
1019 {
1020         struct partition_resize_work *work = arg;
1021
1022         work->ht->cds_lfht_rcu_register_thread();
1023         work->fct(work->ht, work->i, work->start, work->len);
1024         work->ht->cds_lfht_rcu_unregister_thread();
1025         return NULL;
1026 }
1027
1028 static
1029 void partition_resize_helper(struct cds_lfht *ht, unsigned long i,
1030                 unsigned long len,
1031                 void (*fct)(struct cds_lfht *ht, unsigned long i,
1032                         unsigned long start, unsigned long len))
1033 {
1034         unsigned long partition_len;
1035         struct partition_resize_work *work;
1036         int thread, ret;
1037         unsigned long nr_threads;
1038         pthread_t *thread_id;
1039
1040         /*
1041          * Note: nr_cpus_mask + 1 is always power of 2.
1042          * We spawn just the number of threads we need to satisfy the minimum
1043          * partition size, up to the number of CPUs in the system.
1044          */
1045         nr_threads = min(nr_cpus_mask + 1,
1046                          len >> MIN_PARTITION_PER_THREAD_ORDER);
1047         partition_len = len >> get_count_order_ulong(nr_threads);
1048         work = calloc(nr_threads, sizeof(*work));
1049         thread_id = calloc(nr_threads, sizeof(*thread_id));
1050         assert(work);
1051         for (thread = 0; thread < nr_threads; thread++) {
1052                 work[thread].ht = ht;
1053                 work[thread].i = i;
1054                 work[thread].len = partition_len;
1055                 work[thread].start = thread * partition_len;
1056                 work[thread].fct = fct;
1057                 ret = pthread_create(&thread_id[thread], ht->resize_attr,
1058                         partition_resize_thread, &work[thread]);
1059                 assert(!ret);
1060         }
1061         for (thread = 0; thread < nr_threads; thread++) {
1062                 ret = pthread_join(thread_id[thread], NULL);
1063                 assert(!ret);
1064         }
1065         free(work);
1066         free(thread_id);
1067 }
1068
1069 /*
1070  * Holding RCU read lock to protect _cds_lfht_add against memory
1071  * reclaim that could be performed by other call_rcu worker threads (ABA
1072  * problem).
1073  *
1074  * When we reach a certain length, we can split this population phase over
1075  * many worker threads, based on the number of CPUs available in the system.
1076  * This should therefore take care of not having the expand lagging behind too
1077  * many concurrent insertion threads by using the scheduler's ability to
1078  * schedule dummy node population fairly with insertions.
1079  */
1080 static
1081 void init_table_populate_partition(struct cds_lfht *ht, unsigned long i,
1082                                    unsigned long start, unsigned long len)
1083 {
1084         unsigned long j;
1085
1086         ht->cds_lfht_rcu_read_lock();
1087         for (j = start; j < start + len; j++) {
1088                 struct cds_lfht_node *new_node =
1089                         (struct cds_lfht_node *) &ht->t.tbl[i]->nodes[j];
1090
1091                 dbg_printf("init populate: i %lu j %lu hash %lu\n",
1092                            i, j, !i ? 0 : (1UL << (i - 1)) + j);
1093                 new_node->p.reverse_hash =
1094                         bit_reverse_ulong(!i ? 0 : (1UL << (i - 1)) + j);
1095                 (void) _cds_lfht_add(ht, !i ? 0 : (1UL << (i - 1)),
1096                                 new_node, ADD_DEFAULT, 1);
1097                 if (CMM_LOAD_SHARED(ht->in_progress_destroy))
1098                         break;
1099         }
1100         ht->cds_lfht_rcu_read_unlock();
1101 }
1102
1103 static
1104 void init_table_populate(struct cds_lfht *ht, unsigned long i,
1105                          unsigned long len)
1106 {
1107         assert(nr_cpus_mask != -1);
1108         if (nr_cpus_mask < 0 || len < 2 * MIN_PARTITION_PER_THREAD) {
1109                 ht->cds_lfht_rcu_thread_online();
1110                 init_table_populate_partition(ht, i, 0, len);
1111                 ht->cds_lfht_rcu_thread_offline();
1112                 return;
1113         }
1114         partition_resize_helper(ht, i, len, init_table_populate_partition);
1115 }
1116
/*
 * Allocate and populate the dummy-node levels
 * [first_order, first_order + len_order), publishing the new table size
 * after each level has been populated.
 *
 * The loop stops early when the resize target is smaller than the size
 * the current level would produce, or after a level has been added while
 * in_progress_destroy is set. Allocation failure is treated as fatal
 * (assert).
 *
 * The callers visible in this file invoke it with ht->resize_mutex held.
 */
static
void init_table(struct cds_lfht *ht,
                unsigned long first_order, unsigned long len_order)
{
        unsigned long i, end_order;

        dbg_printf("init table: first_order %lu end_order %lu\n",
                   first_order, first_order + len_order);
        end_order = first_order + len_order;
        for (i = first_order; i < end_order; i++) {
                unsigned long len;

                /* Number of dummy nodes held by level i. */
                len = !i ? 1 : 1UL << (i - 1);
                dbg_printf("init order %lu len: %lu\n", i, len);

                /* Stop expand if the resize target changes under us */
                if (CMM_LOAD_SHARED(ht->t.resize_target) < (!i ? 1 : (1UL << i)))
                        break;

                /* Level header followed by its len dummy nodes, zero-filled. */
                ht->t.tbl[i] = calloc(1, sizeof(struct rcu_level)
                                + (len * sizeof(struct _cds_lfht_node)));
                assert(ht->t.tbl[i]);

                /*
                 * Set all dummy nodes reverse hash values for a level and
                 * link all dummy nodes into the table.
                 */
                init_table_populate(ht, i, len);

                /*
                 * Update table size.
                 */
                cmm_smp_wmb();  /* populate data before RCU size */
                CMM_STORE_SHARED(ht->t.size, !i ? 1 : (1UL << i));

                dbg_printf("init new size: %lu\n", !i ? 1 : (1UL << i));
                if (CMM_LOAD_SHARED(ht->in_progress_destroy))
                        break;
        }
}
1157
1158 /*
1159  * Holding RCU read lock to protect _cds_lfht_remove against memory
1160  * reclaim that could be performed by other call_rcu worker threads (ABA
1161  * problem).
1162  * For a single level, we logically remove and garbage collect each node.
1163  *
1164  * As a design choice, we perform logical removal and garbage collection on a
1165  * node-per-node basis to simplify this algorithm. We also assume keeping good
1166  * cache locality of the operation would overweight possible performance gain
1167  * that could be achieved by batching garbage collection for multiple levels.
1168  * However, this would have to be justified by benchmarks.
1169  *
1170  * Concurrent removal and add operations are helping us perform garbage
1171  * collection of logically removed nodes. We guarantee that all logically
1172  * removed nodes have been garbage-collected (unlinked) before call_rcu is
1173  * invoked to free a hole level of dummy nodes (after a grace period).
1174  *
1175  * Logical removal and garbage collection can therefore be done in batch or on a
1176  * node-per-node basis, as long as the guarantee above holds.
1177  *
1178  * When we reach a certain length, we can split this removal over many worker
1179  * threads, based on the number of CPUs available in the system. This should
1180  * take care of not letting resize process lag behind too many concurrent
1181  * updater threads actively inserting into the hash table.
1182  */
1183 static
1184 void remove_table_partition(struct cds_lfht *ht, unsigned long i,
1185                             unsigned long start, unsigned long len)
1186 {
1187         unsigned long j;
1188
1189         ht->cds_lfht_rcu_read_lock();
1190         for (j = start; j < start + len; j++) {
1191                 struct cds_lfht_node *fini_node =
1192                         (struct cds_lfht_node *) &ht->t.tbl[i]->nodes[j];
1193
1194                 dbg_printf("remove entry: i %lu j %lu hash %lu\n",
1195                            i, j, !i ? 0 : (1UL << (i - 1)) + j);
1196                 fini_node->p.reverse_hash =
1197                         bit_reverse_ulong(!i ? 0 : (1UL << (i - 1)) + j);
1198                 (void) _cds_lfht_del(ht, !i ? 0 : (1UL << (i - 1)),
1199                                 fini_node, 1);
1200                 if (CMM_LOAD_SHARED(ht->in_progress_destroy))
1201                         break;
1202         }
1203         ht->cds_lfht_rcu_read_unlock();
1204 }
1205
1206 static
1207 void remove_table(struct cds_lfht *ht, unsigned long i, unsigned long len)
1208 {
1209
1210         assert(nr_cpus_mask != -1);
1211         if (nr_cpus_mask < 0 || len < 2 * MIN_PARTITION_PER_THREAD) {
1212                 ht->cds_lfht_rcu_thread_online();
1213                 remove_table_partition(ht, i, 0, len);
1214                 ht->cds_lfht_rcu_thread_offline();
1215                 return;
1216         }
1217         partition_resize_helper(ht, i, len, remove_table_partition);
1218 }
1219
1220 static
1221 void fini_table(struct cds_lfht *ht,
1222                 unsigned long first_order, unsigned long len_order)
1223 {
1224         long i, end_order;
1225
1226         dbg_printf("fini table: first_order %lu end_order %lu\n",
1227                    first_order, first_order + len_order);
1228         end_order = first_order + len_order;
1229         assert(first_order > 0);
1230         for (i = end_order - 1; i >= first_order; i--) {
1231                 unsigned long len;
1232
1233                 len = !i ? 1 : 1UL << (i - 1);
1234                 dbg_printf("fini order %lu len: %lu\n", i, len);
1235
1236                 /* Stop shrink if the resize target changes under us */
1237                 if (CMM_LOAD_SHARED(ht->t.resize_target) > (1UL << (i - 1)))
1238                         break;
1239
1240                 cmm_smp_wmb();  /* populate data before RCU size */
1241                 CMM_STORE_SHARED(ht->t.size, 1UL << (i - 1));
1242
1243                 /*
1244                  * We need to wait for all add operations to reach Q.S. (and
1245                  * thus use the new table for lookups) before we can start
1246                  * releasing the old dummy nodes. Otherwise their lookup will
1247                  * return a logically removed node as insert position.
1248                  */
1249                 ht->cds_lfht_synchronize_rcu();
1250
1251                 /*
1252                  * Set "removed" flag in dummy nodes about to be removed.
1253                  * Unlink all now-logically-removed dummy node pointers.
1254                  * Concurrent add/remove operation are helping us doing
1255                  * the gc.
1256                  */
1257                 remove_table(ht, i, len);
1258
1259                 ht->cds_lfht_call_rcu(&ht->t.tbl[i]->head, cds_lfht_free_level);
1260
1261                 dbg_printf("fini new size: %lu\n", 1UL << i);
1262                 if (CMM_LOAD_SHARED(ht->in_progress_destroy))
1263                         break;
1264         }
1265 }
1266
1267 struct cds_lfht *_cds_lfht_new(cds_lfht_hash_fct hash_fct,
1268                         cds_lfht_compare_fct compare_fct,
1269                         unsigned long hash_seed,
1270                         unsigned long init_size,
1271                         int flags,
1272                         void (*cds_lfht_call_rcu)(struct rcu_head *head,
1273                                         void (*func)(struct rcu_head *head)),
1274                         void (*cds_lfht_synchronize_rcu)(void),
1275                         void (*cds_lfht_rcu_read_lock)(void),
1276                         void (*cds_lfht_rcu_read_unlock)(void),
1277                         void (*cds_lfht_rcu_thread_offline)(void),
1278                         void (*cds_lfht_rcu_thread_online)(void),
1279                         void (*cds_lfht_rcu_register_thread)(void),
1280                         void (*cds_lfht_rcu_unregister_thread)(void),
1281                         pthread_attr_t *attr)
1282 {
1283         struct cds_lfht *ht;
1284         unsigned long order;
1285
1286         /* init_size must be power of two */
1287         if (init_size && (init_size & (init_size - 1)))
1288                 return NULL;
1289         ht = calloc(1, sizeof(struct cds_lfht));
1290         assert(ht);
1291         ht->hash_fct = hash_fct;
1292         ht->compare_fct = compare_fct;
1293         ht->hash_seed = hash_seed;
1294         ht->cds_lfht_call_rcu = cds_lfht_call_rcu;
1295         ht->cds_lfht_synchronize_rcu = cds_lfht_synchronize_rcu;
1296         ht->cds_lfht_rcu_read_lock = cds_lfht_rcu_read_lock;
1297         ht->cds_lfht_rcu_read_unlock = cds_lfht_rcu_read_unlock;
1298         ht->cds_lfht_rcu_thread_offline = cds_lfht_rcu_thread_offline;
1299         ht->cds_lfht_rcu_thread_online = cds_lfht_rcu_thread_online;
1300         ht->cds_lfht_rcu_register_thread = cds_lfht_rcu_register_thread;
1301         ht->cds_lfht_rcu_unregister_thread = cds_lfht_rcu_unregister_thread;
1302         ht->resize_attr = attr;
1303         ht->percpu_count = alloc_per_cpu_items_count();
1304         /* this mutex should not nest in read-side C.S. */
1305         pthread_mutex_init(&ht->resize_mutex, NULL);
1306         order = get_count_order_ulong(max(init_size, MIN_TABLE_SIZE)) + 1;
1307         ht->flags = flags;
1308         ht->cds_lfht_rcu_thread_offline();
1309         pthread_mutex_lock(&ht->resize_mutex);
1310         ht->t.resize_target = 1UL << (order - 1);
1311         init_table(ht, 0, order);
1312         pthread_mutex_unlock(&ht->resize_mutex);
1313         ht->cds_lfht_rcu_thread_online();
1314         return ht;
1315 }
1316
/*
 * Look up the first node whose key matches (key, key_len).
 *
 * The key is hashed with ht->hash_fct and ht->hash_seed, the bucket is
 * selected from the current table size, and the chain is walked until a
 * match is found, the end of the list is reached, or a node ordered
 * after the key's reverse hash is seen.
 *
 * On return, iter->node is the matching node (NULL if none) and
 * iter->next is the value read from that node's next pointer (NULL if
 * none). Nodes flagged as removed or dummy are never returned.
 *
 * NOTE(review): presumably must be called within an RCU read-side
 * critical section, since it relies on rcu_dereference() - confirm
 * against the public API documentation.
 */
void cds_lfht_lookup(struct cds_lfht *ht, void *key, size_t key_len,
                struct cds_lfht_iter *iter)
{
        struct cds_lfht_node *node, *next, *dummy_node;
        struct _cds_lfht_node *lookup;
        unsigned long hash, reverse_hash, index, order, size;

        hash = ht->hash_fct(key, key_len, ht->hash_seed);
        reverse_hash = bit_reverse_ulong(hash);

        size = rcu_dereference(ht->t.size);
        index = hash & (size - 1);
        order = get_count_order_ulong(index + 1);
        lookup = &ht->t.tbl[order]->nodes[index & (!order ? 0 : ((1UL << (order - 1))) - 1)];
        dbg_printf("lookup hash %lu index %lu order %lu aridx %lu\n",
                   hash, index, order, index & (!order ? 0 : ((1UL << (order - 1)) - 1)));
        dummy_node = (struct cds_lfht_node *) lookup;
        /* We can always skip the dummy node initially */
        node = rcu_dereference(dummy_node->p.next);
        node = clear_flag(node);
        for (;;) {
                /* End of list: no match. */
                if (unlikely(is_end(node))) {
                        node = next = NULL;
                        break;
                }
                /* Went past the position this key would occupy: no match. */
                if (unlikely(node->p.reverse_hash > reverse_hash)) {
                        node = next = NULL;
                        break;
                }
                next = rcu_dereference(node->p.next);
                /* Match only live, non-dummy nodes whose key compares equal (0). */
                if (likely(!is_removed(next))
                    && !is_dummy(next)
                    && likely(!ht->compare_fct(node->key, node->key_len, key, key_len))) {
                                break;
                }
                node = clear_flag(next);
        }
        assert(!node || !is_dummy(rcu_dereference(node->p.next)));
        iter->node = node;
        iter->next = next;
}
1358
/*
 * Advance "iter" to the next node whose key compares equal to the key of
 * the node the iterator currently points to.
 *
 * iter->node must be non-NULL on entry (it is dereferenced
 * unconditionally). The walk resumes from iter->next and stops at the
 * end of the list or at the first node ordered after the current node's
 * reverse hash.
 *
 * On return, iter->node is the next matching node (NULL if none) and
 * iter->next is the value read from that node's next pointer (NULL if
 * none). Nodes flagged as removed or dummy are never returned.
 */
void cds_lfht_next(struct cds_lfht *ht, struct cds_lfht_iter *iter)
{
        struct cds_lfht_node *node, *next;
        unsigned long reverse_hash;
        void *key;
        size_t key_len;

        /* Capture the current node's hash and key before moving on. */
        node = iter->node;
        reverse_hash = node->p.reverse_hash;
        key = node->key;
        key_len = node->key_len;
        next = iter->next;
        node = clear_flag(next);

        for (;;) {
                /* End of list: no further duplicate. */
                if (unlikely(is_end(node))) {
                        node = next = NULL;
                        break;
                }
                /* Went past the nodes sharing this reverse hash: no further duplicate. */
                if (unlikely(node->p.reverse_hash > reverse_hash)) {
                        node = next = NULL;
                        break;
                }
                next = rcu_dereference(node->p.next);
                /* Match only live, non-dummy nodes whose key compares equal (0). */
                if (likely(!is_removed(next))
                    && !is_dummy(next)
                    && likely(!ht->compare_fct(node->key, node->key_len, key, key_len))) {
                                break;
                }
                node = clear_flag(next);
        }
        assert(!node || !is_dummy(rcu_dereference(node->p.next)));
        iter->node = node;
        iter->next = next;
}
1394
1395 void cds_lfht_add(struct cds_lfht *ht, struct cds_lfht_node *node)
1396 {
1397         unsigned long hash, size;
1398
1399         hash = ht->hash_fct(node->key, node->key_len, ht->hash_seed);
1400         node->p.reverse_hash = bit_reverse_ulong((unsigned long) hash);
1401
1402         size = rcu_dereference(ht->t.size);
1403         (void) _cds_lfht_add(ht, size, node, ADD_DEFAULT, 0);
1404         ht_count_add(ht, size);
1405 }
1406
1407 struct cds_lfht_node *cds_lfht_add_unique(struct cds_lfht *ht,
1408                                 struct cds_lfht_node *node)
1409 {
1410         unsigned long hash, size;
1411         struct cds_lfht_node *ret;
1412
1413         hash = ht->hash_fct(node->key, node->key_len, ht->hash_seed);
1414         node->p.reverse_hash = bit_reverse_ulong((unsigned long) hash);
1415
1416         size = rcu_dereference(ht->t.size);
1417         ret = _cds_lfht_add(ht, size, node, ADD_UNIQUE, 0);
1418         if (ret == node)
1419                 ht_count_add(ht, size);
1420         return ret;
1421 }
1422
1423 struct cds_lfht_node *cds_lfht_add_replace(struct cds_lfht *ht,
1424                                 struct cds_lfht_node *node)
1425 {
1426         unsigned long hash, size;
1427         struct cds_lfht_node *ret;
1428
1429         hash = ht->hash_fct(node->key, node->key_len, ht->hash_seed);
1430         node->p.reverse_hash = bit_reverse_ulong((unsigned long) hash);
1431
1432         size = rcu_dereference(ht->t.size);
1433         ret = _cds_lfht_add(ht, size, node, ADD_REPLACE, 0);
1434         if (ret == NULL)
1435                 ht_count_add(ht, size);
1436         return ret;
1437 }
1438
1439 int cds_lfht_replace(struct cds_lfht *ht, struct cds_lfht_iter *old_iter,
1440                 struct cds_lfht_node *new_node)
1441 {
1442         unsigned long size;
1443
1444         size = rcu_dereference(ht->t.size);
1445         return _cds_lfht_replace(ht, size, old_iter->node, old_iter->next,
1446                         new_node);
1447 }
1448
1449 int cds_lfht_del(struct cds_lfht *ht, struct cds_lfht_iter *iter)
1450 {
1451         unsigned long size;
1452         int ret;
1453
1454         size = rcu_dereference(ht->t.size);
1455         ret = _cds_lfht_del(ht, size, iter->node, 0);
1456         if (!ret)
1457                 ht_count_del(ht, size);
1458         return ret;
1459 }
1460
1461 static
1462 int cds_lfht_delete_dummy(struct cds_lfht *ht)
1463 {
1464         struct cds_lfht_node *node;
1465         struct _cds_lfht_node *lookup;
1466         unsigned long order, i, size;
1467
1468         /* Check that the table is empty */
1469         lookup = &ht->t.tbl[0]->nodes[0];
1470         node = (struct cds_lfht_node *) lookup;
1471         do {
1472                 node = clear_flag(node)->p.next;
1473                 if (!is_dummy(node))
1474                         return -EPERM;
1475                 assert(!is_removed(node));
1476         } while (!is_end(node));
1477         /*
1478          * size accessed without rcu_dereference because hash table is
1479          * being destroyed.
1480          */
1481         size = ht->t.size;
1482         /* Internal sanity check: all nodes left should be dummy */
1483         for (order = 0; order < get_count_order_ulong(size) + 1; order++) {
1484                 unsigned long len;
1485
1486                 len = !order ? 1 : 1UL << (order - 1);
1487                 for (i = 0; i < len; i++) {
1488                         dbg_printf("delete order %lu i %lu hash %lu\n",
1489                                 order, i,
1490                                 bit_reverse_ulong(ht->t.tbl[order]->nodes[i].reverse_hash));
1491                         assert(is_dummy(ht->t.tbl[order]->nodes[i].next));
1492                 }
1493                 poison_free(ht->t.tbl[order]);
1494         }
1495         return 0;
1496 }
1497
1498 /*
1499  * Should only be called when no more concurrent readers nor writers can
1500  * possibly access the table.
1501  */
1502 int cds_lfht_destroy(struct cds_lfht *ht, pthread_attr_t **attr)
1503 {
1504         int ret;
1505
1506         /* Wait for in-flight resize operations to complete */
1507         CMM_STORE_SHARED(ht->in_progress_destroy, 1);
1508         while (uatomic_read(&ht->in_progress_resize))
1509                 poll(NULL, 0, 100);     /* wait for 100ms */
1510         ret = cds_lfht_delete_dummy(ht);
1511         if (ret)
1512                 return ret;
1513         free_per_cpu_items_count(ht->percpu_count);
1514         if (attr)
1515                 *attr = ht->resize_attr;
1516         poison_free(ht);
1517         return ret;
1518 }
1519
1520 void cds_lfht_count_nodes(struct cds_lfht *ht,
1521                 long *approx_before,
1522                 unsigned long *count,
1523                 unsigned long *removed,
1524                 long *approx_after)
1525 {
1526         struct cds_lfht_node *node, *next;
1527         struct _cds_lfht_node *lookup;
1528         unsigned long nr_dummy = 0;
1529
1530         *approx_before = 0;
1531         if (nr_cpus_mask >= 0) {
1532                 int i;
1533
1534                 for (i = 0; i < nr_cpus_mask + 1; i++) {
1535                         *approx_before += uatomic_read(&ht->percpu_count[i].add);
1536                         *approx_before -= uatomic_read(&ht->percpu_count[i].del);
1537                 }
1538         }
1539
1540         *count = 0;
1541         *removed = 0;
1542
1543         /* Count non-dummy nodes in the table */
1544         lookup = &ht->t.tbl[0]->nodes[0];
1545         node = (struct cds_lfht_node *) lookup;
1546         do {
1547                 next = rcu_dereference(node->p.next);
1548                 if (is_removed(next)) {
1549                         if (!is_dummy(next))
1550                                 (*removed)++;
1551                         else
1552                                 (nr_dummy)++;
1553                 } else if (!is_dummy(next))
1554                         (*count)++;
1555                 else
1556                         (nr_dummy)++;
1557                 node = clear_flag(next);
1558         } while (!is_end(node));
1559         dbg_printf("number of dummy nodes: %lu\n", nr_dummy);
1560         *approx_after = 0;
1561         if (nr_cpus_mask >= 0) {
1562                 int i;
1563
1564                 for (i = 0; i < nr_cpus_mask + 1; i++) {
1565                         *approx_after += uatomic_read(&ht->percpu_count[i].add);
1566                         *approx_after -= uatomic_read(&ht->percpu_count[i].del);
1567                 }
1568         }
1569 }
1570
/*
 * Grow the table from old_size to new_size buckets by initialising the
 * missing levels. new_size must be greater than old_size (asserted).
 *
 * called with resize mutex held
 */
static
void _do_cds_lfht_grow(struct cds_lfht *ht,
                unsigned long old_size, unsigned long new_size)
{
        unsigned long old_order, new_order;

        old_order = get_count_order_ulong(old_size) + 1;
        new_order = get_count_order_ulong(new_size) + 1;
        /* Debug trace only: a library must not write to stdout on resize. */
        dbg_printf("resize from %lu (order %lu) to %lu (order %lu) buckets\n",
                   old_size, old_order, new_size, new_order);
        assert(new_size > old_size);
        init_table(ht, old_order, new_order - old_order);
}
1585
1586 /* called with resize mutex held */
1587 static
1588 void _do_cds_lfht_shrink(struct cds_lfht *ht,
1589                 unsigned long old_size, unsigned long new_size)
1590 {
1591         unsigned long old_order, new_order;
1592
1593         new_size = max(new_size, MIN_TABLE_SIZE);
1594         old_order = get_count_order_ulong(old_size) + 1;
1595         new_order = get_count_order_ulong(new_size) + 1;
1596         printf("resize from %lu (order %lu) to %lu (order %lu) buckets\n",
1597                old_size, old_order, new_size, new_order);
1598         assert(new_size < old_size);
1599
1600         /* Remove and unlink all dummy nodes to remove. */
1601         fini_table(ht, new_order, old_order - new_order);
1602 }
1603
1604
1605 /* called with resize mutex held */
1606 static
1607 void _do_cds_lfht_resize(struct cds_lfht *ht)
1608 {
1609         unsigned long new_size, old_size;
1610
1611         /*
1612          * Resize table, re-do if the target size has changed under us.
1613          */
1614         do {
1615                 ht->t.resize_initiated = 1;
1616                 old_size = ht->t.size;
1617                 new_size = CMM_LOAD_SHARED(ht->t.resize_target);
1618                 if (old_size < new_size)
1619                         _do_cds_lfht_grow(ht, old_size, new_size);
1620                 else if (old_size > new_size)
1621                         _do_cds_lfht_shrink(ht, old_size, new_size);
1622                 ht->t.resize_initiated = 0;
1623                 /* write resize_initiated before read resize_target */
1624                 cmm_smp_mb();
1625         } while (ht->t.size != CMM_LOAD_SHARED(ht->t.resize_target));
1626 }
1627
1628 static
1629 unsigned long resize_target_update(struct cds_lfht *ht, unsigned long size,
1630                                    int growth_order)
1631 {
1632         return _uatomic_max(&ht->t.resize_target,
1633                             size << growth_order);
1634 }
1635
1636 static
1637 void resize_target_update_count(struct cds_lfht *ht,
1638                                 unsigned long count)
1639 {
1640         count = max(count, MIN_TABLE_SIZE);
1641         uatomic_set(&ht->t.resize_target, count);
1642 }
1643
1644 void cds_lfht_resize(struct cds_lfht *ht, unsigned long new_size)
1645 {
1646         resize_target_update_count(ht, new_size);
1647         CMM_STORE_SHARED(ht->t.resize_initiated, 1);
1648         ht->cds_lfht_rcu_thread_offline();
1649         pthread_mutex_lock(&ht->resize_mutex);
1650         _do_cds_lfht_resize(ht);
1651         pthread_mutex_unlock(&ht->resize_mutex);
1652         ht->cds_lfht_rcu_thread_online();
1653 }
1654
1655 static
1656 void do_resize_cb(struct rcu_head *head)
1657 {
1658         struct rcu_resize_work *work =
1659                 caa_container_of(head, struct rcu_resize_work, head);
1660         struct cds_lfht *ht = work->ht;
1661
1662         ht->cds_lfht_rcu_thread_offline();
1663         pthread_mutex_lock(&ht->resize_mutex);
1664         _do_cds_lfht_resize(ht);
1665         pthread_mutex_unlock(&ht->resize_mutex);
1666         ht->cds_lfht_rcu_thread_online();
1667         poison_free(work);
1668         cmm_smp_mb();   /* finish resize before decrement */
1669         uatomic_dec(&ht->in_progress_resize);
1670 }
1671
1672 static
1673 void cds_lfht_resize_lazy(struct cds_lfht *ht, unsigned long size, int growth)
1674 {
1675         struct rcu_resize_work *work;
1676         unsigned long target_size;
1677
1678         target_size = resize_target_update(ht, size, growth);
1679         /* Store resize_target before read resize_initiated */
1680         cmm_smp_mb();
1681         if (!CMM_LOAD_SHARED(ht->t.resize_initiated) && size < target_size) {
1682                 uatomic_inc(&ht->in_progress_resize);
1683                 cmm_smp_mb();   /* increment resize count before calling it */
1684                 work = malloc(sizeof(*work));
1685                 work->ht = ht;
1686                 ht->cds_lfht_call_rcu(&work->head, do_resize_cb);
1687                 CMM_STORE_SHARED(ht->t.resize_initiated, 1);
1688         }
1689 }
1690
1691 #if defined(HAVE_SCHED_GETCPU) && defined(HAVE_SYSCONF)
1692
1693 static
1694 void cds_lfht_resize_lazy_count(struct cds_lfht *ht, unsigned long size,
1695                                 unsigned long count)
1696 {
1697         struct rcu_resize_work *work;
1698
1699         if (!(ht->flags & CDS_LFHT_AUTO_RESIZE))
1700                 return;
1701         resize_target_update_count(ht, count);
1702         /* Store resize_target before read resize_initiated */
1703         cmm_smp_mb();
1704         if (!CMM_LOAD_SHARED(ht->t.resize_initiated)) {
1705                 uatomic_inc(&ht->in_progress_resize);
1706                 cmm_smp_mb();   /* increment resize count before calling it */
1707                 work = malloc(sizeof(*work));
1708                 work->ht = ht;
1709                 ht->cds_lfht_call_rcu(&work->head, do_resize_cb);
1710                 CMM_STORE_SHARED(ht->t.resize_initiated, 1);
1711         }
1712 }
1713
1714 #endif