rculfhash.c

   1 /*
   2  * rculfhash.c
   3  *
   4  * Userspace RCU library - Lock-Free Resizable RCU Hash Table
   5  *
   6  * Copyright 2010-2011 - Mathieu Desnoyers <mathieu.desnoyers@efficios.com>
   7  *
   8  * This library is free software; you can redistribute it and/or
   9  * modify it under the terms of the GNU Lesser General Public
  10  * License as published by the Free Software Foundation; either
  11  * version 2.1 of the License, or (at your option) any later version.
  12  *
  13  * This library is distributed in the hope that it will be useful,
  14  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  15  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  16  * Lesser General Public License for more details.
  17  *
  18  * You should have received a copy of the GNU Lesser General Public
  19  * License along with this library; if not, write to the Free Software
  20  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  21  */
  22
  23 /*
  24  * Based on the following articles:
  25  * - Ori Shalev and Nir Shavit. Split-ordered lists: Lock-free
  26  *   extensible hash tables. J. ACM 53, 3 (May 2006), 379-405.
  27  * - Michael, M. M. High performance dynamic lock-free hash tables
  28  *   and list-based sets. In Proceedings of the fourteenth annual ACM
  29  *   symposium on Parallel algorithms and architectures, ACM Press,
  30  *   (2002), 73-82.
  31  *
  32  * Some specificities of this Lock-Free Resizable RCU Hash Table
  33  * implementation:
  34  *
  35  * - RCU read-side critical section allows readers to perform hash
  36  *   table lookups and use the returned objects safely by delaying
  37  *   memory reclaim for a grace period.
  38  * - Add and remove operations are lock-free, and do not need to
  39  *   allocate memory. They need to be executed within RCU read-side
  40  *   critical section to ensure the objects they read are valid and to
  41  *   deal with the cmpxchg ABA problem.
  42  * - add and add_unique operations are supported. add_unique checks if
  43  *   the node key already exists in the hash table. It ensures no key
  44  *   duplicate exists.
  45  * - The resize operation executes concurrently with add/remove/lookup.
  46  * - Hash table nodes are contained within a split-ordered list. This
  47  *   list is ordered by incrementing reversed-bits-hash value.
  48  * - An index of dummy nodes is kept. These dummy nodes are the hash
  49  *   table "buckets", and they are also chained together in the
  50  *   split-ordered list, which allows recursive expansion.
  51  * - The resize operation for small tables only allows expanding the hash table.
  52  *   It is triggered automatically by detecting long chains in the add
  53  *   operation.
  54  * - The resize operation for larger tables (and available through an
  55  *   API) allows both expanding and shrinking the hash table.
  56  * - Per-CPU Split-counters are used to keep track of the number of
  57  *   nodes within the hash table for automatic resize triggering.
  58  * - Resize operation initiated by long chain detection is executed by a
  59  *   call_rcu thread, which keeps lock-freedom of add and remove.
  60  * - Resize operations are protected by a mutex.
  61  * - The removal operation is split in two parts: first, a "removed"
  62  *   flag is set in the next pointer within the node to remove. Then,
  63  *   a "garbage collection" is performed in the bucket containing the
  64  *   removed node (from the start of the bucket up to the removed node).
  65  *   All encountered nodes with "removed" flag set in their next
  66  *   pointers are removed from the linked-list. If the cmpxchg used for
  67  *   removal fails (due to concurrent garbage-collection or concurrent
  68  *   add), we retry from the beginning of the bucket. This ensures that
  69  *   the node with "removed" flag set is removed from the hash table
  70  *   (not visible to lookups anymore) before the RCU read-side critical
  71  *   section held across removal ends. Furthermore, this ensures that
  72  *   the node with "removed" flag set is removed from the linked-list
  73  *   before its memory is reclaimed. Only the thread whose removal
  74  *   successfully set the "removed" flag (with a cmpxchg) into a node's
  75  *   next pointer is considered to have succeeded its removal (and thus
  76  *   owns the node to reclaim). Because we garbage-collect starting from
  77  *   an invariant node (the start-of-bucket dummy node) up to the
  78  *   "removed" node (or find a reverse-hash that is higher), we are sure
  79  *   that a successful traversal of the chain leads to a chain that is
  80  *   present in the linked-list (the start node is never removed) and
  81  *   that it does not contain the "removed" node anymore, even if
  82  *   concurrent delete/add operations are changing the structure of the
  83  *   list concurrently.
  84  * - The add operation performs garbage collection of buckets if it
  85  *   encounters nodes with removed flag set in the bucket where it wants
  86  *   to add its new node. This ensures lock-freedom of add operation by
  87  *   helping the remover unlink nodes from the list rather than to wait
  88  *   for it to do so.
  89  * - A RCU "order table" indexed by log2(hash index) is copied and
  90  *   expanded by the resize operation. This order table allows finding
  91  *   the "dummy node" tables.
  92  * - There is one dummy node table per hash index order. The size of
  93  *   each dummy node table is half the number of hashes contained in
  94  *   this order.
  95  * - call_rcu is used to garbage-collect the old order table.
  96  * - The per-order dummy node tables contain a compact version of the
  97  *   hash table nodes. These tables are invariant after they are
  98  *   populated into the hash table.
  99  *
 100  * A bit of ascii art explanation:
 101  *
 102  * The order index is off by one compared to the actual power of 2,
 103  * because index 0 is reserved for the 0 special-case.
 104  *
 105  * This shows the nodes for a small table ordered by reversed bits:
 106  *
 107  *    bits   reverse
 108  * 0  000        000
 109  * 4  100        001
 110  * 2  010        010
 111  * 6  110        011
 112  * 1  001        100
 113  * 5  101        101
 114  * 3  011        110
 115  * 7  111        111
 116  *
 117  * This shows the nodes in order of non-reversed bits, linked by
 118  * reversed-bit order.
 119  *
 120  * order              bits       reverse
 121  * 0               0  000        000
 122  *                 |
 123  * 1               |  1  001        100       <-    <-
 124  *                 |  |                        |     |
 125  * 2               |  |  2  010        010     |     |
 126  *                 |  |  |  3  011        110  | <-  |
 127  *                 |  |  |  |                  |  |  |
 128  * 3               -> |  |  |  4  100        001  |  |
 129  *                    -> |  |     5  101        101  |
 130  *                       -> |        6  110        011
 131  *                          ->          7  111        111
 132  */
 133
 134 #define _LGPL_SOURCE
 135 #include <stdlib.h>
 136 #include <errno.h>
 137 #include <assert.h>
 138 #include <stdio.h>
 139 #include <stdint.h>
 140 #include <string.h>
 141
 142 #include "config.h"
 143 #include <urcu.h>
 144 #include <urcu-call-rcu.h>
 145 #include <urcu/arch.h>
 146 #include <urcu/uatomic.h>
 147 #include <urcu/jhash.h>
 148 #include <urcu/compiler.h>
 149 #include <urcu/rculfhash.h>
 150 #include <stdio.h>
 151 #include <pthread.h>
 152
/*
 * dbg_printf: debug trace helper. When DEBUG is defined it forwards to
 * printf() with a "[debug rculfhash] " prefix prepended to the format
 * string; otherwise it expands to nothing, so its arguments are not
 * evaluated.
 */
#ifdef DEBUG
#define dbg_printf(fmt, args...)     printf("[debug rculfhash] " fmt, ## args)
#else
#define dbg_printf(fmt, args...)
#endif
 158
/*
 * Per-CPU split-counters lazily update the global counter every 1024
 * additions/removals. This automatically keeps track of when a resize is
 * required. We use the bucket length as the indicator of a need to
 * expand for small tables and for machines lacking per-CPU data support.
 */
/* log2 of the per-CPU commit batch: 1UL << 10 == 1024 operations. */
#define COUNT_COMMIT_ORDER              10
/* Chain length subtracted (minus one) when computing the growth order. */
#define CHAIN_LEN_TARGET                1
/* Chain length at which check_resize() requests a lazy resize. */
#define CHAIN_LEN_RESIZE_THRESHOLD      3
 168
/*
 * Define the minimum table size.
 */
#define MIN_TABLE_SIZE                  1

/*
 * Number of slots in the per-order level array of struct rcu_table:
 * 32 when CAA_BITS_PER_LONG is 32, 64 otherwise.
 */
#if (CAA_BITS_PER_LONG == 32)
#define MAX_TABLE_ORDER                 32
#else
#define MAX_TABLE_ORDER                 64
#endif
 179
/*
 * Minimum number of dummy nodes to touch per thread to parallelize grow/shrink.
 * Expressed as an order (log2); MIN_PARTITION_PER_THREAD is the
 * corresponding count, i.e. 1UL << 12 == 4096.
 */
#define MIN_PARTITION_PER_THREAD_ORDER  12
#define MIN_PARTITION_PER_THREAD        (1UL << MIN_PARTITION_PER_THREAD_ORDER)
 185
/*
 * Fallback min()/max() helpers, only defined when no such macro already
 * exists. Each argument is expanded twice, so callers must not pass
 * expressions with side effects.
 */
#ifndef min
#define min(a, b)       ((a) < (b) ? (a) : (b))
#endif

#ifndef max
#define max(a, b)       ((a) > (b) ? (a) : (b))
#endif
 193
/*
 * The removed flag needs to be updated atomically with the pointer.
 * It indicates that no node must attach to the node scheduled for
 * removal. The gc flag also needs to be updated atomically with the
 * pointer. It indicates that node garbage collection must be performed.
 * The dummy flag does not require to be updated atomically with the
 * pointer, but it is added as a pointer low bit flag to save space.
 *
 * All three flags live in the three low-order bits of a node pointer;
 * FLAGS_MASK covers exactly those bits and is what clear_flag() strips.
 */
#define REMOVED_FLAG            (1UL << 0)
#define GC_FLAG                 (1UL << 1)
#define DUMMY_FLAG              (1UL << 2)
#define FLAGS_MASK              ((1UL << 3) - 1)

/* Value of the end pointer. Should not interact with flags. */
#define END_VALUE               NULL
 209
 210 struct ht_items_count {
 211         unsigned long add, del;
 212 } __attribute__((aligned(CAA_CACHE_LINE_SIZE)));
 213
 214 struct rcu_level {
 215         struct rcu_head head;
 216         struct _cds_lfht_node nodes[0];
 217 };
 218
/*
 * Bucket index of the hash table: the current size plus one level of
 * dummy nodes per order. _cds_lfht_add() locates a bucket by computing
 * the order of (index + 1) and indexing tbl[order]->nodes.
 */
struct rcu_table {
        unsigned long size;     /* always a power of 2, shared (RCU) */
        unsigned long resize_target;
        int resize_initiated;
        struct rcu_level *tbl[MAX_TABLE_ORDER]; /* indexed by order */
};
 225
/*
 * Hash table instance: the bucket index, the user callbacks, the RCU
 * operations (stored as function pointers) and the bookkeeping used for
 * resizing and item accounting.
 */
struct cds_lfht {
        struct rcu_table t;     /* bucket index, see struct rcu_table */
        cds_lfht_hash_fct hash_fct;
        /* A zero return is treated as "keys match" by _cds_lfht_add(). */
        cds_lfht_compare_fct compare_fct;
        unsigned long hash_seed;
        int flags;              /* option bits, e.g. CDS_LFHT_AUTO_RESIZE */
        /*
         * We need to put the work threads offline (QSBR) when taking this
         * mutex, because we use synchronize_rcu within this mutex critical
         * section, which waits on read-side critical sections, and could
         * therefore cause grace-period deadlock if we hold off RCU G.P.
         * completion.
         */
        pthread_mutex_t resize_mutex;   /* resize mutex: add/del mutex */
        unsigned int in_progress_resize, in_progress_destroy;
        /* RCU operations used by the table, one function pointer each. */
        void (*cds_lfht_call_rcu)(struct rcu_head *head,
                      void (*func)(struct rcu_head *head));
        void (*cds_lfht_synchronize_rcu)(void);
        void (*cds_lfht_rcu_read_lock)(void);
        void (*cds_lfht_rcu_read_unlock)(void);
        void (*cds_lfht_rcu_thread_offline)(void);
        void (*cds_lfht_rcu_thread_online)(void);
        void (*cds_lfht_rcu_register_thread)(void);
        void (*cds_lfht_rcu_unregister_thread)(void);
        pthread_attr_t *resize_attr;    /* Resize threads attributes */
        unsigned long count;            /* global approximate item count */
        /* NULL when per-cpu accounting is unavailable (see ht_count_add). */
        struct ht_items_count *percpu_count;    /* per-cpu item count */
};
 254
/*
 * Work item pairing an RCU head with the hash table it refers to.
 * NOTE(review): presumably queued through call_rcu to run a lazy
 * resize; the consumer is outside this view, confirm there.
 */
struct rcu_resize_work {
        struct rcu_head head;
        struct cds_lfht *ht;
};
 259
 260 struct partition_resize_work {
 261         struct rcu_head head;
 262         struct cds_lfht *ht;
 263         unsigned long i, start, len;
 264         void (*fct)(struct cds_lfht *ht, unsigned long i,
 265                     unsigned long start, unsigned long len);
 266 };
 267
/*
 * Insertion policy passed to _cds_lfht_add().
 */
enum add_mode {
        ADD_DEFAULT = 0,        /* insert without looking for a matching key */
        ADD_UNIQUE = 1,         /* return the existing node if a key matches */
        ADD_REPLACE = 2,        /* take the replace path if a key matches */
};
 273
/*
 * Forward declaration: insert "node" into table "ht" of the given size
 * according to "mode"; "dummy" is non-zero when the node being added is
 * a dummy (bucket) node.
 */
static
struct cds_lfht_node *_cds_lfht_add(struct cds_lfht *ht,
                                unsigned long size,
                                struct cds_lfht_node *node,
                                enum add_mode mode, int dummy);
 279
/*
 * Forward declaration of the node removal primitive.
 * NOTE(review): the definition is outside this view; the meaning of
 * dummy_removal/do_gc and of the return value must be checked there.
 */
static
int _cds_lfht_del(struct cds_lfht *ht, unsigned long size,
                struct cds_lfht_node *node,
                int dummy_removal, int do_gc);
 284
/*
 * Algorithm to reverse bits in a word by lookup table, extended to
 * 64-bit words.
 * Source:
 * http://graphics.stanford.edu/~seander/bithacks.html#BitReverseTable
 * Originally from Public Domain.
 *
 * BitReverseTable256[b] is the byte b with its 8 bits in reverse order.
 * The R2/R4/R6 helper macros generate the 256 entries recursively
 * (4, 16 and 64 entries respectively) and are undefined right after
 * the table.
 */

static const uint8_t BitReverseTable256[256] =
{
#define R2(n) (n),   (n) + 2*64,     (n) + 1*64,     (n) + 3*64
#define R4(n) R2(n), R2((n) + 2*16), R2((n) + 1*16), R2((n) + 3*16)
#define R6(n) R4(n), R4((n) + 2*4 ), R4((n) + 1*4 ), R4((n) + 3*4 )
        R6(0), R6(2), R6(1), R6(3)
};
#undef R2
#undef R4
#undef R6
 303
/*
 * Reverse the bit order of one byte through the BitReverseTable256
 * lookup table.
 */
static
uint8_t bit_reverse_u8(uint8_t v)
{
        return BitReverseTable256[v];
}
 309
 310 static __attribute__((unused))
 311 uint32_t bit_reverse_u32(uint32_t v)
 312 {
 313         return ((uint32_t) bit_reverse_u8(v) << 24) |
 314                 ((uint32_t) bit_reverse_u8(v >> 8) << 16) |
 315                 ((uint32_t) bit_reverse_u8(v >> 16) << 8) |
 316                 ((uint32_t) bit_reverse_u8(v >> 24));
 317 }
 318
 319 static __attribute__((unused))
 320 uint64_t bit_reverse_u64(uint64_t v)
 321 {
 322         return ((uint64_t) bit_reverse_u8(v) << 56) |
 323                 ((uint64_t) bit_reverse_u8(v >> 8)  << 48) |
 324                 ((uint64_t) bit_reverse_u8(v >> 16) << 40) |
 325                 ((uint64_t) bit_reverse_u8(v >> 24) << 32) |
 326                 ((uint64_t) bit_reverse_u8(v >> 32) << 24) |
 327                 ((uint64_t) bit_reverse_u8(v >> 40) << 16) |
 328                 ((uint64_t) bit_reverse_u8(v >> 48) << 8) |
 329                 ((uint64_t) bit_reverse_u8(v >> 56));
 330 }
 331
/*
 * Reverse the bit order of an unsigned long, using the 32-bit helper
 * when CAA_BITS_PER_LONG is 32 and the 64-bit helper otherwise.
 */
static
unsigned long bit_reverse_ulong(unsigned long v)
{
#if (CAA_BITS_PER_LONG == 32)
        return bit_reverse_u32(v);
#else
        return bit_reverse_u64(v);
#endif
}
 341
 342 /*
 343  * fls: returns the position of the most significant bit.
 344  * Returns 0 if no bit is set, else returns the position of the most
 345  * significant bit (from 1 to 32 on 32-bit, from 1 to 64 on 64-bit).
 346  */
#if defined(__i386) || defined(__x86_64)
/*
 * x86 version of fls_u32, built on the bsrl instruction. Defines
 * HAS_FLS_U32 so that the generic C fallback is not compiled.
 *
 * Returns 0 when x is 0, otherwise the 1-based position of the most
 * significant set bit (1 to 32).
 */
static inline
unsigned int fls_u32(uint32_t x)
{
        int r;

        /*
         * bsrl writes the 0-based index of the highest set bit to r.
         * When x is 0 the jnz is not taken and r is forced to -1, so
         * that the function returns 0.
         */
        asm("bsrl %1,%0\n\t"
            "jnz 1f\n\t"
            "movl $-1,%0\n\t"
            "1:\n\t"
            : "=r" (r) : "rm" (x));
        return r + 1;
}
#define HAS_FLS_U32
#endif
 362
#if defined(__x86_64)
/*
 * x86-64 version of fls_u64, built on the bsrq instruction. Defines
 * HAS_FLS_U64 so that the generic C fallback is not compiled.
 *
 * Returns 0 when x is 0, otherwise the 1-based position of the most
 * significant set bit (1 to 64).
 */
static inline
unsigned int fls_u64(uint64_t x)
{
        long r;

        /*
         * bsrq writes the 0-based index of the highest set bit to r.
         * When x is 0 the jnz is not taken and r is forced to -1, so
         * that the function returns 0.
         */
        asm("bsrq %1,%0\n\t"
            "jnz 1f\n\t"
            "movq $-1,%0\n\t"
            "1:\n\t"
            : "=r" (r) : "rm" (x));
        return r + 1;
}
#define HAS_FLS_U64
#endif
 378
 379 #ifndef HAS_FLS_U64
 380 static __attribute__((unused))
 381 unsigned int fls_u64(uint64_t x)
 382 {
 383         unsigned int r = 64;
 384
 385         if (!x)
 386                 return 0;
 387
 388         if (!(x & 0xFFFFFFFF00000000ULL)) {
 389                 x <<= 32;
 390                 r -= 32;
 391         }
 392         if (!(x & 0xFFFF000000000000ULL)) {
 393                 x <<= 16;
 394                 r -= 16;
 395         }
 396         if (!(x & 0xFF00000000000000ULL)) {
 397                 x <<= 8;
 398                 r -= 8;
 399         }
 400         if (!(x & 0xF000000000000000ULL)) {
 401                 x <<= 4;
 402                 r -= 4;
 403         }
 404         if (!(x & 0xC000000000000000ULL)) {
 405                 x <<= 2;
 406                 r -= 2;
 407         }
 408         if (!(x & 0x8000000000000000ULL)) {
 409                 x <<= 1;
 410                 r -= 1;
 411         }
 412         return r;
 413 }
 414 #endif
 415
 416 #ifndef HAS_FLS_U32
 417 static __attribute__((unused))
 418 unsigned int fls_u32(uint32_t x)
 419 {
 420         unsigned int r = 32;
 421
 422         if (!x)
 423                 return 0;
 424         if (!(x & 0xFFFF0000U)) {
 425                 x <<= 16;
 426                 r -= 16;
 427         }
 428         if (!(x & 0xFF000000U)) {
 429                 x <<= 8;
 430                 r -= 8;
 431         }
 432         if (!(x & 0xF0000000U)) {
 433                 x <<= 4;
 434                 r -= 4;
 435         }
 436         if (!(x & 0xC0000000U)) {
 437                 x <<= 2;
 438                 r -= 2;
 439         }
 440         if (!(x & 0x80000000U)) {
 441                 x <<= 1;
 442                 r -= 1;
 443         }
 444         return r;
 445 }
 446 #endif
 447
 448 unsigned int fls_ulong(unsigned long x)
 449 {
 450 #if (CAA_BITS_PER_lONG == 32)
 451         return fls_u32(x);
 452 #else
 453         return fls_u64(x);
 454 #endif
 455 }
 456
 457 int get_count_order_u32(uint32_t x)
 458 {
 459         int order;
 460
 461         order = fls_u32(x) - 1;
 462         if (x & (x - 1))
 463                 order++;
 464         return order;
 465 }
 466
 467 int get_count_order_ulong(unsigned long x)
 468 {
 469         int order;
 470
 471         order = fls_ulong(x) - 1;
 472         if (x & (x - 1))
 473                 order++;
 474         return order;
 475 }
 476
/*
 * poison_free: free() wrapper. When POISON_FREE is defined, the first
 * sizeof(*(ptr)) bytes of the object are overwritten with 0x42 before
 * the free; only the size of the pointed-to type is poisoned, not the
 * whole allocation. In that configuration "ptr" is expanded more than
 * once and must not be NULL.
 */
#ifdef POISON_FREE
#define poison_free(ptr)                                \
        do {                                            \
                memset(ptr, 0x42, sizeof(*(ptr)));      \
                free(ptr);                              \
        } while (0)
#else
#define poison_free(ptr)        free(ptr)
#endif
 486
/*
 * Forward declaration; called by check_resize() with the current table
 * size and a growth order derived from the observed chain length.
 * NOTE(review): the definition is outside this view.
 */
static
void cds_lfht_resize_lazy(struct cds_lfht *ht, unsigned long size, int growth);
 489
 490 /*
 491  * If the sched_getcpu() and sysconf(_SC_NPROCESSORS_CONF) calls are
 492  * available, then we support hash table item accounting.
 493  * In the unfortunate event the number of CPUs reported would be
 494  * inaccurate, we use modulo arithmetic on the number of CPUs we got.
 495  */
 496 #if defined(HAVE_SCHED_GETCPU) && defined(HAVE_SYSCONF)
 497
/*
 * Forward declaration; called by ht_count_add() and ht_count_del() with
 * the current table size and a target derived from the global count.
 * NOTE(review): the definition is outside this view.
 */
static
void cds_lfht_resize_lazy_count(struct cds_lfht *ht, unsigned long size,
                                unsigned long count);
 501
/*
 * State of the per-cpu counter sizing, set by alloc_per_cpu_items_count():
 *   -1: not initialized yet,
 *   -2: the number of CPUs could not be obtained,
 *  >=0: number of CPUs rounded up to a power of two, minus one.
 */
static long nr_cpus_mask = -1;
 503
 504 static
 505 struct ht_items_count *alloc_per_cpu_items_count(void)
 506 {
 507         struct ht_items_count *count;
 508
 509         switch (nr_cpus_mask) {
 510         case -2:
 511                 return NULL;
 512         case -1:
 513         {
 514                 long maxcpus;
 515
 516                 maxcpus = sysconf(_SC_NPROCESSORS_CONF);
 517                 if (maxcpus <= 0) {
 518                         nr_cpus_mask = -2;
 519                         return NULL;
 520                 }
 521                 /*
 522                  * round up number of CPUs to next power of two, so we
 523                  * can use & for modulo.
 524                  */
 525                 maxcpus = 1UL << get_count_order_ulong(maxcpus);
 526                 nr_cpus_mask = maxcpus - 1;
 527         }
 528                 /* Fall-through */
 529         default:
 530                 return calloc(nr_cpus_mask + 1, sizeof(*count));
 531         }
 532 }
 533
/*
 * Release the per-cpu counter array through poison_free().
 */
static
void free_per_cpu_items_count(struct ht_items_count *count)
{
        poison_free(count);
}
 539
 540 static
 541 int ht_get_cpu(void)
 542 {
 543         int cpu;
 544
 545         assert(nr_cpus_mask >= 0);
 546         cpu = sched_getcpu();
 547         if (unlikely(cpu < 0))
 548                 return cpu;
 549         else
 550                 return cpu & nr_cpus_mask;
 551 }
 552
 553 static
 554 void ht_count_add(struct cds_lfht *ht, unsigned long size)
 555 {
 556         unsigned long percpu_count;
 557         int cpu;
 558
 559         if (unlikely(!ht->percpu_count))
 560                 return;
 561         cpu = ht_get_cpu();
 562         if (unlikely(cpu < 0))
 563                 return;
 564         percpu_count = uatomic_add_return(&ht->percpu_count[cpu].add, 1);
 565         if (unlikely(!(percpu_count & ((1UL << COUNT_COMMIT_ORDER) - 1)))) {
 566                 unsigned long count;
 567
 568                 dbg_printf("add percpu %lu\n", percpu_count);
 569                 count = uatomic_add_return(&ht->count,
 570                                            1UL << COUNT_COMMIT_ORDER);
 571                 /* If power of 2 */
 572                 if (!(count & (count - 1))) {
 573                         if ((count >> CHAIN_LEN_RESIZE_THRESHOLD) < size)
 574                                 return;
 575                         dbg_printf("add set global %lu\n", count);
 576                         cds_lfht_resize_lazy_count(ht, size,
 577                                 count >> (CHAIN_LEN_TARGET - 1));
 578                 }
 579         }
 580 }
 581
 582 static
 583 void ht_count_del(struct cds_lfht *ht, unsigned long size)
 584 {
 585         unsigned long percpu_count;
 586         int cpu;
 587
 588         if (unlikely(!ht->percpu_count))
 589                 return;
 590         cpu = ht_get_cpu();
 591         if (unlikely(cpu < 0))
 592                 return;
 593         percpu_count = uatomic_add_return(&ht->percpu_count[cpu].del, -1);
 594         if (unlikely(!(percpu_count & ((1UL << COUNT_COMMIT_ORDER) - 1)))) {
 595                 unsigned long count;
 596
 597                 dbg_printf("del percpu %lu\n", percpu_count);
 598                 count = uatomic_add_return(&ht->count,
 599                                            -(1UL << COUNT_COMMIT_ORDER));
 600                 /* If power of 2 */
 601                 if (!(count & (count - 1))) {
 602                         if ((count >> CHAIN_LEN_RESIZE_THRESHOLD) >= size)
 603                                 return;
 604                         dbg_printf("del set global %lu\n", count);
 605                         cds_lfht_resize_lazy_count(ht, size,
 606                                 count >> (CHAIN_LEN_TARGET - 1));
 607                 }
 608         }
 609 }
 610
 611 #else /* #if defined(HAVE_SCHED_GETCPU) && defined(HAVE_SYSCONF) */
 612
/* Per-cpu accounting is compiled out: the mask stays at -1 forever. */
static const long nr_cpus_mask = -1;
 614
/*
 * Stub used when sched_getcpu()/sysconf() are not both available: no
 * per-cpu counters are allocated.
 */
static
struct ht_items_count *alloc_per_cpu_items_count(void)
{
        return NULL;
}
 620
/* Stub: nothing was allocated, so there is nothing to free. */
static
void free_per_cpu_items_count(struct ht_items_count *count)
{
}
 625
/* Stub: item additions are not accounted without per-cpu support. */
static
void ht_count_add(struct cds_lfht *ht, unsigned long size)
{
}
 630
/* Stub: item removals are not accounted without per-cpu support. */
static
void ht_count_del(struct cds_lfht *ht, unsigned long size)
{
}
 635
 636 #endif /* #else #if defined(HAVE_SCHED_GETCPU) && defined(HAVE_SYSCONF) */
 637
 638
 639 static
 640 void check_resize(struct cds_lfht *ht, unsigned long size, uint32_t chain_len)
 641 {
 642         unsigned long count;
 643
 644         if (!(ht->flags & CDS_LFHT_AUTO_RESIZE))
 645                 return;
 646         count = uatomic_read(&ht->count);
 647         /*
 648          * Use bucket-local length for small table expand and for
 649          * environments lacking per-cpu data support.
 650          */
 651         if (count >= (1UL << COUNT_COMMIT_ORDER))
 652                 return;
 653         if (chain_len > 100)
 654                 dbg_printf("WARNING: large chain length: %u.\n",
 655                            chain_len);
 656         if (chain_len >= CHAIN_LEN_RESIZE_THRESHOLD)
 657                 cds_lfht_resize_lazy(ht, size,
 658                         get_count_order_u32(chain_len - (CHAIN_LEN_TARGET - 1)));
 659 }
 660
 661 static
 662 struct cds_lfht_node *clear_flag(struct cds_lfht_node *node)
 663 {
 664         return (struct cds_lfht_node *) (((unsigned long) node) & ~FLAGS_MASK);
 665 }
 666
 667 static
 668 int is_removed(struct cds_lfht_node *node)
 669 {
 670         return ((unsigned long) node) & REMOVED_FLAG;
 671 }
 672
 673 static
 674 struct cds_lfht_node *flag_removed(struct cds_lfht_node *node)
 675 {
 676         return (struct cds_lfht_node *) (((unsigned long) node) | REMOVED_FLAG);
 677 }
 678
 679 static
 680 int is_gc(struct cds_lfht_node *node)
 681 {
 682         return ((unsigned long) node) & GC_FLAG;
 683 }
 684
 685 static
 686 struct cds_lfht_node *flag_gc(struct cds_lfht_node *node)
 687 {
 688         return (struct cds_lfht_node *) (((unsigned long) node) | GC_FLAG);
 689 }
 690
 691 static
 692 int is_dummy(struct cds_lfht_node *node)
 693 {
 694         return ((unsigned long) node) & DUMMY_FLAG;
 695 }
 696
 697 static
 698 struct cds_lfht_node *flag_dummy(struct cds_lfht_node *node)
 699 {
 700         return (struct cds_lfht_node *) (((unsigned long) node) | DUMMY_FLAG);
 701 }
 702
/*
 * Return the end-of-list marker (END_VALUE) as a node pointer.
 */
static
struct cds_lfht_node *get_end(void)
{
        return (struct cds_lfht_node *) END_VALUE;
}
 708
 709 static
 710 int is_end(struct cds_lfht_node *node)
 711 {
 712         return clear_flag(node) == (struct cds_lfht_node *) END_VALUE;
 713 }
 714
 715 static
 716 unsigned long _uatomic_max(unsigned long *ptr, unsigned long v)
 717 {
 718         unsigned long old1, old2;
 719
 720         old1 = uatomic_read(ptr);
 721         do {
 722                 old2 = old1;
 723                 if (old2 >= v)
 724                         return old2;
 725         } while ((old1 = uatomic_cmpxchg(ptr, old2, v)) != old2);
 726         return v;
 727 }
 728
/*
 * Callback taking an rcu_head: recover the struct rcu_level that embeds
 * "head" and free it through poison_free().
 */
static
void cds_lfht_free_level(struct rcu_head *head)
{
        struct rcu_level *l =
                caa_container_of(head, struct rcu_level, head);
        poison_free(l);
}
 736
/*
 * Remove all logically deleted nodes from a bucket up to a certain node key.
 *
 * dummy: node the scan starts from (start of the chain); must be passed
 *        without any flag bit set.
 * node:  node bounding the scan; the scan stops at the first node whose
 *        reverse_hash is greater than node's. Must be passed without
 *        any flag bit set, and must differ from dummy.
 *
 * Each time a node whose next pointer carries the gc flag is found, one
 * cmpxchg attempts to make its predecessor point past it, then the scan
 * restarts from dummy whether or not the cmpxchg succeeded. The function
 * returns once a scan reaches the end of the list, or a node beyond the
 * bound, without meeting a gc-flagged node.
 */
static
void _cds_lfht_gc_bucket(struct cds_lfht_node *dummy, struct cds_lfht_node *node)
{
        struct cds_lfht_node *iter_prev, *iter, *next, *new_next;

        assert(!is_dummy(dummy));
        assert(!is_gc(dummy));
        assert(!is_removed(dummy));
        assert(!is_dummy(node));
        assert(!is_gc(node));
        assert(!is_removed(node));
        for (;;) {
                iter_prev = dummy;
                /* We can always skip the dummy node initially */
                iter = rcu_dereference(iter_prev->p.next);
                assert(iter_prev->p.reverse_hash <= node->p.reverse_hash);
                /*
                 * We should never be called with dummy (start of chain)
                 * and logically removed node (end of path compression
                 * marker) being the actual same node. This would be a
                 * bug in the algorithm implementation.
                 */
                assert(dummy != node);
                for (;;) {
                        if (unlikely(is_end(iter)))
                                return;
                        if (likely(clear_flag(iter)->p.reverse_hash > node->p.reverse_hash))
                                return;
                        next = rcu_dereference(clear_flag(iter)->p.next);
                        /* iter is to be unlinked: leave the scan loop. */
                        if (likely(is_gc(next)))
                                break;
                        iter_prev = clear_flag(iter);
                        iter = next;
                }
                assert(!is_gc(iter));
                /*
                 * Build the replacement link: the successor of iter,
                 * stripped of its own flags, carrying the dummy and
                 * removed flags that the replaced link (iter) had.
                 */
                if (is_dummy(iter))
                        new_next = flag_dummy(clear_flag(next));
                else
                        new_next = clear_flag(next);
                if (is_removed(iter))
                        new_next = flag_removed(new_next);
                /* Result ignored: the outer loop rescans from dummy. */
                (void) uatomic_cmpxchg(&iter_prev->p.next, iter, new_next);
        }
        /* Not reached: the loop above is only left through "return". */
        return;
}
 785
 786 static
 787 struct cds_lfht_node *_cds_lfht_add(struct cds_lfht *ht,
 788                                 unsigned long size,
 789                                 struct cds_lfht_node *node,
 790                                 enum add_mode mode, int dummy)
 791 {
 792         struct cds_lfht_node *iter_prev, *iter, *next, *new_node, *new_next,
 793                         *dummy_node, *return_node;
 794         struct _cds_lfht_node *lookup;
 795         unsigned long hash, index, order;
 796
 797         assert(!is_dummy(node));
 798         assert(!is_gc(node));
 799         assert(!is_removed(node));
 800         if (!size) {
 801                 assert(dummy);
 802                 node->p.next = flag_dummy(get_end());
 803                 return node;    /* Initial first add (head) */
 804         }
 805         hash = bit_reverse_ulong(node->p.reverse_hash);
 806         for (;;) {
 807                 uint32_t chain_len = 0;
 808
 809                 /*
 810                  * iter_prev points to the non-removed node prior to the
 811                  * insert location.
 812                  */
 813                 index = hash & (size - 1);
 814                 order = get_count_order_ulong(index + 1);
 815                 lookup = &ht->t.tbl[order]->nodes[index & ((!order ? 0 : (1UL << (order - 1))) - 1)];
 816                 iter_prev = (struct cds_lfht_node *) lookup;
 817                 /* We can always skip the dummy node initially */
 818                 iter = rcu_dereference(iter_prev->p.next);
 819                 assert(iter_prev->p.reverse_hash <= node->p.reverse_hash);
 820                 for (;;) {
 821                         if (unlikely(is_end(iter)))
 822                                 goto insert;
 823                         if (likely(clear_flag(iter)->p.reverse_hash > node->p.reverse_hash))
 824                                 goto insert;
 825                         next = rcu_dereference(clear_flag(iter)->p.next);
 826                         if (unlikely(is_gc(next)))
 827                                 goto gc_node;
 828                         assert(!is_removed(next));
 829                         if ((mode == ADD_UNIQUE || mode == ADD_REPLACE)
 830                             && !is_dummy(next)
 831                             && !ht->compare_fct(node->key, node->key_len,
 832                                                 clear_flag(iter)->key,
 833                                                 clear_flag(iter)->key_len)) {
 834                                 if (mode == ADD_UNIQUE)
 835                                         return clear_flag(iter);
 836                                 else /* mode == ADD_REPLACE */
 837                                         goto replace;
 838                         }
 839                         /* Only account for identical reverse hash once */
 840                         if (iter_prev->p.reverse_hash != clear_flag(iter)->p.reverse_hash
 841                             && !is_dummy(next))
 842                                 check_resize(ht, size, ++chain_len);
 843                         iter_prev = clear_flag(iter);
 844                         iter = next;
 845                 }
 846
 847         insert:
 848                 assert(node != clear_flag(iter));
 849                 assert(!is_removed(iter_prev));
 850                 assert(!is_removed(iter));
 851                 assert(!is_gc(iter_prev));
 852                 assert(!is_gc(iter));
 853                 assert(iter_prev != node);
 854                 if (!dummy)
 855                         node->p.next = clear_flag(iter);
 856                 else
 857                         node->p.next = flag_dummy(clear_flag(iter));
 858                 if (is_dummy(iter))
 859                         new_node = flag_dummy(node);
 860                 else
 861                         new_node = node;
 862                 if (uatomic_cmpxchg(&iter_prev->p.next, iter,
 863                                     new_node) != iter) {
 864                         continue;       /* retry */
 865                 } else {
 866                         if (mode == ADD_REPLACE)
 867                                 return_node = NULL;
 868                         else    /* ADD_DEFAULT and ADD_UNIQUE */
 869                                 return_node = node;
 870                         goto gc_end;
 871                 }
 872
 873         replace:
 874                 /* Insert after node to be replaced */
 875                 iter_prev = clear_flag(iter);
 876                 iter = next;
 877                 assert(node != clear_flag(iter));
 878                 assert(!is_removed(iter_prev));
 879                 assert(!is_removed(iter));
 880                 assert(!is_gc(iter_prev));
 881                 assert(!is_gc(iter));
 882                 assert(iter_prev != node);
 883                 assert(!dummy);
 884                 node->p.next = clear_flag(iter);
 885                 if (is_dummy(iter))
 886                         new_node = flag_dummy(node);
 887                 else
 888                         new_node = node;
 889                 /*
 890                  * Here is the whole trick for lock-free replace: we add
 891                  * the replacement node _after_ the node we want to
 892                  * replace by atomically setting its next pointer at the
 893                  * same time we set its removal and gc flags. Given that
 894                  * the lookups/get next use an iterator aware of the
 895                  * next pointer, they will either skip the old node due
 896                  * to the removal/gc flag and see the new node, or use
 897                  * the old one, but will not see the new one.
 898                  */
 899                 new_node = flag_removed(new_node);
 900                 new_node = flag_gc(new_node);
 901                 if (uatomic_cmpxchg(&iter_prev->p.next,
 902                               iter, new_node) != iter) {
 903                         continue;       /* retry */
 904                 } else {
 905                         return_node = iter_prev;
 906                         goto gc_end;
 907                 }
 908
 909         gc_node:
 910                 assert(!is_removed(iter));
 911                 assert(!is_gc(iter));
 912                 if (is_dummy(iter))
 913                         new_next = flag_dummy(clear_flag(next));
 914                 else
 915                         new_next = clear_flag(next);
 916                 (void) uatomic_cmpxchg(&iter_prev->p.next, iter, new_next);
 917                 /* retry */
 918         }
 919 gc_end:
 920         /* Garbage collect logically removed nodes in the bucket */
 921         index = hash & (size - 1);
 922         order = get_count_order_ulong(index + 1);
 923         lookup = &ht->t.tbl[order]->nodes[index & (!order ? 0 : ((1UL << (order - 1)) - 1))];
 924         dummy_node = (struct cds_lfht_node *) lookup;
 925         _cds_lfht_gc_bucket(dummy_node, node);
 926         return return_node;
 927 }
 928
 929 static
 930 int _cds_lfht_del(struct cds_lfht *ht, unsigned long size,
 931                 struct cds_lfht_node *node,
 932                 int dummy_removal, int do_gc)
 933 {
 934         struct cds_lfht_node *dummy, *next, *old;
 935         struct _cds_lfht_node *lookup;
 936         int flagged = 0;
 937         unsigned long hash, index, order;
 938
 939         /* logically delete the node */
 940         assert(!is_dummy(node));
 941         assert(!is_gc(node));
 942         assert(!is_removed(node));
 943         old = rcu_dereference(node->p.next);
 944         do {
 945                 struct cds_lfht_node *new_next;
 946
 947                 next = old;
 948                 if (unlikely(is_removed(next)))
 949                         goto end;
 950                 if (dummy_removal)
 951                         assert(is_dummy(next));
 952                 else
 953                         assert(!is_dummy(next));
 954                 new_next = flag_removed(next);
 955                 if (do_gc)
 956                         new_next = flag_gc(new_next);
 957                 old = uatomic_cmpxchg(&node->p.next, next, new_next);
 958         } while (old != next);
 959
 960         /* We performed the (logical) deletion. */
 961         flagged = 1;
 962
 963         if (!do_gc)
 964                 goto end;
 965
 966         /*
 967          * Ensure that the node is not visible to readers anymore: lookup for
 968          * the node, and remove it (along with any other logically removed node)
 969          * if found.
 970          */
 971         hash = bit_reverse_ulong(node->p.reverse_hash);
 972         assert(size > 0);
 973         index = hash & (size - 1);
 974         order = get_count_order_ulong(index + 1);
 975         lookup = &ht->t.tbl[order]->nodes[index & (!order ? 0 : ((1UL << (order - 1)) - 1))];
 976         dummy = (struct cds_lfht_node *) lookup;
 977         _cds_lfht_gc_bucket(dummy, node);
 978 end:
 979         /*
 980          * Only the flagging action indicated that we (and no other)
 981          * removed the node from the hash.
 982          */
 983         if (flagged) {
 984                 assert(is_removed(rcu_dereference(node->p.next)));
 985                 return 0;
 986         } else
 987                 return -ENOENT;
 988 }
 989
 990 static
 991 void *partition_resize_thread(void *arg)
 992 {
 993         struct partition_resize_work *work = arg;
 994
 995         work->ht->cds_lfht_rcu_register_thread();
 996         work->fct(work->ht, work->i, work->start, work->len);
 997         work->ht->cds_lfht_rcu_unregister_thread();
 998         return NULL;
 999 }
1000
1001 static
1002 void partition_resize_helper(struct cds_lfht *ht, unsigned long i,
1003                 unsigned long len,
1004                 void (*fct)(struct cds_lfht *ht, unsigned long i,
1005                         unsigned long start, unsigned long len))
1006 {
1007         unsigned long partition_len;
1008         struct partition_resize_work *work;
1009         int thread, ret;
1010         unsigned long nr_threads;
1011         pthread_t *thread_id;
1012
1013         /*
1014          * Note: nr_cpus_mask + 1 is always power of 2.
1015          * We spawn just the number of threads we need to satisfy the minimum
1016          * partition size, up to the number of CPUs in the system.
1017          */
1018         nr_threads = min(nr_cpus_mask + 1,
1019                          len >> MIN_PARTITION_PER_THREAD_ORDER);
1020         partition_len = len >> get_count_order_ulong(nr_threads);
1021         work = calloc(nr_threads, sizeof(*work));
1022         thread_id = calloc(nr_threads, sizeof(*thread_id));
1023         assert(work);
1024         for (thread = 0; thread < nr_threads; thread++) {
1025                 work[thread].ht = ht;
1026                 work[thread].i = i;
1027                 work[thread].len = partition_len;
1028                 work[thread].start = thread * partition_len;
1029                 work[thread].fct = fct;
1030                 ret = pthread_create(&thread_id[thread], ht->resize_attr,
1031                         partition_resize_thread, &work[thread]);
1032                 assert(!ret);
1033         }
1034         for (thread = 0; thread < nr_threads; thread++) {
1035                 ret = pthread_join(thread_id[thread], NULL);
1036                 assert(!ret);
1037         }
1038         free(work);
1039         free(thread_id);
1040 }
1041
1042 /*
1043  * Holding RCU read lock to protect _cds_lfht_add against memory
1044  * reclaim that could be performed by other call_rcu worker threads (ABA
1045  * problem).
1046  *
1047  * When we reach a certain length, we can split this population phase over
1048  * many worker threads, based on the number of CPUs available in the system.
1049  * This should therefore take care of not having the expand lagging behind too
1050  * many concurrent insertion threads by using the scheduler's ability to
1051  * schedule dummy node population fairly with insertions.
1052  */
1053 static
1054 void init_table_populate_partition(struct cds_lfht *ht, unsigned long i,
1055                                    unsigned long start, unsigned long len)
1056 {
1057         unsigned long j;
1058
1059         ht->cds_lfht_rcu_read_lock();
1060         for (j = start; j < start + len; j++) {
1061                 struct cds_lfht_node *new_node =
1062                         (struct cds_lfht_node *) &ht->t.tbl[i]->nodes[j];
1063
1064                 dbg_printf("init populate: i %lu j %lu hash %lu\n",
1065                            i, j, !i ? 0 : (1UL << (i - 1)) + j);
1066                 new_node->p.reverse_hash =
1067                         bit_reverse_ulong(!i ? 0 : (1UL << (i - 1)) + j);
1068                 (void) _cds_lfht_add(ht, !i ? 0 : (1UL << (i - 1)),
1069                                 new_node, ADD_DEFAULT, 1);
1070                 if (CMM_LOAD_SHARED(ht->in_progress_destroy))
1071                         break;
1072         }
1073         ht->cds_lfht_rcu_read_unlock();
1074 }
1075
1076 static
1077 void init_table_populate(struct cds_lfht *ht, unsigned long i,
1078                          unsigned long len)
1079 {
1080         assert(nr_cpus_mask != -1);
1081         if (nr_cpus_mask < 0 || len < 2 * MIN_PARTITION_PER_THREAD) {
1082                 ht->cds_lfht_rcu_thread_online();
1083                 init_table_populate_partition(ht, i, 0, len);
1084                 ht->cds_lfht_rcu_thread_offline();
1085                 return;
1086         }
1087         partition_resize_helper(ht, i, len, init_table_populate_partition);
1088 }
1089
1090 static
1091 void init_table(struct cds_lfht *ht,
1092                 unsigned long first_order, unsigned long len_order)
1093 {
1094         unsigned long i, end_order;
1095
1096         dbg_printf("init table: first_order %lu end_order %lu\n",
1097                    first_order, first_order + len_order);
1098         end_order = first_order + len_order;
1099         for (i = first_order; i < end_order; i++) {
1100                 unsigned long len;
1101
1102                 len = !i ? 1 : 1UL << (i - 1);
1103                 dbg_printf("init order %lu len: %lu\n", i, len);
1104
1105                 /* Stop expand if the resize target changes under us */
1106                 if (CMM_LOAD_SHARED(ht->t.resize_target) < (!i ? 1 : (1UL << i)))
1107                         break;
1108
1109                 ht->t.tbl[i] = calloc(1, sizeof(struct rcu_level)
1110                                 + (len * sizeof(struct _cds_lfht_node)));
1111                 assert(ht->t.tbl[i]);
1112
1113                 /*
1114                  * Set all dummy nodes reverse hash values for a level and
1115                  * link all dummy nodes into the table.
1116                  */
1117                 init_table_populate(ht, i, len);
1118
1119                 /*
1120                  * Update table size.
1121                  */
1122                 cmm_smp_wmb();  /* populate data before RCU size */
1123                 CMM_STORE_SHARED(ht->t.size, !i ? 1 : (1UL << i));
1124
1125                 dbg_printf("init new size: %lu\n", !i ? 1 : (1UL << i));
1126                 if (CMM_LOAD_SHARED(ht->in_progress_destroy))
1127                         break;
1128         }
1129 }
1130
1131 /*
1132  * Holding RCU read lock to protect _cds_lfht_remove against memory
1133  * reclaim that could be performed by other call_rcu worker threads (ABA
1134  * problem).
1135  * For a single level, we logically remove and garbage collect each node.
1136  *
1137  * As a design choice, we perform logical removal and garbage collection on a
1138  * node-per-node basis to simplify this algorithm. We also assume keeping good
1139  * cache locality of the operation would overweight possible performance gain
1140  * that could be achieved by batching garbage collection for multiple levels.
1141  * However, this would have to be justified by benchmarks.
1142  *
1143  * Concurrent removal and add operations are helping us perform garbage
1144  * collection of logically removed nodes. We guarantee that all logically
1145  * removed nodes have been garbage-collected (unlinked) before call_rcu is
1146  * invoked to free a hole level of dummy nodes (after a grace period).
1147  *
1148  * Logical removal and garbage collection can therefore be done in batch or on a
1149  * node-per-node basis, as long as the guarantee above holds.
1150  *
1151  * When we reach a certain length, we can split this removal over many worker
1152  * threads, based on the number of CPUs available in the system. This should
1153  * take care of not letting resize process lag behind too many concurrent
1154  * updater threads actively inserting into the hash table.
1155  */
1156 static
1157 void remove_table_partition(struct cds_lfht *ht, unsigned long i,
1158                             unsigned long start, unsigned long len)
1159 {
1160         unsigned long j;
1161
1162         ht->cds_lfht_rcu_read_lock();
1163         for (j = start; j < start + len; j++) {
1164                 struct cds_lfht_node *fini_node =
1165                         (struct cds_lfht_node *) &ht->t.tbl[i]->nodes[j];
1166
1167                 dbg_printf("remove entry: i %lu j %lu hash %lu\n",
1168                            i, j, !i ? 0 : (1UL << (i - 1)) + j);
1169                 fini_node->p.reverse_hash =
1170                         bit_reverse_ulong(!i ? 0 : (1UL << (i - 1)) + j);
1171                 (void) _cds_lfht_del(ht, !i ? 0 : (1UL << (i - 1)),
1172                                 fini_node, 1, 1);
1173                 if (CMM_LOAD_SHARED(ht->in_progress_destroy))
1174                         break;
1175         }
1176         ht->cds_lfht_rcu_read_unlock();
1177 }
1178
1179 static
1180 void remove_table(struct cds_lfht *ht, unsigned long i, unsigned long len)
1181 {
1182
1183         assert(nr_cpus_mask != -1);
1184         if (nr_cpus_mask < 0 || len < 2 * MIN_PARTITION_PER_THREAD) {
1185                 ht->cds_lfht_rcu_thread_online();
1186                 remove_table_partition(ht, i, 0, len);
1187                 ht->cds_lfht_rcu_thread_offline();
1188                 return;
1189         }
1190         partition_resize_helper(ht, i, len, remove_table_partition);
1191 }
1192
1193 static
1194 void fini_table(struct cds_lfht *ht,
1195                 unsigned long first_order, unsigned long len_order)
1196 {
1197         long i, end_order;
1198
1199         dbg_printf("fini table: first_order %lu end_order %lu\n",
1200                    first_order, first_order + len_order);
1201         end_order = first_order + len_order;
1202         assert(first_order > 0);
1203         for (i = end_order - 1; i >= first_order; i--) {
1204                 unsigned long len;
1205
1206                 len = !i ? 1 : 1UL << (i - 1);
1207                 dbg_printf("fini order %lu len: %lu\n", i, len);
1208
1209                 /* Stop shrink if the resize target changes under us */
1210                 if (CMM_LOAD_SHARED(ht->t.resize_target) > (1UL << (i - 1)))
1211                         break;
1212
1213                 cmm_smp_wmb();  /* populate data before RCU size */
1214                 CMM_STORE_SHARED(ht->t.size, 1UL << (i - 1));
1215
1216                 /*
1217                  * We need to wait for all add operations to reach Q.S. (and
1218                  * thus use the new table for lookups) before we can start
1219                  * releasing the old dummy nodes. Otherwise their lookup will
1220                  * return a logically removed node as insert position.
1221                  */
1222                 ht->cds_lfht_synchronize_rcu();
1223
1224                 /*
1225                  * Set "removed" flag in dummy nodes about to be removed.
1226                  * Unlink all now-logically-removed dummy node pointers.
1227                  * Concurrent add/remove operation are helping us doing
1228                  * the gc.
1229                  */
1230                 remove_table(ht, i, len);
1231
1232                 ht->cds_lfht_call_rcu(&ht->t.tbl[i]->head, cds_lfht_free_level);
1233
1234                 dbg_printf("fini new size: %lu\n", 1UL << i);
1235                 if (CMM_LOAD_SHARED(ht->in_progress_destroy))
1236                         break;
1237         }
1238 }
1239
1240 struct cds_lfht *_cds_lfht_new(cds_lfht_hash_fct hash_fct,
1241                         cds_lfht_compare_fct compare_fct,
1242                         unsigned long hash_seed,
1243                         unsigned long init_size,
1244                         int flags,
1245                         void (*cds_lfht_call_rcu)(struct rcu_head *head,
1246                                         void (*func)(struct rcu_head *head)),
1247                         void (*cds_lfht_synchronize_rcu)(void),
1248                         void (*cds_lfht_rcu_read_lock)(void),
1249                         void (*cds_lfht_rcu_read_unlock)(void),
1250                         void (*cds_lfht_rcu_thread_offline)(void),
1251                         void (*cds_lfht_rcu_thread_online)(void),
1252                         void (*cds_lfht_rcu_register_thread)(void),
1253                         void (*cds_lfht_rcu_unregister_thread)(void),
1254                         pthread_attr_t *attr)
1255 {
1256         struct cds_lfht *ht;
1257         unsigned long order;
1258
1259         /* init_size must be power of two */
1260         if (init_size && (init_size & (init_size - 1)))
1261                 return NULL;
1262         ht = calloc(1, sizeof(struct cds_lfht));
1263         assert(ht);
1264         ht->hash_fct = hash_fct;
1265         ht->compare_fct = compare_fct;
1266         ht->hash_seed = hash_seed;
1267         ht->cds_lfht_call_rcu = cds_lfht_call_rcu;
1268         ht->cds_lfht_synchronize_rcu = cds_lfht_synchronize_rcu;
1269         ht->cds_lfht_rcu_read_lock = cds_lfht_rcu_read_lock;
1270         ht->cds_lfht_rcu_read_unlock = cds_lfht_rcu_read_unlock;
1271         ht->cds_lfht_rcu_thread_offline = cds_lfht_rcu_thread_offline;
1272         ht->cds_lfht_rcu_thread_online = cds_lfht_rcu_thread_online;
1273         ht->cds_lfht_rcu_register_thread = cds_lfht_rcu_register_thread;
1274         ht->cds_lfht_rcu_unregister_thread = cds_lfht_rcu_unregister_thread;
1275         ht->resize_attr = attr;
1276         ht->percpu_count = alloc_per_cpu_items_count();
1277         /* this mutex should not nest in read-side C.S. */
1278         pthread_mutex_init(&ht->resize_mutex, NULL);
1279         order = get_count_order_ulong(max(init_size, MIN_TABLE_SIZE)) + 1;
1280         ht->flags = flags;
1281         ht->cds_lfht_rcu_thread_offline();
1282         pthread_mutex_lock(&ht->resize_mutex);
1283         ht->t.resize_target = 1UL << (order - 1);
1284         init_table(ht, 0, order);
1285         pthread_mutex_unlock(&ht->resize_mutex);
1286         ht->cds_lfht_rcu_thread_online();
1287         return ht;
1288 }
1289
1290 void cds_lfht_lookup(struct cds_lfht *ht, void *key, size_t key_len,
1291                 struct cds_lfht_iter *iter)
1292 {
1293         struct cds_lfht_node *node, *next, *dummy_node;
1294         struct _cds_lfht_node *lookup;
1295         unsigned long hash, reverse_hash, index, order, size;
1296
1297         hash = ht->hash_fct(key, key_len, ht->hash_seed);
1298         reverse_hash = bit_reverse_ulong(hash);
1299
1300         size = rcu_dereference(ht->t.size);
1301         index = hash & (size - 1);
1302         order = get_count_order_ulong(index + 1);
1303         lookup = &ht->t.tbl[order]->nodes[index & (!order ? 0 : ((1UL << (order - 1))) - 1)];
1304         dbg_printf("lookup hash %lu index %lu order %lu aridx %lu\n",
1305                    hash, index, order, index & (!order ? 0 : ((1UL << (order - 1)) - 1)));
1306         dummy_node = (struct cds_lfht_node *) lookup;
1307         /* We can always skip the dummy node initially */
1308         node = rcu_dereference(dummy_node->p.next);
1309         node = clear_flag(node);
1310         for (;;) {
1311                 if (unlikely(is_end(node))) {
1312                         node = NULL;
1313                         break;
1314                 }
1315                 if (unlikely(node->p.reverse_hash > reverse_hash)) {
1316                         node = NULL;
1317                         break;
1318                 }
1319                 next = rcu_dereference(node->p.next);
1320                 if (likely(!is_removed(next))
1321                     && !is_dummy(next)
1322                     && likely(!ht->compare_fct(node->key, node->key_len, key, key_len))) {
1323                                 break;
1324                 }
1325                 node = clear_flag(next);
1326         }
1327         assert(!node || !is_dummy(rcu_dereference(node->p.next)));
1328         iter->node = node;
1329         iter->next = next;
1330 }
1331
1332 void cds_lfht_next(struct cds_lfht *ht, struct cds_lfht_iter *iter)
1333 {
1334         struct cds_lfht_node *node, *next;
1335         unsigned long reverse_hash;
1336         void *key;
1337         size_t key_len;
1338
1339         node = iter->node;
1340         reverse_hash = node->p.reverse_hash;
1341         key = node->key;
1342         key_len = node->key_len;
1343         next = iter->next;
1344         node = clear_flag(next);
1345
1346         for (;;) {
1347                 if (unlikely(is_end(node))) {
1348                         node = NULL;
1349                         break;
1350                 }
1351                 if (unlikely(node->p.reverse_hash > reverse_hash)) {
1352                         node = NULL;
1353                         break;
1354                 }
1355                 next = rcu_dereference(node->p.next);
1356                 if (likely(!is_removed(next))
1357                     && !is_dummy(next)
1358                     && likely(!ht->compare_fct(node->key, node->key_len, key, key_len))) {
1359                                 break;
1360                 }
1361                 node = clear_flag(next);
1362         }
1363         assert(!node || !is_dummy(rcu_dereference(node->p.next)));
1364         iter->node = node;
1365         iter->next = next;
1366 }
1367
1368 void cds_lfht_add(struct cds_lfht *ht, struct cds_lfht_node *node)
1369 {
1370         unsigned long hash, size;
1371
1372         hash = ht->hash_fct(node->key, node->key_len, ht->hash_seed);
1373         node->p.reverse_hash = bit_reverse_ulong((unsigned long) hash);
1374
1375         size = rcu_dereference(ht->t.size);
1376         (void) _cds_lfht_add(ht, size, node, ADD_DEFAULT, 0);
1377         ht_count_add(ht, size);
1378 }
1379
1380 struct cds_lfht_node *cds_lfht_add_unique(struct cds_lfht *ht,
1381                                 struct cds_lfht_node *node)
1382 {
1383         unsigned long hash, size;
1384         struct cds_lfht_node *ret;
1385
1386         hash = ht->hash_fct(node->key, node->key_len, ht->hash_seed);
1387         node->p.reverse_hash = bit_reverse_ulong((unsigned long) hash);
1388
1389         size = rcu_dereference(ht->t.size);
1390         ret = _cds_lfht_add(ht, size, node, ADD_UNIQUE, 0);
1391         if (ret == node)
1392                 ht_count_add(ht, size);
1393         return ret;
1394 }
1395
1396 struct cds_lfht_node *cds_lfht_replace(struct cds_lfht *ht,
1397                                 struct cds_lfht_node *node)
1398 {
1399         unsigned long hash, size;
1400         struct cds_lfht_node *ret;
1401
1402         hash = ht->hash_fct(node->key, node->key_len, ht->hash_seed);
1403         node->p.reverse_hash = bit_reverse_ulong((unsigned long) hash);
1404
1405         size = rcu_dereference(ht->t.size);
1406         ret = _cds_lfht_add(ht, size, node, ADD_REPLACE, 0);
1407         if (ret == NULL)
1408                 ht_count_add(ht, size);
1409         return ret;
1410 }
1411
1412 int cds_lfht_del(struct cds_lfht *ht, struct cds_lfht_node *node)
1413 {
1414         unsigned long size;
1415         int ret;
1416
1417         size = rcu_dereference(ht->t.size);
1418         ret = _cds_lfht_del(ht, size, node, 0, 1);
1419         if (!ret)
1420                 ht_count_del(ht, size);
1421         return ret;
1422 }
1423
1424 static
1425 int cds_lfht_delete_dummy(struct cds_lfht *ht)
1426 {
1427         struct cds_lfht_node *node;
1428         struct _cds_lfht_node *lookup;
1429         unsigned long order, i, size;
1430
1431         /* Check that the table is empty */
1432         lookup = &ht->t.tbl[0]->nodes[0];
1433         node = (struct cds_lfht_node *) lookup;
1434         do {
1435                 node = clear_flag(node)->p.next;
1436                 if (!is_dummy(node))
1437                         return -EPERM;
1438                 assert(!is_removed(node));
1439                 assert(!is_gc(node));
1440         } while (!is_end(node));
1441         /*
1442          * size accessed without rcu_dereference because hash table is
1443          * being destroyed.
1444          */
1445         size = ht->t.size;
1446         /* Internal sanity check: all nodes left should be dummy */
1447         for (order = 0; order < get_count_order_ulong(size) + 1; order++) {
1448                 unsigned long len;
1449
1450                 len = !order ? 1 : 1UL << (order - 1);
1451                 for (i = 0; i < len; i++) {
1452                         dbg_printf("delete order %lu i %lu hash %lu\n",
1453                                 order, i,
1454                                 bit_reverse_ulong(ht->t.tbl[order]->nodes[i].reverse_hash));
1455                         assert(is_dummy(ht->t.tbl[order]->nodes[i].next));
1456                 }
1457                 poison_free(ht->t.tbl[order]);
1458         }
1459         return 0;
1460 }
1461
1462 /*
1463  * Should only be called when no more concurrent readers nor writers can
1464  * possibly access the table.
1465  */
1466 int cds_lfht_destroy(struct cds_lfht *ht, pthread_attr_t **attr)
1467 {
1468         int ret;
1469
1470         /* Wait for in-flight resize operations to complete */
1471         CMM_STORE_SHARED(ht->in_progress_destroy, 1);
1472         while (uatomic_read(&ht->in_progress_resize))
1473                 poll(NULL, 0, 100);     /* wait for 100ms */
1474         ret = cds_lfht_delete_dummy(ht);
1475         if (ret)
1476                 return ret;
1477         free_per_cpu_items_count(ht->percpu_count);
1478         if (attr)
1479                 *attr = ht->resize_attr;
1480         poison_free(ht);
1481         return ret;
1482 }
1483
1484 void cds_lfht_count_nodes(struct cds_lfht *ht,
1485                 unsigned long *count,
1486                 unsigned long *removed)
1487 {
1488         struct cds_lfht_node *node, *next;
1489         struct _cds_lfht_node *lookup;
1490         unsigned long nr_dummy = 0;
1491
1492         *count = 0;
1493         *removed = 0;
1494
1495         /* Count non-dummy nodes in the table */
1496         lookup = &ht->t.tbl[0]->nodes[0];
1497         node = (struct cds_lfht_node *) lookup;
1498         do {
1499                 next = rcu_dereference(node->p.next);
1500                 if (is_removed(next) || is_gc(next)) {
1501                         assert(!is_dummy(next));
1502                         (*removed)++;
1503                 } else if (!is_dummy(next))
1504                         (*count)++;
1505                 else
1506                         (nr_dummy)++;
1507                 node = clear_flag(next);
1508         } while (!is_end(node));
1509         dbg_printf("number of dummy nodes: %lu\n", nr_dummy);
1510 }
1511
/*
 * _do_cds_lfht_grow - expand the table from @old_size to @new_size
 * buckets by initialising the missing orders.
 *
 * Called with resize mutex held. @new_size must be greater than
 * @old_size.
 */
static
void _do_cds_lfht_grow(struct cds_lfht *ht,
                unsigned long old_size, unsigned long new_size)
{
        unsigned long old_order, new_order;

        old_order = get_count_order_ulong(old_size) + 1;
        new_order = get_count_order_ulong(new_size) + 1;
        /* Diagnostics go through dbg_printf, like the rest of the file. */
        dbg_printf("resize from %lu (order %lu) to %lu (order %lu) buckets\n",
                   old_size, old_order, new_size, new_order);
        assert(new_size > old_size);
        init_table(ht, old_order, new_order - old_order);
}
1526
1527 /* called with resize mutex held */
1528 static
1529 void _do_cds_lfht_shrink(struct cds_lfht *ht,
1530                 unsigned long old_size, unsigned long new_size)
1531 {
1532         unsigned long old_order, new_order;
1533
1534         new_size = max(new_size, MIN_TABLE_SIZE);
1535         old_order = get_count_order_ulong(old_size) + 1;
1536         new_order = get_count_order_ulong(new_size) + 1;
1537         printf("resize from %lu (order %lu) to %lu (order %lu) buckets\n",
1538                old_size, old_order, new_size, new_order);
1539         assert(new_size < old_size);
1540
1541         /* Remove and unlink all dummy nodes to remove. */
1542         fini_table(ht, new_order, old_order - new_order);
1543 }
1544
1545
1546 /* called with resize mutex held */
1547 static
1548 void _do_cds_lfht_resize(struct cds_lfht *ht)
1549 {
1550         unsigned long new_size, old_size;
1551
1552         /*
1553          * Resize table, re-do if the target size has changed under us.
1554          */
1555         do {
1556                 ht->t.resize_initiated = 1;
1557                 old_size = ht->t.size;
1558                 new_size = CMM_LOAD_SHARED(ht->t.resize_target);
1559                 if (old_size < new_size)
1560                         _do_cds_lfht_grow(ht, old_size, new_size);
1561                 else if (old_size > new_size)
1562                         _do_cds_lfht_shrink(ht, old_size, new_size);
1563                 ht->t.resize_initiated = 0;
1564                 /* write resize_initiated before read resize_target */
1565                 cmm_smp_mb();
1566         } while (ht->t.size != CMM_LOAD_SHARED(ht->t.resize_target));
1567 }
1568
1569 static
1570 unsigned long resize_target_update(struct cds_lfht *ht, unsigned long size,
1571                                    int growth_order)
1572 {
1573         return _uatomic_max(&ht->t.resize_target,
1574                             size << growth_order);
1575 }
1576
1577 static
1578 void resize_target_update_count(struct cds_lfht *ht,
1579                                 unsigned long count)
1580 {
1581         count = max(count, MIN_TABLE_SIZE);
1582         uatomic_set(&ht->t.resize_target, count);
1583 }
1584
1585 void cds_lfht_resize(struct cds_lfht *ht, unsigned long new_size)
1586 {
1587         resize_target_update_count(ht, new_size);
1588         CMM_STORE_SHARED(ht->t.resize_initiated, 1);
1589         ht->cds_lfht_rcu_thread_offline();
1590         pthread_mutex_lock(&ht->resize_mutex);
1591         _do_cds_lfht_resize(ht);
1592         pthread_mutex_unlock(&ht->resize_mutex);
1593         ht->cds_lfht_rcu_thread_online();
1594 }
1595
1596 static
1597 void do_resize_cb(struct rcu_head *head)
1598 {
1599         struct rcu_resize_work *work =
1600                 caa_container_of(head, struct rcu_resize_work, head);
1601         struct cds_lfht *ht = work->ht;
1602
1603         ht->cds_lfht_rcu_thread_offline();
1604         pthread_mutex_lock(&ht->resize_mutex);
1605         _do_cds_lfht_resize(ht);
1606         pthread_mutex_unlock(&ht->resize_mutex);
1607         ht->cds_lfht_rcu_thread_online();
1608         poison_free(work);
1609         cmm_smp_mb();   /* finish resize before decrement */
1610         uatomic_dec(&ht->in_progress_resize);
1611 }
1612
1613 static
1614 void cds_lfht_resize_lazy(struct cds_lfht *ht, unsigned long size, int growth)
1615 {
1616         struct rcu_resize_work *work;
1617         unsigned long target_size;
1618
1619         target_size = resize_target_update(ht, size, growth);
1620         /* Store resize_target before read resize_initiated */
1621         cmm_smp_mb();
1622         if (!CMM_LOAD_SHARED(ht->t.resize_initiated) && size < target_size) {
1623                 uatomic_inc(&ht->in_progress_resize);
1624                 cmm_smp_mb();   /* increment resize count before calling it */
1625                 work = malloc(sizeof(*work));
1626                 work->ht = ht;
1627                 ht->cds_lfht_call_rcu(&work->head, do_resize_cb);
1628                 CMM_STORE_SHARED(ht->t.resize_initiated, 1);
1629         }
1630 }
1631
1632 #if defined(HAVE_SCHED_GETCPU) && defined(HAVE_SYSCONF)
1633
1634 static
1635 void cds_lfht_resize_lazy_count(struct cds_lfht *ht, unsigned long size,
1636                                 unsigned long count)
1637 {
1638         struct rcu_resize_work *work;
1639
1640         if (!(ht->flags & CDS_LFHT_AUTO_RESIZE))
1641                 return;
1642         resize_target_update_count(ht, count);
1643         /* Store resize_target before read resize_initiated */
1644         cmm_smp_mb();
1645         if (!CMM_LOAD_SHARED(ht->t.resize_initiated)) {
1646                 uatomic_inc(&ht->in_progress_resize);
1647                 cmm_smp_mb();   /* increment resize count before calling it */
1648                 work = malloc(sizeof(*work));
1649                 work->ht = ht;
1650                 ht->cds_lfht_call_rcu(&work->head, do_resize_cb);
1651                 CMM_STORE_SHARED(ht->t.resize_initiated, 1);
1652         }
1653 }
1654
1655 #endif