* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
*/
+/*
+ * Based on the following articles:
+ * - Ori Shalev and Nir Shavit. Split-ordered lists: Lock-free
+ * extensible hash tables. J. ACM 53, 3 (May 2006), 379-405.
+ * - Maged M. Michael. High performance dynamic lock-free hash tables
+ * and list-based sets. In Proceedings of the fourteenth annual ACM
+ * symposium on Parallel Algorithms and Architectures, ACM Press,
+ * (2002), 73-82.
+ *
+ * Specific characteristics of this Lock-Free Expandable RCU Hash Table
+ * implementation:
+ *
+ * - An RCU read-side critical section allows readers to perform hash
+ * table lookups and use the returned objects safely, by delaying
+ * memory reclamation for a grace period after removal.
+ * - Add and remove operations are lock-free, and do not need to
+ * allocate memory. They need to be executed within an RCU read-side
+ * critical section to ensure the objects they read are valid and to
+ * deal with the cmpxchg ABA problem.
+ * - add and add_unique operations are supported. add_unique checks
+ * whether the node's key already exists in the hash table, ensuring
+ * that no duplicate key exists.
+ * - The resize operation executes concurrently with add/remove/lookup.
+ * - Hash table nodes are contained within a split-ordered list. This
+ * list is ordered by increasing reversed-bits-hash value (see the
+ * bit-reversal sketch following this comment).
+ * - An index of dummy nodes is kept. These dummy nodes are the hash
+ * table "buckets", and they are also chained together in the
+ * split-ordered list, which allows recursive expansion.
+ * - The resize operation only allows expanding the hash table.
+ * It is triggered either through an API call or automatically by
+ * detecting long chains in the add operation.
+ * - A resize operation initiated by long-chain detection is executed
+ * by a call_rcu worker thread, which preserves the lock-freedom of
+ * add and remove.
+ * - Resize operations are serialized by a mutex.
+ * - The removal operation is split in two parts: first, a "removed"
+ * flag is set in the next pointer of the node to remove. Then, a
+ * "garbage collection" is performed in the bucket containing the
+ * removed node (from the start of the bucket up to the removed node).
+ * All encountered nodes with the "removed" flag set in their next
+ * pointers are unlinked from the linked-list. If the cmpxchg used for
+ * the removal fails (due to a concurrent garbage collection or a
+ * concurrent add), we retry from the beginning of the bucket. This
+ * ensures that the node with the "removed" flag set is removed from
+ * the hash table (not visible to lookups anymore) before the RCU
+ * read-side critical section held across the removal ends. It further
+ * ensures that the node with the "removed" flag set is unlinked from
+ * the linked-list before its memory is reclaimed. Only the thread
+ * that successfully set the "removed" flag (with a cmpxchg) in a
+ * node's next pointer is considered to have succeeded in removing it
+ * (and thus owns the node to reclaim). Because we garbage-collect
+ * starting from an invariant node (the start-of-bucket dummy node) up
+ * to the "removed" node (or until we find a higher reverse-hash), we
+ * are sure that a successful traversal of the chain yields a chain
+ * that is present in the linked-list (the start node is never
+ * removed) and that no longer contains the "removed" node, even if
+ * concurrent delete/add operations are changing the structure of the
+ * list concurrently. See the flag-manipulation sketch following this
+ * comment.
+ * - The add operation performs garbage collection of buckets if it
+ * encounters nodes with the "removed" flag set in the bucket where it
+ * wants to add its new node. This ensures lock-freedom of the add
+ * operation by helping the remover unlink nodes from the list rather
+ * than waiting for it to do so.
+ * - An RCU "order table" indexed by log2(hash index) is copied and
+ * expanded by the resize operation. This order table allows finding
+ * the "dummy node" tables.
+ * - There is one dummy node table per hash index order. The table for
+ * order i holds the 2^(i-1) dummy nodes for hash indexes 2^(i-1) to
+ * 2^i - 1, which is half the hash table size reached at that order
+ * (order 0 holds the single dummy node for hash index 0).
+ * - call_rcu is used to garbage-collect the old order table.
+ * - The per-order dummy node tables contain a compact version of the
+ * hash table nodes. These tables are invariant after they are
+ * populated into the hash table.
+ */
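+
+/*
+ * To illustrate the split ordering, here is a minimal standalone
+ * sketch (not part of this file; bit_reverse_u8() is a hypothetical
+ * 8-bit stand-in for bit_reverse_ulong()). Sorting nodes by
+ * reversed-bits hash interleaves the buckets, so growing the table
+ * only inserts new dummy nodes between existing list nodes, without
+ * moving any node:
+ *
+ *	#include <stdio.h>
+ *
+ *	static unsigned char bit_reverse_u8(unsigned char v)
+ *	{
+ *		unsigned char r = 0;
+ *		int i;
+ *
+ *		for (i = 0; i < 8; i++)
+ *			r |= ((v >> i) & 1) << (7 - i);
+ *		return r;
+ *	}
+ *
+ *	int main(void)
+ *	{
+ *		unsigned int h;
+ *
+ *		// Prints 0, 128, 64, 192, 32, 160, 96, 224: hash indexes
+ *		// 0, 4, 2, 6, 1, 5, 3, 7 land in this list order, so
+ *		// bucket 5 (101b) splits out of bucket 1 (001b) in place
+ *		// when the table grows from 4 to 8 buckets.
+ *		for (h = 0; h < 8; h++)
+ *			printf("hash %u -> key %u\n", h, bit_reverse_u8(h));
+ *		return 0;
+ *	}
+ */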
+
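+/*
+ * About the "removed" flag mentioned above: it is stored in the
+ * low-order bit of a node's next pointer, which is always zero for
+ * pointer-aligned nodes. A minimal sketch of the flag manipulation
+ * (hypothetical helper names, mirroring the is_removed()/clear_flag()
+ * helpers used in this file):
+ *
+ *	#include <stdint.h>
+ *
+ *	struct node;
+ *
+ *	static inline struct node *flag_removed(struct node *p)
+ *	{
+ *		return (struct node *) ((uintptr_t) p | 1UL);
+ *	}
+ *
+ *	static inline struct node *clear_flag(struct node *p)
+ *	{
+ *		return (struct node *) ((uintptr_t) p & ~1UL);
+ *	}
+ *
+ *	static inline int is_removed(struct node *p)
+ *	{
+ *		return (int) ((uintptr_t) p & 1UL);
+ *	}
+ *
+ * With these, logical removal is a single
+ * uatomic_cmpxchg(&node->next, next, flag_removed(next)); the thread
+ * whose cmpxchg succeeds owns the node, and garbage collection later
+ * swings the predecessor's next pointer past the flagged node.
+ */
+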
#define _LGPL_SOURCE
#include <stdlib.h>
#include <errno.h>
#include <stdio.h>
#include <pthread.h>
-//#define DEBUG /* Test */
-
#ifdef DEBUG
-#define dbg_printf(args...) printf(args)
+#define dbg_printf(fmt, args...) printf("[debug rculfhash] " fmt, ## args)
#else
-#define dbg_printf(args...)
+#define dbg_printf(fmt, args...)
#endif
-#define CHAIN_LEN_TARGET 1
-#define CHAIN_LEN_RESIZE_THRESHOLD 2
+#define CHAIN_LEN_TARGET 4
+#define CHAIN_LEN_RESIZE_THRESHOLD 8
#ifndef max
#define max(a, b) ((a) > (b) ? (a) : (b))
ht_compare_fct compare_fct;
unsigned long hash_seed;
pthread_mutex_t resize_mutex; /* resize mutex: add/del mutex */
- unsigned int in_progress_resize;
+ unsigned int in_progress_resize, in_progress_destroy;
void (*ht_call_rcu)(struct rcu_head *head,
void (*func)(struct rcu_head *head));
};
uint32_t chain_len)
{
if (chain_len > 100)
- dbg_printf("rculfhash: WARNING: large chain length: %u.\n",
+ dbg_printf("WARNING: large chain length: %u.\n",
chain_len);
if (chain_len >= CHAIN_LEN_RESIZE_THRESHOLD)
ht_resize_lazy(ht, t,
- get_count_order_u32(chain_len - CHAIN_LEN_TARGET + 1));
+ get_count_order_u32(chain_len - (CHAIN_LEN_TARGET - 1)));
}
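
/*
 * Worked example for the resize trigger above, assuming
 * get_count_order_u32() returns ceil(log2(x)): with
 * CHAIN_LEN_TARGET = 4 and CHAIN_LEN_RESIZE_THRESHOLD = 8, a chain of
 * 8 nodes requests a growth of get_count_order_u32(8 - 3) = 3 orders
 * (an 8x larger table), and a chain of 16 nodes requests
 * get_count_order_u32(13) = 4 orders (a 16x larger table).
 */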
static
for (;;) {
if (unlikely(!clear_flag(iter)))
return;
- if (clear_flag(iter)->p.reverse_hash > node->p.reverse_hash)
+ if (likely(clear_flag(iter)->p.reverse_hash > node->p.reverse_hash))
return;
next = rcu_dereference(clear_flag(iter)->p.next);
- if (is_removed(next))
+ if (likely(is_removed(next)))
break;
iter_prev = clear_flag(iter);
iter = next;
for (;;) {
if (unlikely(!clear_flag(iter)))
goto insert;
- if (clear_flag(iter)->p.reverse_hash > node->p.reverse_hash)
+ if (likely(clear_flag(iter)->p.reverse_hash > node->p.reverse_hash))
goto insert;
next = rcu_dereference(clear_flag(iter)->p.next);
- if (is_removed(next))
+ if (unlikely(is_removed(next)))
goto gc_node;
if (unique
&& !is_dummy(next)
old = rcu_dereference(node->p.next);
do {
next = old;
- if (is_removed(next))
+ if (unlikely(is_removed(next)))
goto end;
assert(!is_dummy(next));
old = uatomic_cmpxchg(&node->p.next, next,
{
unsigned long i, end_order;
- dbg_printf("rculfhash: init table: first_order %lu end_order %lu\n",
+ dbg_printf("init table: first_order %lu end_order %lu\n",
first_order, first_order + len_order);
end_order = first_order + len_order;
t->size = !first_order ? 0 : (1UL << (first_order - 1));
unsigned long j, len;
len = !i ? 1 : 1UL << (i - 1);
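		/*
		 * Order 0 has a single dummy node (hash index 0); order i > 0
		 * holds the 2^(i-1) dummy nodes for hash indexes 2^(i-1) to
		 * 2^i - 1, matching the len computed above.
		 */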
- dbg_printf("rculfhash: init order %lu len: %lu\n", i, len);
+ dbg_printf("init order %lu len: %lu\n", i, len);
t->tbl[i] = calloc(len, sizeof(struct _rcu_ht_node));
for (j = 0; j < len; j++) {
- dbg_printf("rculfhash: init entry: i %lu j %lu hash %lu\n",
+ dbg_printf("init entry: i %lu j %lu hash %lu\n",
i, j, !i ? 0 : (1UL << (i - 1)) + j);
struct rcu_ht_node *new_node =
(struct rcu_ht_node *) &t->tbl[i][j];
new_node->p.reverse_hash =
bit_reverse_ulong(!i ? 0 : (1UL << (i - 1)) + j);
(void) _ht_add(ht, t, new_node, 0, 1);
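+			/* Stop populating the table early if it is being destroyed. */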
+ if (CMM_LOAD_SHARED(ht->in_progress_destroy))
+ break;
}
/* Update table size */
t->size = !i ? 1 : (1UL << i);
- dbg_printf("rculfhash: init new size: %lu\n", t->size);
+ dbg_printf("init new size: %lu\n", t->size);
+ if (CMM_LOAD_SHARED(ht->in_progress_destroy))
+ break;
}
t->resize_target = t->size;
t->resize_initiated = 0;
index = hash & (t->size - 1);
order = get_count_order_ulong(index + 1);
lookup = &t->tbl[order][index & ((1UL << (order - 1)) - 1)];
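	/*
	 * Worked example, assuming t->size = 8: hash index 5 yields
	 * order = get_count_order_ulong(6) = 3, so the dummy node is
	 * t->tbl[3][5 & 3] = t->tbl[3][1]: entry 1 of the 4-entry table
	 * for order 3, which covers hash indexes 4 to 7.
	 */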
- dbg_printf("rculfhash: lookup hash %lu index %lu order %lu aridx %lu\n",
+ dbg_printf("lookup hash %lu index %lu order %lu aridx %lu\n",
hash, index, order, index & ((1UL << (order - 1)) - 1));
node = (struct rcu_ht_node *) lookup;
for (;;) {
len = !order ? 1 : 1UL << (order - 1);
for (i = 0; i < len; i++) {
- dbg_printf("rculfhash: delete order %lu i %lu hash %lu\n",
+ dbg_printf("delete order %lu i %lu hash %lu\n",
order, i,
bit_reverse_ulong(t->tbl[order][i].reverse_hash));
assert(is_dummy(t->tbl[order][i].next));
int ret;
/* Wait for in-flight resize operations to complete */
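+	/* Tell ongoing lazy resize operations to stop early. */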
+ CMM_STORE_SHARED(ht->in_progress_destroy, 1);
while (uatomic_read(&ht->in_progress_resize))
poll(NULL, 0, 100); /* wait for 100ms */
ret = ht_delete_dummy(ht);
(nr_dummy)++;
node = clear_flag(next);
} while (node);
- dbg_printf("rculfhash: number of dummy nodes: %lu\n", nr_dummy);
+ dbg_printf("number of dummy nodes: %lu\n", nr_dummy);
}
static
if (old_size == new_size)
return;
new_order = get_count_order_ulong(new_size) + 1;
- dbg_printf("rculfhash: resize from %lu (order %lu) to %lu (order %lu) buckets\n",
- old_size, old_order, new_size, new_order);
+ dbg_printf("resize from %lu (order %lu) to %lu (order %lu) buckets\n",
+ old_size, old_order, new_size, new_order);
new_t = malloc(sizeof(struct rcu_table)
+ (new_order * sizeof(struct _rcu_ht_node *)));
assert(new_size > old_size);
struct rcu_table *t = rcu_dereference(ht->t);
unsigned long target_size;
+ if (growth < 0) {
+ /*
+	 * Silently refuse to shrink the hash table (shrinking is
+	 * not supported).
+ */
+ dbg_printf("shrinking hash table not supported.\n");
+ return;
+ }
+
target_size = resize_target_update(t, growth);
if (t->size < target_size) {
CMM_STORE_SHARED(t->resize_initiated, 1);