X-Git-Url: https://git.liburcu.org/?p=urcu.git;a=blobdiff_plain;f=rculfhash.c;h=bff26a8d4fe668e2d4330927dcc812384aea039e;hp=6cc606d9e0dcbea64a7af89327c3874c636caa4c;hb=15cfbec77d2c573110cc936d5b33745d44207b50;hpb=4105056a2fa97794eb32bbf512d2795406071c9c diff --git a/rculfhash.c b/rculfhash.c index 6cc606d..bff26a8 100644 --- a/rculfhash.c +++ b/rculfhash.c @@ -144,7 +144,6 @@ #include #include #include -#include #include #include #include @@ -167,14 +166,9 @@ #define CHAIN_LEN_RESIZE_THRESHOLD 3 /* - * Define the minimum table size. Protects against hash table resize overload - * when too many entries are added quickly before the resize can complete. - * This is especially the case if the table could be shrinked to a size of 1. - * TODO: we might want to make the add/remove operations help the resize to - * add or remove dummy nodes when a resize is ongoing to ensure upper-bound on - * chain length. + * Define the minimum table size. */ -#define MIN_TABLE_SIZE 128 +#define MIN_TABLE_SIZE 1 #if (CAA_BITS_PER_LONG == 32) #define MAX_TABLE_ORDER 32 @@ -182,6 +176,12 @@ #define MAX_TABLE_ORDER 64 #endif +/* + * Minimum number of dummy nodes to touch per thread to parallelize grow/shrink. + */ +#define MIN_PARTITION_PER_THREAD_ORDER 12 +#define MIN_PARTITION_PER_THREAD (1UL << MIN_PARTITION_PER_THREAD_ORDER) + #ifndef min #define min(a, b) ((a) < (b) ? (a) : (b)) #endif @@ -192,6 +192,8 @@ /* * The removed flag needs to be updated atomically with the pointer. + * It indicates that no node must attach to the node scheduled for + * removal, and that node garbage collection must be performed. * The dummy flag does not require to be updated atomically with the * pointer, but it is added as a pointer low bit flag to save space. */ @@ -199,8 +201,11 @@ #define DUMMY_FLAG (1UL << 1) #define FLAGS_MASK ((1UL << 2) - 1) +/* Value of the end pointer. Should not interact with flags. 
*/ +#define END_VALUE NULL + struct ht_items_count { - unsigned long add, remove; + unsigned long add, del; } __attribute__((aligned(CAA_CACHE_LINE_SIZE))); struct rcu_level { @@ -221,6 +226,13 @@ struct cds_lfht { cds_lfht_compare_fct compare_fct; unsigned long hash_seed; int flags; + /* + * We need to put the work threads offline (QSBR) when taking this + * mutex, because we use synchronize_rcu within this mutex critical + * section, which waits on read-side critical sections, and could + * therefore cause grace-period deadlock if we hold off RCU G.P. + * completion. + */ pthread_mutex_t resize_mutex; /* resize mutex: add/del mutex */ unsigned int in_progress_resize, in_progress_destroy; void (*cds_lfht_call_rcu)(struct rcu_head *head, @@ -228,7 +240,12 @@ struct cds_lfht { void (*cds_lfht_synchronize_rcu)(void); void (*cds_lfht_rcu_read_lock)(void); void (*cds_lfht_rcu_read_unlock)(void); - unsigned long count; /* global approximate item count */ + void (*cds_lfht_rcu_thread_offline)(void); + void (*cds_lfht_rcu_thread_online)(void); + void (*cds_lfht_rcu_register_thread)(void); + void (*cds_lfht_rcu_unregister_thread)(void); + pthread_attr_t *resize_attr; /* Resize threads attributes */ + long count; /* global approximate item count */ struct ht_items_count *percpu_count; /* per-cpu item count */ }; @@ -237,6 +254,26 @@ struct rcu_resize_work { struct cds_lfht *ht; }; +struct partition_resize_work { + struct rcu_head head; + struct cds_lfht *ht; + unsigned long i, start, len; + void (*fct)(struct cds_lfht *ht, unsigned long i, + unsigned long start, unsigned long len); +}; + +enum add_mode { + ADD_DEFAULT = 0, + ADD_UNIQUE = 1, + ADD_REPLACE = 2, +}; + +static +struct cds_lfht_node *_cds_lfht_add(struct cds_lfht *ht, + unsigned long size, + struct cds_lfht_node *node, + enum add_mode mode, int dummy); + /* * Algorithm to reverse bits in a word by lookup table, extended to * 64-bit words. 
@@ -518,7 +555,7 @@ void ht_count_add(struct cds_lfht *ht, unsigned long size) return; percpu_count = uatomic_add_return(&ht->percpu_count[cpu].add, 1); if (unlikely(!(percpu_count & ((1UL << COUNT_COMMIT_ORDER) - 1)))) { - unsigned long count; + long count; dbg_printf("add percpu %lu\n", percpu_count); count = uatomic_add_return(&ht->count, @@ -527,7 +564,7 @@ void ht_count_add(struct cds_lfht *ht, unsigned long size) if (!(count & (count - 1))) { if ((count >> CHAIN_LEN_RESIZE_THRESHOLD) < size) return; - dbg_printf("add set global %lu\n", count); + dbg_printf("add set global %ld\n", count); cds_lfht_resize_lazy_count(ht, size, count >> (CHAIN_LEN_TARGET - 1)); } @@ -535,7 +572,7 @@ void ht_count_add(struct cds_lfht *ht, unsigned long size) } static -void ht_count_remove(struct cds_lfht *ht, unsigned long size) +void ht_count_del(struct cds_lfht *ht, unsigned long size) { unsigned long percpu_count; int cpu; @@ -545,18 +582,24 @@ void ht_count_remove(struct cds_lfht *ht, unsigned long size) cpu = ht_get_cpu(); if (unlikely(cpu < 0)) return; - percpu_count = uatomic_add_return(&ht->percpu_count[cpu].remove, -1); + percpu_count = uatomic_add_return(&ht->percpu_count[cpu].del, 1); if (unlikely(!(percpu_count & ((1UL << COUNT_COMMIT_ORDER) - 1)))) { - unsigned long count; + long count; - dbg_printf("remove percpu %lu\n", percpu_count); + dbg_printf("del percpu %lu\n", percpu_count); count = uatomic_add_return(&ht->count, -(1UL << COUNT_COMMIT_ORDER)); /* If power of 2 */ if (!(count & (count - 1))) { if ((count >> CHAIN_LEN_RESIZE_THRESHOLD) >= size) return; - dbg_printf("remove set global %lu\n", count); + dbg_printf("del set global %ld\n", count); + /* + * Don't shrink table if the number of nodes is below a + * certain threshold. 
+ */ + if (count < (1UL << COUNT_COMMIT_ORDER) * (nr_cpus_mask + 1)) + return; cds_lfht_resize_lazy_count(ht, size, count >> (CHAIN_LEN_TARGET - 1)); } @@ -584,7 +627,7 @@ void ht_count_add(struct cds_lfht *ht, unsigned long size) } static -void ht_count_remove(struct cds_lfht *ht, unsigned long size) +void ht_count_del(struct cds_lfht *ht, unsigned long size) { } @@ -642,7 +685,19 @@ struct cds_lfht_node *flag_dummy(struct cds_lfht_node *node) { return (struct cds_lfht_node *) (((unsigned long) node) | DUMMY_FLAG); } - + +static +struct cds_lfht_node *get_end(void) +{ + return (struct cds_lfht_node *) END_VALUE; +} + +static +int is_end(struct cds_lfht_node *node) +{ + return clear_flag(node) == (struct cds_lfht_node *) END_VALUE; +} + static unsigned long _uatomic_max(unsigned long *ptr, unsigned long v) { @@ -690,7 +745,7 @@ void _cds_lfht_gc_bucket(struct cds_lfht_node *dummy, struct cds_lfht_node *node */ assert(dummy != node); for (;;) { - if (unlikely(!clear_flag(iter))) + if (unlikely(is_end(iter))) return; if (likely(clear_flag(iter)->p.reverse_hash > node->p.reverse_hash)) return; @@ -705,18 +760,95 @@ void _cds_lfht_gc_bucket(struct cds_lfht_node *dummy, struct cds_lfht_node *node new_next = flag_dummy(clear_flag(next)); else new_next = clear_flag(next); + if (is_removed(iter)) + new_next = flag_removed(new_next); (void) uatomic_cmpxchg(&iter_prev->p.next, iter, new_next); } + return; +} + +static +int _cds_lfht_replace(struct cds_lfht *ht, unsigned long size, + struct cds_lfht_node *old_node, + struct cds_lfht_node *ret_next, + struct cds_lfht_node *new_node) +{ + struct cds_lfht_node *dummy, *old_next; + struct _cds_lfht_node *lookup; + int flagged = 0; + unsigned long hash, index, order; + + if (!old_node) /* Return -ENOENT if asked to replace NULL node */ + goto end; + + assert(!is_removed(old_node)); + assert(!is_dummy(old_node)); + assert(!is_removed(new_node)); + assert(!is_dummy(new_node)); + assert(new_node != old_node); + do { + /* Insert after 
node to be replaced */ + old_next = ret_next; + if (is_removed(old_next)) { + /* + * Too late, the old node has been removed under us + * between lookup and replace. Fail. + */ + goto end; + } + assert(!is_dummy(old_next)); + assert(new_node != clear_flag(old_next)); + new_node->p.next = clear_flag(old_next); + /* + * Here is the whole trick for lock-free replace: we add + * the replacement node _after_ the node we want to + * replace by atomically setting its next pointer at the + * same time we set its removal flag. Given that + * the lookups/get next use an iterator aware of the + * next pointer, they will either skip the old node due + * to the removal flag and see the new node, or use + * the old node, but will not see the new one. + */ + ret_next = uatomic_cmpxchg(&old_node->p.next, + old_next, flag_removed(new_node)); + } while (ret_next != old_next); + + /* We performed the replacement. */ + flagged = 1; + + /* + * Ensure that the old node is not visible to readers anymore: + * lookup for the node, and remove it (along with any other + * logically removed node) if found. + */ + hash = bit_reverse_ulong(old_node->p.reverse_hash); + assert(size > 0); + index = hash & (size - 1); + order = get_count_order_ulong(index + 1); + lookup = &ht->t.tbl[order]->nodes[index & (!order ? 0 : ((1UL << (order - 1)) - 1))]; + dummy = (struct cds_lfht_node *) lookup; + _cds_lfht_gc_bucket(dummy, new_node); +end: + /* + * Only the flagging action indicated that we (and no other) + * replaced the node from the hash table. 
+ */ + if (flagged) { + assert(is_removed(rcu_dereference(old_node->p.next))); + return 0; + } else { + return -ENOENT; + } } static struct cds_lfht_node *_cds_lfht_add(struct cds_lfht *ht, unsigned long size, struct cds_lfht_node *node, - int unique, int dummy) + enum add_mode mode, int dummy) { struct cds_lfht_node *iter_prev, *iter, *next, *new_node, *new_next, - *dummy_node; + *dummy_node, *return_node; struct _cds_lfht_node *lookup; unsigned long hash, index, order; @@ -724,7 +856,7 @@ struct cds_lfht_node *_cds_lfht_add(struct cds_lfht *ht, assert(!is_removed(node)); if (!size) { assert(dummy); - node->p.next = flag_dummy(NULL); + node->p.next = flag_dummy(get_end()); return node; /* Initial first add (head) */ } hash = bit_reverse_ulong(node->p.reverse_hash); @@ -743,21 +875,23 @@ struct cds_lfht_node *_cds_lfht_add(struct cds_lfht *ht, iter = rcu_dereference(iter_prev->p.next); assert(iter_prev->p.reverse_hash <= node->p.reverse_hash); for (;;) { - /* TODO: check if removed */ - if (unlikely(!clear_flag(iter))) + if (unlikely(is_end(iter))) goto insert; - /* TODO: check if removed */ if (likely(clear_flag(iter)->p.reverse_hash > node->p.reverse_hash)) goto insert; next = rcu_dereference(clear_flag(iter)->p.next); if (unlikely(is_removed(next))) goto gc_node; - if (unique + if ((mode == ADD_UNIQUE || mode == ADD_REPLACE) && !is_dummy(next) && !ht->compare_fct(node->key, node->key_len, clear_flag(iter)->key, - clear_flag(iter)->key_len)) - return clear_flag(iter); + clear_flag(iter)->key_len)) { + if (mode == ADD_UNIQUE) + return clear_flag(iter); + else /* mode == ADD_REPLACE */ + goto replace; + } /* Only account for identical reverse hash once */ if (iter_prev->p.reverse_hash != clear_flag(iter)->p.reverse_hash && !is_dummy(next)) @@ -765,6 +899,7 @@ struct cds_lfht_node *_cds_lfht_add(struct cds_lfht *ht, iter_prev = clear_flag(iter); iter = next; } + insert: assert(node != clear_flag(iter)); assert(!is_removed(iter_prev)); @@ -779,10 +914,26 @@ struct 
cds_lfht_node *_cds_lfht_add(struct cds_lfht *ht, else new_node = node; if (uatomic_cmpxchg(&iter_prev->p.next, iter, - new_node) != iter) + new_node) != iter) { continue; /* retry */ - else + } else { + if (mode == ADD_REPLACE) + return_node = NULL; + else /* ADD_DEFAULT and ADD_UNIQUE */ + return_node = node; goto gc_end; + } + + replace: + + if (!_cds_lfht_replace(ht, size, clear_flag(iter), next, + node)) { + return_node = clear_flag(iter); + goto end; /* gc already done */ + } else { + continue; /* retry */ + } + gc_node: assert(!is_removed(iter)); if (is_dummy(iter)) @@ -799,11 +950,12 @@ gc_end: lookup = &ht->t.tbl[order]->nodes[index & (!order ? 0 : ((1UL << (order - 1)) - 1))]; dummy_node = (struct cds_lfht_node *) lookup; _cds_lfht_gc_bucket(dummy_node, node); - return node; +end: + return return_node; } static -int _cds_lfht_remove(struct cds_lfht *ht, unsigned long size, +int _cds_lfht_del(struct cds_lfht *ht, unsigned long size, struct cds_lfht_node *node, int dummy_removal) { @@ -812,11 +964,16 @@ int _cds_lfht_remove(struct cds_lfht *ht, unsigned long size, int flagged = 0; unsigned long hash, index, order; + if (!node) /* Return -ENOENT if asked to delete NULL node */ + goto end; + /* logically delete the node */ assert(!is_dummy(node)); assert(!is_removed(node)); old = rcu_dereference(node->p.next); do { + struct cds_lfht_node *new_next; + next = old; if (unlikely(is_removed(next))) goto end; @@ -824,8 +981,8 @@ int _cds_lfht_remove(struct cds_lfht *ht, unsigned long size, assert(is_dummy(next)); else assert(!is_dummy(next)); - old = uatomic_cmpxchg(&node->p.next, next, - flag_removed(next)); + new_next = flag_removed(next); + old = uatomic_cmpxchg(&node->p.next, next, new_next); } while (old != next); /* We performed the (logical) deletion. 
*/ @@ -851,54 +1008,111 @@ end: if (flagged) { assert(is_removed(rcu_dereference(node->p.next))); return 0; - } else + } else { return -ENOENT; + } } static -void init_table_hash(struct cds_lfht *ht, unsigned long i, - unsigned long len) +void *partition_resize_thread(void *arg) { - unsigned long j; + struct partition_resize_work *work = arg; - for (j = 0; j < len; j++) { - struct cds_lfht_node *new_node = - (struct cds_lfht_node *) &ht->t.tbl[i]->nodes[j]; + work->ht->cds_lfht_rcu_register_thread(); + work->fct(work->ht, work->i, work->start, work->len); + work->ht->cds_lfht_rcu_unregister_thread(); + return NULL; +} - dbg_printf("init hash entry: i %lu j %lu hash %lu\n", - i, j, !i ? 0 : (1UL << (i - 1)) + j); - new_node->p.reverse_hash = - bit_reverse_ulong(!i ? 0 : (1UL << (i - 1)) + j); - if (CMM_LOAD_SHARED(ht->in_progress_destroy)) - break; +static +void partition_resize_helper(struct cds_lfht *ht, unsigned long i, + unsigned long len, + void (*fct)(struct cds_lfht *ht, unsigned long i, + unsigned long start, unsigned long len)) +{ + unsigned long partition_len; + struct partition_resize_work *work; + int thread, ret; + unsigned long nr_threads; + pthread_t *thread_id; + + /* + * Note: nr_cpus_mask + 1 is always power of 2. + * We spawn just the number of threads we need to satisfy the minimum + * partition size, up to the number of CPUs in the system. 
+ */ + nr_threads = min(nr_cpus_mask + 1, + len >> MIN_PARTITION_PER_THREAD_ORDER); + partition_len = len >> get_count_order_ulong(nr_threads); + work = calloc(nr_threads, sizeof(*work)); + thread_id = calloc(nr_threads, sizeof(*thread_id)); + assert(work); + for (thread = 0; thread < nr_threads; thread++) { + work[thread].ht = ht; + work[thread].i = i; + work[thread].len = partition_len; + work[thread].start = thread * partition_len; + work[thread].fct = fct; + ret = pthread_create(&thread_id[thread], ht->resize_attr, + partition_resize_thread, &work[thread]); + assert(!ret); + } + for (thread = 0; thread < nr_threads; thread++) { + ret = pthread_join(thread_id[thread], NULL); + assert(!ret); } + free(work); + free(thread_id); } +/* + * Holding RCU read lock to protect _cds_lfht_add against memory + * reclaim that could be performed by other call_rcu worker threads (ABA + * problem). + * + * When we reach a certain length, we can split this population phase over + * many worker threads, based on the number of CPUs available in the system. + * This should therefore take care of not having the expand lagging behind too + * many concurrent insertion threads by using the scheduler's ability to + * schedule dummy node population fairly with insertions. + */ static -void init_table_link(struct cds_lfht *ht, unsigned long i, unsigned long len) +void init_table_populate_partition(struct cds_lfht *ht, unsigned long i, + unsigned long start, unsigned long len) { unsigned long j; ht->cds_lfht_rcu_read_lock(); - for (j = 0; j < len; j++) { + for (j = start; j < start + len; j++) { struct cds_lfht_node *new_node = (struct cds_lfht_node *) &ht->t.tbl[i]->nodes[j]; - dbg_printf("init link: i %lu j %lu hash %lu\n", + dbg_printf("init populate: i %lu j %lu hash %lu\n", i, j, !i ? 0 : (1UL << (i - 1)) + j); + new_node->p.reverse_hash = + bit_reverse_ulong(!i ? 0 : (1UL << (i - 1)) + j); (void) _cds_lfht_add(ht, !i ? 
0 : (1UL << (i - 1)), - new_node, 0, 1); + new_node, ADD_DEFAULT, 1); if (CMM_LOAD_SHARED(ht->in_progress_destroy)) break; } ht->cds_lfht_rcu_read_unlock(); } -/* - * Holding RCU read lock to protect _cds_lfht_add against memory - * reclaim that could be performed by other call_rcu worker threads (ABA - * problem). - */ +static +void init_table_populate(struct cds_lfht *ht, unsigned long i, + unsigned long len) +{ + assert(nr_cpus_mask != -1); + if (nr_cpus_mask < 0 || len < 2 * MIN_PARTITION_PER_THREAD) { + ht->cds_lfht_rcu_thread_online(); + init_table_populate_partition(ht, i, 0, len); + ht->cds_lfht_rcu_thread_offline(); + return; + } + partition_resize_helper(ht, i, len, init_table_populate_partition); +} + static void init_table(struct cds_lfht *ht, unsigned long first_order, unsigned long len_order) @@ -913,37 +1127,66 @@ void init_table(struct cds_lfht *ht, len = !i ? 1 : 1UL << (i - 1); dbg_printf("init order %lu len: %lu\n", i, len); + + /* Stop expand if the resize target changes under us */ + if (CMM_LOAD_SHARED(ht->t.resize_target) < (!i ? 1 : (1UL << i))) + break; + ht->t.tbl[i] = calloc(1, sizeof(struct rcu_level) + (len * sizeof(struct _cds_lfht_node))); - - /* Set all dummy nodes reverse hash values for a level */ - init_table_hash(ht, i, len); + assert(ht->t.tbl[i]); /* - * Link all dummy nodes into the table. Concurrent - * add/remove are helping us. + * Set all dummy nodes reverse hash values for a level and + * link all dummy nodes into the table. */ - init_table_link(ht, i, len); + init_table_populate(ht, i, len); /* - * Update table size (after init for now, because no - * concurrent updater help (TODO)). + * Update table size. */ cmm_smp_wmb(); /* populate data before RCU size */ CMM_STORE_SHARED(ht->t.size, !i ? 1 : (1UL << i)); + dbg_printf("init new size: %lu\n", !i ? 
1 : (1UL << i)); if (CMM_LOAD_SHARED(ht->in_progress_destroy)) break; } } +/* + * Holding RCU read lock to protect _cds_lfht_del against memory + * reclaim that could be performed by other call_rcu worker threads (ABA + * problem). + * For a single level, we logically remove and garbage collect each node. + * + * As a design choice, we perform logical removal and garbage collection on a + * node-per-node basis to simplify this algorithm. We also assume keeping good + * cache locality of the operation would outweigh possible performance gain + * that could be achieved by batching garbage collection for multiple levels. + * However, this would have to be justified by benchmarks. + * + * Concurrent removal and add operations are helping us perform garbage + * collection of logically removed nodes. We guarantee that all logically + * removed nodes have been garbage-collected (unlinked) before call_rcu is + * invoked to free a whole level of dummy nodes (after a grace period). + * + * Logical removal and garbage collection can therefore be done in batch or on a + * node-per-node basis, as long as the guarantee above holds. + * + * When we reach a certain length, we can split this removal over many worker + * threads, based on the number of CPUs available in the system. This should + * take care of not letting resize process lag behind too many concurrent + * updater threads actively inserting into the hash table. + */ static -void remove_table(struct cds_lfht *ht, unsigned long i, unsigned long len) +void remove_table_partition(struct cds_lfht *ht, unsigned long i, + unsigned long start, unsigned long len) { unsigned long j; ht->cds_lfht_rcu_read_lock(); - for (j = 0; j < len; j++) { + for (j = start; j < start + len; j++) { struct cds_lfht_node *fini_node = (struct cds_lfht_node *) &ht->t.tbl[i]->nodes[j]; @@ -951,7 +1194,7 @@ void remove_table(struct cds_lfht *ht, unsigned long i, unsigned long len) i, j, !i ? 
0 : (1UL << (i - 1)) + j); fini_node->p.reverse_hash = bit_reverse_ulong(!i ? 0 : (1UL << (i - 1)) + j); - (void) _cds_lfht_remove(ht, !i ? 0 : (1UL << (i - 1)), + (void) _cds_lfht_del(ht, !i ? 0 : (1UL << (i - 1)), fini_node, 1); if (CMM_LOAD_SHARED(ht->in_progress_destroy)) break; @@ -959,11 +1202,20 @@ void remove_table(struct cds_lfht *ht, unsigned long i, unsigned long len) ht->cds_lfht_rcu_read_unlock(); } -/* - * Holding RCU read lock to protect _cds_lfht_remove against memory - * reclaim that could be performed by other call_rcu worker threads (ABA - * problem). - */ +static +void remove_table(struct cds_lfht *ht, unsigned long i, unsigned long len) +{ + + assert(nr_cpus_mask != -1); + if (nr_cpus_mask < 0 || len < 2 * MIN_PARTITION_PER_THREAD) { + ht->cds_lfht_rcu_thread_online(); + remove_table_partition(ht, i, 0, len); + ht->cds_lfht_rcu_thread_offline(); + return; + } + partition_resize_helper(ht, i, len, remove_table_partition); +} + static void fini_table(struct cds_lfht *ht, unsigned long first_order, unsigned long len_order) @@ -974,13 +1226,27 @@ void fini_table(struct cds_lfht *ht, first_order, first_order + len_order); end_order = first_order + len_order; assert(first_order > 0); - assert(ht->t.size == (1UL << (first_order - 1))); for (i = end_order - 1; i >= first_order; i--) { unsigned long len; len = !i ? 1 : 1UL << (i - 1); dbg_printf("fini order %lu len: %lu\n", i, len); + /* Stop shrink if the resize target changes under us */ + if (CMM_LOAD_SHARED(ht->t.resize_target) > (1UL << (i - 1))) + break; + + cmm_smp_wmb(); /* populate data before RCU size */ + CMM_STORE_SHARED(ht->t.size, 1UL << (i - 1)); + + /* + * We need to wait for all add operations to reach Q.S. (and + * thus use the new table for lookups) before we can start + * releasing the old dummy nodes. Otherwise their lookup will + * return a logically removed node as insert position. + */ + ht->cds_lfht_synchronize_rcu(); + /* * Set "removed" flag in dummy nodes about to be removed. 
* Unlink all now-logically-removed dummy node pointers. @@ -997,7 +1263,7 @@ void fini_table(struct cds_lfht *ht, } } -struct cds_lfht *cds_lfht_new(cds_lfht_hash_fct hash_fct, +struct cds_lfht *_cds_lfht_new(cds_lfht_hash_fct hash_fct, cds_lfht_compare_fct compare_fct, unsigned long hash_seed, unsigned long init_size, @@ -1006,7 +1272,12 @@ struct cds_lfht *cds_lfht_new(cds_lfht_hash_fct hash_fct, void (*func)(struct rcu_head *head)), void (*cds_lfht_synchronize_rcu)(void), void (*cds_lfht_rcu_read_lock)(void), - void (*cds_lfht_rcu_read_unlock)(void)) + void (*cds_lfht_rcu_read_unlock)(void), + void (*cds_lfht_rcu_thread_offline)(void), + void (*cds_lfht_rcu_thread_online)(void), + void (*cds_lfht_rcu_register_thread)(void), + void (*cds_lfht_rcu_unregister_thread)(void), + pthread_attr_t *attr) { struct cds_lfht *ht; unsigned long order; @@ -1015,6 +1286,7 @@ struct cds_lfht *cds_lfht_new(cds_lfht_hash_fct hash_fct, if (init_size && (init_size & (init_size - 1))) return NULL; ht = calloc(1, sizeof(struct cds_lfht)); + assert(ht); ht->hash_fct = hash_fct; ht->compare_fct = compare_fct; ht->hash_seed = hash_seed; @@ -1022,20 +1294,29 @@ struct cds_lfht *cds_lfht_new(cds_lfht_hash_fct hash_fct, ht->cds_lfht_synchronize_rcu = cds_lfht_synchronize_rcu; ht->cds_lfht_rcu_read_lock = cds_lfht_rcu_read_lock; ht->cds_lfht_rcu_read_unlock = cds_lfht_rcu_read_unlock; + ht->cds_lfht_rcu_thread_offline = cds_lfht_rcu_thread_offline; + ht->cds_lfht_rcu_thread_online = cds_lfht_rcu_thread_online; + ht->cds_lfht_rcu_register_thread = cds_lfht_rcu_register_thread; + ht->cds_lfht_rcu_unregister_thread = cds_lfht_rcu_unregister_thread; + ht->resize_attr = attr; ht->percpu_count = alloc_per_cpu_items_count(); /* this mutex should not nest in read-side C.S. 
*/ pthread_mutex_init(&ht->resize_mutex, NULL); order = get_count_order_ulong(max(init_size, MIN_TABLE_SIZE)) + 1; ht->flags = flags; + ht->cds_lfht_rcu_thread_offline(); pthread_mutex_lock(&ht->resize_mutex); + ht->t.resize_target = 1UL << (order - 1); init_table(ht, 0, order); pthread_mutex_unlock(&ht->resize_mutex); + ht->cds_lfht_rcu_thread_online(); return ht; } -struct cds_lfht_node *cds_lfht_lookup(struct cds_lfht *ht, void *key, size_t key_len) +void cds_lfht_lookup(struct cds_lfht *ht, void *key, size_t key_len, + struct cds_lfht_iter *iter) { - struct cds_lfht_node *node, *next; + struct cds_lfht_node *node, *next, *dummy_node; struct _cds_lfht_node *lookup; unsigned long hash, reverse_hash, index, order, size; @@ -1048,12 +1329,17 @@ struct cds_lfht_node *cds_lfht_lookup(struct cds_lfht *ht, void *key, size_t key lookup = &ht->t.tbl[order]->nodes[index & (!order ? 0 : ((1UL << (order - 1))) - 1)]; dbg_printf("lookup hash %lu index %lu order %lu aridx %lu\n", hash, index, order, index & (!order ? 
0 : ((1UL << (order - 1)) - 1))); - node = (struct cds_lfht_node *) lookup; + dummy_node = (struct cds_lfht_node *) lookup; + /* We can always skip the dummy node initially */ + node = rcu_dereference(dummy_node->p.next); + node = clear_flag(node); for (;;) { - if (unlikely(!node)) + if (unlikely(is_end(node))) { + node = next = NULL; break; + } if (unlikely(node->p.reverse_hash > reverse_hash)) { - node = NULL; + node = next = NULL; break; } next = rcu_dereference(node->p.next); @@ -1065,28 +1351,31 @@ struct cds_lfht_node *cds_lfht_lookup(struct cds_lfht *ht, void *key, size_t key node = clear_flag(next); } assert(!node || !is_dummy(rcu_dereference(node->p.next))); - return node; + iter->node = node; + iter->next = next; } -struct cds_lfht_node *cds_lfht_next(struct cds_lfht *ht, - struct cds_lfht_node *node) +void cds_lfht_next_duplicate(struct cds_lfht *ht, struct cds_lfht_iter *iter) { - struct cds_lfht_node *next; + struct cds_lfht_node *node, *next; unsigned long reverse_hash; void *key; size_t key_len; + node = iter->node; reverse_hash = node->p.reverse_hash; key = node->key; key_len = node->key_len; - next = rcu_dereference(node->p.next); + next = iter->next; node = clear_flag(next); for (;;) { - if (unlikely(!node)) + if (unlikely(is_end(node))) { + node = next = NULL; break; + } if (unlikely(node->p.reverse_hash > reverse_hash)) { - node = NULL; + node = next = NULL; break; } next = rcu_dereference(node->p.next); @@ -1098,7 +1387,46 @@ struct cds_lfht_node *cds_lfht_next(struct cds_lfht *ht, node = clear_flag(next); } assert(!node || !is_dummy(rcu_dereference(node->p.next))); - return node; + iter->node = node; + iter->next = next; +} + +void cds_lfht_next(struct cds_lfht *ht, struct cds_lfht_iter *iter) +{ + struct cds_lfht_node *node, *next; + + node = iter->node; + next = iter->next; + node = clear_flag(next); + + for (;;) { + if (unlikely(is_end(node))) { + node = next = NULL; + break; + } + next = rcu_dereference(node->p.next); + if 
(likely(!is_removed(next)) + && !is_dummy(next)) { + break; + } + node = clear_flag(next); + } + assert(!node || !is_dummy(rcu_dereference(node->p.next))); + iter->node = node; + iter->next = next; +} + +void cds_lfht_first(struct cds_lfht *ht, struct cds_lfht_iter *iter) +{ + struct _cds_lfht_node *lookup; + + /* + * Get next after first dummy node. The first dummy node is the + * first node of the linked list. + */ + lookup = &ht->t.tbl[0]->nodes[0]; + iter->node = (struct cds_lfht_node *) lookup; + cds_lfht_next(ht, iter); } void cds_lfht_add(struct cds_lfht *ht, struct cds_lfht_node *node) @@ -1109,12 +1437,12 @@ void cds_lfht_add(struct cds_lfht *ht, struct cds_lfht_node *node) node->p.reverse_hash = bit_reverse_ulong((unsigned long) hash); size = rcu_dereference(ht->t.size); - (void) _cds_lfht_add(ht, size, node, 0, 0); + (void) _cds_lfht_add(ht, size, node, ADD_DEFAULT, 0); ht_count_add(ht, size); } struct cds_lfht_node *cds_lfht_add_unique(struct cds_lfht *ht, - struct cds_lfht_node *node) + struct cds_lfht_node *node) { unsigned long hash, size; struct cds_lfht_node *ret; @@ -1123,21 +1451,47 @@ struct cds_lfht_node *cds_lfht_add_unique(struct cds_lfht *ht, node->p.reverse_hash = bit_reverse_ulong((unsigned long) hash); size = rcu_dereference(ht->t.size); - ret = _cds_lfht_add(ht, size, node, 1, 0); - if (ret != node) + ret = _cds_lfht_add(ht, size, node, ADD_UNIQUE, 0); + if (ret == node) ht_count_add(ht, size); return ret; } -int cds_lfht_remove(struct cds_lfht *ht, struct cds_lfht_node *node) +struct cds_lfht_node *cds_lfht_add_replace(struct cds_lfht *ht, + struct cds_lfht_node *node) +{ + unsigned long hash, size; + struct cds_lfht_node *ret; + + hash = ht->hash_fct(node->key, node->key_len, ht->hash_seed); + node->p.reverse_hash = bit_reverse_ulong((unsigned long) hash); + + size = rcu_dereference(ht->t.size); + ret = _cds_lfht_add(ht, size, node, ADD_REPLACE, 0); + if (ret == NULL) + ht_count_add(ht, size); + return ret; +} + +int 
cds_lfht_replace(struct cds_lfht *ht, struct cds_lfht_iter *old_iter, + struct cds_lfht_node *new_node) +{ + unsigned long size; + + size = rcu_dereference(ht->t.size); + return _cds_lfht_replace(ht, size, old_iter->node, old_iter->next, + new_node); +} + +int cds_lfht_del(struct cds_lfht *ht, struct cds_lfht_iter *iter) { unsigned long size; int ret; size = rcu_dereference(ht->t.size); - ret = _cds_lfht_remove(ht, size, node, 0); + ret = _cds_lfht_del(ht, size, iter->node, 0); if (!ret) - ht_count_remove(ht, size); + ht_count_del(ht, size); return ret; } @@ -1156,7 +1510,7 @@ int cds_lfht_delete_dummy(struct cds_lfht *ht) if (!is_dummy(node)) return -EPERM; assert(!is_removed(node)); - } while (clear_flag(node)); + } while (!is_end(node)); /* * size accessed without rcu_dereference because hash table is * being destroyed. @@ -1182,7 +1536,7 @@ int cds_lfht_delete_dummy(struct cds_lfht *ht) * Should only be called when no more concurrent readers nor writers can * possibly access the table. 
*/ -int cds_lfht_destroy(struct cds_lfht *ht) +int cds_lfht_destroy(struct cds_lfht *ht, pthread_attr_t **attr) { int ret; @@ -1194,18 +1548,32 @@ int cds_lfht_destroy(struct cds_lfht *ht) if (ret) return ret; free_per_cpu_items_count(ht->percpu_count); + if (attr) + *attr = ht->resize_attr; poison_free(ht); return ret; } void cds_lfht_count_nodes(struct cds_lfht *ht, + long *approx_before, unsigned long *count, - unsigned long *removed) + unsigned long *removed, + long *approx_after) { struct cds_lfht_node *node, *next; struct _cds_lfht_node *lookup; unsigned long nr_dummy = 0; + *approx_before = 0; + if (nr_cpus_mask >= 0) { + int i; + + for (i = 0; i < nr_cpus_mask + 1; i++) { + *approx_before += uatomic_read(&ht->percpu_count[i].add); + *approx_before -= uatomic_read(&ht->percpu_count[i].del); + } + } + *count = 0; *removed = 0; @@ -1215,15 +1583,26 @@ void cds_lfht_count_nodes(struct cds_lfht *ht, do { next = rcu_dereference(node->p.next); if (is_removed(next)) { - assert(!is_dummy(next)); - (*removed)++; + if (!is_dummy(next)) + (*removed)++; + else + (nr_dummy)++; } else if (!is_dummy(next)) (*count)++; else (nr_dummy)++; node = clear_flag(next); - } while (node); + } while (!is_end(node)); dbg_printf("number of dummy nodes: %lu\n", nr_dummy); + *approx_after = 0; + if (nr_cpus_mask >= 0) { + int i; + + for (i = 0; i < nr_cpus_mask + 1; i++) { + *approx_after += uatomic_read(&ht->percpu_count[i].add); + *approx_after -= uatomic_read(&ht->percpu_count[i].del); + } + } } /* called with resize mutex held */ @@ -1255,17 +1634,6 @@ void _do_cds_lfht_shrink(struct cds_lfht *ht, old_size, old_order, new_size, new_order); assert(new_size < old_size); - cmm_smp_wmb(); /* populate data before RCU size */ - CMM_STORE_SHARED(ht->t.size, new_size); - - /* - * We need to wait for all add operations to reach Q.S. (and - * thus use the new table for lookups) before we can start - * releasing the old dummy nodes. 
Otherwise their lookup will - * return a logically removed node as insert position. - */ - ht->cds_lfht_synchronize_rcu(); - /* Remove and unlink all dummy nodes to remove. */ fini_table(ht, new_order, old_order - new_order); } @@ -1291,7 +1659,7 @@ void _do_cds_lfht_resize(struct cds_lfht *ht) ht->t.resize_initiated = 0; /* write resize_initiated before read resize_target */ cmm_smp_mb(); - } while (new_size != CMM_LOAD_SHARED(ht->t.resize_target)); + } while (ht->t.size != CMM_LOAD_SHARED(ht->t.resize_target)); } static @@ -1314,9 +1682,11 @@ void cds_lfht_resize(struct cds_lfht *ht, unsigned long new_size) { resize_target_update_count(ht, new_size); CMM_STORE_SHARED(ht->t.resize_initiated, 1); + ht->cds_lfht_rcu_thread_offline(); pthread_mutex_lock(&ht->resize_mutex); _do_cds_lfht_resize(ht); pthread_mutex_unlock(&ht->resize_mutex); + ht->cds_lfht_rcu_thread_online(); } static @@ -1326,9 +1696,11 @@ void do_resize_cb(struct rcu_head *head) caa_container_of(head, struct rcu_resize_work, head); struct cds_lfht *ht = work->ht; + ht->cds_lfht_rcu_thread_offline(); pthread_mutex_lock(&ht->resize_mutex); _do_cds_lfht_resize(ht); pthread_mutex_unlock(&ht->resize_mutex); + ht->cds_lfht_rcu_thread_online(); poison_free(work); cmm_smp_mb(); /* finish resize before decrement */ uatomic_dec(&ht->in_progress_resize);