+ unsigned long partition_len;
+ struct partition_resize_work *work;
+ int ret;
+ unsigned long nr_threads, thread;
+ pthread_t *thread_id;
+
+ /*
+ * Note: nr_cpus_mask + 1 is always a power of 2.
+ * We spawn just the number of threads we need to satisfy the minimum
+ * partition size, up to the number of CPUs in the system.
+ */
+ if (nr_cpus_mask > 0) {
+ nr_threads = min(nr_cpus_mask + 1,
+ len >> MIN_PARTITION_PER_THREAD_ORDER);
+ } else {
+ nr_threads = 1;
+ }
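+ /*
+ * Since both nr_cpus_mask + 1 and len (the per-order bucket count) are
+ * powers of 2, nr_threads is a power of 2 as well, so the shift below
+ * divides len evenly: each worker thread handles a contiguous slice of
+ * len / nr_threads dummy nodes.
+ */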
+ partition_len = len >> get_count_order_ulong(nr_threads);
+ work = calloc(nr_threads, sizeof(*work));
+ thread_id = calloc(nr_threads, sizeof(*thread_id));
+ assert(work);
+ assert(thread_id);
+ for (thread = 0; thread < nr_threads; thread++) {
+ work[thread].ht = ht;
+ work[thread].i = i;
+ work[thread].len = partition_len;
+ work[thread].start = thread * partition_len;
+ work[thread].fct = fct;
+ ret = pthread_create(&thread_id[thread], ht->resize_attr,
+ partition_resize_thread, &work[thread]);
+ assert(!ret);
+ }
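+ /* Wait for every partition worker to finish its slice. */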
+ for (thread = 0; thread < nr_threads; thread++) {
+ ret = pthread_join(thread_id[thread], NULL);
+ assert(!ret);
+ }
+ free(work);
+ free(thread_id);
+}
+
+/*
+ * Holding RCU read lock to protect _cds_lfht_add against memory
+ * reclaim that could be performed by other call_rcu worker threads (ABA
+ * problem).
+ *
+ * When we reach a certain length, we can split this population phase over
+ * many worker threads, based on the number of CPUs available in the system.
+ * This should keep the expand from lagging behind too many concurrent
+ * insertion threads, by relying on the scheduler to interleave dummy node
+ * population fairly with the insertions.
+ */
+static
+void init_table_populate_partition(struct cds_lfht *ht, unsigned long i,
+ unsigned long start, unsigned long len)
+{
+ unsigned long j;
+
+ ht->cds_lfht_rcu_read_lock();
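+ /*
+ * Bucket j of order i is the dummy node for hash value
+ * (1UL << (i - 1)) + j (hash 0 for order 0). Its bit-reversed hash
+ * determines where it is linked within the list.
+ */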
+ for (j = start; j < start + len; j++) {
+ struct cds_lfht_node *new_node =
+ (struct cds_lfht_node *) &ht->t.tbl[i]->nodes[j];
+
+ dbg_printf("init populate: i %lu j %lu hash %lu\n",
+ i, j, !i ? 0 : (1UL << (i - 1)) + j);
+ new_node->p.reverse_hash =
+ bit_reverse_ulong(!i ? 0 : (1UL << (i - 1)) + j);
+ (void) _cds_lfht_add(ht, !i ? 0 : (1UL << (i - 1)),
+ new_node, ADD_DEFAULT, 1);
+ }
+ ht->cds_lfht_rcu_read_unlock();
+}
+
+static
+void init_table_populate(struct cds_lfht *ht, unsigned long i,
+ unsigned long len)
+{
+ assert(nr_cpus_mask != -1);
+ if (nr_cpus_mask < 0 || len < 2 * MIN_PARTITION_PER_THREAD) {
+ ht->cds_lfht_rcu_thread_online();
+ init_table_populate_partition(ht, i, 0, len);
+ ht->cds_lfht_rcu_thread_offline();
+ return;
+ }
+ partition_resize_helper(ht, i, len, init_table_populate_partition);
+}
+
+static
+void init_table(struct cds_lfht *ht,
+ unsigned long first_order, unsigned long len_order)
+{
+ unsigned long i, end_order;
+
+ dbg_printf("init table: first_order %lu end_order %lu\n",
+ first_order, first_order + len_order);
+ end_order = first_order + len_order;
+ for (i = first_order; i < end_order; i++) {
+ unsigned long len;
+
+ len = !i ? 1 : 1UL << (i - 1);
+ dbg_printf("init order %lu len: %lu\n", i, len);
+
+ /* Stop expand if the resize target changes under us */
+ if (CMM_LOAD_SHARED(ht->t.resize_target) < (!i ? 1 : (1UL << i)))
+ break;
+
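+ /*
+ * Allocate the dummy node array for this order: a single node at
+ * order 0, 2^(i-1) nodes for order i > 0.
+ */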
+ ht->t.tbl[i] = calloc(1, sizeof(struct rcu_level)
+ + (len * sizeof(struct _cds_lfht_node)));
+ assert(ht->t.tbl[i]);
+
+ /*
+ * Set the reverse hash values of all dummy nodes for this level and
+ * link them into the table.
+ */
+ init_table_populate(ht, i, len);
+
+ /*
+ * Update table size.
+ */
+ cmm_smp_wmb(); /* populate data before RCU size */
+ CMM_STORE_SHARED(ht->t.size, !i ? 1 : (1UL << i));
+
+ dbg_printf("init new size: %lu\n", !i ? 1 : (1UL << i));
+ if (CMM_LOAD_SHARED(ht->in_progress_destroy))
+ break;
+ }
+}
+
+/*
+ * Holding RCU read lock to protect _cds_lfht_remove against memory
+ * reclaim that could be performed by other call_rcu worker threads (ABA
+ * problem).
+ * For a single level, we logically remove and garbage collect each node.
+ *
+ * As a design choice, we perform logical removal and garbage collection on a
+ * node-per-node basis to simplify this algorithm. We also assume that keeping
+ * good cache locality for the operation outweighs the possible performance
+ * gain that could be achieved by batching garbage collection over multiple
+ * levels. However, this would have to be justified by benchmarks.
+ *
+ * Concurrent removal and add operations are helping us perform garbage
+ * collection of logically removed nodes. We guarantee that all logically
+ * removed nodes have been garbage-collected (unlinked) before call_rcu is
+ * invoked to free a whole level of dummy nodes (after a grace period).
+ *
+ * Logical removal and garbage collection can therefore be done in batch or on a
+ * node-per-node basis, as long as the guarantee above holds.
+ *
+ * When we reach a certain length, we can split this removal over many worker
+ * threads, based on the number of CPUs available in the system. This should
+ * keep the resize process from lagging behind too many concurrent updater
+ * threads actively inserting into the hash table.
+ */
+static
+void remove_table_partition(struct cds_lfht *ht, unsigned long i,
+ unsigned long start, unsigned long len)
+{
+ unsigned long j;
+
+ ht->cds_lfht_rcu_read_lock();
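+ /*
+ * Logically remove the dummy node for each bucket of this order;
+ * concurrent add/remove operations may help complete the unlinking,
+ * as described above.
+ */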
+ for (j = start; j < start + len; j++) {
+ struct cds_lfht_node *fini_node =
+ (struct cds_lfht_node *) &ht->t.tbl[i]->nodes[j];
+
+ dbg_printf("remove entry: i %lu j %lu hash %lu\n",
+ i, j, !i ? 0 : (1UL << (i - 1)) + j);
+ fini_node->p.reverse_hash =
+ bit_reverse_ulong(!i ? 0 : (1UL << (i - 1)) + j);
+ (void) _cds_lfht_del(ht, !i ? 0 : (1UL << (i - 1)),
+ fini_node, 1);
+ }
+ ht->cds_lfht_rcu_read_unlock();
+}
+
+static
+void remove_table(struct cds_lfht *ht, unsigned long i, unsigned long len)
+{
+ assert(nr_cpus_mask != -1);
+ if (nr_cpus_mask < 0 || len < 2 * MIN_PARTITION_PER_THREAD) {
+ ht->cds_lfht_rcu_thread_online();
+ remove_table_partition(ht, i, 0, len);
+ ht->cds_lfht_rcu_thread_offline();
+ return;
+ }
+ partition_resize_helper(ht, i, len, remove_table_partition);
+}
+
+static
+void fini_table(struct cds_lfht *ht,
+ unsigned long first_order, unsigned long len_order)
+{
+ long i, end_order;
+
+ dbg_printf("fini table: first_order %lu end_order %lu\n",
+ first_order, first_order + len_order);
+ end_order = first_order + len_order;
+ assert(first_order > 0);
+ for (i = end_order - 1; i >= first_order; i--) {
+ unsigned long len;
+
+ len = !i ? 1 : 1UL << (i - 1);
+ dbg_printf("fini order %lu len: %lu\n", i, len);
+
+ /* Stop shrink if the resize target changes under us */
+ if (CMM_LOAD_SHARED(ht->t.resize_target) > (1UL << (i - 1)))
+ break;
+
+ cmm_smp_wmb(); /* populate data before RCU size */
+ CMM_STORE_SHARED(ht->t.size, 1UL << (i - 1));
+
+ /*
+ * We need to wait for all add operations to reach Q.S. (and
+ * thus use the new table for lookups) before we can start
+ * releasing the old dummy nodes. Otherwise their lookup will
+ * return a logically removed node as the insert position.
+ */
+ ht->cds_lfht_synchronize_rcu();
+
+ /*
+ * Set "removed" flag in dummy nodes about to be removed.
+ * Unlink all now-logically-removed dummy node pointers.
+ * Concurrent add/remove operations help us perform
+ * the gc.
+ */
+ remove_table(ht, i, len);