Fix: rcuja merge fixes
userspace-rcu.git: rcuja/rcuja-range.c
/*
 * rcuja/rcuja-range.c
 *
 * Userspace RCU library - RCU Judy Array Range Support
 *
 * Copyright 2012-2013 - Mathieu Desnoyers <mathieu.desnoyers@efficios.com>
 *
 * This library is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public
 * License as published by the Free Software Foundation; either
 * version 2.1 of the License, or (at your option) any later version.
 *
 * This library is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
 * Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public
 * License along with this library; if not, write to the Free Software
 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
 */

#define _LGPL_SOURCE
#include <stdio.h>	/* fprintf() */
#include <stdint.h>
#include <inttypes.h>	/* PRIu64 format specifiers */
#include <errno.h>
#include <limits.h>
#include <string.h>
#include <assert.h>
#include <pthread.h>
#include <urcu/rcuja.h>
#include <urcu/compiler.h>
#include <urcu/arch.h>
#include <urcu-pointer.h>
#include <urcu/uatomic.h>
#include <urcu/rcuja-range.h>
#include <urcu-flavor.h>

#include "rcuja-internal.h"

/*
 * Discussion about order of lookup/lock vs allocated node deletion.
 *
 * - If node deletion returns before the call to cds_ja_range_lookup(),
 *   the node will not be found by lookup.
 * - If node deletion is called after cds_ja_range_lock() returns a
 *   non-NULL range, the deletion will wait until the lock is released
 *   before it takes place.
 * - If node deletion call/return overlaps with the call to
 *   cds_ja_range_lookup() and return from cds_ja_range_lock(), the node
 *   may or may not be found by each of cds_ja_range_lookup() and
 *   cds_ja_range_lock().
 */

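/*
 * Usage sketch for the lookup/lock pairing discussed above. This is an
 * illustration only: the hypothetical rcu_read_lock()/rcu_read_unlock()
 * calls stand in for the read-side lock of whichever urcu flavor the
 * Judy array was created with, and error handling is elided.
 *
 *	struct cds_ja_range *range;
 *	uint64_t start, end;
 *	void *priv;
 *
 *	rcu_read_lock();
 *	range = cds_ja_range_lookup(ja, key);
 *	if (range && cds_ja_range_lock(range)) {
 *		cds_ja_range_get_values(range, &start, &end, &priv);
 *		... use the range; removal is excluded until unlock ...
 *		cds_ja_range_unlock(range);
 *	}
 *	rcu_read_unlock();
 */
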
/*
 * Discussion about order of lookup/lock vs allocated node add. Assuming
 * no concurrent delete.
 *
 * - If node add returns before the call to cds_ja_range_lookup(), the
 *   node will be found by lookup.
 * - If node add is called after cds_ja_range_lookup() returns, the node
 *   will not be found by lookup.
 * - If node add call/return overlaps with the call to and return from
 *   cds_ja_range_lookup(), the node may or may not be found.
 * - If node add call/return overlaps with call to cds_ja_range_lookup()
 *   and return from cds_ja_range_lock(), in the specific case where
 *   cds_ja_range_lookup() _does_ succeed, then cds_ja_range_lock() will
 *   succeed (still assuming no concurrent deletion).
 */

/*
 * Discussion: concurrent deletion of contiguous allocated ranges.
 *
 * To ensure that contiguous free ranges are always merged, we need to
 * serialize concurrent removals of contiguous allocated ranges with
 * respect to one another. This is done by locking the ranges prior to
 * and after the range to remove, even if those ranges are allocated.
 * For instance, removing allocated range [10,19] locks its neighbors
 * [0,9] and [20,29] along with [10,19] itself, whether the neighbors
 * are free or allocated. The only cases for which there is no neighbor
 * range to lock are when removing an allocated range starting at 0,
 * and/or ending at the end of the key space.
 */

/*
 * Discussion: concurrent lookup vs add
 *
 * When executed concurrently with node add, the inequality
 * lookup can see no node for the looked-up range, because a range can
 * be shrunk. This can happen if, for instance, we look up key 2
 * between addition of a "free" range for values [1,2], and removal of
 * the old "free" range for values [0,2]. We would then fail to observe
 * any range for key 2. Given that the lookup is performed during a
 * range transition, we can safely return that there is no allocated
 * node in the range.
 */

/*
 * Discussion: concurrent lookup vs del
 *
 * There is no special case for lookups performed concurrently with node
 * del, because node del either replaces the node with the exact same
 * start key (see duplicates guarantees), or replaces it with a larger
 * range containing the prior range. Therefore, we are sure that
 * inequality lookups will see the larger range before the old range is
 * deleted, in whichever direction the lookup is performed.
 */

/*
 * Discussion of the type state transitions.
 *
 * State transitions of "type" always go from either:
 *
 * CDS_JA_RANGE_FREE -> CDS_JA_RANGE_REMOVED
 * or
 * CDS_JA_RANGE_ALLOCATED -> CDS_JA_RANGE_REMOVED
 *
 * A range type never changes otherwise.
 */

//#define RANGE_DEBUG

#undef dbg_printf

#ifdef RANGE_DEBUG
#define dbg_printf(fmt, args...)					\
	fprintf(stderr, "[debug rcuja-range %lu %s()@%s:%u] " fmt,	\
		(unsigned long) gettid(), __func__,			\
		__FILE__, __LINE__, ## args)
#else
#define dbg_printf(fmt, args...)					\
do {									\
	/* do nothing but check printf format */			\
	if (0)								\
		fprintf(stderr, "[debug rcuja-range %lu %s()@%s:%u] " fmt, \
			(unsigned long) gettid(), __func__,		\
			__FILE__, __LINE__, ## args);			\
} while (0)
#endif

enum cds_ja_range_type {
	CDS_JA_RANGE_ALLOCATED,
	CDS_JA_RANGE_FREE,
	CDS_JA_RANGE_REMOVED,
};

/*
 * Range goes from start (inclusive) to end (inclusive).
 * Range start is used as node key in the Judy array.
 */
struct cds_ja_range {
	uint64_t end;
	struct cds_ja_node ja_node;
	pthread_mutex_t lock;
	void *priv;
	enum cds_ja_range_type type;

	/* not required on lookup fast-path */
	uint64_t start;
	struct rcu_head head;
};

struct cds_ja_range *cds_ja_range_lookup(struct cds_ja *ja, uint64_t key)
{
	struct cds_ja_node *node, *last_node;
	struct cds_ja_range *range;

	dbg_printf("key: %" PRIu64 "\n", key);
	node = cds_ja_lookup_below_equal(ja, key, NULL);
	if (!node)
		return NULL;
	/*
	 * Get the last node of the duplicate chain. Adding a duplicate
	 * node to the Judy array inserts it at the end of the chain.
	 */
	cds_ja_for_each_duplicate_rcu(node)
		last_node = node;
	range = caa_container_of(last_node, struct cds_ja_range, ja_node);

	/* Check if range is currently hidden by concurrent add */
	if (range->end < key)
		return NULL;

	/*
	 * If the last node in the duplicate chain is removed or free,
	 * we can consider that either a removal or add operation is in
	 * progress, or removal is the last completed operation to
	 * update this range. We can therefore consider that this area
	 * is not allocated.
	 */
	if (range->type != CDS_JA_RANGE_ALLOCATED)
		return NULL;
	/*
	 * We found an allocated range. We can return it for use with
	 * RCU read-side protection for existence. However, we have no
	 * mutual exclusion against removal at this point.
	 */
	return range;
}

/*
 * Provide mutual exclusion against removal.
 */
struct cds_ja_range *cds_ja_range_lock(struct cds_ja_range *range)
{
	pthread_mutex_lock(&range->lock);

	if (range->type == CDS_JA_RANGE_REMOVED)
		goto removed;
	return range;

removed:
	pthread_mutex_unlock(&range->lock);
	return NULL;
}

void cds_ja_range_unlock(struct cds_ja_range *range)
{
	pthread_mutex_unlock(&range->lock);
}

void cds_ja_range_get_values(const struct cds_ja_range *range,
		uint64_t *start, uint64_t *end, void **priv)
{
	*start = range->start;
	*end = range->end;
	*priv = range->priv;
}

static
struct cds_ja_range *range_create(
		uint64_t start,		/* inclusive */
		uint64_t end,		/* inclusive */
		void *priv,
		enum cds_ja_range_type type)
{
	struct cds_ja_range *range;

	range = calloc(1, sizeof(*range));
	if (!range)
		return NULL;
	range->start = start;
	range->end = end;
	range->priv = priv;
	range->type = type;
	pthread_mutex_init(&range->lock, NULL);
	return range;
}

static
void free_range_cb(struct rcu_head *head)
{
	struct cds_ja_range *range =
		caa_container_of(head, struct cds_ja_range, head);
	free(range);
}

static
void free_range(struct cds_ja_range *range)
{
	free(range);
}

static
void rcu_free_range(struct cds_ja *ja, struct cds_ja_range *range)
{
	cds_lfht_rcu_flavor(ja->ht)->update_call_rcu(&range->head,
			free_range_cb);
}

int cds_ja_range_add(struct cds_ja *ja,
		uint64_t start,		/* inclusive */
		uint64_t end,		/* inclusive */
		void *priv)
{
	struct cds_ja_node *old_node;
	struct cds_ja_range *old_range, *new_range, *ranges[3];
	unsigned int nr_ranges, i;
	int ret;

	if (start > end || end == UINT64_MAX || end > ja->key_max)
		return -EINVAL;

retry:
	dbg_printf("start: %" PRIu64 ", end: %" PRIu64 ", priv %p\n",
		start, end, priv);
	/*
	 * Find if requested range is entirely contained within a single
	 * free range.
	 */
	old_node = cds_ja_lookup_below_equal(ja, start, NULL);
	/* Range hidden by concurrent add */
	if (!old_node)
		goto retry;

	old_range = caa_container_of(old_node, struct cds_ja_range, ja_node);

	/* Range hidden by concurrent add */
	if (old_range->end < start)
		goto retry;

	/* We now know that old_range overlaps with our range */
	switch (CMM_LOAD_SHARED(old_range->type)) {
	case CDS_JA_RANGE_ALLOCATED:
		return -EEXIST;
	case CDS_JA_RANGE_FREE:
		break;
	case CDS_JA_RANGE_REMOVED:
		goto retry;
	}

	/* We do not fit entirely within the range */
	if (old_range->end < end)
		return -EEXIST;

	pthread_mutex_lock(&old_range->lock);

	if (old_range->type == CDS_JA_RANGE_REMOVED) {
		pthread_mutex_unlock(&old_range->lock);
		goto retry;
	}

	/* Create replacement ranges: at most 2 free and 1 allocated */
	if (start == old_range->start) {
		if (end == old_range->end) {
			/* 1 range */
			ranges[0] = new_range = range_create(start, end,
				priv, CDS_JA_RANGE_ALLOCATED);
			nr_ranges = 1;
		} else {
			/* 2 ranges */
			assert(old_range->end > end);
			ranges[0] = new_range = range_create(start, end,
				priv, CDS_JA_RANGE_ALLOCATED);
			ranges[1] = range_create(end + 1, old_range->end,
				NULL, CDS_JA_RANGE_FREE);
			nr_ranges = 2;
		}
	} else {
		if (end == old_range->end) {
			/* 2 ranges */
			assert(old_range->start < start);
			ranges[0] = range_create(old_range->start, start - 1,
				NULL, CDS_JA_RANGE_FREE);
			ranges[1] = new_range = range_create(start, end,
				priv, CDS_JA_RANGE_ALLOCATED);
			nr_ranges = 2;
		} else {
			/* 3 ranges */
			assert(old_range->start < start);
			assert(old_range->end > end);
			ranges[0] = range_create(old_range->start, start - 1,
				NULL, CDS_JA_RANGE_FREE);
			ranges[1] = new_range = range_create(start, end,
				priv, CDS_JA_RANGE_ALLOCATED);
			ranges[2] = range_create(end + 1, old_range->end,
				NULL, CDS_JA_RANGE_FREE);
			nr_ranges = 3;
		}
	}

	/* Add replacement ranges to Judy array */
	for (i = 0; i < nr_ranges; i++) {
		dbg_printf("ADD RANGE: %" PRIu64 "-%" PRIu64 " %s.\n",
			ranges[i]->start, ranges[i]->end,
			ranges[i]->type == CDS_JA_RANGE_ALLOCATED ?
				"allocated" : "free");
		pthread_mutex_lock(&ranges[i]->lock);
		ret = cds_ja_add(ja, ranges[i]->start, &ranges[i]->ja_node);
		assert(!ret);
	}

	/*
	 * We add replacement ranges _before_ removing old ranges, so
	 * concurrent traversals will always see one or the other. This
	 * is OK because we temporarily have a duplicate key, and Judy
	 * arrays provide key existence guarantee for lookups performed
	 * concurrently with add followed by del of duplicate keys.
	 */

	dbg_printf("REM RANGE: %" PRIu64 "-%" PRIu64 " %s.\n",
		old_range->start, old_range->end,
		old_range->type == CDS_JA_RANGE_ALLOCATED ?
			"allocated" : "free");
	/* Remove old free range */
	ret = cds_ja_del(ja, old_range->start, &old_range->ja_node);
	assert(!ret);
	old_range->type = CDS_JA_RANGE_REMOVED;
	pthread_mutex_unlock(&old_range->lock);
	for (i = 0; i < nr_ranges; i++)
		pthread_mutex_unlock(&ranges[i]->lock);

	rcu_free_range(ja, old_range);

	dbg_printf("<SUCCEED>\n");

	return 0;
}

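/*
 * Usage sketch for cds_ja_range_add(). Illustration only: as in
 * _cds_ja_range_new() below, Judy array updates are performed under
 * the flavor's read-side lock, shown here as hypothetical
 * rcu_read_lock()/rcu_read_unlock() calls.
 *
 *	rcu_read_lock();
 *	ret = cds_ja_range_add(ja, start, end, priv);
 *	rcu_read_unlock();
 *	if (ret == -EINVAL)
 *		... bounds are invalid for this Judy array ...
 *	else if (ret == -EEXIST)
 *		... range overlaps an already allocated range ...
 */
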
int cds_ja_range_del(struct cds_ja *ja, struct cds_ja_range *range)
{
	struct cds_ja_node *prev_node, *next_node;
	struct cds_ja_range *new_range;
	struct cds_ja_range *merge_ranges[3], *lock_ranges[3];
	unsigned int nr_merge, nr_lock, i;
	uint64_t start, end;
	int ret;

retry:
	dbg_printf("start: %" PRIu64 ", end %" PRIu64 ", priv: %p\n",
		range->start, range->end, range->priv);

	nr_merge = 0;
	nr_lock = 0;

	/*
	 * Range has been concurrently updated.
	 */
	if (range->type != CDS_JA_RANGE_ALLOCATED)
		return -ENOENT;

	if (range->start > 0) {
		struct cds_ja_range *prev_range;

		prev_node = cds_ja_lookup_below_equal(ja, range->start - 1,
			NULL);
		if (!prev_node)
			goto retry;

		prev_range = caa_container_of(prev_node,
			struct cds_ja_range, ja_node);
		/* Prev range temporarily hidden due to concurrent add. */
		if (prev_range->end != range->start - 1)
			goto retry;

		lock_ranges[nr_lock++] = prev_range;
		if (prev_range->type != CDS_JA_RANGE_ALLOCATED)
			merge_ranges[nr_merge++] = prev_range;
	}

	lock_ranges[nr_lock++] = range;
	merge_ranges[nr_merge++] = range;

	if (range->end < UINT64_MAX - 1) {
		struct cds_ja_range *next_range;

		next_node = cds_ja_lookup_below_equal(ja, range->end + 1,
			NULL);
		/* Next range temporarily hidden due to concurrent add. */
		if (!next_node)
			goto retry;

		next_range = caa_container_of(next_node,
			struct cds_ja_range, ja_node);
		if (next_range->start != range->end + 1)
			goto retry;

		lock_ranges[nr_lock++] = next_range;
		if (next_range->type != CDS_JA_RANGE_ALLOCATED)
			merge_ranges[nr_merge++] = next_range;
	}

	/* Acquire locks in increasing key order for range merge */
	for (i = 0; i < nr_lock; i++)
		pthread_mutex_lock(&lock_ranges[i]->lock);
	if (range->type != CDS_JA_RANGE_ALLOCATED) {
		ret = -ENOENT;
		goto unlock_error;
	}
	/* Ensure they are valid */
	for (i = 0; i < nr_lock; i++) {
		if (lock_ranges[i]->type == CDS_JA_RANGE_REMOVED)
			goto unlock_retry;
	}

	/* Create new free range */
	start = merge_ranges[0]->start;
	end = merge_ranges[nr_merge - 1]->end;
	new_range = range_create(start, end, NULL, CDS_JA_RANGE_FREE);
	/* Back out on allocation failure. */
	if (!new_range) {
		ret = -ENOMEM;
		goto unlock_error;
	}
	pthread_mutex_lock(&new_range->lock);

	dbg_printf("ADD RANGE: %" PRIu64 "-%" PRIu64 " %s.\n",
		new_range->start, new_range->end,
		new_range->type == CDS_JA_RANGE_ALLOCATED ?
			"allocated" : "free");

	ret = cds_ja_add(ja, start, &new_range->ja_node);
	assert(!ret);

	/* Remove old ranges */
	for (i = 0; i < nr_merge; i++) {
		dbg_printf("REM RANGE: %" PRIu64 "-%" PRIu64 " %s.\n",
			merge_ranges[i]->start, merge_ranges[i]->end,
			merge_ranges[i]->type == CDS_JA_RANGE_ALLOCATED ?
				"allocated" : "free");
		ret = cds_ja_del(ja, merge_ranges[i]->start,
			&merge_ranges[i]->ja_node);
		assert(!ret);
		merge_ranges[i]->type = CDS_JA_RANGE_REMOVED;
	}
	for (i = 0; i < nr_lock; i++)
		pthread_mutex_unlock(&lock_ranges[i]->lock);
	pthread_mutex_unlock(&new_range->lock);
	/* Free old merged ranges */
	for (i = 0; i < nr_merge; i++)
		rcu_free_range(ja, merge_ranges[i]);

	dbg_printf("<SUCCEED>\n");

	return 0;

	/* retry paths */
unlock_retry:
	for (i = 0; i < nr_lock; i++)
		pthread_mutex_unlock(&lock_ranges[i]->lock);
	goto retry;
	/* error paths */
unlock_error:
	for (i = 0; i < nr_lock; i++)
		pthread_mutex_unlock(&lock_ranges[i]->lock);
	return ret;
}

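/*
 * Usage sketch for cds_ja_range_del(), freeing a range previously
 * obtained from cds_ja_range_lookup(). Illustration only, with
 * hypothetical rcu_read_lock()/rcu_read_unlock() calls standing in for
 * the flavor's read-side lock. Note that the caller must not hold the
 * range's lock (taken by cds_ja_range_lock()) when calling this, since
 * deletion acquires the range locks itself.
 *
 *	rcu_read_lock();
 *	range = cds_ja_range_lookup(ja, key);
 *	if (range)
 *		ret = cds_ja_range_del(ja, range);
 *	rcu_read_unlock();
 */
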
struct cds_ja *_cds_ja_range_new(unsigned int key_bits,
		const struct rcu_flavor_struct *flavor)
{
	struct cds_ja_range *range;
	struct cds_ja *ja;
	int ret;

	ja = _cds_ja_new(key_bits, flavor);
	if (!ja)
		return NULL;
	range = range_create(0, UINT64_MAX - 1, NULL, CDS_JA_RANGE_FREE);
	if (!range)
		goto free_ja;
	cds_lfht_rcu_flavor(ja->ht)->read_lock();
	ret = cds_ja_add(ja, 0, &range->ja_node);
	cds_lfht_rcu_flavor(ja->ht)->read_unlock();
	if (ret)
		goto free_range;
	return ja;

free_range:
	free_range(range);
free_ja:
	ret = cds_ja_destroy(ja);
	assert(!ret);
	return NULL;
}

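/*
 * Usage sketch for _cds_ja_range_new(). Illustration only: "flavor" is
 * assumed to be the struct rcu_flavor_struct pointer matching the urcu
 * flavor the caller linked against, 64 is the key-space width in bits,
 * and free_priv_cb is a hypothetical callback freeing each range's
 * priv data (may be NULL).
 *
 *	struct cds_ja *ja = _cds_ja_range_new(64, flavor);
 *
 *	if (!ja)
 *		... handle creation failure ...
 *	... add/lookup/del ranges ...
 *	ret = cds_ja_range_destroy(ja, free_priv_cb);
 */
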
int cds_ja_range_validate(struct cds_ja *ja)
{
	uint64_t iter_key, start, end, last_end = UINT64_MAX;
	struct cds_ja_node *ja_node, *last_node;
	int ret = 0;

	cds_lfht_rcu_flavor(ja->ht)->read_lock();
	cds_ja_for_each_key_rcu(ja, iter_key, ja_node) {
		struct cds_ja_range *range;
		struct cds_ja_node *first_node;

		first_node = ja_node;
		cds_ja_for_each_duplicate_rcu(ja_node)
			last_node = ja_node;
		if (last_node != first_node) {
			struct cds_ja_range *first_range = caa_container_of(first_node,
				struct cds_ja_range, ja_node);
			struct cds_ja_range *last_range = caa_container_of(last_node,
				struct cds_ja_range, ja_node);
			fprintf(stderr, "found duplicate node: first %" PRIu64 "-%" PRIu64 " last %" PRIu64 "-%" PRIu64 "\n",
				first_range->start, first_range->end, last_range->start, last_range->end);
			ret |= -1;
		}
		range = caa_container_of(last_node,
			struct cds_ja_range, ja_node);
		start = range->start;
		end = range->end;
		if (last_end != UINT64_MAX) {
			if (start != last_end + 1) {
				fprintf(stderr, "ja range discrepancy: last end: %" PRIu64 ", start: %" PRIu64 "\n",
					last_end, start);
				ret |= -1;
			}
		}
		last_end = end;
	}
	if (last_end != UINT64_MAX - 1) {
		fprintf(stderr, "ja range error: end of last range is: %" PRIu64 "\n",
			last_end);
		ret |= -1;
	}
	cds_lfht_rcu_flavor(ja->ht)->read_unlock();
	return ret;
}

int cds_ja_range_destroy(struct cds_ja *ja,
		void (*free_priv)(void *ptr))
{
	uint64_t key;
	struct cds_ja_node *ja_node;
	int ret = 0;

	cds_lfht_rcu_flavor(ja->ht)->read_lock();
	cds_ja_for_each_key_rcu(ja, key, ja_node) {
		struct cds_ja_node *tmp_node;

		cds_ja_for_each_duplicate_safe_rcu(ja_node, tmp_node) {
			struct cds_ja_range *range;

			range = caa_container_of(ja_node,
				struct cds_ja_range, ja_node);
			ret = cds_ja_del(ja, key, &range->ja_node);
			if (ret)
				goto error;
			if (free_priv)
				free_priv(range->priv);
			/* We are the only user of the Judy array: OK to free now. */
			free_range(range);
		}
	}
	cds_lfht_rcu_flavor(ja->ht)->read_unlock();
	return cds_ja_destroy(ja);

error:
	cds_lfht_rcu_flavor(ja->ht)->read_unlock();
	return ret;
}