rcuja range: bugfixes and validation
/*
 * rcuja/rcuja-range.c
 *
 * Userspace RCU library - RCU Judy Array Range Support
 *
 * Copyright 2012-2013 - Mathieu Desnoyers <mathieu.desnoyers@efficios.com>
 *
 * This library is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public
 * License as published by the Free Software Foundation; either
 * version 2.1 of the License, or (at your option) any later version.
 *
 * This library is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
 * Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public
 * License along with this library; if not, write to the Free Software
 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
 */

#define _LGPL_SOURCE
#include <stdint.h>
#include <inttypes.h>	/* PRIu64, used by dbg_printf() below */
#include <errno.h>
#include <limits.h>
#include <string.h>
#include <assert.h>
#include <pthread.h>
#include <urcu/rcuja.h>
#include <urcu/compiler.h>
#include <urcu/arch.h>
#include <urcu-pointer.h>
#include <urcu/uatomic.h>
#include <urcu/rcuja-range.h>
#include <urcu-flavor.h>

#include "rcuja-internal.h"
/*
 * Discussion about order of lookup/lock vs allocated node deletion.
 *
 * - If node deletion returns before call to
 *   cds_ja_range_lookup(), the node will not be found by lookup.
 * - If node deletion is called after cds_ja_range_lock() returns a
 *   non-NULL range, the deletion will wait until the lock is released
 *   before it takes place.
 * - If node deletion call/return overlaps with the call to
 *   cds_ja_range_lookup() and return from cds_ja_range_lock(), the node
 *   may or may not be found by each of cds_ja_range_lookup() and
 *   cds_ja_range_lock().
 */
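
/*
 * A minimal usage sketch of these guarantees (not itself part of the
 * API; rcu_read_lock()/rcu_read_unlock() stand for the read-side
 * critical section of whichever urcu flavor the caller registered
 * with):
 *
 *	struct cds_ja_range *range;
 *
 *	rcu_read_lock();
 *	range = cds_ja_range_lookup(ja, key);
 *	if (range)
 *		range = cds_ja_range_lock(range);
 *	rcu_read_unlock();
 *	if (range) {
 *		... use range->priv: deletion waits on the range lock ...
 *		cds_ja_range_unlock(range);
 *	}
 */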

/*
 * Discussion about order of lookup/lock vs allocated node add. Assuming
 * no concurrent delete.
 *
 * - If node add returns before call to
 *   cds_ja_range_lookup(), the node will be found by lookup.
 * - If node add is called after cds_ja_range_lookup() returns, the node
 *   will not be found by lookup.
 * - If node add call/return overlaps with the call to and return from
 *   cds_ja_range_lookup(), the node may or may not be found.
 * - If node add call/return overlaps with call to cds_ja_range_lookup()
 *   and return from cds_ja_range_lock(), in the specific case where
 *   cds_ja_range_lookup() _does_ succeed, then cds_ja_range_lock() will
 *   succeed (still assuming no concurrent deletion).
 */

/*
 * Discussion: concurrent deletion of contiguous allocated ranges.
 *
 * To ensure that merging of contiguous free ranges is always performed,
 * we need to serialize concurrent removals of contiguous allocated
 * ranges with respect to one another. This is done by locking the
 * ranges immediately prior to and after the range to remove, even if
 * those ranges are allocated. This serializes removal of contiguous
 * ranges. The only cases for which there is no range to lock are when
 * removing an allocated range starting at 0, and/or ending at the end
 * of the key space. A worked example follows below.
 */
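
/*
 * Worked example (illustrative values only): with contiguous ranges
 * [0,9] free, [10,19] allocated and [20,29] free, removing [10,19]
 * locks [0,9], [10,19] and [20,29] in increasing key order, then
 * replaces all three with a single free range [0,29]. If the
 * neighbors were allocated instead, they would still be locked, but
 * only [10,19] would become the new free range.
 */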

/*
 * Discussion: concurrent lookup vs add
 *
 * When executed concurrently with node add, the inequality lookup can
 * see no node for the looked-up range, because a range can be shrunk.
 * This can happen if, for instance, we look up key 2 between addition
 * of a "free" range for values [1,2], and removal of the old "free"
 * range for values [0,2]. We would then fail to observe any range for
 * key 2. Given that the lookup is performed during a range transition,
 * we can safely return that there is no allocated node in the range.
 */

/*
 * Discussion: concurrent lookup vs del
 *
 * There is no special case for lookups performed concurrently with node
 * del, because node del either replaces the node with the exact same
 * start key (see duplicates guarantees), or replaces it with a larger
 * range containing the prior range. Therefore, we are sure that
 * inequality lookups will see the larger range before the old range is
 * deleted, in whichever direction the lookup is performed.
 */

/*
 * Discussion of the type state transitions.
 *
 * State transitions of "type" always go from either:
 *
 * CDS_JA_RANGE_FREE -> CDS_JA_RANGE_REMOVED
 * or
 * CDS_JA_RANGE_ALLOCATED -> CDS_JA_RANGE_REMOVED
 *
 * A range type never changes otherwise.
 */

//#define RANGE_DEBUG

#undef dbg_printf

#ifdef RANGE_DEBUG
#define dbg_printf(fmt, args...)					\
	fprintf(stderr, "[debug rcuja-range %lu %s()@%s:%u] " fmt,	\
		(unsigned long) gettid(), __func__,			\
		__FILE__, __LINE__, ## args)
#else
#define dbg_printf(fmt, args...)					\
do {									\
	/* do nothing but check printf format */			\
	if (0)								\
		fprintf(stderr, "[debug rcuja-range %lu %s()@%s:%u] " fmt, \
			(unsigned long) gettid(), __func__,		\
			__FILE__, __LINE__, ## args);			\
} while (0)
#endif

#define CDS_JA_RANGE_KEY_BITS	64

enum cds_ja_range_type {
	CDS_JA_RANGE_ALLOCATED,
	CDS_JA_RANGE_FREE,
	CDS_JA_RANGE_REMOVED,
};

/*
 * Range goes from start (inclusive) to end (inclusive).
 * Range start is used as node key in the Judy array.
 */
struct cds_ja_range {
	uint64_t end;
	struct cds_ja_node ja_node;
	pthread_mutex_t lock;
	void *priv;
	enum cds_ja_range_type type;

	/* not required on lookup fast-path */
	uint64_t start;
	struct rcu_head head;
};

struct cds_ja_range *cds_ja_range_lookup(struct cds_ja *ja, uint64_t key)
{
	struct cds_ja_node *node, *last_node;
	struct cds_ja_range *range;

	dbg_printf("key: %" PRIu64 "\n", key);
	node = cds_ja_lookup_below_equal(ja, key, NULL);
	if (!node)
		return NULL;
	/*
	 * Get the last node of the duplicate chain: adding duplicates
	 * to a Judy array inserts them at the end of the chain.
	 */
	cds_ja_for_each_duplicate_rcu(node)
		last_node = node;
	range = caa_container_of(last_node, struct cds_ja_range, ja_node);

	/* Check if range is currently hidden by concurrent add */
	if (range->end < key)
		return NULL;

	/*
	 * If the last node of the duplicate chain is removed or free,
	 * either a removal or an add operation is in progress, or
	 * removal is the last completed operation that updated this
	 * range. We can therefore consider that this area is not
	 * allocated.
	 */
	if (range->type != CDS_JA_RANGE_ALLOCATED)
		return NULL;
	/*
	 * We found an allocated range. We can return it for use with
	 * RCU read-side protection for existence. However, we have no
	 * mutual exclusion against removal at this point.
	 */
	return range;
}

/*
 * Provide mutual exclusion against removal.
 */
struct cds_ja_range *cds_ja_range_lock(struct cds_ja_range *range)
{
	pthread_mutex_lock(&range->lock);

	if (range->type == CDS_JA_RANGE_REMOVED)
		goto removed;
	return range;

removed:
	pthread_mutex_unlock(&range->lock);
	return NULL;
}

void cds_ja_range_unlock(struct cds_ja_range *range)
{
	pthread_mutex_unlock(&range->lock);
}

static
struct cds_ja_range *range_create(
		uint64_t start,		/* inclusive */
		uint64_t end,		/* inclusive */
		void *priv,
		enum cds_ja_range_type type)
{
	struct cds_ja_range *range;

	range = calloc(1, sizeof(*range));	/* calloc(nmemb, size) */
	if (!range)
		return NULL;
	range->start = start;
	range->end = end;
	range->priv = priv;
	range->type = type;
	pthread_mutex_init(&range->lock, NULL);
	return range;
}

static
void free_range_cb(struct rcu_head *head)
{
	struct cds_ja_range *range =
		caa_container_of(head, struct cds_ja_range, head);
	free(range);
}

static
void free_range(struct cds_ja_range *range)
{
	free(range);
}

static
void rcu_free_range(struct cds_ja *ja, struct cds_ja_range *range)
{
	cds_lfht_rcu_flavor(ja->ht)->update_call_rcu(&range->head,
			free_range_cb);
}

int cds_ja_range_add(struct cds_ja *ja,
		uint64_t start,		/* inclusive */
		uint64_t end,		/* inclusive */
		void *priv)
{
	struct cds_ja_node *old_node;
	struct cds_ja_range *old_range, *new_range, *ranges[3];
	unsigned int nr_ranges, i;
	int ret;

	if (start > end || end == UINT64_MAX)
		return -EINVAL;

retry:
	dbg_printf("start: %" PRIu64 ", end: %" PRIu64 ", priv %p\n",
		start, end, priv);
	/*
	 * Find if requested range is entirely contained within a single
	 * free range.
	 */
	old_node = cds_ja_lookup_below_equal(ja, start, NULL);
	/* Range hidden by concurrent add */
	if (!old_node)
		goto retry;

	old_range = caa_container_of(old_node, struct cds_ja_range, ja_node);

	/* Range hidden by concurrent add */
	if (old_range->end < start)
		goto retry;

	/* We now know that old_range overlaps with our range */
	switch (CMM_LOAD_SHARED(old_range->type)) {
	case CDS_JA_RANGE_ALLOCATED:
		return -EEXIST;
	case CDS_JA_RANGE_FREE:
		break;
	case CDS_JA_RANGE_REMOVED:
		goto retry;
	}

	/* We do not fit entirely within the range */
	if (old_range->end < end)
		return -EEXIST;

	pthread_mutex_lock(&old_range->lock);

	if (old_range->type == CDS_JA_RANGE_REMOVED) {
		pthread_mutex_unlock(&old_range->lock);
		goto retry;
	}
	/* Create replacement ranges: at most 2 free and 1 allocated */
	if (start == old_range->start) {
		if (end == old_range->end) {
			/* 1 range */
			ranges[0] = new_range = range_create(start, end,
				priv, CDS_JA_RANGE_ALLOCATED);
			nr_ranges = 1;
		} else {
			/* 2 ranges */
			assert(old_range->end > end);
			ranges[0] = new_range = range_create(start, end,
				priv, CDS_JA_RANGE_ALLOCATED);
			ranges[1] = range_create(end + 1, old_range->end,
				NULL, CDS_JA_RANGE_FREE);
			nr_ranges = 2;
		}
	} else {
		if (end == old_range->end) {
			/* 2 ranges */
			assert(old_range->start < start);
			ranges[0] = range_create(old_range->start, start - 1,
				NULL, CDS_JA_RANGE_FREE);
			ranges[1] = new_range = range_create(start, end,
				priv, CDS_JA_RANGE_ALLOCATED);
			nr_ranges = 2;
		} else {
			/* 3 ranges */
			assert(old_range->start < start);
			assert(old_range->end > end);
			ranges[0] = range_create(old_range->start, start - 1,
				NULL, CDS_JA_RANGE_FREE);
			ranges[1] = new_range = range_create(start, end,
				priv, CDS_JA_RANGE_ALLOCATED);
			ranges[2] = range_create(end + 1, old_range->end,
				NULL, CDS_JA_RANGE_FREE);
			nr_ranges = 3;
		}
	}

	/* Bail out on allocation failure before publishing anything */
	for (i = 0; i < nr_ranges; i++) {
		if (!ranges[i])
			goto alloc_error;
	}

	/* Add replacement ranges to Judy array */
	for (i = 0; i < nr_ranges; i++) {
		dbg_printf("ADD RANGE: %" PRIu64 "-%" PRIu64 " %s.\n",
			ranges[i]->start, ranges[i]->end,
			ranges[i]->type == CDS_JA_RANGE_ALLOCATED ?
				"allocated" : "free");
		pthread_mutex_lock(&ranges[i]->lock);
		ret = cds_ja_add(ja, ranges[i]->start, &ranges[i]->ja_node);
		assert(!ret);
	}

	/*
	 * We add replacement ranges _before_ removing old ranges, so
	 * concurrent traversals will always see one or the other. This
	 * is OK because we temporarily have a duplicate key, and Judy
	 * arrays provide key existence guarantee for lookups performed
	 * concurrently with add followed by del of duplicate keys.
	 */

	dbg_printf("REM RANGE: %" PRIu64 "-%" PRIu64 " %s.\n",
		old_range->start, old_range->end,
		old_range->type == CDS_JA_RANGE_ALLOCATED ?
			"allocated" : "free");
	/* Remove old free range */
	ret = cds_ja_del(ja, old_range->start, &old_range->ja_node);
	assert(!ret);
	old_range->type = CDS_JA_RANGE_REMOVED;
	pthread_mutex_unlock(&old_range->lock);
	for (i = 0; i < nr_ranges; i++)
		pthread_mutex_unlock(&ranges[i]->lock);

	rcu_free_range(ja, old_range);

	dbg_printf("<SUCCEED>\n");

	return 0;

alloc_error:
	/* free(NULL) is a no-op, so NULL entries are fine */
	for (i = 0; i < nr_ranges; i++)
		free_range(ranges[i]);
	pthread_mutex_unlock(&old_range->lock);
	return -ENOMEM;
}
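
/*
 * Example use of cds_ja_range_add() (a sketch; assumes the caller is
 * a registered RCU reader thread of the flavor the Judy array was
 * created with, and that "ja" and "priv" exist in the caller):
 *
 *	int ret;
 *
 *	rcu_read_lock();
 *	ret = cds_ja_range_add(ja, 100, 199, priv);
 *	rcu_read_unlock();
 *	if (ret == -EEXIST)
 *		... [100,199] overlaps an allocated range ...
 *	else if (ret == -EINVAL)
 *		... invalid bounds: start > end, or end == UINT64_MAX ...
 */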

int cds_ja_range_del(struct cds_ja *ja, struct cds_ja_range *range)
{
	struct cds_ja_node *prev_node, *next_node;
	struct cds_ja_range *new_range;
	struct cds_ja_range *merge_ranges[3], *lock_ranges[3];
	unsigned int nr_merge, nr_lock, i;
	uint64_t start, end;
	int ret;

retry:
	dbg_printf("start: %" PRIu64 ", end %" PRIu64 ", priv: %p\n",
		range->start, range->end, range->priv);

	nr_merge = 0;
	nr_lock = 0;

	/*
	 * Range has been concurrently updated.
	 */
	if (range->type != CDS_JA_RANGE_ALLOCATED)
		return -ENOENT;

	if (range->start > 0) {
		struct cds_ja_range *prev_range;

		prev_node = cds_ja_lookup_below_equal(ja, range->start - 1,
			NULL);
		if (!prev_node)
			goto retry;

		prev_range = caa_container_of(prev_node,
			struct cds_ja_range, ja_node);
		/* Prev range temporarily hidden due to concurrent add. */
		if (prev_range->end != range->start - 1)
			goto retry;

		lock_ranges[nr_lock++] = prev_range;
		if (prev_range->type != CDS_JA_RANGE_ALLOCATED)
			merge_ranges[nr_merge++] = prev_range;
	}

	lock_ranges[nr_lock++] = range;
	merge_ranges[nr_merge++] = range;

	if (range->end < UINT64_MAX - 1) {
		struct cds_ja_range *next_range;

		next_node = cds_ja_lookup_below_equal(ja, range->end + 1,
			NULL);
		/* Next range temporarily hidden due to concurrent add. */
		if (!next_node)
			goto retry;

		next_range = caa_container_of(next_node,
			struct cds_ja_range, ja_node);
		if (next_range->start != range->end + 1)
			goto retry;

		lock_ranges[nr_lock++] = next_range;
		if (next_range->type != CDS_JA_RANGE_ALLOCATED)
			merge_ranges[nr_merge++] = next_range;
	}

	/* Acquire locks in increasing key order for range merge */
	for (i = 0; i < nr_lock; i++)
		pthread_mutex_lock(&lock_ranges[i]->lock);
	if (range->type != CDS_JA_RANGE_ALLOCATED) {
		ret = -ENOENT;
		goto unlock_error;
	}
	/* Ensure they are valid */
	for (i = 0; i < nr_lock; i++) {
		if (lock_ranges[i]->type == CDS_JA_RANGE_REMOVED)
			goto unlock_retry;
	}

	/* Create new free range */
	start = merge_ranges[0]->start;
	end = merge_ranges[nr_merge - 1]->end;
	new_range = range_create(start, end, NULL, CDS_JA_RANGE_FREE);
	if (!new_range) {
		ret = -ENOMEM;
		goto unlock_error;
	}
	pthread_mutex_lock(&new_range->lock);

	dbg_printf("ADD RANGE: %" PRIu64 "-%" PRIu64 " %s.\n",
		new_range->start, new_range->end,
		new_range->type == CDS_JA_RANGE_ALLOCATED ?
			"allocated" : "free");

	ret = cds_ja_add(ja, start, &new_range->ja_node);
	assert(!ret);
	/* Remove old ranges */
	for (i = 0; i < nr_merge; i++) {
		dbg_printf("REM RANGE: %" PRIu64 "-%" PRIu64 " %s.\n",
			merge_ranges[i]->start, merge_ranges[i]->end,
			merge_ranges[i]->type == CDS_JA_RANGE_ALLOCATED ?
				"allocated" : "free");
		ret = cds_ja_del(ja, merge_ranges[i]->start,
			&merge_ranges[i]->ja_node);
		assert(!ret);
		merge_ranges[i]->type = CDS_JA_RANGE_REMOVED;
	}
	for (i = 0; i < nr_lock; i++)
		pthread_mutex_unlock(&lock_ranges[i]->lock);
	pthread_mutex_unlock(&new_range->lock);
	/* Free old merged ranges */
	for (i = 0; i < nr_merge; i++)
		rcu_free_range(ja, merge_ranges[i]);

	dbg_printf("<SUCCEED>\n");

	return 0;

	/* retry paths */
unlock_retry:
	for (i = 0; i < nr_lock; i++)
		pthread_mutex_unlock(&lock_ranges[i]->lock);
	goto retry;
	/* error paths */
unlock_error:
	for (i = 0; i < nr_lock; i++)
		pthread_mutex_unlock(&lock_ranges[i]->lock);
	return ret;
}
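
/*
 * Example use of cds_ja_range_del() (a sketch): the caller must not
 * hold the range lock when calling del, since del acquires the locks
 * of the range and of its neighbors itself. Existence of "range" is
 * guaranteed by the enclosing read-side critical section:
 *
 *	rcu_read_lock();
 *	range = cds_ja_range_lookup(ja, key);
 *	if (range)
 *		ret = cds_ja_range_del(ja, range);
 *	rcu_read_unlock();
 *
 * A -ENOENT return means the range was concurrently removed.
 */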

struct cds_ja *_cds_ja_range_new(const struct rcu_flavor_struct *flavor)
{
	struct cds_ja_range *range;
	struct cds_ja *ja;
	int ret;

	ja = _cds_ja_new(CDS_JA_RANGE_KEY_BITS, flavor);
	if (!ja)
		return NULL;
	range = range_create(0, UINT64_MAX - 1, NULL, CDS_JA_RANGE_FREE);
	if (!range)
		goto free_ja;
	cds_lfht_rcu_flavor(ja->ht)->read_lock();
	ret = cds_ja_add(ja, 0, &range->ja_node);
	cds_lfht_rcu_flavor(ja->ht)->read_unlock();
	if (ret)
		goto free_range;
	return ja;

free_range:
	free_range(range);
free_ja:
	ret = cds_ja_destroy(ja);
	assert(!ret);
	return NULL;
}

int cds_ja_range_validate(struct cds_ja *ja)
{
	uint64_t iter_key, start, end, last_end = UINT64_MAX;
	struct cds_ja_node *ja_node, *last_node;
	int ret = 0;

	cds_lfht_rcu_flavor(ja->ht)->read_lock();
	cds_ja_for_each_key_rcu(ja, iter_key, ja_node) {
		struct cds_ja_range *range;
		struct cds_ja_node *first_node;

		first_node = ja_node;
		cds_ja_for_each_duplicate_rcu(ja_node)
			last_node = ja_node;
		if (last_node != first_node) {
			struct cds_ja_range *first_range = caa_container_of(first_node,
				struct cds_ja_range, ja_node);
			struct cds_ja_range *last_range = caa_container_of(last_node,
				struct cds_ja_range, ja_node);

			fprintf(stderr, "found duplicate node: first %" PRIu64 "-%" PRIu64 " last %" PRIu64 "-%" PRIu64 "\n",
				first_range->start, first_range->end,
				last_range->start, last_range->end);
			ret |= -1;
		}
		range = caa_container_of(last_node,
			struct cds_ja_range, ja_node);
		start = range->start;
		end = range->end;
		if (last_end != UINT64_MAX) {
			if (start != last_end + 1) {
				fprintf(stderr, "ja range discrepancy: last end: %" PRIu64 ", start: %" PRIu64 "\n",
					last_end, start);
				ret |= -1;
			}
		}
		last_end = end;
	}
	if (last_end != UINT64_MAX - 1) {
		fprintf(stderr, "ja range error: end of last range is: %" PRIu64 "\n",
			last_end);
		ret |= -1;
	}
	cds_lfht_rcu_flavor(ja->ht)->read_unlock();
	return ret;
}
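
/*
 * Example: test code can assert full, non-overlapping coverage of the
 * key space after a run of concurrent add/del operations (a sketch;
 * cds_ja_range_validate() takes the read-side lock itself, but the
 * calling thread must still be registered with the RCU flavor):
 *
 *	assert(!cds_ja_range_validate(ja));
 */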

int cds_ja_range_destroy(struct cds_ja *ja,
		void (*free_priv)(void *ptr))
{
	uint64_t key;
	struct cds_ja_node *ja_node;
	int ret = 0;

	cds_lfht_rcu_flavor(ja->ht)->read_lock();
	cds_ja_for_each_key_rcu(ja, key, ja_node) {
		struct cds_ja_node *tmp_node;

		cds_ja_for_each_duplicate_safe_rcu(ja_node, tmp_node) {
			struct cds_ja_range *range;

			range = caa_container_of(ja_node,
				struct cds_ja_range, ja_node);
			ret = cds_ja_del(ja, key, &range->ja_node);
			if (ret)
				goto error;
			if (free_priv)
				free_priv(range->priv);
			/* We are the only user of the Judy array: OK to free now */
			free_range(range);
		}
	}
	cds_lfht_rcu_flavor(ja->ht)->read_unlock();
	return cds_ja_destroy(ja);

error:
	cds_lfht_rcu_flavor(ja->ht)->read_unlock();
	return ret;
}