From 61009379872b40dbd0d69bb3baaf82e55ec58b39 Mon Sep 17 00:00:00 2001 From: Mathieu Desnoyers Date: Thu, 8 Mar 2012 20:16:27 -0500 Subject: [PATCH] RCU Judy Array Design and initial files Signed-off-by: Mathieu Desnoyers --- Makefile.am | 6 +- rcuja/design.txt | 373 +++++++++++++++++++++++++++++++++++++++++ rcuja/rcuja-internal.h | 27 +++ rcuja/rcuja.c | 25 +++ 4 files changed, 429 insertions(+), 2 deletions(-) create mode 100644 rcuja/design.txt create mode 100644 rcuja/rcuja-internal.h create mode 100644 rcuja/rcuja.c diff --git a/Makefile.am b/Makefile.am index 0a4d357..df170f3 100644 --- a/Makefile.am +++ b/Makefile.am @@ -17,7 +17,7 @@ nobase_dist_include_HEADERS = urcu/compiler.h urcu/hlist.h urcu/list.h \ urcu/wfqueue.h urcu/rculfstack.h urcu/rculfqueue.h \ urcu/ref.h urcu/cds.h urcu/urcu_ref.h urcu/urcu-futex.h \ urcu/uatomic_arch.h urcu/rculfhash.h urcu/wfcqueue.h \ - urcu/lfstack.h \ + urcu/rcuja.h urcu/lfstack.h \ $(top_srcdir)/urcu/map/*.h \ $(top_srcdir)/urcu/static/*.h \ urcu/tls-compat.h @@ -45,6 +45,8 @@ endif RCULFHASH = rculfhash.c rculfhash-mm-order.c rculfhash-mm-chunk.c \ rculfhash-mm-mmap.c +RCUJA = rcuja/rcuja.c + lib_LTLIBRARIES = liburcu-common.la \ liburcu.la liburcu-qsbr.la \ liburcu-mb.la liburcu-signal.la liburcu-bp.la \ @@ -74,7 +76,7 @@ liburcu_bp_la_SOURCES = urcu-bp.c urcu-pointer.c $(COMPAT) liburcu_bp_la_LIBADD = liburcu-common.la liburcu_cds_la_SOURCES = rculfqueue.c rculfstack.c lfstack.c \ - $(RCULFHASH) $(COMPAT) + $(RCULFHASH) $(RCUJA) $(COMPAT) liburcu_cds_la_LIBADD = liburcu-common.la pkgconfigdir = $(libdir)/pkgconfig diff --git a/rcuja/design.txt b/rcuja/design.txt new file mode 100644 index 0000000..775ec97 --- /dev/null +++ b/rcuja/design.txt @@ -0,0 +1,373 @@ +RCU Judy Array Design +Mathieu Desnoyers +March 8, 2012 + +Initial ideas based on the released Judy Shop Manual +(http://judy.sourceforge.net/). Judy was invented by Doug Baskins and +implemented by Hewlett-Packard. + +Thresholds and RCU-specific analysis is introduced in this document. + +Advantages of using Judy Array (compressed nodes) for RCU tree: +- no rebalancing +- no transplant +- RCU-friendly! +- favor cache-line alignment of structures + +Disadvantage: +- updates that need to reallocate nodes are slower than, e.g. non-rcu + red-black trees. + +Choice: Using 256 entries intermediate nodes (index can be represented +on 8 bits): 4 levels on 32-bit, 8 levels on 64-bit + + +* Node types (from less dense node to most dense) + + +- empty node: + +Parent pointer is NULL. + + +- Type A: sequential search in value and pointer arrays + ++ Add/removal just needs to update value and pointer array, single-entry + (non-RCU...). For RCU, we might need to update the entire node anyway. +- Requires sequential search through all value array for lookup fail + test. + +Filled at 3 entries max 64-bit +8 bits indicating number of children +Array of 8-bit values followed by array of associated pointers. +64-bit: 1 byte + 3 bytes + 4 bytes pad + 3*8 = 32 bytes + +-> up to this point on 64-bit, sequential lookup and pointer read fit in +a 32-byte cache line. + - lookup fail&success: 1 cache-line. + +Filled at 6 entries max 32-bit, 7 entries max 64-bit +8 bits indicating number of children +Array of 8-bit values followed by array of associated pointers. +32-bit: 1 byte + 6 bytes + 1 byte pad + 6*4bytes = 32 bytes +64-bit: 1 byte + 7 bytes + 7*8 = 64 bytes + +-> up to this point on 32-bit, sequential lookup and pointer read fit in +a 32-byte cache line. + - lookup fail&success: 1 cache-line. + +Filled at 12 entries max 32-bit, 14 entries max 64-bit +8 bits indicating number of children +Array of 8-bit values followed by array of associated pointers. +32-bit: 1 byte + 12 bytes + 3 bytes pad + 12*4bytes = 64 bytes +64-bit: 1 byte + 14 bytes + 1 byte pad + 14*8 = 128 bytes + +Filled at 25 entries max 32-bit, 28 entries max 64-bit +8 bits indicating number of children +Array of 8-bit values followed by array of associated pointers. +32-bit: 1 byte + 25 bytes + 2 bytes pad + 25*4bytes = 128 bytes +64-bit: 1 byte + 28 bytes + 3 bytes pad + 28*8 = 256 bytes + +---> up to this point, on both 32-bit and 64-bit, the sequential lookup +in values array fits in a 32-byte cache line. + - lookup failure: 1 cache line. + - lookup success: 2 cache lines. + +The two below are listed for completeness sake, but because they require +2 32-byte cache lines for lookup, these are deemed inappropriate. + +Filled at 51 entries max 32-bit, 56 entries max 64-bit +8 bits indicating number of children +Array of 8-bit values followed by array of associated pointers. +32-bit: 1 byte + 51 bytes + 51*4bytes = 256 bytes +64-bit: 1 byte + 56 bytes + 7 bytes pad + 56*8 = 512 bytes + +Filled at 102 entries max 32-bit, 113 entries max 64-bit +8 bits indicating number of children +Array of 8-bit values followed by array of associated pointers. +32-bit: 1 byte + 102 bytes + 1 byte pad + 102*4bytes = 512 bytes +64-bit: 1 byte + 113 bytes + 6 bytes pad + 113*8 = 1024 bytes + + +- Type B: bitmap, followed by pointers array. + +bitmask (256 entries -> 256 bits -> 32 bytes) of populated children +followed by an array of children pointers, in same order as appears in +the bitmask + ++ Allows lookup failure to use 32-byte cache-line only. (1 cacheline) ++ Allows lookup success to use 32-byte cache-line (bitmap), + followed by direct lookup into pointer array. (2 cachelines) + +Filled at 8 entries 32-bit, 12 entries 64-bit +32-bit: 32 + (4*8) -> 64 bytes +64-bit: 32 + (8*12) -> 128 bytes + +Filled at 24 entries 32-bit, 28 entries 64-bit +32-bit: 32 + (4*24) -> 128 bytes +64-bit: 32 + (8*28) -> 256 bytes + +Filled at 56 entries 32-bit, 60 entries 64-bit +32-bit: 32 + (4*56) -> 256 bytes +64-bit: 32 + (8*60) -> 512 bytes + +Filled at 120 entries 32-bit, 124 entries 64-bit +32-bit: 32 + (4*95) -> 512 bytes +64-bit: 32 + (8*124) -> 1024 bytes + + +- Type C: pigeon-hole array + +Filled at 47.2%/48.8% or more (32-bit: 121 entries+, 64-bit: 125 entries+) +Array of children node pointers. Pointers NULL if no child at index. +32-bit: 4*256 = 1024 bytes +64-bit: 8*256 = 2048 bytes + + +* Analysis of the thresholds: + +Analysis of number of cache-lines touched for each node, per-node-type, +depending on the number of children per node, as we increment the number +of children from 0 to 256. Through this, we choose number of children +thresholds at which it is worthwhile to use a different node type. + +Per node: + +- ALWAYS 1 cache line hit for lookup failure (all cases) + +32-bit + +- Unexisting + +0 children + +- Type A: sequential search in value and pointer arrays +- 1 cache line hit for lookup success +- 32 bytes storage + +up to 6 children + +- 2 cache line hit for lookup success +- 64 bytes storage + +up to 12 children + +- Type B: bitmap, followed by pointers array. +- 2 cache line hit for lookup success +- 128 bytes storage + +up to 24 children + +- 256 bytes storage +up to 56 children + +- 512 bytes storage +up to 120 children + +- Type C: pigeon-hole array +- 1 cache line hit for lookup success +- 1024 bytes storage + +up to 256 children + + +64-bit + +- Unexisting + +0 children + +- Type A: sequential search in value and pointer arrays +- 1 cache line hit for lookup success +- 32 bytes storage + +up to 3 children + +- 2 cache line hit for lookup success +- 64 bytes storage + +up to 7 children + +- 128 bytes storage + +up to 14 children + +- Type B: bitmap, followed by pointers array. +- 2 cache line hit for lookup success +- 256 bytes storage + +up to 28 children + +- 512 bytes storage +up to 60 children + +- 1024 bytes storage +up to 124 children + +- Type C: pigeon-hole array +- 1 cache line hit for lookup success +- 2048 bytes storage + +up to 256 children + + +* Analysis of node type encoding and node pointers: + +Lookups are _always_ from the top of the tree going down. This +facilitates RCU replacement as we only keep track of pointers going +downward. + +Type of node encoded in the parent's pointer. Need to reserve 2 +least-significant bits. + +Types of children: + +enum child_type { + LINEAR = 0, /* Type A */ + /* 32-bit: 1 to 12 children, 32 to 64 bytes */ + /* 64-bit: 1 to 14 children, 32 to 128 bytes */ + BITMAP = 1, /* Type B */ + /* 32-bit: 13 to 120 children, 128 to 512 bytes */ + /* 64-bit: 15 to 124 children, 256 to 1024 bytes */ + PIGEON = 2, /* Type C */ + /* 32-bit: 121 to 256 children, 1024 bytes */ + /* 64-bit: 125 to 256 children, 2048 bytes */ + LEAF = 3, /* Leaf node */ +}; + +If entire pointer is NULL, children is empty. + + +* Lookup and Update Algorithms + +Let's propose a quite simple scheme that uses a mutex on nodes to manage +update concurrency. It's certainly not optimal in terms of concurrency +management within a node, but it has the advantage of being simple to +implement and understand. + +We need to keep a count of the number of children nodes (for each node), +to keep track of when the node type thresholds are reached. It would be +important to put an hysteresis loop so we don't change between node +types too often for a loop on add/removal of the same node. + +We acquire locks from child to parent, nested. We take all locks +required to perform a given update in the tree (but no more) to keep it +consistent with respect to number of children per node. + +If check for node being gc'd (always under node lock) fails, we simply +need to release the lock and lookup the node again. + + +- Leaf lookup + +rcu_read_lock() + +RCU-lookup each level of the tree. If level is not populated, fail. +Until we reach the leaf node. + +rcu_read_unlock() + + +- Leaf insertion + +A) Lookup + +rcu_read_lock() +RCU-lookup insert position. Find location in tree where nodes are +missing for this insertion. If leaf is already present, insert fails, +releasing the rcu read lock. The insert location consists of a parent +node to which we want to attach a new node. + +B) Lock + +RCU-lookup parent node. Take the parent lock. If the parent needs to be +reallocated to make room for this insertion, RCU-lookup parent-parent +node and take the parent-parent lock. For each lock taken, check if +node is being gc'd. If gc'd, release lock, re-RCU-lookup this node, and +retry. + +C) Create + +Construct the whole branch from the new topmost intermediate node down +to the new leaf node we are inserting. + +D) Populate: + - If parent node reallocation is needed: + Reallocate the parent node, adding the new branch to it, and + increment its node count. + set gc flag in old nodes. + call_rcu free for all old nodes. + Populate new parent node with rcu_assign_pointer. + - Else: + Increment parent node count. + Use rcu_assign_pointer to populate this new branch into the parent + node. + +E) Locks + +Release parent and (if taken) parent-parent locks. +rcu_read_unlock() + + +- Leaf removal + +A) Lookup + +rcu_read_lock() +RCU-lookup leaf to remove. If leaf is missing, fail and release rcu +read lock. + +B) Lock + +RCU-lookup parent. Take the parent lock. If the parent needs to be +reallocated because it would be too large for the decremented number of +children, RCU-lookup parent-parent and take the parent-parent lock. Do +so recursively until no node reallocation is needed, or until root is +reached. + +For each lock taken, check if node is being gc'd. If gc'd, release lock, +re-RCU-lookup this node, and retry. + +C) Create + +The branch (or portion of branch) consisting of taken locks necessarily +has a simple node removal or update as operation to do on its top node. + +If the operation is a node removal, then, necessarily, the entire branch +under the node removal operation will simply disappear. No node +allocation is needed. + +Else, if the operation is a child node reallocation, the child node will +necessarily do a node removal. So _its_ entire child branch will +disappear. So reallocate this child node without the removed branch +(remember to decrement its nr children count). + +D) Populate + +No reallocation case: simply set the appropriate child pointer in the +topmost locked node to NULL. Decrement its nr children count. + +Reallocation case: set the child pointer in the topmost locked node to +the newly allocated node. +set old nodes gc flag. +call_rcu free for all old nodes. + +E) Locks + +Release all locks. +rcu_read_unlock() + + +For the various types of nodes: + +- sequential search (type A) + - RCU replacement: mutex + - Entry update: mutex + +- bitmap followed by pointer array (type B) + - RCU replacement: mutex + - Entry update: mutex + +- pigeon hole array (type C) + - RCU replacement: mutex + - Entry update: mutex diff --git a/rcuja/rcuja-internal.h b/rcuja/rcuja-internal.h new file mode 100644 index 0000000..604cb8e --- /dev/null +++ b/rcuja/rcuja-internal.h @@ -0,0 +1,27 @@ +#ifndef _URCU_RCUJA_INTERNAL_H +#define _URCU_RCUJA_INTERNAL_H + +/* + * rcuja/rcuja-internal.h + * + * Userspace RCU library - RCU Judy Array Internal Header + * + * Copyright 2012 - Mathieu Desnoyers + * + * This library is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License as published by the Free Software Foundation; either + * version 2.1 of the License, or (at your option) any later version. + * + * This library is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * Lesser General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public + * License along with this library; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA + */ + + +#endif /* _URCU_RCUJA_INTERNAL_H */ diff --git a/rcuja/rcuja.c b/rcuja/rcuja.c new file mode 100644 index 0000000..6dcd9c6 --- /dev/null +++ b/rcuja/rcuja.c @@ -0,0 +1,25 @@ +/* + * rcuja/rcuja.c + * + * Userspace RCU library - RCU Judy Array + * + * Copyright 2012 - Mathieu Desnoyers + * + * This library is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License as published by the Free Software Foundation; either + * version 2.1 of the License, or (at your option) any later version. + * + * This library is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * Lesser General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public + * License along with this library; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA + */ + +#include +#include "rcuja-internal.h" + -- 2.34.1