From 61009379872b40dbd0d69bb3baaf82e55ec58b39 Mon Sep 17 00:00:00 2001
From: Mathieu Desnoyers <mathieu.desnoyers@efficios.com>
Date: Thu, 8 Mar 2012 20:16:27 -0500
Subject: [PATCH] RCU Judy Array Design and initial files

Signed-off-by: Mathieu Desnoyers <mathieu.desnoyers@efficios.com>
---
 Makefile.am            |   6 +-
 rcuja/design.txt       | 373 +++++++++++++++++++++++++++++++++++++++++
 rcuja/rcuja-internal.h |  27 +++
 rcuja/rcuja.c          |  25 +++
 4 files changed, 429 insertions(+), 2 deletions(-)
 create mode 100644 rcuja/design.txt
 create mode 100644 rcuja/rcuja-internal.h
 create mode 100644 rcuja/rcuja.c

diff --git a/Makefile.am b/Makefile.am
index 0a4d357..df170f3 100644
--- a/Makefile.am
+++ b/Makefile.am
@@ -17,7 +17,7 @@ nobase_dist_include_HEADERS = urcu/compiler.h urcu/hlist.h urcu/list.h \
 		urcu/wfqueue.h urcu/rculfstack.h urcu/rculfqueue.h \
 		urcu/ref.h urcu/cds.h urcu/urcu_ref.h urcu/urcu-futex.h \
 		urcu/uatomic_arch.h urcu/rculfhash.h urcu/wfcqueue.h \
-		urcu/lfstack.h \
+		urcu/rcuja.h urcu/lfstack.h \
 		$(top_srcdir)/urcu/map/*.h \
 		$(top_srcdir)/urcu/static/*.h \
 		urcu/tls-compat.h
@@ -45,6 +45,8 @@ endif
 RCULFHASH = rculfhash.c rculfhash-mm-order.c rculfhash-mm-chunk.c \
 		rculfhash-mm-mmap.c
 
+RCUJA = rcuja/rcuja.c
+
 lib_LTLIBRARIES = liburcu-common.la \
 		liburcu.la liburcu-qsbr.la \
 		liburcu-mb.la liburcu-signal.la liburcu-bp.la \
@@ -74,7 +76,7 @@ liburcu_bp_la_SOURCES = urcu-bp.c urcu-pointer.c $(COMPAT)
 liburcu_bp_la_LIBADD = liburcu-common.la
 
 liburcu_cds_la_SOURCES = rculfqueue.c rculfstack.c lfstack.c \
-	$(RCULFHASH) $(COMPAT)
+	$(RCULFHASH) $(RCUJA) $(COMPAT)
 liburcu_cds_la_LIBADD = liburcu-common.la
 
 pkgconfigdir = $(libdir)/pkgconfig
diff --git a/rcuja/design.txt b/rcuja/design.txt
new file mode 100644
index 0000000..775ec97
--- /dev/null
+++ b/rcuja/design.txt
@@ -0,0 +1,373 @@
+RCU Judy Array Design
+Mathieu Desnoyers <mathieu.desnoyers@efficios.com>
+March 8, 2012
+
+Initial ideas based on the released Judy Shop Manual
+(http://judy.sourceforge.net/). Judy was invented by Doug Baskins and
+implemented by Hewlett-Packard.
+
+Thresholds and RCU-specific analysis is introduced in this document.
+
+Advantages of using Judy Array (compressed nodes) for RCU tree:
+- no rebalancing
+- no transplant
+- RCU-friendly!
+- favor cache-line alignment of structures
+
+Disadvantage:
+- updates that need to reallocate nodes are slower than, e.g. non-rcu
+  red-black trees.
+
+Choice: Using 256 entries intermediate nodes (index can be represented
+on 8 bits): 4 levels on 32-bit, 8 levels on 64-bit
+
+
+* Node types (from less dense node to most dense)
+
+
+- empty node:
+
+Parent pointer is NULL.
+
+
+- Type A: sequential search in value and pointer arrays
+
++ Add/removal just needs to update value and pointer array, single-entry
+  (non-RCU...). For RCU, we might need to update the entire node anyway.
+- Requires sequential search through all value array for lookup fail
+  test.
+
+Filled at 3 entries max 64-bit
+8 bits indicating number of children
+Array of 8-bit values followed by array of associated pointers.
+64-bit: 1 byte + 3 bytes + 4 bytes pad + 3*8 = 32 bytes
+
+-> up to this point on 64-bit, sequential lookup and pointer read fit in
+a 32-byte cache line.
+  - lookup fail&success: 1 cache-line.
+
+Filled at 6 entries max 32-bit, 7 entries max 64-bit
+8 bits indicating number of children
+Array of 8-bit values followed by array of associated pointers.
+32-bit: 1 byte + 6 bytes + 1 byte pad + 6*4bytes = 32 bytes
+64-bit: 1 byte + 7 bytes + 7*8 = 64 bytes
+
+-> up to this point on 32-bit, sequential lookup and pointer read fit in
+a 32-byte cache line.
+  - lookup fail&success: 1 cache-line.
+
+Filled at 12 entries max 32-bit, 14 entries max 64-bit
+8 bits indicating number of children
+Array of 8-bit values followed by array of associated pointers.
+32-bit: 1 byte + 12 bytes + 3 bytes pad + 12*4bytes = 64 bytes
+64-bit: 1 byte + 14 bytes + 1 byte pad + 14*8 = 128 bytes
+
+Filled at 25 entries max 32-bit, 28 entries max 64-bit
+8 bits indicating number of children
+Array of 8-bit values followed by array of associated pointers.
+32-bit: 1 byte + 25 bytes + 2 bytes pad + 25*4bytes = 128 bytes
+64-bit: 1 byte + 28 bytes + 3 bytes pad + 28*8 = 256 bytes
+
+---> up to this point, on both 32-bit and 64-bit, the sequential lookup
+in values array fits in a 32-byte cache line.
+  - lookup failure: 1 cache line.
+  - lookup success: 2 cache lines.
+
+The two below are listed for completeness sake, but because they require
+2 32-byte cache lines for lookup, these are deemed inappropriate.
+
+Filled at 51 entries max 32-bit, 56 entries max 64-bit
+8 bits indicating number of children
+Array of 8-bit values followed by array of associated pointers.
+32-bit: 1 byte + 51 bytes + 51*4bytes = 256 bytes
+64-bit: 1 byte + 56 bytes + 7 bytes pad + 56*8 = 512 bytes
+
+Filled at 102 entries max 32-bit, 113 entries max 64-bit
+8 bits indicating number of children
+Array of 8-bit values followed by array of associated pointers.
+32-bit: 1 byte + 102 bytes + 1 byte pad + 102*4bytes = 512 bytes
+64-bit: 1 byte + 113 bytes + 6 bytes pad + 113*8 = 1024 bytes
+
+
+- Type B: bitmap, followed by pointers array.
+
+bitmask (256 entries -> 256 bits -> 32 bytes) of populated children
+followed by an array of children pointers, in same order as appears in
+the bitmask
+
++ Allows lookup failure to use 32-byte cache-line only. (1 cacheline)
++ Allows lookup success to use 32-byte cache-line (bitmap),
+  followed by direct lookup into pointer array. (2 cachelines)
+
+Filled at 8 entries 32-bit, 12 entries 64-bit
+32-bit: 32 + (4*8) ->  64 bytes
+64-bit: 32 + (8*12) ->  128 bytes
+
+Filled at 24 entries 32-bit, 28 entries 64-bit
+32-bit: 32 + (4*24) ->  128 bytes
+64-bit: 32 + (8*28) ->  256 bytes
+
+Filled at 56 entries 32-bit, 60 entries 64-bit
+32-bit: 32 + (4*56) ->  256 bytes
+64-bit: 32 + (8*60) ->  512 bytes
+
+Filled at 120 entries 32-bit, 124 entries 64-bit
+32-bit: 32 + (4*95) ->  512 bytes
+64-bit: 32 + (8*124) ->  1024 bytes
+
+
+- Type C: pigeon-hole array
+
+Filled at 47.2%/48.8% or more (32-bit: 121 entries+, 64-bit: 125 entries+)
+Array of children node pointers. Pointers NULL if no child at index.
+32-bit: 4*256 = 1024 bytes
+64-bit: 8*256 = 2048 bytes
+
+
+* Analysis of the thresholds:
+
+Analysis of number of cache-lines touched for each node, per-node-type,
+depending on the number of children per node, as we increment the number
+of children from 0 to 256. Through this, we choose number of children
+thresholds at which it is worthwhile to use a different node type.
+
+Per node:
+
+- ALWAYS 1 cache line hit for lookup failure (all cases)
+
+32-bit
+
+- Unexisting
+
+0 children
+
+- Type A: sequential search in value and pointer arrays
+- 1 cache line hit for lookup success
+- 32 bytes storage
+
+up to 6 children
+
+- 2 cache line hit for lookup success
+- 64 bytes storage
+
+up to 12 children
+
+- Type B: bitmap, followed by pointers array.
+- 2 cache line hit for lookup success
+- 128 bytes storage
+
+up to 24 children
+
+- 256 bytes storage
+up to 56 children
+
+- 512 bytes storage
+up to 120 children
+
+- Type C: pigeon-hole array
+- 1 cache line hit for lookup success
+- 1024 bytes storage
+
+up to 256 children
+
+
+64-bit
+
+- Unexisting
+
+0 children
+
+- Type A: sequential search in value and pointer arrays
+- 1 cache line hit for lookup success
+- 32 bytes storage
+
+up to 3 children
+
+- 2 cache line hit for lookup success
+- 64 bytes storage
+
+up to 7 children
+
+- 128 bytes storage
+
+up to 14 children
+
+- Type B: bitmap, followed by pointers array.
+- 2 cache line hit for lookup success
+- 256 bytes storage
+
+up to 28 children
+
+- 512 bytes storage
+up to 60 children
+
+- 1024 bytes storage
+up to 124 children
+
+- Type C: pigeon-hole array
+- 1 cache line hit for lookup success
+- 2048 bytes storage
+
+up to 256 children
+
+
+* Analysis of node type encoding and node pointers:
+
+Lookups are _always_ from the top of the tree going down. This
+facilitates RCU replacement as we only keep track of pointers going
+downward.
+
+Type of node encoded in the parent's pointer. Need to reserve 2
+least-significant bits.
+
+Types of children:
+
+enum child_type {
+	LINEAR = 0,	/* Type A */
+			/* 32-bit: 1 to 12 children, 32 to 64 bytes */
+			/* 64-bit: 1 to 14 children, 32 to 128 bytes */
+	BITMAP = 1,	/* Type B */
+			/* 32-bit: 13 to 120 children, 128 to 512 bytes */
+			/* 64-bit: 15 to 124 children, 256 to 1024 bytes */
+	PIGEON = 2,	/* Type C */
+			/* 32-bit: 121 to 256 children, 1024 bytes */
+			/* 64-bit: 125 to 256 children, 2048 bytes */
+	LEAF   = 3,	/* Leaf node */
+};
+
+If entire pointer is NULL, children is empty.
+
+
+* Lookup and Update Algorithms
+
+Let's propose a quite simple scheme that uses a mutex on nodes to manage
+update concurrency. It's certainly not optimal in terms of concurrency
+management within a node, but it has the advantage of being simple to
+implement and understand.
+
+We need to keep a count of the number of children nodes (for each node),
+to keep track of when the node type thresholds are reached. It would be
+important to put an hysteresis loop so we don't change between node
+types too often for a loop on add/removal of the same node.
+
+We acquire locks from child to parent, nested. We take all locks
+required to perform a given update in the tree (but no more) to keep it
+consistent with respect to number of children per node.
+
+If check for node being gc'd (always under node lock) fails, we simply
+need to release the lock and lookup the node again.
+
+
+- Leaf lookup
+
+rcu_read_lock()
+
+RCU-lookup each level of the tree. If level is not populated, fail.
+Until we reach the leaf node.
+
+rcu_read_unlock()
+
+
+- Leaf insertion
+
+A) Lookup
+
+rcu_read_lock()
+RCU-lookup insert position. Find location in tree where nodes are
+missing for this insertion. If leaf is already present, insert fails,
+releasing the rcu read lock.  The insert location consists of a parent
+node to which we want to attach a new node.
+
+B) Lock
+
+RCU-lookup parent node. Take the parent lock. If the parent needs to be
+reallocated to make room for this insertion, RCU-lookup parent-parent
+node and take the parent-parent lock.  For each lock taken, check if
+node is being gc'd. If gc'd, release lock, re-RCU-lookup this node, and
+retry.
+
+C) Create
+
+Construct the whole branch from the new topmost intermediate node down
+to the new leaf node we are inserting. 
+
+D) Populate:
+  - If parent node reallocation is needed:
+     Reallocate the parent node, adding the new branch to it, and
+     increment its node count.
+     set gc flag in old nodes.
+     call_rcu free for all old nodes.
+     Populate new parent node with rcu_assign_pointer.
+  - Else:
+    Increment parent node count.
+    Use rcu_assign_pointer to populate this new branch into the parent
+    node.
+
+E) Locks
+
+Release parent and (if taken) parent-parent locks.
+rcu_read_unlock()
+
+
+- Leaf removal
+
+A) Lookup
+
+rcu_read_lock()
+RCU-lookup leaf to remove. If leaf is missing, fail and release rcu
+read lock.
+
+B) Lock
+
+RCU-lookup parent. Take the parent lock. If the parent needs to be
+reallocated because it would be too large for the decremented number of
+children, RCU-lookup parent-parent and take the parent-parent lock. Do
+so recursively until no node reallocation is needed, or until root is
+reached.
+
+For each lock taken, check if node is being gc'd. If gc'd, release lock,
+re-RCU-lookup this node, and retry.
+
+C) Create
+
+The branch (or portion of branch) consisting of taken locks necessarily
+has a simple node removal or update as operation to do on its top node.
+
+If the operation is a node removal, then, necessarily, the entire branch
+under the node removal operation will simply disappear. No node
+allocation is needed.
+
+Else, if the operation is a child node reallocation, the child node will
+necessarily do a node removal. So _its_ entire child branch will
+disappear. So reallocate this child node without the removed branch
+(remember to decrement its nr children count).
+
+D) Populate
+
+No reallocation case: simply set the appropriate child pointer in the
+topmost locked node to NULL. Decrement its nr children count.
+
+Reallocation case: set the child pointer in the topmost locked node to
+the newly allocated node.
+set old nodes gc flag.
+call_rcu free for all old nodes.
+
+E) Locks
+
+Release all locks.
+rcu_read_unlock()
+
+
+For the various types of nodes:
+
+- sequential search (type A)
+  - RCU replacement: mutex
+  - Entry update: mutex
+
+- bitmap followed by pointer array (type B)
+  - RCU replacement: mutex
+  - Entry update: mutex
+
+- pigeon hole array (type C)
+  - RCU replacement: mutex
+  - Entry update: mutex
diff --git a/rcuja/rcuja-internal.h b/rcuja/rcuja-internal.h
new file mode 100644
index 0000000..604cb8e
--- /dev/null
+++ b/rcuja/rcuja-internal.h
@@ -0,0 +1,27 @@
+#ifndef _URCU_RCUJA_INTERNAL_H
+#define _URCU_RCUJA_INTERNAL_H
+
+/*
+ * rcuja/rcuja-internal.h
+ *
+ * Userspace RCU library - RCU Judy Array Internal Header
+ *
+ * Copyright 2012 - Mathieu Desnoyers <mathieu.desnoyers@efficios.com>
+ *
+ * This library is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * This library is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with this library; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+
+#endif /* _URCU_RCUJA_INTERNAL_H */
diff --git a/rcuja/rcuja.c b/rcuja/rcuja.c
new file mode 100644
index 0000000..6dcd9c6
--- /dev/null
+++ b/rcuja/rcuja.c
@@ -0,0 +1,25 @@
+/*
+ * rcuja/rcuja.c
+ *
+ * Userspace RCU library - RCU Judy Array
+ *
+ * Copyright 2012 - Mathieu Desnoyers <mathieu.desnoyers@efficios.com>
+ *
+ * This library is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * This library is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with this library; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#include <urcu/rcuja.h>
+#include "rcuja-internal.h"
+
-- 
2.34.1