From 2d6debff95ad695255d2ea9d590d1e418590b238 Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Fri, 10 Apr 2009 12:41:19 -0400 Subject: [PATCH] Split out architecture-dependent definitions into api.h and arch.h. Break out architecture-specific definitions into api.h and arch.h. Modify Makefile to add pthreads-x86 and pthreads-ppc targets to adapt to either architecture. Create api_x86.h, api_ppc.h, arch_x86.h, and arch_ppc.h files. The api/arch distinction is historical in nature. In a perfect world, these would be merged. In reality, the __thread storage class does not adapt nicely to accessing one thread's variables from another thread, though sufficiently insane gcc hackery could probably make this work. In the event, arch.h manually constructs an array of pointers to the __thread variables that need cross-thread access, while api.h punts and uses an array for the per-thread variables themselves, paying a significant performance penalty for so doing. Signed-off-by: Paul E. McKenney Signed-off-by: Mathieu Desnoyers --- Makefile | 8 +++ api.h | 4 +- rcutorture.h | 4 +- test_rwlock_timing.c | 16 ------ test_urcu_timing.c | 16 ------ urcu.h | 116 +------------------------------------------ 6 files changed, 14 insertions(+), 150 deletions(-) diff --git a/Makefile b/Makefile index 2a04467..70709ac 100644 --- a/Makefile +++ b/Makefile @@ -11,6 +11,14 @@ SRC_DEP=`echo $^ | sed 's/[^ ]*.h//g'` all: test_urcu test_urcu_timing test_rwlock_timing test_urcu_yield urcu-asm.S \ urcu-asm.o urcutorture urcutorture-yield +pthreads-x86: clean + cp api_x86.h api.h + cp arch_x86.h arch.h + +pthreads-ppc: clean + cp api_ppc.h api.h + cp arch_ppc.h arch.h + test_urcu: urcu.o test_urcu.c urcu.h $(CC) ${CFLAGS} $(LDFLAGS) -o $@ $(SRC_DEP) diff --git a/api.h b/api.h index 230c11d..8bc52e8 100644 --- a/api.h +++ b/api.h @@ -1,5 +1,7 @@ /* MECHANICALLY GENERATED, DO NOT EDIT!!! */ +#define _INCLUDE_API_H + /* * common.h: Common Linux kernel-isms. * @@ -148,7 +150,7 @@ static __inline__ int atomic_sub_and_test(int i, atomic_t *v) * * Atomically increments @v by 1. 
*/ -static __inline__ void __atomic_inc(atomic_t *v) +static __inline__ void atomic_inc(atomic_t *v) { __asm__ __volatile__( LOCK_PREFIX "incl %0" diff --git a/rcutorture.h b/rcutorture.h index 8ba6763..8681ef7 100644 --- a/rcutorture.h +++ b/rcutorture.h @@ -116,7 +116,7 @@ void *rcu_read_perf_test(void *arg) urcu_register_thread(); run_on(me); - __atomic_inc(&nthreadsrunning); + atomic_inc(&nthreadsrunning); while (goflag == GOFLAG_INIT) poll(NULL, 0, 1); mark_rcu_quiescent_state(); @@ -141,7 +141,7 @@ void *rcu_update_perf_test(void *arg) { long long n_updates_local = 0; - __atomic_inc(&nthreadsrunning); + atomic_inc(&nthreadsrunning); while (goflag == GOFLAG_INIT) poll(NULL, 0, 1); while (goflag == GOFLAG_RUN) { diff --git a/test_rwlock_timing.c b/test_rwlock_timing.c index 6e0523c..9ea2494 100644 --- a/test_rwlock_timing.c +++ b/test_rwlock_timing.c @@ -35,22 +35,6 @@ static inline pid_t gettid(void) } #endif -#define rdtscll(val) do { \ - unsigned int __a,__d; \ - asm volatile("rdtsc" : "=a" (__a), "=d" (__d)); \ - (val) = ((unsigned long long)__a) | (((unsigned long long)__d)<<32); \ -} while(0) - -typedef unsigned long long cycles_t; - -static inline cycles_t get_cycles (void) -{ - unsigned long long ret = 0; - - rdtscll(ret); - return ret; -} - #include "urcu.h" struct test_array { diff --git a/test_urcu_timing.c b/test_urcu_timing.c index d469508..ac23846 100644 --- a/test_urcu_timing.c +++ b/test_urcu_timing.c @@ -34,22 +34,6 @@ static inline pid_t gettid(void) } #endif -#define rdtscll(val) do { \ - unsigned int __a,__d; \ - asm volatile("rdtsc" : "=a" (__a), "=d" (__d)); \ - (val) = ((unsigned long long)__a) | (((unsigned long long)__d)<<32); \ -} while(0) - -typedef unsigned long long cycles_t; - -static inline cycles_t get_cycles (void) -{ - unsigned long long ret = 0; - - rdtscll(ret); - return ret; -} - #include "urcu.h" pthread_mutex_t rcu_copy_mutex = PTHREAD_MUTEX_INITIALIZER; diff --git a/urcu.h b/urcu.h index 539b1d5..0ff0877 100644 --- a/urcu.h +++ b/urcu.h @@ -26,62 +26,10 @@ #define likely(x) __builtin_expect(!!(x), 1) #define unlikely(x) __builtin_expect(!!(x), 0) -/* - * Assume the architecture has coherent caches. Blackfin will want this unset. - */ -#define CONFIG_HAVE_MEM_COHERENCY 1 - -/* Assume P4 or newer */ -#define CONFIG_HAVE_FENCE 1 - /* Assume SMP machine, given we don't have this information */ #define CONFIG_SMP 1 -#ifdef CONFIG_HAVE_MEM_COHERENCY -/* - * Caches are coherent, no need to flush them. - */ -#define mc() barrier() -#define rmc() barrier() -#define wmc() barrier() -#else -#error "The architecture must create its own cache flush primitives" -#define mc() arch_cache_flush() -#define rmc() arch_cache_flush_read() -#define wmc() arch_cache_flush_write() -#endif - - -#ifdef CONFIG_HAVE_MEM_COHERENCY - -/* x86 32/64 specific */ -#ifdef CONFIG_HAVE_FENCE -#define mb() asm volatile("mfence":::"memory") -#define rmb() asm volatile("lfence":::"memory") -#define wmb() asm volatile("sfence"::: "memory") -#else -/* - * Some non-Intel clones support out of order store. wmb() ceases to be a - * nop for these. - */ -#define mb() asm volatile("lock; addl $0,0(%%esp)":::"memory") -#define rmb() asm volatile("lock; addl $0,0(%%esp)":::"memory") -#define wmb() asm volatile("lock; addl $0,0(%%esp)"::: "memory") -#endif - -#else /* !CONFIG_HAVE_MEM_COHERENCY */ - -/* - * Without cache coherency, the memory barriers become cache flushes. 
- */ -#define mb() mc() -#define rmb() rmc() -#define wmb() wmc() - -#endif /* !CONFIG_HAVE_MEM_COHERENCY */ - - #ifdef CONFIG_SMP #define smp_mb() mb() #define smp_rmb() rmb() @@ -98,69 +46,7 @@ #define smp_wmc() barrier() #endif -/* REP NOP (PAUSE) is a good thing to insert into busy-wait loops. */ -static inline void rep_nop(void) -{ - asm volatile("rep; nop" ::: "memory"); -} - -static inline void cpu_relax(void) -{ - rep_nop(); -} - -static inline void atomic_inc(int *v) -{ - asm volatile("lock; incl %0" - : "+m" (*v)); -} - -#define xchg(ptr, v) \ - ((__typeof__(*(ptr)))__xchg((unsigned long)(v), (ptr), sizeof(*(ptr)))) - -struct __xchg_dummy { - unsigned long a[100]; -}; -#define __xg(x) ((struct __xchg_dummy *)(x)) - -/* - * Note: no "lock" prefix even on SMP: xchg always implies lock anyway - * Note 2: xchg has side effect, so that attribute volatile is necessary, - * but generally the primitive is invalid, *ptr is output argument. --ANK - * x is considered local, ptr is considered remote. - */ -static inline unsigned long __xchg(unsigned long x, volatile void *ptr, - int size) -{ - switch (size) { - case 1: - asm volatile("xchgb %b0,%1" - : "=q" (x) - : "m" (*__xg(ptr)), "0" (x) - : "memory"); - break; - case 2: - asm volatile("xchgw %w0,%1" - : "=r" (x) - : "m" (*__xg(ptr)), "0" (x) - : "memory"); - break; - case 4: - asm volatile("xchgl %k0,%1" - : "=r" (x) - : "m" (*__xg(ptr)), "0" (x) - : "memory"); - break; - case 8: - asm volatile("xchgq %0,%1" - : "=r" (x) - : "m" (*__xg(ptr)), "0" (x) - : "memory"); - break; - } - smp_wmc(); - return x; -} +#include "arch.h" /* Nop everywhere except on alpha. */ #define smp_read_barrier_depends() -- 2.34.1
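A note on the new targets: "make pthreads-x86" (or "make pthreads-ppc") simply copies the matching api_*.h and arch_*.h pair into api.h and arch.h; the existing build targets are then run as before.

The api/arch distinction described in the changelog comes down to how one thread reaches another thread's per-thread data. A minimal sketch of the two strategies, using made-up names rather than the actual contents of arch.h or api.h:

/* Illustration only -- hypothetical names, not the real arch.h/api.h. */
#define NR_THREADS 128

/* arch.h style: keep the variable __thread (cheap local access) and have
 * each thread register a pointer to it so other threads can reach it. */
__thread int reader_gp;
int *reader_gp_ptrs[NR_THREADS];

static void register_self(int tid)
{
	reader_gp_ptrs[tid] = &reader_gp;	/* once, at thread start */
}

/* api.h style: give up on __thread and keep one array slot per thread;
 * every access, local or remote, goes through the array. */
int reader_gp_array[NR_THREADS];
#define per_thread_reader_gp(tid)	(reader_gp_array[tid])

The array form is simpler, but the owning thread then pays the indexing (and possible cache-line sharing) cost on every access, which is the performance penalty mentioned in the changelog.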
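The x86 fence instructions removed from urcu.h above are exactly the sort of thing that cannot stay in common code once PowerPC is a target. As a rough illustration of what an arch_ppc.h counterpart might provide (hypothetical, not taken from the actual file), PowerPC uses its own barrier instruction:

/* Hypothetical PowerPC replacements for the mfence/lfence/sfence macros
 * removed above.  "sync" is a full barrier, so using it for all three is
 * conservative but correct.  Illustration only. */
#define mb()	asm volatile("sync" ::: "memory")
#define rmb()	asm volatile("sync" ::: "memory")
#define wmb()	asm volatile("sync" ::: "memory")

/* There is no "rep; nop" on PowerPC; a plain compiler barrier is a safe
 * stand-in for cpu_relax(). */
static inline void cpu_relax(void)
{
	asm volatile("" ::: "memory");
}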
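Likewise, the lock-prefixed atomic_inc() and xchg() code dropped from urcu.h is x86-only. A hypothetical PowerPC atomic_inc(), written as the usual load-reserve/store-conditional retry loop (an illustration of why the split is needed, not the real api_ppc.h/arch_ppc.h code):

/* Illustration only; api.h's real atomic_t definition may differ. */
typedef struct { volatile int counter; } atomic_t;

static inline void atomic_inc(atomic_t *v)
{
	int t;

	asm volatile(
"1:	lwarx	%0,0,%2\n"	/* load and reserve the counter */
"	addic	%0,%0,1\n"	/* increment the fetched value */
"	stwcx.	%0,0,%2\n"	/* store only if the reservation still holds */
"	bne-	1b"		/* otherwise retry */
	: "=&r" (t), "+m" (v->counter)
	: "r" (&v->counter)
	: "cc");
}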