OpenBSD cilkrts portability patch

2013-11-09 Thread John Carr
I am trying to build the trunk version of gcc to have Cilk on OpenBSD.

I attach three changes to libcilkrts.  Two changes treat OpenBSD like
FreeBSD.  One change is needed because OpenBSD defines PTHREAD_MUTEX_* as
enumerations rather than preprocessor constants.

Index: /data/jfc/src/GNU/trunk/libcilkrts/runtime/os-unix.c
===
--- /data/jfc/src/GNU/trunk/libcilkrts/runtime/os-unix.c	(revision 204488)
+++ /data/jfc/src/GNU/trunk/libcilkrts/runtime/os-unix.c	(working copy)
@@ -54,7 +54,7 @@
 #elif defined __APPLE__
 #   include 
 // Uses sysconf(_SC_NPROCESSORS_ONLN) in verbose output
-#elif defined  __FreeBSD__
+#elif defined  __FreeBSD__ || defined __OpenBSD__
 // No additional include files
 #elif defined __CYGWIN__
 // Cygwin on Windows - no additional include files
@@ -369,7 +369,7 @@
 assert((unsigned)count == count);
 
 return count;
-#elif defined  __FreeBSD__ || defined __CYGWIN__
+#elif defined  __FreeBSD__ || defined __OpenBSD__ || defined __CYGWIN__
 int ncores = sysconf(_SC_NPROCESSORS_ONLN);
 
 return ncores;
Index: /data/jfc/src/GNU/trunk/libcilkrts/runtime/os_mutex-unix.c
===
--- /data/jfc/src/GNU/trunk/libcilkrts/runtime/os_mutex-unix.c	(revision 204488)
+++ /data/jfc/src/GNU/trunk/libcilkrts/runtime/os_mutex-unix.c	(working copy)
@@ -89,7 +89,7 @@
 status = pthread_mutexattr_init(&attr);
 CILK_ASSERT (status == 0);
 #if defined DEBUG || CILK_LIB_DEBUG 
-#ifdef PTHREAD_MUTEX_ERRORCHECK
+#if defined PTHREAD_MUTEX_ERRORCHECK || defined __OpenBSD__
 status = pthread_mutexattr_settype(&attr, PTHREAD_MUTEX_ERRORCHECK);
 #else
 status = pthread_mutexattr_settype(&attr, PTHREAD_MUTEX_ERRORCHECK_NP);


[PATCH] ARM: Weaker memory barriers

2014-03-10 Thread John Carr
A comment in arm/sync.md notes "We should consider issuing a inner
shareability zone barrier here instead."  Here is my first attempt
at a patch to emit weaker memory barriers.  Three instructions seem
to be relevant for user mode code on my Cortex A9 Linux box:

dmb ishst, dmb ish, dmb sy

I believe these correspond to a release barrier, a full barrier
with respect to other CPUs, and a full barrier that also orders
relative to I/O.

Consider this a request for comments on whether the approach is correct.
I haven't done any testing yet (beyond eyeballing the assembly output).


2014-03-10  John F. Carr  

	* config/arm/sync.md (mem_thread_fence): New expander for weaker
	memory barriers.
	* config/arm/arm.c (arm_pre_atomic_barrier, arm_post_atomic_barrier): 
	Emit only as strong a fence as needed.

Index: config/arm/arm.c
===
--- config/arm/arm.c	(revision 208470)
+++ config/arm/arm.c	(working copy)
@@ -29813,7 +29813,12 @@
 arm_pre_atomic_barrier (enum memmodel model)
 {
   if (need_atomic_barrier_p (model, true))
-emit_insn (gen_memory_barrier ());
+{
+  if (HAVE_mem_thread_fence)
+	emit_insn (gen_mem_thread_fence (GEN_INT ((int) model)));
+  else
+	emit_insn (gen_memory_barrier ());
+}
 }
 
 static void
@@ -29820,7 +29825,12 @@
 arm_post_atomic_barrier (enum memmodel model)
 {
   if (need_atomic_barrier_p (model, false))
-emit_insn (gen_memory_barrier ());
+{
+  if (HAVE_mem_thread_fence)
+	emit_insn (gen_mem_thread_fence (GEN_INT ((int) model)));
+  else
+	emit_insn (gen_memory_barrier ());
+}
 }
 
 /* Emit the load-exclusive and store-exclusive instructions.
Index: config/arm/sync.md
===
--- config/arm/sync.md	(revision 208470)
+++ config/arm/sync.md	(working copy)
@@ -34,26 +34,54 @@
 (define_mode_attr sync_sfx
   [(QI "b") (HI "h") (SI "") (DI "d")])
 
+(define_expand "mem_thread_fence"
+  [(set (match_dup 1)
+	(unspec:BLK
+	 [(match_dup 1)
+	  (match_operand:SI 0 "const_int_operand")]
+	 UNSPEC_MEMORY_BARRIER))]
+  "TARGET_HAVE_DMB"
+{
+  if (INTVAL(operands[0]) == MEMMODEL_RELAXED)
+DONE;
+  operands[1] = gen_rtx_MEM (BLKmode, gen_rtx_SCRATCH (Pmode));
+  MEM_VOLATILE_P (operands[1]) = 1;
+})
+
 (define_expand "memory_barrier"
   [(set (match_dup 0)
-	(unspec:BLK [(match_dup 0)] UNSPEC_MEMORY_BARRIER))]
+	(unspec:BLK [(match_dup 0) (match_dup 1)]
+		UNSPEC_MEMORY_BARRIER))]
   "TARGET_HAVE_MEMORY_BARRIER"
 {
   operands[0] = gen_rtx_MEM (BLKmode, gen_rtx_SCRATCH (Pmode));
   MEM_VOLATILE_P (operands[0]) = 1;
+  operands[1] = GEN_INT((int) MEMMODEL_SEQ_CST);
 })
 
 (define_insn "*memory_barrier"
   [(set (match_operand:BLK 0 "" "")
-	(unspec:BLK [(match_dup 0)] UNSPEC_MEMORY_BARRIER))]
-  "TARGET_HAVE_MEMORY_BARRIER"
+	(unspec:BLK
+	 [(match_dup 0) (match_operand:SI 1 "const_int_operand")]
+	 UNSPEC_MEMORY_BARRIER))]
+  "TARGET_HAVE_DMB || TARGET_HAVE_MEMORY_BARRIER"
   {
 if (TARGET_HAVE_DMB)
   {
-	/* Note we issue a system level barrier. We should consider issuing
-	   a inner shareabilty zone barrier here instead, ie. "DMB ISH".  */
-	/* ??? Differentiate based on SEQ_CST vs less strict?  */
-	return "dmb\tsy";
+switch (INTVAL(operands[1]))
+	{
+	case MEMMODEL_RELEASE:
+  return "dmb\tishst";
+	case MEMMODEL_CONSUME:
+	case MEMMODEL_ACQUIRE:
+	case MEMMODEL_ACQ_REL:
+	  return "dmb\tish";
+	case MEMMODEL_SEQ_CST:
+	  return "dmb\tsy";
+	case MEMMODEL_RELAXED:
+	default:
+	  gcc_unreachable ();
+}
   }
 
 if (TARGET_HAVE_DMB_MCR)


Re: [PATCH] ARM: Weaker memory barriers

2014-03-11 Thread John Carr

Will Deacon  wrote:

> 
> Hi John,
> 
> On Tue, Mar 11, 2014 at 02:54:18AM +0000, John Carr wrote:
> > A comment in arm/sync.md notes "We should consider issuing a inner
> > shareability zone barrier here instead."  Here is my first attempt
> > at a patch to emit weaker memory barriers.  Three instructions seem
> > to be relevant for user mode code on my Cortex A9 Linux box:
> > 
> > dmb ishst, dmb ish, dmb sy
> > 
> > I believe these correspond to a release barrier, a full barrier
> > with respect to other CPUs, and a full barrier that also orders
> > relative to I/O.
> 
> Not quite; DMB ISHST only orders writes with other writes, so loads can move
> across it in both directions. That means it's not sufficient for releasing a
> lock, for example.

Release in this context doesn't mean "lock release".  I understand
it to mean release in the specific context of the C++11 memory model.
(Similarly, if you're arguing standards compliance "inline" really
means "relax the one definition rule for this function.")

I don't see a prohibition on moving non-atomic loads across a release
store.  Can you point to an analysis that shows a full barrier is needed?

If we assume that gcc is used to generate code for processes running
within a single inner shareable domain, then we can start by demoting
"dmb sy" to "dmb ish" for the memory barrier with no other change.

If a store-store barrier has no place in the gcc atomic memory model,
that supports my hypothesis that a twisty maze of ifdefs is superior to
a "portable" attractive nuisance.

> 
> 
> 
> I gave a presentation at ELCE about the various ARMv7 barrier options (from
> a kernel perspective):
> 
>   https://www.youtube.com/watch?v=6ORn6_35kKo
> 
> 

Conveniently just about the same length as a dryer cycle.

I learned that inner shareable domain is the right one to use
for normal user mode code, and the Linux kernel doesn't care
because it has its own memory model and barrier functions.