On Wednesday, November 24, 2010 8:29:35 pm Peter Maydell wrote:
> This wiki page came up during the toolchain call:
> https://wiki.linaro.org/Internal/People/KenWerner/AtomicMemoryOperations/
> 
> It gives the code generated for __sync_val_compare_and_swap
> as including a push {r4} / pop {r4} pair because it uses too many
> temporaries to fit them all in the call-clobbered registers. I think
> you can tweak it a bit to get rid of that:
> 
> # int __sync_val_compare_and_swap (int *mem, int old, int new);
> # if the current value of *mem is old, then write new into *mem
> # r0: mem, r1: old, r2: new
>         mov     r3, r0       # copy mem into r3, freeing r0 for the return value
>         dmb     sy           # full memory barrier
>         .LSYT7:
>         ldrex   r0, [r3]     # load (exclusive) from memory pointed to by r3 into r0
>         cmp     r0, r1       # compare contents of r0 (mem) with r1 (old)
>                              # -> updates the condition flags
>         bne     .LSYB7       # branch to LSYB7 if mem != old
>         # This strex trashes the r0 we just loaded, but since we didn't
>         # take the branch we know that r0 == r1
>         strex   r0, r2, [r3] # store r2 (new) into memory pointed to by r3 (mem);
>                              # r0 contains 0 if the store was successful, otherwise 1
>         teq     r0, #0       # compare contents of r0 with zero
>                              # -> updates the condition flags
>         bne     .LSYT7       # branch to LSYT7 if r0 != 0 (the store wasn't successful)
>         mov     r0, r1       # move the value that was in memory into the return register
>         dmb     sy           # full memory barrier
>         .LSYB7:
>         bx      lr           # return
> 
> I think you can do a similar trick with __sync_fetch_and_add
> (although you have to use a subtract to regenerate r0 from
> r1 and r2).
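
For concreteness, the reversed __sync_fetch_and_add sequence could look
roughly like this (a hand-written sketch; the register assignments and
label names are made up, and this is not actual GCC output):

# int __sync_fetch_and_add (int *mem, int val);
# r0: mem, r1: val
        mov     r3, r0       # free r0 for the return value
        dmb     sy           # full memory barrier
        .LSYT8:
        ldrex   r0, [r3]     # r0 = old value of *mem
        add     r2, r0, r1   # r2 = old + val
        strex   r0, r2, [r3] # try to store r2; this trashes the old value,
                             # r0 = 0 iff the store succeeded
        teq     r0, #0
        bne     .LSYT8       # retry if the store failed
        sub     r0, r2, r1   # regenerate the old value: (old + val) - val
        dmb     sy           # full memory barrier
        bx      lr           # return the old value in r0

The old value is recomputed rather than kept live across the strex, so
everything fits into the call-clobbered registers and the push/pop goes away.
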
> 
> On the other hand, I just looked at the GCC code that does this
> and it's not simply dumping canned sequences out to the
> assembler, so maybe it's not worth the effort just to drop a
> stack push/pop.

Hi,

Attached is a small GCC patch that attempts to optimize the __sync_* builtins
as described above. Since "or" and "(n)and" are not reversible, the
corresponding builtins still need the push/pop instructions; for the
reversible operations the patch drops the second scratch register (its
clobber constraint becomes "X" via the new sync_clobber/sync_t2_reqd code
attributes in sync.md).
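
To illustrate why "or" is different: the old value cannot be recomputed from
the result and the operand (old | val and val do not determine old), so it
must stay live across the strex, and the strex status then needs a register
of its own. Roughly (again only a sketch with made-up register assignments,
not actual GCC output):

# int __sync_fetch_and_or (int *mem, int val);
# r0: mem, r1: val
        mov     r3, r0       # r3 = mem
        dmb     sy           # full memory barrier
        push    {r4}         # a fifth register is needed for the strex status
        .LSYT9:
        ldrex   r0, [r3]     # r0 = old value, must survive the strex
        orr     r2, r0, r1   # r2 = old | val
        strex   r4, r2, [r3] # r4 = 0 iff the store succeeded
        teq     r4, #0
        bne     .LSYT9       # retry if the store failed
        dmb     sy           # full memory barrier
        pop     {r4}
        bx      lr           # return the old value in r0
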
Any suggestions or comments are welcome.

Regards
Ken
=== modified file 'gcc/config/arm/arm.c'
--- gcc/config/arm/arm.c	2010-11-11 11:50:33 +0000
+++ gcc/config/arm/arm.c	2010-11-29 12:59:50 +0000
@@ -23084,10 +23084,46 @@
       break;
     }
 
-  arm_output_strex (emit, mode, "", t2, t1, memory);
-  operands[0] = t2;
-  arm_output_asm_insn (emit, 0, operands, "teq\t%%0, #0");
-  arm_output_asm_insn (emit, 0, operands, "bne\t%sLSYT%%=", LOCAL_LABEL_PREFIX);
+  if (t2)
+    {
+      arm_output_strex (emit, mode, "", t2, t1, memory);
+      operands[0] = t2;
+      arm_output_asm_insn (emit, 0, operands, "teq\t%%0, #0");
+      arm_output_asm_insn (emit, 0, operands, "bne\t%sLSYT%%=",
+			   LOCAL_LABEL_PREFIX);
+    }
+  else
+    {
+      /* Use old_value for the return value because for some operations
+	 the old_value can easily be restored.  This saves one register.  */
+      arm_output_strex (emit, mode, "", old_value, t1, memory);
+      operands[0] = old_value;
+      arm_output_asm_insn (emit, 0, operands, "teq\t%%0, #0");
+      arm_output_asm_insn (emit, 0, operands, "bne\t%sLSYT%%=",
+			   LOCAL_LABEL_PREFIX);
+
+      switch (sync_op)
+	{
+	case SYNC_OP_ADD:
+	  arm_output_op3 (emit, "sub", old_value, t1, new_value);
+	  break;
+
+	case SYNC_OP_SUB:
+	  arm_output_op3 (emit, "add", old_value, t1, new_value);
+	  break;
+
+	case SYNC_OP_XOR:
+	  arm_output_op3 (emit, "eor", old_value, t1, new_value);
+	  break;
+
+	case SYNC_OP_NONE:
+	  arm_output_op2 (emit, "mov", old_value, required_value);
+	  break;
+
+	default:
+	  gcc_unreachable ();
+	}
+    }
 
   arm_process_output_memory_barrier (emit, NULL);
   arm_output_asm_insn (emit, 1, operands, "%sLSYB%%=:", LOCAL_LABEL_PREFIX);

=== modified file 'gcc/config/arm/sync.md'
--- gcc/config/arm/sync.md	2010-09-13 15:39:11 +0000
+++ gcc/config/arm/sync.md	2010-11-29 13:54:17 +0000
@@ -103,6 +103,18 @@
 			      (plus "add")
 			      (minus "sub")])
 
+(define_code_attr sync_clobber [(ior "=&r")
+				(and "=&r")
+				(xor "X")
+				(plus "X")
+				(minus "X")])
+
+(define_code_attr sync_t2_reqd [(ior "4")
+				(and "4")
+				(xor "*")
+				(plus "*")
+				(minus "*")])
+
 (define_expand "sync_<sync_optab>si"
   [(match_operand:SI 0 "memory_operand")
    (match_operand:SI 1 "s_register_operand")
@@ -286,7 +298,6 @@
 	  VUNSPEC_SYNC_COMPARE_AND_SWAP))
    (set (match_dup 1) (unspec_volatile:SI [(match_dup 2)]
                                           VUNSPEC_SYNC_COMPARE_AND_SWAP))
-   (clobber:SI (match_scratch:SI 4 "=&r"))
    (set (reg:CC CC_REGNUM) (unspec_volatile:CC [(match_dup 1)]
                                                 VUNSPEC_SYNC_COMPARE_AND_SWAP))
    ]
@@ -299,7 +310,6 @@
    (set_attr "sync_required_value"  "2")
    (set_attr "sync_new_value"       "3")
    (set_attr "sync_t1"              "0")
-   (set_attr "sync_t2"              "4")
    (set_attr "conds" "clob")
    (set_attr "predicable" "no")])
 
@@ -313,7 +323,6 @@
 	    VUNSPEC_SYNC_COMPARE_AND_SWAP)))
    (set (match_dup 1) (unspec_volatile:NARROW [(match_dup 2)]
                                           VUNSPEC_SYNC_COMPARE_AND_SWAP))
-   (clobber:SI (match_scratch:SI 4 "=&r"))
    (set (reg:CC CC_REGNUM) (unspec_volatile:CC [(match_dup 1)]
                                                 VUNSPEC_SYNC_COMPARE_AND_SWAP))
    ]
@@ -326,7 +335,6 @@
    (set_attr "sync_required_value"  "2")
    (set_attr "sync_new_value"       "3")
    (set_attr "sync_t1"              "0")
-   (set_attr "sync_t2"              "4")
    (set_attr "conds" "clob")
    (set_attr "predicable" "no")])
 
@@ -487,7 +495,7 @@
 	                    VUNSPEC_SYNC_OLD_OP))
    (clobber (reg:CC CC_REGNUM))
    (clobber (match_scratch:SI 3 "=&r"))
-   (clobber (match_scratch:SI 4 "=&r"))]
+   (clobber (match_scratch:SI 4 "<sync_clobber>"))]
   "TARGET_HAVE_LDREX && TARGET_HAVE_MEMORY_BARRIER"
   {
     return arm_output_sync_insn (insn, operands);
@@ -496,7 +504,7 @@
    (set_attr "sync_memory"          "1")
    (set_attr "sync_new_value"       "2")
    (set_attr "sync_t1"              "3")
-   (set_attr "sync_t2"              "4")
+   (set_attr "sync_t2"              "<sync_t2_reqd>")
    (set_attr "sync_op"              "<sync_optab>")
    (set_attr "conds" "clob")
    (set_attr "predicable" "no")])
@@ -540,7 +548,7 @@
 	                    VUNSPEC_SYNC_OLD_OP))
    (clobber (reg:CC CC_REGNUM))
    (clobber (match_scratch:SI 3 "=&r"))
-   (clobber (match_scratch:SI 4 "=&r"))]
+   (clobber (match_scratch:SI 4 "<sync_clobber>"))]
   "TARGET_HAVE_LDREXBHD && TARGET_HAVE_MEMORY_BARRIER"
   {
     return arm_output_sync_insn (insn, operands);
@@ -549,7 +557,7 @@
    (set_attr "sync_memory"          "1")
    (set_attr "sync_new_value"       "2")
    (set_attr "sync_t1"              "3")
-   (set_attr "sync_t2"              "4")
+   (set_attr "sync_t2"              "<sync_t2_reqd>")
    (set_attr "sync_op"              "<sync_optab>")
    (set_attr "conds" 		    "clob")
    (set_attr "predicable" "no")])
