Hello,
I've been working on improving the performance of DirectFB 1.3.0
on the ARM platform. The attached patch replaces the default libc
memcpy with a faster implementation. I've tested this patch on an
AT91RM9200, but it should work on other ARM targets as well.
I hope this will be useful to others.
Regards,
Vince
diff -Naur DirectFB-1.3.0-org/configure.in DirectFB-1.3.0/configure.in
--- DirectFB-1.3.0-org/configure.in 2009-03-18 09:11:21.000000000 +0000
+++ DirectFB-1.3.0/configure.in 2009-03-18 09:12:47.000000000 +0000
@@ -198,6 +198,7 @@
*arm*)
have_arm=yes
+ AC_DEFINE(ARCH_ARM,1,[Define to 1 if you are compiling for ARM.])
;;
ppc-*-linux* | powerpc-*)
@@ -221,6 +222,7 @@
need_libc_r=no
need_libdl=yes
want_ppcasm=yes
+want_armasm=yes
case "$target_or_host" in
*-linux*)
@@ -236,6 +238,7 @@
need_libc_r=yes
need_libdl=no
want_ppcasm=yes
+ want_armasm=yes
CPPFLAGS="$CPPFLAGS -I/usr/local/include"
LDFLAGS="$LDFLAGS -L/usr/local/lib"
;;
@@ -244,6 +247,7 @@
need_libc_r=yes
need_libdl=no
want_ppcasm=no
+ want_armasm=no
CPPFLAGS="$CPPFLAGS -I/usr/local/include"
LDFLAGS="$LDFLAGS -L/usr/local/lib"
;;
@@ -252,6 +256,7 @@
need_libc_r=no
need_libdl=no
want_ppcasm=yes
+ want_armasm=yes
CPPFLAGS="$CPPFLAGS -I/usr/pkg/include"
LDFLAGS="$LDFLAGS -L/usr/pkg/lib"
;;
@@ -260,6 +265,7 @@
need_libc_r=no
need_libdl=yes
want_ppcasm=no
+ want_armasm=no
CPPFLAGS="$CPPFLAGS -I/sw/include"
LDFLAGS="$LDFLAGS -L/sw/lib"
;;
@@ -281,6 +287,13 @@
AC_DEFINE(USE_PPCASM,1,[Define to 1 if ppc assembly is available.])
fi
+
+AM_CONDITIONAL(BUILDARMASM, test "$have_arm" = "yes" && test "$want_armasm" = "yes")
+
+if test "$have_arm" = "yes" && test "$want_armasm" = "yes"; then
+ AC_DEFINE(USE_ARMASM,1,[Define to 1 if arm assembly is available.])
+fi
+
if test "$have_kos" = "yes"; then
AC_DEFINE(USE_KOS,1,[Define to 1 if compiling on KallistiOS.])
fi
diff -Naur DirectFB-1.3.0-org/lib/direct/armasm_memcpy.h DirectFB-1.3.0/lib/direct/armasm_memcpy.h
--- DirectFB-1.3.0-org/lib/direct/armasm_memcpy.h 1970-01-01 01:00:00.000000000 +0100
+++ DirectFB-1.3.0/lib/direct/armasm_memcpy.h 2009-03-18 10:22:26.000000000 +0000
@@ -0,0 +1,34 @@
+/*
+ * ARM memcpy asm replacement.
+ *
+ * Copyright (C) 2009 Bluush Dev Team.
+ *
+ * This library is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2 of the License, or (at your option) any later version.
+ *
+ * This library is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with this library; if not, write to the
+ * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
+ * Boston, MA 02111-1307, USA.
+ */
+
+#ifndef __ARMASM_MEMCPY_H__
+#define __ARMASM_MEMCPY_H__
+
+#include <stddef.h> /* size_t, so this header can be included on its own */
+
+/*
+ * Hand-written ARM assembly copy routine (implemented in armasm_memcpy.S).
+ * Copies n bytes from src to dest and returns dest.
+ */
+void *direct_armasm_memcpy ( void *dest, const void *src, size_t n);
+
+#endif /* __ARMASM_MEMCPY_H__ */
+
diff -Naur DirectFB-1.3.0-org/lib/direct/armasm_memcpy.S DirectFB-1.3.0/lib/direct/armasm_memcpy.S
--- DirectFB-1.3.0-org/lib/direct/armasm_memcpy.S 1970-01-01 01:00:00.000000000 +0100
+++ DirectFB-1.3.0/lib/direct/armasm_memcpy.S 2009-03-18 10:22:41.000000000 +0000
@@ -0,0 +1,112 @@
+/*
+ * ARM memcpy asm replacement.
+ *
+ * Copyright (C) 2009 Bluush Dev Team.
+ *
+ * This library is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2 of the License, or (at your option) any later version.
+ *
+ * This library is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with this library; if not, write to the
+ * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
+ * Boston, MA 02111-1307, USA.
+ */
+
+/* GNU as source (run through cpp); ARM state code, arguments in r0-r2. */
+
+        .text
+        .code 32
+        .p2align 2
+
+        .global direct_armasm_memcpy
+        .type   direct_armasm_memcpy, %function
+
+/*
+ * void *direct_armasm_memcpy(void *dest, const void *src, size_t n)
+ *
+ * Copy n bytes from src to dest and return dest. Bytes are always moved
+ * in ascending address order, so overlapping regions with dest above src
+ * are not supported.
+ *
+ * In:    r0 = dest, r1 = src, r2 = n (unsigned byte count)
+ * Out:   r0 = dest (never written)
+ * Uses:  r3 = loop counter, r12 = write cursor, lr = data scratch,
+ *        r4-r10 = block scratch, saved and restored around the block loop
+ * Stack: 1 word (lr), plus 7 words while 32-byte blocks are being copied
+ *
+ * NOTE(review): both returns write pc directly (mov / ldm) rather than
+ * using bx, so Thumb interworking on ARMv4T looks unsupported - confirm
+ * that all callers run in ARM state.
+ */
+direct_armasm_memcpy:
+        teq     r2, #0                  /* n == 0 ? */
+        moveq   pc, lr                  /* nothing to copy, r0 is already dest */
+
+        stmdb   sp!, {lr}               /* free lr for use as data scratch */
+        mov     r12, r0                 /* r12 = write cursor, r0 stays dest */
+        cmp     r2, #8
+        bls     .Lbyte_loop             /* n <= 8 (unsigned): plain byte copy */
+
+        sub     r3, r0, r1
+        tst     r3, #3                  /* same offset within a word? */
+        bne     .Lbyte_loop             /* no: word accesses never line up */
+
+        /* dest and src are congruent mod 4: byte-copy up to a word boundary. */
+        ands    r3, r0, #3
+        beq     .Lword_aligned
+        rsb     r3, r3, #4              /* r3 = 1..3 leading bytes */
+        sub     r2, r2, r3              /* n > 8 here, so n stays > 0 */
+.Lhead_loop:
+        ldrb    lr, [r1], #1
+        subs    r3, r3, #1
+        strb    lr, [r12], #1
+        bne     .Lhead_loop
+
+.Lword_aligned:
+        movs    r3, r2, lsr #5          /* r3 = number of 32-byte blocks */
+        beq     .Lblocks_done
+        and     r2, r2, #0x1f           /* r2 = bytes left after the blocks */
+        stmdb   sp!, {r4-r10}           /* callee-saved, used as scratch below */
+.Lblock_loop:
+        ldmia   r1!, {r4-r10, lr}       /* 8 words in */
+        subs    r3, r3, #1
+        stmia   r12!, {r4-r10, lr}      /* 8 words out */
+        bne     .Lblock_loop
+        ldmia   sp!, {r4-r10}
+
+.Lblocks_done:
+        movs    r3, r2, lsr #2          /* r3 = remaining whole words (0..7) */
+        beq     .Lwords_done
+.Lword_loop:
+        ldr     lr, [r1], #4
+        subs    r3, r3, #1
+        str     lr, [r12], #4
+        bne     .Lword_loop
+
+.Lwords_done:
+        ands    r2, r2, #3              /* r2 = trailing bytes (0..3) */
+        ldmeqia sp!, {pc}               /* none left: return */
+
+        /* Byte-at-a-time copy for short, misaligned or trailing data. */
+.Lbyte_loop:
+        ldrb    lr, [r1], #1
+        subs    r2, r2, #1
+        strb    lr, [r12], #1
+        bne     .Lbyte_loop
+
+        ldmia   sp!, {pc}               /* pop the saved lr straight into pc */
+
+        .ltorg
+        .size   direct_armasm_memcpy, . - direct_armasm_memcpy
+
+#if defined(__linux__) && defined(__ELF__)
+        /* Mark the object as not needing an executable stack. */
+        .section .note.GNU-stack,"",%progbits
+#endif
diff -Naur DirectFB-1.3.0-org/lib/direct/Makefile.am DirectFB-1.3.0/lib/direct/Makefile.am
--- DirectFB-1.3.0-org/lib/direct/Makefile.am 2009-03-18 09:11:21.000000000 +0000
+++ DirectFB-1.3.0/lib/direct/Makefile.am 2009-03-18 09:15:21.000000000 +0000
@@ -29,6 +29,10 @@
ppcasm_headers = ppcasm_memcpy.h ppc_asm.h
endif
+if BUILDARMASM
+armasm_sources = armasm_memcpy.S
+armasm_headers = armasm_memcpy.h
+endif
# If the old location isn't cleared, builds of external modules fail
install-exec-local:
@@ -39,6 +43,7 @@
include_HEADERS = \
$(ppcasm_headers) \
+ $(armasm_headers) \
build.h \
clock.h \
conf.h \
@@ -69,6 +74,7 @@
libdirect_la_SOURCES = \
$(ppcasm_sources) \
+ $(armasm_sources) \
clock.c \
conf.c \
debug.c \
diff -Naur DirectFB-1.3.0-org/lib/direct/memcpy.c DirectFB-1.3.0/lib/direct/memcpy.c
--- DirectFB-1.3.0-org/lib/direct/memcpy.c 2009-03-18 09:11:21.000000000 +0000
+++ DirectFB-1.3.0/lib/direct/memcpy.c 2009-03-18 09:14:45.000000000 +0000
@@ -44,7 +44,7 @@
#include <direct/memcpy.h>
#include <direct/messages.h>
-#if defined (ARCH_PPC) || (SIZEOF_LONG == 8)
+#if defined (ARCH_PPC) || defined (ARCH_ARM) || (SIZEOF_LONG == 8)
# define RUN_BENCHMARK 1
#else
# define RUN_BENCHMARK 0
@@ -58,6 +58,10 @@
#include "ppcasm_memcpy.h"
#endif
+#ifdef USE_ARMASM
+#include "armasm_memcpy.h"
+#endif
+
#if SIZEOF_LONG == 8
@@ -152,6 +156,9 @@
{ "ppccache", "ppcasm_cacheable_memcpy()", direct_ppcasm_cacheable_memcpy, 0, 0},
#endif /* __LINUX__ */
#endif /* USE_PPCASM */
+#ifdef USE_ARMASM
+ { "arm", "armasm_memcpy()", direct_armasm_memcpy, 0, 0},
+#endif
{ NULL, NULL, NULL, 0, 0}
};
_______________________________________________
directfb-dev mailing list
[email protected]
http://mail.directfb.org/cgi-bin/mailman/listinfo/directfb-dev