Add perf benchmarks that exercise the arch-independent and x86[_64] versions
of do_csum to the perf suite.  Other arches can be added as needed.  To avoid
creating a new suite instance (which didn't seem warranted), the csum
benchmarks have been added to the mem suite.

Signed-off-by: Neil Horman <[email protected]>
CC: [email protected]
CC: Thomas Gleixner <[email protected]>
CC: Ingo Molnar <[email protected]>
CC: "H. Peter Anvin" <[email protected]>
CC: [email protected]
---
 tools/perf/Makefile.perf               |   3 +
 tools/perf/bench/bench.h               |   2 +
 tools/perf/bench/mem-csum-generic.c    |  21 +++
 tools/perf/bench/mem-csum-x86-64-def.h |   8 +
 tools/perf/bench/mem-csum-x86-64.c     |  51 +++++++
 tools/perf/bench/mem-csum.c            | 266 +++++++++++++++++++++++++++++++++
 tools/perf/bench/mem-csum.h            |  46 ++++++
 tools/perf/builtin-bench.c             |   1 +
 8 files changed, 398 insertions(+)
 create mode 100644 tools/perf/bench/mem-csum-generic.c
 create mode 100644 tools/perf/bench/mem-csum-x86-64-def.h
 create mode 100644 tools/perf/bench/mem-csum-x86-64.c
 create mode 100644 tools/perf/bench/mem-csum.c
 create mode 100644 tools/perf/bench/mem-csum.h

diff --git a/tools/perf/Makefile.perf b/tools/perf/Makefile.perf
index 5b86390..d0ac05b 100644
--- a/tools/perf/Makefile.perf
+++ b/tools/perf/Makefile.perf
@@ -413,9 +413,12 @@ BUILTIN_OBJS += $(OUTPUT)bench/sched-pipe.o
 ifeq ($(RAW_ARCH),x86_64)
 BUILTIN_OBJS += $(OUTPUT)bench/mem-memcpy-x86-64-asm.o
 BUILTIN_OBJS += $(OUTPUT)bench/mem-memset-x86-64-asm.o
+BUILTIN_OBJS += $(OUTPUT)bench/mem-csum-x86-64.o
 endif
 BUILTIN_OBJS += $(OUTPUT)bench/mem-memcpy.o
 BUILTIN_OBJS += $(OUTPUT)bench/mem-memset.o
+BUILTIN_OBJS += $(OUTPUT)bench/mem-csum.o
+BUILTIN_OBJS += $(OUTPUT)bench/mem-csum-generic.o
 
 BUILTIN_OBJS += $(OUTPUT)builtin-diff.o
 BUILTIN_OBJS += $(OUTPUT)builtin-evlist.o
diff --git a/tools/perf/bench/bench.h b/tools/perf/bench/bench.h
index 0fdc852..3bbe43e 100644
--- a/tools/perf/bench/bench.h
+++ b/tools/perf/bench/bench.h
@@ -32,6 +32,8 @@ extern int bench_mem_memcpy(int argc, const char **argv,
                            const char *prefix __maybe_unused);
 extern int bench_mem_memset(int argc, const char **argv, const char *prefix);
 
+extern int bench_mem_csum(int argc, const char **argv, const char *prefix);
+
 #define BENCH_FORMAT_DEFAULT_STR       "default"
 #define BENCH_FORMAT_DEFAULT           0
 #define BENCH_FORMAT_SIMPLE_STR                "simple"
diff --git a/tools/perf/bench/mem-csum-generic.c b/tools/perf/bench/mem-csum-generic.c
new file mode 100644
index 0000000..3e77b0d
--- /dev/null
+++ b/tools/perf/bench/mem-csum-generic.c
@@ -0,0 +1,21 @@
+#include "mem-csum.h"
+
+u32 generic_do_csum(unsigned char *buff, unsigned int len);
+
+__wsum csum_partial_copy(const void *src, void *dst, int len, __wsum sum);
+
+/*
+ * Each arch specific implementation file exports these functions,
+ * so we get link time conflicts.  Since we're not testing these paths right now
+ * just rename them to something generic here
+ */
+#define csum_partial(x, y, z) csum_partial_generic(x, y, z)
+#define ip_compute_csum(x, y) ip_complete_csum_generic(x, y)
+
+#include "../../../lib/checksum.c"
+
+u32 generic_do_csum(unsigned char *buff, unsigned int len)
+{
+       return do_csum(buff, len);
+}
+
diff --git a/tools/perf/bench/mem-csum-x86-64-def.h b/tools/perf/bench/mem-csum-x86-64-def.h
new file mode 100644
index 0000000..6698193
--- /dev/null
+++ b/tools/perf/bench/mem-csum-x86-64-def.h
@@ -0,0 +1,8 @@
+/*
+ * Arch specific bench tests for x86[_64]
+ */
+
+CSUM_FN(x86_do_csum, x86_do_csum_init,
+       "x86-64-csum",
+       "x86 unrolled optimized csum() from kernel")
+
diff --git a/tools/perf/bench/mem-csum-x86-64.c b/tools/perf/bench/mem-csum-x86-64.c
new file mode 100644
index 0000000..72bc855
--- /dev/null
+++ b/tools/perf/bench/mem-csum-x86-64.c
@@ -0,0 +1,51 @@
+#include "mem-csum.h"
+
+static int clflush_size;
+
+/*
+ * This overrides the cache_line_size() function from the kernel
+ * The kernel version returns the size of the processor cache line, so 
+ * we emulate that here
+ */
+static inline int cache_line_size(void)
+{
+       return clflush_size;
+}
+
+/*
+ * userspace has no idea what these macros do; perf doesn't need them, so
+ * make them go away (unlikely() just yields its parenthesized argument)
+ */
+#define unlikely(x) (x)
+#define EXPORT_SYMBOL(x)
+
+u32 x86_do_csum(unsigned char *buff, unsigned int len);
+void x86_do_csum_init(void);
+
+#include "../../../arch/x86/lib/csum-partial_64.c"
+
+u32 x86_do_csum(unsigned char *buff, unsigned int len)
+{
+       return do_csum(buff, len);
+}
+
+/* Derive clflush_size (in bytes) from cpuid leaf 1 for cache_line_size() */
+void x86_do_csum_init(void)
+{
+       unsigned int eax = 1, ebx;
+
+       /*
+        * The do_csum routine we're testing requires the kernel
+        * implementation of cache_line_size(), which relies on data
+        * parsed from the cpuid instruction, do that computation here.
+        * cpuid overwrites eax, ebx, ecx and edx, so tell the compiler.
+        */
+       asm volatile("cpuid" : "+a" (eax), "=b" (ebx) : : "ecx", "edx");
+
+       /*
+        * The size of a cache line evicted by a clflush operation is
+        * contained in bits 15:8 of ebx when cpuid 0x1 is issued
+        * and is reported in 8 byte words, hence the multiplication below
+        */
+       clflush_size = ((ebx >> 8) & 0xff) * 8;
+}
diff --git a/tools/perf/bench/mem-csum.c b/tools/perf/bench/mem-csum.c
new file mode 100644
index 0000000..3676f6e
--- /dev/null
+++ b/tools/perf/bench/mem-csum.c
@@ -0,0 +1,266 @@
+/*
+ * mem-csum.c
+ *
+ * csum: checksum speed tests
+ *
+ */
+
+#include "../perf.h"
+#include "../util/util.h"
+#include "../util/parse-options.h"
+#include "../util/header.h"
+#include "bench.h"
+
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include <sys/time.h>
+#include <errno.h>
+
+#define K 1024
+
+static const char      *length_str     = "1500B";
+static const char      *size_str       = "64MB";
+static const char      *routine        = "default";
+static int             iterations      = 1;
+static bool            use_cycle;
+static int             cycle_fd;
+
+/* Command-line options accepted by "perf bench mem csum" */
+static const struct option options[] = {
+       OPT_STRING('l', "length", &length_str, "1500B",
+                   "Specify length of memory to checksum. "
+                   "Available units: B, KB, MB, GB and TB (upper and lower)"),
+       OPT_STRING('s', "size", &size_str, "64MB",
+                   "Size of working set to draw csumed buffer from. "
+                   "Available units: B, KB, MB, GB and TB"),
+       OPT_STRING('r', "routine", &routine, "default",
+                   "Specify routine to use"),
+       OPT_INTEGER('i', "iterations", &iterations,
+                   "repeat csum() invocation this number of times"),
+       OPT_BOOLEAN('c', "cycle", &use_cycle,
+                   "Use cycles event instead of gettimeofday() for measuring"),
+       OPT_END()
+};
+
+extern u32 generic_do_csum(unsigned char *buff, unsigned int len);
+
+#ifdef HAVE_ARCH_X86_64_SUPPORT
+extern u32 x86_do_csum(unsigned char *buff, unsigned int len);
+extern void x86_do_csum_init(void);
+#endif
+
+typedef u32 (*csum_t)(unsigned char *, unsigned int);
+typedef void (*csum_init_t)(void);
+
+struct routine {
+       const char *name;
+       const char *desc;
+       csum_t fn;
+       csum_init_t initfn;
+};
+
+static const struct routine routines[] = {
+       { "default",
+         "Default arch-independent csum",
+         generic_do_csum,
+         NULL },
+#ifdef HAVE_ARCH_X86_64_SUPPORT
+#define CSUM_FN(fn, init, name, desc) { name, desc, fn, init },
+#include "mem-csum-x86-64-def.h"
+#undef CSUM_FN
+
+#endif
+
+       { NULL,
+         NULL,
+         NULL,
+         NULL }
+};
+
+static const char * const bench_mem_csum_usage[] = {
+       "perf bench mem csum <options>",
+       NULL
+};
+
+static struct perf_event_attr cycle_attr = {
+       .type           = PERF_TYPE_HARDWARE,
+       .config         = PERF_COUNT_HW_CPU_CYCLES
+};
+
+static void init_cycle(void)
+{
+       cycle_fd = sys_perf_event_open(&cycle_attr, getpid(), -1, -1, 0);
+
+       if (cycle_fd < 0 && errno == ENOSYS)
+               die("No CONFIG_PERF_EVENTS=y kernel support configured?\n");
+       else
+               BUG_ON(cycle_fd < 0);
+}
+
+/* Read the current 64-bit counter value from cycle_fd; BUG on a short read */
+static u64 get_cycle(void)
+{
+       u64 cycles;
+       int nread = read(cycle_fd, &cycles, sizeof(cycles));
+
+       BUG_ON(nread != sizeof(cycles));
+
+       return cycles;
+}
+
+static double timeval2double(struct timeval *ts)
+{
+       return (double)ts->tv_sec +
+               (double)ts->tv_usec / (double)1000000;
+}
+
+static void alloc_mem(void **dst, size_t length)
+{
+       *dst = malloc(length);
+       if (!*dst)
+               die("memory allocation failed - maybe length is too large?\n");
+}
+
+
+/* Sum the cycles fn() takes over 'iterations' random len-byte pool segments */
+static u64 do_csum_cycle(csum_t fn, size_t size, size_t len)
+{
+       u64 cycle_start, cycle_end, total_cycles = 0;
+       void *dst, *pool = NULL;
+       unsigned int segments;
+       int i;
+
+       /* fewer than 2 segments would make 'segments' zero (or wrap) below */
+       if (size / len < 2)
+               die("working set size must be at least twice the length\n");
+       segments = (size / len) - 1;
+       alloc_mem(&pool, size);
+       for (i = 0; i < iterations; ++i) {
+               dst = pool + ((random() % segments) * len);
+               cycle_start = get_cycle();
+               fn(dst, len);
+               cycle_end = get_cycle();
+               total_cycles += (cycle_end - cycle_start);
+       }
+       free(pool);
+       return total_cycles;
+}
+
+/* Return the bytes/sec fn() sustains over 'iterations' random pool segments */
+static double do_csum_gettimeofday(csum_t fn, size_t size, size_t len)
+{
+       struct timeval tv_start, tv_end, tv_diff, tv_total;
+       void *dst, *pool = NULL;
+       unsigned int segments;
+       int i;
+
+       /* fewer than 2 segments would make 'segments' zero (or wrap) below */
+       if (size / len < 2)
+               die("working set size must be at least twice the length\n");
+       segments = (size / len) - 1;
+       alloc_mem(&pool, size);
+       timerclear(&tv_total);
+       for (i = 0; i < iterations; ++i) {
+               dst = pool + ((random() % segments) * len);
+               BUG_ON(gettimeofday(&tv_start, NULL));
+               fn(dst, len);
+               BUG_ON(gettimeofday(&tv_end, NULL));
+               timersub(&tv_end, &tv_start, &tv_diff);
+               timeradd(&tv_total, &tv_diff, &tv_total);
+       }
+       free(pool);
+       return (double)((double)(len*iterations) / timeval2double(&tv_total));
+}
+
+#define print_bps(x) do {      /* scale bytes/sec to B, KB, MB or GB */ \
+               if ((x) < K)                                    \
+                       printf(" %14lf B/Sec\n", (x));          \
+               else if ((x) < K * K)                           \
+                       printf(" %14lf KB/Sec\n", (x) / K);     \
+               else if ((x) < K * K * K)                       \
+                       printf(" %14lf MB/Sec\n", (x) / K / K); \
+               else                                            \
+                       printf(" %14lf GB/Sec\n", (x) / K / K / K); \
+       } while (0)
+
+/* perf bench mem csum entry point; returns 0 on success, 1 on bad arguments */
+int bench_mem_csum(int argc, const char **argv,
+                  const char *prefix __maybe_unused)
+{
+       int i;
+       size_t len;
+       size_t setsize;
+       double result_bps = 0.0;
+       u64 result_cycle = 0ULL;
+
+       argc = parse_options(argc, argv, options, bench_mem_csum_usage, 0);
+
+       if (use_cycle)
+               init_cycle();
+
+       len = (size_t)perf_atoll((char *)length_str);
+       setsize = (size_t)perf_atoll((char *)size_str);
+
+       if ((s64)len <= 0) {
+               fprintf(stderr, "Invalid length:%s\n", length_str);
+               return 1;
+       }
+       /* the workers pick among (setsize / len) - 1 segments: need >= 2 */
+       if ((s64)setsize <= 0 || setsize / len < 2) {
+               fprintf(stderr, "Invalid size:%s\n", size_str);
+               return 1;
+       }
+
+       for (i = 0; routines[i].name; i++) {
+               if (!strcmp(routines[i].name, routine))
+                       break;
+       }
+       if (!routines[i].name) {
+               printf("Unknown routine:%s\n", routine);
+               printf("Available routines...\n");
+               for (i = 0; routines[i].name; i++) {
+                       printf("\t%s ... %s\n",
+                              routines[i].name, routines[i].desc);
+               }
+               return 1;
+       }
+
+       if (routines[i].initfn)
+               routines[i].initfn();
+
+       if (bench_format == BENCH_FORMAT_DEFAULT)
+               printf("# Checksumming %s Bytes ...\n\n", length_str);
+
+       if (use_cycle) {
+               result_cycle =
+                       do_csum_cycle(routines[i].fn, setsize, len);
+       } else {
+               result_bps =
+                       do_csum_gettimeofday(routines[i].fn, setsize, len);
+       }
+
+       switch (bench_format) {
+       case BENCH_FORMAT_DEFAULT:
+               if (use_cycle) {
+                       printf(" %14lf Cycle/Byte\n",
+                               (double)result_cycle
+                               / (double)(len*iterations));
+               } else
+                       print_bps(result_bps);
+               break;
+       case BENCH_FORMAT_SIMPLE:
+               if (use_cycle) {
+                       printf("%lf\n", (double)result_cycle
+                               / (double)(len*iterations));
+               } else
+                       printf("%lf\n", result_bps);
+               break;
+       default:
+               /* reaching this means there's some disaster: */
+               die("unknown format: %d\n", bench_format);
+               break;
+       }
+
+       return 0;
+}
diff --git a/tools/perf/bench/mem-csum.h b/tools/perf/bench/mem-csum.h
new file mode 100644
index 0000000..cca9a77
--- /dev/null
+++ b/tools/perf/bench/mem-csum.h
@@ -0,0 +1,46 @@
+/*
+ * Header for mem-csum
+ * mostly trickery to get the kernel code to compile
+ * in user space
+ */
+
+#include "../util/util.h"
+
+#include <linux/types.h>
+
+
+typedef __u16 __le16;
+typedef __u16 __be16;
+typedef __u32 __le32;
+typedef __u32 __be32;
+typedef __u64 __le64;
+typedef __u64 __be64;
+
+typedef __u16 __sum16;
+typedef __u32 __wsum;
+
+/*
+ * __visible isn't defined in userspace, so make it disappear
+ */
+#define __visible
+
+/*
+ * These get multiple definitions in the kernel with a common inline version
+ * We're not testing them so just move them to another name
+ */
+#define ip_fast_csum ip_fast_csum_backup
+#define csum_tcpudp_nofold csum_tcpudp_nofold_backup
+
+/*
+ * Most csum implementations need this defined, for the copy_and_csum variants.
+ * Since we're building in userspace, this can be voided out
+ */
+static inline int __copy_from_user(void *dst, const void *src, size_t len)
+{
+       (void)dst;
+       (void)src;
+       (void)len;
+       return 0;
+}
+
+
diff --git a/tools/perf/builtin-bench.c b/tools/perf/builtin-bench.c
index e47f90c..44199e0 100644
--- a/tools/perf/builtin-bench.c
+++ b/tools/perf/builtin-bench.c
@@ -50,6 +50,7 @@ static struct bench sched_benchmarks[] = {
 static struct bench mem_benchmarks[] = {
        { "memcpy",     "Benchmark for memcpy()",                       bench_mem_memcpy        },
        { "memset",     "Benchmark for memset() tests",                 bench_mem_memset        },
+       { "csum",       "Simple csum timing for various arches",        bench_mem_csum          },
        { "all",        "Test all memory benchmarks",                   NULL                    },
        { NULL,         NULL,                                           NULL                    }
 };
-- 
1.8.3.1

--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to [email protected]
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/

Reply via email to