
#define _GNU_SOURCE
#include <sched.h>
#include <pthread.h>
#include <unistd.h>
#include <stdio.h>
#include <stdlib.h>
#include <assert.h>
#include <math.h>
#include <sys/mman.h>
#include <stdarg.h>

#include "barrier.h"
#include "perf.h"
#include "stat.h"

/*
 * Attributes of the single perf event opened by main(): a hardware
 * CPU-cycle counter. exclude_kernel and pinned are set as documented for
 * struct perf_event_attr in perf_event_open(2); all other fields are left
 * zero-initialized.
 */
static struct perf_event_attr perf_attr = {
	.type = PERF_TYPE_HARDWARE,
	.config = PERF_COUNT_HW_CPU_CYCLES,
	.exclude_kernel = 1,
	.pinned = 1,
};

/*
 * Report a fatal error and terminate the process.
 *
 * @err: printf-style format string; the remaining arguments are its values.
 *
 * The formatted message goes to stderr exactly as given (no newline is
 * appended), after which the process exits with status -1. Never returns.
 */
void die(const char *err, ...)
{
	va_list args;

	va_start(args, err);
	(void)vfprintf(stderr, err, args);
	va_end(args);

	exit(-1);
}

/*
 * Value of sysconf(_SC_PAGESIZE), filled in by main() and used there as the
 * length of the perf event mmap.
 */
static unsigned long page_size;

/*
 * Multiply two 32-bit values and return the full 64-bit product.
 *
 * @a: first factor
 * @b: second factor
 *
 * When compiling for 32-bit x86 the product is computed with a single
 * one-operand MULL instruction; everywhere else a plain widening C multiply
 * is used.
 */
static inline u64 mul_u32_u32(u32 a, u32 b)
{
#ifdef __i386__
	u32 high, low;

	/*
	 * MULL computes EDX:EAX = EAX * operand. The one-operand form has no
	 * immediate encoding, so the operand is constrained to a register or
	 * memory ("rm"); a general "g" constraint would allow the compiler to
	 * pass a constant, which does not assemble.
	 */
	asm ("mull %[b]" : "=a" (low), "=d" (high)
			 : [a] "a" (a), [b] "rm" (b) );

	return low | ((u64)high << 32);
#else
	return (u64)a * b;
#endif
}

/*
 * Compute (a * mul) >> shift, truncated to 64 bits, from 32x32->64
 * multiplies. The multiply for the upper half of @a is skipped when that
 * half is zero.
 *
 * @a:     64-bit multiplicand
 * @mul:   32-bit multiplier
 * @shift: right shift applied to the product; the `32 - shift` below
 *         requires shift <= 32
 */
static inline u64 mul_u64_u32_shr_cond(u64 a, u32 mul, unsigned int shift)
{
	const u32 lo = (u32)a;
	const u32 hi = (u32)(a >> 32);
	u64 result = mul_u32_u32(lo, mul) >> shift;

	/* Only pay for the second multiply when the upper word contributes. */
	if (hi != 0)
		result += mul_u32_u32(hi, mul) << (32 - shift);

	return result;
}

/*
 * Compute (a * mul) >> shift, truncated to 64 bits, from two 32x32->64
 * multiplies. Both halves of @a are always multiplied (no branch).
 *
 * @a:     64-bit multiplicand
 * @mul:   32-bit multiplier
 * @shift: right shift applied to the product; the `32 - shift` below
 *         requires shift <= 32
 */
static inline u64 mul_u64_u32_shr(u64 a, u32 mul, unsigned int shift)
{
	const u32 lo = (u32)a;
	const u32 hi = (u32)(a >> 32);
	const u64 lo_part = mul_u32_u32(lo, mul) >> shift;
	const u64 hi_part = mul_u32_u32(hi, mul) << (32 - shift);

	return lo_part + hi_part;
}

#ifdef __SIZEOF_INT128__
/*
 * Compute (a * mul) >> shift using a single 128-bit multiply and return the
 * low 64 bits of the shifted product. Only built when the compiler provides
 * __int128.
 *
 * @a:     64-bit multiplicand
 * @mul:   32-bit multiplier
 * @shift: right shift applied to the 128-bit product
 */
static inline u64 mul_u64_u32_shr_128(u64 a, u32 mul, unsigned int shift)
{
	unsigned __int128 product = a;

	product *= mul;
	product >>= shift;

	return (u64)product;
}
#endif

/*
 * Benchmark entry point.
 *
 * Opens the perf event described by perf_attr for the calling thread, maps
 * its first page read-only and then, one million times, brackets each of the
 * following with a pair of mmap_read_pinned() calls:
 *
 *   s1: nothing (two back-to-back reads; the baseline)
 *   s2: mul_u64_u32_shr_cond()
 *   s3: mul_u64_u32_shr()
 *   s4: mul_u64_u32_shr_128()   (only when __int128 is available)
 *
 * Finally prints the accumulated value and, per variant, its average minus
 * the baseline average together with its standard deviation.
 *
 * Returns 0 on success; setup failures terminate through die().
 */
int main(void)
{
	void *event;
	int fd, i;
	u64 cyc;
	/*
	 * Value fed through the multiply helpers and accumulated across
	 * iterations. It must start from a defined, non-zero value: reading it
	 * uninitialized is undefined behaviour, and zero would stay zero
	 * through every multiply.
	 */
	u64 delta = 1;
	struct stats s1, s2, s3, s4;

	init_stats(&s1);
	init_stats(&s2);
	init_stats(&s3);
	init_stats(&s4);

	page_size = sysconf(_SC_PAGESIZE);

	fd = sys_perf_event_open(&perf_attr, 0, -1, -1, 0);
	if (fd < 0)
		die("failed to create perf_event");

	event = mmap(NULL, page_size, PROT_READ, MAP_SHARED, fd, 0);
	if (event == MAP_FAILED)
		die("failed to mmap perf_event");

	/* The mapping is all that is used from here on. */
	close(fd);

	/*
	 * Result is discarded.
	 * NOTE(review): presumably a warm-up read before timing starts - confirm.
	 */
	cyc = mmap_read_pinned(event);

	for (i = 0; i < 1000000; i++) {
		/* Baseline: cost of two consecutive reads with nothing between. */
		cyc = mmap_read_pinned(event);
//		barrier();
		cyc = mmap_read_pinned(event) - cyc;
		update_stats(&s1, cyc);

		cyc = mmap_read_pinned(event);
//		barrier();
		delta += mul_u64_u32_shr_cond(delta, i, 10);
		cyc = mmap_read_pinned(event) - cyc;
		update_stats(&s2, cyc);

		cyc = mmap_read_pinned(event);
//		barrier();
		delta += mul_u64_u32_shr(delta, i, 10);
		cyc = mmap_read_pinned(event) - cyc;
		update_stats(&s3, cyc);

#ifdef __SIZEOF_INT128__
		cyc = mmap_read_pinned(event);
//		barrier();
		delta += mul_u64_u32_shr_128(delta, i, 10);
		cyc = mmap_read_pinned(event) - cyc;
		update_stats(&s4, cyc);
#endif
	}

	/*
	 * Printing the accumulated value keeps the multiplies observable.
	 * delta is unsigned 64-bit, so print it with a standard unsigned
	 * conversion and a matching cast.
	 */
	printf("%llu\n", (unsigned long long)delta);

	printf("cond: avg: %f +- %f\n",
			avg_stats(&s2) - avg_stats(&s1),
			stddev_stats(&s2));

	printf("uncond: avg: %f +- %f\n",
			avg_stats(&s3) - avg_stats(&s1),
			stddev_stats(&s3));

#ifdef __SIZEOF_INT128__
	printf("128: avg: %f +- %f\n",
			avg_stats(&s4) - avg_stats(&s1),
			stddev_stats(&s4));
#endif

	return 0;
}

