#define NDEBUG
#include <string.h>
#include <assert.h>
#include <limits.h>
#include <stdint.h>
#include <stdio.h>
#include <stdlib.h>
#include <time.h>
#include <unistd.h>

/* default: SipHash-2-4 */
#define cROUNDS 1
#define dROUNDS 3

#define ALIGN (sizeof(size_t))
#define ONES ((size_t)-1 / UCHAR_MAX)
#define HIGHS (ONES * (UCHAR_MAX / 2 + 1))
#define HASZERO(x) ((x)-ONES & ~(x)&HIGHS)

#define ONES_64 ((uint64_t)-1 / UCHAR_MAX)
#define HIGHS_64 (ONES * (UCHAR_MAX / 2 + 1))
#define HASZERO_64(x) ((x)-ONES & ~(x)&HIGHS)

#define ONES_32 ((uint32_t)-1 / UCHAR_MAX)
#define HIGHS_32 (ONES_32 * (UCHAR_MAX / 2 + 1))
#define HASZERO_32(x) ((x)-ONES_32 & ~(x)&HIGHS_32)

#ifdef CHECK_FOR_EQUALS
#define NEEDS_TO_STOP(x) ((x) == (uint8_t)'\0' || (x) == (uint8_t)'=')
#define NEEDS_TO_STOP_32(x) (HASZERO_32(x) || HASZERO_32((x) ^ (ONES_32 * '=')))
#define NEEDS_TO_STOP_64(x) (HASZERO_64(x) || HASZERO_64((x) ^ (ONES_64 * '=')))
#else
#define NEEDS_TO_STOP(x) ((x) == (uint8_t)'\0')
#define NEEDS_TO_STOP_32(x) (HASZERO_32(x))
#define NEEDS_TO_STOP_64(x) (HASZERO_64(x))
#endif

#define grol(x, n) (((x) << (n)) | ((x) >> (32 - (n))))
#define gror(x, n) (((x) >> (n)) | ((x) << (32 - (n))))
#define ROTL(x, b) (uint32_t)(((x) << (b)) | ((x) >> (32 - (b))))

#ifdef __LITTLE_ENDIAN__
//#define U32TO8_LE(p, v) __builtin_memcpy((p), &(v), sizeof((v)))
#define U32TO8_LE(p, v) (*(uint32_t *)(p) = (v))
#define U8TO32_LE(p) (*(uint32_t *)(p))
//#define U8TO32_LE(p) ({ uint32_t _tmp; __builtin_memcpy(&_tmp, (p), sizeof(_tmp)); _tmp; })
#else
#define U32TO8_LE(p, v)                                                        \
	(p)[0] = (uint8_t)((v));                                               \
	(p)[1] = (uint8_t)((v) >> 8);                                          \
	(p)[2] = (uint8_t)((v) >> 16);                                         \
	(p)[3] = (uint8_t)((v) >> 24);

#define U8TO32_LE(p)                                                           \
	(((uint32_t)((p)[0])) | ((uint32_t)((p)[1]) << 8) |                    \
	 ((uint32_t)((p)[2]) << 16) | ((uint32_t)((p)[3]) << 24))

#endif
#define SIPROUND                                                               \
	do {                                                                   \
		v0 += v1;                                                      \
		v1 = ROTL(v1, 5);                                              \
		v1 ^= v0;                                                      \
		v0 = ROTL(v0, 16);                                             \
		v2 += v3;                                                      \
		v3 = ROTL(v3, 8);                                              \
		v3 ^= v2;                                                      \
		v0 += v3;                                                      \
		v3 = ROTL(v3, 7);                                              \
		v3 ^= v0;                                                      \
		v2 += v1;                                                      \
		v1 = ROTL(v1, 13);                                             \
		v1 ^= v2;                                                      \
		v2 = ROTL(v2, 16);                                             \
	} while (0)

#ifdef DEBUG
#define TRACE                                                                  \
	do {                                                                   \
		printf("(%3d) v0 %08x\n", (int)inlen, v0);                     \
		printf("(%3d) v1 %08x\n", (int)inlen, v1);                     \
		printf("(%3d) v2 %08x\n", (int)inlen, v2);                     \
		printf("(%3d) v3 %08x\n", (int)inlen, v3);                     \
	} while (0)
#else
#define TRACE
#endif
#ifdef ONE_AT_A_TIME
static inline uint32_t read_bytes(const uint8_t *in)
{
	uint32_t ret;
	if (NEEDS_TO_STOP(in[0]) || NEEDS_TO_STOP(in[1]) ||
	    NEEDS_TO_STOP(in[2]) || NEEDS_TO_STOP(in[3])) {
		return 0;
	}
	memcpy(&ret, in, sizeof(ret));
	return ret;
}

static inline uint64_t read_bytes_64(const uint8_t *in)
{
	uint64_t ret;
	if (NEEDS_TO_STOP(in[0]) || NEEDS_TO_STOP(in[1]) ||
	    NEEDS_TO_STOP(in[2]) || NEEDS_TO_STOP(in[3]) ||
	    NEEDS_TO_STOP(in[4]) || NEEDS_TO_STOP(in[5]) ||
	    NEEDS_TO_STOP(in[6]) || NEEDS_TO_STOP(in[7])) {
		return 0;
	}
	memcpy(&ret, in, sizeof(ret));
	return ret;
}
#else
static inline uint32_t read_bytes(const uint8_t *in)
{
	uint32_t ret;
	memcpy(&ret, in, sizeof(ret));
	;
	if (NEEDS_TO_STOP_32(ret))
		return 0;
	return ret;
}

static inline uint64_t read_bytes_64(const uint8_t *in)
{
	uint64_t ret;
	/* On 32-bit, it is better to split into a uint32_t before checking. */
	if (sizeof(void *) == 4) {
		uint32_t split[2];
		memcpy(split, in, sizeof(split));
		if (NEEDS_TO_STOP_32(split[0]))
			return 0;
		if (NEEDS_TO_STOP_32(split[1]))
			return 0;
		ret = split[0] | ((uint64_t)split[1] << 32);
	} else {
		memcpy(&ret, in, sizeof(ret));
		if (NEEDS_TO_STOP_64(ret))
			return 0;
	}
	return ret;
}
#endif
typedef unsigned u32x4 __attribute__((vector_size(16)));
uint32_t halfsiphash(const uint8_t *in, const size_t inlen, const uint8_t *k)
{
	uint32_t v0 = 0;
	uint32_t v1 = 0;
	uint32_t v2 = 0x6c796765;
	uint32_t v3 = 0x74656462;
	uint32_t k0 = U8TO32_LE(k);
	uint32_t k1 = U8TO32_LE(k + 4);
	uint32_t m;
	int i;
	const uint8_t *end = in + inlen - (inlen % sizeof(uint32_t));
	const int left = inlen & 3;
	uint32_t b = ((uint32_t)inlen) << 24;
	v3 ^= k1;
	v2 ^= k0;
	v1 ^= k1;
	v0 ^= k0;

	for (; in != end; in += 4) {
		m = U8TO32_LE(in);
		v3 ^= m;

		TRACE;
		for (i = 0; i < cROUNDS; ++i)
			SIPROUND;

		v0 ^= m;
	}

	switch (left) {
	case 3:
		b |= ((uint32_t)in[2]) << 16;
	case 2:
		b |= ((uint32_t)in[1]) << 8;
	case 1:
		b |= ((uint32_t)in[0]);
		break;
	case 0:
		break;
	}

	v3 ^= b;

	TRACE;
	for (i = 0; i < cROUNDS; ++i)
		SIPROUND;

	v0 ^= b;

	v2 ^= 0xff;

	TRACE;
	for (i = 0; i < dROUNDS; ++i)
		SIPROUND;

	b = v1 ^ v3;
	return b;
}

uint32_t halfsiphash_direct(const uint8_t *in, const uint8_t *k)
{
	uint32_t v0 = 0;
	uint32_t v1 = 0;
	uint32_t v2 = 0x6c796765;
	uint32_t v3 = 0x74656462;
	const uint8_t *start = in;
	uint32_t k0 = U8TO32_LE(k);
	uint32_t k1 = U8TO32_LE(k + 4);
	uint32_t m;
	uint32_t b = 0;
	int i;

	v3 ^= k1;
	v2 ^= k0;
	v1 ^= k1;
	v0 ^= k0;

	while ((m = read_bytes(in))) {
		v3 ^= m;

		TRACE;
		for (i = 0; i < cROUNDS; ++i)
			SIPROUND;

		v0 ^= m;
		in += 4;
	}

	/* Mix in the leftovers. */
	for (i = 0; i < sizeof(uint32_t); i++) {
		if (NEEDS_TO_STOP(*in))
			break;
		b |= ((uint32_t)*in++) << (i * CHAR_BIT);
	}

	/* add the length. In the original hash, this is done at the
     * beginning, but we don't know the length until now. */
	b |= (in - start) << 24;

	v3 ^= b;

	TRACE;
	for (i = 0; i < cROUNDS; ++i)
		SIPROUND;

	v0 ^= b;

	v2 ^= 0xff;

	TRACE;
	for (i = 0; i < dROUNDS; ++i)
		SIPROUND;

	b = v1 ^ v3;
	return b;
}
#undef SIPROUND
#undef TRACE
#undef ROTL

#define ROTL(x, b) (uint64_t)(((x) << (b)) | ((x) >> (64 - (b))))

#define U64TO8_LE(p, v)                                                        \
	U32TO8_LE((p), (uint32_t)((v)));                                       \
	U32TO8_LE((p) + 4, (uint32_t)((v) >> 32));
#define U8TO64_LE(p) (*(uint64_t *)(p))

#define SIPROUND                                                               \
	do {                                                                   \
		v0 += v1;                                                      \
		v1 = ROTL(v1, 13);                                             \
		v1 ^= v0;                                                      \
		v0 = ROTL(v0, 32);                                             \
		v2 += v3;                                                      \
		v3 = ROTL(v3, 16);                                             \
		v3 ^= v2;                                                      \
		v0 += v3;                                                      \
		v3 = ROTL(v3, 21);                                             \
		v3 ^= v0;                                                      \
		v2 += v1;                                                      \
		v1 = ROTL(v1, 17);                                             \
		v1 ^= v2;                                                      \
		v2 = ROTL(v2, 32);                                             \
	} while (0)

#ifdef DEBUG
#define TRACE                                                                  \
	do {                                                                   \
		printf("(%3d) v0 %08x %08x\n", (int)inlen,                     \
		       (uint32_t)(v0 >> 32), (uint32_t)v0);                    \
		printf("(%3d) v1 %08x %08x\n", (int)inlen,                     \
		       (uint32_t)(v1 >> 32), (uint32_t)v1);                    \
		printf("(%3d) v2 %08x %08x\n", (int)inlen,                     \
		       (uint32_t)(v2 >> 32), (uint32_t)v2);                    \
		printf("(%3d) v3 %08x %08x\n", (int)inlen,                     \
		       (uint32_t)(v3 >> 32), (uint32_t)v3);                    \
	} while (0)
#else
#define TRACE
#endif
#undef v0
#undef v1
#undef v2
#undef v3
uint64_t siphash(const uint8_t *in, const uint8_t *k)
{
	uint64_t v0 = 0x736f6d6570736575ULL;
	uint64_t v1 = 0x646f72616e646f6dULL;
	uint64_t v2 = 0x6c7967656e657261ULL;
	uint64_t v3 = 0x7465646279746573ULL;
	const uint8_t *start = in;
	uint64_t k0 = U8TO64_LE(k);
	uint64_t k1 = U8TO64_LE(k + 8);
	uint64_t m;
	int i;
	uint64_t b = 0;
	v3 ^= k1;
	v2 ^= k0;
	v1 ^= k1;
	v0 ^= k0;

	while ((m = read_bytes_64(in))) {
		v3 ^= m;

		TRACE;
		for (i = 0; i < cROUNDS; ++i)
			SIPROUND;

		v0 ^= m;
		in += 8;
	}

	/* Mix in the leftovers. */
	for (i = 0; i < sizeof(uint64_t); i++) {
		if (NEEDS_TO_STOP(*in))
			break;
		b |= ((uint64_t)*in++) << (i * CHAR_BIT);
	}
	/* add the length. In the original hash, this is done at the
     * beginning, but we don't know the length until now. */
	b |= (uint64_t)(in - start) << 56;

	v3 ^= b;

	TRACE;
	for (i = 0; i < cROUNDS; ++i)
		SIPROUND;

	v0 ^= b;

	v2 ^= 0xff;

	TRACE;
	for (i = 0; i < dROUNDS; ++i)
		SIPROUND;

	b = v0 ^ v1 ^ v2 ^ v3;

	return b;
}
__attribute__((__noinline__)) char *bsd_strchrnul(const char *p, int ch)
{
	char c;

	c = ch;
	for (;; ++p) {
		if (*p == c || *p == '\0')
			return ((char *)p);
	}
	/* NOTREACHED */
}
__attribute__((__noinline__)) char *musl_strchrnul(const char *s, int c)
{
	c = (unsigned char)c;
	if (!c)
		return (char *)s + strlen(s);

#ifdef __GNUC__
	typedef size_t __attribute__((__may_alias__)) word;
	const word *w;
	for (; (uintptr_t)s % ALIGN; s++)
		if (!*s || *(const unsigned char *)s == c)
			return (char *)s;
	size_t k = ONES * c;
	for (w = (const word *)s; !HASZERO(*w) && !HASZERO(*w ^ k); w++)
		;
	s = (const char *)w;
#endif
	for (; *s && *(unsigned char *)s != c; s++)
		;
	return (char *)s;
}
static uint8_t key[16] = { 0 };
#pragma weak strchrnul
extern "C" char *strchrnul(const char *a, int c) __attribute__((weak_import)); // weak declaration must always be present

#ifdef CHECK_FOR_EQUALS
static unsigned HalfSipStrChrBSD(const char *s)
{
	size_t len = bsd_strchrnul(s, '=') - s;
	return halfsiphash((const uint8_t *)s, len, key);
}

static unsigned HalfSipStrChrMusl(const char *s)
{
	size_t len = musl_strchrnul(s, '=') - s;
	return halfsiphash((const uint8_t *)s, len, key);
}

static unsigned HalfSipStrChr(const char *s)
{
	if (strchrnul != NULL) {
		size_t len = strchrnul(s, '=') - s;
		return halfsiphash((const uint8_t *)s, len, key);
	}
	return 0;
}
#else
static unsigned HalfSipStrlen(const char *s)
{
	size_t len = strlen(s);
	return halfsiphash((const uint8_t *)s, len, key);
}
#endif

static unsigned HalfSipDirect(const char *str)
{
	return halfsiphash_direct((const uint8_t *)str, key);
}

static unsigned SipDirect(const char *str)
{
	uint64_t ret = siphash((const uint8_t *)str, key);
	return (unsigned)(ret ^ (ret >> 32));
}

unsigned hash_seed = 0;
static unsigned hash_var(const char *name)
{
	const unsigned char *str = (const unsigned char *)name;
	uint32_t h1 = hash_seed ^ 0x3b00;
	uint32_t h2 = grol(hash_seed, 15);
	while (!NEEDS_TO_STOP(*str)) {
		h1 += *str++;
		h1 += h1 << 3; /* h1 *= 9 */
		h2 += h1;
		/* the rest could be as in MicroOAAT: h1 = grol(h1, 7)
		 * but clang doesn't generate ROTL instruction then. */
		h2 = grol(h2, 7);
		h2 += h2 << 2; /* h2 *= 5 */
	}
	h1 ^= h2;
	/* now h1 passes all collision checks,
	 * so it is suitable for hash-tables with prime numbers. */
	h1 += grol(h2, 14);
	h2 ^= h1;
	h2 += gror(h1, 6);
	h1 ^= h2;
	h1 += grol(h2, 5);
	h2 ^= h1;
	h2 += gror(h1, 8);
	return h2;
}

static unsigned hash_var_fnv(const char *name)
{
	const unsigned prime = 16777619U;
	unsigned hash = 2166136261U ^ hash_seed;
	while (!NEEDS_TO_STOP(*name))
		hash = (hash ^ (uint8_t)*name++) * prime;
	hash += hash << 13;
	hash ^= hash >> 7;
	hash += hash << 3;
	hash ^= hash >> 17;
	hash += hash << 5;
	return hash;
}

static unsigned bad_hash(const char *name)
{
	unsigned hash = (unsigned)(name[0]) << 4;

	while (!NEEDS_TO_STOP(*name)) {
		hash += (unsigned char)*name++;
	}
	return hash;
}

static unsigned DoNothing(const char *name)
{
	return 0;
}

static unsigned JustRead(const char *name)
{
	volatile uint8_t a;
	while (!NEEDS_TO_STOP(*name)) {
		a = (unsigned char)*name++;
	}
	return 0;
}

static unsigned JustRead32(const char *name)
{
	volatile uint32_t a;
	while ((a = read_bytes((const uint8_t *)name))) {
		name += 4;
	}
	return 0;
}

static unsigned JustRead64(const char *name)
{
	volatile uint8_t a;
	while ((a = read_bytes_64((const uint8_t *)name))) {
		name += 8;
	}
	return 0;
}

#ifndef rounds
#define rounds 100
#endif
#include <vector>
#include <unistd.h>
#include <sys/random.h>
static FILE *f;
__attribute__((__noinline__)) double _run(const char *name,
					  unsigned (*func)(const char *),
					  const std::vector<const char *> &vec)
{
	double start, end;
	unsigned acc = 0;
	start = (double)clock();
	for (int i = 0; i < rounds; i++) {
		for (const auto &str : vec) {
			acc += (*func)(str);
			__asm__("" : "+r"(acc));
		}
	}
	end = (double)clock();
	printf("%-20s%lfs, hash sum: %u\n", name,
	       (end - start) / CLOCKS_PER_SEC, acc);
	return end - start;
}


#define run(x) _run(#x, x, vec)
int main()
{
	char *buf = NULL;
	size_t size = 0;
	std::vector<const char *> vec;

	getentropy(key, sizeof(key));
	puts("Enter a filename:");
	getline(&buf, &size, stdin);

	char *end = musl_strchrnul(buf, '\n');
	*end = '\0';

	f = fopen(buf, "rb");
	if (!f) {
		puts("Couldn't open file!");
		return 1;
	}
	puts("Reading file...");
	// Sorry, not dealing with a dynamic array in C.
	while (getline(&buf, &size, f) > 0) {
		vec.push_back(strdup(buf));
	}
	fclose(f);

	run(DoNothing); // does nothing
	run(JustRead); // reads and checks for terminators, that's it
	run(JustRead32); // same as above but reads in 32-bit chunks
	run(JustRead64); // same for 64-bit chunks
	run(HalfSipDirect);
	run(SipDirect);
#ifdef CHECK_FOR_EQUALS
	if (strchrnul != NULL) {
		run(HalfSipStrChr);
	}
	run(HalfSipStrChrMusl);
	run(HalfSipStrChrBSD);
#else
	run(HalfSipStrlen);
#endif
	run(hash_var);
	run(hash_var_fnv);
	run(bad_hash);
}
