#include "crc32c.h"

#define CRC32C3X8(ITR) \
	crc1 = __crc32cd(crc1, *((const uint64_t *)data + 42*1 + (ITR)));\
	crc2 = __crc32cd(crc2, *((const uint64_t *)data + 42*2 + (ITR)));\
	crc0 = __crc32cd(crc0, *((const uint64_t *)data + 42*0 + (ITR)));

#define CRC32C7X3X8(ITR) do {\
	CRC32C3X8((ITR)*7+0) \
	CRC32C3X8((ITR)*7+1) \
	CRC32C3X8((ITR)*7+2) \
	CRC32C3X8((ITR)*7+3) \
	CRC32C3X8((ITR)*7+4) \
	CRC32C3X8((ITR)*7+5) \
	CRC32C3X8((ITR)*7+6) \
	} while(0)

#ifndef HWCAP_CRC32
#define HWCAP_CRC32             (1 << 7)
#endif /* HWCAP_CRC32 */

int crc32c_arm64_available = 0;

#ifdef ARCH_HAVE_ARM64_CRC_CRYPTO

#include <sys/auxv.h>
#include <arm_acle.h>
#include <arm_neon.h>

static int crc32c_probed;

/*
 * Function to calculate reflected crc with PMULL Instruction
 * crc done "by 3" for fixed input block size of 1024 bytes
 */
uint32_t crc32c_arm64(unsigned char const *data, unsigned long length)
{
	signed long len = length;
	uint32_t crc = ~0;
	uint32_t crc0, crc1, crc2;

	/* Load two consts: K1 and K2 */
	const poly64_t k1 = 0xe417f38a, k2 = 0x8f158014;
	uint64_t t0, t1;

	while ((len -= 1024) >= 0) {
		/* Do first 8 bytes here for better pipelining */
		crc0 = __crc32cd(crc, *(const uint64_t *)data);
		crc1 = 0;
		crc2 = 0;
		data += sizeof(uint64_t);

		/* Process block inline
		   Process crc0 last to avoid dependency with above */
		CRC32C7X3X8(0);
		CRC32C7X3X8(1);
		CRC32C7X3X8(2);
		CRC32C7X3X8(3);
		CRC32C7X3X8(4);
		CRC32C7X3X8(5);

		data += 42*3*sizeof(uint64_t);

		/* Merge crc0 and crc1 into crc2
		   crc1 multiply by K2
		   crc0 multiply by K1 */

		t1 = (uint64_t)vmull_p64(crc1, k2);
		t0 = (uint64_t)vmull_p64(crc0, k1);
		crc = __crc32cd(crc2, *(const uint64_t *)data);
		crc1 = __crc32cd(0, t1);
		crc ^= crc1;
		crc0 = __crc32cd(0, t0);
		crc ^= crc0;

		data += sizeof(uint64_t);
	}

	if (!(len += 1024))
		return crc;

	while ((len -= sizeof(uint64_t)) >= 0) {
                crc = __crc32cd(crc, *(const uint64_t *)data);
                data += sizeof(uint64_t);
        }

        /* The following is more efficient than the straight loop */
        if (len & sizeof(uint32_t)) {
                crc = __crc32cw(crc, *(const uint32_t *)data);
                data += sizeof(uint32_t);
        }
        if (len & sizeof(uint16_t)) {
                crc = __crc32ch(crc, *(const uint16_t *)data);
                data += sizeof(uint16_t);
        }
        if (len & sizeof(uint8_t)) {
                crc = __crc32cb(crc, *(const uint8_t *)data);
        }

	return crc;
}

void crc32c_arm64_probe(void)
{
	unsigned long hwcap;

	if (!crc32c_probed) {
		hwcap = getauxval(AT_HWCAP);
		if (hwcap & HWCAP_CRC32)
			crc32c_arm64_available = 1;
		crc32c_probed = 1;
	}
}

#endif /* ARCH_HAVE_ARM64_CRC_CRYPTO */