AND Algorithmique Numérique Distribuée

Public GIT Repository
Upgrade embedded xxhash (-> v0.8.1).
author	Arnaud Giersch <arnaud.giersch@univ-fcomte.fr>	Sun, 19 Feb 2023 14:48:56 +0000 (15:48 +0100)
committer	Arnaud Giersch <arnaud.giersch@univ-fcomte.fr>	Sun, 19 Feb 2023 14:53:09 +0000 (15:53 +0100)
Taken from
https://github.com/RedSpah/xxhash_cpp/commit/d21e37c34c4ad4ad562e4ec01ec036e342c422e3
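
For context, a minimal usage sketch of the embedded header (hypothetical
caller code, not part of this commit; it assumes the upgraded port keeps the
one-shot xxh::xxhash<N>(input, length, seed) entry point that the pre-upgrade
header shown below provides):

    #include "src/3rd-party/xxhash.hpp"

    #include <cstdint>
    #include <string>

    int main()
    {
        const std::string data = "hello";
        // One-shot 64-bit hash of the buffer, seed 0.
        const std::uint64_t h = xxh::xxhash<64>(data.data(), data.size(), 0);
        return h != 0 ? 0 : 1;
    }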

COPYING
src/3rd-party/xxhash.hpp

diff --git a/COPYING b/COPYING
index f4b1b6d..8b87fe6 100644 (file)
--- a/COPYING
+++ b/COPYING
@@ -193,8 +193,8 @@ License: GPL-3
 
 Files: src/3rd-party/xxhash.hpp
 Copyright:
-  Copyright (C) 2012-2018, Yann Collet.
-  Copyright (C) 2017-2018, Piotr Pliszka.
+  Copyright (C) 2012-2022, Yann Collet.
+  Copyright (C) 2017-2022, Red Gavin.
 License: BSD 2-Clause
 
 License: BSL-1.0
diff --git a/src/3rd-party/xxhash.hpp b/src/3rd-party/xxhash.hpp
index a3dbe98..5917a50 100644 (file)
--- a/src/3rd-party/xxhash.hpp
+++ b/src/3rd-party/xxhash.hpp
@@ -1,18 +1,16 @@
 #pragma once
-#include <array>
 #include <cstdint>
 #include <cstring>
-#include <string>
+#include <array>
 #include <type_traits>
 #include <vector>
-
-#include <iostream>
+#include <string>
 
 /*
 xxHash - Extremely Fast Hash algorithm
 Header File
-Copyright (C) 2012-2018, Yann Collet.
-Copyright (C) 2017-2018, Piotr Pliszka.
+Copyright (C) 2012-2022, Yann Collet.
+Copyright (C) 2017-2022, Red Gavin.
 All rights reserved.
 
 BSD 2-Clause License (http://www.opensource.org/licenses/bsd-license.php)
@@ -41,717 +39,2213 @@ You can contact the author at :
 - xxHash C++ port repository : https://github.com/RedSpah/xxhash_cpp
 */
 
-/* *************************************
- *  Tuning parameters
- ***************************************/
-/*!XXH_FORCE_MEMORY_ACCESS :
- * By default, access to unaligned memory is controlled by `memcpy()`, which is safe and portable.
- * Unfortunately, on some target/compiler combinations, the generated assembly is sub-optimal.
- * The below switch allow to select different access method for improved performance.
- * Method 0 (default) : use `memcpy()`. Safe and portable.
- * Method 1 : `__packed` statement. It depends on compiler extension (ie, not portable).
- *            This method is safe if your compiler supports it, and *generally* as fast or faster than `memcpy`.
- * Method 2 : direct access. This method doesn't depend on compiler but violate C standard.
- *            It can generate buggy code on targets which do not support unaligned memory accesses.
- *            But in some circumstances, it's the only known way to get the most performance (ie GCC + ARMv6)
- * See http://stackoverflow.com/a/32095106/646947 for details.
- * Prefer these methods in priority order (0 > 1 > 2)
- */
-#ifndef XXH_FORCE_MEMORY_ACCESS /* can be defined externally, on command line for example */
-#if defined(__GNUC__) && (defined(__ARM_ARCH_6__) || defined(__ARM_ARCH_6J__) || defined(__ARM_ARCH_6K__) ||           \
-                          defined(__ARM_ARCH_6Z__) || defined(__ARM_ARCH_6ZK__) || defined(__ARM_ARCH_6T2__))
-#define XXH_FORCE_MEMORY_ACCESS 2
-#elif defined(__INTEL_COMPILER) ||                                                                                     \
-    (defined(__GNUC__) && (defined(__ARM_ARCH_7__) || defined(__ARM_ARCH_7A__) || defined(__ARM_ARCH_7R__) ||          \
-                           defined(__ARM_ARCH_7M__) || defined(__ARM_ARCH_7S__)))
-#define XXH_FORCE_MEMORY_ACCESS 1
-#endif
-#endif
+/* Intrinsics
+* Sadly, this has to be included in the global namespace or literally everything breaks.
+*/
+#include <immintrin.h>
 
-/*!XXH_FORCE_NATIVE_FORMAT :
- * By default, xxHash library provides endian-independent Hash values, based on little-endian convention.
- * Results are therefore identical for little-endian and big-endian CPU.
- * This comes at a performance cost for big-endian CPU, since some swapping is required to emulate little-endian format.
- * Should endian-independence be of no importance for your application, you may set the #define below to 1,
- * to improve speed for Big-endian CPU.
- * This option has no impact on Little_Endian CPU.
- */
-#if !defined(XXH_FORCE_NATIVE_FORMAT) || (XXH_FORCE_NATIVE_FORMAT == 0) /* can be defined externally */
-#define XXH_FORCE_NATIVE_FORMAT 0
-#define XXH_CPU_LITTLE_ENDIAN 1
+namespace xxh
+{
+       /* *************************************
+       *  Versioning
+       ***************************************/
+
+       namespace version
+       {
+               constexpr int cpp_version_major = 0;
+               constexpr int cpp_version_minor = 8;
+               constexpr int cpp_version_release = 1;
+       }
+
+       constexpr uint32_t version_number() 
+       { 
+               return version::cpp_version_major * 10000 + version::cpp_version_minor * 100 + version::cpp_version_release;
+       }
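+
+	// Illustrative note (not part of upstream xxhash_cpp): with the constants
+	// above, version_number() evaluates to 0 * 10000 + 8 * 100 + 1 == 801,
+	// matching the v0.8.1 upgrade in this commit.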
+
+
+       /* *************************************
+       *  Basic Types - Predefining uint128_t for intrin
+       ***************************************/
+
+       namespace typedefs
+       {
+               struct alignas(16) uint128_t
+               {
+                       uint64_t low64 = 0;
+                       uint64_t high64 = 0;
+
+                       bool operator==(const uint128_t & other)
+                       {
+                               return (low64 == other.low64 && high64 == other.high64);
+                       }
+
+			bool operator>(const uint128_t & other)
+			{
+				// Compare the high words first; the low words only break ties.
+				return (high64 > other.high64 || (high64 == other.high64 && low64 > other.low64));
+			}
+
+                       bool operator>=(const uint128_t & other)
+                       {
+                               return (*this > other || *this == other);
+                       }
+
+                       bool operator<(const uint128_t & other)
+                       {
+                               return !(*this >= other);
+                       }
+
+                       bool operator<=(const uint128_t & other)
+                       {
+                               return !(*this > other);
+                       }
+
+                       bool operator!=(const uint128_t & other)
+                       {
+                               return !(*this == other);
+                       }
+
+                       uint128_t(uint64_t low, uint64_t high) : low64(low), high64(high) {}
+
+                       uint128_t() {}
+               };
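+
+		/* Illustrative example (not part of upstream): comparison is
+		* lexicographic on (high64, low64), so the high word dominates:
+		*
+		*   uint128_t a(1, 0);  // low64 = 1, high64 = 0
+		*   uint128_t b(0, 1);  // low64 = 0, high64 = 1
+		*   assert(a < b);      // high64 decides
+		*/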
+
+       }
+
+       using uint128_t = typedefs::uint128_t;
+
+
+       /* *************************************
+       *  Compiler / Platform Specific Features
+       ***************************************/
+
+       namespace intrin
+       {
+               /*!XXH_CPU_LITTLE_ENDIAN :
+		* This is a CPU endianness detection macro; it will be
+		* automatically set to 1 (little endian) if it is left undefined.
+               * If compiling for a big endian system (why), XXH_CPU_LITTLE_ENDIAN has to be explicitly defined as 0.
+               */
+#ifndef XXH_CPU_LITTLE_ENDIAN
+#      define XXH_CPU_LITTLE_ENDIAN 1
 #endif
 
-/*!XXH_FORCE_ALIGN_CHECK :
- * This is a minor performance trick, only useful with lots of very small keys.
- * It means : check for aligned/unaligned input.
- * The check costs one initial branch per hash;
- * set it to 0 when the input is guaranteed to be aligned,
- * or when alignment doesn't matter for performance.
- */
-#ifndef XXH_FORCE_ALIGN_CHECK /* can be defined externally */
-#if defined(__i386) || defined(_M_IX86) || defined(__x86_64__) || defined(_M_X64)
-#define XXH_FORCE_ALIGN_CHECK 0
-#else
-#define XXH_FORCE_ALIGN_CHECK 1
-#endif
-#endif
 
-/*!XXH_CPU_LITTLE_ENDIAN :
- * This is a CPU endian detection macro, will be
- * automatically set to 1 (little endian) if XXH_FORCE_NATIVE_FORMAT
- * is left undefined, XXH_FORCE_NATIVE_FORMAT is defined to 0, or if an x86/x86_64 compiler macro is defined.
- * If left undefined, endianness will be determined at runtime, at the cost of a slight one-time overhead
- * and a larger overhead due to get_endian() not being constexpr.
- */
-#ifndef XXH_CPU_LITTLE_ENDIAN
-#if defined(__i386) || defined(_M_IX86) || defined(__x86_64__) || defined(_M_X64)
-#define XXH_CPU_LITTLE_ENDIAN 1
-#endif
+               /* Vectorization Detection
+               * NOTE: XXH_NEON and XXH_VSX aren't supported in this C++ port.
+		* The primary reason is that I don't have access to ARM and PowerPC
+		* machines to test them, and the secondary reason is that I doubt anyone writing
+               * code for such machines would bother using a C++ port rather than the original C version.
+               */
+#ifndef XXH_VECTOR   /* can be predefined on command line */
+#      if defined(__AVX512F__) 
+#              define XXH_VECTOR 3 /* AVX512 for Skylake and Icelake */
+#      elif defined(__AVX2__)
+#              define XXH_VECTOR 2 /* AVX2 for Haswell and Bulldozer */
+#      elif defined(__SSE2__) || defined(_M_AMD64) || defined(_M_X64) || (defined(_M_IX86_FP) && (_M_IX86_FP == 2))
+#              define XXH_VECTOR 1 /* SSE2 for Pentium 4 and all x86_64 */
+#      else
+#              define XXH_VECTOR 0 /* Portable scalar version */
+#      endif
 #endif
 
-/* *************************************
- *  Compiler Specific Options
- ***************************************/
-#define XXH_GCC_VERSION (__GNUC__ * 100 + __GNUC_MINOR__)
-
-namespace xxh {
-/* *************************************
- *  Version
- ***************************************/
-constexpr int cpp_version_major   = 0;
-constexpr int cpp_version_minor   = 6;
-constexpr int cpp_version_release = 5;
-constexpr uint32_t version_number()
-{
-  return cpp_version_major * 10000 + cpp_version_minor * 100 + cpp_version_release;
-}
+               constexpr int vector_mode = XXH_VECTOR;
+
+#if XXH_VECTOR == 3            /* AVX512 for Skylake and Icelake */
+               constexpr int acc_align = 64;
+               using avx512_underlying = __m512i;
+               using avx2_underlying = __m256i;
+               using sse2_underlying = __m128i;
+#elif XXH_VECTOR == 2          /* AVX2 for Haswell and Bulldozer */
+               constexpr int acc_align = 32;
+               using avx512_underlying = void;
+               using avx2_underlying = __m256i;
+               using sse2_underlying = __m128i;
+#elif XXH_VECTOR == 1  /* SSE2 for Pentium 4 and all x86_64 */
+               using avx512_underlying = void;
+               using avx2_underlying = void; //std::array<__m128i, 2>;
+               using sse2_underlying = __m128i;
+               constexpr int acc_align = 16;
+#else                                  /* Portable scalar version */
+               using avx512_underlying = void;
+               using avx2_underlying = void; //std::array<uint64_t, 4>;
+               using sse2_underlying = void; //std::array<uint64_t, 2>;
+               constexpr int acc_align = 8;
+#endif
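+
+		/* Illustrative example (not part of upstream): the detection above is
+		* driven entirely by compiler flags. For instance, building with
+		* `g++ -mavx2` defines __AVX2__, selecting XXH_VECTOR == 2, which can
+		* be checked at compile time:
+		*
+		*   static_assert(xxh::intrin::vector_mode == 2, "AVX2 path selected");
+		*   static_assert(xxh::intrin::acc_align == 32, "32-byte accumulator alignment");
+		*/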
 
-namespace hash_t_impl {
-/* *************************************
- *  Basic Types - Detail
- ***************************************/
-
-using _hash32_underlying = uint32_t;
-using _hash64_underlying = uint64_t;
-
-template <size_t N> struct hash_type {
-  using type = void;
-};
-template <> struct hash_type<32> {
-  using type = _hash32_underlying;
-};
-template <> struct hash_type<64> {
-  using type = _hash64_underlying;
-};
-} // namespace hash_t_impl
-
-/* *************************************
- *  Basic Types - Public
- ***************************************/
-
-template <size_t N> using hash_t = typename hash_t_impl::hash_type<N>::type;
-using hash32_t                   = hash_t<32>;
-using hash64_t                   = hash_t<64>;
-
-/* *************************************
- *  Bit Functions - Public
- ***************************************/
-
-namespace bit_ops {
-/* ****************************************
- *  Intrinsics and Bit Operations
- ******************************************/
 
-#if defined(_MSC_VER)
-inline uint32_t rotl32(uint32_t x, int32_t r)
-{
-  return _rotl(x, r);
-}
-inline uint64_t rotl64(uint64_t x, int32_t r)
-{
-  return _rotl64(x, r);
-}
-#else
-inline uint32_t rotl32(uint32_t x, int32_t r)
-{
-  return ((x << r) | (x >> (32 - r)));
-}
-inline uint64_t rotl64(uint64_t x, int32_t r)
-{
-  return ((x << r) | (x >> (64 - r)));
-}
+               /* Compiler Specifics
+		* Defines inline macros and includes the specific compiler's intrinsics.
+		*/
+#ifdef XXH_FORCE_INLINE /* First undefining the symbols in case they're already defined */
+#      undef XXH_FORCE_INLINE
+#endif 
+#ifdef XXH_NO_INLINE
+#      undef XXH_NO_INLINE
 #endif
 
-#if defined(_MSC_VER) /* Visual Studio */
-inline uint32_t swap32(uint32_t x)
-{
-  return _byteswap_ulong(x);
-}
-inline uint64_t swap64(uint64_t x)
-{
-  return _byteswap_uint64(x);
-}
-#elif XXH_GCC_VERSION >= 403
-inline uint32_t swap32(uint32_t x)
-{
-  return __builtin_bswap32(x);
-}
-inline uint64_t swap64(uint64_t x)
-{
-  return __builtin_bswap64(x);
-}
+#ifdef _MSC_VER    /* Visual Studio */
+#      pragma warning(disable : 4127)    
+#      define XXH_FORCE_INLINE static __forceinline
+#      define XXH_NO_INLINE static __declspec(noinline)
+#      include <intrin.h>
+#elif defined(__GNUC__)  /* Clang / GCC */
+#      define XXH_FORCE_INLINE static inline __attribute__((always_inline))
+#      define XXH_NO_INLINE static __attribute__((noinline))
+#      include <mmintrin.h>
 #else
-inline uint32_t swap32(uint32_t x)
-{
-  return ((x << 24) & 0xff000000) | ((x << 8) & 0x00ff0000) | ((x >> 8) & 0x0000ff00) | ((x >> 24) & 0x000000ff);
-}
-inline uint64_t swap64(uint64_t x)
-{
-  return ((x << 56) & 0xff00000000000000ULL) | ((x << 40) & 0x00ff000000000000ULL) |
-         ((x << 24) & 0x0000ff0000000000ULL) | ((x << 8) & 0x000000ff00000000ULL) | ((x >> 8) & 0x00000000ff000000ULL) |
-         ((x >> 24) & 0x0000000000ff0000ULL) | ((x >> 40) & 0x000000000000ff00ULL) |
-         ((x >> 56) & 0x00000000000000ffULL);
-}
+#      define XXH_FORCE_INLINE static inline
+#      define XXH_NO_INLINE static
 #endif
-template <size_t N> inline hash_t<N> rotl(hash_t<N> n, int32_t r){};
-
-template <> inline hash_t<32> rotl<32>(hash_t<32> n, int32_t r)
-{
-  return rotl32(n, r);
-};
-
-template <> inline hash_t<64> rotl<64>(hash_t<64> n, int32_t r)
-{
-  return rotl64(n, r);
-};
 
-template <size_t N> inline hash_t<N> swap(hash_t<N> n){};
 
-template <> inline hash_t<32> swap<32>(hash_t<32> n)
-{
-  return swap32(n);
-};
-
-template <> inline hash_t<64> swap<64>(hash_t<64> n)
-{
-  return swap64(n);
-};
-} // namespace bit_ops
-
-/* *************************************
- *  Memory Functions - Public
- ***************************************/
-
-enum class alignment : uint8_t { aligned, unaligned };
-enum class endianness : uint8_t { big_endian = 0, little_endian = 1, unspecified = 2 };
-
-namespace mem_ops {
-/* *************************************
- *  Memory Access
- ***************************************/
-#if (defined(XXH_FORCE_MEMORY_ACCESS) && (XXH_FORCE_MEMORY_ACCESS == 2))
-
-/* Force direct memory access. Only works on CPU which support unaligned memory access in hardware */
-template <size_t N> inline hash_t<N> read_unaligned(const void* memPtr)
-{
-  return *(const hash_t<N>*)memPtr;
-}
-
-#elif (defined(XXH_FORCE_MEMORY_ACCESS) && (XXH_FORCE_MEMORY_ACCESS == 1))
-
-/* __pack instructions are safer, but compiler specific, hence potentially problematic for some compilers */
-/* currently only defined for gcc and icc */
-template <size_t N> using unalign = union {
-  hash_t<N> uval;
-} __attribute((packed));
-
-template <size_t N> inline hash_t<N> read_unaligned(const void* memPtr)
-{
-  return ((const unalign*)memPtr)->uval;
-}
+               /* Prefetch
+               * Can be disabled by defining XXH_NO_PREFETCH
+               */
+#if defined(XXH_NO_PREFETCH)
+               XXH_FORCE_INLINE void prefetch(const void* ptr) {}
+#elif defined(_MSC_VER) && (defined(_M_X64) || defined(_M_IX86))  
+               XXH_FORCE_INLINE void prefetch(const void* ptr) { _mm_prefetch((const char*)(ptr), _MM_HINT_T0); }
+#elif defined(__GNUC__) 
+               XXH_FORCE_INLINE void prefetch(const void* ptr) { __builtin_prefetch((ptr), 0, 3); }
 #else
+               XXH_FORCE_INLINE void prefetch(const void* ptr) {}
+#endif
 
-/* portable and safe solution. Generally efficient.
- * see : http://stackoverflow.com/a/32095106/646947
- */
-template <size_t N> inline hash_t<N> read_unaligned(const void* memPtr)
-{
-  hash_t<N> val;
-  memcpy(&val, memPtr, sizeof(val));
-  return val;
-}
-
-#endif /* XXH_FORCE_DIRECT_MEMORY_ACCESS */
-
-inline hash_t<32> read32(const void* memPtr)
-{
-  return read_unaligned<32>(memPtr);
-}
-inline hash_t<64> read64(const void* memPtr)
-{
-  return read_unaligned<64>(memPtr);
-}
-
-/* *************************************
- *  Architecture Macros
- ***************************************/
-
-/* XXH_CPU_LITTLE_ENDIAN can be defined externally, for example on the compiler command line */
-
-#ifndef XXH_CPU_LITTLE_ENDIAN
-
-inline endianness get_endian(endianness endian)
-{
-  static struct _dummy_t {
-    std::array<endianness, 3> endian_lookup = {endianness::big_endian, endianness::little_endian,
-                                               endianness::unspecified};
-    const int g_one                         = 1;
-    _dummy_t() { endian_lookup[2] = static_cast<endianness>(*(const char*)(&g_one)); }
-  } _dummy;
-
-  return _dummy.endian_lookup[(uint8_t)endian];
-}
 
-inline bool is_little_endian()
-{
-  return get_endian(endianness::unspecified) == endianness::little_endian;
-}
+               /* Restrict
+		* Defines a macro for restrict, which in C++ is sadly just a compiler extension (for now).
+               * Can be disabled by defining XXH_NO_RESTRICT
+               */
+#ifdef XXH_RESTRICT
+#      undef XXH_RESTRICT
+#endif
 
+#if (defined(__GNUC__) || defined(_MSC_VER)) && defined(__cplusplus) && !defined(XXH_NO_RESTRICT)
+#      define XXH_RESTRICT  __restrict
 #else
-constexpr endianness get_endian(endianness endian)
-{
-  constexpr std::array<endianness, 3> endian_lookup = {
-      {endianness::big_endian, endianness::little_endian,
-       (XXH_CPU_LITTLE_ENDIAN) ? endianness::little_endian : endianness::big_endian}};
-  return endian_lookup[static_cast<uint8_t>(endian)];
-}
-
-constexpr bool is_little_endian()
-{
-  return get_endian(endianness::unspecified) == endianness::little_endian;
-}
-
+#      define XXH_RESTRICT 
 #endif
 
-/* ***************************
- *  Memory reads
- *****************************/
-
-template <size_t N> inline hash_t<N> readLE_align(const void* ptr, endianness endian, alignment align)
-{
-  if (align == alignment::unaligned) {
-    return endian == endianness::little_endian ? read_unaligned<N>(ptr) : bit_ops::swap<N>(read_unaligned<N>(ptr));
-  } else {
-    return endian == endianness::little_endian ? *reinterpret_cast<const hash_t<N>*>(ptr)
-                                               : bit_ops::swap<N>(*reinterpret_cast<const hash_t<N>*>(ptr));
-  }
-}
-
-template <size_t N> inline hash_t<N> readLE(const void* ptr, endianness endian)
-{
-  return readLE_align<N>(ptr, endian, alignment::unaligned);
-}
-
-template <size_t N> inline hash_t<N> readBE(const void* ptr)
-{
-  return is_little_endian() ? bit_ops::swap<N>(read_unaligned<N>(ptr)) : read_unaligned<N>(ptr);
-}
-
-template <size_t N> inline alignment get_alignment(const void* input)
-{
-  return ((XXH_FORCE_ALIGN_CHECK) && ((reinterpret_cast<uintptr_t>(input) & ((N / 8) - 1)) == 0))
-             ? xxh::alignment::aligned
-             : xxh::alignment::unaligned;
-}
-} // namespace mem_ops
-
-/* *******************************************************************
- *  Hash functions
- *********************************************************************/
-
-namespace detail {
-/* *******************************************************************
- *  Hash functions - Implementation
- *********************************************************************/
-
-constexpr static std::array<hash32_t, 5> primes32 = {{2654435761U, 2246822519U, 3266489917U, 668265263U, 374761393U}};
-constexpr static std::array<hash64_t, 5> primes64 = {{11400714785074694791ULL, 14029467366897019727ULL,
-                                                      1609587929392839161ULL, 9650029242287828579ULL,
-                                                      2870177450012600261ULL}};
-
-template <size_t N> constexpr hash_t<N> PRIME(int32_t n){};
-
-template <> constexpr hash32_t PRIME<32>(int32_t n)
-{
-  return primes32[n - 1];
-}
-
-template <> constexpr hash64_t PRIME<64>(int32_t n)
-{
-  return primes64[n - 1];
-}
 
-template <size_t N> inline hash_t<N> round(hash_t<N> seed, hash_t<N> input)
-{
-  seed += input * PRIME<N>(2);
-  seed = bit_ops::rotl<N>(seed, ((N == 32) ? 13 : 31));
-  seed *= PRIME<N>(1);
-  return seed;
-}
+               /* Likely / Unlikely
+		* Defines macros for Likely / Unlikely, which are official in C++20, but sadly this library targets the previous standard.
+               * Not present on MSVC.
+               * Can be disabled by defining XXH_NO_BRANCH_HINTS
+               */
+#if ((defined(__GNUC__) && (__GNUC__ >= 3))  || (defined(__INTEL_COMPILER) && (__INTEL_COMPILER >= 800)) || defined(__clang__)) && !defined(XXH_NO_BRANCH_HINTS)
+#    define XXH_likely(x) __builtin_expect(x, 1)
+#    define XXH_unlikely(x) __builtin_expect(x, 0)
+#else
+#    define XXH_likely(x) (x)
+#    define XXH_unlikely(x) (x)
+#endif
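+
+		/* Illustrative usage (not part of upstream): the hints wrap branch
+		* conditions that are almost always true or false on the hot path, e.g.:
+		*
+		*   if (XXH_likely(len <= 16)) { ... small-input fast path ... }
+		*/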
 
-inline hash64_t mergeRound64(hash64_t acc, hash64_t val)
-{
-  val = round<64>(0, val);
-  acc ^= val;
-  acc = acc * PRIME<64>(1) + PRIME<64>(4);
-  return acc;
-}
 
-template <size_t N>
-inline void endian_align_sub_mergeround(
-#if __cplusplus < 201703L
-    __attribute__((unused))
+               namespace bit_ops
+               {
+#if defined(_MSC_VER)
+                       static inline uint32_t rotl32(uint32_t x, int32_t r) { return _rotl(x, r); }
+                       static inline uint64_t rotl64(uint64_t x, int32_t r) { return _rotl64(x, r); }
+                       static inline uint32_t rotr32(uint32_t x, int32_t r) { return _rotr(x, r); }
+                       static inline uint64_t rotr64(uint64_t x, int32_t r) { return _rotr64(x, r); }
 #else
-    [[maybe_unused]]
+                       static inline uint32_t rotl32(uint32_t x, int32_t r) { return ((x << r) | (x >> (32 - r))); }
+                       static inline uint64_t rotl64(uint64_t x, int32_t r) { return ((x << r) | (x >> (64 - r))); }
+                       static inline uint32_t rotr32(uint32_t x, int32_t r) { return ((x >> r) | (x << (32 - r))); }
+                       static inline uint64_t rotr64(uint64_t x, int32_t r) { return ((x >> r) | (x << (64 - r))); }
 #endif
-    hash_t<N>& hash_ret,
-    hash_t<N> v1, hash_t<N> v2, hash_t<N> v3, hash_t<N> v4){};
 
-template <>
-inline void endian_align_sub_mergeround<64>(hash_t<64>& hash_ret, hash_t<64> v1, hash_t<64> v2, hash_t<64> v3,
-                                            hash_t<64> v4)
-{
-  hash_ret = mergeRound64(hash_ret, v1);
-  hash_ret = mergeRound64(hash_ret, v2);
-  hash_ret = mergeRound64(hash_ret, v3);
-  hash_ret = mergeRound64(hash_ret, v4);
-}
 
-template <size_t N>
-inline hash_t<N> endian_align_sub_ending(hash_t<N> hash_ret, const uint8_t* p, const uint8_t* bEnd,
-                                         xxh::endianness endian, xxh::alignment align){};
+#if defined(_MSC_VER)     /* Visual Studio */
+                       static inline uint32_t swap32(uint32_t x) { return _byteswap_ulong(x); }
+                       static inline uint64_t swap64(uint64_t x) { return _byteswap_uint64(x); }
+#elif defined(__GNUC__)
+                       static inline uint32_t swap32(uint32_t x) { return __builtin_bswap32(x); }
+                       static inline uint64_t swap64(uint64_t x) { return __builtin_bswap64(x); }
+#else
+                       static inline uint32_t swap32(uint32_t x) { return ((x << 24) & 0xff000000) | ((x << 8) & 0x00ff0000) | ((x >> 8) & 0x0000ff00) | ((x >> 24) & 0x000000ff); }
+                       static inline uint64_t swap64(uint64_t x) { return ((x << 56) & 0xff00000000000000ULL) | ((x << 40) & 0x00ff000000000000ULL) | ((x << 24) & 0x0000ff0000000000ULL) | ((x << 8) & 0x000000ff00000000ULL) | ((x >> 8) & 0x00000000ff000000ULL) | ((x >> 24) & 0x0000000000ff0000ULL) | ((x >> 40) & 0x000000000000ff00ULL) | ((x >> 56) & 0x00000000000000ffULL); }
+#endif
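+
+			/* Illustrative example (not part of upstream): swap32 reverses byte
+			* order, e.g. swap32(0x01020304u) == 0x04030201u, which is what the
+			* readLE()/readBE() helpers further down rely on for endianness
+			* conversion.
+			*/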
 
-template <>
-inline hash_t<32> endian_align_sub_ending<32>(hash_t<32> hash_ret, const uint8_t* p, const uint8_t* bEnd,
-                                              xxh::endianness endian, xxh::alignment align)
-{
-  while ((p + 4) <= bEnd) {
-    hash_ret += mem_ops::readLE_align<32>(p, endian, align) * PRIME<32>(3);
-    hash_ret = bit_ops::rotl<32>(hash_ret, 17) * PRIME<32>(4);
-    p += 4;
-  }
-
-  while (p < bEnd) {
-    hash_ret += (*p) * PRIME<32>(5);
-    hash_ret = bit_ops::rotl<32>(hash_ret, 11) * PRIME<32>(1);
-    p++;
-  }
-
-  hash_ret ^= hash_ret >> 15;
-  hash_ret *= PRIME<32>(2);
-  hash_ret ^= hash_ret >> 13;
-  hash_ret *= PRIME<32>(3);
-  hash_ret ^= hash_ret >> 16;
-
-  return hash_ret;
-}
 
-template <>
-inline hash_t<64> endian_align_sub_ending<64>(hash_t<64> hash_ret, const uint8_t* p, const uint8_t* bEnd,
-                                              xxh::endianness endian, xxh::alignment align)
-{
-  while (p + 8 <= bEnd) {
-    const hash64_t k1 = round<64>(0, mem_ops::readLE_align<64>(p, endian, align));
-    hash_ret ^= k1;
-    hash_ret = bit_ops::rotl<64>(hash_ret, 27) * PRIME<64>(1) + PRIME<64>(4);
-    p += 8;
-  }
-
-  if (p + 4 <= bEnd) {
-    hash_ret ^= static_cast<hash64_t>(mem_ops::readLE_align<32>(p, endian, align)) * PRIME<64>(1);
-    hash_ret = bit_ops::rotl<64>(hash_ret, 23) * PRIME<64>(2) + PRIME<64>(3);
-    p += 4;
-  }
-
-  while (p < bEnd) {
-    hash_ret ^= (*p) * PRIME<64>(5);
-    hash_ret = bit_ops::rotl<64>(hash_ret, 11) * PRIME<64>(1);
-    p++;
-  }
-
-  hash_ret ^= hash_ret >> 33;
-  hash_ret *= PRIME<64>(2);
-  hash_ret ^= hash_ret >> 29;
-  hash_ret *= PRIME<64>(3);
-  hash_ret ^= hash_ret >> 32;
-
-  return hash_ret;
-}
+#if defined(_MSC_VER) && defined(_M_IX86) // Only for 32-bit MSVC.
+                       XXH_FORCE_INLINE uint64_t mult32to64(uint32_t x, uint32_t y) { return __emulu(x, y); }
+#else
+                       XXH_FORCE_INLINE uint64_t mult32to64(uint32_t x, uint32_t y) { return (uint64_t)(uint32_t)(x) * (uint64_t)(uint32_t)(y); }
+#endif
 
-template <size_t N>
-inline hash_t<N> endian_align(const void* input, size_t len, hash_t<N> seed, xxh::endianness endian,
-                              xxh::alignment align)
-{
-  static_assert(!(N != 32 && N != 64), "You can only call endian_align in 32 or 64 bit mode.");
-
-  const uint8_t* p    = static_cast<const uint8_t*>(input);
-  const uint8_t* bEnd = p + len;
-  hash_t<N> hash_ret;
-
-  if (len >= (N / 2)) {
-    const uint8_t* const limit = bEnd - (N / 2);
-    hash_t<N> v1               = seed + PRIME<N>(1) + PRIME<N>(2);
-    hash_t<N> v2               = seed + PRIME<N>(2);
-    hash_t<N> v3               = seed + 0;
-    hash_t<N> v4               = seed - PRIME<N>(1);
-
-    do {
-      v1 = round<N>(v1, mem_ops::readLE_align<N>(p, endian, align));
-      p += (N / 8);
-      v2 = round<N>(v2, mem_ops::readLE_align<N>(p, endian, align));
-      p += (N / 8);
-      v3 = round<N>(v3, mem_ops::readLE_align<N>(p, endian, align));
-      p += (N / 8);
-      v4 = round<N>(v4, mem_ops::readLE_align<N>(p, endian, align));
-      p += (N / 8);
-    } while (p <= limit);
-
-    hash_ret = bit_ops::rotl<N>(v1, 1) + bit_ops::rotl<N>(v2, 7) + bit_ops::rotl<N>(v3, 12) + bit_ops::rotl<N>(v4, 18);
-
-    endian_align_sub_mergeround<N>(hash_ret, v1, v2, v3, v4);
-  } else {
-    hash_ret = seed + PRIME<N>(5);
-  }
-
-  hash_ret += static_cast<hash_t<N>>(len);
-
-  return endian_align_sub_ending<N>(hash_ret, p, bEnd, endian, align);
-}
-} // namespace detail
 
-template <size_t N>
-hash_t<N> xxhash(const void* input, size_t len, hash_t<N> seed = 0, endianness endian = endianness::unspecified)
-{
-  static_assert(!(N != 32 && N != 64), "You can only call xxhash in 32 or 64 bit mode.");
-  return detail::endian_align<N>(input, len, seed, mem_ops::get_endian(endian), mem_ops::get_alignment<N>(input));
-}
+#if defined(__GNUC__) && !defined(__clang__) && defined(__i386__)
+                       __attribute__((__target__("no-sse")))
+#endif
+                       static inline uint128_t mult64to128(uint64_t lhs, uint64_t rhs)
+                       {
 
-template <size_t N, typename T>
-hash_t<N> xxhash(const std::basic_string<T>& input, hash_t<N> seed = 0, endianness endian = endianness::unspecified)
-{
-  static_assert(!(N != 32 && N != 64), "You can only call xxhash in 32 or 64 bit mode.");
-  return detail::endian_align<N>(static_cast<const void*>(input.data()), input.length() * sizeof(T), seed,
-                                 mem_ops::get_endian(endian),
-                                 mem_ops::get_alignment<N>(static_cast<const void*>(input.data())));
-}
+#if defined(__GNUC__) && !defined(__wasm__) \
+    && defined(__SIZEOF_INT128__) \
+    || (defined(_INTEGRAL_MAX_BITS) && _INTEGRAL_MAX_BITS >= 128)
 
-template <size_t N, typename ContiguousIterator>
-hash_t<N> xxhash(ContiguousIterator begin, ContiguousIterator end, hash_t<N> seed = 0,
-                 endianness endian = endianness::unspecified)
-{
-  static_assert(!(N != 32 && N != 64), "You can only call xxhash in 32 or 64 bit mode.");
-  using T = typename std::decay_t<decltype(*end)>;
-  return detail::endian_align<N>(static_cast<const void*>(&*begin), (end - begin) * sizeof(T), seed,
-                                 mem_ops::get_endian(endian),
-                                 mem_ops::get_alignment<N>(static_cast<const void*>(&*begin)));
-}
+                               __uint128_t product = (__uint128_t)lhs * (__uint128_t)rhs;
+                               uint128_t r128;
+                               r128.low64 = (uint64_t)(product);
+                               r128.high64 = (uint64_t)(product >> 64);
+                               return r128;
 
-template <size_t N, typename T>
-hash_t<N> xxhash(const std::vector<T>& input, hash_t<N> seed = 0, endianness endian = endianness::unspecified)
-{
-  static_assert(!(N != 32 && N != 64), "You can only call xxhash in 32 or 64 bit mode.");
-  return detail::endian_align<N>(static_cast<const void*>(input.data()), input.size() * sizeof(T), seed,
-                                 mem_ops::get_endian(endian),
-                                 mem_ops::get_alignment<N>(static_cast<const void*>(input.data())));
-}
+#elif defined(_M_X64) || defined(_M_IA64)
 
-template <size_t N, typename T, size_t AN>
-hash_t<N> xxhash(const std::array<T, AN>& input, hash_t<N> seed = 0, endianness endian = endianness::unspecified)
-{
-  static_assert(!(N != 32 && N != 64), "You can only call xxhash in 32 or 64 bit mode.");
-  return detail::endian_align<N>(static_cast<const void*>(input.data()), AN * sizeof(T), seed,
-                                 mem_ops::get_endian(endian),
-                                 mem_ops::get_alignment<N>(static_cast<const void*>(input.data())));
-}
+#ifndef _MSC_VER
+#   pragma intrinsic(_umul128)
+#endif
+                               uint64_t product_high;
+                               uint64_t const product_low = _umul128(lhs, rhs, &product_high);
+                               uint128_t r128;
+                               r128.low64 = product_low;
+                               r128.high64 = product_high;
+                               return r128;
 
-template <size_t N, typename T>
-hash_t<N> xxhash(const std::initializer_list<T>& input, hash_t<N> seed = 0, endianness endian = endianness::unspecified)
-{
-  static_assert(!(N != 32 && N != 64), "You can only call xxhash in 32 or 64 bit mode.");
-  return detail::endian_align<N>(static_cast<const void*>(input.begin()), input.size() * sizeof(T), seed,
-                                 mem_ops::get_endian(endian),
-                                 mem_ops::get_alignment<N>(static_cast<const void*>(input.begin())));
+#else
+                               uint64_t const lo_lo = bit_ops::mult32to64(lhs & 0xFFFFFFFF, rhs & 0xFFFFFFFF);
+                               uint64_t const hi_lo = bit_ops::mult32to64(lhs >> 32, rhs & 0xFFFFFFFF);
+                               uint64_t const lo_hi = bit_ops::mult32to64(lhs & 0xFFFFFFFF, rhs >> 32);
+                               uint64_t const hi_hi = bit_ops::mult32to64(lhs >> 32, rhs >> 32);
+
+                               /* Now add the products together. These will never overflow. */
+                               uint64_t const cross = (lo_lo >> 32) + (hi_lo & 0xFFFFFFFF) + lo_hi;
+                               uint64_t const upper = (hi_lo >> 32) + (cross >> 32) + hi_hi;
+                               uint64_t const lower = (cross << 32) | (lo_lo & 0xFFFFFFFF);
+
+                               uint128_t r128;
+                               r128.low64 = lower;
+                               r128.high64 = upper;
+                               return r128;
+#endif
+                       }
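+
+			/* Worked example (illustrative, not part of upstream): the portable
+			* fallback splits each operand into 32-bit halves and recombines the
+			* four partial products. E.g. for lhs = rhs = 2^64 - 1:
+			*
+			*   product = (2^64 - 1)^2 = 2^128 - 2^65 + 1,
+			*   so low64 == 1 and high64 == 0xFFFFFFFFFFFFFFFE,
+			*
+			* which agrees with the __uint128_t and _umul128 paths above.
+			*/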
+               }
+       }
+
+
+       /* *************************************
+       *  Basic Types - Everything else
+       ***************************************/
+
+       namespace typedefs
+       {
+               /* *************************************
+               *  Basic Types - Detail
+               ***************************************/
+
+               template <size_t N>
+               struct hash_type
+               {
+                       using type = void;
+               };
+
+               template <>
+               struct hash_type<32>
+               {
+                       using type = uint32_t;
+               };
+
+               template <>
+               struct hash_type<64>
+               {
+                       using type = uint64_t;
+               };
+
+               template <>
+               struct hash_type<128>
+               {
+                       using type = uint128_t;
+               };
+
+
+               template <size_t N>
+               struct vec_type
+               {
+                       using type = void;
+               };
+
+               template <>
+               struct vec_type<64>
+               {
+                       using type = uint64_t;
+               };
+
+               template <>
+               struct vec_type<128>
+               {
+                       using type = intrin::sse2_underlying;
+               };
+
+               template <>
+               struct vec_type<256>
+               {
+                       using type = intrin::avx2_underlying;
+               };
+
+               template <>
+               struct vec_type<512>
+               {
+                       using type = intrin::avx512_underlying;
+               };
+
+               /* Rationale
+               * On the surface level uint_type appears to be pointless,
+               * as it is just a copy of hash_type. They do use the same types,
+               * that is true, but the reasoning for the difference is aimed at humans,
+		* not the compiler: it distinguishes values that are 'just' numbers
+		* from those that represent actual hash values.
+               */
+               template <size_t N>
+               struct uint_type
+               {
+                       using type = void;
+               };
+
+               template <>
+               struct uint_type<32>
+               {
+                       using type = uint32_t;
+               };
+
+               template <>
+               struct uint_type<64>
+               {
+                       using type = uint64_t;
+               };
+
+               template <>
+               struct uint_type<128>
+               {
+                       using type = uint128_t;
+               };
+       }
+
+       template <size_t N>
+       using hash_t = typename typedefs::hash_type<N>::type;
+       using hash32_t = hash_t<32>;
+       using hash64_t = hash_t<64>;
+       using hash128_t = hash_t<128>;
+
+       template <size_t N>
+       using vec_t = typename typedefs::vec_type<N>::type;
+       using vec64_t = vec_t<64>;
+       using vec128_t = vec_t<128>;
+       using vec256_t = vec_t<256>;
+       using vec512_t = vec_t<512>;
+
+       template <size_t N>
+       using uint_t = typename typedefs::uint_type<N>::type;
+       
+
+
+       /* *************************************
+       *  Bit Operations
+       ***************************************/
+
+       namespace bit_ops
+       {
+               /* ****************************************
+               *  Bit Operations
+               ******************************************/
+
+               template <size_t N>
+               static inline uint_t<N> rotl(uint_t<N> n, int32_t r)
+               {
+                       if constexpr (N == 32)
+                       {
+                               return intrin::bit_ops::rotl32(n, r);
+                       }
+
+                       if constexpr (N == 64)
+                       {
+                               return intrin::bit_ops::rotl64(n, r);
+                       }
+               }
+
+               template <size_t N>
+               static inline uint_t<N> rotr(uint_t<N> n, int32_t r)
+               {
+                       if constexpr (N == 32)
+                       {
+                               return intrin::bit_ops::rotr32(n, r);
+                       }
+
+                       if constexpr (N == 64)
+                       {
+                               return intrin::bit_ops::rotr64(n, r);
+                       }
+               }
+
+               template <size_t N>
+               static inline uint_t<N> swap(uint_t<N> n)
+               {
+                       if constexpr (N == 32)
+                       {
+                               return intrin::bit_ops::swap32(n);
+                       }
+
+                       if constexpr (N == 64)
+                       {
+                               return intrin::bit_ops::swap64(n);
+                       }
+               }
+
+               template <size_t N = 64>
+               static inline vec_t<N> mul32to64(vec_t<N> x, vec_t<N> y)
+               { 
+                       if constexpr (N == 64)
+                       {
+                               return intrin::bit_ops::mult32to64(static_cast<uint32_t>(x), static_cast<uint32_t>(y));
+                       }
+                       else
+                       {
+                               return 0;
+                       }
+               }
+
+               static inline uint128_t mul64to128(uint64_t x, uint64_t y)
+               { 
+                       return intrin::bit_ops::mult64to128(x, y); 
+               }
+
+               static inline uint64_t mul128fold64(uint64_t x, uint64_t y)
+               {
+                       uint128_t product = mul64to128(x, y);
+
+                       return (product.low64 ^ product.high64);
+               }
+       }
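+
+	/* Illustrative example (not part of upstream): mul128fold64 compresses a
+	* full 128-bit product back to 64 bits by XOR-ing its halves:
+	*
+	*   uint64_t folded = xxh::bit_ops::mul128fold64(x, y);
+	*   // == mul64to128(x, y).low64 ^ mul64to128(x, y).high64
+	*/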
+
+
+       /* *************************************
+       *  Memory Functions 
+       ***************************************/
+
+       namespace mem_ops
+       {
+
+               /* *************************************
+               * Endianness
+               ***************************************/
+
+               constexpr bool is_little_endian()
+               {
+                       return (XXH_CPU_LITTLE_ENDIAN == 1);
+               }
+
+
+               /* *************************************
+               *  Memory Access
+               ***************************************/
+
+               template <size_t N>
+               static inline uint_t<N> read(const void* memPtr)
+               {
+                       uint_t<N> val;
+
+                       memcpy(&val, memPtr, sizeof(val));
+                       return val;
+               }
+
+               template <size_t N>
+               static inline uint_t<N> readLE(const void* ptr)
+               {
+                       if constexpr (is_little_endian())
+                       {
+                               return read<N>(ptr);
+                       }
+                       else
+                       {
+                               return bit_ops::swap<N>(read<N>(ptr));
+                       }
+               }
+
+               template <size_t N>
+               static inline uint_t<N> readBE(const void* ptr)
+               {
+                       if constexpr (is_little_endian())
+                       {
+                               return bit_ops::swap<N>(read<N>(ptr));
+                       }
+                       else
+                       {
+                               return read<N>(ptr);
+                       }
+               }
+
+               template <size_t N>
+               static void writeLE(void* dst, uint_t<N> v)
+               {
+                       if constexpr (!is_little_endian())
+                       {
+                               v = bit_ops::swap<N>(v);
+                       }
+
+                       memcpy(dst, &v, sizeof(v));
+               }
+       }
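+
+	/* Illustrative example (not part of upstream): readLE<32> yields the same
+	* value on any host endianness. Given bytes {0x01, 0x02, 0x03, 0x04}:
+	*
+	*   const uint8_t buf[4] = { 0x01, 0x02, 0x03, 0x04 };
+	*   assert(xxh::mem_ops::readLE<32>(buf) == 0x04030201u);
+	*/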
+
+
+       /* *************************************
+       *  Vector Functions 
+       ***************************************/
+
+       namespace vec_ops
+       {
+               template <size_t N>
+               XXH_FORCE_INLINE vec_t<N> loadu(const vec_t<N>* input)
+               { 
+                       static_assert(!(N != 128 && N != 256 && N != 64 && N != 512), "Invalid template argument passed to xxh::vec_ops::loadu");
+
+                       if constexpr (N == 128)
+                       {
+                               return _mm_loadu_si128(input);
+                       }
+
+                       if constexpr (N == 256)
+                       {
+                               return _mm256_loadu_si256(input);
+                       }
+
+                       if constexpr (N == 512)
+                       {
+                               return _mm512_loadu_si512(input);
+                       }
+
+                       if constexpr (N == 64)
+                       {
+                               return mem_ops::readLE<64>(input);
+                       }
+
+               }
+
+
+		// 'xorv' instead of 'xor' because 'xor' is a reserved alternative operator token in C++.
+               template <size_t N>
+               XXH_FORCE_INLINE vec_t<N> xorv(vec_t<N> a, vec_t<N> b)
+               { 
+                       static_assert(!(N != 128 && N != 256 && N != 64 && N != 512), "Invalid argument passed to xxh::vec_ops::xorv");
+               
+                       if constexpr (N == 128)
+                       {
+                               return _mm_xor_si128(a, b);
+                       }
+
+                       if constexpr (N == 256)
+                       {
+                               return _mm256_xor_si256(a, b);
+                       }
+
+                       if constexpr (N == 512)
+                       {
+                               return _mm512_xor_si512(a, b);
+                       }
+
+                       if constexpr (N == 64)
+                       {
+                               return a ^ b;
+                       }
+               }
+               
+
+               template <size_t N>
+               XXH_FORCE_INLINE vec_t<N> mul(vec_t<N> a, vec_t<N> b)
+               {
+                       static_assert(!(N != 128 && N != 256 && N != 64 && N != 512), "Invalid argument passed to xxh::vec_ops::mul");
+
+                       if constexpr (N == 128)
+                       {
+                               return _mm_mul_epu32(a, b);
+                       }
+
+                       if constexpr (N == 256)
+                       {
+                               return _mm256_mul_epu32(a, b);
+                       }
+
+                       if constexpr (N == 512)
+                       {
+                               return _mm512_mul_epu32(a, b);
+                       }
+
+                       if constexpr (N == 64)
+                       {
+                               return a * b;
+                       }
+               }
+
+
+               template <size_t N>
+               XXH_FORCE_INLINE vec_t<N> add(vec_t<N> a, vec_t<N> b)
+               {
+                       static_assert(!(N != 128 && N != 256 && N != 64 && N != 512), "Invalid argument passed to xxh::vec_ops::add");
+
+                       if constexpr (N == 128)
+                       {
+                               return _mm_add_epi64(a, b);
+                       }
+
+                       if constexpr (N == 256)
+                       {
+                               return _mm256_add_epi64(a, b);
+                       }
+
+                       if constexpr (N == 512)
+                       {
+                               return _mm512_add_epi64(a, b);
+                       }
+
+                       if constexpr (N == 64)
+                       {
+                               return a + b;
+                       }
+               }
+
+
+               template <size_t N, uint8_t S1, uint8_t S2, uint8_t S3, uint8_t S4>
+               XXH_FORCE_INLINE vec_t<N> shuffle(vec_t<N> a)
+               { 
+                       static_assert(!(N != 128 && N != 256 && N != 64 && N != 512), "Invalid argument passed to xxh::vec_ops::shuffle");
+
+                       if constexpr (N == 128)
+                       {
+                               return _mm_shuffle_epi32(a, _MM_SHUFFLE(S1, S2, S3, S4));
+                       }
+
+                       if constexpr (N == 256)
+                       {
+                               return _mm256_shuffle_epi32(a, _MM_SHUFFLE(S1, S2, S3, S4));
+                       }
+
+                       if constexpr (N == 512)
+                       {
+                               return _mm512_shuffle_epi32(a, _MM_SHUFFLE(S1, S2, S3, S4));
+                       }
+
+                       if constexpr (N == 64)
+                       {
+                               return a;
+                       }
+               }
+
+
+               template <size_t N>
+               XXH_FORCE_INLINE vec_t<N> set1(int64_t a)
+               {
+                       static_assert(!(N != 128 && N != 256 && N != 64 && N != 512), "Invalid argument passed to xxh::vec_ops::set1");
+
+                       if constexpr (N == 128)
+                       {
+                               return _mm_set1_epi32(static_cast<int>(a));
+                       }
+
+                       if constexpr (N == 256)
+                       {
+                               return _mm256_set1_epi32(static_cast<int>(a));
+                       }
+
+                       if constexpr (N == 512)
+                       {
+                               return _mm512_set1_epi32(static_cast<int>(a));
+                       }
+
+                       if constexpr (N == 64)
+                       {
+                               return a;
+                       }
+               }
+
+
+               template <size_t N>
+               XXH_FORCE_INLINE vec_t<N> srli(vec_t<N> n, int a)
+               {
+                       static_assert(!(N != 128 && N != 256 && N != 64 && N != 512), "Invalid argument passed to xxh::vec_ops::srli");
+
+                       if constexpr (N == 128)
+                       {
+                               return _mm_srli_epi64(n, a);
+                       }
+
+                       if constexpr (N == 256)
+                       {
+                               return _mm256_srli_epi64(n, a);
+                       }
+
+                       if constexpr (N == 512)
+                       {
+                               return _mm512_srli_epi64(n, a);
+                       }
+
+                       if constexpr (N == 64)
+                       {
+                               return n >> a;
+                       }
+               }
+
+
+               template <size_t N>
+               XXH_FORCE_INLINE vec_t<N> slli(vec_t<N> n, int a)
+               {
+                       static_assert(!(N != 128 && N != 256 && N != 64 && N != 512), "Invalid argument passed to xxh::vec_ops::slli");
+
+                       if constexpr (N == 128)
+                       {
+                               return _mm_slli_epi64(n, a);
+                       }
+
+                       if constexpr (N == 256)
+                       {
+                               return _mm256_slli_epi64(n, a);
+                       }
+
+                       if constexpr (N == 512)
+                       {
+                               return _mm512_slli_epi64(n, a);
+                       }
+
+                       if constexpr (N == 64)
+                       {
+                               return n << a;
+                       }
+               }
+       }
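+
+	/* Illustrative note (not part of upstream): every vec_ops primitive has a
+	* scalar N == 64 fallback, so the same templated stripe loop compiles with
+	* or without SIMD support, e.g.:
+	*
+	*   uint64_t acc = xxh::vec_ops::add<64>(a, b);   // plain a + b
+	*   vec128_t v   = xxh::vec_ops::add<128>(x, y);  // _mm_add_epi64(x, y)
+	*/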
+
+       /* *************************************
+	*  Canonical representation
+       ***************************************/
+
+       template <size_t bit_mode>
+       struct canonical_t
+       {
+               std::array<uint8_t, bit_mode / 8> digest{ 0 };
+
+               canonical_t(hash_t<bit_mode> hash)
+               {
+                       if constexpr (bit_mode < 128)
+                       {
+                               if (mem_ops::is_little_endian())
+                               {
+                                       hash = bit_ops::swap<bit_mode>(hash);
+                               }
+
+                               memcpy(digest.data(), &hash, sizeof(canonical_t<bit_mode>));
+                       }
+                       else
+                       {
+                               if (mem_ops::is_little_endian())
+                               {
+                                       hash.low64 = bit_ops::swap<64>(hash.low64);
+                                       hash.high64 = bit_ops::swap<64>(hash.high64);
+                               }
+
+                               memcpy(digest.data(), &hash.high64, sizeof(hash.high64));
+                               memcpy(digest.data() + sizeof(hash.high64), &hash.low64, sizeof(hash.low64));
+                       }
+               }
+
+               hash_t<bit_mode> get_hash() const
+               {
+                       if constexpr (bit_mode < 128)
+                       {
+                               return mem_ops::readBE<bit_mode>(&digest);
+                       }
+                       else
+                       {
+                               return { mem_ops::readBE<64>(&digest[8]), mem_ops::readBE<64>(&digest) };
+                       }
+               }
+       };
+
+       using canonical32_t = canonical_t<32>;
+       using canonical64_t = canonical_t<64>;
+       using canonical128_t = canonical_t<128>;
+
+       template <size_t bit_mode>
+       inline hash_t<bit_mode> to_canonical(hash_t<bit_mode> hash)
+       {
+               static_assert(!(bit_mode != 128 && bit_mode != 64 && bit_mode != 32), "Canonical form can only be obtained from 32, 64 and 128 bit hashes.");
+               canonical_t<bit_mode> canon(hash);
+               hash_t<bit_mode> res;
+		memcpy(&res, &canon, bit_mode / 8); /* bit_mode bits == bit_mode / 8 bytes */
+
+               return res;
+       }
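+
+	/* Illustrative example (not part of upstream): the canonical form stores
+	* the hash big-endian, independent of the host, so for a 64-bit hash:
+	*
+	*   xxh::canonical64_t c(0x0102030405060708ULL);
+	*   // c.digest == { 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, 0x08 }
+	*   assert(c.get_hash() == 0x0102030405060708ULL);
+	*/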
+
+
+       /* *************************************
+       *  Algorithm Implementation - xxhash
+       ***************************************/
+
+       namespace detail
+       {
+               using namespace mem_ops;
+               using namespace bit_ops;
+
+
+               /* *************************************
+               *  Constants
+               ***************************************/
+
+               constexpr static std::array<uint32_t, 5> primes32 = { 2654435761U, 2246822519U, 3266489917U, 668265263U, 374761393U };
+               constexpr static std::array<uint64_t, 5> primes64 = { 11400714785074694791ULL, 14029467366897019727ULL, 1609587929392839161ULL, 9650029242287828579ULL, 2870177450012600261ULL };
+
+               template <size_t N>
+               constexpr uint_t<N> PRIME(uint64_t n) 
+               {
+                       if constexpr (N == 32)
+                       {
+                               return primes32[n - 1];
+                       }
+                       else
+                       {
+                               return primes64[n - 1];
+                       }
+               }
+
+
+               /* *************************************
+               *  Functions
+               ***************************************/
+
+               template <size_t N> 
+               XXH_FORCE_INLINE uint_t<N> avalanche(uint_t<N> hash)
+               {
+                       if constexpr (N == 32)
+                       {
+                               hash ^= hash >> 15;
+                               hash *= PRIME<32>(2);
+                               hash ^= hash >> 13;
+                               hash *= PRIME<32>(3);
+                               hash ^= hash >> 16;
+                               return hash;
+                       }
+                       else if constexpr (N == 64)
+                       {
+                               hash ^= hash >> 33;
+                               hash *= PRIME<64>(2);
+                               hash ^= hash >> 29;
+                               hash *= PRIME<64>(3);
+                               hash ^= hash >> 32;
+                               return hash;
+                       }
+                       else return 0;
+               }
+
+               template <size_t N>
+               XXH_FORCE_INLINE uint_t<N> round(uint_t<N> seed, uint_t<N> input)
+               {
+                       seed += input * PRIME<N>(2);
+
+                       if constexpr (N == 32)
+                       {
+                               seed = rotl<N>(seed, 13);
+                       }
+                       else
+                       {
+                               seed = rotl<N>(seed, 31);
+                       }
+
+                       seed *= PRIME<N>(1);
+                       return seed;
+               }
+
+               XXH_FORCE_INLINE uint64_t mergeRound64(hash64_t acc, uint64_t val)
+               {
+                       val = round<64>(0, val);
+                       acc ^= val;
+                       acc = acc * PRIME<64>(1) + PRIME<64>(4);
+                       return acc;
+               }
+
+               XXH_FORCE_INLINE void endian_align_sub_mergeround(hash64_t& hash_ret, uint64_t v1, uint64_t v2, uint64_t v3, uint64_t v4)
+               {
+                       hash_ret = mergeRound64(hash_ret, v1);
+                       hash_ret = mergeRound64(hash_ret, v2);
+                       hash_ret = mergeRound64(hash_ret, v3);
+                       hash_ret = mergeRound64(hash_ret, v4);
+               }
+
+               template <size_t N>
+               static inline hash_t<N> endian_align_sub_ending(hash_t<N> hash_ret, const uint8_t* p, const uint8_t* bEnd)
+               {
+                       if constexpr (N == 32)
+                       {
+                               while ((p + 4) <= bEnd)
+                               {
+                                       hash_ret += readLE<32>(p) * PRIME<32>(3);
+                                       hash_ret = rotl<32>(hash_ret, 17) * PRIME<32>(4);
+                                       p += 4;
+                               }
+
+                               while (p < bEnd)
+                               {
+                                       hash_ret += (*p) * PRIME<32>(5);
+                                       hash_ret = rotl<32>(hash_ret, 11) * PRIME<32>(1);
+                                       p++;
+                               }
+
+                               return avalanche<32>(hash_ret);
+                       }
+                       else
+                       {
+                               while (p + 8 <= bEnd)
+                               {
+                                       const uint64_t k1 = round<64>(0, readLE<64>(p));
+
+                                       hash_ret ^= k1;
+                                       hash_ret = rotl<64>(hash_ret, 27) * PRIME<64>(1) + PRIME<64>(4);
+                                       p += 8;
+                               }
+
+                               if (p + 4 <= bEnd)
+                               {
+                                       hash_ret ^= static_cast<uint64_t>(readLE<32>(p)) * PRIME<64>(1);
+                                       hash_ret = rotl<64>(hash_ret, 23) * PRIME<64>(2) + PRIME<64>(3);
+                                       p += 4;
+                               }
+
+                               while (p < bEnd)
+                               {
+                                       hash_ret ^= (*p) * PRIME<64>(5);
+                                       hash_ret = rotl<64>(hash_ret, 11) * PRIME<64>(1);
+                                       p++;
+                               }
+
+                               return avalanche<64>(hash_ret);
+                       }
+               }
+
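+               /* endian_align() is the complete single-shot XXH32/XXH64 core: four
+               *  parallel accumulators consume N/2 bytes per iteration and are merged,
+               *  then the tail is folded in by endian_align_sub_ending() before the
+               *  final avalanche. */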
+               template <size_t N>
+               static inline hash_t<N> endian_align(const void* input, size_t len, uint_t<N> seed)
+               {
+                       static_assert(!(N != 32 && N != 64), "You can only call endian_align in 32 or 64 bit mode.");
+
+                       const uint8_t* p = static_cast<const uint8_t*>(input);
+                       const uint8_t* bEnd = p + len;
+                       hash_t<N> hash_ret;
+
+                       if (len >= (N / 2))
+                       {
+                               const uint8_t* const limit = bEnd - (N / 2);
+                               uint_t<N> v1 = seed + PRIME<N>(1) + PRIME<N>(2);
+                               uint_t<N> v2 = seed + PRIME<N>(2);
+                               uint_t<N> v3 = seed + 0;
+                               uint_t<N> v4 = seed - PRIME<N>(1);
+
+                               do
+                               {
+                                       v1 = round<N>(v1, readLE<N>(p)); 
+                                       p += (N / 8);
+                                       v2 = round<N>(v2, readLE<N>(p)); 
+                                       p += (N / 8);
+                                       v3 = round<N>(v3, readLE<N>(p)); 
+                                       p += (N / 8);
+                                       v4 = round<N>(v4, readLE<N>(p)); 
+                                       p += (N / 8);
+                               } 
+                               while (p <= limit);
+
+                               hash_ret = rotl<N>(v1, 1) + rotl<N>(v2, 7) + rotl<N>(v3, 12) + rotl<N>(v4, 18);
+
+                               if constexpr (N == 64)
+                               {
+                                       endian_align_sub_mergeround(hash_ret, v1, v2, v3, v4);
+                               }
+                       }
+                       else 
+                       { 
+                               hash_ret = seed + PRIME<N>(5); 
+                       }
+
+                       hash_ret += static_cast<hash_t<N>>(len);
+
+                       return endian_align_sub_ending<N>(hash_ret, p, bEnd);
+               }
+       }
+
+
+       /* *************************************
+       *  Algorithm Implementation - xxhash3
+       ***************************************/
+
+       namespace detail3
+       {
+               using namespace vec_ops;
+               using namespace detail;
+               using namespace mem_ops;
+               using namespace bit_ops;
+
+
+               /* *************************************
+               *  Enums
+               ***************************************/
+
+               enum class vec_mode : uint8_t { scalar = 0, sse2 = 1, avx2 = 2, avx512 = 3 };
+
+
+               /* *************************************
+               *  Constants
+               ***************************************/
+
+               constexpr uint64_t secret_default_size = 192;
+               constexpr uint64_t secret_size_min = 136;
+               constexpr uint64_t secret_consume_rate = 8;
+               constexpr uint64_t stripe_len = 64;
+               constexpr uint64_t acc_nb = 8;
+               constexpr uint64_t prefetch_distance = 384;
+               constexpr uint64_t secret_lastacc_start = 7;
+               constexpr uint64_t secret_mergeaccs_start = 11;
+               constexpr uint64_t midsize_max = 240;
+               constexpr uint64_t midsize_startoffset = 3;
+               constexpr uint64_t midsize_lastoffset = 17;
+
+               constexpr vec_mode vector_mode = static_cast<vec_mode>(intrin::vector_mode);
+               constexpr uint64_t acc_align = intrin::acc_align;
+               constexpr std::array<uint64_t, 4> vector_bit_width { 64, 128, 256, 512 };
+
+               
+               /* *************************************
+               *  Defaults
+               ***************************************/
+               
+               alignas(64) constexpr uint8_t default_secret[secret_default_size] = {
+                       0xb8, 0xfe, 0x6c, 0x39, 0x23, 0xa4, 0x4b, 0xbe, 0x7c, 0x01, 0x81, 0x2c, 0xf7, 0x21, 0xad, 0x1c,
+                       0xde, 0xd4, 0x6d, 0xe9, 0x83, 0x90, 0x97, 0xdb, 0x72, 0x40, 0xa4, 0xa4, 0xb7, 0xb3, 0x67, 0x1f,
+                       0xcb, 0x79, 0xe6, 0x4e, 0xcc, 0xc0, 0xe5, 0x78, 0x82, 0x5a, 0xd0, 0x7d, 0xcc, 0xff, 0x72, 0x21,
+                       0xb8, 0x08, 0x46, 0x74, 0xf7, 0x43, 0x24, 0x8e, 0xe0, 0x35, 0x90, 0xe6, 0x81, 0x3a, 0x26, 0x4c,
+                       0x3c, 0x28, 0x52, 0xbb, 0x91, 0xc3, 0x00, 0xcb, 0x88, 0xd0, 0x65, 0x8b, 0x1b, 0x53, 0x2e, 0xa3,
+                       0x71, 0x64, 0x48, 0x97, 0xa2, 0x0d, 0xf9, 0x4e, 0x38, 0x19, 0xef, 0x46, 0xa9, 0xde, 0xac, 0xd8,
+                       0xa8, 0xfa, 0x76, 0x3f, 0xe3, 0x9c, 0x34, 0x3f, 0xf9, 0xdc, 0xbb, 0xc7, 0xc7, 0x0b, 0x4f, 0x1d,
+                       0x8a, 0x51, 0xe0, 0x4b, 0xcd, 0xb4, 0x59, 0x31, 0xc8, 0x9f, 0x7e, 0xc9, 0xd9, 0x78, 0x73, 0x64,
+                       0xea, 0xc5, 0xac, 0x83, 0x34, 0xd3, 0xeb, 0xc3, 0xc5, 0x81, 0xa0, 0xff, 0xfa, 0x13, 0x63, 0xeb,
+                       0x17, 0x0d, 0xdd, 0x51, 0xb7, 0xf0, 0xda, 0x49, 0xd3, 0x16, 0x55, 0x26, 0x29, 0xd4, 0x68, 0x9e,
+                       0x2b, 0x16, 0xbe, 0x58, 0x7d, 0x47, 0xa1, 0xfc, 0x8f, 0xf8, 0xb8, 0xd1, 0x7a, 0xd0, 0x31, 0xce,
+                       0x45, 0xcb, 0x3a, 0x8f, 0x95, 0x16, 0x04, 0x28, 0xaf, 0xd7, 0xfb, 0xca, 0xbb, 0x4b, 0x40, 0x7e,
+               };
+
+               constexpr std::array<uint64_t, 8> init_acc = { PRIME<32>(3), PRIME<64>(1), PRIME<64>(2), PRIME<64>(3), PRIME<64>(4), PRIME<32>(2), PRIME<64>(5), PRIME<32>(1) };
+       
+
+               /* *************************************
+               *  Functions
+               ***************************************/
+       
+               XXH_FORCE_INLINE hash_t<64> avalanche(hash_t<64> h64)
+               {
+                       constexpr uint64_t avalanche_mul_prime = 0x165667919E3779F9ULL;
+
+                       h64 ^= h64 >> 37;
+                       h64 *= avalanche_mul_prime;
+                       h64 ^= h64 >> 32;
+                       return h64;
+               }
+
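+               /* Mixer for mid-short inputs; the upstream xxHash sources credit this
+               *  construction to Pelle Evensen's rrmxmx. */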
+               XXH_FORCE_INLINE hash_t<64> rrmxmx(hash_t<64> h64, uint64_t len)
+               {
+                       h64 ^= rotl<64>(h64, 49) ^ rotl<64>(h64, 24);
+                       h64 *= 0x9FB21C651E98DF25ULL;
+                       h64 ^= (h64 >> 35) + len;
+                       h64 *= 0x9FB21C651E98DF25ULL;
+                       h64 ^= (h64 >> 28);
+                       return h64;
+               }
+
+               XXH_FORCE_INLINE void combine_16(void* dest, hash128_t h128)
+               {
+                       writeLE<64>(dest, readLE<64>(dest) ^ h128.low64);
+                       writeLE<64>((uint8_t*)dest + 8, readLE<64>((uint8_t*)dest + 8) ^ h128.high64);
+               }
+
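+               /* accumulate_512() folds one 64-byte stripe into the 8x64-bit
+               *  accumulator. The same body serves the scalar, SSE2, AVX2 and AVX512
+               *  builds: the vec_t width and the loadu/xorv/add/mul primitives are
+               *  selected at compile time, with a dedicated scalar branch below. */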
+               XXH_FORCE_INLINE void accumulate_512(void* XXH_RESTRICT acc, const void* XXH_RESTRICT input, const void* XXH_RESTRICT secret)
+               {
+                       constexpr uint64_t bits = vector_bit_width[static_cast<uint8_t>(vector_mode)];
+
+                       using vec_t = vec_t<bits>;
+                       
+                       alignas(sizeof(vec_t)) vec_t* const xacc = static_cast<vec_t*>(acc);
+                       const vec_t* const xinput = static_cast<const vec_t*>(input);
+                       const vec_t* const xsecret = static_cast<const vec_t*>(secret);
+
+                       for (size_t i = 0; i < stripe_len / sizeof(vec_t); i++)
+                       {
+                               vec_t const data_vec = loadu<bits>(xinput + i);
+                               vec_t const key_vec = loadu<bits>(xsecret + i);
+                               vec_t const data_key = xorv<bits>(data_vec, key_vec);
+                               vec_t product = set1<bits>(0);
+
+                               if constexpr (vector_mode == vec_mode::scalar)
+                               {
+                                       product = mul32to64<bits>(srli<bits>(slli<bits>(data_key, 32), 32), srli<bits>(data_key, 32));
+                                       xacc[i ^ 1] = add<bits>(xacc[i ^ 1], data_vec);
+                                       xacc[i] = add<bits>(xacc[i], product);
+                               }
+                               else
+                               {
+                                       vec_t const data_key_lo = shuffle<bits, 0, 3, 0, 1>(data_key);
+                                       product = mul<bits>(data_key, data_key_lo);
+
+                                       vec_t const data_swap = shuffle<bits, 1, 0, 3, 2>(data_vec);
+                                       vec_t const sum = add<bits>(xacc[i], data_swap);
+                                       xacc[i] = add<bits>(sum, product);
+                               }                               
+                       }
+               }
+
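+               /* scramble_acc() is applied once per block to keep the accumulators
+               *  well mixed: xorshift right by 47, xor with the secret, then multiply
+               *  by PRIME<32>(1) (split into two 32-bit products on the SIMD paths). */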
+               XXH_FORCE_INLINE void scramble_acc(void* XXH_RESTRICT acc, const void* XXH_RESTRICT secret)
+               {
+                       constexpr uint64_t bits = vector_bit_width[static_cast<uint8_t>(vector_mode)];
+
+                       using vec_t = vec_t<bits>;
+
+                       alignas(sizeof(vec_t)) vec_t* const xacc = static_cast<vec_t*>(acc);
+                       const vec_t* const xsecret = static_cast<const vec_t*>(secret);
+
+                       for (size_t i = 0; i < stripe_len / sizeof(vec_t); i++)
+                       {
+                               vec_t const acc_vec = xacc[i];
+                               vec_t const shifted = srli<bits>(acc_vec, 47);
+                               vec_t const data_vec = xorv<bits>(acc_vec, shifted);
+                               vec_t const key_vec = loadu<bits>(xsecret + i);
+                               vec_t const data_key = xorv<bits>(data_vec, key_vec);
+                               
+                               if constexpr (vector_mode == vec_mode::scalar)
+                               {
+                                       xacc[i] = mul<bits>(data_key, set1<bits>(PRIME<32>(1)));
+                               }
+                               else
+                               {
+                                       vec_t const prime32 = set1<bits>(PRIME<32>(1));
+                                       vec_t const data_key_hi = shuffle<bits, 0, 3, 0, 1>(data_key);
+                                       vec_t const prod_lo = mul<bits>(data_key, prime32);
+                                       vec_t const prod_hi = mul<bits>(data_key_hi, prime32);
+
+                                       xacc[i] = add<bits>(prod_lo, vec_ops::slli<bits>(prod_hi, 32));
+                               }
+                       }
+               }
+
+               XXH_FORCE_INLINE void accumulate(uint64_t* XXH_RESTRICT acc, const uint8_t* XXH_RESTRICT input, const uint8_t* XXH_RESTRICT secret, size_t nbStripes)
+               {
+                       for (size_t n = 0; n < nbStripes; n++) 
+                       {
+                               const uint8_t* const in = input + n * stripe_len;
+
+                               intrin::prefetch(in + prefetch_distance);
+                               accumulate_512(acc, in, secret + n * secret_consume_rate);
+                       }
+               }
+
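+               /* Long-input driver: runs whole blocks of nb_rounds stripes with a
+               *  scramble after each block, then the trailing partial block, and
+               *  finally one (possibly overlapping) stripe taken from the very end of
+               *  the input. */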
+               XXH_FORCE_INLINE void hash_long_internal_loop(uint64_t* XXH_RESTRICT acc, const uint8_t* XXH_RESTRICT input, size_t len, const uint8_t* XXH_RESTRICT secret, size_t secretSize)
+               {
+                       size_t const nb_rounds = (secretSize - stripe_len) / secret_consume_rate;
+                       size_t const block_len = stripe_len * nb_rounds;
+                       size_t const nb_blocks = (len-1) / block_len;
+
+                       for (size_t n = 0; n < nb_blocks; n++) 
+                       {
+                               accumulate(acc, input + n * block_len, secret, nb_rounds);
+                               scramble_acc(acc, secret + secretSize - stripe_len);
+                       }
+
+                       /* last partial block */
+                       size_t const nbStripes = ((len - 1) - (block_len * nb_blocks)) / stripe_len;
+
+                       accumulate(acc, input + nb_blocks * block_len, secret, nbStripes);
+
+                       /* last stripe */
+                       const uint8_t* const p = input + len - stripe_len;
+
+                       accumulate_512(acc, p, secret + secretSize - stripe_len - secret_lastacc_start);
+               }
+
+               XXH_FORCE_INLINE uint64_t mix_2_accs(const uint64_t* XXH_RESTRICT acc, const uint8_t* XXH_RESTRICT secret)
+               {
+                       return mul128fold64(acc[0] ^ readLE<64>(secret), acc[1] ^ readLE<64>(secret + 8));
+               }
+
+               XXH_FORCE_INLINE uint64_t merge_accs(const uint64_t* XXH_RESTRICT acc, const uint8_t* XXH_RESTRICT secret, uint64_t start)
+               {
+                       uint64_t result64 = start;
+
+                       result64 += mix_2_accs(acc + 0, secret + 0);
+                       result64 += mix_2_accs(acc + 2, secret + 16);
+                       result64 += mix_2_accs(acc + 4, secret + 32);
+                       result64 += mix_2_accs(acc + 6, secret + 48);
+
+                       return avalanche(result64);
+               }
+
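+               /* Derives a per-seed secret from default_secret: in every 16-byte
+               *  segment the low 8 bytes have the seed added and the high 8 bytes
+               *  have it subtracted. */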
+               XXH_FORCE_INLINE void init_custom_secret(uint8_t* customSecret, uint64_t seed)
+               {
+                       for (uint64_t i = 0; i < secret_default_size / 16; i++) 
+                       {
+                               writeLE<64>(customSecret + i * 16, readLE<64>(default_secret + i * 16) + seed);
+                               writeLE<64>(customSecret + i * 16 + 8, readLE<64>(default_secret + i * 16 + 8) - seed);
+                       }
+               }
+
+               template <size_t N>
+               XXH_FORCE_INLINE hash_t<N> len_1to3(const uint8_t* input, size_t len, const uint8_t* secret, uint64_t seed)
+               {
+                       if constexpr (N == 64)
+                       {
+                               uint8_t const c1 = input[0];
+                               uint8_t const c2 = input[len >> 1];
+                               uint8_t const c3 = input[len - 1];
+                               uint32_t const combined = ((uint32_t)c1 << 16) | (((uint32_t)c2) << 24) | (((uint32_t)c3) << 0) | (((uint32_t)len) << 8);
+                               uint64_t const bitflip = (readLE<32>(secret) ^ readLE<32>(secret + 4)) + seed;
+                               uint64_t const keyed = (uint64_t)combined ^ bitflip;
+                               return detail::avalanche<64>(keyed);
+                       }
+                       else
+                       {
+                               uint8_t const c1 = input[0];
+                               uint8_t const c2 = input[len >> 1];
+                               uint8_t const c3 = input[len - 1];
+                               uint32_t const combinedl = ((uint32_t)c1 << 16) + (((uint32_t)c2) << 24) + (((uint32_t)c3) << 0) + (((uint32_t)len) << 8);
+                               uint32_t const combinedh = rotl<32>(swap<32>(combinedl), 13);
+                               uint64_t const bitflipl = (readLE<32>(secret) ^ readLE<32>(secret + 4)) + seed;
+                               uint64_t const bitfliph = (readLE<32>(secret + 8) ^ readLE<32>(secret + 12)) - seed;
+                               uint64_t const keyed_lo = (uint64_t)combinedl ^ bitflipl;
+                               uint64_t const keyed_hi = (uint64_t)combinedh ^ bitfliph;
+                               hash128_t const h128 = { detail::avalanche<64>(keyed_lo), detail::avalanche<64>(keyed_hi)};
+
+                               return h128;
+                       }
+               }
+
+               template <size_t N>
+               XXH_FORCE_INLINE hash_t<N> len_4to8(const uint8_t* input, size_t len, const uint8_t* secret, uint64_t seed)
+               {
+                       constexpr uint64_t mix_constant = 0x9FB21C651E98DF25ULL;
+
+                       seed ^= (uint64_t)swap<32>((uint32_t)seed) << 32;
+
+                       if constexpr (N == 64)
+                       {               
+                               uint32_t const input1 = readLE<32>(input);
+                               uint32_t const input2 = readLE<32>(input + len - 4);
+                               uint64_t const bitflip = (readLE<64>(secret + 8) ^ readLE<64>(secret + 16)) - seed;
+                               uint64_t const input64 = input2 + ((uint64_t)input1 << 32);
+                               uint64_t keyed = input64 ^ bitflip;
+                       
+                               return rrmxmx(keyed, len);
+                       }
+                       else
+                       {
+                               uint32_t const input_lo = readLE<32>(input);
+                               uint32_t const input_hi = readLE<32>(input + len - 4);
+                               uint64_t const input_64 = input_lo + ((uint64_t)input_hi << 32);
+                               uint64_t const bitflip = (readLE<64>(secret + 16) ^ readLE<64>(secret + 24)) + seed;
+                               uint64_t const keyed = input_64 ^ bitflip;
+                               uint128_t m128 = mul64to128(keyed, PRIME<64>(1) + (len << 2));
+
+                               m128.high64 += (m128.low64 << 1);
+                               m128.low64 ^= (m128.high64 >> 3);
+                               m128.low64 ^= (m128.low64 >> 35);
+                               m128.low64 *= mix_constant;
+                               m128.low64 ^= (m128.low64 >> 28);
+                               m128.high64 = avalanche(m128.high64);
+
+                               return m128;            
+                       }
+               }
+
+               template <size_t N>
+               XXH_FORCE_INLINE hash_t<N> len_9to16(const uint8_t* input, size_t len, const uint8_t* secret, uint64_t seed)
+               {
+                       if constexpr (N == 64)
+                       {
+                               uint64_t const bitflip1 = (readLE<64>(secret + 24) ^ readLE<64>(secret + 32)) + seed;
+                               uint64_t const bitflip2 = (readLE<64>(secret + 40) ^ readLE<64>(secret + 48)) - seed;
+                               uint64_t const input_lo = readLE<64>(input) ^ bitflip1;
+                               uint64_t const input_hi = readLE<64>(input + len - 8) ^ bitflip2;
+                               uint64_t const acc = len + swap<64>(input_lo) + input_hi + mul128fold64(input_lo, input_hi);
+
+                               return avalanche(acc);
+                       }
+                       else
+                       {
+                               uint64_t const bitflipl = (readLE<64>(secret + 32) ^ readLE<64>(secret + 40)) - seed;
+                               uint64_t const bitfliph = (readLE<64>(secret + 48) ^ readLE<64>(secret + 56)) + seed;
+                               uint64_t const input_lo = readLE<64>(input);
+                               uint64_t input_hi = readLE<64>(input + len - 8);
+                               uint128_t m128 = mul64to128(input_lo ^ input_hi ^ bitflipl, PRIME<64>(1));
+
+                               m128.low64 += (uint64_t)(len - 1) << 54;
+                               input_hi ^= bitfliph;
+
+                               if constexpr (sizeof(void*) < sizeof(uint64_t)) // 32-bit version
+                               {
+                                       m128.high64 += (input_hi & 0xFFFFFFFF00000000) + mul32to64((uint32_t)input_hi, PRIME<32>(2));
+                               }
+                               else
+                               {
+                                       m128.high64 += input_hi + mul32to64((uint32_t)input_hi, PRIME<32>(2) - 1);
+                               }
+
+                               m128.low64 ^= swap<64>(m128.high64);
+
+                               hash128_t h128 = mul64to128(m128.low64, PRIME<64>(2));
+
+                               h128.high64 += m128.high64 * PRIME<64>(2);
+                               h128.low64 = avalanche(h128.low64);
+                               h128.high64 = avalanche(h128.high64);
+
+                               return h128;
+                       }
+               }
+
+               template <size_t N>
+               XXH_FORCE_INLINE hash_t<N> len_0to16(const uint8_t* input, size_t len, const uint8_t* secret, uint64_t seed)
+               {
+                       if (XXH_likely(len > 8))
+                       {
+                               return len_9to16<N>(input, len, secret, seed);
+                       }
+                       else if (XXH_likely(len >= 4))
+                       {
+                               return len_4to8<N>(input, len, secret, seed);
+                       }
+                       else if (len)
+                       {
+                               return len_1to3<N>(input, len, secret, seed);
+                       }
+                       else
+                       {
+                               if constexpr (N == 64)
+                               {
+                                       return detail::avalanche<64>(seed ^ (readLE<64>(secret + 56) ^ readLE<64>(secret + 64)));
+                               }
+                               else
+                               {
+                                       uint64_t const bitflipl = readLE<64>(secret + 64) ^ readLE<64>(secret + 72);
+                                       uint64_t const bitfliph = readLE<64>(secret + 80) ^ readLE<64>(secret + 88);
+
+                                       return hash128_t(detail::avalanche<64>(seed ^ bitflipl), detail::avalanche<64>(seed ^ bitfliph));
+                               }                       
+                       }
+               }
+
+               template <size_t N>
+               XXH_FORCE_INLINE hash_t<N> hash_long_internal(const uint8_t* XXH_RESTRICT input, size_t len, const uint8_t* XXH_RESTRICT secret = default_secret, size_t secretSize = sizeof(default_secret))
+               {
+                       alignas(acc_align) std::array<uint64_t, acc_nb> acc = init_acc;
+
+                       if constexpr (N == 64)
+                       {                               
+                               hash_long_internal_loop(acc.data(), input, len, secret, secretSize);
+
+                               /* converge into final hash */
+                               return merge_accs(acc.data(), secret + secret_mergeaccs_start, (uint64_t)len * PRIME<64>(1));
+                       }
+                       else
+                       {
+                               hash_long_internal_loop(acc.data(), input, len, secret, secretSize);
+
+                               /* converge into final hash */
+                               uint64_t const low64 = merge_accs(acc.data(), secret + secret_mergeaccs_start, (uint64_t)len * PRIME<64>(1));
+                               uint64_t const high64 = merge_accs(acc.data(), secret + secretSize - sizeof(acc) - secret_mergeaccs_start, ~((uint64_t)len * PRIME<64>(2)));
+
+                               return hash128_t(low64, high64);
+                       }
+               }
+
+               XXH_FORCE_INLINE uint64_t mix_16b(const uint8_t* XXH_RESTRICT input, const uint8_t* XXH_RESTRICT secret, uint64_t seed)
+               {
+                       uint64_t const input_lo = readLE<64>(input);
+                       uint64_t const input_hi = readLE<64>(input + 8);
+
+                       return mul128fold64(input_lo ^ (readLE<64>(secret) + seed), input_hi ^ (readLE<64>(secret + 8) - seed));
+               }
+
+               XXH_FORCE_INLINE uint128_t mix_32b(uint128_t acc, const uint8_t* input1, const uint8_t* input2, const uint8_t* secret, uint64_t seed)
+               {
+                       acc.low64 += mix_16b(input1, secret + 0, seed);
+                       acc.low64 ^= readLE<64>(input2) + readLE<64>(input2 + 8);
+                       acc.high64 += mix_16b(input2, secret + 16, seed);
+                       acc.high64 ^= readLE<64>(input1) + readLE<64>(input1 + 8);
+
+                       return acc;     
+               }
+
+               template <size_t N>
+               XXH_FORCE_INLINE hash_t<N> len_17to128(const uint8_t* XXH_RESTRICT input, size_t len, const uint8_t* XXH_RESTRICT secret, uint64_t seed)
+               {
+                       if constexpr (N == 64)
+                       {
+                               hash64_t acc = len * PRIME<64>(1);
+
+                               if (len > 32) 
+                               {
+                                       if (len > 64) 
+                                       {
+                                               if (len > 96) 
+                                               {
+                                                       acc += mix_16b(input + 48, secret + 96, seed);
+                                                       acc += mix_16b(input + len - 64, secret + 112, seed);
+                                               }
+
+                                               acc += mix_16b(input + 32, secret + 64, seed);
+                                               acc += mix_16b(input + len - 48, secret + 80, seed);
+                                       }
+
+                                       acc += mix_16b(input + 16, secret + 32, seed);
+                                       acc += mix_16b(input + len - 32, secret + 48, seed);
+                               }
+
+                               acc += mix_16b(input + 0, secret + 0, seed);
+                               acc += mix_16b(input + len - 16, secret + 16, seed);
+
+                               return avalanche(acc);
+                       }
+                       else
+                       {
+                               hash128_t acc = { len * PRIME<64>(1), 0 };
+
+                               if (len > 32) 
+                               {
+                                       if (len > 64) 
+                                       {
+                                               if (len > 96) 
+                                               {
+                                                       acc = mix_32b(acc, input + 48, input + len - 64, secret + 96, seed);
+                                               }
+
+                                               acc = mix_32b(acc, input + 32, input + len - 48, secret + 64, seed);
+                                       }
+
+                                       acc = mix_32b(acc, input + 16, input + len - 32, secret + 32, seed);
+                               }
+
+                               acc = mix_32b(acc, input, input + len - 16, secret, seed);
+
+                               uint64_t const low64 = acc.low64 + acc.high64;
+                               uint64_t const high64 = (acc.low64 * PRIME<64>(1)) + (acc.high64 * PRIME<64>(4)) + ((len - seed) * PRIME<64>(2));
+
+                               return { avalanche(low64), (uint64_t)0 - avalanche(high64) };
+                       }
+               }
+
+               template <size_t N>
+               XXH_NO_INLINE hash_t<N> len_129to240(const uint8_t* XXH_RESTRICT input, size_t len, const uint8_t* XXH_RESTRICT secret, uint64_t seed)
+               {
+                       if constexpr (N == 64)
+                       {
+                               uint64_t acc = len * PRIME<64>(1);
+                               size_t const nbRounds = len / 16;
+
+                               for (size_t i = 0; i < 8; i++) 
+                               {
+                                       acc += mix_16b(input + (i * 16), secret + (i * 16), seed);
+                               }
+
+                               acc = avalanche(acc);
+
+                               for (size_t i = 8; i < nbRounds; i++) 
+                               {
+                                       acc += mix_16b(input + (i * 16), secret + ((i - 8) * 16) + midsize_startoffset, seed);
+                               }
+
+                               /* last bytes */
+                               acc += mix_16b(input + len - 16, secret + secret_size_min - midsize_lastoffset, seed);
+
+                               return avalanche(acc);
+                       }
+                       else
+                       {
+                               hash128_t acc;
+                               uint64_t const nbRounds = len / 32;
+
+                               acc.low64 = len * PRIME<64>(1);
+                               acc.high64 = 0;
+
+                               for (size_t i = 0; i < 4; i++) 
+                               {
+                                       acc = mix_32b(acc, input + (i * 32), input + (i * 32) + 16, secret + (i * 32), seed);
+                               }
+
+                               acc.low64 = avalanche(acc.low64);
+                               acc.high64 = avalanche(acc.high64);
+
+                               for (size_t i = 4; i < nbRounds; i++) 
+                               {
+                                       acc = mix_32b(acc, input + (i * 32), input + (i * 32) + 16, secret + midsize_startoffset + ((i - 4) * 32), seed);
+                               }
+
+                               /* last bytes */
+                               acc = mix_32b(acc, input + len - 16, input + len - 32, secret + secret_size_min - midsize_lastoffset - 16, 0ULL - seed);
+
+                               uint64_t const low64 = acc.low64 + acc.high64;
+                               uint64_t const high64 = (acc.low64 * PRIME<64>(1)) + (acc.high64 * PRIME<64>(4)) + ((len - seed) * PRIME<64>(2));
+
+                               return { avalanche(low64), (uint64_t)0 - avalanche(high64) };
+                       }
+               }
+
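+               /* Single-shot XXH3 dispatcher. Inputs of 0-16, 17-128 and 129-240 bytes
+               *  are handled by dedicated short-hash kernels (which combine the seed
+               *  with the default secret); longer inputs go through the striped
+               *  accumulator loop, using a seed-derived secret when a non-zero seed
+               *  accompanies the default secret. */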
+               template <size_t N>
+               XXH_NO_INLINE hash_t<N> xxhash3_impl(const void* XXH_RESTRICT input, size_t len, hash64_t seed, const void* XXH_RESTRICT secret = default_secret, size_t secretSize = secret_default_size)
+               {
+
+                       alignas(64) uint8_t custom_secret[secret_default_size];
+
+                       const void* short_secret = secret;
+               
+                       if (seed != 0)
+                       {
+                               init_custom_secret(custom_secret, seed);
+                               short_secret = default_secret;
+                       }
+
+                       if (len <= 16)
+                       {
+                               return len_0to16<N>(static_cast<const uint8_t*>(input), len, static_cast<const uint8_t*>(short_secret), seed);
+                       }
+                       else if (len <= 128)
+                       {
+                               return len_17to128<N>(static_cast<const uint8_t*>(input), len, static_cast<const uint8_t*>(short_secret), seed);
+                       }
+                       else if (len <= midsize_max)
+                       {
+                               return len_129to240<N>(static_cast<const uint8_t*>(input), len, static_cast<const uint8_t*>(short_secret), seed);
+                       }
+                       else
+                       {
+                               /* use the seed-derived secret only when a non-zero seed accompanies the default secret */
+                               const bool use_custom = (seed != 0) && (secret == default_secret);
+
+                               return hash_long_internal<N>(static_cast<const uint8_t*>(input), len,
+                                       static_cast<const uint8_t*>(use_custom ? custom_secret : secret),
+                                       use_custom ? secret_default_size : secretSize);
+                       }
+               }
+
+               XXH_NO_INLINE void generate_secret(void* secret_buffer, size_t secret_size, const void* custom_seed, size_t seed_size)
+               {
+                       if (seed_size == 0)
+                       {
+                               custom_seed = default_secret;
+                               seed_size = secret_default_size;
+                       }
+
+                       size_t pos = 0;
+                       while (pos < secret_size)
+                       {
+                               size_t const copy_len = std::min(secret_size - pos, seed_size);
+                               memcpy((uint8_t*)secret_buffer + pos, custom_seed, copy_len);
+                               pos += copy_len;
+                       }
+
+                       size_t const nbseg16 = secret_size / 16;
+                       canonical128_t scrambled(xxhash3_impl<128>(custom_seed, seed_size, 0));
+                       for (size_t n = 0; n < nbseg16; n++)
+                       {
+                               hash128_t const h128 = xxhash3_impl<128>(&scrambled, sizeof(scrambled), n);
+                               combine_16((uint8_t*)secret_buffer + n * 16, h128);
+                       }
+
+                       combine_16((uint8_t*)secret_buffer + secret_size - 16, scrambled.get_hash());
+               }
+       }
+
+
+       /* *************************************
+       *  Public Access Point - xxhash
+       ***************************************/
+
+       template <size_t bit_mode>
+       inline hash_t<bit_mode> xxhash(const void* input, size_t len, uint_t<bit_mode> seed = 0)
+       {
+               static_assert(!(bit_mode != 32 && bit_mode != 64), "xxhash can only be used in 32 and 64 bit modes.");
+               return detail::endian_align<bit_mode>(input, len, seed);
+       }
+
+       template <size_t bit_mode, typename T>
+       inline hash_t<bit_mode> xxhash(const std::basic_string<T>& input, uint_t<bit_mode> seed = 0)
+       {
+               static_assert(!(bit_mode != 32 && bit_mode != 64), "xxhash can only be used in 32 and 64 bit modes.");
+               return detail::endian_align<bit_mode>(static_cast<const void*>(input.data()), input.length() * sizeof(T), seed);
+       }
+
+       template <size_t bit_mode, typename ContiguousIterator>
+       inline hash_t<bit_mode> xxhash(ContiguousIterator begin, ContiguousIterator end, uint_t<bit_mode> seed = 0)
+       {
+               static_assert(!(bit_mode != 32 && bit_mode != 64), "xxhash can only be used in 32 and 64 bit modes.");
+               using T = typename std::decay_t<decltype(*end)>;
+               return detail::endian_align<bit_mode>(static_cast<const void*>(&*begin), (end - begin) * sizeof(T), seed);
+       }
+
+       template <size_t bit_mode, typename T>
+       inline hash_t<bit_mode> xxhash(const std::vector<T>& input, uint_t<bit_mode> seed = 0)
+       {
+               static_assert(!(bit_mode != 32 && bit_mode != 64), "xxhash can only be used in 32 and 64 bit modes.");
+               return detail::endian_align<bit_mode>(static_cast<const void*>(input.data()), input.size() * sizeof(T), seed);
+       }
+
+       template <size_t bit_mode, typename T, size_t AN>
+       inline hash_t<bit_mode> xxhash(const std::array<T, AN>& input, uint_t<bit_mode> seed = 0)
+       {
+               static_assert(!(bit_mode != 32 && bit_mode != 64), "xxhash can only be used in 32 and 64 bit modes.");
+               return detail::endian_align<bit_mode>(static_cast<const void*>(input.data()), AN * sizeof(T), seed);
+       }
+
+       template <size_t bit_mode, typename T>
+       inline hash_t<bit_mode> xxhash(const std::initializer_list<T>& input, uint_t<bit_mode> seed = 0)
+       {
+               static_assert(!(bit_mode != 32 && bit_mode != 64), "xxhash can only be used in 32 and 64 bit modes.");
+               return detail::endian_align<bit_mode>(static_cast<const void*>(input.begin()), input.size() * sizeof(T), seed);
+       }
+
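+       /* Illustrative usage, as a sketch (assuming this header's enclosing
+       *  namespace is xxh, as in the upstream xxhash_cpp repository):
+       *
+       *      std::string msg = "hello world";
+       *      auto h32 = xxh::xxhash<32>(msg);      // 32-bit hash, default seed 0
+       *      auto h64 = xxh::xxhash<64>(msg, 42);  // 64-bit hash, seed 42
+       */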
+
+       /* *************************************
+       *  Public Access Point - xxhash3
+       ***************************************/
+
+       template <size_t bit_mode>
+       inline hash_t<bit_mode> xxhash3(const void* input, size_t len, uint64_t seed = 0)
+       {
+               static_assert(!(bit_mode != 128 && bit_mode != 64), "xxhash3 can only be used in 64 and 128 bit modes.");
+               return detail3::xxhash3_impl<bit_mode>(input, len, seed);
+       }
+
+       template <size_t bit_mode>
+       inline hash_t<bit_mode> xxhash3(const void* input, size_t len, const void* secret, size_t secretSize, uint64_t seed = 0)
+       {
+               static_assert(!(bit_mode != 128 && bit_mode != 64), "xxhash3 can only be used in 64 and 128 bit modes.");
+               return detail3::xxhash3_impl<bit_mode>(input, len, seed, secret, secretSize);
+       }
+
+       template <size_t bit_mode, typename T>
+       inline hash_t<bit_mode> xxhash3(const std::basic_string<T>& input, uint64_t seed = 0)
+       {
+               static_assert(!(bit_mode != 128 && bit_mode != 64), "xxhash3 can only be used in 64 and 128 bit modes.");
+               return detail3::xxhash3_impl<bit_mode>(static_cast<const void*>(input.data()), input.length() * sizeof(T), seed);
+       }
+
+       template <size_t bit_mode, typename T>
+       inline hash_t<bit_mode> xxhash3(const std::basic_string<T>& input, const void* secret, size_t secretSize, uint64_t seed = 0)
+       {
+               static_assert(!(bit_mode != 128 && bit_mode != 64), "xxhash3 can only be used in 64 and 128 bit modes.");
+               return detail3::xxhash3_impl<bit_mode>(static_cast<const void*>(input.data()), input.length() * sizeof(T), seed, secret, secretSize);
+       }
+
+       template <size_t bit_mode, typename ContiguousIterator>
+       inline hash_t<bit_mode> xxhash3(ContiguousIterator begin, ContiguousIterator end, uint64_t seed = 0)
+       {
+               static_assert(!(bit_mode != 128 && bit_mode != 64), "xxhash3 can only be used in 64 and 128 bit modes.");
+               using T = typename std::decay_t<decltype(*end)>;
+               return detail3::xxhash3_impl<bit_mode>(static_cast<const void*>(&*begin), (end - begin) * sizeof(T), seed);
+       }
+
+       template <size_t bit_mode, typename ContiguousIterator>
+       inline hash_t<bit_mode> xxhash3(ContiguousIterator begin, ContiguousIterator end, const void* secret, size_t secretSize, uint64_t seed = 0)
+       {
+               static_assert(!(bit_mode != 128 && bit_mode != 64), "xxhash3 can only be used in 64 and 128 bit modes.");
+               using T = typename std::decay_t<decltype(*end)>;
+               return detail3::xxhash3_impl<bit_mode>(static_cast<const void*>(&*begin), (end - begin) * sizeof(T), seed, secret, secretSize);
+       }
+
+       template <size_t bit_mode, typename T>
+       inline hash_t<bit_mode> xxhash3(const std::vector<T>& input, uint64_t seed = 0)
+       {
+               static_assert(!(bit_mode != 128 && bit_mode != 64), "xxhash3 can only be used in 64 and 128 bit modes.");
+               return detail3::xxhash3_impl<bit_mode>(static_cast<const void*>(input.data()), input.size() * sizeof(T), seed);
+       }
+
+       template <size_t bit_mode, typename T>
+       inline hash_t<bit_mode> xxhash3(const std::vector<T>& input, const void* secret, size_t secretSize, uint64_t seed = 0)
+       {
+               static_assert(!(bit_mode != 128 && bit_mode != 64), "xxhash3 can only be used in 64 and 128 bit modes.");
+               return detail3::xxhash3_impl<bit_mode>(static_cast<const void*>(input.data()), input.size() * sizeof(T), seed, secret, secretSize);
+       }
+
+       template <size_t bit_mode, typename T, size_t AN>
+       inline hash_t<bit_mode> xxhash3(const std::array<T, AN>& input, uint64_t seed = 0)
+       {
+               static_assert(!(bit_mode != 128 && bit_mode != 64), "xxhash3 can only be used in 64 and 128 bit modes.");
+               return detail3::xxhash3_impl<bit_mode>(static_cast<const void*>(input.data()), AN * sizeof(T), seed);
+       }
+
+       template <size_t bit_mode, typename T, size_t AN>
+       inline hash_t<bit_mode> xxhash3(const std::array<T, AN>& input, const void* secret, size_t secretSize, uint64_t seed = 0)
+       {
+               static_assert(!(bit_mode != 128 && bit_mode != 64), "xxhash3 can only be used in 64 and 128 bit modes.");
+               return detail3::xxhash3_impl<bit_mode>(static_cast<const void*>(input.data()), AN * sizeof(T), seed, secret, secretSize);
+       }
+
+       template <size_t bit_mode, typename T>
+       inline hash_t<bit_mode> xxhash3(const std::initializer_list<T>& input, uint64_t seed = 0)
+       {
+               static_assert(!(bit_mode != 128 && bit_mode != 64), "xxhash3 can only be used in 64 and 128 bit modes.");
+               return detail3::xxhash3_impl<bit_mode>(static_cast<const void*>(input.begin()), input.size() * sizeof(T), seed);
+       }
+
+       template <size_t bit_mode, typename T>
+       inline hash_t<bit_mode> xxhash3(const std::initializer_list<T>& input, const void* secret, size_t secretSize, uint64_t seed = 0)
+       {
+               static_assert(!(bit_mode != 128 && bit_mode != 64), "xxhash3 can only be used in 64 and 128 bit modes.");
+               return detail3::xxhash3_impl<bit_mode>(static_cast<const void*>(input.begin()), input.size() * sizeof(T), seed, secret, secretSize);
+       }
+
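+       /* Illustrative usage, as a sketch (assuming the enclosing xxh namespace):
+       *
+       *      std::vector<uint8_t> data(1024, 0xAB);
+       *      auto h64  = xxh::xxhash3<64>(data);       // 64-bit XXH3
+       *      auto h128 = xxh::xxhash3<128>(data, 42);  // 128-bit XXH3, seed 42
+       *      // h128.low64 and h128.high64 hold the two 64-bit halves
+       */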
+
+       /* *************************************
+       *  Secret Generation Functions
+       ***************************************/
+        
+       inline void generate_secret(void* secret_buffer, size_t secret_size, const void* custom_seed = detail3::default_secret, size_t seed_length = 0)
+       {
+               detail3::generate_secret(secret_buffer, secret_size, custom_seed, seed_length);
+       }
+
+       template <typename T, size_t AN>
+       inline void generate_secret(void* secret_buffer, size_t secret_size, const std::array<T, AN>& custom_seed)
+       {
+               detail3::generate_secret(secret_buffer, secret_size, static_cast<const void*>(custom_seed.data()), AN * sizeof(T));
+       }
+
+       template <typename T>
+       inline void generate_secret(void* secret_buffer, size_t secret_size, const std::initializer_list<T>& custom_seed)
+       {
+               detail3::generate_secret(secret_buffer, secret_size, static_cast<const void*>(custom_seed.begin()), custom_seed.size() * sizeof(T));
+       }
+
+       template <typename T>
+       inline void generate_secret(void* secret_buffer, size_t secret_size, const std::vector<T>& custom_seed)
+       {
+               detail3::generate_secret(secret_buffer, secret_size, static_cast<const void*>(custom_seed.data()), custom_seed.size() * sizeof(T));
+       }
+
+       template <typename T>
+       inline void generate_secret(void* secret_buffer, size_t secret_size, const std::basic_string<T>& custom_seed)
+       {
+               detail3::generate_secret(secret_buffer, secret_size, static_cast<const void*>(custom_seed.data()), custom_seed.length() * sizeof(T));
+       }
+
+       template <typename ContiguousIterator>
+       inline void generate_secret(void* secret_buffer, size_t secret_size, ContiguousIterator begin, ContiguousIterator end)
+       {
+               using T = typename std::decay_t<decltype(*end)>;
+               detail3::generate_secret(secret_buffer, secret_size, static_cast<const void*>(&*begin), (end - begin) * sizeof(T));
+       }
+
+       inline void generate_secret_from_seed(void* secret_buffer, uint64_t seed = 0)
+       {
+               alignas(64) uint8_t custom_secret[detail3::secret_default_size];
+               detail3::init_custom_secret(custom_secret, seed);
+               memcpy(secret_buffer, custom_secret, detail3::secret_default_size);
+       }
+
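+       /* Illustrative usage, as a sketch (assuming the enclosing xxh namespace):
+       *
+       *      std::array<uint8_t, 192> secret;
+       *      std::string entropy = "application-specific seed material";
+       *      xxh::generate_secret(secret.data(), secret.size(), entropy);
+       *      auto h = xxh::xxhash3<64>(entropy, secret.data(), secret.size());
+       *
+       *  Secrets passed to the xxhash3 overloads should be at least
+       *  secret_size_min (136) bytes; generate_secret_from_seed always writes
+       *  secret_default_size (192) bytes. */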
+
+       /* *************************************
+       *  Hash streaming - xxhash
+       ***************************************/
+
+       template <size_t bit_mode>
+       class hash_state_t 
+       {
+               uint64_t total_len = 0;
+               uint_t<bit_mode> v1 = 0, v2 = 0, v3 = 0, v4 = 0;
+               std::array<hash_t<bit_mode>, 4> mem = {0, 0, 0, 0};
+               uint32_t memsize = 0;
+
+               inline void update_impl(const void* input, size_t length)
+               {
+                       const uint8_t* p = reinterpret_cast<const uint8_t*>(input);
+                       const uint8_t* const bEnd = p + length;
+
+                       total_len += length;
+
+                       if (memsize + length < (bit_mode / 2))
+                       {   /* fill in tmp buffer */
+                               memcpy(reinterpret_cast<uint8_t*>(mem.data()) + memsize, input, length);
+                               memsize += static_cast<uint32_t>(length);
+                               return;
+                       }
+
+                       if (memsize > 0)
+                       {   /* some data left from previous update */
+                               memcpy(reinterpret_cast<uint8_t*>(mem.data()) + memsize, input, (bit_mode / 2) - memsize);
+
+                               const uint_t<bit_mode>* ptr = mem.data();
+
+                               v1 = detail::round<bit_mode>(v1, mem_ops::readLE<bit_mode>(ptr)); 
+                               ptr++;
+                               v2 = detail::round<bit_mode>(v2, mem_ops::readLE<bit_mode>(ptr)); 
+                               ptr++;
+                               v3 = detail::round<bit_mode>(v3, mem_ops::readLE<bit_mode>(ptr)); 
+                               ptr++;
+                               v4 = detail::round<bit_mode>(v4, mem_ops::readLE<bit_mode>(ptr));
+
+                               p += (bit_mode / 2) - memsize;
+                               memsize = 0;
+                       }
+
+                       if (p <= bEnd - (bit_mode / 2))
+                       {
+                               const uint8_t* const limit = bEnd - (bit_mode / 2);
+
+                               do
+                               {
+                                       v1 = detail::round<bit_mode>(v1, mem_ops::readLE<bit_mode>(p)); 
+                                       p += (bit_mode / 8);
+                                       v2 = detail::round<bit_mode>(v2, mem_ops::readLE<bit_mode>(p)); 
+                                       p += (bit_mode / 8);
+                                       v3 = detail::round<bit_mode>(v3, mem_ops::readLE<bit_mode>(p)); 
+                                       p += (bit_mode / 8);
+                                       v4 = detail::round<bit_mode>(v4, mem_ops::readLE<bit_mode>(p)); 
+                                       p += (bit_mode / 8);
+                               } 
+                               while (p <= limit);
+                       }
+
+                       if (p < bEnd)
+                       {
+                               memcpy(mem.data(), p, static_cast<size_t>(bEnd - p));
+                               memsize = static_cast<uint32_t>(bEnd - p);
+                       }
+               }
+
+               inline hash_t<bit_mode> digest_impl() const
+               {
+                       const uint8_t* p = reinterpret_cast<const uint8_t*>(mem.data());
+                       const uint8_t* const bEnd = reinterpret_cast<const uint8_t*>(mem.data()) + memsize;
+                       hash_t<bit_mode> hash_ret;
+
+                       if (total_len >= (bit_mode / 2))
+                       {
+                               hash_ret = bit_ops::rotl<bit_mode>(v1, 1) + bit_ops::rotl<bit_mode>(v2, 7) + bit_ops::rotl<bit_mode>(v3, 12) + bit_ops::rotl<bit_mode>(v4, 18);
+
+                               if constexpr (bit_mode == 64)
+                               {
+                                       detail::endian_align_sub_mergeround(hash_ret, v1, v2, v3, v4);
+                               }
+                       }
+                       else 
+                       { 
+                               hash_ret = v3 + detail::PRIME<bit_mode>(5); 
+                       }
+
+                       hash_ret += static_cast<hash_t<bit_mode>>(total_len);
+
+                       return detail::endian_align_sub_ending<bit_mode>(hash_ret, p, bEnd);
+               }
+
+       public:
+
+               hash_state_t(uint_t<bit_mode> seed = 0)
+               {
+                       static_assert(!(bit_mode != 32 && bit_mode != 64), "xxhash streaming can only be used in 32 and 64 bit modes.");
+                       v1 = seed + detail::PRIME<bit_mode>(1) + detail::PRIME<bit_mode>(2);
+                       v2 = seed + detail::PRIME<bit_mode>(2);
+                       v3 = seed + 0;
+                       v4 = seed - detail::PRIME<bit_mode>(1);
+               }
+
+               hash_state_t& operator=(const hash_state_t<bit_mode>& other)
+               {
+                       memcpy(this, &other, sizeof(hash_state_t<bit_mode>));
+                       return *this;
+               }
+
+               void reset(uint_t<bit_mode> seed = 0)
+               {
+                       memset(this, 0, sizeof(hash_state_t<bit_mode>));
+                       v1 = seed + detail::PRIME<bit_mode>(1) + detail::PRIME<bit_mode>(2);
+                       v2 = seed + detail::PRIME<bit_mode>(2);
+                       v3 = seed + 0;
+                       v4 = seed - detail::PRIME<bit_mode>(1);
+               }
+
+               void update(const void* input, size_t length)
+               {
+                       return update_impl(input, length);
+               }
+
+               template <typename T>
+               void update(const std::basic_string<T>& input)
+               {
+                       return update_impl(static_cast<const void*>(input.data()), input.length() * sizeof(T));
+               }
+
+               template <typename ContiguousIterator>
+               void update(ContiguousIterator begin, ContiguousIterator end)
+               {
+                       using T = typename std::decay_t<decltype(*end)>;
+                       return update_impl(static_cast<const void*>(&*begin), (end - begin) * sizeof(T));
+               }
+
+               template <typename T>
+               void update(const std::vector<T>& input)
+               {
+                       return update_impl(static_cast<const void*>(input.data()), input.size() * sizeof(T));
+               }
+
+               template <typename T, size_t AN>
+               void update(const std::array<T, AN>& input)
+               {
+                       return update_impl(static_cast<const void*>(input.data()), AN * sizeof(T));
+               }
+
+               template <typename T>
+               void update(const std::initializer_list<T>& input)
+               {
+                       return update_impl(static_cast<const void*>(input.begin()), input.size() * sizeof(T));
+               }
+
+               hash_t<bit_mode> digest() const
+               {
+                       return digest_impl();
+               }
+       };
+
+       using hash_state32_t = hash_state_t<32>;
+       using hash_state64_t = hash_state_t<64>;
+
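+       /* Illustrative streaming usage, as a sketch (assuming the enclosing xxh
+       *  namespace):
+       *
+       *      xxh::hash_state64_t state(42);          // seeded 64-bit state
+       *      state.update(std::string("chunk 1"));
+       *      state.update(std::string("chunk 2"));
+       *      auto h = state.digest();                // digest() leaves the state intact
+       */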
+
+       /* *************************************
+       *  Hash streaming - xxhash3
+       ***************************************/
+
+       template <size_t bit_mode>
+       class alignas(64) hash3_state_t 
+       {   
+               constexpr static int internal_buffer_size = 256;
+               constexpr static int internal_buffer_stripes = (internal_buffer_size / detail3::stripe_len);
+       
+               alignas(64) uint64_t acc[8];
+               alignas(64) uint8_t customSecret[detail3::secret_default_size];  /* used to store a custom secret generated from the seed. Makes state larger. Design might change */
+               alignas(64) uint8_t buffer[internal_buffer_size];
+               uint32_t bufferedSize = 0;
+               uint32_t nbStripesPerBlock = 0;
+               uint32_t nbStripesSoFar = 0;
+               uint32_t secretLimit = 0;
+               uint32_t reserved32 = 0;
+               uint32_t reserved32_2 = 0;
+               uint64_t totalLen = 0;
+               uint64_t seed = 0;
+               bool useSeed = false;
+               uint64_t reserved64 = 0;
+               const uint8_t* secret = nullptr;    /* note : there is some padding after, due to alignment on 64 bytes */
+
+
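+               /* Feeds totalStripes stripes into acc, tracking the position inside the
+               *  current block through nbStripesSoFar and scrambling the accumulator
+               *  when a block boundary is reached. */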
+               void consume_stripes(uint64_t* acc, uint32_t& nbStripesSoFar, size_t totalStripes, const uint8_t* input)
+               {
+                       if (nbStripesPerBlock - nbStripesSoFar <= totalStripes) /* need a scrambling operation */
+                       {                       
+                               size_t const nbStripes = nbStripesPerBlock - nbStripesSoFar;
+
+                               detail3::accumulate(acc, input, secret + (nbStripesSoFar * detail3::secret_consume_rate), nbStripes);
+                               detail3::scramble_acc(acc, secret + secretLimit);
+                               detail3::accumulate(acc, input + nbStripes * detail3::stripe_len, secret, totalStripes - nbStripes);
+                               nbStripesSoFar = (uint32_t)(totalStripes - nbStripes);
+                       }
+                       else 
+                       {
+                               detail3::accumulate(acc, input, secret + (nbStripesSoFar * detail3::secret_consume_rate), totalStripes);
+                               nbStripesSoFar += (uint32_t)totalStripes;
+                       }
+               }
+
+               void update_impl(const void* input_, size_t len)
+               {
+                       const uint8_t* input = static_cast<const uint8_t*>(input_);
+                       const uint8_t* const bEnd = input + len;
+
+                       totalLen += len;
+
+                       if (bufferedSize + len <= internal_buffer_size) 
+                       {  /* fill in tmp buffer */
+                               memcpy(buffer + bufferedSize, input, len);
+                               bufferedSize += (uint32_t)len;
+                               return;
+                       }
+                       /* input is now larger than internal_buffer_size */
+
+                       if (bufferedSize > 0) 
+                       {   /* some input within internal buffer: fill then consume it */
+                               size_t const loadSize = internal_buffer_size - bufferedSize;
+
+                               memcpy(buffer + bufferedSize, input, loadSize);
+                               input += loadSize;
+                               consume_stripes(acc, nbStripesSoFar, internal_buffer_stripes, buffer);
+                               bufferedSize = 0;
+                       }
+
+                       /* consume input by full buffer quantities */
+                       if (input + internal_buffer_size <= bEnd) 
+                       {
+                               const uint8_t* const limit = bEnd - internal_buffer_size;
+
+                               do 
+                               {
+                                       consume_stripes(acc, nbStripesSoFar, internal_buffer_stripes, input);
+                                       input += internal_buffer_size;
+                               } 
+                               while (input < limit);
+
+                               memcpy(buffer + sizeof(buffer) - detail3::stripe_len, input - detail3::stripe_len, detail3::stripe_len);
+                       }
+
+                       if (input < bEnd) 
+                       { /* some remaining input: buffer it */
+                               memcpy(buffer, input, (size_t)(bEnd - input));
+                               bufferedSize = (uint32_t)(bEnd - input);
+                       }
+               }
+
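+               /* Fold the streamed data into `acc_` without touching the live state: the
+                * member accumulators are copied first, so the state can keep ingesting input
+                * after a digest. The final (possibly partial) stripe is handled separately. */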
+               void digest_long(uint64_t* acc_)
+               {
+                       memcpy(acc_, acc, sizeof(acc));  /* digest locally, state remains unaltered, and can continue ingesting more input afterwards */
+
+                       if (bufferedSize >= detail3::stripe_len) 
+                       {
+                               size_t const totalNbStripes = (bufferedSize - 1) / detail3::stripe_len;
+                               uint32_t nbStripesSoFar = this->nbStripesSoFar;
+
+                               consume_stripes(acc_, nbStripesSoFar, totalNbStripes, buffer);
+
+                               /* one last partial stripe */
+                               detail3::accumulate_512(acc_, buffer + bufferedSize - detail3::stripe_len, secret + secretLimit - detail3::secret_lastacc_start);
+                       }
+                       else 
+                       {  /* bufferedSize < STRIPE_LEN */
+                               /* one last stripe */
+                               uint8_t lastStripe[detail3::stripe_len];
+                               size_t const catchupSize = detail3::stripe_len - bufferedSize;
+                               memcpy(lastStripe, buffer + sizeof(buffer) - catchupSize, catchupSize);
+                               memcpy(lastStripe + catchupSize, buffer, bufferedSize);
+                               detail3::accumulate_512(acc_, lastStripe, secret + secretLimit - detail3::secret_lastacc_start);
+                       }
+               }
+
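+               /* Shared reset logic: zero the state, restore the initial accumulator values,
+                * and derive secretLimit and nbStripesPerBlock from the given secret's size. */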
+               void reset_internal(uint64_t seed_reset, const void* secret_reset, size_t secret_size)
+               {
+                       memset(this, 0, sizeof(*this));
+                       memcpy(acc, detail3::init_acc.data(), sizeof(detail3::init_acc));
+                       seed = seed_reset;
+                       useSeed = (seed != 0);
+                       secret = (const uint8_t*)secret_reset;
+                       secretLimit = (uint32_t)(secret_size - detail3::stripe_len);
+                       nbStripesPerBlock = secretLimit / detail3::secret_consume_rate;
+               }
+
+       public:
+
+               hash3_state_t& operator=(const hash3_state_t& other)
+               {
+                       memcpy(this, &other, sizeof(hash3_state_t));
+                       return *this;
+               }
+
+               hash3_state_t(uint64_t seed = 0)
+               {
+                       static_assert(bit_mode == 64 || bit_mode == 128, "xxhash3 streaming can only be used in 64 and 128 bit modes.");
+                       reset(seed);
+               }
+
+               hash3_state_t(const void* secret, size_t secretSize, uint64_t seed = 0)
+               {
+                       static_assert(bit_mode == 64 || bit_mode == 128, "xxhash3 streaming can only be used in 64 and 128 bit modes.");
+                       reset(secret, secretSize, seed);
+               }
+
+               void reset(uint64_t seed = 0)
+               { 
+                       reset_internal(seed, detail3::default_secret, detail3::secret_default_size);
+                       detail3::init_custom_secret(customSecret, seed);
+                       secret = customSecret;
+               }
+
+               void reset(const void* secret, size_t secretSize, uint64_t seed = 0)
+               {
+                       reset_internal(seed, secret, secretSize);
+                       useSeed = true;
+               }
+
+               void update(const void* input, size_t len)
+               {
+                       return update_impl(input, len);
+               }
+
+               template <typename T>
+               void update(const std::basic_string<T>& input)
+               {
+                       return update_impl(static_cast<const void*>(input.data()), input.length() * sizeof(T));
+               }
+
+               template <typename ContiguousIterator>
+               void update(ContiguousIterator begin, ContiguousIterator end)
+               {
+                       using T = std::decay_t<decltype(*end)>;
+                       return update_impl(static_cast<const void*>(&*begin), (end - begin) * sizeof(T));
+               }
+
+               template <typename T>
+               void update(const std::vector<T>& input)
+               {
+                       return update_impl(static_cast<const void*>(input.data()), input.size() * sizeof(T));
+               }
+
+               template <typename T, size_t AN>
+               void update(const std::array<T, AN>& input)
+               {
+                       return update_impl(static_cast<const void*>(input.data()), AN * sizeof(T));
+               }
+
+               template <typename T>
+               void update(const std::initializer_list<T>& input)
+               {
+                       return update_impl(static_cast<const void*>(input.begin()), input.size() * sizeof(T));
+               }
+
+               hash_t<bit_mode> digest()
+               {       
+                       if (totalLen > detail3::midsize_max) 
+                       {
+                               alignas(128) hash64_t acc[detail3::acc_nb];
+                               
+                               digest_long(acc);
+
+                               if constexpr (bit_mode == 64)
+                               {
+                                       return detail3::merge_accs(acc, secret + detail3::secret_mergeaccs_start, (uint64_t)totalLen * detail::PRIME<64>(1));
+                               }
+                               else
+                               {
+                                       uint64_t const low64 = detail3::merge_accs(acc, secret + detail3::secret_mergeaccs_start, (uint64_t)totalLen * detail::PRIME<64>(1));
+                                       uint64_t const high64 = detail3::merge_accs(acc, secret + secretLimit + detail3::stripe_len - sizeof(acc) - detail3::secret_mergeaccs_start, ~((uint64_t)totalLen * detail::PRIME<64>(2)));
+
+                                       return { low64, high64 };
+                               }
+                       }
+                       else
+                       {
+                               return detail3::xxhash3_impl<bit_mode>(buffer, totalLen, seed, secret, secretLimit + detail3::stripe_len);
+                       }
+               }
+       };
+
+       using hash3_state64_t = hash3_state_t<64>;
+       using hash3_state128_t = hash3_state_t<128>;
 }
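The hash3_state_t template above gives XXH3 the same incremental interface the classic hashes had. A minimal usage sketch, assuming hash_t<64> resolves to uint64_t as elsewhere in this header (the include path, seed, and inputs are illustrative):

    #include <cstdint>
    #include <string>
    #include "xxhash.hpp"   // illustrative include path for this header

    int main()
    {
        xxh::hash3_state64_t state(42);        // streaming XXH3-64, seeded with 42
        std::string part1 = "hello ";
        std::string part2 = "world";
        state.update(part1);                   // feed input incrementally
        state.update(part2);
        std::uint64_t h = state.digest();      // digest() leaves the state intact,
        state.update(part1);                   // so more input may still follow
        return h == 0;
    }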
-
-/* *******************************************************************
- *  Hash streaming
- *********************************************************************/
-enum class error_code : uint8_t { ok = 0, error };
-
-template <size_t N> class hash_state_t {
-  uint64_t total_len = 0;
-  hash_t<N> v1 = 0, v2 = 0, v3 = 0, v4 = 0;
-  std::array<hash_t<N>, 4> mem = {{0, 0, 0, 0}};
-  uint32_t memsize             = 0;
-
-  inline error_code _update_impl(const void* input, size_t length, endianness endian)
-  {
-    const uint8_t* p          = reinterpret_cast<const uint8_t*>(input);
-    const uint8_t* const bEnd = p + length;
-
-    if (!input) {
-      return xxh::error_code::error;
-    }
-
-    total_len += length;
-
-    if (memsize + length < (N / 2)) { /* fill in tmp buffer */
-      memcpy(reinterpret_cast<uint8_t*>(mem.data()) + memsize, input, length);
-      memsize += static_cast<uint32_t>(length);
-      return error_code::ok;
-    }
-
-    if (memsize) { /* some data left from previous update */
-      memcpy(reinterpret_cast<uint8_t*>(mem.data()) + memsize, input, (N / 2) - memsize);
-
-      const hash_t<N>* ptr = mem.data();
-      v1                   = detail::round<N>(v1, mem_ops::readLE<N>(ptr, endian));
-      ptr++;
-      v2 = detail::round<N>(v2, mem_ops::readLE<N>(ptr, endian));
-      ptr++;
-      v3 = detail::round<N>(v3, mem_ops::readLE<N>(ptr, endian));
-      ptr++;
-      v4 = detail::round<N>(v4, mem_ops::readLE<N>(ptr, endian));
-
-      p += (N / 2) - memsize;
-      memsize = 0;
-    }
-
-    if (p <= bEnd - (N / 2)) {
-      const uint8_t* const limit = bEnd - (N / 2);
-
-      do {
-        v1 = detail::round<N>(v1, mem_ops::readLE<N>(p, endian));
-        p += (N / 8);
-        v2 = detail::round<N>(v2, mem_ops::readLE<N>(p, endian));
-        p += (N / 8);
-        v3 = detail::round<N>(v3, mem_ops::readLE<N>(p, endian));
-        p += (N / 8);
-        v4 = detail::round<N>(v4, mem_ops::readLE<N>(p, endian));
-        p += (N / 8);
-      } while (p <= limit);
-    }
-
-    if (p < bEnd) {
-      memcpy(mem.data(), p, static_cast<size_t>(bEnd - p));
-      memsize = static_cast<uint32_t>(bEnd - p);
-    }
-
-    return error_code::ok;
-  }
-
-  inline hash_t<N> _digest_impl(endianness endian) const
-  {
-    const uint8_t* p          = reinterpret_cast<const uint8_t*>(mem.data());
-    const uint8_t* const bEnd = reinterpret_cast<const uint8_t*>(mem.data()) + memsize;
-    hash_t<N> hash_ret;
-
-    if (total_len > (N / 2)) {
-      hash_ret =
-          bit_ops::rotl<N>(v1, 1) + bit_ops::rotl<N>(v2, 7) + bit_ops::rotl<N>(v3, 12) + bit_ops::rotl<N>(v4, 18);
-
-      detail::endian_align_sub_mergeround<N>(hash_ret, v1, v2, v3, v4);
-    } else {
-      hash_ret = v3 + detail::PRIME<N>(5);
-    }
-
-    hash_ret += static_cast<hash_t<N>>(total_len);
-
-    return detail::endian_align_sub_ending<N>(hash_ret, p, bEnd, endian, alignment::unaligned);
-  }
-
-public:
-  hash_state_t(hash_t<N> seed = 0)
-  {
-    static_assert(!(N != 32 && N != 64), "You can only stream hashing in 32 or 64 bit mode.");
-    v1 = seed + detail::PRIME<N>(1) + detail::PRIME<N>(2);
-    v2 = seed + detail::PRIME<N>(2);
-    v3 = seed + 0;
-    v4 = seed - detail::PRIME<N>(1);
-  };
-
-  hash_state_t operator=(hash_state_t<N>& other) { memcpy(this, other, sizeof(hash_state_t<N>)); }
-
-  error_code reset(hash_t<N> seed = 0)
-  {
-    memset(this, 0, sizeof(hash_state_t<N>));
-    v1 = seed + detail::PRIME<N>(1) + detail::PRIME<N>(2);
-    v2 = seed + detail::PRIME<N>(2);
-    v3 = seed + 0;
-    v4 = seed - detail::PRIME<N>(1);
-    return error_code::ok;
-  }
-
-  error_code update(const void* input, size_t length, endianness endian = endianness::unspecified)
-  {
-    return _update_impl(input, length, mem_ops::get_endian(endian));
-  }
-
-  template <typename T>
-  error_code update(const std::basic_string<T>& input, endianness endian = endianness::unspecified)
-  {
-    return _update_impl(static_cast<const void*>(input.data()), input.length() * sizeof(T),
-                        mem_ops::get_endian(endian));
-  }
-
-  template <typename ContiguousIterator>
-  error_code update(ContiguousIterator begin, ContiguousIterator end, endianness endian = endianness::unspecified)
-  {
-    using T = typename std::decay_t<decltype(*end)>;
-    return _update_impl(static_cast<const void*>(&*begin), (end - begin) * sizeof(T), mem_ops::get_endian(endian));
-  }
-
-  template <typename T> error_code update(const std::vector<T>& input, endianness endian = endianness::unspecified)
-  {
-    return _update_impl(static_cast<const void*>(input.data()), input.size() * sizeof(T), mem_ops::get_endian(endian));
-  }
-
-  template <typename T, size_t AN>
-  error_code update(const std::array<T, AN>& input, endianness endian = endianness::unspecified)
-  {
-    return _update_impl(static_cast<const void*>(input.data()), AN * sizeof(T), mem_ops::get_endian(endian));
-  }
-
-  template <typename T>
-  error_code update(const std::initializer_list<T>& input, endianness endian = endianness::unspecified)
-  {
-    return _update_impl(static_cast<const void*>(input.begin()), input.size() * sizeof(T), mem_ops::get_endian(endian));
-  }
-
-  hash_t<N> digest(endianness endian = endianness::unspecified) { return _digest_impl(mem_ops::get_endian(endian)); }
-};
-
-using hash_state32_t = hash_state_t<32>;
-using hash_state64_t = hash_state_t<64>;
-
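Unlike the new XXH3 states, whose update() returns void, this older hash_state_t interface reported failures through an error_code return value. A minimal sketch of the call pattern it supported (include path and buffer contents are illustrative):

    #include <cstdint>
    #include <vector>
    #include "xxhash.hpp"   // illustrative include path

    int main()
    {
        xxh::hash_state64_t st;                       // classic streaming XXH64 state
        std::vector<std::uint8_t> buf = {1, 2, 3, 4};
        if (st.update(buf) != xxh::error_code::ok)    // each update() reports ok/error
            return 1;
        std::uint64_t h = st.digest();                // digest() is non-destructive
        return h == 0;
    }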
-/* *******************************************************************
- *  Canonical
- *********************************************************************/
-
-template <size_t N> struct canonical_t {
-  std::array<uint8_t, N / 8> digest;
-
-  canonical_t(hash_t<N> hash)
-  {
-    if (mem_ops::is_little_endian()) {
-      hash = bit_ops::swap<N>(hash);
-    }
-    memcpy(digest.data(), &hash, sizeof(canonical_t<N>));
-  }
-
-  hash_t<N> get_hash() const { return mem_ops::readBE<N>(&digest); }
-};
-
-using canonical32_t = canonical_t<32>;
-using canonical64_t = canonical_t<64>;
-} // namespace xxh
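The canonical_t helper in the hunk above stores a finished hash in big-endian (canonical) byte order, so digests can be written out and compared across platforms. A sketch of the round-trip it provides (include path and value are illustrative):

    #include <cstdint>
    #include "xxhash.hpp"   // illustrative include path

    int main()
    {
        std::uint64_t h = 0x0123456789abcdefULL;
        xxh::canonical64_t canon(h);              // byte-swapped on little-endian hosts
        return canon.get_hash() == h ? 0 : 1;     // reads back identically either way
    }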