
Public GIT Repository
Upgrade embedded xxhash (-> v0.8.1).
[simgrid.git] / src / 3rd-party / xxhash.hpp
#pragma once
#include <cstdint>
#include <cstring>
#include <array>
#include <type_traits>
#include <vector>
#include <string>

/*
xxHash - Extremely Fast Hash algorithm
Header File
Copyright (C) 2012-2022, Yann Collet.
Copyright (C) 2017-2022, Red Gavin.
All rights reserved.

BSD 2-Clause License (http://www.opensource.org/licenses/bsd-license.php)
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
* Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
* Redistributions in binary form must reproduce the above
copyright notice, this list of conditions and the following disclaimer
in the documentation and/or other materials provided with the
distribution.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
You can contact the author at:
- xxHash source repository: https://github.com/Cyan4973/xxHash
- xxHash C++ port repository: https://github.com/RedSpah/xxhash_cpp
*/

/* Intrinsics
* Sadly has to be included in the global namespace or literally everything breaks
*/
#include <immintrin.h>

namespace xxh
{
	/* *************************************
	*  Versioning
	***************************************/

	namespace version
	{
		constexpr int cpp_version_major = 0;
		constexpr int cpp_version_minor = 8;
		constexpr int cpp_version_release = 1;
	}

	constexpr uint32_t version_number()
	{
		return version::cpp_version_major * 10000 + version::cpp_version_minor * 100 + version::cpp_version_release;
	}
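
	/* Illustrative check (not additional API): the decimal packing above makes
	 * this v0.8.1 port report 801. */
	static_assert(version_number() == 801, "0.8.1 -> 0 * 10000 + 8 * 100 + 1");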


	/* *************************************
	*  Basic Types - Predefining uint128_t for intrin
	***************************************/

	namespace typedefs
	{
		struct alignas(16) uint128_t
		{
			uint64_t low64 = 0;
			uint64_t high64 = 0;

			bool operator==(const uint128_t& other) const
			{
				return (low64 == other.low64 && high64 == other.high64);
			}

			bool operator>(const uint128_t& other) const
			{
				// Lexicographic 128-bit compare: the high word decides; the low word only breaks ties.
				return (high64 > other.high64 || (high64 == other.high64 && low64 > other.low64));
			}

			bool operator>=(const uint128_t& other) const
			{
				return (*this > other || *this == other);
			}

			bool operator<(const uint128_t& other) const
			{
				return !(*this >= other);
			}

			bool operator<=(const uint128_t& other) const
			{
				return !(*this > other);
			}

			bool operator!=(const uint128_t& other) const
			{
				return !(*this == other);
			}

			uint128_t(uint64_t low, uint64_t high) : low64(low), high64(high) {}

			uint128_t() {}
		};
	}

	using uint128_t = typedefs::uint128_t;
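
	/* Sketch (illustrative helper, not additional API): uint128_t orders as a
	 * single 128-bit value, so the high word dominates the comparison.
	 * Assumes the corrected lexicographic operator> above. */
	inline bool example_uint128_order()
	{
		return uint128_t(5, 0) < uint128_t(0, 1); // true: {high=0} < {high=1}
	}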


	/* *************************************
	*  Compiler / Platform Specific Features
	***************************************/

	namespace intrin
	{
		/*!XXH_CPU_LITTLE_ENDIAN :
		* This is a CPU endianness detection macro, automatically set to 1
		* (little endian) if left undefined.
		* When compiling for a big endian system, XXH_CPU_LITTLE_ENDIAN must be
		* explicitly defined as 0.
		*/
#ifndef XXH_CPU_LITTLE_ENDIAN
#	define XXH_CPU_LITTLE_ENDIAN 1
#endif


		/* Vectorization Detection
		* NOTE: XXH_NEON and XXH_VSX aren't supported in this C++ port.
		* The primary reason is that I don't have access to ARM or PowerPC
		* machines to test them on, and the secondary reason is that I doubt anyone
		* writing code for such machines would bother using a C++ port rather than
		* the original C version.
		*/
#ifndef XXH_VECTOR   /* can be predefined on command line */
#	if defined(__AVX512F__)
#		define XXH_VECTOR 3 /* AVX512 for Skylake and Icelake */
#	elif defined(__AVX2__)
#		define XXH_VECTOR 2 /* AVX2 for Haswell and Bulldozer */
#	elif defined(__SSE2__) || defined(_M_AMD64) || defined(_M_X64) || (defined(_M_IX86_FP) && (_M_IX86_FP == 2))
#		define XXH_VECTOR 1 /* SSE2 for Pentium 4 and all x86_64 */
#	else
#		define XXH_VECTOR 0 /* Portable scalar version */
#	endif
#endif

		constexpr int vector_mode = XXH_VECTOR;

#if XXH_VECTOR == 3		/* AVX512 for Skylake and Icelake */
		constexpr int acc_align = 64;
		using avx512_underlying = __m512i;
		using avx2_underlying = __m256i;
		using sse2_underlying = __m128i;
#elif XXH_VECTOR == 2		/* AVX2 for Haswell and Bulldozer */
		constexpr int acc_align = 32;
		using avx512_underlying = void;
		using avx2_underlying = __m256i;
		using sse2_underlying = __m128i;
#elif XXH_VECTOR == 1		/* SSE2 for Pentium 4 and all x86_64 */
		constexpr int acc_align = 16;
		using avx512_underlying = void;
		using avx2_underlying = void; //std::array<__m128i, 2>;
		using sse2_underlying = __m128i;
#else				/* Portable scalar version */
		constexpr int acc_align = 8;
		using avx512_underlying = void;
		using avx2_underlying = void; //std::array<uint64_t, 4>;
		using sse2_underlying = void; //std::array<uint64_t, 2>;
#endif
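
		/* Sketch: the vector mode can be forced from the build line, e.g.
		 *   g++ -std=c++17 -DXXH_VECTOR=0 ...   (portable scalar path only)
		 * The accumulator alignment always mirrors the selected lane width. */
		static_assert(acc_align == (8 << XXH_VECTOR), "acc_align follows the selected XXH_VECTOR mode");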


		/* Compiler Specifics
		* Defines inline macros and includes the specific compiler's intrinsics.
		* */
#ifdef XXH_FORCE_INLINE /* First undefining the symbols in case they're already defined */
#	undef XXH_FORCE_INLINE
#endif
#ifdef XXH_NO_INLINE
#	undef XXH_NO_INLINE
#endif

#ifdef _MSC_VER    /* Visual Studio */
#	pragma warning(disable : 4127)
#	define XXH_FORCE_INLINE static __forceinline
#	define XXH_NO_INLINE static __declspec(noinline)
#	include <intrin.h>
#elif defined(__GNUC__)  /* Clang / GCC */
#	define XXH_FORCE_INLINE static inline __attribute__((always_inline))
#	define XXH_NO_INLINE static __attribute__((noinline))
#	include <mmintrin.h>
#else
#	define XXH_FORCE_INLINE static inline
#	define XXH_NO_INLINE static
#endif


		/* Prefetch
		* Can be disabled by defining XXH_NO_PREFETCH
		*/
#if defined(XXH_NO_PREFETCH)
		XXH_FORCE_INLINE void prefetch(const void* ptr) {}
#elif defined(_MSC_VER) && (defined(_M_X64) || defined(_M_IX86))
		XXH_FORCE_INLINE void prefetch(const void* ptr) { _mm_prefetch((const char*)(ptr), _MM_HINT_T0); }
#elif defined(__GNUC__)
		XXH_FORCE_INLINE void prefetch(const void* ptr) { __builtin_prefetch((ptr), 0, 3); }
#else
		XXH_FORCE_INLINE void prefetch(const void* ptr) {}
#endif


		/* Restrict
		* Defines a macro for restrict, which in C++ is sadly just a compiler extension (for now).
		* Can be disabled by defining XXH_NO_RESTRICT
		*/
#ifdef XXH_RESTRICT
#	undef XXH_RESTRICT
#endif

#if (defined(__GNUC__) || defined(_MSC_VER)) && defined(__cplusplus) && !defined(XXH_NO_RESTRICT)
#	define XXH_RESTRICT  __restrict
#else
#	define XXH_RESTRICT
#endif


		/* Likely / Unlikely
		* Defines macros for Likely / Unlikely, which are official in C++20, but sadly this library targets an earlier standard.
		* Not present on MSVC.
		* Can be disabled by defining XXH_NO_BRANCH_HINTS
		*/
#if ((defined(__GNUC__) && (__GNUC__ >= 3)) || (defined(__INTEL_COMPILER) && (__INTEL_COMPILER >= 800)) || defined(__clang__)) && !defined(XXH_NO_BRANCH_HINTS)
#	define XXH_likely(x) __builtin_expect(x, 1)
#	define XXH_unlikely(x) __builtin_expect(x, 0)
#else
#	define XXH_likely(x) (x)
#	define XXH_unlikely(x) (x)
#endif


		namespace bit_ops
		{
#if defined(_MSC_VER)
			static inline uint32_t rotl32(uint32_t x, int32_t r) { return _rotl(x, r); }
			static inline uint64_t rotl64(uint64_t x, int32_t r) { return _rotl64(x, r); }
			static inline uint32_t rotr32(uint32_t x, int32_t r) { return _rotr(x, r); }
			static inline uint64_t rotr64(uint64_t x, int32_t r) { return _rotr64(x, r); }
#else
			static inline uint32_t rotl32(uint32_t x, int32_t r) { return ((x << r) | (x >> (32 - r))); }
			static inline uint64_t rotl64(uint64_t x, int32_t r) { return ((x << r) | (x >> (64 - r))); }
			static inline uint32_t rotr32(uint32_t x, int32_t r) { return ((x >> r) | (x << (32 - r))); }
			static inline uint64_t rotr64(uint64_t x, int32_t r) { return ((x >> r) | (x << (64 - r))); }
#endif


#if defined(_MSC_VER)     /* Visual Studio */
			static inline uint32_t swap32(uint32_t x) { return _byteswap_ulong(x); }
			static inline uint64_t swap64(uint64_t x) { return _byteswap_uint64(x); }
#elif defined(__GNUC__)
			static inline uint32_t swap32(uint32_t x) { return __builtin_bswap32(x); }
			static inline uint64_t swap64(uint64_t x) { return __builtin_bswap64(x); }
#else
			static inline uint32_t swap32(uint32_t x) { return ((x << 24) & 0xff000000) | ((x << 8) & 0x00ff0000) | ((x >> 8) & 0x0000ff00) | ((x >> 24) & 0x000000ff); }
			static inline uint64_t swap64(uint64_t x) { return ((x << 56) & 0xff00000000000000ULL) | ((x << 40) & 0x00ff000000000000ULL) | ((x << 24) & 0x0000ff0000000000ULL) | ((x << 8) & 0x000000ff00000000ULL) | ((x >> 8) & 0x00000000ff000000ULL) | ((x >> 24) & 0x0000000000ff0000ULL) | ((x >> 40) & 0x000000000000ff00ULL) | ((x >> 56) & 0x00000000000000ffULL); }
#endif


#if defined(_MSC_VER) && defined(_M_IX86) // Only for 32-bit MSVC.
			XXH_FORCE_INLINE uint64_t mult32to64(uint32_t x, uint32_t y) { return __emulu(x, y); }
#else
			XXH_FORCE_INLINE uint64_t mult32to64(uint32_t x, uint32_t y) { return (uint64_t)(uint32_t)(x) * (uint64_t)(uint32_t)(y); }
#endif


#if defined(__GNUC__) && !defined(__clang__) && defined(__i386__)
			__attribute__((__target__("no-sse")))
#endif
			static inline uint128_t mult64to128(uint64_t lhs, uint64_t rhs)
			{
#if defined(__GNUC__) && !defined(__wasm__) \
    && defined(__SIZEOF_INT128__) \
    || (defined(_INTEGRAL_MAX_BITS) && _INTEGRAL_MAX_BITS >= 128)

				__uint128_t product = (__uint128_t)lhs * (__uint128_t)rhs;
				uint128_t r128;
				r128.low64 = (uint64_t)(product);
				r128.high64 = (uint64_t)(product >> 64);
				return r128;

#elif defined(_M_X64) || defined(_M_IA64)

#ifndef _MSC_VER
#	pragma intrinsic(_umul128)
#endif
				uint64_t product_high;
				uint64_t const product_low = _umul128(lhs, rhs, &product_high);
				uint128_t r128;
				r128.low64 = product_low;
				r128.high64 = product_high;
				return r128;

#else
				uint64_t const lo_lo = bit_ops::mult32to64(lhs & 0xFFFFFFFF, rhs & 0xFFFFFFFF);
				uint64_t const hi_lo = bit_ops::mult32to64(lhs >> 32, rhs & 0xFFFFFFFF);
				uint64_t const lo_hi = bit_ops::mult32to64(lhs & 0xFFFFFFFF, rhs >> 32);
				uint64_t const hi_hi = bit_ops::mult32to64(lhs >> 32, rhs >> 32);

				/* Now add the products together. These will never overflow. */
				uint64_t const cross = (lo_lo >> 32) + (hi_lo & 0xFFFFFFFF) + lo_hi;
				uint64_t const upper = (hi_lo >> 32) + (cross >> 32) + hi_hi;
				uint64_t const lower = (cross << 32) | (lo_lo & 0xFFFFFFFF);

				uint128_t r128;
				r128.low64 = lower;
				r128.high64 = upper;
				return r128;
#endif
			}
		}
	}
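
	/* Sketch (illustrative helper, not additional API): the portable branch of
	 * mult64to128 reassembles the full 128-bit product from four 32x32 partial
	 * products, e.g. 0xFFFFFFFFFFFFFFFF * 2 == { low64 = 0xFFFFFFFFFFFFFFFE, high64 = 1 }. */
	inline uint128_t example_full_product()
	{
		return intrin::bit_ops::mult64to128(0xFFFFFFFFFFFFFFFFULL, 2ULL);
	}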


	/* *************************************
	*  Basic Types - Everything else
	***************************************/

	namespace typedefs
	{
		/* *************************************
		*  Basic Types - Detail
		***************************************/

		template <size_t N>
		struct hash_type
		{
			using type = void;
		};

		template <>
		struct hash_type<32>
		{
			using type = uint32_t;
		};

		template <>
		struct hash_type<64>
		{
			using type = uint64_t;
		};

		template <>
		struct hash_type<128>
		{
			using type = uint128_t;
		};


		template <size_t N>
		struct vec_type
		{
			using type = void;
		};

		template <>
		struct vec_type<64>
		{
			using type = uint64_t;
		};

		template <>
		struct vec_type<128>
		{
			using type = intrin::sse2_underlying;
		};

		template <>
		struct vec_type<256>
		{
			using type = intrin::avx2_underlying;
		};

		template <>
		struct vec_type<512>
		{
			using type = intrin::avx512_underlying;
		};

		/* Rationale
		* On the surface, uint_type appears pointless: it maps to exactly the
		* same types as hash_type. The distinction is aimed at humans rather
		* than the compiler, separating values that are 'just' numbers from
		* those that represent actual hash values.
		*/
		template <size_t N>
		struct uint_type
		{
			using type = void;
		};

		template <>
		struct uint_type<32>
		{
			using type = uint32_t;
		};

		template <>
		struct uint_type<64>
		{
			using type = uint64_t;
		};

		template <>
		struct uint_type<128>
		{
			using type = uint128_t;
		};
	}

	template <size_t N>
	using hash_t = typename typedefs::hash_type<N>::type;
	using hash32_t = hash_t<32>;
	using hash64_t = hash_t<64>;
	using hash128_t = hash_t<128>;

	template <size_t N>
	using vec_t = typename typedefs::vec_type<N>::type;
	using vec64_t = vec_t<64>;
	using vec128_t = vec_t<128>;
	using vec256_t = vec_t<256>;
	using vec512_t = vec_t<512>;

	template <size_t N>
	using uint_t = typename typedefs::uint_type<N>::type;

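	/* Illustrative checks (not additional API): the aliases resolve to plain
	 * integer types for the scalar widths. */
	static_assert(std::is_same<hash_t<64>, uint64_t>::value, "64 bit hashes are plain uint64_t");
	static_assert(std::is_same<uint_t<32>, uint32_t>::value, "32 bit words are plain uint32_t");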


	/* *************************************
	*  Bit Operations
	***************************************/

	namespace bit_ops
	{
		/* ****************************************
		*  Bit Operations
		******************************************/

		template <size_t N>
		static inline uint_t<N> rotl(uint_t<N> n, int32_t r)
		{
			if constexpr (N == 32)
			{
				return intrin::bit_ops::rotl32(n, r);
			}

			if constexpr (N == 64)
			{
				return intrin::bit_ops::rotl64(n, r);
			}
		}

		template <size_t N>
		static inline uint_t<N> rotr(uint_t<N> n, int32_t r)
		{
			if constexpr (N == 32)
			{
				return intrin::bit_ops::rotr32(n, r);
			}

			if constexpr (N == 64)
			{
				return intrin::bit_ops::rotr64(n, r);
			}
		}

		template <size_t N>
		static inline uint_t<N> swap(uint_t<N> n)
		{
			if constexpr (N == 32)
			{
				return intrin::bit_ops::swap32(n);
			}

			if constexpr (N == 64)
			{
				return intrin::bit_ops::swap64(n);
			}
		}

		template <size_t N = 64>
		static inline vec_t<N> mul32to64(vec_t<N> x, vec_t<N> y)
		{
			if constexpr (N == 64)
			{
				return intrin::bit_ops::mult32to64(static_cast<uint32_t>(x), static_cast<uint32_t>(y));
			}
			else
			{
				return 0;
			}
		}

		static inline uint128_t mul64to128(uint64_t x, uint64_t y)
		{
			return intrin::bit_ops::mult64to128(x, y);
		}

		static inline uint64_t mul128fold64(uint64_t x, uint64_t y)
		{
			uint128_t product = mul64to128(x, y);

			return (product.low64 ^ product.high64);
		}
	}
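
	/* Sketch (illustrative helper, not additional API): mul128fold64 is the
	 * XXH3 "multiply and fold" primitive - both halves of the full 128-bit
	 * product are kept and XORed, so neither half of the entropy is discarded. */
	inline uint64_t example_fold(uint64_t x, uint64_t y)
	{
		return bit_ops::mul128fold64(x, y); // == low64(x * y) ^ high64(x * y)
	}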


	/* *************************************
	*  Memory Functions
	***************************************/

	namespace mem_ops
	{
		/* *************************************
		* Endianness
		***************************************/

		constexpr bool is_little_endian()
		{
			return (XXH_CPU_LITTLE_ENDIAN == 1);
		}


		/* *************************************
		*  Memory Access
		***************************************/

		template <size_t N>
		static inline uint_t<N> read(const void* memPtr)
		{
			uint_t<N> val;

			memcpy(&val, memPtr, sizeof(val));
			return val;
		}

		template <size_t N>
		static inline uint_t<N> readLE(const void* ptr)
		{
			if constexpr (is_little_endian())
			{
				return read<N>(ptr);
			}
			else
			{
				return bit_ops::swap<N>(read<N>(ptr));
			}
		}

		template <size_t N>
		static inline uint_t<N> readBE(const void* ptr)
		{
			if constexpr (is_little_endian())
			{
				return bit_ops::swap<N>(read<N>(ptr));
			}
			else
			{
				return read<N>(ptr);
			}
		}

		template <size_t N>
		static void writeLE(void* dst, uint_t<N> v)
		{
			if constexpr (!is_little_endian())
			{
				v = bit_ops::swap<N>(v);
			}

			memcpy(dst, &v, sizeof(v));
		}
	}
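
	/* Sketch (illustrative helper, not additional API): writeLE/readLE
	 * round-trip through a byte buffer on either endianness. */
	inline uint64_t example_le_roundtrip(uint64_t v)
	{
		uint8_t buf[8];
		mem_ops::writeLE<64>(buf, v);
		return mem_ops::readLE<64>(buf); // == v
	}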


	/* *************************************
	*  Vector Functions
	***************************************/

	namespace vec_ops
	{
		template <size_t N>
		XXH_FORCE_INLINE vec_t<N> loadu(const vec_t<N>* input)
		{
			static_assert(!(N != 128 && N != 256 && N != 64 && N != 512), "Invalid template argument passed to xxh::vec_ops::loadu");

			if constexpr (N == 128)
			{
				return _mm_loadu_si128(input);
			}

			if constexpr (N == 256)
			{
				return _mm256_loadu_si256(input);
			}

			if constexpr (N == 512)
			{
				return _mm512_loadu_si512(input);
			}

			if constexpr (N == 64)
			{
				return mem_ops::readLE<64>(input);
			}
		}


		// 'xorv' rather than 'xor': in C++, 'xor' is a reserved alternative token for '^', so it cannot be used as an identifier.
		template <size_t N>
		XXH_FORCE_INLINE vec_t<N> xorv(vec_t<N> a, vec_t<N> b)
		{
			static_assert(!(N != 128 && N != 256 && N != 64 && N != 512), "Invalid argument passed to xxh::vec_ops::xorv");

			if constexpr (N == 128)
			{
				return _mm_xor_si128(a, b);
			}

			if constexpr (N == 256)
			{
				return _mm256_xor_si256(a, b);
			}

			if constexpr (N == 512)
			{
				return _mm512_xor_si512(a, b);
			}

			if constexpr (N == 64)
			{
				return a ^ b;
			}
		}


		template <size_t N>
		XXH_FORCE_INLINE vec_t<N> mul(vec_t<N> a, vec_t<N> b)
		{
			static_assert(!(N != 128 && N != 256 && N != 64 && N != 512), "Invalid argument passed to xxh::vec_ops::mul");

			if constexpr (N == 128)
			{
				return _mm_mul_epu32(a, b);
			}

			if constexpr (N == 256)
			{
				return _mm256_mul_epu32(a, b);
			}

			if constexpr (N == 512)
			{
				return _mm512_mul_epu32(a, b);
			}

			if constexpr (N == 64)
			{
				return a * b;
			}
		}


		template <size_t N>
		XXH_FORCE_INLINE vec_t<N> add(vec_t<N> a, vec_t<N> b)
		{
			static_assert(!(N != 128 && N != 256 && N != 64 && N != 512), "Invalid argument passed to xxh::vec_ops::add");

			if constexpr (N == 128)
			{
				return _mm_add_epi64(a, b);
			}

			if constexpr (N == 256)
			{
				return _mm256_add_epi64(a, b);
			}

			if constexpr (N == 512)
			{
				return _mm512_add_epi64(a, b);
			}

			if constexpr (N == 64)
			{
				return a + b;
			}
		}


		template <size_t N, uint8_t S1, uint8_t S2, uint8_t S3, uint8_t S4>
		XXH_FORCE_INLINE vec_t<N> shuffle(vec_t<N> a)
		{
			static_assert(!(N != 128 && N != 256 && N != 64 && N != 512), "Invalid argument passed to xxh::vec_ops::shuffle");

			if constexpr (N == 128)
			{
				return _mm_shuffle_epi32(a, _MM_SHUFFLE(S1, S2, S3, S4));
			}

			if constexpr (N == 256)
			{
				return _mm256_shuffle_epi32(a, _MM_SHUFFLE(S1, S2, S3, S4));
			}

			if constexpr (N == 512)
			{
				return _mm512_shuffle_epi32(a, _MM_SHUFFLE(S1, S2, S3, S4));
			}

			if constexpr (N == 64)
			{
				return a;
			}
		}


		template <size_t N>
		XXH_FORCE_INLINE vec_t<N> set1(int64_t a)
		{
			static_assert(!(N != 128 && N != 256 && N != 64 && N != 512), "Invalid argument passed to xxh::vec_ops::set1");

			if constexpr (N == 128)
			{
				return _mm_set1_epi32(static_cast<int>(a));
			}

			if constexpr (N == 256)
			{
				return _mm256_set1_epi32(static_cast<int>(a));
			}

			if constexpr (N == 512)
			{
				return _mm512_set1_epi32(static_cast<int>(a));
			}

			if constexpr (N == 64)
			{
				return a;
			}
		}


		template <size_t N>
		XXH_FORCE_INLINE vec_t<N> srli(vec_t<N> n, int a)
		{
			static_assert(!(N != 128 && N != 256 && N != 64 && N != 512), "Invalid argument passed to xxh::vec_ops::srli");

			if constexpr (N == 128)
			{
				return _mm_srli_epi64(n, a);
			}

			if constexpr (N == 256)
			{
				return _mm256_srli_epi64(n, a);
			}

			if constexpr (N == 512)
			{
				return _mm512_srli_epi64(n, a);
			}

			if constexpr (N == 64)
			{
				return n >> a;
			}
		}


		template <size_t N>
		XXH_FORCE_INLINE vec_t<N> slli(vec_t<N> n, int a)
		{
			static_assert(!(N != 128 && N != 256 && N != 64 && N != 512), "Invalid argument passed to xxh::vec_ops::slli");

			if constexpr (N == 128)
			{
				return _mm_slli_epi64(n, a);
			}

			if constexpr (N == 256)
			{
				return _mm256_slli_epi64(n, a);
			}

			if constexpr (N == 512)
			{
				return _mm512_slli_epi64(n, a);
			}

			if constexpr (N == 64)
			{
				return n << a;
			}
		}
	}
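
	/* Sketch (illustrative helper, not additional API): every operation has a
	 * 64-bit scalar instantiation, so the same generic code drives both the
	 * SIMD and the portable paths. */
	inline vec_t<64> example_scalar_lane(vec_t<64> a, vec_t<64> b)
	{
		return vec_ops::add<64>(vec_ops::xorv<64>(a, b), vec_ops::set1<64>(1));
	}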

	/* *************************************
	*  Canonical representation
	***************************************/

	template <size_t bit_mode>
	struct canonical_t
	{
		std::array<uint8_t, bit_mode / 8> digest{ 0 };

		canonical_t(hash_t<bit_mode> hash)
		{
			if constexpr (bit_mode < 128)
			{
				if (mem_ops::is_little_endian())
				{
					hash = bit_ops::swap<bit_mode>(hash);
				}

				memcpy(digest.data(), &hash, sizeof(canonical_t<bit_mode>));
			}
			else
			{
				if (mem_ops::is_little_endian())
				{
					hash.low64 = bit_ops::swap<64>(hash.low64);
					hash.high64 = bit_ops::swap<64>(hash.high64);
				}

				memcpy(digest.data(), &hash.high64, sizeof(hash.high64));
				memcpy(digest.data() + sizeof(hash.high64), &hash.low64, sizeof(hash.low64));
			}
		}

		hash_t<bit_mode> get_hash() const
		{
			if constexpr (bit_mode < 128)
			{
				return mem_ops::readBE<bit_mode>(&digest);
			}
			else
			{
				return { mem_ops::readBE<64>(&digest[8]), mem_ops::readBE<64>(&digest) };
			}
		}
	};

	using canonical32_t = canonical_t<32>;
	using canonical64_t = canonical_t<64>;
	using canonical128_t = canonical_t<128>;

	template <size_t bit_mode>
	inline hash_t<bit_mode> to_canonical(hash_t<bit_mode> hash)
	{
		static_assert(!(bit_mode != 128 && bit_mode != 64 && bit_mode != 32), "Canonical form can only be obtained from 32, 64 and 128 bit hashes.");
		canonical_t<bit_mode> canon(hash);
		hash_t<bit_mode> res;
		// Copy exactly sizeof(res) bytes (bit_mode / 8); copying bit_mode / 4 bytes
		// would read and write past both objects.
		memcpy(&res, &canon, sizeof(res));

		return res;
	}
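
	/* Sketch (illustrative helper, not additional API): the canonical form is
	 * the big-endian byte representation, suitable for portable storage. */
	inline std::array<uint8_t, 8> example_canonical_bytes(hash64_t h)
	{
		return canonical64_t(h).digest; // big-endian bytes of h
	}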


	/* *************************************
	*  Algorithm Implementation - xxhash
	***************************************/

	namespace detail
	{
		using namespace mem_ops;
		using namespace bit_ops;


		/* *************************************
		*  Constants
		***************************************/

		constexpr static std::array<uint32_t, 5> primes32 = { 2654435761U, 2246822519U, 3266489917U, 668265263U, 374761393U };
		constexpr static std::array<uint64_t, 5> primes64 = { 11400714785074694791ULL, 14029467366897019727ULL, 1609587929392839161ULL, 9650029242287828579ULL, 2870177450012600261ULL };

		template <size_t N>
		constexpr uint_t<N> PRIME(uint64_t n)
		{
			if constexpr (N == 32)
			{
				return primes32[n - 1];
			}
			else
			{
				return primes64[n - 1];
			}
		}


		/* *************************************
		*  Functions
		***************************************/

		template <size_t N>
		XXH_FORCE_INLINE uint_t<N> avalanche(uint_t<N> hash)
		{
			if constexpr (N == 32)
			{
				hash ^= hash >> 15;
				hash *= PRIME<32>(2);
				hash ^= hash >> 13;
				hash *= PRIME<32>(3);
				hash ^= hash >> 16;
				return hash;
			}
			else if constexpr (N == 64)
			{
				hash ^= hash >> 33;
				hash *= PRIME<64>(2);
				hash ^= hash >> 29;
				hash *= PRIME<64>(3);
				hash ^= hash >> 32;
				return hash;
			}
			else return 0;
		}

		template <size_t N>
		XXH_FORCE_INLINE uint_t<N> round(uint_t<N> seed, uint_t<N> input)
		{
			seed += input * PRIME<N>(2);

			if constexpr (N == 32)
			{
				seed = rotl<N>(seed, 13);
			}
			else
			{
				seed = rotl<N>(seed, 31);
			}

			seed *= PRIME<N>(1);
			return seed;
		}

		XXH_FORCE_INLINE uint64_t mergeRound64(hash64_t acc, uint64_t val)
		{
			val = round<64>(0, val);
			acc ^= val;
			acc = acc * PRIME<64>(1) + PRIME<64>(4);
			return acc;
		}

		XXH_FORCE_INLINE void endian_align_sub_mergeround(hash64_t& hash_ret, uint64_t v1, uint64_t v2, uint64_t v3, uint64_t v4)
		{
			hash_ret = mergeRound64(hash_ret, v1);
			hash_ret = mergeRound64(hash_ret, v2);
			hash_ret = mergeRound64(hash_ret, v3);
			hash_ret = mergeRound64(hash_ret, v4);
		}

		template <size_t N>
		static inline hash_t<N> endian_align_sub_ending(hash_t<N> hash_ret, const uint8_t* p, const uint8_t* bEnd)
		{
			if constexpr (N == 32)
			{
				while ((p + 4) <= bEnd)
				{
					hash_ret += readLE<32>(p) * PRIME<32>(3);
					hash_ret = rotl<32>(hash_ret, 17) * PRIME<32>(4);
					p += 4;
				}

				while (p < bEnd)
				{
					hash_ret += (*p) * PRIME<32>(5);
					hash_ret = rotl<32>(hash_ret, 11) * PRIME<32>(1);
					p++;
				}

				return avalanche<32>(hash_ret);
			}
			else
			{
				while (p + 8 <= bEnd)
				{
					const uint64_t k1 = round<64>(0, readLE<64>(p));

					hash_ret ^= k1;
					hash_ret = rotl<64>(hash_ret, 27) * PRIME<64>(1) + PRIME<64>(4);
					p += 8;
				}

				if (p + 4 <= bEnd)
				{
					hash_ret ^= static_cast<uint64_t>(readLE<32>(p)) * PRIME<64>(1);
					hash_ret = rotl<64>(hash_ret, 23) * PRIME<64>(2) + PRIME<64>(3);
					p += 4;
				}

				while (p < bEnd)
				{
					hash_ret ^= (*p) * PRIME<64>(5);
					hash_ret = rotl<64>(hash_ret, 11) * PRIME<64>(1);
					p++;
				}

				return avalanche<64>(hash_ret);
			}
		}

		template <size_t N>
		static inline hash_t<N> endian_align(const void* input, size_t len, uint_t<N> seed)
		{
			static_assert(!(N != 32 && N != 64), "You can only call endian_align in 32 or 64 bit mode.");

			const uint8_t* p = static_cast<const uint8_t*>(input);
			const uint8_t* bEnd = p + len;
			hash_t<N> hash_ret;

			if (len >= (N / 2))
			{
				const uint8_t* const limit = bEnd - (N / 2);
				uint_t<N> v1 = seed + PRIME<N>(1) + PRIME<N>(2);
				uint_t<N> v2 = seed + PRIME<N>(2);
				uint_t<N> v3 = seed + 0;
				uint_t<N> v4 = seed - PRIME<N>(1);

				do
				{
					v1 = round<N>(v1, readLE<N>(p));
					p += (N / 8);
					v2 = round<N>(v2, readLE<N>(p));
					p += (N / 8);
					v3 = round<N>(v3, readLE<N>(p));
					p += (N / 8);
					v4 = round<N>(v4, readLE<N>(p));
					p += (N / 8);
				}
				while (p <= limit);

				hash_ret = rotl<N>(v1, 1) + rotl<N>(v2, 7) + rotl<N>(v3, 12) + rotl<N>(v4, 18);

				if constexpr (N == 64)
				{
					endian_align_sub_mergeround(hash_ret, v1, v2, v3, v4);
				}
			}
			else
			{
				hash_ret = seed + PRIME<N>(5);
			}

			hash_ret += static_cast<hash_t<N>>(len);

			return endian_align_sub_ending<N>(hash_ret, p, bEnd);
		}
	}
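
	/* Sketch (illustrative helper, not additional API): endian_align is the
	 * shared XXH32/XXH64 core; the public hashing entry points later in this
	 * file reduce to a call of this shape. */
	inline hash64_t example_xxh64(const void* data, size_t len, uint64_t seed = 0)
	{
		return detail::endian_align<64>(data, len, seed);
	}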


	/* *************************************
	*  Algorithm Implementation - xxhash3
	***************************************/

	namespace detail3
	{
		using namespace vec_ops;
		using namespace detail;
		using namespace mem_ops;
		using namespace bit_ops;


		/* *************************************
		*  Enums
		***************************************/

		enum class vec_mode : uint8_t { scalar = 0, sse2 = 1, avx2 = 2, avx512 = 3 };


		/* *************************************
		*  Constants
		***************************************/

		constexpr uint64_t secret_default_size = 192;
		constexpr uint64_t secret_size_min = 136;
		constexpr uint64_t secret_consume_rate = 8;
		constexpr uint64_t stripe_len = 64;
		constexpr uint64_t acc_nb = 8;
		constexpr uint64_t prefetch_distance = 384;
		constexpr uint64_t secret_lastacc_start = 7;
		constexpr uint64_t secret_mergeaccs_start = 11;
		constexpr uint64_t midsize_max = 240;
		constexpr uint64_t midsize_startoffset = 3;
		constexpr uint64_t midsize_lastoffset = 17;

		constexpr vec_mode vector_mode = static_cast<vec_mode>(intrin::vector_mode);
		constexpr uint64_t acc_align = intrin::acc_align;
		constexpr std::array<uint64_t, 4> vector_bit_width { 64, 128, 256, 512 };


		/* *************************************
		*  Defaults
		***************************************/

		alignas(64) constexpr uint8_t default_secret[secret_default_size] = {
			0xb8, 0xfe, 0x6c, 0x39, 0x23, 0xa4, 0x4b, 0xbe, 0x7c, 0x01, 0x81, 0x2c, 0xf7, 0x21, 0xad, 0x1c,
			0xde, 0xd4, 0x6d, 0xe9, 0x83, 0x90, 0x97, 0xdb, 0x72, 0x40, 0xa4, 0xa4, 0xb7, 0xb3, 0x67, 0x1f,
			0xcb, 0x79, 0xe6, 0x4e, 0xcc, 0xc0, 0xe5, 0x78, 0x82, 0x5a, 0xd0, 0x7d, 0xcc, 0xff, 0x72, 0x21,
			0xb8, 0x08, 0x46, 0x74, 0xf7, 0x43, 0x24, 0x8e, 0xe0, 0x35, 0x90, 0xe6, 0x81, 0x3a, 0x26, 0x4c,
			0x3c, 0x28, 0x52, 0xbb, 0x91, 0xc3, 0x00, 0xcb, 0x88, 0xd0, 0x65, 0x8b, 0x1b, 0x53, 0x2e, 0xa3,
			0x71, 0x64, 0x48, 0x97, 0xa2, 0x0d, 0xf9, 0x4e, 0x38, 0x19, 0xef, 0x46, 0xa9, 0xde, 0xac, 0xd8,
			0xa8, 0xfa, 0x76, 0x3f, 0xe3, 0x9c, 0x34, 0x3f, 0xf9, 0xdc, 0xbb, 0xc7, 0xc7, 0x0b, 0x4f, 0x1d,
			0x8a, 0x51, 0xe0, 0x4b, 0xcd, 0xb4, 0x59, 0x31, 0xc8, 0x9f, 0x7e, 0xc9, 0xd9, 0x78, 0x73, 0x64,
			0xea, 0xc5, 0xac, 0x83, 0x34, 0xd3, 0xeb, 0xc3, 0xc5, 0x81, 0xa0, 0xff, 0xfa, 0x13, 0x63, 0xeb,
			0x17, 0x0d, 0xdd, 0x51, 0xb7, 0xf0, 0xda, 0x49, 0xd3, 0x16, 0x55, 0x26, 0x29, 0xd4, 0x68, 0x9e,
			0x2b, 0x16, 0xbe, 0x58, 0x7d, 0x47, 0xa1, 0xfc, 0x8f, 0xf8, 0xb8, 0xd1, 0x7a, 0xd0, 0x31, 0xce,
			0x45, 0xcb, 0x3a, 0x8f, 0x95, 0x16, 0x04, 0x28, 0xaf, 0xd7, 0xfb, 0xca, 0xbb, 0x4b, 0x40, 0x7e,
		};

		constexpr std::array<uint64_t, 8> init_acc = { PRIME<32>(3), PRIME<64>(1), PRIME<64>(2), PRIME<64>(3), PRIME<64>(4), PRIME<32>(2), PRIME<64>(5), PRIME<32>(1) };

		/* *************************************
		*  Functions
		***************************************/

		XXH_FORCE_INLINE hash_t<64> avalanche(hash_t<64> h64)
		{
			constexpr uint64_t avalanche_mul_prime = 0x165667919E3779F9ULL;

			h64 ^= h64 >> 37;
			h64 *= avalanche_mul_prime;
			h64 ^= h64 >> 32;
			return h64;
		}

		XXH_FORCE_INLINE hash_t<64> rrmxmx(hash_t<64> h64, uint64_t len)
		{
			h64 ^= rotl<64>(h64, 49) ^ rotl<64>(h64, 24);
			h64 *= 0x9FB21C651E98DF25ULL;
			h64 ^= (h64 >> 35) + len;
			h64 *= 0x9FB21C651E98DF25ULL;
			h64 ^= (h64 >> 28);
			return h64;
		}

		XXH_FORCE_INLINE void combine_16(void* dest, hash128_t h128)
		{
			writeLE<64>(dest, readLE<64>(dest) ^ h128.low64);
			writeLE<64>((uint8_t*)dest + 8, readLE<64>((uint8_t*)dest + 8) ^ h128.high64);
		}

		XXH_FORCE_INLINE void accumulate_512(void* XXH_RESTRICT acc, const void* XXH_RESTRICT input, const void* XXH_RESTRICT secret)
		{
			constexpr uint64_t bits = vector_bit_width[static_cast<uint8_t>(vector_mode)];

			using vec_t = vec_t<bits>;

			alignas(sizeof(vec_t)) vec_t* const xacc = static_cast<vec_t*>(acc);
			const vec_t* const xinput = static_cast<const vec_t*>(input);
			const vec_t* const xsecret = static_cast<const vec_t*>(secret);

			for (size_t i = 0; i < stripe_len / sizeof(vec_t); i++)
			{
				vec_t const data_vec = loadu<bits>(xinput + i);
				vec_t const key_vec = loadu<bits>(xsecret + i);
				vec_t const data_key = xorv<bits>(data_vec, key_vec);
				vec_t product = set1<bits>(0);

				if constexpr (vector_mode == vec_mode::scalar)
				{
					product = mul32to64<bits>(srli<bits>(slli<bits>(data_key, 32), 32), srli<bits>(data_key, 32));
					xacc[i ^ 1] = add<bits>(xacc[i ^ 1], data_vec);
					xacc[i] = add<bits>(xacc[i], product);
				}
				else
				{
					vec_t const data_key_lo = shuffle<bits, 0, 3, 0, 1>(data_key);
					product = mul<bits>(data_key, data_key_lo);

					vec_t const data_swap = shuffle<bits, 1, 0, 3, 2>(data_vec);
					vec_t const sum = add<bits>(xacc[i], data_swap);
					xacc[i] = add<bits>(sum, product);
				}
			}
		}

		XXH_FORCE_INLINE void scramble_acc(void* XXH_RESTRICT acc, const void* XXH_RESTRICT secret)
		{
			constexpr uint64_t bits = vector_bit_width[static_cast<uint8_t>(vector_mode)];

			using vec_t = vec_t<bits>;

			alignas(sizeof(vec_t)) vec_t* const xacc = (vec_t*)acc;
			const vec_t* const xsecret = (const vec_t*)secret;

			for (size_t i = 0; i < stripe_len / sizeof(vec_t); i++)
			{
				vec_t const acc_vec = xacc[i];
				vec_t const shifted = srli<bits>(acc_vec, 47);
				vec_t const data_vec = xorv<bits>(acc_vec, shifted);
				vec_t const key_vec = loadu<bits>(xsecret + i);
				vec_t const data_key = xorv<bits>(data_vec, key_vec);

				if constexpr (vector_mode == vec_mode::scalar)
				{
					xacc[i] = mul<bits>(data_key, set1<bits>(PRIME<32>(1)));
				}
				else
				{
					vec_t const prime32 = set1<bits>(PRIME<32>(1));
					vec_t const data_key_hi = shuffle<bits, 0, 3, 0, 1>(data_key);
					vec_t const prod_lo = mul<bits>(data_key, prime32);
					vec_t const prod_hi = mul<bits>(data_key_hi, prime32);

					xacc[i] = add<bits>(prod_lo, vec_ops::slli<bits>(prod_hi, 32));
				}
			}
		}

1228                 XXH_FORCE_INLINE void accumulate(uint64_t* XXH_RESTRICT acc, const uint8_t* XXH_RESTRICT input, const uint8_t* XXH_RESTRICT secret, size_t nbStripes)
1229                 {
1230                         for (size_t n = 0; n < nbStripes; n++) 
1231                         {
1232                                 const uint8_t* const in = input + n * stripe_len;
1233
1234                                 intrin::prefetch(in + prefetch_distance);
1235                                 accumulate_512(acc, in, secret + n * secret_consume_rate);
1236                         }
1237                 }
1238
1239                 XXH_FORCE_INLINE void hash_long_internal_loop(uint64_t* XXH_RESTRICT acc, const uint8_t* XXH_RESTRICT input, size_t len, const uint8_t* XXH_RESTRICT secret, size_t secretSize)
1240                 {
1241                         size_t const nb_rounds = (secretSize - stripe_len) / secret_consume_rate;
1242                         size_t const block_len = stripe_len * nb_rounds;
1243                         size_t const nb_blocks = (len-1) / block_len;
1244
1245                         for (size_t n = 0; n < nb_blocks; n++) 
1246                         {
1247                                 accumulate(acc, input + n * block_len, secret, nb_rounds);
1248                                 scramble_acc(acc, secret + secretSize - stripe_len);
1249                         }
1250
1251                         /* last partial block */
1252                         size_t const nbStripes = ((len - 1) - (block_len * nb_blocks)) / stripe_len;
1253
1254                         accumulate(acc, input + nb_blocks * block_len, secret, nbStripes);
1255
1256                         /* last stripe */
1257                         const uint8_t* const p = input + len - stripe_len;
1258
1259                         accumulate_512(acc, p, secret + secretSize - stripe_len - secret_lastacc_start);
1260                 }

		XXH_FORCE_INLINE uint64_t mix_2_accs(const uint64_t* XXH_RESTRICT acc, const uint8_t* XXH_RESTRICT secret)
		{
			return mul128fold64(acc[0] ^ readLE<64>(secret), acc[1] ^ readLE<64>(secret + 8));
		}

		XXH_FORCE_INLINE uint64_t merge_accs(const uint64_t* XXH_RESTRICT acc, const uint8_t* XXH_RESTRICT secret, uint64_t start)
		{
			uint64_t result64 = start;

			result64 += mix_2_accs(acc + 0, secret + 0);
			result64 += mix_2_accs(acc + 2, secret + 16);
			result64 += mix_2_accs(acc + 4, secret + 32);
			result64 += mix_2_accs(acc + 6, secret + 48);

			return avalanche(result64);
		}

		XXH_FORCE_INLINE void init_custom_secret(uint8_t* customSecret, uint64_t seed)
		{
			for (uint64_t i = 0; i < secret_default_size / 16; i++)
			{
				writeLE<64>(customSecret + i * 16, readLE<64>(default_secret + i * 16) + seed);
				writeLE<64>(customSecret + i * 16 + 8, readLE<64>(default_secret + i * 16 + 8) - seed);
			}
		}
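
		/* Each 16-byte segment of the custom secret is the corresponding
		   default_secret segment with the seed added to its low 8 bytes and
		   subtracted from its high 8 bytes; a seed of 0 therefore reproduces
		   default_secret exactly. */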

		template <size_t N>
		XXH_FORCE_INLINE hash_t<N> len_1to3(const uint8_t* input, size_t len, const uint8_t* secret, uint64_t seed)
		{
			if constexpr (N == 64)
			{
				uint8_t const c1 = input[0];
				uint8_t const c2 = input[len >> 1];
				uint8_t const c3 = input[len - 1];
				uint32_t const combined = ((uint32_t)c1 << 16) | (((uint32_t)c2) << 24) | (((uint32_t)c3) << 0) | (((uint32_t)len) << 8);
				uint64_t const bitflip = (readLE<32>(secret) ^ readLE<32>(secret + 4)) + seed;
				uint64_t const keyed = (uint64_t)combined ^ bitflip;
				return detail::avalanche<64>(keyed);
			}
			else
			{
				uint8_t const c1 = input[0];
				uint8_t const c2 = input[len >> 1];
				uint8_t const c3 = input[len - 1];
				uint32_t const combinedl = ((uint32_t)c1 << 16) + (((uint32_t)c2) << 24) + (((uint32_t)c3) << 0) + (((uint32_t)len) << 8);
				uint32_t const combinedh = rotl<32>(swap<32>(combinedl), 13);
				uint64_t const bitflipl = (readLE<32>(secret) ^ readLE<32>(secret + 4)) + seed;
				uint64_t const bitfliph = (readLE<32>(secret + 8) ^ readLE<32>(secret + 12)) - seed;
				uint64_t const keyed_lo = (uint64_t)combinedl ^ bitflipl;
				uint64_t const keyed_hi = (uint64_t)combinedh ^ bitfliph;
				hash128_t const h128 = { detail::avalanche<64>(keyed_lo), detail::avalanche<64>(keyed_hi) };

				return h128;
			}
		}

		template <size_t N>
		XXH_FORCE_INLINE hash_t<N> len_4to8(const uint8_t* input, size_t len, const uint8_t* secret, uint64_t seed)
		{
			constexpr uint64_t mix_constant = 0x9FB21C651E98DF25ULL;

			seed ^= (uint64_t)swap<32>((uint32_t)seed) << 32;

			if constexpr (N == 64)
			{
				uint32_t const input1 = readLE<32>(input);
				uint32_t const input2 = readLE<32>(input + len - 4);
				uint64_t const bitflip = (readLE<64>(secret + 8) ^ readLE<64>(secret + 16)) - seed;
				uint64_t const input64 = input2 + ((uint64_t)input1 << 32);
				uint64_t const keyed = input64 ^ bitflip;

				return rrmxmx(keyed, len);
			}
			else
			{
				uint32_t const input_lo = readLE<32>(input);
				uint32_t const input_hi = readLE<32>(input + len - 4);
				uint64_t const input_64 = input_lo + ((uint64_t)input_hi << 32);
				uint64_t const bitflip = (readLE<64>(secret + 16) ^ readLE<64>(secret + 24)) + seed;
				uint64_t const keyed = input_64 ^ bitflip;
				uint128_t m128 = mul64to128(keyed, PRIME<64>(1) + (len << 2));

				m128.high64 += (m128.low64 << 1);
				m128.low64 ^= (m128.high64 >> 3);
				m128.low64 ^= (m128.low64 >> 35);
				m128.low64 *= mix_constant;
				m128.low64 ^= (m128.low64 >> 28);
				m128.high64 = avalanche(m128.high64);

				return m128;
			}
		}

		template <size_t N>
		XXH_FORCE_INLINE hash_t<N> len_9to16(const uint8_t* input, size_t len, const uint8_t* secret, uint64_t seed)
		{
			if constexpr (N == 64)
			{
				uint64_t const bitflip1 = (readLE<64>(secret + 24) ^ readLE<64>(secret + 32)) + seed;
				uint64_t const bitflip2 = (readLE<64>(secret + 40) ^ readLE<64>(secret + 48)) - seed;
				uint64_t const input_lo = readLE<64>(input) ^ bitflip1;
				uint64_t const input_hi = readLE<64>(input + len - 8) ^ bitflip2;
				uint64_t const acc = len + swap<64>(input_lo) + input_hi + mul128fold64(input_lo, input_hi);

				return avalanche(acc);
			}
			else
			{
				uint64_t const bitflipl = (readLE<64>(secret + 32) ^ readLE<64>(secret + 40)) - seed;
				uint64_t const bitfliph = (readLE<64>(secret + 48) ^ readLE<64>(secret + 56)) + seed;
				uint64_t const input_lo = readLE<64>(input);
				uint64_t input_hi = readLE<64>(input + len - 8);
				uint128_t m128 = mul64to128(input_lo ^ input_hi ^ bitflipl, PRIME<64>(1));

				m128.low64 += (uint64_t)(len - 1) << 54;
				input_hi ^= bitfliph;

				if constexpr (sizeof(void*) < sizeof(uint64_t)) // 32-bit version
				{
					m128.high64 += (input_hi & 0xFFFFFFFF00000000ULL) + mul32to64((uint32_t)input_hi, PRIME<32>(2));
				}
				else
				{
					m128.high64 += input_hi + mul32to64((uint32_t)input_hi, PRIME<32>(2) - 1);
				}

				m128.low64 ^= swap<64>(m128.high64);

				hash128_t h128 = mul64to128(m128.low64, PRIME<64>(2));

				h128.high64 += m128.high64 * PRIME<64>(2);
				h128.low64 = avalanche(h128.low64);
				h128.high64 = avalanche(h128.high64);

				return h128;
			}
		}

		template <size_t N>
		XXH_FORCE_INLINE hash_t<N> len_0to16(const uint8_t* input, size_t len, const uint8_t* secret, uint64_t seed)
		{
			if (XXH_likely(len > 8))
			{
				return len_9to16<N>(input, len, secret, seed);
			}
			else if (XXH_likely(len >= 4))
			{
				return len_4to8<N>(input, len, secret, seed);
			}
			else if (len)
			{
				return len_1to3<N>(input, len, secret, seed);
			}
			else
			{
				if constexpr (N == 64)
				{
					return detail::avalanche<64>(seed ^ (readLE<64>(secret + 56) ^ readLE<64>(secret + 64)));
				}
				else
				{
					uint64_t const bitflipl = readLE<64>(secret + 64) ^ readLE<64>(secret + 72);
					uint64_t const bitfliph = readLE<64>(secret + 80) ^ readLE<64>(secret + 88);

					return hash128_t(detail::avalanche<64>(seed ^ bitflipl), detail::avalanche<64>(seed ^ bitfliph));
				}
			}
		}

		template <size_t N>
		XXH_FORCE_INLINE hash_t<N> hash_long_internal(const uint8_t* XXH_RESTRICT input, size_t len, const uint8_t* XXH_RESTRICT secret = default_secret, size_t secretSize = sizeof(default_secret))
		{
			alignas(acc_align) std::array<uint64_t, acc_nb> acc = init_acc;

			if constexpr (N == 64)
			{
				hash_long_internal_loop(acc.data(), input, len, secret, secretSize);

				/* converge into final hash */
				return merge_accs(acc.data(), secret + secret_mergeaccs_start, (uint64_t)len * PRIME<64>(1));
			}
			else
			{
				hash_long_internal_loop(acc.data(), input, len, secret, secretSize);

				/* converge into final hash */
				uint64_t const low64 = merge_accs(acc.data(), secret + secret_mergeaccs_start, (uint64_t)len * PRIME<64>(1));
				uint64_t const high64 = merge_accs(acc.data(), secret + secretSize - sizeof(acc) - secret_mergeaccs_start, ~((uint64_t)len * PRIME<64>(2)));

				return hash128_t(low64, high64);
			}
		}

		XXH_FORCE_INLINE uint64_t mix_16b(const uint8_t* XXH_RESTRICT input, const uint8_t* XXH_RESTRICT secret, uint64_t seed)
		{
			uint64_t const input_lo = readLE<64>(input);
			uint64_t const input_hi = readLE<64>(input + 8);

			return mul128fold64(input_lo ^ (readLE<64>(secret) + seed), input_hi ^ (readLE<64>(secret + 8) - seed));
		}
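
		/* mix_16b folds 16 input bytes against 16 secret bytes: each 8-byte half is
		   keyed with its secret word (offset by +seed / -seed), and the two keyed
		   halves are combined through a 64x64 -> 128-bit multiply folded back to
		   64 bits. */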

		XXH_FORCE_INLINE uint128_t mix_32b(uint128_t acc, const uint8_t* input1, const uint8_t* input2, const uint8_t* secret, uint64_t seed)
		{
			acc.low64 += mix_16b(input1, secret + 0, seed);
			acc.low64 ^= readLE<64>(input2) + readLE<64>(input2 + 8);
			acc.high64 += mix_16b(input2, secret + 16, seed);
			acc.high64 ^= readLE<64>(input1) + readLE<64>(input1 + 8);

			return acc;
		}

		template <size_t N>
		XXH_FORCE_INLINE hash_t<N> len_17to128(const uint8_t* XXH_RESTRICT input, size_t len, const uint8_t* XXH_RESTRICT secret, uint64_t seed)
		{
			if constexpr (N == 64)
			{
				hash64_t acc = len * PRIME<64>(1);

				if (len > 32)
				{
					if (len > 64)
					{
						if (len > 96)
						{
							acc += mix_16b(input + 48, secret + 96, seed);
							acc += mix_16b(input + len - 64, secret + 112, seed);
						}

						acc += mix_16b(input + 32, secret + 64, seed);
						acc += mix_16b(input + len - 48, secret + 80, seed);
					}

					acc += mix_16b(input + 16, secret + 32, seed);
					acc += mix_16b(input + len - 32, secret + 48, seed);
				}

				acc += mix_16b(input + 0, secret + 0, seed);
				acc += mix_16b(input + len - 16, secret + 16, seed);

				return avalanche(acc);
			}
			else
			{
				hash128_t acc = { len * PRIME<64>(1), 0 };

				if (len > 32)
				{
					if (len > 64)
					{
						if (len > 96)
						{
							acc = mix_32b(acc, input + 48, input + len - 64, secret + 96, seed);
						}

						acc = mix_32b(acc, input + 32, input + len - 48, secret + 64, seed);
					}

					acc = mix_32b(acc, input + 16, input + len - 32, secret + 32, seed);
				}

				acc = mix_32b(acc, input, input + len - 16, secret, seed);

				uint64_t const low64 = acc.low64 + acc.high64;
				uint64_t const high64 = (acc.low64 * PRIME<64>(1)) + (acc.high64 * PRIME<64>(4)) + ((len - seed) * PRIME<64>(2));

				return { avalanche(low64), (uint64_t)0 - avalanche(high64) };
			}
		}

		template <size_t N>
		XXH_NO_INLINE hash_t<N> len_129to240(const uint8_t* XXH_RESTRICT input, size_t len, const uint8_t* XXH_RESTRICT secret, uint64_t seed)
		{
			if constexpr (N == 64)
			{
				uint64_t acc = len * PRIME<64>(1);
				size_t const nbRounds = len / 16;

				for (size_t i = 0; i < 8; i++)
				{
					acc += mix_16b(input + (i * 16), secret + (i * 16), seed);
				}

				acc = avalanche(acc);

				for (size_t i = 8; i < nbRounds; i++)
				{
					acc += mix_16b(input + (i * 16), secret + ((i - 8) * 16) + midsize_startoffset, seed);
				}

				/* last bytes */
				acc += mix_16b(input + len - 16, secret + secret_size_min - midsize_lastoffset, seed);

				return avalanche(acc);
			}
			else
			{
				hash128_t acc;
				uint64_t const nbRounds = len / 32;

				acc.low64 = len * PRIME<64>(1);
				acc.high64 = 0;

				for (size_t i = 0; i < 4; i++)
				{
					acc = mix_32b(acc, input + (i * 32), input + (i * 32) + 16, secret + (i * 32), seed);
				}

				acc.low64 = avalanche(acc.low64);
				acc.high64 = avalanche(acc.high64);

				for (size_t i = 4; i < nbRounds; i++)
				{
					acc = mix_32b(acc, input + (i * 32), input + (i * 32) + 16, secret + midsize_startoffset + ((i - 4) * 32), seed);
				}

				/* last bytes */
				acc = mix_32b(acc, input + len - 16, input + len - 32, secret + secret_size_min - midsize_lastoffset - 16, 0ULL - seed);

				uint64_t const low64 = acc.low64 + acc.high64;
				uint64_t const high64 = (acc.low64 * PRIME<64>(1)) + (acc.high64 * PRIME<64>(4)) + ((len - seed) * PRIME<64>(2));

				return { avalanche(low64), (uint64_t)0 - avalanche(high64) };
			}
		}

		template <size_t N>
		XXH_NO_INLINE hash_t<N> xxhash3_impl(const void* XXH_RESTRICT input, size_t len, hash64_t seed, const void* XXH_RESTRICT secret = default_secret, size_t secretSize = secret_default_size)
		{
			alignas(64) uint8_t custom_secret[secret_default_size];

			/* Short inputs ignore an external secret when a seed is supplied: they
			   hash with the default secret, mixing the seed in at each step. */
			const void* short_secret = secret;

			if (seed != 0)
			{
				init_custom_secret(custom_secret, seed);
				short_secret = default_secret;
			}

			if (len <= 16)
			{
				return len_0to16<N>(static_cast<const uint8_t*>(input), len, static_cast<const uint8_t*>(short_secret), seed);
			}
			else if (len <= 128)
			{
				return len_17to128<N>(static_cast<const uint8_t*>(input), len, static_cast<const uint8_t*>(short_secret), seed);
			}
			else if (len <= midsize_max)
			{
				return len_129to240<N>(static_cast<const uint8_t*>(input), len, static_cast<const uint8_t*>(short_secret), seed);
			}
			else
			{
				/* Long inputs: with a non-zero seed and no external secret, use the
				   seed-derived secret; otherwise keep the caller's secret unchanged. */
				bool const use_custom = (seed != 0) && (secret == default_secret);
				const uint8_t* const long_secret = use_custom ? static_cast<const uint8_t*>(custom_secret) : static_cast<const uint8_t*>(secret);
				size_t const long_secret_size = use_custom ? secret_default_size : secretSize;

				return hash_long_internal<N>(static_cast<const uint8_t*>(input), len, long_secret, long_secret_size);
			}
		}
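
		/* Dispatch summary: 0-16, 17-128 and 129-midsize_max byte inputs each use a
		   dedicated fixed-depth kernel above; anything longer goes through the
		   striped accumulator loop in hash_long_internal. */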

		XXH_NO_INLINE void generate_secret(void* secret_buffer, size_t secret_size, const void* custom_seed, size_t seed_size)
		{
			/* note: secret_size is expected to be at least 16 bytes (and at least
			   secret_size_min if the result is to be used with xxhash3); smaller
			   buffers would underflow the final combine_16 below. */
			if (seed_size == 0)
			{
				custom_seed = default_secret;
				seed_size = secret_default_size;
			}

			size_t pos = 0;
			while (pos < secret_size)
			{
				size_t const copy_len = std::min(secret_size - pos, seed_size);
				memcpy((uint8_t*)secret_buffer + pos, custom_seed, copy_len);
				pos += copy_len;
			}

			size_t const nbseg16 = secret_size / 16;
			canonical128_t scrambled(xxhash3_impl<128>(custom_seed, seed_size, 0));
			for (size_t n = 0; n < nbseg16; n++)
			{
				hash128_t const h128 = xxhash3_impl<128>(&scrambled, sizeof(scrambled), n);
				combine_16((uint8_t*)secret_buffer + n * 16, h128);
			}

			combine_16((uint8_t*)secret_buffer + secret_size - 16, scrambled.get_hash());
		}
	}


	/* *************************************
	*  Public Access Point - xxhash
	***************************************/

	template <size_t bit_mode>
	inline hash_t<bit_mode> xxhash(const void* input, size_t len, uint_t<bit_mode> seed = 0)
	{
		static_assert(bit_mode == 32 || bit_mode == 64, "xxhash can only be used in 32 and 64 bit modes.");
		return detail::endian_align<bit_mode>(input, len, seed);
	}
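
	/* Example (sketch): hashing a raw buffer. "msg" is a hypothetical caller-side
	   name, not part of this header:

	       const char msg[] = "hello world";
	       xxh::hash64_t h64 = xxh::xxhash<64>(msg, sizeof(msg) - 1);   // unseeded
	       auto h32 = xxh::xxhash<32>(msg, sizeof(msg) - 1, 42);        // seed = 42
	*/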

	template <size_t bit_mode, typename T>
	inline hash_t<bit_mode> xxhash(const std::basic_string<T>& input, uint_t<bit_mode> seed = 0)
	{
		static_assert(bit_mode == 32 || bit_mode == 64, "xxhash can only be used in 32 and 64 bit modes.");
		return detail::endian_align<bit_mode>(static_cast<const void*>(input.data()), input.length() * sizeof(T), seed);
	}

	template <size_t bit_mode, typename ContiguousIterator>
	inline hash_t<bit_mode> xxhash(ContiguousIterator begin, ContiguousIterator end, uint_t<bit_mode> seed = 0)
	{
		static_assert(bit_mode == 32 || bit_mode == 64, "xxhash can only be used in 32 and 64 bit modes.");
		using T = typename std::decay_t<decltype(*end)>;
		return detail::endian_align<bit_mode>(static_cast<const void*>(&*begin), (end - begin) * sizeof(T), seed);
	}

	template <size_t bit_mode, typename T>
	inline hash_t<bit_mode> xxhash(const std::vector<T>& input, uint_t<bit_mode> seed = 0)
	{
		static_assert(bit_mode == 32 || bit_mode == 64, "xxhash can only be used in 32 and 64 bit modes.");
		return detail::endian_align<bit_mode>(static_cast<const void*>(input.data()), input.size() * sizeof(T), seed);
	}

	template <size_t bit_mode, typename T, size_t AN>
	inline hash_t<bit_mode> xxhash(const std::array<T, AN>& input, uint_t<bit_mode> seed = 0)
	{
		static_assert(bit_mode == 32 || bit_mode == 64, "xxhash can only be used in 32 and 64 bit modes.");
		return detail::endian_align<bit_mode>(static_cast<const void*>(input.data()), AN * sizeof(T), seed);
	}

	template <size_t bit_mode, typename T>
	inline hash_t<bit_mode> xxhash(const std::initializer_list<T>& input, uint_t<bit_mode> seed = 0)
	{
		static_assert(bit_mode == 32 || bit_mode == 64, "xxhash can only be used in 32 and 64 bit modes.");
		return detail::endian_align<bit_mode>(static_cast<const void*>(input.begin()), input.size() * sizeof(T), seed);
	}
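
	/* Example (sketch): the container overloads hash the raw element bytes, so for a
	   byte vector "v" (hypothetical) the two calls below return the same value:

	       std::vector<uint8_t> v = {1, 2, 3, 4};
	       auto a = xxh::xxhash<64>(v);
	       auto b = xxh::xxhash<64>(v.data(), v.size());
	*/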


	/* *************************************
	*  Public Access Point - xxhash3
	***************************************/

	template <size_t bit_mode>
	inline hash_t<bit_mode> xxhash3(const void* input, size_t len, uint64_t seed = 0)
	{
		static_assert(bit_mode == 64 || bit_mode == 128, "xxhash3 can only be used in 64 and 128 bit modes.");
		return detail3::xxhash3_impl<bit_mode>(input, len, seed);
	}

	template <size_t bit_mode>
	inline hash_t<bit_mode> xxhash3(const void* input, size_t len, const void* secret, size_t secretSize, uint64_t seed = 0)
	{
		static_assert(bit_mode == 64 || bit_mode == 128, "xxhash3 can only be used in 64 and 128 bit modes.");
		return detail3::xxhash3_impl<bit_mode>(input, len, seed, secret, secretSize);
	}

	template <size_t bit_mode, typename T>
	inline hash_t<bit_mode> xxhash3(const std::basic_string<T>& input, uint64_t seed = 0)
	{
		static_assert(bit_mode == 64 || bit_mode == 128, "xxhash3 can only be used in 64 and 128 bit modes.");
		return detail3::xxhash3_impl<bit_mode>(static_cast<const void*>(input.data()), input.length() * sizeof(T), seed);
	}

	template <size_t bit_mode, typename T>
	inline hash_t<bit_mode> xxhash3(const std::basic_string<T>& input, const void* secret, size_t secretSize, uint64_t seed = 0)
	{
		static_assert(bit_mode == 64 || bit_mode == 128, "xxhash3 can only be used in 64 and 128 bit modes.");
		return detail3::xxhash3_impl<bit_mode>(static_cast<const void*>(input.data()), input.length() * sizeof(T), seed, secret, secretSize);
	}

	template <size_t bit_mode, typename ContiguousIterator>
	inline hash_t<bit_mode> xxhash3(ContiguousIterator begin, ContiguousIterator end, uint64_t seed = 0)
	{
		static_assert(bit_mode == 64 || bit_mode == 128, "xxhash3 can only be used in 64 and 128 bit modes.");
		using T = typename std::decay_t<decltype(*end)>;
		return detail3::xxhash3_impl<bit_mode>(static_cast<const void*>(&*begin), (end - begin) * sizeof(T), seed);
	}

	template <size_t bit_mode, typename ContiguousIterator>
	inline hash_t<bit_mode> xxhash3(ContiguousIterator begin, ContiguousIterator end, const void* secret, size_t secretSize, uint64_t seed = 0)
	{
		static_assert(bit_mode == 64 || bit_mode == 128, "xxhash3 can only be used in 64 and 128 bit modes.");
		using T = typename std::decay_t<decltype(*end)>;
		return detail3::xxhash3_impl<bit_mode>(static_cast<const void*>(&*begin), (end - begin) * sizeof(T), seed, secret, secretSize);
	}

	template <size_t bit_mode, typename T>
	inline hash_t<bit_mode> xxhash3(const std::vector<T>& input, uint64_t seed = 0)
	{
		static_assert(bit_mode == 64 || bit_mode == 128, "xxhash3 can only be used in 64 and 128 bit modes.");
		return detail3::xxhash3_impl<bit_mode>(static_cast<const void*>(input.data()), input.size() * sizeof(T), seed);
	}

	template <size_t bit_mode, typename T>
	inline hash_t<bit_mode> xxhash3(const std::vector<T>& input, const void* secret, size_t secretSize, uint64_t seed = 0)
	{
		static_assert(bit_mode == 64 || bit_mode == 128, "xxhash3 can only be used in 64 and 128 bit modes.");
		return detail3::xxhash3_impl<bit_mode>(static_cast<const void*>(input.data()), input.size() * sizeof(T), seed, secret, secretSize);
	}

	template <size_t bit_mode, typename T, size_t AN>
	inline hash_t<bit_mode> xxhash3(const std::array<T, AN>& input, uint64_t seed = 0)
	{
		static_assert(bit_mode == 64 || bit_mode == 128, "xxhash3 can only be used in 64 and 128 bit modes.");
		return detail3::xxhash3_impl<bit_mode>(static_cast<const void*>(input.data()), AN * sizeof(T), seed);
	}

	template <size_t bit_mode, typename T, size_t AN>
	inline hash_t<bit_mode> xxhash3(const std::array<T, AN>& input, const void* secret, size_t secretSize, uint64_t seed = 0)
	{
		static_assert(bit_mode == 64 || bit_mode == 128, "xxhash3 can only be used in 64 and 128 bit modes.");
		return detail3::xxhash3_impl<bit_mode>(static_cast<const void*>(input.data()), AN * sizeof(T), seed, secret, secretSize);
	}

	template <size_t bit_mode, typename T>
	inline hash_t<bit_mode> xxhash3(const std::initializer_list<T>& input, uint64_t seed = 0)
	{
		static_assert(bit_mode == 64 || bit_mode == 128, "xxhash3 can only be used in 64 and 128 bit modes.");
		return detail3::xxhash3_impl<bit_mode>(static_cast<const void*>(input.begin()), input.size() * sizeof(T), seed);
	}

	template <size_t bit_mode, typename T>
	inline hash_t<bit_mode> xxhash3(const std::initializer_list<T>& input, const void* secret, size_t secretSize, uint64_t seed = 0)
	{
		static_assert(bit_mode == 64 || bit_mode == 128, "xxhash3 can only be used in 64 and 128 bit modes.");
		return detail3::xxhash3_impl<bit_mode>(static_cast<const void*>(input.begin()), input.size() * sizeof(T), seed, secret, secretSize);
	}
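
	/* Example (sketch): XXH3 in both widths over a std::string; "data" is a
	   hypothetical name:

	       std::string data = "some payload";
	       xxh::hash64_t h64 = xxh::xxhash3<64>(data);
	       xxh::hash128_t h128 = xxh::xxhash3<128>(data, 7);   // seeded
	*/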


	/* *************************************
	*  Secret Generation Functions
	***************************************/

	inline void generate_secret(void* secret_buffer, size_t secret_size, const void* custom_seed = detail3::default_secret, size_t seed_length = 0)
	{
		detail3::generate_secret(secret_buffer, secret_size, custom_seed, seed_length);
	}

	template <typename T, size_t AN>
	inline void generate_secret(void* secret_buffer, size_t secret_size, const std::array<T, AN>& custom_seed)
	{
		detail3::generate_secret(secret_buffer, secret_size, static_cast<const void*>(custom_seed.data()), AN * sizeof(T));
	}

	template <typename T>
	inline void generate_secret(void* secret_buffer, size_t secret_size, const std::initializer_list<T>& custom_seed)
	{
		detail3::generate_secret(secret_buffer, secret_size, static_cast<const void*>(custom_seed.begin()), custom_seed.size() * sizeof(T));
	}

	template <typename T>
	inline void generate_secret(void* secret_buffer, size_t secret_size, const std::vector<T>& custom_seed)
	{
		detail3::generate_secret(secret_buffer, secret_size, static_cast<const void*>(custom_seed.data()), custom_seed.size() * sizeof(T));
	}

	template <typename T>
	inline void generate_secret(void* secret_buffer, size_t secret_size, const std::basic_string<T>& custom_seed)
	{
		detail3::generate_secret(secret_buffer, secret_size, static_cast<const void*>(custom_seed.data()), custom_seed.length() * sizeof(T));
	}

	template <typename ContiguousIterator>
	inline void generate_secret(void* secret_buffer, size_t secret_size, ContiguousIterator begin, ContiguousIterator end)
	{
		using T = typename std::decay_t<decltype(*end)>;
		detail3::generate_secret(secret_buffer, secret_size, static_cast<const void*>(&*begin), (end - begin) * sizeof(T));
	}

	inline void generate_secret_from_seed(void* secret_buffer, uint64_t seed = 0)
	{
		alignas(64) uint8_t custom_secret[detail3::secret_default_size];
		detail3::init_custom_secret(custom_secret, seed);
		memcpy(secret_buffer, custom_secret, detail3::secret_default_size);
	}
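
	/* Example (sketch): deriving a reusable secret from seed material, then hashing
	   with it. The 192-byte size is assumed to match detail3::secret_default_size,
	   and "payload" / "payload_len" are hypothetical:

	       std::array<uint8_t, 192> secret;
	       xxh::generate_secret(secret.data(), secret.size(), std::string("seed material"));
	       auto h = xxh::xxhash3<64>(payload, payload_len, secret.data(), secret.size());
	*/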


	/* *************************************
	*  Hash streaming - xxhash
	***************************************/

	template <size_t bit_mode>
	class hash_state_t
	{
		uint64_t total_len = 0;
		uint_t<bit_mode> v1 = 0, v2 = 0, v3 = 0, v4 = 0;
		std::array<hash_t<bit_mode>, 4> mem = { 0, 0, 0, 0 };
		uint32_t memsize = 0;

		inline void update_impl(const void* input, size_t length)
		{
			const uint8_t* p = reinterpret_cast<const uint8_t*>(input);
			const uint8_t* const bEnd = p + length;

			total_len += length;

			if (memsize + length < (bit_mode / 2))
			{   /* fill in tmp buffer */
				memcpy(reinterpret_cast<uint8_t*>(mem.data()) + memsize, input, length);
				memsize += static_cast<uint32_t>(length);
				return;
			}

			if (memsize > 0)
			{   /* some data left from previous update */
				memcpy(reinterpret_cast<uint8_t*>(mem.data()) + memsize, input, (bit_mode / 2) - memsize);

				const uint_t<bit_mode>* ptr = mem.data();

				v1 = detail::round<bit_mode>(v1, mem_ops::readLE<bit_mode>(ptr));
				ptr++;
				v2 = detail::round<bit_mode>(v2, mem_ops::readLE<bit_mode>(ptr));
				ptr++;
				v3 = detail::round<bit_mode>(v3, mem_ops::readLE<bit_mode>(ptr));
				ptr++;
				v4 = detail::round<bit_mode>(v4, mem_ops::readLE<bit_mode>(ptr));

				p += (bit_mode / 2) - memsize;
				memsize = 0;
			}

			if (p <= bEnd - (bit_mode / 2))
			{
				const uint8_t* const limit = bEnd - (bit_mode / 2);

				do
				{
					v1 = detail::round<bit_mode>(v1, mem_ops::readLE<bit_mode>(p));
					p += (bit_mode / 8);
					v2 = detail::round<bit_mode>(v2, mem_ops::readLE<bit_mode>(p));
					p += (bit_mode / 8);
					v3 = detail::round<bit_mode>(v3, mem_ops::readLE<bit_mode>(p));
					p += (bit_mode / 8);
					v4 = detail::round<bit_mode>(v4, mem_ops::readLE<bit_mode>(p));
					p += (bit_mode / 8);
				}
				while (p <= limit);
			}

			if (p < bEnd)
			{
				memcpy(mem.data(), p, static_cast<size_t>(bEnd - p));
				memsize = static_cast<uint32_t>(bEnd - p);
			}
		}

		inline hash_t<bit_mode> digest_impl() const
		{
			const uint8_t* p = reinterpret_cast<const uint8_t*>(mem.data());
			const uint8_t* const bEnd = reinterpret_cast<const uint8_t*>(mem.data()) + memsize;
			hash_t<bit_mode> hash_ret;

			if (total_len >= (bit_mode / 2))
			{
				hash_ret = bit_ops::rotl<bit_mode>(v1, 1) + bit_ops::rotl<bit_mode>(v2, 7) + bit_ops::rotl<bit_mode>(v3, 12) + bit_ops::rotl<bit_mode>(v4, 18);

				if constexpr (bit_mode == 64)
				{
					detail::endian_align_sub_mergeround(hash_ret, v1, v2, v3, v4);
				}
			}
			else
			{
				hash_ret = v3 + detail::PRIME<bit_mode>(5);
			}

			hash_ret += static_cast<hash_t<bit_mode>>(total_len);

			return detail::endian_align_sub_ending<bit_mode>(hash_ret, p, bEnd);
		}

	public:

		hash_state_t(uint_t<bit_mode> seed = 0)
		{
			static_assert(bit_mode == 32 || bit_mode == 64, "xxhash streaming can only be used in 32 and 64 bit modes.");
			v1 = seed + detail::PRIME<bit_mode>(1) + detail::PRIME<bit_mode>(2);
			v2 = seed + detail::PRIME<bit_mode>(2);
			v3 = seed + 0;
			v4 = seed - detail::PRIME<bit_mode>(1);
		}

		hash_state_t& operator=(const hash_state_t<bit_mode>& other)
		{
			memcpy(this, &other, sizeof(hash_state_t<bit_mode>));
			return *this; /* was missing: assignment must return the object */
		}

		void reset(uint_t<bit_mode> seed = 0)
		{
			memset(this, 0, sizeof(hash_state_t<bit_mode>));
			v1 = seed + detail::PRIME<bit_mode>(1) + detail::PRIME<bit_mode>(2);
			v2 = seed + detail::PRIME<bit_mode>(2);
			v3 = seed + 0;
			v4 = seed - detail::PRIME<bit_mode>(1);
		}

		void update(const void* input, size_t length)
		{
			return update_impl(input, length);
		}

		template <typename T>
		void update(const std::basic_string<T>& input)
		{
			return update_impl(static_cast<const void*>(input.data()), input.length() * sizeof(T));
		}

		template <typename ContiguousIterator>
		void update(ContiguousIterator begin, ContiguousIterator end)
		{
			using T = typename std::decay_t<decltype(*end)>;
			return update_impl(static_cast<const void*>(&*begin), (end - begin) * sizeof(T));
		}

		template <typename T>
		void update(const std::vector<T>& input)
		{
			return update_impl(static_cast<const void*>(input.data()), input.size() * sizeof(T));
		}

		template <typename T, size_t AN>
		void update(const std::array<T, AN>& input)
		{
			return update_impl(static_cast<const void*>(input.data()), AN * sizeof(T));
		}

		template <typename T>
		void update(const std::initializer_list<T>& input)
		{
			return update_impl(static_cast<const void*>(input.begin()), input.size() * sizeof(T));
		}

		hash_t<bit_mode> digest() const
		{
			return digest_impl();
		}
	};

	using hash_state32_t = hash_state_t<32>;
	using hash_state64_t = hash_state_t<64>;
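
	/* Example (sketch): incremental hashing; the chunk names are hypothetical:

	       xxh::hash_state64_t state(42);            // seeded state
	       state.update(chunk, chunk_len);
	       state.update(std::string("more data"));
	       xxh::hash64_t h = state.digest();         // digest() leaves the state intact
	*/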


	/* *************************************
	*  Hash streaming - xxhash3
	***************************************/

	template <size_t bit_mode>
	class alignas(64) hash3_state_t
	{
		constexpr static int internal_buffer_size = 256;
		constexpr static int internal_buffer_stripes = (internal_buffer_size / detail3::stripe_len);

		alignas(64) uint64_t acc[8];
		alignas(64) uint8_t customSecret[detail3::secret_default_size];  /* used to store a custom secret generated from the seed. Makes the state larger. Design might change. */
		alignas(64) uint8_t buffer[internal_buffer_size];
		uint32_t bufferedSize = 0;
		uint32_t nbStripesPerBlock = 0;
		uint32_t nbStripesSoFar = 0;
		uint32_t secretLimit = 0;
		uint32_t reserved32 = 0;
		uint32_t reserved32_2 = 0;
		uint64_t totalLen = 0;
		uint64_t seed = 0;
		bool useSeed = false;
		uint64_t reserved64 = 0;
		const uint8_t* secret = nullptr;    /* note: there is some padding after, due to alignment on 64 bytes */


		void consume_stripes(uint64_t* acc, uint32_t& nbStripesSoFar, size_t totalStripes, const uint8_t* input)
		{
			if (nbStripesPerBlock - nbStripesSoFar <= totalStripes) /* need a scrambling operation */
			{
				size_t const nbStripes = nbStripesPerBlock - nbStripesSoFar;

				detail3::accumulate(acc, input, secret + (nbStripesSoFar * detail3::secret_consume_rate), nbStripes);
				detail3::scramble_acc(acc, secret + secretLimit);
				detail3::accumulate(acc, input + nbStripes * detail3::stripe_len, secret, totalStripes - nbStripes);
				nbStripesSoFar = (uint32_t)(totalStripes - nbStripes);
			}
			else
			{
				detail3::accumulate(acc, input, secret + (nbStripesSoFar * detail3::secret_consume_rate), totalStripes);
				nbStripesSoFar += (uint32_t)totalStripes;
			}
		}

		void update_impl(const void* input_, size_t len)
		{
			const uint8_t* input = static_cast<const uint8_t*>(input_);
			const uint8_t* const bEnd = input + len;

			totalLen += len;

			if (bufferedSize + len <= internal_buffer_size)
			{  /* fill in tmp buffer */
				memcpy(buffer + bufferedSize, input, len);
				bufferedSize += (uint32_t)len;
				return;
			}
			/* input is now larger than internal_buffer_size */

			if (bufferedSize > 0)
			{   /* some input within internal buffer: fill then consume it */
				size_t const loadSize = internal_buffer_size - bufferedSize;

				memcpy(buffer + bufferedSize, input, loadSize);
				input += loadSize;
				consume_stripes(acc, nbStripesSoFar, internal_buffer_stripes, buffer);
				bufferedSize = 0;
			}

			/* consume input by full buffer quantities */
			if (input + internal_buffer_size <= bEnd)
			{
				const uint8_t* const limit = bEnd - internal_buffer_size;

				do
				{
					consume_stripes(acc, nbStripesSoFar, internal_buffer_stripes, input);
					input += internal_buffer_size;
				}
				while (input < limit);

				memcpy(buffer + sizeof(buffer) - detail3::stripe_len, input - detail3::stripe_len, detail3::stripe_len);
			}

			if (input < bEnd)
			{ /* some remaining input: buffer it */
				memcpy(buffer, input, (size_t)(bEnd - input));
				bufferedSize = (uint32_t)(bEnd - input);
			}
		}

		void digest_long(uint64_t* acc_)
		{
			memcpy(acc_, acc, sizeof(acc));  /* digest locally; the state remains unaltered and can keep ingesting input afterwards */

			if (bufferedSize >= detail3::stripe_len)
			{
				size_t const totalNbStripes = (bufferedSize - 1) / detail3::stripe_len;
				uint32_t nbStripesSoFar = this->nbStripesSoFar;

				consume_stripes(acc_, nbStripesSoFar, totalNbStripes, buffer);

				/* one last partial stripe */
				detail3::accumulate_512(acc_, buffer + bufferedSize - detail3::stripe_len, secret + secretLimit - detail3::secret_lastacc_start);
			}
			else
			{   /* bufferedSize < stripe_len: reassemble one last stripe from the tail of the previous buffer */
				uint8_t lastStripe[detail3::stripe_len];
				size_t const catchupSize = detail3::stripe_len - bufferedSize;

				memcpy(lastStripe, buffer + sizeof(buffer) - catchupSize, catchupSize);
				memcpy(lastStripe + catchupSize, buffer, bufferedSize);
				detail3::accumulate_512(acc_, lastStripe, secret + secretLimit - detail3::secret_lastacc_start);
			}
		}

		void reset_internal(uint64_t seed_reset, const void* secret_reset, size_t secret_size)
		{
			memset(this, 0, sizeof(*this));
			memcpy(acc, detail3::init_acc.data(), sizeof(detail3::init_acc));
			seed = seed_reset;
			useSeed = (seed != 0);
			secret = (const uint8_t*)secret_reset;
			secretLimit = (uint32_t)(secret_size - detail3::stripe_len);
			nbStripesPerBlock = secretLimit / detail3::secret_consume_rate;
		}

	public:

		hash3_state_t& operator=(const hash3_state_t& other)
		{
			memcpy(this, &other, sizeof(hash3_state_t));
			if (secret == other.customSecret) { secret = customSecret; } /* keep secret pointing at our own copy */
			return *this; /* was missing: assignment must return the object */
		}

		hash3_state_t(uint64_t seed = 0)
		{
			static_assert(bit_mode == 64 || bit_mode == 128, "xxhash3 streaming can only be used in 64 and 128 bit modes.");
			reset(seed);
		}

		hash3_state_t(const void* secret, size_t secretSize, uint64_t seed = 0)
		{
			static_assert(bit_mode == 64 || bit_mode == 128, "xxhash3 streaming can only be used in 64 and 128 bit modes.");
			reset(secret, secretSize, seed);
		}

		void reset(uint64_t seed = 0)
		{
			reset_internal(seed, detail3::default_secret, detail3::secret_default_size);
			detail3::init_custom_secret(customSecret, seed);
			secret = customSecret;
2152                         /*
2153                         memset(this, 0, sizeof(*this));
2154                         memcpy(acc, detail3::init_acc.data(), sizeof(detail3::init_acc));
2155                         (*this).seed = seed;
2156
2157                         if (seed == 0)
2158                         {
2159                                 secret = detail3::default_secret;
2160                         }
2161                         else
2162                         {
2163                                 detail3::init_custom_secret(customSecret, seed);
2164                                 secret = customSecret;
2165                         }
2166
2167                         secretLimit = (uint32_t)(detail3::secret_default_size - detail3::stripe_len);
2168                         nbStripesPerBlock = secretLimit / detail3::secret_consume_rate;*/
		}

		void reset(const void* secret, size_t secretSize, uint64_t seed = 0)
		{
			reset_internal(seed, secret, secretSize);
			useSeed = true;
2175                         /*
2176
2177                         memset(this, 0, sizeof(*this));
2178                         memcpy(acc, detail3::init_acc.data(), sizeof(detail3::init_acc));
2179                         seed = 0;
2180
2181                         (*this).secret = (const uint8_t*)secret;
2182                         secretLimit = (uint32_t)(secretSize - detail3::stripe_len);
2183                         nbStripesPerBlock = secretLimit / detail3::secret_consume_rate;*/
		}

		void update(const void* input, size_t len)
		{
			return update_impl(input, len);
		}

		template <typename T>
		void update(const std::basic_string<T>& input)
		{
			return update_impl(static_cast<const void*>(input.data()), input.length() * sizeof(T));
		}

		template <typename ContiguousIterator>
		void update(ContiguousIterator begin, ContiguousIterator end)
		{
			using T = typename std::decay_t<decltype(*end)>;
			return update_impl(static_cast<const void*>(&*begin), (end - begin) * sizeof(T));
		}

		template <typename T>
		void update(const std::vector<T>& input)
		{
			return update_impl(static_cast<const void*>(input.data()), input.size() * sizeof(T));
		}

		template <typename T, size_t AN>
		void update(const std::array<T, AN>& input)
		{
			return update_impl(static_cast<const void*>(input.data()), AN * sizeof(T));
		}

		template <typename T>
		void update(const std::initializer_list<T>& input)
		{
			return update_impl(static_cast<const void*>(input.begin()), input.size() * sizeof(T));
		}

		hash_t<bit_mode> digest()
		{
			if (totalLen > detail3::midsize_max)
			{
				alignas(128) hash64_t acc[detail3::acc_nb];

				digest_long(acc);

				if constexpr (bit_mode == 64)
				{
					return detail3::merge_accs(acc, secret + detail3::secret_mergeaccs_start, (uint64_t)totalLen * detail::PRIME<64>(1));
				}
				else
				{
					uint64_t const low64 = detail3::merge_accs(acc, secret + detail3::secret_mergeaccs_start, (uint64_t)totalLen * detail::PRIME<64>(1));
					uint64_t const high64 = detail3::merge_accs(acc, secret + secretLimit + detail3::stripe_len - sizeof(acc) - detail3::secret_mergeaccs_start, ~((uint64_t)totalLen * detail::PRIME<64>(2)));

					return { low64, high64 };
				}
			}
			else
			{   /* short inputs never left the internal buffer: hash them in one shot */
				return detail3::xxhash3_impl<bit_mode>(buffer, totalLen, seed, secret, secretLimit + detail3::stripe_len);
			}
		}
	};

	using hash3_state64_t = hash3_state_t<64>;
	using hash3_state128_t = hash3_state_t<128>;
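
	/* Example (sketch): streaming XXH3-128 with a caller-provided secret; the
	   "my_secret" / "buf" names are hypothetical:

	       xxh::hash3_state128_t st(my_secret, my_secret_len);
	       st.update(buf, buf_len);
	       xxh::hash128_t h = st.digest();   // the state can keep ingesting afterwards
	*/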
}