745 lines
26 KiB
C
745 lines
26 KiB
C
/* ========================================================================
|
|
|
|
Meow - A Fast Non-cryptographic Hash
|
|
(C) Copyright 2018-2019 by Molly Rocket, Inc. (https://mollyrocket.com)
|
|
|
|
See https://mollyrocket.com/meowhash for details.
|
|
|
|
========================================================================
|
|
|
|
zlib License
|
|
|
|
(C) Copyright 2018-2019 Molly Rocket, Inc.
|
|
|
|
This software is provided 'as-is', without any express or implied
|
|
warranty. In no event will the authors be held liable for any damages
|
|
arising from the use of this software.
|
|
|
|
Permission is granted to anyone to use this software for any purpose,
|
|
including commercial applications, and to alter it and redistribute it
|
|
freely, subject to the following restrictions:
|
|
|
|
1. The origin of this software must not be misrepresented; you must not
|
|
claim that you wrote the original software. If you use this software
|
|
in a product, an acknowledgment in the product documentation would be
|
|
appreciated but is not required.
|
|
2. Altered source versions must be plainly marked as such, and must not be
|
|
misrepresented as being the original software.
|
|
3. This notice may not be removed or altered from any source distribution.
|
|
|
|
========================================================================
|
|
|
|
FAQ
|
|
|
|
Q: What is it?
|
|
|
|
A: Meow is a 128-bit Level 3 hash taking 128 bytes of seed. It operates
|
|
at very high speeds on x64 processors, and potentially other processors
|
|
that provide accelerated AES instructions.
|
|
|
|
Q: What is it GOOD for?
|
|
|
|
A: Quickly hashing any amount of data for comparison purposes such as
|
|
block deduplication or change detection. It is fast on all buffer
|
|
sizes, and can generally be used anywhere you need fast Level 3
|
|
hashing without worrying about how big or small the inputs tend to be.
|
|
|
|
However, substantial speed improvements could be made over Meow
|
|
if you either a) know you are always hashing an exact, small number of bytes,
|
|
or b) can always supply a small number of bytes in a buffer padded to some
|
|
fixed multiple of 16.
|
|
|
|
Q: What is it BAD for?
|
|
|
|
A: Anything requiring Level 4 or Level 5 security guarantees (see
|
|
http://nohatcoder.dk/2019-05-19-1.html#level3). Also, note that
|
|
Meow is a new hash and has not had the extensive community
|
|
cryptanalysis necessary to ensure that it is not breakable down to
|
|
a lower level of hash, so you must do your due diligence in
|
|
deciding when and where to use Meow instead of a slower but
|
|
more extensively studied existing hash. We have tried to design
|
|
it to provide Level 3 security, but the possibility of the hash
|
|
being broken in the future always exists.
|
|
|
|
Q: Why is it called the "Meow hash"?
|
|
|
|
A: It is named after a character in Meow the Infinite
|
|
(https://meowtheinfinite.com)
|
|
|
|
Q: Who wrote it?
|
|
|
|
A: The final Meow Hash was created as a collaboration between
|
|
JACOB CHRISTIAN MUNCH-ANDERSEN (https://twitter.com/nohatcoder) and
|
|
CASEY MURATORI (https://caseymuratori.com). Casey wrote the original
|
|
implementation for use in processing large-footprint assets for the
|
|
game 1935 (https://molly1935.com). Jacob was the first to analyze
|
|
that implementation and determine the adversarial bit strength, which
|
|
was weaker than they would have liked.
|
|
|
|
Following that, the two collaborated to figure out how the hash
|
|
could be strengthened without reducing Meow's 16 bytes/cycle
|
|
maximum theoretical throughput. Jacob created the hash candidates
|
|
and Casey did the performance validation. After a long and
|
|
exhaustive effort, Jacob found the unaligned aes/add/xor formulation
|
|
that forms the current Meow hash core.
|
|
|
|
A number of valuable additions to Meow Hash were also contributed
|
|
by other great folks along the way:
|
|
|
|
JEFF ROBERTS (https://radgametools.com) provided a super slick
|
|
way to handle the residual end-of-buffer bytes that dramatically
|
|
improved Meow's small hash performance.
|
|
|
|
MARTINS MOZEIKO (https://matrins.ninja) ported Meow to ARM and
|
|
ANSI-C, and added the proper preprocessor dressing for clean
|
|
compilation on a variety of compiler configurations.
|
|
|
|
FABIAN GIESEN (https://fgiesen.wordpress.com) analyzed many
|
|
performance oddities that came up during development, and
|
|
helped get the benchmarking working properly across a number
|
|
of platforms.
|
|
|
|
ARAS PRANCKEVICIUS (https://aras-p.info) provided the allocation
|
|
shim for compilation on Mac OS X.
|
|
|
|
======================================================================== */
|
|
|
|
//
|
|
// IMPORTANT(casey): We are currently evaluating this hash construction as
|
|
// the final one for Meow Hash. If you find a way to produce collisions
|
|
// that should not be possible with a Level 3 hash, find significant performance
|
|
// problems, or see any bugs in this version, please be sure to report them
|
|
// to the Meow Hash GitHub as soon as possible. We would like to know as
|
|
// much as we can about the robustness and performance before committing to
|
|
// it as the final construction.
|
|
//
|
|
|
|
#if !defined(MEOW_HASH_X64_AESNI_H)
|
|
|
|
#define MEOW_HASH_VERSION 5
|
|
#define MEOW_HASH_VERSION_NAME "0.5/calico"
|
|
|
|
#if !defined(meow_u8)
|
|
|
|
#if _MSC_VER
|
|
#if !defined(__clang__)
|
|
#define INSTRUCTION_REORDER_BARRIER _ReadWriteBarrier()
|
|
#else
|
|
#endif
|
|
#include <intrin.h>
|
|
#else
|
|
#include <x86intrin.h>
|
|
#endif
|
|
|
|
#define meow_u8 char unsigned
|
|
#define meow_u64 long long unsigned
|
|
#define meow_u128 __m128i
|
|
|
|
#if __x86_64__ || _M_AMD64
|
|
#define meow_umm long long unsigned
|
|
#define MeowU64From(A, I) (_mm_extract_epi64((A), (I)))
|
|
#elif __i386__ || _M_IX86
|
|
#define meow_umm int unsigned
|
|
#define MeowU64From(A, I) (*(meow_u64 *)&(A))
|
|
#else
|
|
#error Cannot determine architecture to use!
|
|
#endif
|
|
|
|
#define MeowU32From(A, I) (_mm_extract_epi32((A), (I)))
|
|
#define MeowHashesAreEqual(A, B) (_mm_movemask_epi8(_mm_cmpeq_epi8((A), (B))) == 0xFFFF)
|
|
|
|
#if !defined INSTRUCTION_REORDER_BARRIER
|
|
#define INSTRUCTION_REORDER_BARRIER
|
|
#endif
|
|
|
|
#if !defined MEOW_PAGESIZE
|
|
#define MEOW_PAGESIZE 4096
|
|
#endif
|
|
|
|
#if !defined MEOW_PREFETCH
|
|
#define MEOW_PREFETCH 4096
|
|
#endif
|
|
|
|
#if !defined MEOW_PREFETCH_LIMIT
|
|
#define MEOW_PREFETCH_LIMIT 0x3ff
|
|
#endif
|
|
|
|
#endif
|
|
|
|
#define prefetcht0(A) _mm_prefetch((char *)(A), _MM_HINT_T0)
|
|
#define movdqu(A, B) A = _mm_loadu_si128((__m128i *)(B))
|
|
#define movdqu_mem(A, B) _mm_storeu_si128((__m128i *)(A), B)
|
|
#define movq(A, B) A = _mm_set_epi64x(0, B);
|
|
#define aesdec(A, B) A = _mm_aesdec_si128(A, B)
|
|
#define pshufb(A, B) A = _mm_shuffle_epi8(A, B)
|
|
#define pxor(A, B) A = _mm_xor_si128(A, B)
|
|
#define paddq(A, B) A = _mm_add_epi64(A, B)
|
|
#define pand(A, B) A = _mm_and_si128(A, B)
|
|
#define palignr(A, B, i) A = _mm_alignr_epi8(A, B, i)
|
|
#define pxor_clear(A, B) A = _mm_setzero_si128(); // NOTE(casey): pxor_clear is a nonsense thing that is only here because compilers don't detect xor(a, a) is clearing a :(
|
|
|
|
#define MEOW_MIX_REG(r1, r2, r3, r4, r5, i1, i2, i3, i4) \
|
|
aesdec(r1, r2); \
|
|
INSTRUCTION_REORDER_BARRIER; \
|
|
paddq(r3, i1); \
|
|
pxor(r2, i2); \
|
|
aesdec(r2, r4); \
|
|
INSTRUCTION_REORDER_BARRIER; \
|
|
paddq(r5, i3); \
|
|
pxor(r4, i4);
|
|
|
|
#define MEOW_MIX(r1, r2, r3, r4, r5, ptr) \
|
|
MEOW_MIX_REG(r1, r2, r3, r4, r5, _mm_loadu_si128( (__m128i *) ((ptr) + 15) ), _mm_loadu_si128( (__m128i *) ((ptr) + 0) ), _mm_loadu_si128( (__m128i *) ((ptr) + 1) ), _mm_loadu_si128( (__m128i *) ((ptr) + 16) ))
|
|
|
|
#define MEOW_SHUFFLE(r1, r2, r3, r4, r5, r6) \
|
|
aesdec(r1, r4); \
|
|
paddq(r2, r5); \
|
|
pxor(r4, r6); \
|
|
aesdec(r4, r2); \
|
|
paddq(r5, r6); \
|
|
pxor(r2, r3)
|
|
|
|
#if MEOW_DUMP
|
|
struct meow_dump
|
|
{
|
|
meow_u128 xmm[8];
|
|
void *Ptr;
|
|
char const *Title;
|
|
};
|
|
extern "C" meow_dump *MeowDumpTo;
|
|
meow_dump *MeowDumpTo;
|
|
#define MEOW_DUMP_STATE(T, xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, ptr) \
|
|
if(MeowDumpTo) \
|
|
{ \
|
|
MeowDumpTo->xmm[0] = xmm0; \
|
|
MeowDumpTo->xmm[1] = xmm1; \
|
|
MeowDumpTo->xmm[2] = xmm2; \
|
|
MeowDumpTo->xmm[3] = xmm3; \
|
|
MeowDumpTo->xmm[4] = xmm4; \
|
|
MeowDumpTo->xmm[5] = xmm5; \
|
|
MeowDumpTo->xmm[6] = xmm6; \
|
|
MeowDumpTo->xmm[7] = xmm7; \
|
|
MeowDumpTo->Ptr = ptr; \
|
|
MeowDumpTo->Title = T; \
|
|
++MeowDumpTo; \
|
|
}
|
|
#else
|
|
#define MEOW_DUMP_STATE(...)
|
|
#endif
|
|
|
|
static meow_u8 MeowShiftAdjust[32] = {0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15};
|
|
static meow_u8 MeowMaskLen[32] = {255,255,255,255, 255,255,255,255, 255,255,255,255, 255,255,255,255, 0,0,0,0, 0,0,0,0, 0,0,0,0, 0,0,0,0};
|
|
|
|
// NOTE(casey): The default seed is now a "nothing-up-our-sleeves" number for good measure. You may verify that it is just an encoding of Pi.
|
|
static meow_u8 MeowDefaultSeed[128] =
|
|
{
|
|
0x32, 0x43, 0xF6, 0xA8, 0x88, 0x5A, 0x30, 0x8D,
|
|
0x31, 0x31, 0x98, 0xA2, 0xE0, 0x37, 0x07, 0x34,
|
|
0x4A, 0x40, 0x93, 0x82, 0x22, 0x99, 0xF3, 0x1D,
|
|
0x00, 0x82, 0xEF, 0xA9, 0x8E, 0xC4, 0xE6, 0xC8,
|
|
0x94, 0x52, 0x82, 0x1E, 0x63, 0x8D, 0x01, 0x37,
|
|
0x7B, 0xE5, 0x46, 0x6C, 0xF3, 0x4E, 0x90, 0xC6,
|
|
0xCC, 0x0A, 0xC2, 0x9B, 0x7C, 0x97, 0xC5, 0x0D,
|
|
0xD3, 0xF8, 0x4D, 0x5B, 0x5B, 0x54, 0x70, 0x91,
|
|
0x79, 0x21, 0x6D, 0x5D, 0x98, 0x97, 0x9F, 0xB1,
|
|
0xBD, 0x13, 0x10, 0xBA, 0x69, 0x8D, 0xFB, 0x5A,
|
|
0xC2, 0xFF, 0xD7, 0x2D, 0xBD, 0x01, 0xAD, 0xFB,
|
|
0x7B, 0x8E, 0x1A, 0xFE, 0xD6, 0xA2, 0x67, 0xE9,
|
|
0x6B, 0xA7, 0xC9, 0x04, 0x5F, 0x12, 0xC7, 0xF9,
|
|
0x92, 0x4A, 0x19, 0x94, 0x7B, 0x39, 0x16, 0xCF,
|
|
0x70, 0x80, 0x1F, 0x2E, 0x28, 0x58, 0xEF, 0xC1,
|
|
0x66, 0x36, 0x92, 0x0D, 0x87, 0x15, 0x74, 0xE6
|
|
};
|
|
|
|
//
|
|
// NOTE(casey): Single block version
|
|
//
|
|
|
|
static meow_u128
|
|
MeowHash(void *Seed128Init, meow_umm Len, void *SourceInit)
|
|
{
|
|
meow_u128 xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7; // NOTE(casey): xmm0-xmm7 are the hash accumulation lanes
|
|
meow_u128 xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15; // NOTE(casey): xmm8-xmm15 hold values to be appended (residual, length)
|
|
|
|
meow_u8 *rax = (meow_u8 *)SourceInit;
|
|
meow_u8 *rcx = (meow_u8 *)Seed128Init;
|
|
|
|
//
|
|
// NOTE(casey): Seed the eight hash registers
|
|
//
|
|
|
|
movdqu(xmm0, rcx + 0x00);
|
|
movdqu(xmm1, rcx + 0x10);
|
|
movdqu(xmm2, rcx + 0x20);
|
|
movdqu(xmm3, rcx + 0x30);
|
|
|
|
movdqu(xmm4, rcx + 0x40);
|
|
movdqu(xmm5, rcx + 0x50);
|
|
movdqu(xmm6, rcx + 0x60);
|
|
movdqu(xmm7, rcx + 0x70);
|
|
|
|
MEOW_DUMP_STATE("Seed", xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, 0);
|
|
|
|
//
|
|
// NOTE(casey): Hash all full 256-byte blocks
|
|
//
|
|
|
|
meow_umm BlockCount = (Len >> 8);
|
|
if(BlockCount > MEOW_PREFETCH_LIMIT)
|
|
{
|
|
// NOTE(casey): For large input, modern Intel x64's can't hit full speed without prefetching, so we use this loop
|
|
while(BlockCount--)
|
|
{
|
|
prefetcht0(rax + MEOW_PREFETCH + 0x00);
|
|
prefetcht0(rax + MEOW_PREFETCH + 0x40);
|
|
prefetcht0(rax + MEOW_PREFETCH + 0x80);
|
|
prefetcht0(rax + MEOW_PREFETCH + 0xc0);
|
|
|
|
MEOW_MIX(xmm0,xmm4,xmm6,xmm1,xmm2, rax + 0x00);
|
|
MEOW_MIX(xmm1,xmm5,xmm7,xmm2,xmm3, rax + 0x20);
|
|
MEOW_MIX(xmm2,xmm6,xmm0,xmm3,xmm4, rax + 0x40);
|
|
MEOW_MIX(xmm3,xmm7,xmm1,xmm4,xmm5, rax + 0x60);
|
|
MEOW_MIX(xmm4,xmm0,xmm2,xmm5,xmm6, rax + 0x80);
|
|
MEOW_MIX(xmm5,xmm1,xmm3,xmm6,xmm7, rax + 0xa0);
|
|
MEOW_MIX(xmm6,xmm2,xmm4,xmm7,xmm0, rax + 0xc0);
|
|
MEOW_MIX(xmm7,xmm3,xmm5,xmm0,xmm1, rax + 0xe0);
|
|
|
|
rax += 0x100;
|
|
}
|
|
}
|
|
else
|
|
{
|
|
// NOTE(casey): For small input, modern Intel x64's can't hit full speed _with_ prefetching (because of port pressure), so we use this loop.
|
|
while(BlockCount--)
|
|
{
|
|
MEOW_MIX(xmm0,xmm4,xmm6,xmm1,xmm2, rax + 0x00);
|
|
MEOW_MIX(xmm1,xmm5,xmm7,xmm2,xmm3, rax + 0x20);
|
|
MEOW_MIX(xmm2,xmm6,xmm0,xmm3,xmm4, rax + 0x40);
|
|
MEOW_MIX(xmm3,xmm7,xmm1,xmm4,xmm5, rax + 0x60);
|
|
MEOW_MIX(xmm4,xmm0,xmm2,xmm5,xmm6, rax + 0x80);
|
|
MEOW_MIX(xmm5,xmm1,xmm3,xmm6,xmm7, rax + 0xa0);
|
|
MEOW_MIX(xmm6,xmm2,xmm4,xmm7,xmm0, rax + 0xc0);
|
|
MEOW_MIX(xmm7,xmm3,xmm5,xmm0,xmm1, rax + 0xe0);
|
|
|
|
rax += 0x100;
|
|
}
|
|
}
|
|
|
|
MEOW_DUMP_STATE("PostBlocks", xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, 0);
|
|
|
|
//
|
|
// NOTE(casey): Load any less-than-32-byte residual
|
|
//
|
|
|
|
pxor_clear(xmm9, xmm9);
|
|
pxor_clear(xmm11, xmm11);
|
|
|
|
//
|
|
// TODO(casey): I need to put more thought into how the end-of-buffer stuff is actually working out here,
|
|
// because I _think_ it may be possible to remove the first branch (on Len8) and let the mask zero out the
|
|
// result, but it would take a little thought to make sure it couldn't read off the end of the buffer due
|
|
// to the & 0xf on the align computation.
|
|
//
|
|
|
|
// NOTE(casey): First, we have to load the part that is _not_ 16-byte aligned
|
|
meow_u8 *Last = (meow_u8 *)SourceInit + (Len & ~0xf);
|
|
int unsigned Len8 = (Len & 0xf);
|
|
if(Len8)
|
|
{
|
|
// NOTE(casey): Load the mask early
|
|
movdqu(xmm8, &MeowMaskLen[0x10 - Len8]);
|
|
|
|
meow_u8 *LastOk = (meow_u8*)((((meow_umm)(((meow_u8 *)SourceInit)+Len - 1)) | (MEOW_PAGESIZE - 1)) - 16);
|
|
int Align = (Last > LastOk) ? ((int)(meow_umm)Last) & 0xf : 0;
|
|
movdqu(xmm10, &MeowShiftAdjust[Align]);
|
|
movdqu(xmm9, Last - Align);
|
|
pshufb(xmm9, xmm10);
|
|
|
|
// NOTE(jeffr): and off the extra bytes
|
|
pand(xmm9, xmm8);
|
|
}
|
|
|
|
// NOTE(casey): Next, we have to load the part that _is_ 16-byte aligned
|
|
if(Len & 0x10)
|
|
{
|
|
xmm11 = xmm9;
|
|
movdqu(xmm9, Last - 0x10);
|
|
}
|
|
|
|
//
|
|
// NOTE(casey): Construct the residual and length injests
|
|
//
|
|
|
|
xmm8 = xmm9;
|
|
xmm10 = xmm9;
|
|
palignr(xmm8, xmm11, 15);
|
|
palignr(xmm10, xmm11, 1);
|
|
|
|
// NOTE(casey): We have room for a 128-bit nonce and a 64-bit none here, but
|
|
// the decision was made to leave them zero'd so as not to confuse people
|
|
// about hwo to use them or what security implications they had.
|
|
pxor_clear(xmm12, xmm12);
|
|
pxor_clear(xmm13, xmm13);
|
|
pxor_clear(xmm14, xmm14);
|
|
movq(xmm15, Len);
|
|
palignr(xmm12, xmm15, 15);
|
|
palignr(xmm14, xmm15, 1);
|
|
|
|
MEOW_DUMP_STATE("Residuals", xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, 0);
|
|
|
|
// NOTE(casey): To maintain the mix-down pattern, we always Meow Mix the less-than-32-byte residual, even if it was empty
|
|
MEOW_MIX_REG(xmm0, xmm4, xmm6, xmm1, xmm2, xmm8, xmm9, xmm10, xmm11);
|
|
|
|
// NOTE(casey): Append the length, to avoid problems with our 32-byte padding
|
|
MEOW_MIX_REG(xmm1, xmm5, xmm7, xmm2, xmm3, xmm12, xmm13, xmm14, xmm15);
|
|
|
|
MEOW_DUMP_STATE("PostAppend", xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, 0);
|
|
|
|
//
|
|
// NOTE(casey): Hash all full 32-byte blocks
|
|
//
|
|
int unsigned LaneCount = (Len >> 5) & 0x7;
|
|
if(LaneCount == 0) goto MixDown; MEOW_MIX(xmm2,xmm6,xmm0,xmm3,xmm4, rax + 0x00); --LaneCount;
|
|
if(LaneCount == 0) goto MixDown; MEOW_MIX(xmm3,xmm7,xmm1,xmm4,xmm5, rax + 0x20); --LaneCount;
|
|
if(LaneCount == 0) goto MixDown; MEOW_MIX(xmm4,xmm0,xmm2,xmm5,xmm6, rax + 0x40); --LaneCount;
|
|
if(LaneCount == 0) goto MixDown; MEOW_MIX(xmm5,xmm1,xmm3,xmm6,xmm7, rax + 0x60); --LaneCount;
|
|
if(LaneCount == 0) goto MixDown; MEOW_MIX(xmm6,xmm2,xmm4,xmm7,xmm0, rax + 0x80); --LaneCount;
|
|
if(LaneCount == 0) goto MixDown; MEOW_MIX(xmm7,xmm3,xmm5,xmm0,xmm1, rax + 0xa0); --LaneCount;
|
|
if(LaneCount == 0) goto MixDown; MEOW_MIX(xmm0,xmm4,xmm6,xmm1,xmm2, rax + 0xc0); --LaneCount;
|
|
|
|
//
|
|
// NOTE(casey): Mix the eight lanes down to one 128-bit hash
|
|
//
|
|
|
|
MixDown:
|
|
|
|
MEOW_DUMP_STATE("PostLanes", xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, 0);
|
|
|
|
MEOW_SHUFFLE(xmm0, xmm1, xmm2, xmm4, xmm5, xmm6);
|
|
MEOW_SHUFFLE(xmm1, xmm2, xmm3, xmm5, xmm6, xmm7);
|
|
MEOW_SHUFFLE(xmm2, xmm3, xmm4, xmm6, xmm7, xmm0);
|
|
MEOW_SHUFFLE(xmm3, xmm4, xmm5, xmm7, xmm0, xmm1);
|
|
MEOW_SHUFFLE(xmm4, xmm5, xmm6, xmm0, xmm1, xmm2);
|
|
MEOW_SHUFFLE(xmm5, xmm6, xmm7, xmm1, xmm2, xmm3);
|
|
MEOW_SHUFFLE(xmm6, xmm7, xmm0, xmm2, xmm3, xmm4);
|
|
MEOW_SHUFFLE(xmm7, xmm0, xmm1, xmm3, xmm4, xmm5);
|
|
MEOW_SHUFFLE(xmm0, xmm1, xmm2, xmm4, xmm5, xmm6);
|
|
MEOW_SHUFFLE(xmm1, xmm2, xmm3, xmm5, xmm6, xmm7);
|
|
MEOW_SHUFFLE(xmm2, xmm3, xmm4, xmm6, xmm7, xmm0);
|
|
MEOW_SHUFFLE(xmm3, xmm4, xmm5, xmm7, xmm0, xmm1);
|
|
|
|
MEOW_DUMP_STATE("PostMix", xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, 0);
|
|
|
|
paddq(xmm0, xmm2);
|
|
paddq(xmm1, xmm3);
|
|
paddq(xmm4, xmm6);
|
|
paddq(xmm5, xmm7);
|
|
pxor(xmm0, xmm1);
|
|
pxor(xmm4, xmm5);
|
|
paddq(xmm0, xmm4);
|
|
|
|
MEOW_DUMP_STATE("PostFold", xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, 0);
|
|
|
|
return(xmm0);
|
|
}
|
|
|
|
//
|
|
// NOTE(casey): Streaming construction
|
|
//
|
|
|
|
typedef struct meow_state
|
|
{
|
|
meow_u128 xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7;
|
|
meow_u64 TotalLengthInBytes;
|
|
|
|
int unsigned BufferLen;
|
|
|
|
meow_u8 Buffer[256];
|
|
meow_u128 Pad[2]; // NOTE(casey): So we know we can over-read Buffer as necessary
|
|
} meow_state;
|
|
|
|
static void
|
|
MeowBegin(meow_state *State, void *Seed128)
|
|
{
|
|
meow_u8 *rcx = (meow_u8 *)Seed128;
|
|
|
|
movdqu(State->xmm0, rcx + 0x00);
|
|
movdqu(State->xmm1, rcx + 0x10);
|
|
movdqu(State->xmm2, rcx + 0x20);
|
|
movdqu(State->xmm3, rcx + 0x30);
|
|
movdqu(State->xmm4, rcx + 0x40);
|
|
movdqu(State->xmm5, rcx + 0x50);
|
|
movdqu(State->xmm6, rcx + 0x60);
|
|
movdqu(State->xmm7, rcx + 0x70);
|
|
|
|
MEOW_DUMP_STATE("Seed", State->xmm0, State->xmm1, State->xmm2, State->xmm3, State->xmm4, State->xmm5, State->xmm6, State->xmm7, 0);
|
|
|
|
State->BufferLen = 0;
|
|
State->TotalLengthInBytes = 0;
|
|
}
|
|
|
|
static void
|
|
MeowAbsorbBlocks(meow_state *State, meow_umm BlockCount, meow_u8 *rax)
|
|
{
|
|
meow_u128 xmm0 = State->xmm0;
|
|
meow_u128 xmm1 = State->xmm1;
|
|
meow_u128 xmm2 = State->xmm2;
|
|
meow_u128 xmm3 = State->xmm3;
|
|
meow_u128 xmm4 = State->xmm4;
|
|
meow_u128 xmm5 = State->xmm5;
|
|
meow_u128 xmm6 = State->xmm6;
|
|
meow_u128 xmm7 = State->xmm7;
|
|
|
|
if(BlockCount > MEOW_PREFETCH_LIMIT)
|
|
{
|
|
while(BlockCount--)
|
|
{
|
|
prefetcht0(rax + MEOW_PREFETCH + 0x00);
|
|
prefetcht0(rax + MEOW_PREFETCH + 0x40);
|
|
prefetcht0(rax + MEOW_PREFETCH + 0x80);
|
|
prefetcht0(rax + MEOW_PREFETCH + 0xc0);
|
|
|
|
MEOW_MIX(xmm0,xmm4,xmm6,xmm1,xmm2, rax + 0x00);
|
|
MEOW_MIX(xmm1,xmm5,xmm7,xmm2,xmm3, rax + 0x20);
|
|
MEOW_MIX(xmm2,xmm6,xmm0,xmm3,xmm4, rax + 0x40);
|
|
MEOW_MIX(xmm3,xmm7,xmm1,xmm4,xmm5, rax + 0x60);
|
|
MEOW_MIX(xmm4,xmm0,xmm2,xmm5,xmm6, rax + 0x80);
|
|
MEOW_MIX(xmm5,xmm1,xmm3,xmm6,xmm7, rax + 0xa0);
|
|
MEOW_MIX(xmm6,xmm2,xmm4,xmm7,xmm0, rax + 0xc0);
|
|
MEOW_MIX(xmm7,xmm3,xmm5,xmm0,xmm1, rax + 0xe0);
|
|
|
|
rax += 0x100;
|
|
}
|
|
}
|
|
else
|
|
{
|
|
while(BlockCount--)
|
|
{
|
|
MEOW_MIX(xmm0,xmm4,xmm6,xmm1,xmm2, rax + 0x00);
|
|
MEOW_MIX(xmm1,xmm5,xmm7,xmm2,xmm3, rax + 0x20);
|
|
MEOW_MIX(xmm2,xmm6,xmm0,xmm3,xmm4, rax + 0x40);
|
|
MEOW_MIX(xmm3,xmm7,xmm1,xmm4,xmm5, rax + 0x60);
|
|
MEOW_MIX(xmm4,xmm0,xmm2,xmm5,xmm6, rax + 0x80);
|
|
MEOW_MIX(xmm5,xmm1,xmm3,xmm6,xmm7, rax + 0xa0);
|
|
MEOW_MIX(xmm6,xmm2,xmm4,xmm7,xmm0, rax + 0xc0);
|
|
MEOW_MIX(xmm7,xmm3,xmm5,xmm0,xmm1, rax + 0xe0);
|
|
|
|
rax += 0x100;
|
|
}
|
|
}
|
|
|
|
State->xmm0 = xmm0;
|
|
State->xmm1 = xmm1;
|
|
State->xmm2 = xmm2;
|
|
State->xmm3 = xmm3;
|
|
State->xmm4 = xmm4;
|
|
State->xmm5 = xmm5;
|
|
State->xmm6 = xmm6;
|
|
State->xmm7 = xmm7;
|
|
}
|
|
|
|
static void
|
|
MeowAbsorb(meow_state *State, meow_umm Len, void *SourceInit)
|
|
{
|
|
State->TotalLengthInBytes += Len;
|
|
meow_u8 *Source = (meow_u8 *)SourceInit;
|
|
|
|
// NOTE(casey): Handle any buffered residual
|
|
if(State->BufferLen)
|
|
{
|
|
int unsigned Fill = (sizeof(State->Buffer) - State->BufferLen);
|
|
if(Fill > Len)
|
|
{
|
|
Fill = (int unsigned)Len;
|
|
}
|
|
|
|
Len -= Fill;
|
|
while(Fill--)
|
|
{
|
|
State->Buffer[State->BufferLen++] = *Source++;
|
|
}
|
|
|
|
if(State->BufferLen == sizeof(State->Buffer))
|
|
{
|
|
MeowAbsorbBlocks(State, 1, State->Buffer);
|
|
State->BufferLen = 0;
|
|
}
|
|
}
|
|
|
|
// NOTE(casey): Handle any full blocks
|
|
meow_u64 BlockCount = (Len >> 8);
|
|
meow_u64 Advance = (BlockCount << 8);
|
|
MeowAbsorbBlocks(State, BlockCount, Source);
|
|
|
|
Len -= Advance;
|
|
Source += Advance;
|
|
|
|
// NOTE(casey): Store residual
|
|
while(Len--)
|
|
{
|
|
State->Buffer[State->BufferLen++] = *Source++;
|
|
}
|
|
}
|
|
|
|
static meow_u128
|
|
MeowEnd(meow_state *State, meow_u8 *Store128)
|
|
{
|
|
meow_umm Len = State->TotalLengthInBytes;
|
|
|
|
meow_u128 xmm0 = State->xmm0;
|
|
meow_u128 xmm1 = State->xmm1;
|
|
meow_u128 xmm2 = State->xmm2;
|
|
meow_u128 xmm3 = State->xmm3;
|
|
meow_u128 xmm4 = State->xmm4;
|
|
meow_u128 xmm5 = State->xmm5;
|
|
meow_u128 xmm6 = State->xmm6;
|
|
meow_u128 xmm7 = State->xmm7;
|
|
|
|
meow_u128 xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15;
|
|
|
|
meow_u8 *rax = State->Buffer;
|
|
|
|
pxor_clear(xmm9, xmm9);
|
|
pxor_clear(xmm11, xmm11);
|
|
|
|
meow_u8 *Last = (meow_u8 *)rax + (Len & 0xf0);
|
|
int unsigned Len8 = (Len & 0xf);
|
|
if(Len8)
|
|
{
|
|
movdqu(xmm8, &MeowMaskLen[0x10 - Len8]);
|
|
movdqu(xmm9, Last);
|
|
pand(xmm9, xmm8);
|
|
}
|
|
|
|
if(Len & 0x10)
|
|
{
|
|
xmm11 = xmm9;
|
|
movdqu(xmm9, Last - 0x10);
|
|
}
|
|
|
|
xmm8 = xmm9;
|
|
xmm10 = xmm9;
|
|
palignr(xmm8, xmm11, 15);
|
|
palignr(xmm10, xmm11, 1);
|
|
|
|
pxor_clear(xmm12, xmm12);
|
|
pxor_clear(xmm13, xmm13);
|
|
pxor_clear(xmm14, xmm14);
|
|
movq(xmm15, Len);
|
|
palignr(xmm12, xmm15, 15);
|
|
palignr(xmm14, xmm15, 1);
|
|
|
|
MEOW_DUMP_STATE("PostBlocks", xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, 0);
|
|
MEOW_DUMP_STATE("Residuals", xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15, 0);
|
|
|
|
// NOTE(casey): To maintain the mix-down pattern, we always Meow Mix the less-than-32-byte residual, even if it was empty
|
|
MEOW_MIX_REG(xmm0, xmm4, xmm6, xmm1, xmm2, xmm8, xmm9, xmm10, xmm11);
|
|
|
|
// NOTE(casey): Append the length, to avoid problems with our 32-byte padding
|
|
MEOW_MIX_REG(xmm1, xmm5, xmm7, xmm2, xmm3, xmm12, xmm13, xmm14, xmm15);
|
|
|
|
MEOW_DUMP_STATE("PostAppend", xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, 0);
|
|
|
|
//
|
|
// NOTE(casey): Hash all full 32-byte blocks
|
|
//
|
|
int unsigned LaneCount = (Len >> 5) & 0x7;
|
|
if(LaneCount == 0) goto MixDown; MEOW_MIX(xmm2,xmm6,xmm0,xmm3,xmm4, rax + 0x00); --LaneCount;
|
|
if(LaneCount == 0) goto MixDown; MEOW_MIX(xmm3,xmm7,xmm1,xmm4,xmm5, rax + 0x20); --LaneCount;
|
|
if(LaneCount == 0) goto MixDown; MEOW_MIX(xmm4,xmm0,xmm2,xmm5,xmm6, rax + 0x40); --LaneCount;
|
|
if(LaneCount == 0) goto MixDown; MEOW_MIX(xmm5,xmm1,xmm3,xmm6,xmm7, rax + 0x60); --LaneCount;
|
|
if(LaneCount == 0) goto MixDown; MEOW_MIX(xmm6,xmm2,xmm4,xmm7,xmm0, rax + 0x80); --LaneCount;
|
|
if(LaneCount == 0) goto MixDown; MEOW_MIX(xmm7,xmm3,xmm5,xmm0,xmm1, rax + 0xa0); --LaneCount;
|
|
if(LaneCount == 0) goto MixDown; MEOW_MIX(xmm0,xmm4,xmm6,xmm1,xmm2, rax + 0xc0); --LaneCount;
|
|
|
|
//
|
|
// NOTE(casey): Mix the eight lanes down to one 128-bit hash
|
|
//
|
|
|
|
MixDown:
|
|
|
|
MEOW_DUMP_STATE("PostLanes", xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, 0);
|
|
|
|
MEOW_SHUFFLE(xmm0, xmm1, xmm2, xmm4, xmm5, xmm6);
|
|
MEOW_SHUFFLE(xmm1, xmm2, xmm3, xmm5, xmm6, xmm7);
|
|
MEOW_SHUFFLE(xmm2, xmm3, xmm4, xmm6, xmm7, xmm0);
|
|
MEOW_SHUFFLE(xmm3, xmm4, xmm5, xmm7, xmm0, xmm1);
|
|
MEOW_SHUFFLE(xmm4, xmm5, xmm6, xmm0, xmm1, xmm2);
|
|
MEOW_SHUFFLE(xmm5, xmm6, xmm7, xmm1, xmm2, xmm3);
|
|
MEOW_SHUFFLE(xmm6, xmm7, xmm0, xmm2, xmm3, xmm4);
|
|
MEOW_SHUFFLE(xmm7, xmm0, xmm1, xmm3, xmm4, xmm5);
|
|
MEOW_SHUFFLE(xmm0, xmm1, xmm2, xmm4, xmm5, xmm6);
|
|
MEOW_SHUFFLE(xmm1, xmm2, xmm3, xmm5, xmm6, xmm7);
|
|
MEOW_SHUFFLE(xmm2, xmm3, xmm4, xmm6, xmm7, xmm0);
|
|
MEOW_SHUFFLE(xmm3, xmm4, xmm5, xmm7, xmm0, xmm1);
|
|
|
|
MEOW_DUMP_STATE("PostMix", xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, 0);
|
|
|
|
if(Store128)
|
|
{
|
|
movdqu_mem(Store128 + 0x00, xmm0);
|
|
movdqu_mem(Store128 + 0x10, xmm1);
|
|
movdqu_mem(Store128 + 0x20, xmm2);
|
|
movdqu_mem(Store128 + 0x30, xmm3);
|
|
movdqu_mem(Store128 + 0x40, xmm4);
|
|
movdqu_mem(Store128 + 0x50, xmm5);
|
|
movdqu_mem(Store128 + 0x60, xmm6);
|
|
movdqu_mem(Store128 + 0x70, xmm7);
|
|
}
|
|
|
|
paddq(xmm0, xmm2);
|
|
paddq(xmm1, xmm3);
|
|
paddq(xmm4, xmm6);
|
|
paddq(xmm5, xmm7);
|
|
pxor(xmm0, xmm1);
|
|
pxor(xmm4, xmm5);
|
|
paddq(xmm0, xmm4);
|
|
|
|
MEOW_DUMP_STATE("PostFold", xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, 0);
|
|
|
|
return(xmm0);
|
|
}
|
|
|
|
#undef INSTRUCTION_REORDER_BARRIER
|
|
#undef prefetcht0
|
|
#undef movdqu
|
|
#undef movdqu_mem
|
|
#undef movq
|
|
#undef aesdec
|
|
#undef pshufb
|
|
#undef pxor
|
|
#undef paddq
|
|
#undef pand
|
|
#undef palignr
|
|
#undef pxor_clear
|
|
#undef MEOW_MIX
|
|
#undef MEOW_MIX_REG
|
|
#undef MEOW_SHUFFLE
|
|
#undef MEOW_DUMP_STATE
|
|
|
|
//
|
|
// NOTE(casey): If you need to create your own seed from non-random data, you can use MeowExpandSeed
|
|
// to create a seed which you then store for repeated use. It is _expensive_ to generate the seed,
|
|
// so you do not want to do this every time you hash. You _only_ want to do it when you actually
|
|
// need to create a new seed.
|
|
//
|
|
|
|
static void
|
|
MeowExpandSeed(meow_umm InputLen, void *Input, meow_u8 *SeedResult)
|
|
{
|
|
meow_state State;
|
|
meow_u64 LengthTab = (meow_u64)InputLen; // NOTE(casey): We need to always injest 8-byte lengths exactly, even on 32-bit builds, to ensure identical results
|
|
meow_umm InjestCount = (256 / InputLen) + 2;
|
|
|
|
MeowBegin(&State, MeowDefaultSeed);
|
|
MeowAbsorb(&State, sizeof(LengthTab), &LengthTab);
|
|
while(InjestCount--)
|
|
{
|
|
MeowAbsorb(&State, InputLen, Input);
|
|
}
|
|
MeowEnd(&State, SeedResult);
|
|
}
|
|
|
|
#define MEOW_HASH_X64_AESNI_H
|
|
#endif
|