Name: Anonymous 2014-07-22 13:29
Can be yours or not, I just want to see something good.
#
# uint64_t memhash(void const* key, size_t n, uint64_t seed);
#
# Based off of the fast byteswap hash function by Steven Fuerst.
# http://locklessinc.com/articles/fast_hash/
#
.text
.align 16, 0x90
factor128:
.octa 0xd6c573e9c613993d5a379ab38dc5a46b
mix1:
.octa 0x1591aefa5e7e5a171591aefa5e7e5a17
mix2:
.octa 0x2bb6863566c4e7612bb6863566c4e761
.global memhash
.type memhash, @function
memhash:
movl 4(%esp), %ecx # key
movl 8(%esp), %edx # n
movq 12(%esp), %xmm2 # seed
pushl %edi
movd %edx, %xmm1
pushl %esi
movdqa (factor128), %xmm3
movdqa (mix1), %xmm6
movdqa (mix2), %xmm7
cmpl $32, %edx
jb 2f
mov %edx, %eax
shr $5, %eax
testb $15, %cl
jne 5f
# Aligned main loop, process 32-byte aligned chunks at a time
.align 8, 0x90
1: paddq (%ecx), %xmm1
paddq 16(%ecx), %xmm2
pmullw %xmm3, %xmm1
pmullw %xmm3, %xmm2
add $32, %ecx
movdqa %xmm1, %xmm0
punpckhbw %xmm2, %xmm1
punpcklbw %xmm0, %xmm2
dec %eax
jne 1b
pxor %xmm2, %xmm1
# Check for remaining chunk, otherwise extract 128 bits state and proceed to final mix
2: testb $31, %dl
jne 3f
movhlps %xmm1, %xmm2
jmp 4f
# Hash remaining chunk, up to 31-bytes in size
3: testb $16, %dl
je 3f
movdqu (%ecx), %xmm0
add $16, %ecx
pxor %xmm0, %xmm1
pmullw %xmm3, %xmm1
3: pxor %xmm3, %xmm3
movhlps %xmm1, %xmm2
testb $8, %dl
je 3f
movq (%ecx), %xmm0
add $8, %ecx
pxor %xmm0, %xmm1
3: testb $4, %dl
je 3f
movd (%ecx), %xmm3
add $4, %ecx
3: testb $2, %dl
je 3f
movzwl (%ecx), %eax
psllq $16, %xmm3
movd %eax, %xmm0
add $2, %ecx
paddq %xmm0, %xmm3
3: testb $1, %dl
je 3f
movzbl (%ecx), %eax
psllq $8, %xmm3
movd %eax, %xmm0
paddq %xmm0, %xmm3
3: pxor %xmm3, %xmm2
# Use 3x 128bit multiply and xorshift for final mix
4: # Iteration #0
pshufd $0b10110001, %xmm6, %xmm3
punpckldq %xmm1, %xmm1
pmuludq %xmm1, %xmm3 # [H|G|F|E] = [hash1.high * mix1.high | hash1.low * mix1.high]
punpckldq %xmm2, %xmm2
pmuludq %xmm6, %xmm1 # [D|C|B|A] = [hash1.high * mix1.low | hash1.low * mix1.low]
movd %xmm3, %ecx
pshufd $0b11100001, %xmm1, %xmm1
pshufd $0b11100001, %xmm3, %xmm3
movd %xmm1, %edx
movd %xmm3, %esi
pshufd $0b11010010, %xmm1, %xmm1
pshufd $0b01110010, %xmm3, %xmm3
movd %xmm1, %eax
xor %edi, %edi
pshufd $0b00100111, %xmm1, %xmm1
add %eax, %edx # X = B + C
movd %xmm1, %eax
adc $0, %eax # Y = D + carry
adc $0, %edi # Z = carry
add %ecx, %edx # X = X + E
punpckhdq %xmm1, %xmm1 # [0|0|0|A]
movd %edx, %xmm0 # [0|0|0|X]
adc %esi, %eax # Y = Y + F + carry
movd %xmm3, %edx
punpckhdq %xmm3, %xmm3
adc $0, %edi # Z = Z + carry
movd %xmm3, %esi
add %edx, %eax # Y = Y + G
pshufd $0b10101111, %xmm7, %xmm3
adc %esi, %edi # Z = Z + H + carry
movd %eax, %xmm5 # [0|0|0|Y]
movd %edi, %xmm4 # [0|0|0|Z]
punpckldq %xmm0, %xmm1 # [0|0|X|A] = [0 | lower 64-bits of hash1 * mix1]
punpckldq %xmm4, %xmm5 # [0|0|Z|Y] = [0 | upper 64-bits of hash1 * mix1]
pmuludq %xmm2, %xmm3 # [h|g|f|e] = [hash2.high * mix2.low | hash2.low * mix2.high]
pxor %xmm0, %xmm0
pmuludq %xmm7, %xmm2 # [d|c|b|a] = [hash2.high * mix2.high | hash2.low * mix2.low]
punpckldq %xmm3, %xmm0 # [e|0|e|0]
pxor %xmm4, %xmm4
paddd %xmm0, %xmm2 # [d+e|c|b+e|a]
punpckhdq %xmm3, %xmm4 # [g|0|g|0]
paddd %xmm4, %xmm2 # [d+e+g|c|b+e+g|a] = [garbage | lower 64-bits of hash2 * mix2]
paddq %xmm5, %xmm2 # hash2 = (hash2 * mix2) + upper_64bits(hash1 * mix1)
pxor %xmm2, %xmm1 # hash1 = hash2 ^ lower_64bits(hash1 * mix1)
# Iteration #1
pshufd $0b10110001, %xmm6, %xmm3
punpckldq %xmm1, %xmm1
pmuludq %xmm1, %xmm3 # [H|G|F|E] = [hash1.high * mix1.high | hash1.low * mix1.high]
punpckldq %xmm2, %xmm2
pmuludq %xmm6, %xmm1 # [D|C|B|A] = [hash1.high * mix1.low | hash1.low * mix1.low]
movd %xmm3, %ecx
pshufd $0b11100001, %xmm1, %xmm1
pshufd $0b11100001, %xmm3, %xmm3
movd %xmm1, %edx
movd %xmm3, %esi
pshufd $0b11010010, %xmm1, %xmm1
pshufd $0b01110010, %xmm3, %xmm3
movd %xmm1, %eax
xor %edi, %edi
pshufd $0b00100111, %xmm1, %xmm1
add %eax, %edx # X = B + C
movd %xmm1, %eax
adc $0, %eax # Y = D + carry
adc $0, %edi # Z = carry
add %ecx, %edx # X = X + E
punpckhdq %xmm1, %xmm1 # [0|0|0|A]
movd %edx, %xmm0 # [0|0|0|X]
adc %esi, %eax # Y = Y + F + carry
movd %xmm3, %edx
punpckhdq %xmm3, %xmm3
adc $0, %edi # Z = Z + carry
movd %xmm3, %esi
add %edx, %eax # Y = Y + G
pshufd $0b10101111, %xmm7, %xmm3
adc %esi, %edi # Z = Z + H + carry
movd %eax, %xmm5 # [0|0|0|Y]
movd %edi, %xmm4 # [0|0|0|Z]
punpckldq %xmm0, %xmm1 # [0|0|X|A] = [0 | lower 64-bits of hash1 * mix1]
punpckldq %xmm4, %xmm5 # [0|0|Z|Y] = [0 | upper 64-bits of hash1 * mix1]
pmuludq %xmm2, %xmm3 # [h|g|f|e] = [hash2.high * mix2.low | hash2.low * mix2.high]
pxor %xmm0, %xmm0
pmuludq %xmm7, %xmm2 # [d|c|b|a] = [hash2.high * mix2.high | hash2.low * mix2.low]
punpckldq %xmm3, %xmm0 # [e|0|e|0]
pxor %xmm4, %xmm4
paddd %xmm0, %xmm2 # [d+e|c|b+e|a]
punpckhdq %xmm3, %xmm4 # [g|0|g|0]
paddd %xmm4, %xmm2 # [d+e+g|c|b+e+g|a] = [garbage | lower 64-bits of hash2 * mix2]
paddq %xmm5, %xmm2 # hash2 = (hash2 * mix2) + upper_64bits(hash1 * mix1)
pxor %xmm2, %xmm1 # hash1 = hash2 ^ lower_64bits(hash1 * mix1)
# Iteration #2
pshufd $0b10110001, %xmm6, %xmm3
punpckldq %xmm1, %xmm1
pmuludq %xmm1, %xmm3 # [H|G|F|E] = [hash1.high * mix1.high | hash1.low * mix1.high]
punpckldq %xmm2, %xmm2
pmuludq %xmm6, %xmm1 # [D|C|B|A] = [hash1.high * mix1.low | hash1.low * mix1.low]
movd %xmm3, %ecx
pshufd $0b11100001, %xmm1, %xmm1
pshufd $0b11100001, %xmm3, %xmm3
movd %xmm1, %edx
movd %xmm3, %esi
pshufd $0b11010010, %xmm1, %xmm1
pshufd $0b01110010, %xmm3, %xmm3
movd %xmm1, %eax
xor %edi, %edi
pshufd $0b00100111, %xmm1, %xmm1
add %eax, %edx # X = B + C
movd %xmm1, %eax
adc $0, %eax # Y = D + carry
adc $0, %edi # Z = carry
add %ecx, %edx # X = X + E
punpckhdq %xmm1, %xmm1 # [0|0|0|A]
movd %edx, %xmm0 # [0|0|0|X]
adc %esi, %eax # Y = Y + F + carry
movd %xmm3, %edx
punpckhdq %xmm3, %xmm3
adc $0, %edi # Z = Z + carry
movd %xmm3, %esi
add %edx, %eax # Y = Y + G
pshufd $0b10101111, %xmm7, %xmm3
adc %esi, %edi # Z = Z + H + carry
movd %eax, %xmm5 # [0|0|0|Y]
movd %edi, %xmm4 # [0|0|0|Z]
punpckldq %xmm0, %xmm1 # [0|0|X|A] = [0 | lower 64-bits of hash1 * mix1]
punpckldq %xmm4, %xmm5 # [0|0|Z|Y] = [0 | upper 64-bits of hash1 * mix1]
pmuludq %xmm2, %xmm3 # [h|g|f|e] = [hash2.high * mix2.low | hash2.low * mix2.high]
pxor %xmm0, %xmm0
pmuludq %xmm7, %xmm2 # [d|c|b|a] = [hash2.high * mix2.high | hash2.low * mix2.low]
punpckldq %xmm3, %xmm0 # [e|0|e|0]
pxor %xmm4, %xmm4
paddd %xmm0, %xmm2 # [d+e|c|b+e|a]
punpckhdq %xmm3, %xmm4 # [g|0|g|0]
paddd %xmm4, %xmm2 # [d+e+g|c|b+e+g|a] = [garbage | lower 64-bits of hash2 * mix2]
paddq %xmm5, %xmm2 # hash2 = (hash2 * mix2) + upper_64bits(hash1 * mix1)
pxor %xmm2, %xmm1 # hash1 = hash2 ^ lower_64bits(hash1 * mix1)
# Place 64-bit result in %edx:%eax and return
popl %esi
movd %xmm1, %eax
pshufd $0b11100001, %xmm1, %xmm1
popl %edi
movd %xmm1, %edx
ret
# Unaligned main-loop, process 32-byte unaligned chunks at a time
.align 16, 0x90
5: movdqu (%ecx), %xmm0
add $32, %ecx
paddq %xmm0, %xmm1
movdqu -16(%ecx), %xmm0
pmullw %xmm3, %xmm1
paddq %xmm0, %xmm2
pmullw %xmm3, %xmm2
movdqa %xmm1, %xmm0
punpckhbw %xmm2, %xmm1
punpcklbw %xmm0, %xmm2
dec %eax
jne 5b
pxor %xmm2, %xmm1
jmp 2b
.size memhash, .-memhash