quic: implement rapidhash for hashing improvements · nodejs/node@76d9c24
1+#pragma once
2+3+#if defined(NODE_WANT_INTERNALS) && NODE_WANT_INTERNALS
4+5+// Fast, high-quality hash function for byte sequences.
6+//
7+// Provides HashBytes() for use in hash tables. Uses native-width integer
8+// loads and a 128-bit multiply-and-fold mixer for excellent avalanche
9+// properties on short byte sequences (network identifiers, addresses,
10+// tokens).
11+//
12+// Based on rapidhash V3 by Nicolas De Carli, which evolved from wyhash
13+// by Wang Yi. Both use the same core mixing primitive (MUM: multiply,
14+// then XOR the high and low halves of the 128-bit result).
15+//
16+// rapidhash: https://github.com/Nicoshev/rapidhash
17+// Copyright (C) 2025 Nicolas De Carli — MIT License
18+// wyhash: https://github.com/wangyi-fudan/wyhash
19+// Wang Yi — public domain (The Unlicense)
20+//
21+// The implementation here uses rapidhash's read strategy (native-width
22+// overlapping reads, optimized for short inputs) and secret constants.
23+// The core mixing function (rapid_mum/rapid_mix) is identical to
24+// wyhash's wymum/wymix.
25+26+#include <cstddef>
27+#include <cstdint>
28+#include <cstring>
29+30+#if defined(_MSC_VER)
31+#include <intrin.h>
32+#if defined(_M_X64) && !defined(_M_ARM64EC)
33+#pragma intrinsic(_umul128)
34+#endif
35+#endif
36+37+namespace node {
38+39+namespace hash_detail {
40+41+// 128-bit multiply, then XOR the high and low halves.
42+// This is the core mixing function ("rapid_mum" / "wymum").
43+// On 64-bit platforms with __int128, this compiles to a single
44+// mul instruction + shift + xor.
45+inline uint64_t RapidMix(uint64_t a, uint64_t b) {
46+#ifdef __SIZEOF_INT128__
47+__uint128_t r = static_cast<__uint128_t>(a) * b;
48+ a = static_cast<uint64_t>(r);
49+ b = static_cast<uint64_t>(r >> 64);
50+#elif defined(_MSC_VER) && (defined(_WIN64) || defined(_M_HYBRID_CHPE_ARM64))
51+#if defined(_M_X64)
52+ a = _umul128(a, b, &b);
53+#else
54+uint64_t hi = __umulh(a, b);
55+ a = a * b;
56+ b = hi;
57+#endif
58+#else
59+// Portable 64x64 -> 128-bit multiply fallback for 32-bit platforms.
60+uint64_t ha = a >> 32, hb = b >> 32;
61+uint64_t la = static_cast<uint32_t>(a), lb = static_cast<uint32_t>(b);
62+uint64_t rh = ha * hb, rm0 = ha * lb, rm1 = hb * la, rl = la * lb;
63+uint64_t t = rl + (rm0 << 32);
64+uint64_t lo = t + (rm1 << 32);
65+uint64_t hi = rh + (rm0 >> 32) + (rm1 >> 32) + (t < rl) + (lo < t);
66+ a = lo;
67+ b = hi;
68+#endif
69+return a ^ b;
70+}
71+72+// Inline 128-bit multiply WITHOUT the final XOR (used in the
73+// penultimate mixing step where a and b are updated separately).
74+inline void RapidMum(uint64_t* a, uint64_t* b) {
75+#ifdef __SIZEOF_INT128__
76+__uint128_t r = static_cast<__uint128_t>(*a) * (*b);
77+ *a = static_cast<uint64_t>(r);
78+ *b = static_cast<uint64_t>(r >> 64);
79+#elif defined(_MSC_VER) && (defined(_WIN64) || defined(_M_HYBRID_CHPE_ARM64))
80+#if defined(_M_X64)
81+ *a = _umul128(*a, *b, b);
82+#else
83+uint64_t hi = __umulh(*a, *b);
84+ *a = (*a) * (*b);
85+ *b = hi;
86+#endif
87+#else
88+uint64_t ha = *a >> 32, hb = *b >> 32;
89+uint64_t la = static_cast<uint32_t>(*a), lb = static_cast<uint32_t>(*b);
90+uint64_t rh = ha * hb, rm0 = ha * lb, rm1 = hb * la, rl = la * lb;
91+uint64_t t = rl + (rm0 << 32);
92+ *a = t + (rm1 << 32);
93+ *b = rh + (rm0 >> 32) + (rm1 >> 32) + (t < rl) + (*a < t);
94+#endif
95+}
96+97+// Read functions. The compiler optimizes small fixed-size memcpy calls
98+// to single load instructions — no actual byte-by-byte copy occurs.
99+inline uint64_t RapidRead64(const uint8_t* p) {
100+uint64_t v;
101+memcpy(&v, p, sizeof(uint64_t));
102+return v;
103+}
104+105+inline uint64_t RapidRead32(const uint8_t* p) {
106+uint32_t v;
107+memcpy(&v, p, sizeof(uint32_t));
108+return v;
109+}
110+111+// Default rapidhash secret parameters.
112+constexpr uint64_t kSecret[8] = {0x2d358dccaa6c78a5ULL,
113+0x8bb84b93962eacc9ULL,
114+0x4b33a62ed433d4a3ULL,
115+0x4d5a2da51de1aa47ULL,
116+0xa0761d6478bd642fULL,
117+0xe7037ed1a0b428dbULL,
118+0x90ed1765281c388cULL,
119+0xaaaaaaaaaaaaaaaaULL};
120+121+} // namespace hash_detail
122+123+// Hash a contiguous byte range. Optimized for short inputs (≤48 bytes)
124+// which is the common case for network identifiers and addresses. For
125+// inputs >48 bytes, falls through to a loop processing 48-byte chunks.
126+inline size_t HashBytes(const void* data, size_t len) {
127+const uint8_t* p = static_cast<const uint8_t*>(data);
128+129+// Seed initialization.
130+uint64_t seed = hash_detail::RapidMix(0 ^ hash_detail::kSecret[2],
131+ hash_detail::kSecret[1]);
132+uint64_t a = 0;
133+uint64_t b = 0;
134+size_t i = len;
135+136+if (len <= 16) {
137+if (len >= 4) {
138+// Mix length into seed for better distribution of
139+// different-length inputs with shared prefixes.
140+ seed ^= len;
141+if (len >= 8) {
142+// 8-16 bytes: two native 64-bit reads (overlapping from end).
143+ a = hash_detail::RapidRead64(p);
144+ b = hash_detail::RapidRead64(p + len - 8);
145+ } else {
146+// 4-7 bytes: two 32-bit reads (overlapping from end).
147+ a = hash_detail::RapidRead32(p);
148+ b = hash_detail::RapidRead32(p + len - 4);
149+ }
150+ } else if (len > 0) {
151+// 1-3 bytes: spread bytes across two values for mixing.
152+ a = (static_cast<uint64_t>(p[0]) << 45) | p[len - 1];
153+ b = p[len >> 1];
154+ } else {
155+ a = b = 0;
156+ }
157+ } else if (len <= 48) {
158+// 17-48 bytes: process in 16-byte chunks, then read the tail.
159+ seed = hash_detail::RapidMix(
160+hash_detail::RapidRead64(p) ^ hash_detail::kSecret[2],
161+hash_detail::RapidRead64(p + 8) ^ seed);
162+if (len > 32) {
163+ seed = hash_detail::RapidMix(
164+hash_detail::RapidRead64(p + 16) ^ hash_detail::kSecret[2],
165+hash_detail::RapidRead64(p + 24) ^ seed);
166+ }
167+ a = hash_detail::RapidRead64(p + len - 16) ^ len;
168+ b = hash_detail::RapidRead64(p + len - 8);
169+ } else {
170+// >48 bytes: process 48-byte chunks with three parallel mix lanes.
171+uint64_t see1 = seed;
172+uint64_t see2 = seed;
173+do {
174+ seed = hash_detail::RapidMix(
175+hash_detail::RapidRead64(p) ^ hash_detail::kSecret[0],
176+hash_detail::RapidRead64(p + 8) ^ seed);
177+ see1 = hash_detail::RapidMix(
178+hash_detail::RapidRead64(p + 16) ^ hash_detail::kSecret[1],
179+hash_detail::RapidRead64(p + 24) ^ see1);
180+ see2 = hash_detail::RapidMix(
181+hash_detail::RapidRead64(p + 32) ^ hash_detail::kSecret[2],
182+hash_detail::RapidRead64(p + 40) ^ see2);
183+ p += 48;
184+ i -= 48;
185+ } while (i > 48);
186+ seed ^= see1 ^ see2;
187+// Process remaining 17-48 bytes.
188+if (i > 16) {
189+ seed = hash_detail::RapidMix(
190+hash_detail::RapidRead64(p) ^ hash_detail::kSecret[2],
191+hash_detail::RapidRead64(p + 8) ^ seed);
192+if (i > 32) {
193+ seed = hash_detail::RapidMix(
194+hash_detail::RapidRead64(p + 16) ^ hash_detail::kSecret[2],
195+hash_detail::RapidRead64(p + 24) ^ seed);
196+ }
197+ }
198+ a = hash_detail::RapidRead64(p + i - 16) ^ i;
199+ b = hash_detail::RapidRead64(p + i - 8);
200+ }
201+202+// Final mix.
203+ a ^= hash_detail::kSecret[1];
204+ b ^= seed;
205+hash_detail::RapidMum(&a, &b);
206+return static_cast<size_t>(hash_detail::RapidMix(
207+ a ^ hash_detail::kSecret[7], b ^ hash_detail::kSecret[1] ^ len));
208+}
209+210+} // namespace node
211+212+#endif // defined(NODE_WANT_INTERNALS) && NODE_WANT_INTERNALS