quic: implement rapidhash for hashing improvements · nodejs/node@76d9c24

1+#pragma once2+3+#if defined(NODE_WANT_INTERNALS) && NODE_WANT_INTERNALS4+5+// Fast, high-quality hash function for byte sequences.6+//7+// Provides HashBytes() for use in hash tables. Uses native-width integer8+// loads and a 128-bit multiply-and-fold mixer for excellent avalanche9+// properties on short byte sequences (network identifiers, addresses,10+// tokens).11+//12+// Based on rapidhash V3 by Nicolas De Carli, which evolved from wyhash13+// by Wang Yi. Both use the same core mixing primitive (MUM: multiply,14+// then XOR the high and low halves of the 128-bit result).15+//16+// rapidhash: https://github.com/Nicoshev/rapidhash17+// Copyright (C) 2025 Nicolas De Carli — MIT License18+// wyhash: https://github.com/wangyi-fudan/wyhash19+// Wang Yi — public domain (The Unlicense)20+//21+// The implementation here uses rapidhash's read strategy (native-width22+// overlapping reads, optimized for short inputs) and secret constants.23+// The core mixing function (rapid_mum/rapid_mix) is identical to24+// wyhash's wymum/wymix.25+26+#include <cstddef>27+#include <cstdint>28+#include <cstring>29+30+#if defined(_MSC_VER)31+#include <intrin.h>32+#if defined(_M_X64) && !defined(_M_ARM64EC)33+#pragma intrinsic(_umul128)34+#endif35+#endif36+37+namespace node {38+39+namespace hash_detail {40+41+// 128-bit multiply, then XOR the high and low halves.42+// This is the core mixing function ("rapid_mum" / "wymum").43+// On 64-bit platforms with __int128, this compiles to a single44+// mul instruction + shift + xor.45+inline uint64_t RapidMix(uint64_t a, uint64_t b) {46+#ifdef __SIZEOF_INT128__47+__uint128_t r = static_cast<__uint128_t>(a) * b;48+ a = static_cast<uint64_t>(r);49+ b = static_cast<uint64_t>(r >> 64);50+#elif defined(_MSC_VER) && (defined(_WIN64) || defined(_M_HYBRID_CHPE_ARM64))51+#if defined(_M_X64)52+ a = _umul128(a, b, &b);53+#else54+uint64_t hi = __umulh(a, b);55+ a = a * b;56+ b = hi;57+#endif58+#else59+// Portable 64x64 -> 128-bit multiply fallback for 32-bit platforms.60+uint64_t ha = a >> 32, hb = b >> 32;61+uint64_t la = static_cast<uint32_t>(a), lb = static_cast<uint32_t>(b);62+uint64_t rh = ha * hb, rm0 = ha * lb, rm1 = hb * la, rl = la * lb;63+uint64_t t = rl + (rm0 << 32);64+uint64_t lo = t + (rm1 << 32);65+uint64_t hi = rh + (rm0 >> 32) + (rm1 >> 32) + (t < rl) + (lo < t);66+ a = lo;67+ b = hi;68+#endif69+return a ^ b;70+}71+72+// Inline 128-bit multiply WITHOUT the final XOR (used in the73+// penultimate mixing step where a and b are updated separately).74+inline void RapidMum(uint64_t* a, uint64_t* b) {75+#ifdef __SIZEOF_INT128__76+__uint128_t r = static_cast<__uint128_t>(*a) * (*b);77+ *a = static_cast<uint64_t>(r);78+ *b = static_cast<uint64_t>(r >> 64);79+#elif defined(_MSC_VER) && (defined(_WIN64) || defined(_M_HYBRID_CHPE_ARM64))80+#if defined(_M_X64)81+ *a = _umul128(*a, *b, b);82+#else83+uint64_t hi = __umulh(*a, *b);84+ *a = (*a) * (*b);85+ *b = hi;86+#endif87+#else88+uint64_t ha = *a >> 32, hb = *b >> 32;89+uint64_t la = static_cast<uint32_t>(*a), lb = static_cast<uint32_t>(*b);90+uint64_t rh = ha * hb, rm0 = ha * lb, rm1 = hb * la, rl = la * lb;91+uint64_t t = rl + (rm0 << 32);92+ *a = t + (rm1 << 32);93+ *b = rh + (rm0 >> 32) + (rm1 >> 32) + (t < rl) + (*a < t);94+#endif95+}96+97+// Read functions. The compiler optimizes small fixed-size memcpy calls98+// to single load instructions — no actual byte-by-byte copy occurs.99+inline uint64_t RapidRead64(const uint8_t* p) {100+uint64_t v;101+memcpy(&v, p, sizeof(uint64_t));102+return v;103+}104+105+inline uint64_t RapidRead32(const uint8_t* p) {106+uint32_t v;107+memcpy(&v, p, sizeof(uint32_t));108+return v;109+}110+111+// Default rapidhash secret parameters.112+constexpr uint64_t kSecret[8] = {0x2d358dccaa6c78a5ULL,113+0x8bb84b93962eacc9ULL,114+0x4b33a62ed433d4a3ULL,115+0x4d5a2da51de1aa47ULL,116+0xa0761d6478bd642fULL,117+0xe7037ed1a0b428dbULL,118+0x90ed1765281c388cULL,119+0xaaaaaaaaaaaaaaaaULL};120+121+} // namespace hash_detail122+123+// Hash a contiguous byte range. Optimized for short inputs (≤48 bytes)124+// which is the common case for network identifiers and addresses. For125+// inputs >48 bytes, falls through to a loop processing 48-byte chunks.126+inline size_t HashBytes(const void* data, size_t len) {127+const uint8_t* p = static_cast<const uint8_t*>(data);128+129+// Seed initialization.130+uint64_t seed = hash_detail::RapidMix(0 ^ hash_detail::kSecret[2],131+ hash_detail::kSecret[1]);132+uint64_t a = 0;133+uint64_t b = 0;134+size_t i = len;135+136+if (len <= 16) {137+if (len >= 4) {138+// Mix length into seed for better distribution of139+// different-length inputs with shared prefixes.140+ seed ^= len;141+if (len >= 8) {142+// 8-16 bytes: two native 64-bit reads (overlapping from end).143+ a = hash_detail::RapidRead64(p);144+ b = hash_detail::RapidRead64(p + len - 8);145+ } else {146+// 4-7 bytes: two 32-bit reads (overlapping from end).147+ a = hash_detail::RapidRead32(p);148+ b = hash_detail::RapidRead32(p + len - 4);149+ }150+ } else if (len > 0) {151+// 1-3 bytes: spread bytes across two values for mixing.152+ a = (static_cast<uint64_t>(p[0]) << 45) | p[len - 1];153+ b = p[len >> 1];154+ } else {155+ a = b = 0;156+ }157+ } else if (len <= 48) {158+// 17-48 bytes: process in 16-byte chunks, then read the tail.159+ seed = hash_detail::RapidMix(160+hash_detail::RapidRead64(p) ^ hash_detail::kSecret[2],161+hash_detail::RapidRead64(p + 8) ^ seed);162+if (len > 32) {163+ seed = hash_detail::RapidMix(164+hash_detail::RapidRead64(p + 16) ^ hash_detail::kSecret[2],165+hash_detail::RapidRead64(p + 24) ^ seed);166+ }167+ a = hash_detail::RapidRead64(p + len - 16) ^ len;168+ b = hash_detail::RapidRead64(p + len - 8);169+ } else {170+// >48 bytes: process 48-byte chunks with three parallel mix lanes.171+uint64_t see1 = seed;172+uint64_t see2 = seed;173+do {174+ seed = hash_detail::RapidMix(175+hash_detail::RapidRead64(p) ^ hash_detail::kSecret[0],176+hash_detail::RapidRead64(p + 8) ^ seed);177+ see1 = hash_detail::RapidMix(178+hash_detail::RapidRead64(p + 16) ^ hash_detail::kSecret[1],179+hash_detail::RapidRead64(p + 24) ^ see1);180+ see2 = hash_detail::RapidMix(181+hash_detail::RapidRead64(p + 32) ^ hash_detail::kSecret[2],182+hash_detail::RapidRead64(p + 40) ^ see2);183+ p += 48;184+ i -= 48;185+ } while (i > 48);186+ seed ^= see1 ^ see2;187+// Process remaining 17-48 bytes.188+if (i > 16) {189+ seed = hash_detail::RapidMix(190+hash_detail::RapidRead64(p) ^ hash_detail::kSecret[2],191+hash_detail::RapidRead64(p + 8) ^ seed);192+if (i > 32) {193+ seed = hash_detail::RapidMix(194+hash_detail::RapidRead64(p + 16) ^ hash_detail::kSecret[2],195+hash_detail::RapidRead64(p + 24) ^ seed);196+ }197+ }198+ a = hash_detail::RapidRead64(p + i - 16) ^ i;199+ b = hash_detail::RapidRead64(p + i - 8);200+ }201+202+// Final mix.203+ a ^= hash_detail::kSecret[1];204+ b ^= seed;205+hash_detail::RapidMum(&a, &b);206+return static_cast<size_t>(hash_detail::RapidMix(207+ a ^ hash_detail::kSecret[7], b ^ hash_detail::kSecret[1] ^ len));208+}209+210+} // namespace node211+212+#endif // defined(NODE_WANT_INTERNALS) && NODE_WANT_INTERNALS