◐ Shell
clean mode source ↗

quic: implement rapidhash for hashing improvements · nodejs/node@76d9c24

1+

#pragma once

2+3+

#if defined(NODE_WANT_INTERNALS) && NODE_WANT_INTERNALS

4+5+

// Fast, high-quality hash function for byte sequences.

6+

//

7+

// Provides HashBytes() for use in hash tables. Uses native-width integer

8+

// loads and a 128-bit multiply-and-fold mixer for excellent avalanche

9+

// properties on short byte sequences (network identifiers, addresses,

10+

// tokens).

11+

//

12+

// Based on rapidhash V3 by Nicolas De Carli, which evolved from wyhash

13+

// by Wang Yi. Both use the same core mixing primitive (MUM: multiply,

14+

// then XOR the high and low halves of the 128-bit result).

15+

//

16+

// rapidhash: https://github.com/Nicoshev/rapidhash

17+

// Copyright (C) 2025 Nicolas De Carli — MIT License

18+

// wyhash: https://github.com/wangyi-fudan/wyhash

19+

// Wang Yi — public domain (The Unlicense)

20+

//

21+

// The implementation here uses rapidhash's read strategy (native-width

22+

// overlapping reads, optimized for short inputs) and secret constants.

23+

// The core mixing function (rapid_mum/rapid_mix) is identical to

24+

// wyhash's wymum/wymix.

25+26+

#include <cstddef>

27+

#include <cstdint>

28+

#include <cstring>

29+30+

#if defined(_MSC_VER)

31+

#include <intrin.h>

32+

#if defined(_M_X64) && !defined(_M_ARM64EC)

33+

#pragma intrinsic(_umul128)

34+

#endif

35+

#endif

36+37+

namespace node {

38+39+

namespace hash_detail {

40+41+

// 128-bit multiply, then XOR the high and low halves.

42+

// This is the core mixing function ("rapid_mum" / "wymum").

43+

// On 64-bit platforms with __int128, this compiles to a single

44+

// mul instruction + shift + xor.

45+

inline uint64_t RapidMix(uint64_t a, uint64_t b) {

46+

#ifdef __SIZEOF_INT128__

47+

__uint128_t r = static_cast<__uint128_t>(a) * b;

48+

a = static_cast<uint64_t>(r);

49+

b = static_cast<uint64_t>(r >> 64);

50+

#elif defined(_MSC_VER) && (defined(_WIN64) || defined(_M_HYBRID_CHPE_ARM64))

51+

#if defined(_M_X64)

52+

a = _umul128(a, b, &b);

53+

#else

54+

uint64_t hi = __umulh(a, b);

55+

a = a * b;

56+

b = hi;

57+

#endif

58+

#else

59+

// Portable 64x64 -> 128-bit multiply fallback for 32-bit platforms.

60+

uint64_t ha = a >> 32, hb = b >> 32;

61+

uint64_t la = static_cast<uint32_t>(a), lb = static_cast<uint32_t>(b);

62+

uint64_t rh = ha * hb, rm0 = ha * lb, rm1 = hb * la, rl = la * lb;

63+

uint64_t t = rl + (rm0 << 32);

64+

uint64_t lo = t + (rm1 << 32);

65+

uint64_t hi = rh + (rm0 >> 32) + (rm1 >> 32) + (t < rl) + (lo < t);

66+

a = lo;

67+

b = hi;

68+

#endif

69+

return a ^ b;

70+

}

71+72+

// Inline 128-bit multiply WITHOUT the final XOR (used in the

73+

// penultimate mixing step where a and b are updated separately).

74+

inline void RapidMum(uint64_t* a, uint64_t* b) {

75+

#ifdef __SIZEOF_INT128__

76+

__uint128_t r = static_cast<__uint128_t>(*a) * (*b);

77+

*a = static_cast<uint64_t>(r);

78+

*b = static_cast<uint64_t>(r >> 64);

79+

#elif defined(_MSC_VER) && (defined(_WIN64) || defined(_M_HYBRID_CHPE_ARM64))

80+

#if defined(_M_X64)

81+

*a = _umul128(*a, *b, b);

82+

#else

83+

uint64_t hi = __umulh(*a, *b);

84+

*a = (*a) * (*b);

85+

*b = hi;

86+

#endif

87+

#else

88+

uint64_t ha = *a >> 32, hb = *b >> 32;

89+

uint64_t la = static_cast<uint32_t>(*a), lb = static_cast<uint32_t>(*b);

90+

uint64_t rh = ha * hb, rm0 = ha * lb, rm1 = hb * la, rl = la * lb;

91+

uint64_t t = rl + (rm0 << 32);

92+

*a = t + (rm1 << 32);

93+

*b = rh + (rm0 >> 32) + (rm1 >> 32) + (t < rl) + (*a < t);

94+

#endif

95+

}

96+97+

// Read functions. The compiler optimizes small fixed-size memcpy calls

98+

// to single load instructions — no actual byte-by-byte copy occurs.

99+

inline uint64_t RapidRead64(const uint8_t* p) {

100+

uint64_t v;

101+

memcpy(&v, p, sizeof(uint64_t));

102+

return v;

103+

}

104+105+

inline uint64_t RapidRead32(const uint8_t* p) {

106+

uint32_t v;

107+

memcpy(&v, p, sizeof(uint32_t));

108+

return v;

109+

}

110+111+

// Default rapidhash secret parameters.

112+

constexpr uint64_t kSecret[8] = {0x2d358dccaa6c78a5ULL,

113+

0x8bb84b93962eacc9ULL,

114+

0x4b33a62ed433d4a3ULL,

115+

0x4d5a2da51de1aa47ULL,

116+

0xa0761d6478bd642fULL,

117+

0xe7037ed1a0b428dbULL,

118+

0x90ed1765281c388cULL,

119+

0xaaaaaaaaaaaaaaaaULL};

120+121+

} // namespace hash_detail

122+123+

// Hash a contiguous byte range. Optimized for short inputs (≤48 bytes)

124+

// which is the common case for network identifiers and addresses. For

125+

// inputs >48 bytes, falls through to a loop processing 48-byte chunks.

126+

inline size_t HashBytes(const void* data, size_t len) {

127+

const uint8_t* p = static_cast<const uint8_t*>(data);

128+129+

// Seed initialization.

130+

uint64_t seed = hash_detail::RapidMix(0 ^ hash_detail::kSecret[2],

131+

hash_detail::kSecret[1]);

132+

uint64_t a = 0;

133+

uint64_t b = 0;

134+

size_t i = len;

135+136+

if (len <= 16) {

137+

if (len >= 4) {

138+

// Mix length into seed for better distribution of

139+

// different-length inputs with shared prefixes.

140+

seed ^= len;

141+

if (len >= 8) {

142+

// 8-16 bytes: two native 64-bit reads (overlapping from end).

143+

a = hash_detail::RapidRead64(p);

144+

b = hash_detail::RapidRead64(p + len - 8);

145+

} else {

146+

// 4-7 bytes: two 32-bit reads (overlapping from end).

147+

a = hash_detail::RapidRead32(p);

148+

b = hash_detail::RapidRead32(p + len - 4);

149+

}

150+

} else if (len > 0) {

151+

// 1-3 bytes: spread bytes across two values for mixing.

152+

a = (static_cast<uint64_t>(p[0]) << 45) | p[len - 1];

153+

b = p[len >> 1];

154+

} else {

155+

a = b = 0;

156+

}

157+

} else if (len <= 48) {

158+

// 17-48 bytes: process in 16-byte chunks, then read the tail.

159+

seed = hash_detail::RapidMix(

160+

hash_detail::RapidRead64(p) ^ hash_detail::kSecret[2],

161+

hash_detail::RapidRead64(p + 8) ^ seed);

162+

if (len > 32) {

163+

seed = hash_detail::RapidMix(

164+

hash_detail::RapidRead64(p + 16) ^ hash_detail::kSecret[2],

165+

hash_detail::RapidRead64(p + 24) ^ seed);

166+

}

167+

a = hash_detail::RapidRead64(p + len - 16) ^ len;

168+

b = hash_detail::RapidRead64(p + len - 8);

169+

} else {

170+

// >48 bytes: process 48-byte chunks with three parallel mix lanes.

171+

uint64_t see1 = seed;

172+

uint64_t see2 = seed;

173+

do {

174+

seed = hash_detail::RapidMix(

175+

hash_detail::RapidRead64(p) ^ hash_detail::kSecret[0],

176+

hash_detail::RapidRead64(p + 8) ^ seed);

177+

see1 = hash_detail::RapidMix(

178+

hash_detail::RapidRead64(p + 16) ^ hash_detail::kSecret[1],

179+

hash_detail::RapidRead64(p + 24) ^ see1);

180+

see2 = hash_detail::RapidMix(

181+

hash_detail::RapidRead64(p + 32) ^ hash_detail::kSecret[2],

182+

hash_detail::RapidRead64(p + 40) ^ see2);

183+

p += 48;

184+

i -= 48;

185+

} while (i > 48);

186+

seed ^= see1 ^ see2;

187+

// Process remaining 17-48 bytes.

188+

if (i > 16) {

189+

seed = hash_detail::RapidMix(

190+

hash_detail::RapidRead64(p) ^ hash_detail::kSecret[2],

191+

hash_detail::RapidRead64(p + 8) ^ seed);

192+

if (i > 32) {

193+

seed = hash_detail::RapidMix(

194+

hash_detail::RapidRead64(p + 16) ^ hash_detail::kSecret[2],

195+

hash_detail::RapidRead64(p + 24) ^ seed);

196+

}

197+

}

198+

a = hash_detail::RapidRead64(p + i - 16) ^ i;

199+

b = hash_detail::RapidRead64(p + i - 8);

200+

}

201+202+

// Final mix.

203+

a ^= hash_detail::kSecret[1];

204+

b ^= seed;

205+

hash_detail::RapidMum(&a, &b);

206+

return static_cast<size_t>(hash_detail::RapidMix(

207+

a ^ hash_detail::kSecret[7], b ^ hash_detail::kSecret[1] ^ len));

208+

}

209+210+

} // namespace node

211+212+

#endif // defined(NODE_WANT_INTERNALS) && NODE_WANT_INTERNALS