gh-111545: Add PyHash_Double() function by vstinner · Pull Request #112095 · python/cpython
I ran microbenchmarks on 3 different APIs. Measured performance is between 13.6 ns and 14.7 ns. The maximum difference is 1.1 ns: 1.08x slower. It seems like the current _Py_HashDouble() API is the fastest.
In the 3 APIs, the C double input value is passed through the xmm0 register at the ABI level.
I expected Py_hash_t PyHash_Double(double value) to be the fastest API. It's not the case. I always get bad surprises in my benchmarks. You should try to reproduce them and play with the code to double check that I didn't mess up with my measurement.
I wrote 3 benchmarks on:
- (A)
Py_hash_t _Py_HashDouble(PyObject*, double)of the main branch - (B)
int PyHash_Double(double value, Py_hash_t *result)of PR gh-111545: Add Py_HashDouble() function #112449 - (C)
Py_hash_t PyHash_Double(double value)
Results using CPU isolation, Python built with gcc -O3 (without PGO, without LTO, just ./configure):
- A: 13.6 ns +- 0.0 ns
- B: 14.0 ns +- 0.0 ns
- C: 14.7 ns +- 0.1 ns
+-----------+---------+-----------------------+-----------------------+
| Benchmark | A | B | C |
+===========+=========+=======================+=======================+
| bench | 13.6 ns | 14.0 ns: 1.03x slower | 14.7 ns: 1.08x slower |
+-----------+---------+-----------------------+-----------------------+
I added benchmark code in _testinternalcapi extension which is built as a shared library, so calling _Py_HashDouble() and PyHash_Double() goes go through PLT (procedure linkage table) indirection.
I added an artificial test on the hash value to use it in the benchmark, so the compiler doesn't remove the whole function call, and the code is a little bit more realistic.
(A) assembly code:
Py_hash_t hash = _Py_HashDouble(obj, d);
if (hash == -1) {
...
}
mov rax,QWORD PTR [rip+0x698e] # 0x7ffff7c09690
mov rdi,rbp
movq xmm0,rax
call 0x7ffff7c022b0 <_Py_HashDouble@plt>
cmp rax,0xffffffffffffffff
jne ...
(B) assembly code:
Py_hash_t hash;
if (PyHash_Double(d, &hash) == 0) {
return NULL;
}
mov rax,QWORD PTR [rip+0x6a1f] # 0x7ffff7c09690
mov rdi,rbp
movq xmm0,rax
call 0x7ffff7c02990 <PyHash_Double@plt>
test eax,eax
jne ...
(C) assembly code:
Py_hash_t hash = PyHash_Double(d);
if (hash == -1) {
...
}
mov rax,QWORD PTR [rip+0x6a26] # 0x7ffff7c09690
movq xmm0,rax
call 0x7ffff7c02990 <PyHash_Double@plt>
cmp rax,0xffffffffffffffff
jne ...
Change used to benchmark (A) and (B). Measuring (C) requires minor changes.
diff --git a/Modules/_testinternalcapi.c b/Modules/_testinternalcapi.c index 4607a3faf17..9b671acf916 100644 --- a/Modules/_testinternalcapi.c +++ b/Modules/_testinternalcapi.c @@ -1625,6 +1625,51 @@ get_type_module_name(PyObject *self, PyObject *type) } +static PyObject * +test_bench_private_hash_double(PyObject *Py_UNUSED(module), PyObject *args) +{ + Py_ssize_t loops; + if (!PyArg_ParseTuple(args, "n", &loops)) { + return NULL; + } + PyObject *obj = Py_None; + double d = 1.0; + + _PyTime_t t1 = _PyTime_GetPerfCounter(); + for (Py_ssize_t i=0; i < loops; i++) { + Py_hash_t hash = _Py_HashDouble(obj, d); + if (hash == -1) { + return NULL; + } + } + _PyTime_t t2 = _PyTime_GetPerfCounter(); + + return PyFloat_FromDouble(_PyTime_AsSecondsDouble(t2 - t1)); +} + + +static PyObject * +test_bench_public_hash_double(PyObject *Py_UNUSED(module), PyObject *args) +{ + Py_ssize_t loops; + if (!PyArg_ParseTuple(args, "n", &loops)) { + return NULL; + } + double d = 1.0; + + _PyTime_t t1 = _PyTime_GetPerfCounter(); + for (Py_ssize_t i=0; i < loops; i++) { + Py_hash_t hash; + if (PyHash_Double(d, &hash) == 0) { + return NULL; + } + } + _PyTime_t t2 = _PyTime_GetPerfCounter(); + + return PyFloat_FromDouble(_PyTime_AsSecondsDouble(t2 - t1)); +} + + static PyMethodDef module_functions[] = { {"get_configs", get_configs, METH_NOARGS}, {"get_recursion_depth", get_recursion_depth, METH_NOARGS}, @@ -1688,6 +1733,8 @@ static PyMethodDef module_functions[] = { {"restore_crossinterp_data", restore_crossinterp_data, METH_VARARGS}, _TESTINTERNALCAPI_TEST_LONG_NUMBITS_METHODDEF {"get_type_module_name", get_type_module_name, METH_O}, + {"bench_private_hash_double", test_bench_private_hash_double, METH_VARARGS}, + {"bench_public_hash_double", test_bench_public_hash_double, METH_VARARGS}, {NULL, NULL} /* sentinel */ };
Bench script for (A), private API:
import pyperf import _testinternalcapi runner = pyperf.Runner() runner.bench_time_func('bench', _testinternalcapi.bench_private_hash_double)
Bench script for (B) and (C), public API:
import pyperf import _testinternalcapi runner = pyperf.Runner() runner.bench_time_func('bench', _testinternalcapi.bench_public_hash_double)