gh-111545: Add PyHash_Double() function by vstinner · Pull Request #112095 · python/cpython

vstinner

I ran microbenchmarks on 3 different APIs. Measured performance is between 13.6 ns and 14.7 ns. The maximum difference is 1.1 ns: 1.08x slower. It seems like the current _Py_HashDouble() API is the fastest.

In the 3 APIs, the C double input value is passed through the xmm0 register at the ABI level.

I expected Py_hash_t PyHash_Double(double value) to be the fastest API. It's not the case. I always get bad surprises in my benchmarks. You should try to reproduce them and play with the code to double check that I didn't mess up with my measurement.

I wrote 3 benchmarks on:

(A) Py_hash_t _Py_HashDouble(PyObject*, double) of the main branch
(B) int PyHash_Double(double value, Py_hash_t *result) of PR gh-111545: Add Py_HashDouble() function #112449
(C) Py_hash_t PyHash_Double(double value)

Results using CPU isolation, Python built with gcc -O3 (without PGO, without LTO, just ./configure):

A: 13.6 ns +- 0.0 ns
B: 14.0 ns +- 0.0 ns
C: 14.7 ns +- 0.1 ns

+-----------+---------+-----------------------+-----------------------+
| Benchmark | A       | B                     | C                     |
+===========+=========+=======================+=======================+
| bench     | 13.6 ns | 14.0 ns: 1.03x slower | 14.7 ns: 1.08x slower |
+-----------+---------+-----------------------+-----------------------+

I added benchmark code in _testinternalcapi extension which is built as a shared library, so calling _Py_HashDouble() and PyHash_Double() goes go through PLT (procedure linkage table) indirection.

I added an artificial test on the hash value to use it in the benchmark, so the compiler doesn't remove the whole function call, and the code is a little bit more realistic.

(A) assembly code:

Py_hash_t hash = _Py_HashDouble(obj, d);
if (hash == -1) {
    ...
}

mov    rax,QWORD PTR [rip+0x698e]        # 0x7ffff7c09690
mov    rdi,rbp
movq   xmm0,rax
call   0x7ffff7c022b0 <_Py_HashDouble@plt>
cmp    rax,0xffffffffffffffff
jne    ...

(B) assembly code:

Py_hash_t hash;
if (PyHash_Double(d, &hash) == 0) {
    return NULL;
}

mov    rax,QWORD PTR [rip+0x6a1f]        # 0x7ffff7c09690
mov    rdi,rbp
movq   xmm0,rax
call   0x7ffff7c02990 <PyHash_Double@plt>
test   eax,eax
jne    ...

Py_hash_t hash = PyHash_Double(d);
if (hash == -1) {
    ...
}

mov    rax,QWORD PTR [rip+0x6a26]        # 0x7ffff7c09690
movq   xmm0,rax
call   0x7ffff7c02990 <PyHash_Double@plt>
cmp    rax,0xffffffffffffffff
jne ...

Change used to benchmark (A) and (B). Measuring (C) requires minor changes.

diff --git a/Modules/_testinternalcapi.c b/Modules/_testinternalcapi.c
index 4607a3faf17..9b671acf916 100644
--- a/Modules/_testinternalcapi.c
+++ b/Modules/_testinternalcapi.c
@@ -1625,6 +1625,51 @@ get_type_module_name(PyObject *self, PyObject *type)
 }
 
 
+static PyObject *
+test_bench_private_hash_double(PyObject *Py_UNUSED(module), PyObject *args)
+{
+    Py_ssize_t loops;
+    if (!PyArg_ParseTuple(args, "n", &loops)) {
+        return NULL;
+    }
+    PyObject *obj = Py_None;
+    double d = 1.0;
+
+    _PyTime_t t1 = _PyTime_GetPerfCounter();
+    for (Py_ssize_t i=0; i < loops; i++) {
+        Py_hash_t hash = _Py_HashDouble(obj, d);
+        if (hash == -1) {
+            return NULL;
+        }
+    }
+    _PyTime_t t2 = _PyTime_GetPerfCounter();
+
+    return PyFloat_FromDouble(_PyTime_AsSecondsDouble(t2 - t1));
+}
+
+
+static PyObject *
+test_bench_public_hash_double(PyObject *Py_UNUSED(module), PyObject *args)
+{
+    Py_ssize_t loops;
+    if (!PyArg_ParseTuple(args, "n", &loops)) {
+        return NULL;
+    }
+    double d = 1.0;
+
+    _PyTime_t t1 = _PyTime_GetPerfCounter();
+    for (Py_ssize_t i=0; i < loops; i++) {
+        Py_hash_t hash;
+        if (PyHash_Double(d, &hash) == 0) {
+            return NULL;
+        }
+    }
+    _PyTime_t t2 = _PyTime_GetPerfCounter();
+
+    return PyFloat_FromDouble(_PyTime_AsSecondsDouble(t2 - t1));
+}
+
+
 static PyMethodDef module_functions[] = {
     {"get_configs", get_configs, METH_NOARGS},
     {"get_recursion_depth", get_recursion_depth, METH_NOARGS},
@@ -1688,6 +1733,8 @@ static PyMethodDef module_functions[] = {
     {"restore_crossinterp_data", restore_crossinterp_data,       METH_VARARGS},
     _TESTINTERNALCAPI_TEST_LONG_NUMBITS_METHODDEF
     {"get_type_module_name",    get_type_module_name,            METH_O},
+    {"bench_private_hash_double", test_bench_private_hash_double, METH_VARARGS},
+    {"bench_public_hash_double", test_bench_public_hash_double, METH_VARARGS},
     {NULL, NULL} /* sentinel */
 };

Bench script for (A), private API:

import pyperf
import _testinternalcapi
runner = pyperf.Runner()
runner.bench_time_func('bench', _testinternalcapi.bench_private_hash_double)

Bench script for (B) and (C), public API:

import pyperf
import _testinternalcapi
runner = pyperf.Runner()
runner.bench_time_func('bench', _testinternalcapi.bench_public_hash_double)