bpo-44850: Improve the performance of methodcaller.#27782
Conversation
0429366 to
c074967
Compare
August 16, 2021 18:54
c074967 to
563988d
Compare
August 17, 2021 08:55
|
A thought: there could be another performance boost in adding a tp_vectorcall_offset to the |
Sorry, something went wrong.
|
This is not possible because methodcaller is a heap type, for which vectorcall is not recommended (https://docs.python.org/3/c-api/typeobj.html#c.PyTypeObject.tp_vectorcall_offset). |
Sorry, something went wrong.
|
Is that restriction still applicable since |
Sorry, something went wrong.
|
OK, I gave it a try, but I wasn't able to make it work :( If you want to give it a try, the patch is below; the printf() shows that methodcaller_vectorcall is never reached. diff --git i/Modules/_operator.c w/Modules/_operator.c
index de4613a6a8..cee6df24fe 100644
--- i/Modules/_operator.c
+++ w/Modules/_operator.c
@@ -1,5 +1,6 @@
#include "Python.h"
#include "pycore_moduleobject.h" // _PyModule_GetState()
+#include "structmember.h"
#include "clinic/_operator.c.h"
typedef struct {
@@ -1480,8 +1481,44 @@ typedef struct {
PyObject *kwds;
PyObject **vectorcall_args; /* Borrowed references */
PyObject *vectorcall_kwnames;
+ vectorcallfunc vectorcall;
} methodcallerobject;
+static PyObject *
+methodcaller_vectorcall(
+ methodcallerobject *mc, PyObject *const *args, size_t nargsf, PyObject* kwnames)
+{
+ printf("vectorcall\n");
+ if (!_PyArg_NoKwnames("methodcaller", kwnames))
+ return NULL;
+ if (PyVectorcall_NARGS(nargsf) != 1) {
+ PyErr_Format(
+ PyExc_TypeError,
+ "methodcaller expected 1 argument, got %zd",
+ PyVectorcall_NARGS(nargsf));
+ return NULL;
+ }
+ mc->vectorcall_args[0] = args[0];
+ return PyObject_VectorcallMethod(
+ mc->name, mc->vectorcall_args,
+ (1 + PyTuple_GET_SIZE(mc->args)) | PY_VECTORCALL_ARGUMENTS_OFFSET,
+ mc->vectorcall_kwnames);
+}
+
+static PyObject *
+methodcaller_call(methodcallerobject *mc, PyObject *args, PyObject *kw)
+{
+ if (!_PyArg_NoKeywords("methodcaller", kw))
+ return NULL;
+ if (!_PyArg_CheckPositional("methodcaller", PyTuple_GET_SIZE(args), 1, 1))
+ return NULL;
+ mc->vectorcall_args[0] = PyTuple_GET_ITEM(args, 0);
+ return PyObject_VectorcallMethod(
+ mc->name, mc->vectorcall_args,
+ (1 + PyTuple_GET_SIZE(mc->args)) | PY_VECTORCALL_ARGUMENTS_OFFSET,
+ mc->vectorcall_kwnames);
+}
+
/* AC 3.5: variable number of arguments, not currently support by AC */
static PyObject *
methodcaller_new(PyTypeObject *type, PyObject *args, PyObject *kwds)
@@ -1548,6 +1585,7 @@ methodcaller_new(PyTypeObject *type, PyObject *args, PyObject *kwds)
else {
mc->vectorcall_kwnames = NULL;
}
+ mc->vectorcall = (vectorcallfunc)methodcaller_vectorcall;
PyObject_GC_Track(mc);
return (PyObject *)mc;
@@ -1584,20 +1622,6 @@ methodcaller_traverse(methodcallerobject *mc, visitproc visit, void *arg)
return 0;
}
-static PyObject *
-methodcaller_call(methodcallerobject *mc, PyObject *args, PyObject *kw)
-{
- if (!_PyArg_NoKeywords("methodcaller", kw))
- return NULL;
- if (!_PyArg_CheckPositional("methodcaller", PyTuple_GET_SIZE(args), 1, 1))
- return NULL;
- mc->vectorcall_args[0] = PyTuple_GET_ITEM(args, 0);
- return PyObject_VectorcallMethod(
- mc->name, mc->vectorcall_args,
- (1 + PyTuple_GET_SIZE(mc->args)) | PY_VECTORCALL_ARGUMENTS_OFFSET,
- mc->vectorcall_kwnames);
-}
-
static PyObject *
methodcaller_repr(methodcallerobject *mc)
{
@@ -1722,6 +1746,12 @@ static PyMethodDef methodcaller_methods[] = {
reduce_doc},
{NULL}
};
+
+static PyMemberDef methodcaller_members[] = {
+ {"__vectorcalloffset__", T_PYSSIZET, offsetof(methodcallerobject, vectorcall), READONLY},
+ {NULL}
+};
+
PyDoc_STRVAR(methodcaller_doc,
"methodcaller(name, ...) --> methodcaller object\n\
\n\
@@ -1737,6 +1767,7 @@ static PyType_Slot methodcaller_type_slots[] = {
{Py_tp_traverse, methodcaller_traverse},
{Py_tp_clear, methodcaller_clear},
{Py_tp_methods, methodcaller_methods},
+ {Py_tp_members, methodcaller_members},
{Py_tp_new, methodcaller_new},
{Py_tp_getattro, PyObject_GenericGetAttr},
{Py_tp_repr, methodcaller_repr}, |
Sorry, something went wrong.
|
The patch should work adding |
Sorry, something went wrong.
563988d to
a4a04bf
Compare
August 18, 2021 16:32
|
Ah yes, thanks, it now works, and this provided another ~25-33% speedup on the same benchmarks. |
Sorry, something went wrong.
|
Without benchmarks showing This PR also makes |
Sorry, something went wrong.
|
Here's a still toyish example, but actually representative of the original motivation (which was in fact to speed up from operator import methodcaller
import secrets
from timeit import Timer
# Construct many rows of pseudo-random data...
randdata = [secrets.token_hex(4) for _ in range(1_000_000)]
# ... on which we want to apply some simple operation. We'll also sum() these
# (or perform some other kind of reduction).
strcount0 = methodcaller("count", "0")
tglobals = {"randdata": randdata, "strcount0": strcount0}
# With a genexp.
timer = Timer("sum(s.count('0') for s in randdata)",
globals=tglobals)
n, t = timer.autorange()
print(t/n)
# With map + methodcaller.
timer = Timer("sum(map(strcount0, randdata))",
globals=tglobals)
n, t = timer.autorange()
print(t/n)
# Check that we got our implementation right.
assert sum(s.count('0') for s in randdata) == sum(map(strcount0, randdata))Before this patch, the timings where After, they go to i.e. speeding up methodcaller significantly improved the performance. Note that the docs of the itertools module expressly suggest using the "high-speed functions in the operator module", which (admittedly implicitly) hints that the per-iteration performance of operators is perhaps more worthy of consideration that the construction time. |
Sorry, something went wrong.
|
Edit: I missed the comment about lambdas at https://bugs.python.org/issue44850#msg399903, so here's the same benchmark as above including the lambda approach (unsuprisingly the slowest): from operator import methodcaller
import secrets
from timeit import Timer
# Construct many rows of pseudo-random data...
randdata = [secrets.token_hex(4) for _ in range(1_000_000)]
# ... on which we want to apply some simple operation. We'll also sum() these
# (or perform some other kind of reduction).
func = lambda x: x.count("0")
mc = methodcaller("count", "0")
tglobals = {"randdata": randdata, "func": func, "mc": mc}
# With a genexp.
timer = Timer("sum(s.count('0') for s in randdata)", globals=tglobals)
n, t = timer.autorange()
print("genexp ", t/n)
# With map + lambda.
timer = Timer("sum(map(func, randdata))", globals=tglobals)
n, t = timer.autorange()
print("lambda ", t/n)
# With map + methodcaller.
timer = Timer("sum(map(mc, randdata))", globals=tglobals)
n, t = timer.autorange()
print("methcall", t/n)
# Check that we got our implementation right.
assert sum(s.count('0') for s in randdata) == sum(map(func, randdata)) == sum(map(mc, randdata))Results: |
Sorry, something went wrong.
22b89cf to
5897c07
Compare
August 22, 2021 09:58
5897c07 to
12a80bd
Compare
August 22, 2021 10:06
|
This PR is stale because it has been open for 30 days with no activity. |
Sorry, something went wrong.
|
This is still ready from my side, AFAICT. |
Sorry, something went wrong.
|
This all looks reasonable to me. |
Sorry, something went wrong.
|
@anntzer The PR has merge conflicts, but they are trivial to resolve (feel free to merge https://github.com/eendebakpt/cpython/tree/fastmethodcaller where I merged with current main). Compared to current main this PR is still faster. With: I get |
Sorry, something went wrong.
|
@eendebakpt If you are interested, do you mind taking over this patch? I've kind of moved on since 2y ago. |
Sorry, something went wrong.
|
This pr can be closed, since the alternate has been merged. |
Sorry, something went wrong.
See discussion at https://bugs.python.org/issue44850.
This is currently split into two separate commits, so that the separate speedups from _PyObject_GetMethod and vectorcall can be evaluated separately (plus a third doc-only commit). Edit: Now a single commit, per #27782 (comment), so only consider the first and last benchmarks.
The benchmark script tests builtin and python methods, with and without kwargs:
( export PYTHON=./python echo 'No kws.' $PYTHON -m pyperf timeit -s "from operator import methodcaller as mc" -s "call = mc('sort')" -s "arr = []" "call(arr)" $PYTHON -m pyperf timeit -s "call = lambda x: x.sort()" -s "arr = []" "call(arr)" $PYTHON -m pyperf timeit -s "from operator import methodcaller as mc" -s "call = mc('sort')" -s "class A: sort = lambda self: None" -s "arr=A()" "call(arr)" $PYTHON -m pyperf timeit -s "call = lambda x: x.sort()" -s "class A: sort = lambda self: None" -s "arr=A()" "call(arr)" echo 'With kws.' $PYTHON -m pyperf timeit -s "from operator import methodcaller as mc" -s "call = mc('sort', reverse=True)" -s "arr = []" "call(arr)" $PYTHON -m pyperf timeit -s "call = lambda x: x.sort(reverse=True)" -s "arr = []" "call(arr)" $PYTHON -m pyperf timeit -s "from operator import methodcaller as mc" -s "call = mc('sort', reverse=True)" -s "class A: sort = lambda self, reverse=False: None" -s "arr=A()" "call(arr)" $PYTHON -m pyperf timeit -s "call = lambda x: x.sort(reverse=True)" -s "class A: sort = lambda self, reverse=False: None" -s "arr=A()" "call(arr)" )Before:
Using _PyObject_GetMethod:
Using _PyObject_GetMethod and vectorcall:
https://bugs.python.org/issue44850