gh-144586: Improve _Py_yield to improve light weight cpu instruction by corona10 · Pull Request #144587 · python/cpython
Benchmark on my Mac mini (consistently improved)
baseline
Python: 3.15.0a5+ free-threading build (heads/gh-115697:e682141c495, Feb 8 2026, 16:38:00) [Clang 17.0.0 (clang-1700.6.3.2)]
GIL enabled: False
CPUs: 12
2 threads: 0.0284s (14,103,668 ops/sec)
4 threads: 0.0728s (10,988,690 ops/sec)
8 threads: 0.3063s (5,223,362 ops/sec)
with PR
➜ cpython git:(gh-144586) ✗ ./python.exe bench_mutex_contention.py
Python: 3.15.0a5+ free-threading build (heads/gh-144586:21bd43c7e5e, Feb 8 2026, 16:34:31) [Clang 17.0.0 (clang-1700.6.3.2)]
GIL enabled: False
CPUs: 12
2 threads: 0.0239s (16,738,824 ops/sec)
4 threads: 0.0559s (14,300,174 ops/sec)
8 threads: 0.1813s (8,824,965 ops/sec)
script
import threading
import time
import sys
import os
# Thread counts to benchmark; the driver runs one measurement series per entry.
NUM_THREADS_LIST = [2, 4, 8]
# Number of lock acquire/increment/release cycles each worker thread performs.
OPS_PER_THREAD = 200_000
# Repetitions per thread count; the driver reports the fastest (minimum) time.
ROUNDS = 3
def contention_bench(num_threads, ops):
    """Time ``num_threads`` threads contending on a single ``threading.Lock``.

    Each worker acquires the shared lock ``ops`` times and increments a
    shared counter while holding it.

    Args:
        num_threads: Number of worker threads to start.
        ops: Number of locked increments each worker performs.

    Returns:
        A ``(elapsed_seconds, total_increments)`` tuple. ``elapsed_seconds``
        is measured with ``time.perf_counter()`` from the moment the calling
        thread passes the start barrier until every worker has been joined.
        ``total_increments`` is the final counter value.
    """
    lock = threading.Lock()
    # One-element list so the nested worker can mutate the counter in place.
    total = [0]
    # +1 party for the calling thread, so the workers and the timer are
    # released together instead of timing thread start-up.
    barrier = threading.Barrier(num_threads + 1)

    def worker():
        barrier.wait()
        for _ in range(ops):
            with lock:
                total[0] += 1

    threads = [threading.Thread(target=worker) for _ in range(num_threads)]
    for t in threads:
        t.start()

    # Releases the workers; the clock starts right after the barrier trips.
    barrier.wait()
    t0 = time.perf_counter()
    for t in threads:
        t.join()
    return time.perf_counter() - t0, total[0]
if __name__ == "__main__":
    # Print the interpreter/build details so results can be compared across
    # builds.
    print(f"Python: {sys.version}")
    # sys._is_gil_enabled is not present on every interpreter; only report it
    # when available.
    if hasattr(sys, "_is_gil_enabled"):
        print(f"GIL enabled: {sys._is_gil_enabled()}")
    print(f"CPUs: {os.cpu_count()}\n")

    for nt in NUM_THREADS_LIST:
        # Keep the fastest of ROUNDS runs for each thread count.
        best = float("inf")
        for _ in range(ROUNDS):
            elapsed, total = contention_bench(nt, OPS_PER_THREAD)
            best = min(best, elapsed)
        # `total` is the increment count from the last round.
        print(f"{nt} threads: {best:.4f}s ({total/best:,.0f} ops/sec)")