gh-144586: Improve _Py_yield to use a lightweight CPU instruction by corona10 · Pull Request #144587 · python/cpython

bedevere-app

Benchmark on my Mac mini (consistently improved)

baseline

Python: 3.15.0a5+ free-threading build (heads/gh-115697:e682141c495, Feb  8 2026, 16:38:00) [Clang 17.0.0 (clang-1700.6.3.2)]
GIL enabled: False
CPUs: 12

2 threads: 0.0284s  (14,103,668 ops/sec)
4 threads: 0.0728s  (10,988,690 ops/sec)
8 threads: 0.3063s  (5,223,362 ops/sec)

with PR

➜  cpython git:(gh-144586) ✗ ./python.exe bench_mutex_contention.py
Python: 3.15.0a5+ free-threading build (heads/gh-144586:21bd43c7e5e, Feb  8 2026, 16:34:31) [Clang 17.0.0 (clang-1700.6.3.2)]
GIL enabled: False
CPUs: 12

2 threads: 0.0239s  (16,738,824 ops/sec)
4 threads: 0.0559s  (14,300,174 ops/sec)
8 threads: 0.1813s  (8,824,965 ops/sec)

script

import threading
import time
import sys
import os

# Thread counts to benchmark; the main block runs one measurement per entry.
NUM_THREADS_LIST = [2, 4, 8]
# Number of lock-protected increments each worker thread performs.
OPS_PER_THREAD = 200_000
# Repetitions per thread count; the main block reports the fastest one.
ROUNDS = 3


def contention_bench(num_threads, ops):
    """Time ``num_threads`` threads contending for a single lock.

    Every worker acquires one shared ``threading.Lock`` ``ops`` times and
    bumps a shared counter while holding it.

    Args:
        num_threads: Number of worker threads to spawn.
        ops: Number of locked increments each worker performs.

    Returns:
        A ``(elapsed_seconds, final_count)`` tuple, where the elapsed time
        runs from the moment the calling thread passes the start barrier
        until every worker has been joined.
    """
    mutex = threading.Lock()
    # A one-element list lets the nested function mutate the count in place.
    counter = [0]
    # One extra party for the calling thread, so the clock is only started
    # once every worker has reached the barrier.
    start_gate = threading.Barrier(num_threads + 1)

    def hammer():
        start_gate.wait()
        for _ in range(ops):
            with mutex:
                counter[0] += 1

    pool = tuple(threading.Thread(target=hammer) for _ in range(num_threads))
    for th in pool:
        th.start()

    start_gate.wait()
    started = time.perf_counter()
    for th in pool:
        th.join()
    finished = time.perf_counter()

    return finished - started, counter[0]


if __name__ == "__main__":
    # Describe the interpreter and machine so runs can be compared.
    print(f"Python: {sys.version}")
    # sys._is_gil_enabled is not present on every interpreter, hence the check.
    if hasattr(sys, "_is_gil_enabled"):
        print(f"GIL enabled: {sys._is_gil_enabled()}")
    print(f"CPUs: {os.cpu_count()}\n")

    for thread_count in NUM_THREADS_LIST:
        # Keep the fastest of ROUNDS repetitions for this thread count.
        fastest = float("inf")
        for _ in range(ROUNDS):
            seconds, ops_done = contention_bench(thread_count, OPS_PER_THREAD)
            if seconds < fastest:
                fastest = seconds
        throughput = ops_done / fastest
        print(f"{thread_count} threads: {fastest:.4f}s  ({throughput:,.0f} ops/sec)")