Improve _Py_yield to use a lightweight CPU instruction
Proposal:
_Py_yield() currently relies on sched_yield()/SwitchToThread(), which are OS-level system calls. We can replace these with lightweight CPU pause instructions (x86 PAUSE, AArch64 WFE, etc.), as CPython's bundled mimalloc already does in mi_atomic_yield():
```c
// Yield
#if defined(__cplusplus)
#include <thread>
static inline void mi_atomic_yield(void) {
  std::this_thread::yield();
}
#elif defined(_WIN32)
#define WIN32_LEAN_AND_MEAN
#include <windows.h>
static inline void mi_atomic_yield(void) {
  YieldProcessor();
}
#elif defined(__SSE2__)
#include <emmintrin.h>
static inline void mi_atomic_yield(void) {
  _mm_pause();
}
#elif (defined(__GNUC__) || defined(__clang__)) && \
      (defined(__x86_64__) || defined(__i386__) || \
       defined(__aarch64__) || defined(__arm__) || \
       defined(__powerpc__) || defined(__ppc__) || defined(__PPC__) || defined(__POWERPC__))
#if defined(__x86_64__) || defined(__i386__)
static inline void mi_atomic_yield(void) {
  __asm__ volatile ("pause" ::: "memory");
}
#elif defined(__aarch64__)
static inline void mi_atomic_yield(void) {
  __asm__ volatile("wfe");
}
#elif defined(__arm__)
#if __ARM_ARCH >= 7
static inline void mi_atomic_yield(void) {
  __asm__ volatile("yield" ::: "memory");
}
#else
static inline void mi_atomic_yield(void) {
  __asm__ volatile ("nop" ::: "memory");
}
#endif
#elif defined(__powerpc__) || defined(__ppc__) || defined(__PPC__) || defined(__POWERPC__)
#ifdef __APPLE__
static inline void mi_atomic_yield(void) {
  __asm__ volatile ("or r27,r27,r27" ::: "memory");
}
#else
static inline void mi_atomic_yield(void) {
  __asm__ __volatile__ ("or 27,27,27" ::: "memory");
}
#endif
#endif
#elif defined(__sun)
// Fallback for other archs
#include <synch.h>
static inline void mi_atomic_yield(void) {
  smt_pause();
}
#elif defined(__wasi__)
#include <sched.h>
static inline void mi_atomic_yield(void) {
  sched_yield();
}
#else
#include <unistd.h>
static inline void mi_atomic_yield(void) {
  sleep(0);
}
#endif
```
Has this already been discussed elsewhere?
No response given
Links to previous discussion of this feature:
No response