From 776a1e1ca877894b899b775ae562312adb65b6f2 Mon Sep 17 00:00:00 2001 From: Matt Page Date: Thu, 15 Aug 2024 11:00:32 -0700 Subject: [PATCH 01/67] Assign threads indices into bytecode copies --- Include/internal/pycore_code.h | 6 + Include/internal/pycore_index_pool.h | 49 +++++++ Include/internal/pycore_interp.h | 2 + Include/internal/pycore_tstate.h | 3 + Makefile.pre.in | 2 + Objects/codeobject.c | 2 + Python/index_pool.c | 193 +++++++++++++++++++++++++++ Python/pystate.c | 10 ++ Python/specialize.c | 16 +++ 9 files changed, 283 insertions(+) create mode 100644 Include/internal/pycore_index_pool.h create mode 100644 Python/index_pool.c diff --git a/Include/internal/pycore_code.h b/Include/internal/pycore_code.h index 57e0a14bb9b5bd..f05f4b1de4bb9c 100644 --- a/Include/internal/pycore_code.h +++ b/Include/internal/pycore_code.h @@ -11,6 +11,7 @@ extern "C" { #include "pycore_stackref.h" // _PyStackRef #include "pycore_lock.h" // PyMutex #include "pycore_backoff.h" // _Py_BackoffCounter +#include "pycore_tstate.h" // _PyThreadStateImpl /* Each instruction in a code object is a fixed-width value, @@ -600,6 +601,11 @@ struct _PyCode8 _PyCode_DEF(8); PyAPI_DATA(const struct _PyCode8) _Py_InitCleanup; +#ifdef Py_GIL_DISABLED +extern int _Py_ReserveSpecializedCodeIndex(PyInterpreterState *interp); +extern void _Py_ClearSpecializedCodeIndex(_PyThreadStateImpl *tstate); +#endif + #ifdef __cplusplus } #endif diff --git a/Include/internal/pycore_index_pool.h b/Include/internal/pycore_index_pool.h new file mode 100644 index 00000000000000..e393c52aecf7f2 --- /dev/null +++ b/Include/internal/pycore_index_pool.h @@ -0,0 +1,49 @@ +#ifndef Py_INTERNAL_INDEX_POOL_H +#define Py_INTERNAL_INDEX_POOL_H + +#include "Python.h" + +#ifdef __cplusplus +extern "C" { +#endif + +#ifndef Py_BUILD_CORE +# error "this header requires Py_BUILD_CORE define" +#endif + +#ifdef Py_GIL_DISABLED + +typedef struct _PyIndexHeap { + Py_ssize_t *values; + + // Number of items stored in values + 
Py_ssize_t size; + + // Maximum number of items that can be stored in values + Py_ssize_t capacity; +} _PyIndexHeap; + +typedef struct _PyIndexPool { + PyMutex mutex; + + // Min heap of indices available for allocation + _PyIndexHeap free_indices; + + // Next index to allocate if no free indices are available + Py_ssize_t next_index; +} _PyIndexPool; + +// Allocate the smallest available index. Returns -1 on error. +extern Py_ssize_t _PyIndexPool_AllocIndex(_PyIndexPool *indices); + +// Release `index` back to the pool +extern void _PyIndexPool_FreeIndex(_PyIndexPool *indices, Py_ssize_t index); + +extern void _PyIndexPool_Fini(_PyIndexPool *indices); + +#endif // Py_GIL_DISABLED + +#ifdef __cplusplus +} +#endif +#endif // !Py_INTERNAL_INDEX_POOL_H diff --git a/Include/internal/pycore_interp.h b/Include/internal/pycore_interp.h index a1c1dd0c957230..caf86b44374da8 100644 --- a/Include/internal/pycore_interp.h +++ b/Include/internal/pycore_interp.h @@ -26,6 +26,7 @@ extern "C" { #include "pycore_genobject.h" // _PyGen_FetchStopIterationValue #include "pycore_global_objects.h"// struct _Py_interp_cached_objects #include "pycore_import.h" // struct _import_state +#include "pycore_index_pool.h" // _PyIndexPool #include "pycore_instruments.h" // _PY_MONITORING_EVENTS #include "pycore_list.h" // struct _Py_list_state #include "pycore_mimalloc.h" // struct _mimalloc_interp_state @@ -223,6 +224,7 @@ struct _is { struct _brc_state brc; // biased reference counting state struct _Py_type_id_pool type_ids; PyMutex weakref_locks[NUM_WEAKREF_LIST_LOCKS]; + _PyIndexPool specialized_code_indices; #endif // Per-interpreter state for the obmalloc allocator. 
For the main diff --git a/Include/internal/pycore_tstate.h b/Include/internal/pycore_tstate.h index f681b644c9ad5d..e977c8afcea361 100644 --- a/Include/internal/pycore_tstate.h +++ b/Include/internal/pycore_tstate.h @@ -41,6 +41,9 @@ typedef struct _PyThreadStateImpl { // If set, don't use thread-local refcounts int is_finalized; } types; + + // Index to use to retrieve specialized bytecode for this thread + Py_ssize_t specialized_code_index; #endif #if defined(Py_REF_DEBUG) && defined(Py_GIL_DISABLED) diff --git a/Makefile.pre.in b/Makefile.pre.in index 77455c0978f71d..95d51237bb9dc4 100644 --- a/Makefile.pre.in +++ b/Makefile.pre.in @@ -453,6 +453,7 @@ PYTHON_OBJS= \ Python/hashtable.o \ Python/import.o \ Python/importdl.o \ + Python/index_pool.o \ Python/initconfig.o \ Python/interpconfig.o \ Python/instrumentation.o \ @@ -1218,6 +1219,7 @@ PYTHON_HEADERS= \ $(srcdir)/Include/internal/pycore_hashtable.h \ $(srcdir)/Include/internal/pycore_import.h \ $(srcdir)/Include/internal/pycore_importdl.h \ + $(srcdir)/Include/internal/pycore_index_pool.h \ $(srcdir)/Include/internal/pycore_initconfig.h \ $(srcdir)/Include/internal/pycore_instruments.h \ $(srcdir)/Include/internal/pycore_instruction_sequence.h \ diff --git a/Objects/codeobject.c b/Objects/codeobject.c index 6f0b3f8b9a3262..9d3897b2f53c6e 100644 --- a/Objects/codeobject.c +++ b/Objects/codeobject.c @@ -6,6 +6,7 @@ #include "pycore_code.h" // _PyCodeConstructor #include "pycore_frame.h" // FRAME_SPECIALS_SIZE #include "pycore_hashtable.h" // _Py_hashtable_t +#include "pycore_index_pool.h" // _PyIndexPool #include "pycore_initconfig.h" // _PyStatus_OK() #include "pycore_interp.h" // PyInterpreterState.co_extra_freefuncs #include "pycore_object.h" // _PyObject_SetDeferredRefcount @@ -2633,5 +2634,6 @@ _PyCode_Fini(PyInterpreterState *interp) _Py_hashtable_destroy(state->constants); state->constants = NULL; } + _PyIndexPool_Fini(&interp->specialized_code_indices); #endif } diff --git a/Python/index_pool.c 
b/Python/index_pool.c new file mode 100644 index 00000000000000..ecc55935416268 --- /dev/null +++ b/Python/index_pool.c @@ -0,0 +1,193 @@ +#include + +#include "Python.h" + +#include "pycore_index_pool.h" +#include "pycore_lock.h" + +#ifdef Py_GIL_DISABLED + +static inline void +swap(Py_ssize_t *values, Py_ssize_t i, Py_ssize_t j) +{ + Py_ssize_t tmp = values[i]; + values[i] = values[j]; + values[j] = tmp; +} + +static bool +heap_try_swap(_PyIndexHeap *heap, Py_ssize_t i, Py_ssize_t j) +{ + if (i < 0 || i >= heap->size) { + return 0; + } + if (j < 0 || j >= heap->size) { + return 0; + } + if (i <= j) { + if (heap->values[i] <= heap->values[j]) { + return 0; + } + } + else if (heap->values[j] <= heap->values[i]) { + return 0; + } + swap(heap->values, i, j); + return 1; +} + +static inline Py_ssize_t +parent(Py_ssize_t i) +{ + return (i - 1) / 2; +} + +static inline Py_ssize_t +left_child(Py_ssize_t i) +{ + return 2*i + 1; +} + +static inline Py_ssize_t +right_child(Py_ssize_t i) +{ + return 2*i + 2; +} + +static void +heap_add(_PyIndexHeap *heap, Py_ssize_t val) +{ + assert(heap->size < heap->capacity); + // Add val to end + heap->values[heap->size] = val; + heap->size++; + // Sift up + for (Py_ssize_t cur = heap->size - 1; cur > 0; cur = parent(cur)) { + if (!heap_try_swap(heap, cur, parent(cur))) { + break; + } + } +} + +static Py_ssize_t +heap_min_child(_PyIndexHeap *heap, Py_ssize_t i) +{ + if (left_child(i) < heap->size) { + if (right_child(i) < heap->size) { + Py_ssize_t lval = heap->values[left_child(i)]; + Py_ssize_t rval = heap->values[right_child(i)]; + return lval < rval ? 
left_child(i) : right_child(i); + } + return left_child(i); + } + else if (right_child(i) < heap->size) { + return right_child(i); + } + return -1; +} + +static Py_ssize_t +heap_pop(_PyIndexHeap *heap) +{ + assert(heap->size > 0); + // Pop smallest and replace with the last element + Py_ssize_t result = heap->values[0]; + heap->values[0] = heap->values[heap->size - 1]; + heap->size--; + // Sift down + for (Py_ssize_t cur = 0; cur < heap->size;) { + Py_ssize_t min_child = heap_min_child(heap, cur); + if (min_child > -1 && heap_try_swap(heap, cur, min_child)) { + cur = min_child; + } + else { + break; + } + } + return result; +} + +static int +heap_ensure_capacity(_PyIndexHeap *heap, Py_ssize_t limit) +{ + assert(limit > 0); + if (heap->capacity > limit) { + return 0; + } + Py_ssize_t new_capacity = heap->capacity ? heap->capacity : 1024; + while (new_capacity && new_capacity < limit) { + new_capacity <<= 1; + } + if (!new_capacity) { + return -1; + } + Py_ssize_t *new_values = PyMem_RawCalloc(new_capacity, sizeof(Py_ssize_t)); + if (new_values == NULL) { + return -1; + } + if (heap->values != NULL) { + memcpy(new_values, heap->values, heap->capacity); + PyMem_RawFree(heap->values); + } + heap->values = new_values; + heap->capacity = new_capacity; + return 0; +} + +static void +heap_fini(_PyIndexHeap *heap) +{ + if (heap->values != NULL) { + PyMem_RawFree(heap->values); + heap->values = NULL; + } + heap->size = -1; + heap->capacity = -1; +} + +#define LOCK_POOL(pool) PyMutex_LockFlags(&pool->mutex, _Py_LOCK_DONT_DETACH) +#define UNLOCK_POOL(pool) PyMutex_Unlock(&pool->mutex) + +Py_ssize_t +_PyIndexPool_AllocIndex(_PyIndexPool *pool) +{ + LOCK_POOL(pool); + Py_ssize_t index; + _PyIndexHeap *free_indices = &pool->free_indices; + if (free_indices->size == 0) { + // No free indices. Make sure the heap can always store all of the + // indices that have been allocated to avoid having to allocate memory + // (which can fail) when freeing an index. 
Freeing indices happens when + // threads are being destroyed, which makes error handling awkward / + // impossible. This arrangement shifts handling of allocation failures + // to when indices are allocated, which happens at thread creation, + // where we are better equipped to deal with failure. + if (heap_ensure_capacity(free_indices, pool->next_index + 1) < 0) { + UNLOCK_POOL(pool); + PyErr_NoMemory(); + return -1; + } + index = pool->next_index++; + } + else { + index = heap_pop(free_indices); + } + UNLOCK_POOL(pool); + return index; +} + +void +_PyIndexPool_FreeIndex(_PyIndexPool *pool, Py_ssize_t index) +{ + LOCK_POOL(pool); + heap_add(&pool->free_indices, index); + UNLOCK_POOL(pool); +} + +void +_PyIndexPool_Fini(_PyIndexPool *pool) +{ + heap_fini(&pool->free_indices); +} + +#endif // Py_GIL_DISABLED diff --git a/Python/pystate.c b/Python/pystate.c index 54caf373e91d6c..afc63cdd9deb2b 100644 --- a/Python/pystate.c +++ b/Python/pystate.c @@ -1550,6 +1550,11 @@ new_threadstate(PyInterpreterState *interp, int whence) PyMem_RawFree(new_tstate); return NULL; } + Py_ssize_t code_idx = _Py_ReserveSpecializedCodeIndex(interp); + if (code_idx < 0) { + PyMem_RawFree(new_tstate); + return NULL; + } #endif /* We serialize concurrent creation to protect global state. */ @@ -1592,6 +1597,7 @@ new_threadstate(PyInterpreterState *interp, int whence) #ifdef Py_GIL_DISABLED // Must be called with lock unlocked to avoid lock ordering deadlocks. _Py_qsbr_register(tstate, interp, qsbr_idx); + tstate->specialized_code_index = code_idx; #endif return (PyThreadState *)tstate; @@ -1743,6 +1749,10 @@ PyThreadState_Clear(PyThreadState *tstate) // Remove ourself from the biased reference counting table of threads. _Py_brc_remove_thread(tstate); + + // Release our thread-local copies of the bytecode for reuse by another + // thread + _Py_ClearSpecializedCodeIndex((_PyThreadStateImpl *)tstate); #endif // Merge our queue of pointers to be freed into the interpreter queue. 
diff --git a/Python/specialize.c b/Python/specialize.c index da618952e85978..be916d39ab017e 100644 --- a/Python/specialize.c +++ b/Python/specialize.c @@ -2721,6 +2721,22 @@ _Py_Specialize_ContainsOp(_PyStackRef value_st, _Py_CODEUNIT *instr) cache->counter = adaptive_counter_cooldown(); } +#ifdef Py_GIL_DISABLED +int +_Py_ReserveSpecializedCodeIndex(PyInterpreterState *interp) +{ + return _PyIndexPool_AllocIndex(&interp->specialized_code_indices); +} + +void +_Py_ClearSpecializedCodeIndex(_PyThreadStateImpl *tstate) +{ + PyInterpreterState *interp = ((PyThreadState*) tstate)->interp; + _PyIndexPool_FreeIndex(&interp->specialized_code_indices, tstate->specialized_code_index); +} + +#endif + /* Code init cleanup. * CALL_ALLOC_AND_ENTER_INIT will set up * the frame to execute the EXIT_INIT_CHECK From 2b40870422ee524cad3a8cf31424b629e5161a7b Mon Sep 17 00:00:00 2001 From: Matt Page Date: Tue, 27 Aug 2024 16:25:12 -0700 Subject: [PATCH 02/67] Replace most usage of PyCode_CODE --- Include/internal/pycore_frame.h | 8 ++++++-- Include/internal/pycore_uop_metadata.h | 2 +- Objects/codeobject.c | 4 ++-- Objects/frameobject.c | 6 +++--- Objects/typeobject.c | 9 +++++---- Programs/test_frozenmain.h | 14 +++++++------- Python/bytecodes.c | 14 +++++++------- Python/ceval.c | 8 +++----- Python/ceval_macros.h | 2 +- Python/executor_cases.c.h | 10 +++++----- Python/frame.c | 2 +- Python/generated_cases.c.h | 6 +++--- Python/specialize.c | 5 ++--- 13 files changed, 46 insertions(+), 44 deletions(-) diff --git a/Include/internal/pycore_frame.h b/Include/internal/pycore_frame.h index a6f7c1735b349f..4adb668b9a7795 100644 --- a/Include/internal/pycore_frame.h +++ b/Include/internal/pycore_frame.h @@ -76,13 +76,17 @@ typedef struct _PyInterpreterFrame { } _PyInterpreterFrame; #define _PyInterpreterFrame_LASTI(IF) \ - ((int)((IF)->instr_ptr - _PyCode_CODE(_PyFrame_GetCode(IF)))) + ((int)((IF)->instr_ptr - _PyFrame_GetBytecode((IF)))) static inline PyCodeObject 
*_PyFrame_GetCode(_PyInterpreterFrame *f) { assert(PyCode_Check(f->f_executable)); return (PyCodeObject *)f->f_executable; } +static inline _Py_CODEUNIT *_PyFrame_GetBytecode(_PyInterpreterFrame *f) { + return _PyCode_CODE(_PyFrame_GetCode(f)); +} + static inline _PyStackRef *_PyFrame_Stackbase(_PyInterpreterFrame *f) { return (f->localsplus + _PyFrame_GetCode(f)->co_nlocalsplus); } @@ -216,7 +220,7 @@ _PyFrame_IsIncomplete(_PyInterpreterFrame *frame) return true; } return frame->owner != FRAME_OWNED_BY_GENERATOR && - frame->instr_ptr < _PyCode_CODE(_PyFrame_GetCode(frame)) + _PyFrame_GetCode(frame)->_co_firsttraceable; + frame->instr_ptr < _PyFrame_GetBytecode(frame) + _PyFrame_GetCode(frame)->_co_firsttraceable; } static inline _PyInterpreterFrame * diff --git a/Include/internal/pycore_uop_metadata.h b/Include/internal/pycore_uop_metadata.h index 4d0ab22e6aa8f3..bd64df66d0c90d 100644 --- a/Include/internal/pycore_uop_metadata.h +++ b/Include/internal/pycore_uop_metadata.h @@ -277,7 +277,7 @@ const uint16_t _PyUop_Flags[MAX_UOP_ID+1] = { [_FATAL_ERROR] = 0, [_CHECK_VALIDITY_AND_SET_IP] = HAS_DEOPT_FLAG, [_DEOPT] = 0, - [_ERROR_POP_N] = HAS_ARG_FLAG, + [_ERROR_POP_N] = HAS_ARG_FLAG | HAS_ESCAPES_FLAG, [_TIER2_RESUME_CHECK] = HAS_DEOPT_FLAG, }; diff --git a/Objects/codeobject.c b/Objects/codeobject.c index 9d3897b2f53c6e..d7c6794042e543 100644 --- a/Objects/codeobject.c +++ b/Objects/codeobject.c @@ -448,7 +448,7 @@ _PyCode_Validate(struct _PyCodeConstructor *con) return 0; } -extern void _PyCode_Quicken(PyCodeObject *code); +extern void _PyCode_Quicken(_Py_CODEUNIT *instructions, Py_ssize_t size); static void init_code(PyCodeObject *co, struct _PyCodeConstructor *con) @@ -519,7 +519,7 @@ init_code(PyCodeObject *co, struct _PyCodeConstructor *con) entry_point++; } co->_co_firsttraceable = entry_point; - _PyCode_Quicken(co); + _PyCode_Quicken(_PyCode_CODE(co), Py_SIZE(co)); notify_code_watchers(PY_CODE_EVENT_CREATE, co); } diff --git a/Objects/frameobject.c 
b/Objects/frameobject.c index 85c24550d0b409..1387cdcaa3fced 100644 --- a/Objects/frameobject.c +++ b/Objects/frameobject.c @@ -1571,7 +1571,7 @@ frame_setlineno(PyFrameObject *f, PyObject* p_new_lineno, void *Py_UNUSED(ignore } /* Finally set the new lasti and return OK. */ f->f_lineno = 0; - f->f_frame->instr_ptr = _PyCode_CODE(code) + best_addr; + f->f_frame->instr_ptr = _PyFrame_GetBytecode(f->f_frame) + best_addr; return 0; } @@ -1865,7 +1865,7 @@ frame_init_get_vars(_PyInterpreterFrame *frame) // here: PyCodeObject *co = _PyFrame_GetCode(frame); int lasti = _PyInterpreterFrame_LASTI(frame); - if (!(lasti < 0 && _PyCode_CODE(co)->op.code == COPY_FREE_VARS + if (!(lasti < 0 && _PyFrame_GetBytecode(frame)->op.code == COPY_FREE_VARS && PyFunction_Check(frame->f_funcobj))) { /* Free vars are initialized */ @@ -1880,7 +1880,7 @@ frame_init_get_vars(_PyInterpreterFrame *frame) frame->localsplus[offset + i] = PyStackRef_FromPyObjectNew(o); } // COPY_FREE_VARS doesn't have inline CACHEs, either: - frame->instr_ptr = _PyCode_CODE(_PyFrame_GetCode(frame)); + frame->instr_ptr = _PyFrame_GetBytecode(frame); } diff --git a/Objects/typeobject.c b/Objects/typeobject.c index a6483f74b7947d..4f1f5c8295a966 100644 --- a/Objects/typeobject.c +++ b/Objects/typeobject.c @@ -11547,9 +11547,10 @@ super_descr_get(PyObject *self, PyObject *obj, PyObject *type) } static int -super_init_without_args(_PyInterpreterFrame *cframe, PyCodeObject *co, +super_init_without_args(_PyInterpreterFrame *cframe, PyTypeObject **type_p, PyObject **obj_p) { + PyCodeObject *co = _PyFrame_GetCode(cframe); if (co->co_argcount == 0) { PyErr_SetString(PyExc_RuntimeError, "super(): no arguments"); @@ -11565,8 +11566,8 @@ super_init_without_args(_PyInterpreterFrame *cframe, PyCodeObject *co, if (_PyInterpreterFrame_LASTI(cframe) >= 0) { // MAKE_CELL and COPY_FREE_VARS have no quickened forms, so no need // to use _PyOpcode_Deopt here: - assert(_PyCode_CODE(co)[0].op.code == MAKE_CELL || - 
_PyCode_CODE(co)[0].op.code == COPY_FREE_VARS); + assert(_PyFrame_GetBytecode(cframe)[0].op.code == MAKE_CELL || + _PyFrame_GetBytecode(cframe)[0].op.code == COPY_FREE_VARS); assert(PyCell_Check(firstarg)); firstarg = PyCell_GET(firstarg); } @@ -11649,7 +11650,7 @@ super_init_impl(PyObject *self, PyTypeObject *type, PyObject *obj) { "super(): no current frame"); return -1; } - int res = super_init_without_args(frame, _PyFrame_GetCode(frame), &type, &obj); + int res = super_init_without_args(frame, &type, &obj); if (res < 0) { return -1; diff --git a/Programs/test_frozenmain.h b/Programs/test_frozenmain.h index 624d9c0b653ad7..661ce867c1ce00 100644 --- a/Programs/test_frozenmain.h +++ b/Programs/test_frozenmain.h @@ -12,26 +12,26 @@ unsigned char M_test_frozenmain[] = { 0,0,111,6,88,2,31,0,79,6,88,6,12,0,79,7, 88,5,88,6,2,0,0,0,12,0,47,4,49,1,0,0, 0,0,0,0,29,0,72,22,0,0,9,0,29,0,100,1, - 41,8,233,0,0,0,0,78,122,18,70,114,111,122,101,110, - 32,72,101,108,108,111,32,87,111,114,108,100,122,8,115,121, + 41,8,233,0,0,0,0,78,218,18,70,114,111,122,101,110, + 32,72,101,108,108,111,32,87,111,114,108,100,218,8,115,121, 115,46,97,114,103,118,218,6,99,111,110,102,105,103,41,5, 218,12,112,114,111,103,114,97,109,95,110,97,109,101,218,10, 101,120,101,99,117,116,97,98,108,101,218,15,117,115,101,95, 101,110,118,105,114,111,110,109,101,110,116,218,17,99,111,110, 102,105,103,117,114,101,95,99,95,115,116,100,105,111,218,14, - 98,117,102,102,101,114,101,100,95,115,116,100,105,111,122,7, - 99,111,110,102,105,103,32,122,2,58,32,41,7,218,3,115, + 98,117,102,102,101,114,101,100,95,115,116,100,105,111,218,7, + 99,111,110,102,105,103,32,218,2,58,32,41,7,218,3,115, 121,115,218,17,95,116,101,115,116,105,110,116,101,114,110,97, 108,99,97,112,105,218,5,112,114,105,110,116,218,4,97,114, 103,118,218,11,103,101,116,95,99,111,110,102,105,103,115,114, - 3,0,0,0,218,3,107,101,121,169,0,243,0,0,0,0, + 5,0,0,0,218,3,107,101,121,169,0,243,0,0,0,0, 
218,18,116,101,115,116,95,102,114,111,122,101,110,109,97,105, - 110,46,112,121,218,8,60,109,111,100,117,108,101,62,114,18, + 110,46,112,121,218,8,60,109,111,100,117,108,101,62,114,22, 0,0,0,1,0,0,0,115,94,0,0,0,240,3,1,1, 1,243,8,0,1,11,219,0,24,225,0,5,208,6,26,212, 0,27,217,0,5,128,106,144,35,151,40,145,40,212,0,27, 216,9,26,215,9,38,210,9,38,211,9,40,168,24,209,9, 50,128,6,243,2,6,12,2,128,67,241,14,0,5,10,136, 71,144,67,144,53,152,2,152,54,160,35,153,59,152,45,208, - 10,40,214,4,41,242,15,6,12,2,114,16,0,0,0, + 10,40,214,4,41,242,15,6,12,2,114,20,0,0,0, }; diff --git a/Python/bytecodes.c b/Python/bytecodes.c index b5a642dccd2aec..991ec75725d9d1 100644 --- a/Python/bytecodes.c +++ b/Python/bytecodes.c @@ -1209,7 +1209,7 @@ dummy_func( if (oparg) { PyObject *lasti = PyStackRef_AsPyObjectBorrow(values[0]); if (PyLong_Check(lasti)) { - frame->instr_ptr = _PyCode_CODE(_PyFrame_GetCode(frame)) + PyLong_AsLong(lasti); + frame->instr_ptr = _PyFrame_GetBytecode(frame) + PyLong_AsLong(lasti); assert(!_PyErr_Occurred(tstate)); } else { @@ -3604,7 +3604,7 @@ dummy_func( op(_CREATE_INIT_FRAME, (self, init, args[oparg] -- init_frame: _PyInterpreterFrame *)) { _PyInterpreterFrame *shim = _PyFrame_PushTrampolineUnchecked( tstate, (PyCodeObject *)&_Py_InitCleanup, 1, frame); - assert(_PyCode_CODE((PyCodeObject *)shim->f_executable)[0].op.code == EXIT_INIT_CHECK); + assert(_PyFrame_GetBytecode(shim)[0].op.code == EXIT_INIT_CHECK); /* Push self onto stack of shim */ shim->localsplus[0] = PyStackRef_DUP(self); PyFunctionObject *init_func = (PyFunctionObject *)PyStackRef_AsPyObjectSteal(init); @@ -4525,7 +4525,7 @@ dummy_func( int original_opcode = 0; if (tstate->tracing) { PyCodeObject *code = _PyFrame_GetCode(frame); - original_opcode = code->_co_monitoring->lines[(int)(this_instr - _PyCode_CODE(code))].original_opcode; + original_opcode = code->_co_monitoring->lines[(int)(this_instr - _PyFrame_GetBytecode(frame))].original_opcode; next_instr = this_instr; } else { 
_PyFrame_SetStackPointer(frame, stack_pointer); @@ -4705,7 +4705,7 @@ dummy_func( tier2 op(_EXIT_TRACE, (exit_p/4 --)) { _PyExitData *exit = (_PyExitData *)exit_p; PyCodeObject *code = _PyFrame_GetCode(frame); - _Py_CODEUNIT *target = _PyCode_CODE(code) + exit->target; + _Py_CODEUNIT *target = _PyFrame_GetBytecode(frame) + exit->target; #if defined(Py_DEBUG) && !defined(_Py_JIT) OPT_HIST(trace_uop_execution_counter, trace_run_length_hist); if (lltrace >= 2) { @@ -4713,7 +4713,7 @@ dummy_func( _PyUOpPrint(&next_uop[-1]); printf(", exit %u, temp %d, target %d -> %s]\n", exit - current_executor->exits, exit->temperature.as_counter, - (int)(target - _PyCode_CODE(code)), + (int)(target - _PyFrame_GetBytecode(frame)), _PyOpcode_OpName[target->op.code]); } #endif @@ -4801,7 +4801,7 @@ dummy_func( _PyUOpPrint(&next_uop[-1]); printf(", exit %u, temp %d, target %d -> %s]\n", exit - current_executor->exits, exit->temperature.as_counter, - (int)(target - _PyCode_CODE(_PyFrame_GetCode(frame))), + (int)(target - _PyFrame_GetBytecode(frame)), _PyOpcode_OpName[target->op.code]); } #endif @@ -4855,7 +4855,7 @@ dummy_func( } tier2 op(_ERROR_POP_N, (target/2, unused[oparg] --)) { - frame->instr_ptr = ((_Py_CODEUNIT *)_PyFrame_GetCode(frame)->co_code_adaptive) + target; + frame->instr_ptr = _PyFrame_GetBytecode(frame) + target; SYNC_SP(); GOTO_UNWIND(); } diff --git a/Python/ceval.c b/Python/ceval.c index 0ebd5bb58c859c..71c4bd591a3596 100644 --- a/Python/ceval.c +++ b/Python/ceval.c @@ -181,7 +181,7 @@ lltrace_instruction(_PyInterpreterFrame *frame, dump_stack(frame, stack_pointer); const char *opname = _PyOpcode_OpName[opcode]; assert(opname != NULL); - int offset = (int)(next_instr - _PyCode_CODE(_PyFrame_GetCode(frame))); + int offset = (int)(next_instr - _PyFrame_GetBytecode(frame)); if (OPCODE_HAS_ARG((int)_PyOpcode_Deopt[opcode])) { printf("%d: %s %d\n", offset * 2, opname, oparg); } @@ -950,7 +950,7 @@ _PyEval_EvalFrameDefault(PyThreadState *tstate, _PyInterpreterFrame *frame, 
int Python main loop. */ PyObject *exc = _PyErr_GetRaisedException(tstate); PUSH(PyStackRef_FromPyObjectSteal(exc)); - next_instr = _PyCode_CODE(_PyFrame_GetCode(frame)) + handler; + next_instr = _PyFrame_GetBytecode(frame) + handler; if (monitor_handled(tstate, frame, next_instr, exc) < 0) { goto exception_unwind; @@ -1106,7 +1106,7 @@ _PyEval_EvalFrameDefault(PyThreadState *tstate, _PyInterpreterFrame *frame, int goto goto_to_tier1; exit_to_tier1: assert(next_uop[-1].format == UOP_FORMAT_TARGET); - next_instr = next_uop[-1].target + _PyCode_CODE(_PyFrame_GetCode(frame)); + next_instr = next_uop[-1].target + _PyFrame_GetBytecode(frame); goto_to_tier1: #ifdef Py_DEBUG if (lltrace >= 2) { @@ -3202,5 +3202,3 @@ _PyEval_LoadName(PyThreadState *tstate, _PyInterpreterFrame *frame, PyObject *na } return value; } - - diff --git a/Python/ceval_macros.h b/Python/ceval_macros.h index 9e1540674d4219..387bc994870352 100644 --- a/Python/ceval_macros.h +++ b/Python/ceval_macros.h @@ -150,7 +150,7 @@ GETITEM(PyObject *v, Py_ssize_t i) { /* Code access macros */ /* The integer overflow is checked by an assertion below. 
*/ -#define INSTR_OFFSET() ((int)(next_instr - _PyCode_CODE(_PyFrame_GetCode(frame)))) +#define INSTR_OFFSET() ((int)(next_instr - _PyFrame_GetBytecode(frame))) #define NEXTOPARG() do { \ _Py_CODEUNIT word = {.cache = FT_ATOMIC_LOAD_UINT16_RELAXED(*(uint16_t*)next_instr)}; \ opcode = word.op.code; \ diff --git a/Python/executor_cases.c.h b/Python/executor_cases.c.h index 6d687bbb48b0ba..6dd34febde9611 100644 --- a/Python/executor_cases.c.h +++ b/Python/executor_cases.c.h @@ -4150,7 +4150,7 @@ self = stack_pointer[-2 - oparg]; _PyInterpreterFrame *shim = _PyFrame_PushTrampolineUnchecked( tstate, (PyCodeObject *)&_Py_InitCleanup, 1, frame); - assert(_PyCode_CODE((PyCodeObject *)shim->f_executable)[0].op.code == EXIT_INIT_CHECK); + assert(_PyFrame_GetBytecode(shim)[0].op.code == EXIT_INIT_CHECK); /* Push self onto stack of shim */ shim->localsplus[0] = PyStackRef_DUP(self); PyFunctionObject *init_func = (PyFunctionObject *)PyStackRef_AsPyObjectSteal(init); @@ -5271,7 +5271,7 @@ PyObject *exit_p = (PyObject *)CURRENT_OPERAND(); _PyExitData *exit = (_PyExitData *)exit_p; PyCodeObject *code = _PyFrame_GetCode(frame); - _Py_CODEUNIT *target = _PyCode_CODE(code) + exit->target; + _Py_CODEUNIT *target = _PyFrame_GetBytecode(frame) + exit->target; #if defined(Py_DEBUG) && !defined(_Py_JIT) OPT_HIST(trace_uop_execution_counter, trace_run_length_hist); if (lltrace >= 2) { @@ -5279,7 +5279,7 @@ _PyUOpPrint(&next_uop[-1]); printf(", exit %u, temp %d, target %d -> %s]\n", exit - current_executor->exits, exit->temperature.as_counter, - (int)(target - _PyCode_CODE(code)), + (int)(target - _PyFrame_GetBytecode(frame)), _PyOpcode_OpName[target->op.code]); } #endif @@ -5416,7 +5416,7 @@ _PyUOpPrint(&next_uop[-1]); printf(", exit %u, temp %d, target %d -> %s]\n", exit - current_executor->exits, exit->temperature.as_counter, - (int)(target - _PyCode_CODE(_PyFrame_GetCode(frame))), + (int)(target - _PyFrame_GetBytecode(frame)), _PyOpcode_OpName[target->op.code]); } #endif @@ -5482,7 
+5482,7 @@ case _ERROR_POP_N: { oparg = CURRENT_OPARG(); uint32_t target = (uint32_t)CURRENT_OPERAND(); - frame->instr_ptr = ((_Py_CODEUNIT *)_PyFrame_GetCode(frame)->co_code_adaptive) + target; + frame->instr_ptr = _PyFrame_GetBytecode(frame) + target; stack_pointer += -oparg; assert(WITHIN_STACK_BOUNDS()); GOTO_UNWIND(); diff --git a/Python/frame.c b/Python/frame.c index 3192968a0fb1b5..80d2c8e864e47b 100644 --- a/Python/frame.c +++ b/Python/frame.c @@ -63,7 +63,7 @@ take_ownership(PyFrameObject *f, _PyInterpreterFrame *frame) // This may be a newly-created generator or coroutine frame. Since it's // dead anyways, just pretend that the first RESUME ran: PyCodeObject *code = _PyFrame_GetCode(frame); - frame->instr_ptr = _PyCode_CODE(code) + code->_co_firsttraceable + 1; + frame->instr_ptr = _PyFrame_GetBytecode(frame) + code->_co_firsttraceable + 1; } assert(!_PyFrame_IsIncomplete(frame)); assert(f->f_back == NULL); diff --git a/Python/generated_cases.c.h b/Python/generated_cases.c.h index 65dfb990cc2820..f74c3d9ad7af3e 100644 --- a/Python/generated_cases.c.h +++ b/Python/generated_cases.c.h @@ -1023,7 +1023,7 @@ { _PyInterpreterFrame *shim = _PyFrame_PushTrampolineUnchecked( tstate, (PyCodeObject *)&_Py_InitCleanup, 1, frame); - assert(_PyCode_CODE((PyCodeObject *)shim->f_executable)[0].op.code == EXIT_INIT_CHECK); + assert(_PyFrame_GetBytecode(shim)[0].op.code == EXIT_INIT_CHECK); /* Push self onto stack of shim */ shim->localsplus[0] = PyStackRef_DUP(self); PyFunctionObject *init_func = (PyFunctionObject *)PyStackRef_AsPyObjectSteal(init); @@ -4366,7 +4366,7 @@ int original_opcode = 0; if (tstate->tracing) { PyCodeObject *code = _PyFrame_GetCode(frame); - original_opcode = code->_co_monitoring->lines[(int)(this_instr - _PyCode_CODE(code))].original_opcode; + original_opcode = code->_co_monitoring->lines[(int)(this_instr - _PyFrame_GetBytecode(frame))].original_opcode; next_instr = this_instr; } else { _PyFrame_SetStackPointer(frame, stack_pointer); @@ -6397,7 
+6397,7 @@ if (oparg) { PyObject *lasti = PyStackRef_AsPyObjectBorrow(values[0]); if (PyLong_Check(lasti)) { - frame->instr_ptr = _PyCode_CODE(_PyFrame_GetCode(frame)) + PyLong_AsLong(lasti); + frame->instr_ptr = _PyFrame_GetBytecode(frame) + PyLong_AsLong(lasti); assert(!_PyErr_Occurred(tstate)); } else { diff --git a/Python/specialize.c b/Python/specialize.c index be916d39ab017e..dfc46a796e2887 100644 --- a/Python/specialize.c +++ b/Python/specialize.c @@ -434,13 +434,12 @@ do { \ // Initialize warmup counters and insert superinstructions. This cannot fail. void -_PyCode_Quicken(PyCodeObject *code) +_PyCode_Quicken(_Py_CODEUNIT *instructions, Py_ssize_t size) { #if ENABLE_SPECIALIZATION int opcode = 0; - _Py_CODEUNIT *instructions = _PyCode_CODE(code); /* The last code unit cannot have a cache, so we don't need to check it */ - for (int i = 0; i < Py_SIZE(code)-1; i++) { + for (Py_ssize_t i = 0; i < size-1; i++) { opcode = instructions[i].op.code; int caches = _PyOpcode_Caches[opcode]; if (caches) { From 344d7adb8a8f9eafad15c8864d39c1cb825beae3 Mon Sep 17 00:00:00 2001 From: Matt Page Date: Tue, 20 Aug 2024 11:21:13 -0700 Subject: [PATCH 03/67] Get bytecode copying working --- Include/cpython/code.h | 15 +++++ Include/internal/pycore_code.h | 20 ++++++ Include/internal/pycore_frame.h | 9 +++ Include/internal/pycore_uop_ids.h | 103 +++++++++++++++--------------- Objects/codeobject.c | 81 ++++++++++++++++++++++- Python/bytecodes.c | 13 ++++ Python/executor_cases.c.h | 2 + Python/generated_cases.c.h | 22 +++++++ Python/instrumentation.c | 3 +- Python/optimizer_cases.c.h | 2 + Python/specialize.c | 1 + 11 files changed, 217 insertions(+), 54 deletions(-) diff --git a/Include/cpython/code.h b/Include/cpython/code.h index 58d93fcfc1066b..7ec795793a434b 100644 --- a/Include/cpython/code.h +++ b/Include/cpython/code.h @@ -68,6 +68,20 @@ typedef struct { uint8_t *per_instruction_tools; } _PyCoMonitoringData; +#ifdef Py_GIL_DISABLED +/* Each thread specializes a 
thread-local copy of the bytecode in free-threaded + * builds. These copies are stored on the code object in a `_PyCodeArray`. + */ +typedef struct { + Py_ssize_t size; + char *entries[]; +} _PyCodeArray; + +#define _PyCode_DEF_THREAD_LOCAL_BYTECODE() _PyCodeArray *co_specialized_code; +#else +#define _PyCode_DEF_THREAD_LOCAL_BYTECODE() +#endif + // To avoid repeating ourselves in deepfreeze.py, all PyCodeObject members are // defined in this macro: #define _PyCode_DEF(SIZE) { \ @@ -133,6 +147,7 @@ typedef struct { Type is a void* to keep the format private in codeobject.c to force \ people to go through the proper APIs. */ \ void *co_extra; \ + _PyCode_DEF_THREAD_LOCAL_BYTECODE() \ char co_code_adaptive[(SIZE)]; \ } diff --git a/Include/internal/pycore_code.h b/Include/internal/pycore_code.h index f05f4b1de4bb9c..b004b42bc0fabb 100644 --- a/Include/internal/pycore_code.h +++ b/Include/internal/pycore_code.h @@ -602,6 +602,26 @@ struct _PyCode8 _PyCode_DEF(8); PyAPI_DATA(const struct _PyCode8) _Py_InitCleanup; #ifdef Py_GIL_DISABLED + +extern _Py_CODEUNIT *_PyCode_CreateSpecializableCode(PyCodeObject *co); +/* Return bytecode that should be executed. + * Will not return NULL, but may disable specialization, in which case the + * returned bytecode should not be specialized. + * + * XXX - This is a confusing contract. + */ +static inline _Py_CODEUNIT * +_PyCode_GetSpecializableCode(PyCodeObject *co) +{ + _PyCodeArray *code = _Py_atomic_load_ptr_acquire(&co->co_specialized_code); + _PyThreadStateImpl *tstate = (_PyThreadStateImpl *) PyThreadState_GET(); + Py_ssize_t idx = tstate->specialized_code_index; + if (idx < code->size && code->entries[idx] != NULL) { + // XXX - Do we need to worry about alignment here? 
+ return (_Py_CODEUNIT *) code->entries[idx]; + } + return _PyCode_CreateSpecializableCode(co); +} extern int _Py_ReserveSpecializedCodeIndex(PyInterpreterState *interp); extern void _Py_ClearSpecializedCodeIndex(_PyThreadStateImpl *tstate); #endif diff --git a/Include/internal/pycore_frame.h b/Include/internal/pycore_frame.h index 4adb668b9a7795..7688e227ef2144 100644 --- a/Include/internal/pycore_frame.h +++ b/Include/internal/pycore_frame.h @@ -68,6 +68,9 @@ typedef struct _PyInterpreterFrame { PyObject *f_locals; /* Strong reference, may be NULL. Only valid if not on C stack */ PyFrameObject *frame_obj; /* Strong reference, may be NULL. Only valid if not on C stack */ _Py_CODEUNIT *instr_ptr; /* Instruction currently executing (or about to begin) */ +#ifdef Py_GIL_DISABLED + _Py_CODEUNIT *bytecode; +#endif _PyStackRef *stackpointer; uint16_t return_offset; /* Only relevant during a function call */ char owner; @@ -84,7 +87,11 @@ static inline PyCodeObject *_PyFrame_GetCode(_PyInterpreterFrame *f) { } static inline _Py_CODEUNIT *_PyFrame_GetBytecode(_PyInterpreterFrame *f) { +#ifdef Py_GIL_DISABLED + return f->bytecode; +#else return _PyCode_CODE(_PyFrame_GetCode(f)); +#endif } static inline _PyStackRef *_PyFrame_Stackbase(_PyInterpreterFrame *f) { @@ -167,6 +174,7 @@ _PyFrame_Initialize( } #ifdef Py_GIL_DISABLED + frame->bytecode = frame->instr_ptr; // On GIL disabled, we walk the entire stack in GC. Since stacktop // is not always in sync with the real stack pointer, we have // no choice but to traverse the entire stack. 
@@ -339,6 +347,7 @@ _PyFrame_PushTrampolineUnchecked(PyThreadState *tstate, PyCodeObject *code, int frame->return_offset = 0; #ifdef Py_GIL_DISABLED + frame->bytecode = frame->instr_ptr; assert(code->co_nlocalsplus == 0); for (int i = 0; i < code->co_stacksize; i++) { frame->localsplus[i] = PyStackRef_NULL; diff --git a/Include/internal/pycore_uop_ids.h b/Include/internal/pycore_uop_ids.h index b950f760d74ac7..19582d85e5dd25 100644 --- a/Include/internal/pycore_uop_ids.h +++ b/Include/internal/pycore_uop_ids.h @@ -191,30 +191,31 @@ extern "C" { #define _LOAD_ATTR_SLOT_1 421 #define _LOAD_ATTR_WITH_HINT 422 #define _LOAD_BUILD_CLASS LOAD_BUILD_CLASS +#define _LOAD_BYTECODE 423 #define _LOAD_COMMON_CONSTANT LOAD_COMMON_CONSTANT #define _LOAD_CONST LOAD_CONST -#define _LOAD_CONST_INLINE 423 -#define _LOAD_CONST_INLINE_BORROW 424 -#define _LOAD_CONST_INLINE_BORROW_WITH_NULL 425 -#define _LOAD_CONST_INLINE_WITH_NULL 426 +#define _LOAD_CONST_INLINE 424 +#define _LOAD_CONST_INLINE_BORROW 425 +#define _LOAD_CONST_INLINE_BORROW_WITH_NULL 426 +#define _LOAD_CONST_INLINE_WITH_NULL 427 #define _LOAD_DEREF LOAD_DEREF -#define _LOAD_FAST 427 -#define _LOAD_FAST_0 428 -#define _LOAD_FAST_1 429 -#define _LOAD_FAST_2 430 -#define _LOAD_FAST_3 431 -#define _LOAD_FAST_4 432 -#define _LOAD_FAST_5 433 -#define _LOAD_FAST_6 434 -#define _LOAD_FAST_7 435 +#define _LOAD_FAST 428 +#define _LOAD_FAST_0 429 +#define _LOAD_FAST_1 430 +#define _LOAD_FAST_2 431 +#define _LOAD_FAST_3 432 +#define _LOAD_FAST_4 433 +#define _LOAD_FAST_5 434 +#define _LOAD_FAST_6 435 +#define _LOAD_FAST_7 436 #define _LOAD_FAST_AND_CLEAR LOAD_FAST_AND_CLEAR #define _LOAD_FAST_CHECK LOAD_FAST_CHECK #define _LOAD_FAST_LOAD_FAST LOAD_FAST_LOAD_FAST #define _LOAD_FROM_DICT_OR_DEREF LOAD_FROM_DICT_OR_DEREF #define _LOAD_FROM_DICT_OR_GLOBALS LOAD_FROM_DICT_OR_GLOBALS -#define _LOAD_GLOBAL 436 -#define _LOAD_GLOBAL_BUILTINS 437 -#define _LOAD_GLOBAL_MODULE 438 +#define _LOAD_GLOBAL 437 +#define _LOAD_GLOBAL_BUILTINS 438 
+#define _LOAD_GLOBAL_MODULE 439 #define _LOAD_LOCALS LOAD_LOCALS #define _LOAD_NAME LOAD_NAME #define _LOAD_SPECIAL LOAD_SPECIAL @@ -227,59 +228,59 @@ extern "C" { #define _MATCH_KEYS MATCH_KEYS #define _MATCH_MAPPING MATCH_MAPPING #define _MATCH_SEQUENCE MATCH_SEQUENCE -#define _MAYBE_EXPAND_METHOD 439 -#define _MONITOR_CALL 440 -#define _MONITOR_JUMP_BACKWARD 441 -#define _MONITOR_RESUME 442 +#define _MAYBE_EXPAND_METHOD 440 +#define _MONITOR_CALL 441 +#define _MONITOR_JUMP_BACKWARD 442 +#define _MONITOR_RESUME 443 #define _NOP NOP #define _POP_EXCEPT POP_EXCEPT -#define _POP_JUMP_IF_FALSE 443 -#define _POP_JUMP_IF_TRUE 444 +#define _POP_JUMP_IF_FALSE 444 +#define _POP_JUMP_IF_TRUE 445 #define _POP_TOP POP_TOP -#define _POP_TOP_LOAD_CONST_INLINE_BORROW 445 +#define _POP_TOP_LOAD_CONST_INLINE_BORROW 446 #define _PUSH_EXC_INFO PUSH_EXC_INFO -#define _PUSH_FRAME 446 +#define _PUSH_FRAME 447 #define _PUSH_NULL PUSH_NULL -#define _PY_FRAME_GENERAL 447 -#define _PY_FRAME_KW 448 -#define _QUICKEN_RESUME 449 -#define _REPLACE_WITH_TRUE 450 +#define _PY_FRAME_GENERAL 448 +#define _PY_FRAME_KW 449 +#define _QUICKEN_RESUME 450 +#define _REPLACE_WITH_TRUE 451 #define _RESUME_CHECK RESUME_CHECK #define _RETURN_GENERATOR RETURN_GENERATOR #define _RETURN_VALUE RETURN_VALUE -#define _SAVE_RETURN_OFFSET 451 -#define _SEND 452 -#define _SEND_GEN_FRAME 453 +#define _SAVE_RETURN_OFFSET 452 +#define _SEND 453 +#define _SEND_GEN_FRAME 454 #define _SETUP_ANNOTATIONS SETUP_ANNOTATIONS #define _SET_ADD SET_ADD #define _SET_FUNCTION_ATTRIBUTE SET_FUNCTION_ATTRIBUTE #define _SET_UPDATE SET_UPDATE -#define _START_EXECUTOR 454 -#define _STORE_ATTR 455 -#define _STORE_ATTR_INSTANCE_VALUE 456 -#define _STORE_ATTR_SLOT 457 -#define _STORE_ATTR_WITH_HINT 458 +#define _START_EXECUTOR 455 +#define _STORE_ATTR 456 +#define _STORE_ATTR_INSTANCE_VALUE 457 +#define _STORE_ATTR_SLOT 458 +#define _STORE_ATTR_WITH_HINT 459 #define _STORE_DEREF STORE_DEREF -#define _STORE_FAST 459 -#define _STORE_FAST_0 
460 -#define _STORE_FAST_1 461 -#define _STORE_FAST_2 462 -#define _STORE_FAST_3 463 -#define _STORE_FAST_4 464 -#define _STORE_FAST_5 465 -#define _STORE_FAST_6 466 -#define _STORE_FAST_7 467 +#define _STORE_FAST 460 +#define _STORE_FAST_0 461 +#define _STORE_FAST_1 462 +#define _STORE_FAST_2 463 +#define _STORE_FAST_3 464 +#define _STORE_FAST_4 465 +#define _STORE_FAST_5 466 +#define _STORE_FAST_6 467 +#define _STORE_FAST_7 468 #define _STORE_FAST_LOAD_FAST STORE_FAST_LOAD_FAST #define _STORE_FAST_STORE_FAST STORE_FAST_STORE_FAST #define _STORE_GLOBAL STORE_GLOBAL #define _STORE_NAME STORE_NAME -#define _STORE_SLICE 468 -#define _STORE_SUBSCR 469 +#define _STORE_SLICE 469 +#define _STORE_SUBSCR 470 #define _STORE_SUBSCR_DICT STORE_SUBSCR_DICT #define _STORE_SUBSCR_LIST_INT STORE_SUBSCR_LIST_INT #define _SWAP SWAP -#define _TIER2_RESUME_CHECK 470 -#define _TO_BOOL 471 +#define _TIER2_RESUME_CHECK 471 +#define _TO_BOOL 472 #define _TO_BOOL_BOOL TO_BOOL_BOOL #define _TO_BOOL_INT TO_BOOL_INT #define _TO_BOOL_LIST TO_BOOL_LIST @@ -289,14 +290,14 @@ extern "C" { #define _UNARY_NEGATIVE UNARY_NEGATIVE #define _UNARY_NOT UNARY_NOT #define _UNPACK_EX UNPACK_EX -#define _UNPACK_SEQUENCE 472 +#define _UNPACK_SEQUENCE 473 #define _UNPACK_SEQUENCE_LIST UNPACK_SEQUENCE_LIST #define _UNPACK_SEQUENCE_TUPLE UNPACK_SEQUENCE_TUPLE #define _UNPACK_SEQUENCE_TWO_TUPLE UNPACK_SEQUENCE_TWO_TUPLE #define _WITH_EXCEPT_START WITH_EXCEPT_START #define _YIELD_VALUE YIELD_VALUE #define __DO_CALL_FUNCTION_EX _DO_CALL_FUNCTION_EX -#define MAX_UOP_ID 472 +#define MAX_UOP_ID 473 #ifdef __cplusplus } diff --git a/Objects/codeobject.c b/Objects/codeobject.c index d7c6794042e543..c9d6dedec3538c 100644 --- a/Objects/codeobject.c +++ b/Objects/codeobject.c @@ -12,11 +12,14 @@ #include "pycore_object.h" // _PyObject_SetDeferredRefcount #include "pycore_opcode_metadata.h" // _PyOpcode_Deopt, _PyOpcode_Caches #include "pycore_opcode_utils.h" // RESUME_AT_FUNC_START +#include "pycore_pymem.h" // 
_PyMem_FreeDelayed #include "pycore_pystate.h" // _PyInterpreterState_GET() #include "pycore_setobject.h" // _PySet_NextEntry() #include "pycore_tuple.h" // _PyTuple_ITEMS() #include "clinic/codeobject.c.h" +#define INITIAL_SPECIALIZED_CODE_SIZE 16 + static const char * code_event_name(PyCodeEvent event) { switch (event) { @@ -450,7 +453,7 @@ _PyCode_Validate(struct _PyCodeConstructor *con) extern void _PyCode_Quicken(_Py_CODEUNIT *instructions, Py_ssize_t size); -static void +static int init_code(PyCodeObject *co, struct _PyCodeConstructor *con) { int nlocalsplus = (int)PyTuple_GET_SIZE(con->localsplusnames); @@ -513,6 +516,12 @@ init_code(PyCodeObject *co, struct _PyCodeConstructor *con) memcpy(_PyCode_CODE(co), PyBytes_AS_STRING(con->code), PyBytes_GET_SIZE(con->code)); +#ifdef Py_GIL_DISABLED + // XXX - initialize code array + co->co_specialized_code = PyMem_Calloc(1, sizeof(_PyCodeArray) + sizeof(void*) * INITIAL_SPECIALIZED_CODE_SIZE); + co->co_specialized_code->size = INITIAL_SPECIALIZED_CODE_SIZE; + co->co_specialized_code->entries[0] = (char *) _PyCode_CODE(co); +#endif int entry_point = 0; while (entry_point < Py_SIZE(co) && _PyCode_CODE(co)[entry_point].op.code != RESUME) { @@ -521,6 +530,7 @@ init_code(PyCodeObject *co, struct _PyCodeConstructor *con) co->_co_firsttraceable = entry_point; _PyCode_Quicken(_PyCode_CODE(co), Py_SIZE(co)); notify_code_watchers(PY_CODE_EVENT_CREATE, co); + return 0; } static int @@ -675,7 +685,12 @@ _PyCode_New(struct _PyCodeConstructor *con) PyErr_NoMemory(); return NULL; } - init_code(co, con); + + if (init_code(co, con) < 0) { + Py_DECREF(co); + return NULL; + } + #ifdef Py_GIL_DISABLED _PyObject_SetDeferredRefcount((PyObject *)co); _PyObject_GC_TRACK(co); @@ -1871,6 +1886,15 @@ code_dealloc(PyCodeObject *co) PyObject_ClearWeakRefs((PyObject*)co); } free_monitoring_data(co->_co_monitoring); +#ifdef Py_GIL_DISABLED + // The first element always points to the bytecode that follows the fixed + // part of the code object, 
which will be freed when the code object is + // freed. + for (Py_ssize_t i = 1; i < co->co_specialized_code->size; i++) { + PyMem_Free(co->co_specialized_code->entries[i]); + } + PyMem_Free(co->co_specialized_code); +#endif PyObject_Free(co); } @@ -2637,3 +2661,56 @@ _PyCode_Fini(PyInterpreterState *interp) _PyIndexPool_Fini(&interp->specialized_code_indices); #endif } + +#ifdef Py_GIL_DISABLED + +static void +copy_code(_Py_CODEUNIT *dst, _Py_CODEUNIT *src, Py_ssize_t nbytes) +{ + int code_len = Py_SIZE(co); + _Py_CODEUNIT *dst_bytecode = (_Py_CODEUNIT *) dst->bytecode; + for (int i = 0; i < code_len; i += _PyInstruction_GetLength(co, i)) { + dst_bytecode[i] = _Py_GetBaseCodeUnit(co, i); + } + _PyCode_Quicken(dst_bytecode, code_len); +} + +static _Py_CODEUNIT * +create_specializable_code_lock_held(PyCodeObject *co) +{ + _PyCodeArray *spec_code = co->co_specialized_code; + _PyThreadStateImpl *tstate = (_PyThreadStateImpl *) PyThreadState_GET(); + Py_ssize_t idx = tstate->specialized_code_index; + if (idx >= spec_code->size) { + Py_ssize_t new_size = spec_code->size * 2; + _PyCodeArray *new_spec_code = PyMem_Calloc(sizeof(_PyCodeArray) + sizeof(char*) * new_size, 1); + if (new_spec_code == NULL) { + PyErr_NoMemory(); + return NULL; + } + new_spec_code->size = new_size; + memcpy(new_spec_code->entries, spec_code->entries, spec_code->size * sizeof(char*)); + _Py_atomic_store_ptr_release(&co->co_specialized_code, new_spec_code); + _PyMem_FreeDelayed(spec_code); + spec_code = new_spec_code; + } + spec_code->entries[idx] = PyMem_Malloc(_PyCode_NBYTES(co)); + if (spec_code->entries[idx] == NULL) { + PyErr_NoMemory(); + return NULL; + } + copy_code((_Py_CODEUNIT *) spec_code->entries[idx], _PyCode_CODE(co), _PyCode_NBYTES(co)); + return (_Py_CODEUNIT *) spec_code->entries[idx]; +} + +_Py_CODEUNIT * +_PyCode_CreateSpecializableCode(PyCodeObject *co) +{ + _Py_CODEUNIT *result; + Py_BEGIN_CRITICAL_SECTION(co); + result = create_specializable_code_lock_held(co); + 
Py_END_CRITICAL_SECTION(); + return result; +} + +#endif diff --git a/Python/bytecodes.c b/Python/bytecodes.c index 991ec75725d9d1..d1cea0b36e5fe1 100644 --- a/Python/bytecodes.c +++ b/Python/bytecodes.c @@ -190,7 +190,19 @@ dummy_func( } } + op(_LOAD_BYTECODE, (--)) { + #ifdef Py_GIL_DISABLED + if (frame->instr_ptr == frame->bytecode) { + frame->bytecode = _PyCode_GetSpecializableCode(_PyFrame_GetCode(frame)); + frame->instr_ptr = frame->bytecode; + this_instr = frame->instr_ptr; + next_instr = frame->instr_ptr + 1; + } + #endif + } + macro(RESUME) = + _LOAD_BYTECODE + _MAYBE_INSTRUMENT + _QUICKEN_RESUME + _CHECK_PERIODIC_IF_NOT_YIELD_FROM; @@ -219,6 +231,7 @@ dummy_func( } macro(INSTRUMENTED_RESUME) = + _LOAD_BYTECODE + _MAYBE_INSTRUMENT + _CHECK_PERIODIC_IF_NOT_YIELD_FROM + _MONITOR_RESUME; diff --git a/Python/executor_cases.c.h b/Python/executor_cases.c.h index 6dd34febde9611..1ca87325f8db2d 100644 --- a/Python/executor_cases.c.h +++ b/Python/executor_cases.c.h @@ -37,6 +37,8 @@ /* _QUICKEN_RESUME is not a viable micro-op for tier 2 because it uses the 'this_instr' variable */ + /* _LOAD_BYTECODE is not a viable micro-op for tier 2 because it uses the 'this_instr' variable */ + case _RESUME_CHECK: { #if defined(__EMSCRIPTEN__) if (_Py_emscripten_signal_clock == 0) { diff --git a/Python/generated_cases.c.h b/Python/generated_cases.c.h index f74c3d9ad7af3e..a533647e85ec53 100644 --- a/Python/generated_cases.c.h +++ b/Python/generated_cases.c.h @@ -4489,6 +4489,17 @@ (void)this_instr; next_instr += 1; INSTRUCTION_STATS(INSTRUMENTED_RESUME); + // _LOAD_BYTECODE + { + #ifdef Py_GIL_DISABLED + if (frame->instr_ptr == frame->bytecode) { + frame->bytecode = _PyCode_GetSpecializableCode(_PyFrame_GetCode(frame)); + frame->instr_ptr = frame->bytecode; + this_instr = frame->instr_ptr; + next_instr = frame->instr_ptr + 1; + } + #endif + } // _MAYBE_INSTRUMENT { if (tstate->tracing == 0) { @@ -6428,6 +6439,17 @@ PREDICTED(RESUME); _Py_CODEUNIT *this_instr = next_instr - 1; 
(void)this_instr; + // _LOAD_BYTECODE + { + #ifdef Py_GIL_DISABLED + if (frame->instr_ptr == frame->bytecode) { + frame->bytecode = _PyCode_GetSpecializableCode(_PyFrame_GetCode(frame)); + frame->instr_ptr = frame->bytecode; + this_instr = frame->instr_ptr; + next_instr = frame->instr_ptr + 1; + } + #endif + } // _MAYBE_INSTRUMENT { if (tstate->tracing == 0) { diff --git a/Python/instrumentation.c b/Python/instrumentation.c index 5e51a9c992f6c2..e16937c43aa46a 100644 --- a/Python/instrumentation.c +++ b/Python/instrumentation.c @@ -583,7 +583,8 @@ sanity_check_instrumentation(PyCodeObject *code) _Py_CODEUNIT _Py_GetBaseCodeUnit(PyCodeObject *code, int i) { - _Py_CODEUNIT inst = _PyCode_CODE(code)[i]; + _Py_CODEUNIT *src_instr = _PyCode_CODE(code) + i; + _Py_CODEUNIT inst = {.cache = FT_ATOMIC_LOAD_UINT16_RELAXED(*(uint16_t *)src_instr)}; int opcode = inst.op.code; if (opcode < MIN_INSTRUMENTED_OPCODE) { inst.op.code = _PyOpcode_Deopt[opcode]; diff --git a/Python/optimizer_cases.c.h b/Python/optimizer_cases.c.h index 672fec3946f2fb..f2804a66450f8d 100644 --- a/Python/optimizer_cases.c.h +++ b/Python/optimizer_cases.c.h @@ -17,6 +17,8 @@ /* _QUICKEN_RESUME is not a viable micro-op for tier 2 */ + /* _LOAD_BYTECODE is not a viable micro-op for tier 2 */ + case _RESUME_CHECK: { break; } diff --git a/Python/specialize.c b/Python/specialize.c index dfc46a796e2887..7b343e5ccb7666 100644 --- a/Python/specialize.c +++ b/Python/specialize.c @@ -2721,6 +2721,7 @@ _Py_Specialize_ContainsOp(_PyStackRef value_st, _Py_CODEUNIT *instr) } #ifdef Py_GIL_DISABLED + int _Py_ReserveSpecializedCodeIndex(PyInterpreterState *interp) { From f203d00158744ab0c19b4755b1feadb666d3d1d9 Mon Sep 17 00:00:00 2001 From: Matt Page Date: Fri, 30 Aug 2024 10:18:35 -0700 Subject: [PATCH 04/67] Refactor remove_tools --- Python/instrumentation.c | 20 ++++++++++---------- 1 file changed, 10 insertions(+), 10 deletions(-) diff --git a/Python/instrumentation.c b/Python/instrumentation.c index 
e16937c43aa46a..420b76885e6ed9 100644 --- a/Python/instrumentation.c +++ b/Python/instrumentation.c @@ -620,21 +620,21 @@ _Py_GetBaseCodeUnit(PyCodeObject *code, int i) } static void -de_instrument(PyCodeObject *code, int i, int event) +de_instrument(_PyCoMonitoringData *monitoring, _Py_CODEUNIT *bytecode, int i, int event) { assert(event != PY_MONITORING_EVENT_INSTRUCTION); assert(event != PY_MONITORING_EVENT_LINE); - _Py_CODEUNIT *instr = &_PyCode_CODE(code)[i]; + _Py_CODEUNIT *instr = &bytecode[i]; uint8_t *opcode_ptr = &instr->op.code; int opcode = *opcode_ptr; assert(opcode != ENTER_EXECUTOR); if (opcode == INSTRUMENTED_LINE) { - opcode_ptr = &code->_co_monitoring->lines[i].original_opcode; + opcode_ptr = &monitoring->lines[i].original_opcode; opcode = *opcode_ptr; } if (opcode == INSTRUMENTED_INSTRUCTION) { - opcode_ptr = &code->_co_monitoring->per_instruction_opcodes[i]; + opcode_ptr = &monitoring->per_instruction_opcodes[i]; opcode = *opcode_ptr; } int deinstrumented = DE_INSTRUMENT[opcode]; @@ -779,19 +779,19 @@ remove_tools(PyCodeObject * code, int offset, int event, int tools) assert(PY_MONITORING_IS_INSTRUMENTED_EVENT(event)); assert(opcode_has_event(_Py_GetBaseCodeUnit(code, offset).op.code)); _PyCoMonitoringData *monitoring = code->_co_monitoring; + bool should_deinstrument; if (monitoring && monitoring->tools) { monitoring->tools[offset] &= ~tools; - if (monitoring->tools[offset] == 0) { - de_instrument(code, offset, event); - } + should_deinstrument = (monitoring->tools[offset] == 0); } else { /* Single tool */ uint8_t single_tool = code->_co_monitoring->active_monitors.tools[event]; assert(_Py_popcount32(single_tool) <= 1); - if (((single_tool & tools) == single_tool)) { - de_instrument(code, offset, event); - } + should_deinstrument = ((single_tool & tools) == single_tool); + } + if (should_deinstrument) { + de_instrument(monitoring, _PyCode_CODE(code), offset, event); } } From 82b456a301e29f9daaf053694a0f1079f5962a5b Mon Sep 17 00:00:00 2001 
From: Matt Page Date: Fri, 30 Aug 2024 10:26:55 -0700 Subject: [PATCH 05/67] Refactor remove_line_tools --- Python/instrumentation.c | 29 +++++++++++++++-------------- 1 file changed, 15 insertions(+), 14 deletions(-) diff --git a/Python/instrumentation.c b/Python/instrumentation.c index 420b76885e6ed9..aab25f7dd80f65 100644 --- a/Python/instrumentation.c +++ b/Python/instrumentation.c @@ -650,17 +650,17 @@ de_instrument(_PyCoMonitoringData *monitoring, _Py_CODEUNIT *bytecode, int i, in } static void -de_instrument_line(PyCodeObject *code, int i) +de_instrument_line(_PyCoMonitoringData *monitoring, _Py_CODEUNIT *bytecode, int i) { - _Py_CODEUNIT *instr = &_PyCode_CODE(code)[i]; + _Py_CODEUNIT *instr = &bytecode[i]; int opcode = instr->op.code; if (opcode != INSTRUMENTED_LINE) { return; } - _PyCoLineInstrumentationData *lines = &code->_co_monitoring->lines[i]; + _PyCoLineInstrumentationData *lines = &monitoring->lines[i]; int original_opcode = lines->original_opcode; if (original_opcode == INSTRUMENTED_INSTRUCTION) { - lines->original_opcode = code->_co_monitoring->per_instruction_opcodes[i]; + lines->original_opcode = monitoring->per_instruction_opcodes[i]; } CHECK(original_opcode != 0); CHECK(original_opcode == _PyOpcode_Deopt[original_opcode]); @@ -810,22 +810,23 @@ remove_line_tools(PyCodeObject * code, int offset, int tools) { ASSERT_WORLD_STOPPED_OR_LOCKED(code); - assert(code->_co_monitoring); - if (code->_co_monitoring->line_tools) + _PyCoMonitoringData *monitoring = code->_co_monitoring; + assert(monitoring); + bool should_deinstrument; + if (monitoring->line_tools) { - uint8_t *toolsptr = &code->_co_monitoring->line_tools[offset]; + uint8_t *toolsptr = &monitoring->line_tools[offset]; *toolsptr &= ~tools; - if (*toolsptr == 0 ) { - de_instrument_line(code, offset); - } + should_deinstrument = (*toolsptr == 0); } else { /* Single tool */ - uint8_t single_tool = code->_co_monitoring->active_monitors.tools[PY_MONITORING_EVENT_LINE]; + uint8_t single_tool = 
monitoring->active_monitors.tools[PY_MONITORING_EVENT_LINE]; assert(_Py_popcount32(single_tool) <= 1); - if (((single_tool & tools) == single_tool)) { - de_instrument_line(code, offset); - } + should_deinstrument = ((single_tool & tools) == single_tool); + } + if (should_deinstrument) { + de_instrument_line(monitoring, _PyCode_CODE(code), offset); } } From b02170421a66969376075f9f1e3ba48f8e4870d9 Mon Sep 17 00:00:00 2001 From: Matt Page Date: Sat, 31 Aug 2024 20:42:46 -0700 Subject: [PATCH 06/67] Instrument thread-local bytecode --- Include/cpython/code.h | 8 +- Include/internal/pycore_code.h | 11 +- Lib/test/test_sys.py | 14 ++- Objects/codeobject.c | 48 ++++++--- Python/bytecodes.c | 20 +++- Python/ceval.c | 3 +- Python/generated_cases.c.h | 38 +++++-- Python/instrumentation.c | 188 ++++++++++++++++++++++----------- 8 files changed, 230 insertions(+), 100 deletions(-) diff --git a/Include/cpython/code.h b/Include/cpython/code.h index 7ec795793a434b..d743c91a5ff704 100644 --- a/Include/cpython/code.h +++ b/Include/cpython/code.h @@ -69,12 +69,18 @@ typedef struct { } _PyCoMonitoringData; #ifdef Py_GIL_DISABLED +typedef struct { + uintptr_t instrumentation_version; + // TODO(mpage) - Change this to _Py_CODEUNIT*? + uint8_t *bytecode; +} _PySpecializableCode; + /* Each thread specializes a thread-local copy of the bytecode in free-threaded * builds. These copies are stored on the code object in a `_PyCodeArray`. 
*/ typedef struct { Py_ssize_t size; - char *entries[]; + _PySpecializableCode entries[]; } _PyCodeArray; #define _PyCode_DEF_THREAD_LOCAL_BYTECODE() _PyCodeArray *co_specialized_code; diff --git a/Include/internal/pycore_code.h b/Include/internal/pycore_code.h index b004b42bc0fabb..c4b597388b0fd0 100644 --- a/Include/internal/pycore_code.h +++ b/Include/internal/pycore_code.h @@ -591,7 +591,7 @@ adaptive_counter_backoff(_Py_BackoffCounter counter) { #define COMPARISON_NOT_EQUALS (COMPARISON_UNORDERED | COMPARISON_LESS_THAN | COMPARISON_GREATER_THAN) -extern int _Py_Instrument(PyCodeObject *co, PyInterpreterState *interp); +extern int _Py_Instrument(PyCodeObject *co, _Py_CODEUNIT *bytecode, PyInterpreterState *interp); extern _Py_CODEUNIT _Py_GetBaseCodeUnit(PyCodeObject *code, int offset); @@ -603,22 +603,23 @@ PyAPI_DATA(const struct _PyCode8) _Py_InitCleanup; #ifdef Py_GIL_DISABLED -extern _Py_CODEUNIT *_PyCode_CreateSpecializableCode(PyCodeObject *co); +extern _PySpecializableCode *_PyCode_CreateSpecializableCode(PyCodeObject *co); + /* Return bytecode that should be executed. * Will not return NULL, but may disable specialization, in which case the * returned bytecode should not be specialized. * * XXX - This is a confusing contract. */ -static inline _Py_CODEUNIT * +static inline _PySpecializableCode * _PyCode_GetSpecializableCode(PyCodeObject *co) { _PyCodeArray *code = _Py_atomic_load_ptr_acquire(&co->co_specialized_code); _PyThreadStateImpl *tstate = (_PyThreadStateImpl *) PyThreadState_GET(); Py_ssize_t idx = tstate->specialized_code_index; - if (idx < code->size && code->entries[idx] != NULL) { + if (idx < code->size && code->entries[idx].bytecode != NULL) { // XXX - Do we need to worry about alignment here? 
- return (_Py_CODEUNIT *) code->entries[idx]; + return &code->entries[idx]; } return _PyCode_CreateSpecializableCode(co); } diff --git a/Lib/test/test_sys.py b/Lib/test/test_sys.py index 42b5a7c94e7700..434b3e315e0aa3 100644 --- a/Lib/test/test_sys.py +++ b/Lib/test/test_sys.py @@ -1084,7 +1084,14 @@ def test_getallocatedblocks(self): # While we could imagine a Python session where the number of # multiple buffer objects would exceed the sharing of references, # it is unlikely to happen in a normal test run. - self.assertLess(a, sys.gettotalrefcount()) + # + # In free-threaded builds each code object owns an array of + # pointers to copies of the bytecode. When the number of + # code objects is a large fraction of the total number of + # references, this can cause the total number of allocated + # blocks to exceed the total number of references. + if not support.Py_GIL_DISABLED: + self.assertLess(a, sys.gettotalrefcount()) except AttributeError: # gettotalrefcount() not available pass @@ -1603,7 +1610,10 @@ class C(object): pass def func(): return sys._getframe() x = func() - INTERPRETER_FRAME = '9PhcP' + if support.Py_GIL_DISABLED: + INTERPRETER_FRAME = '10PhcP' + else: + INTERPRETER_FRAME = '9PhcP' check(x, size('3PiccPP' + INTERPRETER_FRAME + 'P')) # function def func(): pass diff --git a/Objects/codeobject.c b/Objects/codeobject.c index c9d6dedec3538c..4b418b1779a7e6 100644 --- a/Objects/codeobject.c +++ b/Objects/codeobject.c @@ -453,6 +453,10 @@ _PyCode_Validate(struct _PyCodeConstructor *con) extern void _PyCode_Quicken(_Py_CODEUNIT *instructions, Py_ssize_t size); +#ifdef Py_GIL_DISABLED +static _PyCodeArray * _PyCodeArray_New(Py_ssize_t size); +#endif + static int init_code(PyCodeObject *co, struct _PyCodeConstructor *con) { @@ -517,10 +521,11 @@ init_code(PyCodeObject *co, struct _PyCodeConstructor *con) memcpy(_PyCode_CODE(co), PyBytes_AS_STRING(con->code), PyBytes_GET_SIZE(con->code)); #ifdef Py_GIL_DISABLED - // XXX - initialize code array - 
co->co_specialized_code = PyMem_Calloc(1, sizeof(_PyCodeArray) + sizeof(void*) * INITIAL_SPECIALIZED_CODE_SIZE); - co->co_specialized_code->size = INITIAL_SPECIALIZED_CODE_SIZE; - co->co_specialized_code->entries[0] = (char *) _PyCode_CODE(co); + co->co_specialized_code = _PyCodeArray_New(INITIAL_SPECIALIZED_CODE_SIZE); + if (co->co_specialized_code == NULL) { + return -1; + } + co->co_specialized_code->entries[0].bytecode = (uint8_t *) _PyCode_CODE(co); #endif int entry_point = 0; while (entry_point < Py_SIZE(co) && @@ -1891,7 +1896,7 @@ code_dealloc(PyCodeObject *co) // part of the code object, which will be freed when the code object is // freed. for (Py_ssize_t i = 1; i < co->co_specialized_code->size; i++) { - PyMem_Free(co->co_specialized_code->entries[i]); + PyMem_Free(co->co_specialized_code->entries[i].bytecode); } PyMem_Free(co->co_specialized_code); #endif @@ -2664,6 +2669,18 @@ _PyCode_Fini(PyInterpreterState *interp) #ifdef Py_GIL_DISABLED +static _PyCodeArray * +_PyCodeArray_New(Py_ssize_t size) +{ + _PyCodeArray *arr = PyMem_Calloc(sizeof(_PyCodeArray) + sizeof(_PySpecializableCode) * size, 1); + if (arr == NULL) { + PyErr_NoMemory(); + return NULL; + } + arr->size = size; + return arr; +} + static void copy_code(_Py_CODEUNIT *dst, _Py_CODEUNIT *src, Py_ssize_t nbytes) { @@ -2675,38 +2692,35 @@ copy_code(_Py_CODEUNIT *dst, _Py_CODEUNIT *src, Py_ssize_t nbytes) _PyCode_Quicken(dst_bytecode, code_len); } -static _Py_CODEUNIT * +static _PySpecializableCode * create_specializable_code_lock_held(PyCodeObject *co) { _PyCodeArray *spec_code = co->co_specialized_code; _PyThreadStateImpl *tstate = (_PyThreadStateImpl *) PyThreadState_GET(); Py_ssize_t idx = tstate->specialized_code_index; if (idx >= spec_code->size) { - Py_ssize_t new_size = spec_code->size * 2; - _PyCodeArray *new_spec_code = PyMem_Calloc(sizeof(_PyCodeArray) + sizeof(char*) * new_size, 1); + _PyCodeArray *new_spec_code = _PyCodeArray_New(spec_code->size * 2); if (new_spec_code == NULL) { - 
PyErr_NoMemory(); return NULL; } - new_spec_code->size = new_size; - memcpy(new_spec_code->entries, spec_code->entries, spec_code->size * sizeof(char*)); + memcpy(new_spec_code->entries, spec_code->entries, spec_code->size * sizeof(_PySpecializableCode)); _Py_atomic_store_ptr_release(&co->co_specialized_code, new_spec_code); _PyMem_FreeDelayed(spec_code); spec_code = new_spec_code; } - spec_code->entries[idx] = PyMem_Malloc(_PyCode_NBYTES(co)); - if (spec_code->entries[idx] == NULL) { + spec_code->entries[idx].bytecode = PyMem_Malloc(_PyCode_NBYTES(co)); + if (spec_code->entries[idx].bytecode == NULL) { PyErr_NoMemory(); return NULL; } - copy_code((_Py_CODEUNIT *) spec_code->entries[idx], _PyCode_CODE(co), _PyCode_NBYTES(co)); - return (_Py_CODEUNIT *) spec_code->entries[idx]; + copy_code((_Py_CODEUNIT *) spec_code->entries[idx].bytecode, _PyCode_CODE(co), _PyCode_NBYTES(co)); + return &spec_code->entries[idx]; } -_Py_CODEUNIT * +_PySpecializableCode * _PyCode_CreateSpecializableCode(PyCodeObject *co) { - _Py_CODEUNIT *result; + _PySpecializableCode *result; Py_BEGIN_CRITICAL_SECTION(co); result = create_specializable_code_lock_held(co); Py_END_CRITICAL_SECTION(); diff --git a/Python/bytecodes.c b/Python/bytecodes.c index d1cea0b36e5fe1..74b239d9ae0701 100644 --- a/Python/bytecodes.c +++ b/Python/bytecodes.c @@ -178,9 +178,16 @@ dummy_func( tier1 op(_MAYBE_INSTRUMENT, (--)) { if (tstate->tracing == 0) { uintptr_t global_version = _Py_atomic_load_uintptr_relaxed(&tstate->eval_breaker) & ~_PY_EVAL_EVENTS_MASK; - uintptr_t code_version = FT_ATOMIC_LOAD_UINTPTR_ACQUIRE(_PyFrame_GetCode(frame)->_co_instrumentation_version); + PyCodeObject *code = _PyFrame_GetCode(frame); +#ifdef Py_GIL_DISABLED + _PySpecializableCode *spec_code = _PyCode_GetSpecializableCode(code); + uintptr_t code_version = spec_code->instrumentation_version; +#else + uintptr_t code_version = FT_ATOMIC_LOAD_UINTPTR_ACQUIRE(code->_co_instrumentation_version); +#endif if (code_version != global_version) 
{ - int err = _Py_Instrument(_PyFrame_GetCode(frame), tstate->interp); + _Py_CODEUNIT *bytecode = _PyFrame_GetBytecode(frame); + int err = _Py_Instrument(code, bytecode, tstate->interp); if (err) { ERROR_NO_POP(); } @@ -192,11 +199,14 @@ dummy_func( op(_LOAD_BYTECODE, (--)) { #ifdef Py_GIL_DISABLED - if (frame->instr_ptr == frame->bytecode) { - frame->bytecode = _PyCode_GetSpecializableCode(_PyFrame_GetCode(frame)); - frame->instr_ptr = frame->bytecode; + _PySpecializableCode *code = _PyCode_GetSpecializableCode(_PyFrame_GetCode(frame)); + if (frame->bytecode != (_Py_CODEUNIT *) code->bytecode) { + int off = this_instr - frame->bytecode; + frame->bytecode = (_Py_CODEUNIT *) code->bytecode; + frame->instr_ptr = frame->bytecode + off; this_instr = frame->instr_ptr; next_instr = frame->instr_ptr + 1; + } #endif } diff --git a/Python/ceval.c b/Python/ceval.c index 71c4bd591a3596..b726c24ef20146 100644 --- a/Python/ceval.c +++ b/Python/ceval.c @@ -808,7 +808,8 @@ _PyEval_EvalFrameDefault(PyThreadState *tstate, _PyInterpreterFrame *frame, int } /* Because this avoids the RESUME, * we need to update instrumentation */ - _Py_Instrument(_PyFrame_GetCode(frame), tstate->interp); + /* TODO(mpage) - Need to handle this */ + _Py_Instrument(_PyFrame_GetCode(frame), _PyFrame_GetBytecode(frame), tstate->interp); monitor_throw(tstate, frame, frame->instr_ptr); /* TO DO -- Monitor throw entry. 
*/ goto resume_with_error; diff --git a/Python/generated_cases.c.h b/Python/generated_cases.c.h index a533647e85ec53..fff8ca977d33ac 100644 --- a/Python/generated_cases.c.h +++ b/Python/generated_cases.c.h @@ -4492,9 +4492,11 @@ // _LOAD_BYTECODE { #ifdef Py_GIL_DISABLED - if (frame->instr_ptr == frame->bytecode) { - frame->bytecode = _PyCode_GetSpecializableCode(_PyFrame_GetCode(frame)); - frame->instr_ptr = frame->bytecode; + _PySpecializableCode *code = _PyCode_GetSpecializableCode(_PyFrame_GetCode(frame)); + if (frame->bytecode != (_Py_CODEUNIT *) code->bytecode) { + int off = this_instr - frame->bytecode; + frame->bytecode = (_Py_CODEUNIT *) code->bytecode; + frame->instr_ptr = frame->bytecode + off; this_instr = frame->instr_ptr; next_instr = frame->instr_ptr + 1; } @@ -4504,9 +4506,16 @@ { if (tstate->tracing == 0) { uintptr_t global_version = _Py_atomic_load_uintptr_relaxed(&tstate->eval_breaker) & ~_PY_EVAL_EVENTS_MASK; - uintptr_t code_version = FT_ATOMIC_LOAD_UINTPTR_ACQUIRE(_PyFrame_GetCode(frame)->_co_instrumentation_version); + PyCodeObject *code = _PyFrame_GetCode(frame); + #ifdef Py_GIL_DISABLED + _PySpecializableCode *spec_code = _PyCode_GetSpecializableCode(code); + uintptr_t code_version = spec_code->instrumentation_version; + #else + uintptr_t code_version = FT_ATOMIC_LOAD_UINTPTR_ACQUIRE(code->_co_instrumentation_version); + #endif if (code_version != global_version) { - int err = _Py_Instrument(_PyFrame_GetCode(frame), tstate->interp); + _Py_CODEUNIT *bytecode = _PyFrame_GetBytecode(frame); + int err = _Py_Instrument(code, bytecode, tstate->interp); if (err) { goto error; } @@ -6442,9 +6451,11 @@ // _LOAD_BYTECODE { #ifdef Py_GIL_DISABLED - if (frame->instr_ptr == frame->bytecode) { - frame->bytecode = _PyCode_GetSpecializableCode(_PyFrame_GetCode(frame)); - frame->instr_ptr = frame->bytecode; + _PySpecializableCode *code = _PyCode_GetSpecializableCode(_PyFrame_GetCode(frame)); + if (frame->bytecode != (_Py_CODEUNIT *) code->bytecode) { + int 
off = this_instr - frame->bytecode; + frame->bytecode = (_Py_CODEUNIT *) code->bytecode; + frame->instr_ptr = frame->bytecode + off; this_instr = frame->instr_ptr; next_instr = frame->instr_ptr + 1; } @@ -6454,9 +6465,16 @@ { if (tstate->tracing == 0) { uintptr_t global_version = _Py_atomic_load_uintptr_relaxed(&tstate->eval_breaker) & ~_PY_EVAL_EVENTS_MASK; - uintptr_t code_version = FT_ATOMIC_LOAD_UINTPTR_ACQUIRE(_PyFrame_GetCode(frame)->_co_instrumentation_version); + PyCodeObject *code = _PyFrame_GetCode(frame); + #ifdef Py_GIL_DISABLED + _PySpecializableCode *spec_code = _PyCode_GetSpecializableCode(code); + uintptr_t code_version = spec_code->instrumentation_version; + #else + uintptr_t code_version = FT_ATOMIC_LOAD_UINTPTR_ACQUIRE(code->_co_instrumentation_version); + #endif if (code_version != global_version) { - int err = _Py_Instrument(_PyFrame_GetCode(frame), tstate->interp); + _Py_CODEUNIT *bytecode = _PyFrame_GetBytecode(frame); + int err = _Py_Instrument(code, bytecode, tstate->interp); if (err) { goto error; } diff --git a/Python/instrumentation.c b/Python/instrumentation.c index aab25f7dd80f65..1c1f8f1474a9d8 100644 --- a/Python/instrumentation.c +++ b/Python/instrumentation.c @@ -18,6 +18,8 @@ #include "pycore_pyerrors.h" #include "pycore_pystate.h" // _PyInterpreterState_GET() +// TODO(mpage) - Document how we keep everything in sync + /* Uncomment this to dump debugging output when assertions fail */ // #define INSTRUMENT_DEBUG 1 @@ -672,19 +674,20 @@ de_instrument_line(_PyCoMonitoringData *monitoring, _Py_CODEUNIT *bytecode, int } static void -de_instrument_per_instruction(PyCodeObject *code, int i) +de_instrument_per_instruction(_PyCoMonitoringData *monitoring, + _Py_CODEUNIT *bytecode, int i) { - _Py_CODEUNIT *instr = &_PyCode_CODE(code)[i]; + _Py_CODEUNIT *instr = &bytecode[i]; uint8_t *opcode_ptr = &instr->op.code; int opcode = *opcode_ptr; if (opcode == INSTRUMENTED_LINE) { - opcode_ptr = &code->_co_monitoring->lines[i].original_opcode; + 
opcode_ptr = &monitoring->lines[i].original_opcode; opcode = *opcode_ptr; } if (opcode != INSTRUMENTED_INSTRUCTION) { return; } - int original_opcode = code->_co_monitoring->per_instruction_opcodes[i]; + int original_opcode = monitoring->per_instruction_opcodes[i]; CHECK(original_opcode != 0); CHECK(original_opcode == _PyOpcode_Deopt[original_opcode]); *opcode_ptr = original_opcode; @@ -697,18 +700,18 @@ de_instrument_per_instruction(PyCodeObject *code, int i) static void -instrument(PyCodeObject *code, int i) +instrument(_PyCoMonitoringData *monitoring, _Py_CODEUNIT *bytecode, int i) { - _Py_CODEUNIT *instr = &_PyCode_CODE(code)[i]; + _Py_CODEUNIT *instr = &bytecode[i]; uint8_t *opcode_ptr = &instr->op.code; int opcode =*opcode_ptr; if (opcode == INSTRUMENTED_LINE) { - _PyCoLineInstrumentationData *lines = &code->_co_monitoring->lines[i]; + _PyCoLineInstrumentationData *lines = &monitoring->lines[i]; opcode_ptr = &lines->original_opcode; opcode = *opcode_ptr; } if (opcode == INSTRUMENTED_INSTRUCTION) { - opcode_ptr = &code->_co_monitoring->per_instruction_opcodes[i]; + opcode_ptr = &monitoring->per_instruction_opcodes[i]; opcode = *opcode_ptr; CHECK(opcode != INSTRUMENTED_INSTRUCTION && opcode != INSTRUMENTED_LINE); CHECK(opcode == _PyOpcode_Deopt[opcode]); @@ -720,53 +723,54 @@ instrument(PyCodeObject *code, int i) assert(instrumented); FT_ATOMIC_STORE_UINT8_RELAXED(*opcode_ptr, instrumented); if (_PyOpcode_Caches[deopt]) { - FT_ATOMIC_STORE_UINT16_RELAXED(instr[1].counter.as_counter, - adaptive_counter_warmup().as_counter); + FT_ATOMIC_STORE_UINT16_RELAXED(instr[1].counter.as_counter, + adaptive_counter_warmup().as_counter); instr[1].counter = adaptive_counter_warmup(); } } } static void -instrument_line(PyCodeObject *code, int i) +instrument_line(_PyCoMonitoringData *monitoring, _Py_CODEUNIT *bytecode, int i) { - uint8_t *opcode_ptr = &_PyCode_CODE(code)[i].op.code; + uint8_t *opcode_ptr = &bytecode[i].op.code; int opcode = *opcode_ptr; if (opcode == 
INSTRUMENTED_LINE) { return; } - _PyCoLineInstrumentationData *lines = &code->_co_monitoring->lines[i]; + _PyCoLineInstrumentationData *lines = &monitoring->lines[i]; lines->original_opcode = _PyOpcode_Deopt[opcode]; CHECK(lines->original_opcode > 0); *opcode_ptr = INSTRUMENTED_LINE; } static void -instrument_per_instruction(PyCodeObject *code, int i) +instrument_per_instruction(_PyCoMonitoringData *monitoring, + _Py_CODEUNIT *bytecode, int i) { - _Py_CODEUNIT *instr = &_PyCode_CODE(code)[i]; + _Py_CODEUNIT *instr = &bytecode[i]; uint8_t *opcode_ptr = &instr->op.code; int opcode = *opcode_ptr; if (opcode == INSTRUMENTED_LINE) { - _PyCoLineInstrumentationData *lines = &code->_co_monitoring->lines[i]; + _PyCoLineInstrumentationData *lines = &monitoring->lines[i]; opcode_ptr = &lines->original_opcode; opcode = *opcode_ptr; } if (opcode == INSTRUMENTED_INSTRUCTION) { - assert(code->_co_monitoring->per_instruction_opcodes[i] > 0); + assert(monitoring->per_instruction_opcodes[i] > 0); return; } CHECK(opcode != 0); if (is_instrumented(opcode)) { - code->_co_monitoring->per_instruction_opcodes[i] = opcode; + monitoring->per_instruction_opcodes[i] = opcode; } else { assert(opcode != 0); assert(_PyOpcode_Deopt[opcode] != 0); assert(_PyOpcode_Deopt[opcode] != RESUME); - code->_co_monitoring->per_instruction_opcodes[i] = _PyOpcode_Deopt[opcode]; + monitoring->per_instruction_opcodes[i] = _PyOpcode_Deopt[opcode]; } - assert(code->_co_monitoring->per_instruction_opcodes[i] > 0); + assert(monitoring->per_instruction_opcodes[i] > 0); *opcode_ptr = INSTRUMENTED_INSTRUCTION; } @@ -779,19 +783,29 @@ remove_tools(PyCodeObject * code, int offset, int event, int tools) assert(PY_MONITORING_IS_INSTRUMENTED_EVENT(event)); assert(opcode_has_event(_Py_GetBaseCodeUnit(code, offset).op.code)); _PyCoMonitoringData *monitoring = code->_co_monitoring; - bool should_deinstrument; + bool should_de_instrument; if (monitoring && monitoring->tools) { monitoring->tools[offset] &= ~tools; - 
should_deinstrument = (monitoring->tools[offset] == 0); + should_de_instrument = (monitoring->tools[offset] == 0); } else { /* Single tool */ uint8_t single_tool = code->_co_monitoring->active_monitors.tools[event]; assert(_Py_popcount32(single_tool) <= 1); - should_deinstrument = ((single_tool & tools) == single_tool); + should_de_instrument = ((single_tool & tools) == single_tool); } - if (should_deinstrument) { + if (should_de_instrument) { +#ifdef Py_GIL_DISABLED + for (Py_ssize_t i = 0; i < code->co_specialized_code->size; i++) { + _Py_CODEUNIT *bc = (_Py_CODEUNIT *) code->co_specialized_code->entries[i].bytecode; + if (bc == NULL) { + continue; + } + de_instrument(monitoring, bc, offset, event); + } +#else de_instrument(monitoring, _PyCode_CODE(code), offset, event); +#endif } } @@ -812,26 +826,36 @@ remove_line_tools(PyCodeObject * code, int offset, int tools) _PyCoMonitoringData *monitoring = code->_co_monitoring; assert(monitoring); - bool should_deinstrument; + bool should_de_instrument; if (monitoring->line_tools) { uint8_t *toolsptr = &monitoring->line_tools[offset]; *toolsptr &= ~tools; - should_deinstrument = (*toolsptr == 0); + should_de_instrument = (*toolsptr == 0); } else { /* Single tool */ uint8_t single_tool = monitoring->active_monitors.tools[PY_MONITORING_EVENT_LINE]; assert(_Py_popcount32(single_tool) <= 1); - should_deinstrument = ((single_tool & tools) == single_tool); + should_de_instrument = ((single_tool & tools) == single_tool); } - if (should_deinstrument) { + if (should_de_instrument) { +#ifdef Py_GIL_DISABLED + for (Py_ssize_t i = 0; i < code->co_specialized_code->size; i++) { + _Py_CODEUNIT *bc = (_Py_CODEUNIT *) code->co_specialized_code->entries[i].bytecode; + if (bc == NULL) { + continue; + } + de_instrument_line(monitoring, bc, offset); + } +#else de_instrument_line(monitoring, _PyCode_CODE(code), offset); +#endif } } static void -add_tools(PyCodeObject * code, int offset, int event, int tools) +add_tools(PyCodeObject * code, 
_Py_CODEUNIT *bytecode, int offset, int event, int tools) { ASSERT_WORLD_STOPPED_OR_LOCKED(code); assert(event != PY_MONITORING_EVENT_LINE); @@ -848,11 +872,11 @@ add_tools(PyCodeObject * code, int offset, int event, int tools) assert(_Py_popcount32(tools) == 1); assert(tools_is_subset_for_event(code, event, tools)); } - instrument(code, offset); + instrument(code->_co_monitoring, bytecode, offset); } static void -add_line_tools(PyCodeObject * code, int offset, int tools) +add_line_tools(PyCodeObject * code, _Py_CODEUNIT *bytecode, int offset, int tools) { ASSERT_WORLD_STOPPED_OR_LOCKED(code); @@ -865,12 +889,12 @@ add_line_tools(PyCodeObject * code, int offset, int tools) /* Single tool */ assert(_Py_popcount32(tools) == 1); } - instrument_line(code, offset); + instrument_line(code->_co_monitoring, bytecode, offset); } static void -add_per_instruction_tools(PyCodeObject * code, int offset, int tools) +add_per_instruction_tools(PyCodeObject * code, _Py_CODEUNIT *bytecode, int offset, int tools) { ASSERT_WORLD_STOPPED_OR_LOCKED(code); @@ -883,7 +907,7 @@ add_per_instruction_tools(PyCodeObject * code, int offset, int tools) /* Single tool */ assert(_Py_popcount32(tools) == 1); } - instrument_per_instruction(code, offset); + instrument_per_instruction(code->_co_monitoring, bytecode, offset); } @@ -892,21 +916,32 @@ remove_per_instruction_tools(PyCodeObject * code, int offset, int tools) { ASSERT_WORLD_STOPPED_OR_LOCKED(code); + _PyCoMonitoringData *monitoring = code->_co_monitoring; assert(code->_co_monitoring); + bool should_de_instrument; if (code->_co_monitoring->per_instruction_tools) { uint8_t *toolsptr = &code->_co_monitoring->per_instruction_tools[offset]; *toolsptr &= ~tools; - if (*toolsptr == 0) { - de_instrument_per_instruction(code, offset); - } + should_de_instrument = (*toolsptr == 0); } else { /* Single tool */ uint8_t single_tool = code->_co_monitoring->active_monitors.tools[PY_MONITORING_EVENT_INSTRUCTION]; assert(_Py_popcount32(single_tool) <= 1); - 
if (((single_tool & tools) == single_tool)) { - de_instrument_per_instruction(code, offset); + should_de_instrument = ((single_tool & tools) == single_tool); + } + if (should_de_instrument) { +#ifdef Py_GIL_DISABLED + for (Py_ssize_t i = 0; i < code->co_specialized_code->size; i++) { + _Py_CODEUNIT *bc = (_Py_CODEUNIT *) code->co_specialized_code->entries[i].bytecode; + if (bc == NULL) { + continue; + } + de_instrument_per_instruction(monitoring, bc, offset); } +#else + de_instrument_per_instruction(monitoring, _PyCode_CODE(code), offset); +#endif } } @@ -1094,7 +1129,7 @@ call_instrumentation_vector( PyCodeObject *code = _PyFrame_GetCode(frame); assert(args[1] == NULL); args[1] = (PyObject *)code; - int offset = (int)(instr - _PyCode_CODE(code)); + int offset = (int)(instr - _PyFrame_GetBytecode(frame)); /* Offset visible to user should be the offset in bytes, as that is the * convention for APIs involving code offsets. */ int bytes_offset = offset * (int)sizeof(_Py_CODEUNIT); @@ -1180,8 +1215,7 @@ _Py_call_instrumentation_jump( assert(event == PY_MONITORING_EVENT_JUMP || event == PY_MONITORING_EVENT_BRANCH); assert(frame->instr_ptr == instr); - PyCodeObject *code = _PyFrame_GetCode(frame); - int to = (int)(target - _PyCode_CODE(code)); + int to = (int)(target - _PyFrame_GetBytecode(frame)); PyObject *to_obj = PyLong_FromLong(to * (int)sizeof(_Py_CODEUNIT)); if (to_obj == NULL) { return NULL; @@ -1247,7 +1281,8 @@ _Py_call_instrumentation_line(PyThreadState *tstate, _PyInterpreterFrame* frame, PyCodeObject *code = _PyFrame_GetCode(frame); assert(tstate->tracing == 0); assert(debug_check_sanity(tstate->interp, code)); - int i = (int)(instr - _PyCode_CODE(code)); + _Py_CODEUNIT *bytecode = _PyFrame_GetBytecode(frame); + int i = (int)(instr - bytecode); _PyCoMonitoringData *monitoring = code->_co_monitoring; _PyCoLineInstrumentationData *line_data = &monitoring->lines[i]; @@ -1263,10 +1298,10 @@ _Py_call_instrumentation_line(PyThreadState *tstate, 
_PyInterpreterFrame* frame, line = compute_line(code, i, line_delta); assert(line >= 0); assert(prev != NULL); - int prev_index = (int)(prev - _PyCode_CODE(code)); + int prev_index = (int)(prev - bytecode); int prev_line = _Py_Instrumentation_GetLine(code, prev_index); if (prev_line == line) { - int prev_opcode = _PyCode_CODE(code)[prev_index].op.code; + int prev_opcode = bytecode[prev_index].op.code; /* RESUME and INSTRUMENTED_RESUME are needed for the operation of * instrumentation, so must never be hidden by an INSTRUMENTED_LINE. */ @@ -1366,7 +1401,7 @@ int _Py_call_instrumentation_instruction(PyThreadState *tstate, _PyInterpreterFrame* frame, _Py_CODEUNIT *instr) { PyCodeObject *code = _PyFrame_GetCode(frame); - int offset = (int)(instr - _PyCode_CODE(code)); + int offset = (int)(instr - _PyFrame_GetBytecode(frame)); _PyCoMonitoringData *instrumentation_data = code->_co_monitoring; assert(instrumentation_data->per_instruction_opcodes); int next_opcode = instrumentation_data->per_instruction_opcodes[offset]; @@ -1720,8 +1755,29 @@ update_instrumentation_data(PyCodeObject *code, PyInterpreterState *interp) return 0; } +static void +update_instrumentation_version(PyCodeObject *code, _Py_CODEUNIT *bytecode, + PyInterpreterState *interp) +{ + ASSERT_WORLD_STOPPED_OR_LOCKED(code); + + uint32_t interp_version = global_version(interp); +#ifdef Py_GIL_DISABLED + for (Py_ssize_t i = 0; i < code->co_specialized_code->size; i++) { + _PySpecializableCode *spec_code = &code->co_specialized_code->entries[i]; + if (spec_code->bytecode == (uint8_t *) bytecode) { + spec_code->instrumentation_version = interp_version; + } + } +#else + (void)bytecode; +#endif + FT_ATOMIC_STORE_UINTPTR_RELEASE(code->_co_instrumentation_version, + interp_version); +} + static int -force_instrument_lock_held(PyCodeObject *code, PyInterpreterState *interp) +force_instrument_lock_held(PyCodeObject *code, _Py_CODEUNIT *bytecode, PyInterpreterState *interp) { ASSERT_WORLD_STOPPED_OR_LOCKED(code); @@ 
-1783,7 +1839,7 @@ force_instrument_lock_held(PyCodeObject *code, PyInterpreterState *interp) } uint8_t new_tools = new_events.tools[event]; if (new_tools) { - add_tools(code, i, event, new_tools); + add_tools(code, bytecode, i, event, new_tools); } } } @@ -1824,7 +1880,7 @@ force_instrument_lock_held(PyCodeObject *code, PyInterpreterState *interp) _PyCoLineInstrumentationData *line_data = code->_co_monitoring->lines; for (int i = code->_co_firsttraceable; i < code_len;) { if (line_data[i].original_opcode) { - add_line_tools(code, i, new_line_tools); + add_line_tools(code, bytecode, i, new_line_tools); } i += _PyInstruction_GetLength(code, i); } @@ -1836,14 +1892,13 @@ force_instrument_lock_held(PyCodeObject *code, PyInterpreterState *interp) i += _PyInstruction_GetLength(code, i); continue; } - add_per_instruction_tools(code, i, new_per_instruction_tools); + add_per_instruction_tools(code, bytecode, i, new_per_instruction_tools); i += _PyInstruction_GetLength(code, i); } } done: - FT_ATOMIC_STORE_UINTPTR_RELEASE(code->_co_instrumentation_version, - global_version(interp)); + update_instrumentation_version(code, bytecode, interp); #ifdef INSTRUMENT_DEBUG sanity_check_instrumentation(code); @@ -1852,11 +1907,11 @@ force_instrument_lock_held(PyCodeObject *code, PyInterpreterState *interp) } static int -instrument_lock_held(PyCodeObject *code, PyInterpreterState *interp) +instrument_lock_held(PyCodeObject *code, _Py_CODEUNIT *bytecode, PyInterpreterState *interp) { ASSERT_WORLD_STOPPED_OR_LOCKED(code); - if (is_version_up_to_date(code, interp)) { + if (0 && is_version_up_to_date(code, interp)) { assert( interp->ceval.instrumentation_version == 0 || instrumentation_cross_checks(interp, code) @@ -1864,15 +1919,15 @@ instrument_lock_held(PyCodeObject *code, PyInterpreterState *interp) return 0; } - return force_instrument_lock_held(code, interp); + return force_instrument_lock_held(code, bytecode, interp); } int -_Py_Instrument(PyCodeObject *code, PyInterpreterState 
*interp) +_Py_Instrument(PyCodeObject *code, _Py_CODEUNIT *bytecode, PyInterpreterState *interp) { int res; LOCK_CODE(code); - res = instrument_lock_held(code, interp); + res = instrument_lock_held(code, bytecode, interp); UNLOCK_CODE(); return res; } @@ -1897,7 +1952,9 @@ instrument_all_executing_code_objects(PyInterpreterState *interp) { _PyInterpreterFrame *frame = ts->current_frame; while (frame) { if (frame->owner != FRAME_OWNED_BY_CSTACK) { - if (instrument_lock_held(_PyFrame_GetCode(frame), interp)) { + PyCodeObject *code = _PyFrame_GetCode(frame); + _Py_CODEUNIT *bytecode = _PyFrame_GetBytecode(frame); + if (instrument_lock_held(code, bytecode, interp)) { return -1; } } @@ -2014,7 +2071,20 @@ _PyMonitoring_SetLocalEvents(PyCodeObject *code, int tool_id, _PyMonitoringEvent } set_local_events(local, tool_id, events); - res = force_instrument_lock_held(code, interp); +#ifdef Py_GIL_DISABLED + res = 0; + for (Py_ssize_t i = 0; i < code->co_specialized_code->size; i++) { + _Py_CODEUNIT *bytecode = (_Py_CODEUNIT *) code->co_specialized_code->entries[i].bytecode; + if (bytecode != NULL) { + res = force_instrument_lock_held(code, bytecode, interp); + if (res < 0) { + goto done; + } + } + } +#else + res = force_instrument_lock_held(code, _PyCode_CODE(code), interp); +#endif done: _PyEval_StartTheWorld(interp); From aea69c56a7c33f95a454acc73ea253783e5c6e6a Mon Sep 17 00:00:00 2001 From: Matt Page Date: Mon, 2 Sep 2024 21:25:28 -0700 Subject: [PATCH 07/67] Use locks for instrumentation --- Include/cpython/code.h | 14 ++-- Include/internal/pycore_code.h | 10 +-- Objects/codeobject.c | 50 ++++++++---- Python/bytecodes.c | 13 +-- Python/ceval.c | 2 +- Python/generated_cases.c.h | 26 ++---- Python/instrumentation.c | 140 +++++++++++---------------------- 7 files changed, 102 insertions(+), 153 deletions(-) diff --git a/Include/cpython/code.h b/Include/cpython/code.h index d743c91a5ff704..4e8b364ea1ee67 100644 --- a/Include/cpython/code.h +++ b/Include/cpython/code.h @@ 
-69,21 +69,23 @@ typedef struct { } _PyCoMonitoringData; #ifdef Py_GIL_DISABLED + typedef struct { - uintptr_t instrumentation_version; - // TODO(mpage) - Change this to _Py_CODEUNIT*? - uint8_t *bytecode; -} _PySpecializableCode; + PyMutex mutex; + char bytecode[]; +} _PyMutBytecode; /* Each thread specializes a thread-local copy of the bytecode in free-threaded * builds. These copies are stored on the code object in a `_PyCodeArray`. */ typedef struct { Py_ssize_t size; - _PySpecializableCode entries[]; + _PyMutBytecode *entries[]; } _PyCodeArray; -#define _PyCode_DEF_THREAD_LOCAL_BYTECODE() _PyCodeArray *co_specialized_code; +#define _PyCode_DEF_THREAD_LOCAL_BYTECODE() \ + _PyCodeArray *co_specialized_code; \ + PyMutex co_code_adaptive_mutex; #else #define _PyCode_DEF_THREAD_LOCAL_BYTECODE() #endif diff --git a/Include/internal/pycore_code.h b/Include/internal/pycore_code.h index c4b597388b0fd0..2c08cba59fe27e 100644 --- a/Include/internal/pycore_code.h +++ b/Include/internal/pycore_code.h @@ -591,7 +591,7 @@ adaptive_counter_backoff(_Py_BackoffCounter counter) { #define COMPARISON_NOT_EQUALS (COMPARISON_UNORDERED | COMPARISON_LESS_THAN | COMPARISON_GREATER_THAN) -extern int _Py_Instrument(PyCodeObject *co, _Py_CODEUNIT *bytecode, PyInterpreterState *interp); +extern int _Py_Instrument(PyCodeObject *co, PyInterpreterState *interp); extern _Py_CODEUNIT _Py_GetBaseCodeUnit(PyCodeObject *code, int offset); @@ -603,7 +603,7 @@ PyAPI_DATA(const struct _PyCode8) _Py_InitCleanup; #ifdef Py_GIL_DISABLED -extern _PySpecializableCode *_PyCode_CreateSpecializableCode(PyCodeObject *co); +extern _PyMutBytecode *_PyCode_CreateSpecializableCode(PyCodeObject *co); /* Return bytecode that should be executed. * Will not return NULL, but may disable specialization, in which case the @@ -611,15 +611,15 @@ extern _PySpecializableCode *_PyCode_CreateSpecializableCode(PyCodeObject *co); * * XXX - This is a confusing contract. 
*/ -static inline _PySpecializableCode * +static inline _PyMutBytecode * _PyCode_GetSpecializableCode(PyCodeObject *co) { _PyCodeArray *code = _Py_atomic_load_ptr_acquire(&co->co_specialized_code); _PyThreadStateImpl *tstate = (_PyThreadStateImpl *) PyThreadState_GET(); Py_ssize_t idx = tstate->specialized_code_index; - if (idx < code->size && code->entries[idx].bytecode != NULL) { + if (idx < code->size && code->entries[idx] != NULL) { // XXX - Do we need to worry about alignment here? - return &code->entries[idx]; + return code->entries[idx]; } return _PyCode_CreateSpecializableCode(co); } diff --git a/Objects/codeobject.c b/Objects/codeobject.c index 4b418b1779a7e6..31f7acebc7bd73 100644 --- a/Objects/codeobject.c +++ b/Objects/codeobject.c @@ -525,7 +525,8 @@ init_code(PyCodeObject *co, struct _PyCodeConstructor *con) if (co->co_specialized_code == NULL) { return -1; } - co->co_specialized_code->entries[0].bytecode = (uint8_t *) _PyCode_CODE(co); + co->co_specialized_code->entries[0] = (_PyMutBytecode *) &co->co_code_adaptive_mutex; + co->co_specialized_code->entries[0]->mutex = (PyMutex){0}; #endif int entry_point = 0; while (entry_point < Py_SIZE(co) && @@ -1892,11 +1893,10 @@ code_dealloc(PyCodeObject *co) } free_monitoring_data(co->_co_monitoring); #ifdef Py_GIL_DISABLED - // The first element always points to the bytecode that follows the fixed - // part of the code object, which will be freed when the code object is - // freed. + // The first element always points to the mutable bytecode at the end of + // the code object, which will be freed when the code object is freed. 
for (Py_ssize_t i = 1; i < co->co_specialized_code->size; i++) { - PyMem_Free(co->co_specialized_code->entries[i].bytecode); + PyMem_Free(co->co_specialized_code->entries[i]); } PyMem_Free(co->co_specialized_code); #endif @@ -2672,7 +2672,7 @@ _PyCode_Fini(PyInterpreterState *interp) static _PyCodeArray * _PyCodeArray_New(Py_ssize_t size) { - _PyCodeArray *arr = PyMem_Calloc(sizeof(_PyCodeArray) + sizeof(_PySpecializableCode) * size, 1); + _PyCodeArray *arr = PyMem_Calloc(1, sizeof(_PyCodeArray) + sizeof(_PyMutBytecode*) * size); if (arr == NULL) { PyErr_NoMemory(); return NULL; @@ -2682,7 +2682,7 @@ _PyCodeArray_New(Py_ssize_t size) } static void -copy_code(_Py_CODEUNIT *dst, _Py_CODEUNIT *src, Py_ssize_t nbytes) +copy_code(_PyMutBytecode *dst, PyCodeObject *co) { int code_len = Py_SIZE(co); _Py_CODEUNIT *dst_bytecode = (_Py_CODEUNIT *) dst->bytecode; @@ -2692,35 +2692,53 @@ copy_code(_Py_CODEUNIT *dst, _Py_CODEUNIT *src, Py_ssize_t nbytes) _PyCode_Quicken(dst_bytecode, code_len); } -static _PySpecializableCode * +static Py_ssize_t +get_pow2_greater(Py_ssize_t initial, Py_ssize_t limit) +{ + // initial must be a power of two + assert(!(initial & (initial - 1))); + Py_ssize_t res = initial; + while (res && res < limit) { + res <<= 1; + } + return res; +} + +static _PyMutBytecode * create_specializable_code_lock_held(PyCodeObject *co) { _PyCodeArray *spec_code = co->co_specialized_code; _PyThreadStateImpl *tstate = (_PyThreadStateImpl *) PyThreadState_GET(); Py_ssize_t idx = tstate->specialized_code_index; if (idx >= spec_code->size) { - _PyCodeArray *new_spec_code = _PyCodeArray_New(spec_code->size * 2); + Py_ssize_t new_size = get_pow2_greater(spec_code->size, idx + 1); + if (!new_size) { + PyErr_NoMemory(); + return NULL; + } + _PyCodeArray *new_spec_code = _PyCodeArray_New(new_size); if (new_spec_code == NULL) { return NULL; } - memcpy(new_spec_code->entries, spec_code->entries, spec_code->size * sizeof(_PySpecializableCode)); + memcpy(new_spec_code->entries, 
spec_code->entries, spec_code->size * sizeof(void*)); _Py_atomic_store_ptr_release(&co->co_specialized_code, new_spec_code); _PyMem_FreeDelayed(spec_code); spec_code = new_spec_code; } - spec_code->entries[idx].bytecode = PyMem_Malloc(_PyCode_NBYTES(co)); - if (spec_code->entries[idx].bytecode == NULL) { + _PyMutBytecode *bc = PyMem_Calloc(1, sizeof(_PyMutBytecode) + _PyCode_NBYTES(co)); + if (bc == NULL) { PyErr_NoMemory(); return NULL; } - copy_code((_Py_CODEUNIT *) spec_code->entries[idx].bytecode, _PyCode_CODE(co), _PyCode_NBYTES(co)); - return &spec_code->entries[idx]; + copy_code(bc, co); + spec_code->entries[idx] = bc; + return bc; } -_PySpecializableCode * +_PyMutBytecode * _PyCode_CreateSpecializableCode(PyCodeObject *co) { - _PySpecializableCode *result; + _PyMutBytecode *result; Py_BEGIN_CRITICAL_SECTION(co); result = create_specializable_code_lock_held(co); Py_END_CRITICAL_SECTION(); diff --git a/Python/bytecodes.c b/Python/bytecodes.c index 74b239d9ae0701..9ec07d335d0c1b 100644 --- a/Python/bytecodes.c +++ b/Python/bytecodes.c @@ -178,16 +178,9 @@ dummy_func( tier1 op(_MAYBE_INSTRUMENT, (--)) { if (tstate->tracing == 0) { uintptr_t global_version = _Py_atomic_load_uintptr_relaxed(&tstate->eval_breaker) & ~_PY_EVAL_EVENTS_MASK; - PyCodeObject *code = _PyFrame_GetCode(frame); -#ifdef Py_GIL_DISABLED - _PySpecializableCode *spec_code = _PyCode_GetSpecializableCode(code); - uintptr_t code_version = spec_code->instrumentation_version; -#else - uintptr_t code_version = FT_ATOMIC_LOAD_UINTPTR_ACQUIRE(code->_co_instrumentation_version); -#endif + uintptr_t code_version = FT_ATOMIC_LOAD_UINTPTR_ACQUIRE(_PyFrame_GetCode(frame)->_co_instrumentation_version); if (code_version != global_version) { - _Py_CODEUNIT *bytecode = _PyFrame_GetBytecode(frame); - int err = _Py_Instrument(code, bytecode, tstate->interp); + int err = _Py_Instrument(_PyFrame_GetCode(frame), tstate->interp); if (err) { ERROR_NO_POP(); } @@ -199,7 +192,7 @@ dummy_func( op(_LOAD_BYTECODE, (--)) { 
#ifdef Py_GIL_DISABLED - _PySpecializableCode *code = _PyCode_GetSpecializableCode(_PyFrame_GetCode(frame)); + _PyMutBytecode *code = _PyCode_GetSpecializableCode(_PyFrame_GetCode(frame)); if (frame->bytecode != (_Py_CODEUNIT *) code->bytecode) { int off = this_instr - frame->bytecode; frame->bytecode = (_Py_CODEUNIT *) code->bytecode; diff --git a/Python/ceval.c b/Python/ceval.c index b726c24ef20146..9d2fe891c15a20 100644 --- a/Python/ceval.c +++ b/Python/ceval.c @@ -809,7 +809,7 @@ _PyEval_EvalFrameDefault(PyThreadState *tstate, _PyInterpreterFrame *frame, int /* Because this avoids the RESUME, * we need to update instrumentation */ /* TODO(mpage) - Need to handle this */ - _Py_Instrument(_PyFrame_GetCode(frame), _PyFrame_GetBytecode(frame), tstate->interp); + _Py_Instrument(_PyFrame_GetCode(frame), tstate->interp); monitor_throw(tstate, frame, frame->instr_ptr); /* TO DO -- Monitor throw entry. */ goto resume_with_error; diff --git a/Python/generated_cases.c.h b/Python/generated_cases.c.h index fff8ca977d33ac..bc63235ef7f346 100644 --- a/Python/generated_cases.c.h +++ b/Python/generated_cases.c.h @@ -4492,7 +4492,7 @@ // _LOAD_BYTECODE { #ifdef Py_GIL_DISABLED - _PySpecializableCode *code = _PyCode_GetSpecializableCode(_PyFrame_GetCode(frame)); + _PyMutBytecode *code = _PyCode_GetSpecializableCode(_PyFrame_GetCode(frame)); if (frame->bytecode != (_Py_CODEUNIT *) code->bytecode) { int off = this_instr - frame->bytecode; frame->bytecode = (_Py_CODEUNIT *) code->bytecode; @@ -4506,16 +4506,9 @@ { if (tstate->tracing == 0) { uintptr_t global_version = _Py_atomic_load_uintptr_relaxed(&tstate->eval_breaker) & ~_PY_EVAL_EVENTS_MASK; - PyCodeObject *code = _PyFrame_GetCode(frame); - #ifdef Py_GIL_DISABLED - _PySpecializableCode *spec_code = _PyCode_GetSpecializableCode(code); - uintptr_t code_version = spec_code->instrumentation_version; - #else - uintptr_t code_version = FT_ATOMIC_LOAD_UINTPTR_ACQUIRE(code->_co_instrumentation_version); - #endif + uintptr_t 
code_version = FT_ATOMIC_LOAD_UINTPTR_ACQUIRE(_PyFrame_GetCode(frame)->_co_instrumentation_version); if (code_version != global_version) { - _Py_CODEUNIT *bytecode = _PyFrame_GetBytecode(frame); - int err = _Py_Instrument(code, bytecode, tstate->interp); + int err = _Py_Instrument(_PyFrame_GetCode(frame), tstate->interp); if (err) { goto error; } @@ -6451,7 +6444,7 @@ // _LOAD_BYTECODE { #ifdef Py_GIL_DISABLED - _PySpecializableCode *code = _PyCode_GetSpecializableCode(_PyFrame_GetCode(frame)); + _PyMutBytecode *code = _PyCode_GetSpecializableCode(_PyFrame_GetCode(frame)); if (frame->bytecode != (_Py_CODEUNIT *) code->bytecode) { int off = this_instr - frame->bytecode; frame->bytecode = (_Py_CODEUNIT *) code->bytecode; @@ -6465,16 +6458,9 @@ { if (tstate->tracing == 0) { uintptr_t global_version = _Py_atomic_load_uintptr_relaxed(&tstate->eval_breaker) & ~_PY_EVAL_EVENTS_MASK; - PyCodeObject *code = _PyFrame_GetCode(frame); - #ifdef Py_GIL_DISABLED - _PySpecializableCode *spec_code = _PyCode_GetSpecializableCode(code); - uintptr_t code_version = spec_code->instrumentation_version; - #else - uintptr_t code_version = FT_ATOMIC_LOAD_UINTPTR_ACQUIRE(code->_co_instrumentation_version); - #endif + uintptr_t code_version = FT_ATOMIC_LOAD_UINTPTR_ACQUIRE(_PyFrame_GetCode(frame)->_co_instrumentation_version); if (code_version != global_version) { - _Py_CODEUNIT *bytecode = _PyFrame_GetBytecode(frame); - int err = _Py_Instrument(code, bytecode, tstate->interp); + int err = _Py_Instrument(_PyFrame_GetCode(frame), tstate->interp); if (err) { goto error; } diff --git a/Python/instrumentation.c b/Python/instrumentation.c index 1c1f8f1474a9d8..f8b77a2d369b7d 100644 --- a/Python/instrumentation.c +++ b/Python/instrumentation.c @@ -46,10 +46,25 @@ #define UNLOCK_CODE() Py_END_CRITICAL_SECTION() +#define MODIFY_BYTECODE(code, func, args...) 
\ + do { \ + PyCodeObject *co = (code); \ + for (Py_ssize_t i = 0; i < code->co_specialized_code->size; i++) { \ + _PyMutBytecode *mb = co->co_specialized_code->entries[i]; \ + if (mb == NULL) { \ + continue; \ + } \ + PyMutex_LockFlags(&mb->mutex, _Py_LOCK_DONT_DETACH); \ + (func)((_Py_CODEUNIT *) mb->bytecode, args); \ + PyMutex_Unlock(&mb->mutex); \ + } \ + } while (0) + #else #define LOCK_CODE(code) #define UNLOCK_CODE() +#define MODIFY_BYTECODE(code, func, args...) (func)(_PyCode_CODE((code), __VA_ARGS__) #endif @@ -622,7 +637,7 @@ _Py_GetBaseCodeUnit(PyCodeObject *code, int i) } static void -de_instrument(_PyCoMonitoringData *monitoring, _Py_CODEUNIT *bytecode, int i, int event) +de_instrument(_Py_CODEUNIT *bytecode, _PyCoMonitoringData *monitoring, int i, int event) { assert(event != PY_MONITORING_EVENT_INSTRUCTION); assert(event != PY_MONITORING_EVENT_LINE); @@ -652,7 +667,7 @@ de_instrument(_PyCoMonitoringData *monitoring, _Py_CODEUNIT *bytecode, int i, in } static void -de_instrument_line(_PyCoMonitoringData *monitoring, _Py_CODEUNIT *bytecode, int i) +de_instrument_line(_Py_CODEUNIT *bytecode, _PyCoMonitoringData *monitoring, int i) { _Py_CODEUNIT *instr = &bytecode[i]; int opcode = instr->op.code; @@ -674,8 +689,8 @@ de_instrument_line(_PyCoMonitoringData *monitoring, _Py_CODEUNIT *bytecode, int } static void -de_instrument_per_instruction(_PyCoMonitoringData *monitoring, - _Py_CODEUNIT *bytecode, int i) +de_instrument_per_instruction(_Py_CODEUNIT *bytecode, + _PyCoMonitoringData *monitoring, int i) { _Py_CODEUNIT *instr = &bytecode[i]; uint8_t *opcode_ptr = &instr->op.code; @@ -700,7 +715,7 @@ de_instrument_per_instruction(_PyCoMonitoringData *monitoring, static void -instrument(_PyCoMonitoringData *monitoring, _Py_CODEUNIT *bytecode, int i) +instrument(_Py_CODEUNIT *bytecode, _PyCoMonitoringData *monitoring, int i) { _Py_CODEUNIT *instr = &bytecode[i]; uint8_t *opcode_ptr = &instr->op.code; @@ -731,7 +746,7 @@ instrument(_PyCoMonitoringData 
*monitoring, _Py_CODEUNIT *bytecode, int i) } static void -instrument_line(_PyCoMonitoringData *monitoring, _Py_CODEUNIT *bytecode, int i) +instrument_line(_Py_CODEUNIT *bytecode, _PyCoMonitoringData *monitoring, int i) { uint8_t *opcode_ptr = &bytecode[i].op.code; int opcode = *opcode_ptr; @@ -745,8 +760,8 @@ instrument_line(_PyCoMonitoringData *monitoring, _Py_CODEUNIT *bytecode, int i) } static void -instrument_per_instruction(_PyCoMonitoringData *monitoring, - _Py_CODEUNIT *bytecode, int i) +instrument_per_instruction(_Py_CODEUNIT *bytecode, + _PyCoMonitoringData *monitoring, int i) { _Py_CODEUNIT *instr = &bytecode[i]; uint8_t *opcode_ptr = &instr->op.code; @@ -795,17 +810,7 @@ remove_tools(PyCodeObject * code, int offset, int event, int tools) should_de_instrument = ((single_tool & tools) == single_tool); } if (should_de_instrument) { -#ifdef Py_GIL_DISABLED - for (Py_ssize_t i = 0; i < code->co_specialized_code->size; i++) { - _Py_CODEUNIT *bc = (_Py_CODEUNIT *) code->co_specialized_code->entries[i].bytecode; - if (bc == NULL) { - continue; - } - de_instrument(monitoring, bc, offset, event); - } -#else - de_instrument(monitoring, _PyCode_CODE(code), offset, event); -#endif + MODIFY_BYTECODE(code, de_instrument, monitoring, offset, event); } } @@ -840,22 +845,12 @@ remove_line_tools(PyCodeObject * code, int offset, int tools) should_de_instrument = ((single_tool & tools) == single_tool); } if (should_de_instrument) { -#ifdef Py_GIL_DISABLED - for (Py_ssize_t i = 0; i < code->co_specialized_code->size; i++) { - _Py_CODEUNIT *bc = (_Py_CODEUNIT *) code->co_specialized_code->entries[i].bytecode; - if (bc == NULL) { - continue; - } - de_instrument_line(monitoring, bc, offset); - } -#else - de_instrument_line(monitoring, _PyCode_CODE(code), offset); -#endif + MODIFY_BYTECODE(code, de_instrument_line, monitoring, offset); } } static void -add_tools(PyCodeObject * code, _Py_CODEUNIT *bytecode, int offset, int event, int tools) +add_tools(PyCodeObject * code, int 
offset, int event, int tools) { ASSERT_WORLD_STOPPED_OR_LOCKED(code); assert(event != PY_MONITORING_EVENT_LINE); @@ -872,11 +867,11 @@ add_tools(PyCodeObject * code, _Py_CODEUNIT *bytecode, int offset, int event, in assert(_Py_popcount32(tools) == 1); assert(tools_is_subset_for_event(code, event, tools)); } - instrument(code->_co_monitoring, bytecode, offset); + MODIFY_BYTECODE(code, instrument, code->_co_monitoring, offset); } static void -add_line_tools(PyCodeObject * code, _Py_CODEUNIT *bytecode, int offset, int tools) +add_line_tools(PyCodeObject * code, int offset, int tools) { ASSERT_WORLD_STOPPED_OR_LOCKED(code); @@ -889,12 +884,12 @@ add_line_tools(PyCodeObject * code, _Py_CODEUNIT *bytecode, int offset, int tool /* Single tool */ assert(_Py_popcount32(tools) == 1); } - instrument_line(code->_co_monitoring, bytecode, offset); + MODIFY_BYTECODE(code, instrument_line, code->_co_monitoring, offset); } static void -add_per_instruction_tools(PyCodeObject * code, _Py_CODEUNIT *bytecode, int offset, int tools) +add_per_instruction_tools(PyCodeObject * code, int offset, int tools) { ASSERT_WORLD_STOPPED_OR_LOCKED(code); @@ -907,7 +902,7 @@ add_per_instruction_tools(PyCodeObject * code, _Py_CODEUNIT *bytecode, int offse /* Single tool */ assert(_Py_popcount32(tools) == 1); } - instrument_per_instruction(code->_co_monitoring, bytecode, offset); + MODIFY_BYTECODE(code, instrument_per_instruction, code->_co_monitoring, offset); } @@ -931,17 +926,7 @@ remove_per_instruction_tools(PyCodeObject * code, int offset, int tools) should_de_instrument = ((single_tool & tools) == single_tool); } if (should_de_instrument) { -#ifdef Py_GIL_DISABLED - for (Py_ssize_t i = 0; i < code->co_specialized_code->size; i++) { - _Py_CODEUNIT *bc = (_Py_CODEUNIT *) code->co_specialized_code->entries[i].bytecode; - if (bc == NULL) { - continue; - } - de_instrument_per_instruction(monitoring, bc, offset); - } -#else - de_instrument_per_instruction(monitoring, _PyCode_CODE(code), offset); 
-#endif + MODIFY_BYTECODE(code, de_instrument_per_instruction, monitoring, offset); } } @@ -1755,29 +1740,8 @@ update_instrumentation_data(PyCodeObject *code, PyInterpreterState *interp) return 0; } -static void -update_instrumentation_version(PyCodeObject *code, _Py_CODEUNIT *bytecode, - PyInterpreterState *interp) -{ - ASSERT_WORLD_STOPPED_OR_LOCKED(code); - - uint32_t interp_version = global_version(interp); -#ifdef Py_GIL_DISABLED - for (Py_ssize_t i = 0; i < code->co_specialized_code->size; i++) { - _PySpecializableCode *spec_code = &code->co_specialized_code->entries[i]; - if (spec_code->bytecode == (uint8_t *) bytecode) { - spec_code->instrumentation_version = interp_version; - } - } -#else - (void)bytecode; -#endif - FT_ATOMIC_STORE_UINTPTR_RELEASE(code->_co_instrumentation_version, - interp_version); -} - static int -force_instrument_lock_held(PyCodeObject *code, _Py_CODEUNIT *bytecode, PyInterpreterState *interp) +force_instrument_lock_held(PyCodeObject *code, PyInterpreterState *interp) { ASSERT_WORLD_STOPPED_OR_LOCKED(code); @@ -1839,7 +1803,7 @@ force_instrument_lock_held(PyCodeObject *code, _Py_CODEUNIT *bytecode, PyInterpr } uint8_t new_tools = new_events.tools[event]; if (new_tools) { - add_tools(code, bytecode, i, event, new_tools); + add_tools(code, i, event, new_tools); } } } @@ -1880,7 +1844,7 @@ force_instrument_lock_held(PyCodeObject *code, _Py_CODEUNIT *bytecode, PyInterpr _PyCoLineInstrumentationData *line_data = code->_co_monitoring->lines; for (int i = code->_co_firsttraceable; i < code_len;) { if (line_data[i].original_opcode) { - add_line_tools(code, bytecode, i, new_line_tools); + add_line_tools(code, i, new_line_tools); } i += _PyInstruction_GetLength(code, i); } @@ -1892,13 +1856,14 @@ force_instrument_lock_held(PyCodeObject *code, _Py_CODEUNIT *bytecode, PyInterpr i += _PyInstruction_GetLength(code, i); continue; } - add_per_instruction_tools(code, bytecode, i, new_per_instruction_tools); + add_per_instruction_tools(code, i, 
new_per_instruction_tools); i += _PyInstruction_GetLength(code, i); } } done: - update_instrumentation_version(code, bytecode, interp); + FT_ATOMIC_STORE_UINTPTR_RELEASE(code->_co_instrumentation_version, + global_version(interp)); #ifdef INSTRUMENT_DEBUG sanity_check_instrumentation(code); @@ -1907,11 +1872,11 @@ force_instrument_lock_held(PyCodeObject *code, _Py_CODEUNIT *bytecode, PyInterpr } static int -instrument_lock_held(PyCodeObject *code, _Py_CODEUNIT *bytecode, PyInterpreterState *interp) +instrument_lock_held(PyCodeObject *code, PyInterpreterState *interp) { ASSERT_WORLD_STOPPED_OR_LOCKED(code); - if (0 && is_version_up_to_date(code, interp)) { + if (is_version_up_to_date(code, interp)) { assert( interp->ceval.instrumentation_version == 0 || instrumentation_cross_checks(interp, code) @@ -1919,15 +1884,15 @@ instrument_lock_held(PyCodeObject *code, _Py_CODEUNIT *bytecode, PyInterpreterSt return 0; } - return force_instrument_lock_held(code, bytecode, interp); + return force_instrument_lock_held(code, interp); } int -_Py_Instrument(PyCodeObject *code, _Py_CODEUNIT *bytecode, PyInterpreterState *interp) +_Py_Instrument(PyCodeObject *code, PyInterpreterState *interp) { int res; LOCK_CODE(code); - res = instrument_lock_held(code, bytecode, interp); + res = instrument_lock_held(code, interp); UNLOCK_CODE(); return res; } @@ -1952,9 +1917,7 @@ instrument_all_executing_code_objects(PyInterpreterState *interp) { _PyInterpreterFrame *frame = ts->current_frame; while (frame) { if (frame->owner != FRAME_OWNED_BY_CSTACK) { - PyCodeObject *code = _PyFrame_GetCode(frame); - _Py_CODEUNIT *bytecode = _PyFrame_GetBytecode(frame); - if (instrument_lock_held(code, bytecode, interp)) { + if (instrument_lock_held(_PyFrame_GetCode(frame), interp)) { return -1; } } @@ -2071,20 +2034,7 @@ _PyMonitoring_SetLocalEvents(PyCodeObject *code, int tool_id, _PyMonitoringEvent } set_local_events(local, tool_id, events); -#ifdef Py_GIL_DISABLED - res = 0; - for (Py_ssize_t i = 0; i < 
code->co_specialized_code->size; i++) { - _Py_CODEUNIT *bytecode = (_Py_CODEUNIT *) code->co_specialized_code->entries[i].bytecode; - if (bytecode != NULL) { - res = force_instrument_lock_held(code, bytecode, interp); - if (res < 0) { - goto done; - } - } - } -#else - res = force_instrument_lock_held(code, _PyCode_CODE(code), interp); -#endif + res = force_instrument_lock_held(code, interp); done: _PyEval_StartTheWorld(interp); From 552277d8db532b1475fd5fa1396d72493515e9ee Mon Sep 17 00:00:00 2001 From: Matt Page Date: Tue, 3 Sep 2024 15:47:31 -0700 Subject: [PATCH 08/67] Add ifdef guards for each specialization family --- Include/internal/pycore_code.h | 30 +++++++++++++++++ Lib/test/support/__init__.py | 14 ++++++++ Lib/test/test_dis.py | 18 ++++++---- Lib/test/test_monitoring.py | 4 +-- Lib/test/test_opcache.py | 25 +++++++++++--- Lib/test/test_type_cache.py | 9 +++-- Modules/_opcode.c | 29 ++++++++++++++-- Python/bytecodes.c | 60 +++++++++++++++++----------------- Python/generated_cases.c.h | 60 +++++++++++++++++----------------- Python/specialize.c | 28 ++++++++-------- 10 files changed, 185 insertions(+), 92 deletions(-) diff --git a/Include/internal/pycore_code.h b/Include/internal/pycore_code.h index 2c08cba59fe27e..72e5b8863f7a5c 100644 --- a/Include/internal/pycore_code.h +++ b/Include/internal/pycore_code.h @@ -317,8 +317,38 @@ extern void _PyCode_Clear_Executors(PyCodeObject *code); #ifdef Py_GIL_DISABLED // gh-115999 tracks progress on addressing this. 
#define ENABLE_SPECIALIZATION 0 +#define ENABLE_SPECIALIZED_BINARY_OP 0 && ENABLE_SPECIALIZATION +#define ENABLE_SPECIALIZED_BINARY_SUBSCR 0 && ENABLE_SPECIALIZATION +#define ENABLE_SPECIALIZED_CALL 0 && ENABLE_SPECIALIZATION +#define ENABLE_SPECIALIZED_CALL_KW 0 && ENABLE_SPECIALIZATION +#define ENABLE_SPECIALIZED_COMPARE_OP 0 && ENABLE_SPECIALIZATION +#define ENABLE_SPECIALIZED_CONTAINS_OP 0 && ENABLE_SPECIALIZATION +#define ENABLE_SPECIALIZED_FOR_ITER 0 && ENABLE_SPECIALIZATION +#define ENABLE_SPECIALIZED_LOAD_ATTR 0 && ENABLE_SPECIALIZATION +#define ENABLE_SPECIALIZED_LOAD_GLOBAL 0 && ENABLE_SPECIALIZATION +#define ENABLE_SPECIALIZED_LOAD_SUPER_ATTR 0 && ENABLE_SPECIALIZATION +#define ENABLE_SPECIALIZED_SEND 0 && ENABLE_SPECIALIZATION +#define ENABLE_SPECIALIZED_STORE_ATTR 0 && ENABLE_SPECIALIZATION +#define ENABLE_SPECIALIZED_STORE_SUBSCR 0 && ENABLE_SPECIALIZATION +#define ENABLE_SPECIALIZED_TO_BOOL 0 && ENABLE_SPECIALIZATION +#define ENABLE_SPECIALIZED_UNPACK_SEQUENCE 0 && ENABLE_SPECIALIZATION #else #define ENABLE_SPECIALIZATION 1 +#define ENABLE_SPECIALIZED_BINARY_OP ENABLE_SPECIALIZATION +#define ENABLE_SPECIALIZED_BINARY_SUBSCR ENABLE_SPECIALIZATION +#define ENABLE_SPECIALIZED_CALL ENABLE_SPECIALIZATION +#define ENABLE_SPECIALIZED_CALL_KW ENABLE_SPECIALIZATION +#define ENABLE_SPECIALIZED_COMPARE_OP ENABLE_SPECIALIZATION +#define ENABLE_SPECIALIZED_CONTAINS_OP ENABLE_SPECIALIZATION +#define ENABLE_SPECIALIZED_FOR_ITER ENABLE_SPECIALIZATION +#define ENABLE_SPECIALIZED_LOAD_ATTR ENABLE_SPECIALIZATION +#define ENABLE_SPECIALIZED_LOAD_GLOBAL ENABLE_SPECIALIZATION +#define ENABLE_SPECIALIZED_LOAD_SUPER_ATTR ENABLE_SPECIALIZATION +#define ENABLE_SPECIALIZED_SEND ENABLE_SPECIALIZATION +#define ENABLE_SPECIALIZED_STORE_ATTR ENABLE_SPECIALIZATION +#define ENABLE_SPECIALIZED_STORE_SUBSCR ENABLE_SPECIALIZATION +#define ENABLE_SPECIALIZED_TO_BOOL ENABLE_SPECIALIZATION +#define ENABLE_SPECIALIZED_UNPACK_SEQUENCE ENABLE_SPECIALIZATION #endif /* Specialization functions 
*/ diff --git a/Lib/test/support/__init__.py b/Lib/test/support/__init__.py index dbf479dddff7b3..722cd0c96ad3c6 100644 --- a/Lib/test/support/__init__.py +++ b/Lib/test/support/__init__.py @@ -1256,6 +1256,20 @@ def requires_specialization(test): _opcode.ENABLE_SPECIALIZATION, "requires specialization")(test) +def requires_specialization_of(*ops): + missing_ops = [] + is_enabled = True + for op in ops: + is_op_specialized = getattr(_opcode, f"ENABLE_SPECIALIZED_{op}") + if not is_op_specialized: + missing_ops.append(op) + is_enabled = is_enabled and is_op_specialized + reason = f"requires specialized {', '.join(missing_ops)}" + def f(test): + return unittest.skipUnless(is_enabled, reason)(test) + return f + + #======================================================================= # Check for the presence of docstrings. diff --git a/Lib/test/test_dis.py b/Lib/test/test_dis.py index bccd2182412577..96084456579217 100644 --- a/Lib/test/test_dis.py +++ b/Lib/test/test_dis.py @@ -10,7 +10,8 @@ import types import unittest from test.support import (captured_stdout, requires_debug_ranges, - requires_specialization, cpython_only) + requires_specialization, + requires_specialization_of, cpython_only) from test.support.bytecode_helper import BytecodeTestCase import opcode @@ -1212,8 +1213,8 @@ def test_super_instructions(self): self.do_disassembly_compare(got, dis_load_test_quickened_code) @cpython_only - @requires_specialization - def test_binary_specialize(self): + @requires_specialization_of("BINARY_OP") + def test_binary_op_specialize(self): binary_op_quicken = """\ 0 RESUME_CHECK 0 @@ -1232,6 +1233,9 @@ def test_binary_specialize(self): got = self.get_disassembly(co_unicode, adaptive=True) self.do_disassembly_compare(got, binary_op_quicken % "BINARY_OP_ADD_UNICODE 0 (+)") + @cpython_only + @requires_specialization_of("BINARY_SUBSCR") + def test_binary_subscr_specialize(self): binary_subscr_quicken = """\ 0 RESUME_CHECK 0 @@ -1251,7 +1255,7 @@ def 
test_binary_specialize(self): self.do_disassembly_compare(got, binary_subscr_quicken % "BINARY_SUBSCR_DICT") @cpython_only - @requires_specialization + @requires_specialization_of("LOAD_ATTR") def test_load_attr_specialize(self): load_attr_quicken = """\ 0 RESUME_CHECK 0 @@ -1266,7 +1270,7 @@ def test_load_attr_specialize(self): self.do_disassembly_compare(got, load_attr_quicken) @cpython_only - @requires_specialization + @requires_specialization_of("CALL") def test_call_specialize(self): call_quicken = """\ 0 RESUME_CHECK 0 @@ -1283,7 +1287,7 @@ def test_call_specialize(self): self.do_disassembly_compare(got, call_quicken) @cpython_only - @requires_specialization + @requires_specialization_of("FOR_ITER", "LOAD_GLOBAL") def test_loop_quicken(self): # Loop can trigger a quicken where the loop is located self.code_quicken(loop_test, 4) @@ -1292,7 +1296,7 @@ def test_loop_quicken(self): self.do_disassembly_compare(got, expected) @cpython_only - @requires_specialization + @requires_specialization_of("COMPARE_OP", "FOR_ITER") def test_loop_with_conditional_at_end_is_quickened(self): def for_loop_true(x): for i in range(10): diff --git a/Lib/test/test_monitoring.py b/Lib/test/test_monitoring.py index 351f1067c10343..72a9617236278b 100644 --- a/Lib/test/test_monitoring.py +++ b/Lib/test/test_monitoring.py @@ -11,7 +11,7 @@ import unittest import test.support -from test.support import requires_specialization, script_helper +from test.support import requires_specialization_of, script_helper from test.support.import_helper import import_module _testcapi = test.support.import_helper.import_module("_testcapi") @@ -1006,7 +1006,7 @@ def func(): ) self.assertEqual(events[0], ("throw", IndexError)) - @requires_specialization + @requires_specialization_of("CALL") def test_no_unwind_for_shim_frame(self): class B: diff --git a/Lib/test/test_opcache.py b/Lib/test/test_opcache.py index acf8158b0d0ea1..6674ab9d0b5d96 100644 --- a/Lib/test/test_opcache.py +++ b/Lib/test/test_opcache.py 
@@ -4,7 +4,7 @@ import threading import types import unittest -from test.support import threading_helper, check_impl_detail, requires_specialization +from test.support import threading_helper, check_impl_detail, requires_specialization_of from test.support.import_helper import import_module # Skip this module on other interpreters, it is cpython specific: @@ -515,7 +515,7 @@ def f(x, y): f() @disabling_optimizer - @requires_specialization + @requires_specialization_of("CALL") def test_assign_init_code(self): class MyClass: def __init__(self): @@ -539,7 +539,6 @@ def count_args(self, *args): @threading_helper.requires_working_threading() -@requires_specialization class TestRacesDoNotCrash(TestBase): # Careful with these. Bigger numbers have a higher chance of catching bugs, # but you can also burn through a *ton* of type/dict/function versions: @@ -581,6 +580,7 @@ def assert_races_do_not_crash( for writer in writers: writer.join() + @requires_specialization_of("BINARY_SUBSCR") def test_binary_subscr_getitem(self): def get_items(): class C: @@ -610,6 +610,7 @@ def write(items): opname = "BINARY_SUBSCR_GETITEM" self.assert_races_do_not_crash(opname, get_items, read, write) + @requires_specialization_of("BINARY_SUBSCR") def test_binary_subscr_list_int(self): def get_items(): items = [] @@ -633,6 +634,7 @@ def write(items): opname = "BINARY_SUBSCR_LIST_INT" self.assert_races_do_not_crash(opname, get_items, read, write) + @requires_specialization_of("FOR_ITER") def test_for_iter_gen(self): def get_items(): def g(): @@ -664,6 +666,7 @@ def write(items): opname = "FOR_ITER_GEN" self.assert_races_do_not_crash(opname, get_items, read, write) + @requires_specialization_of("FOR_ITER") def test_for_iter_list(self): def get_items(): items = [] @@ -685,6 +688,7 @@ def write(items): opname = "FOR_ITER_LIST" self.assert_races_do_not_crash(opname, get_items, read, write) + @requires_specialization_of("LOAD_ATTR") def test_load_attr_class(self): def get_items(): class C: @@ -714,6 
+718,7 @@ def write(items): opname = "LOAD_ATTR_CLASS" self.assert_races_do_not_crash(opname, get_items, read, write) + @requires_specialization_of("LOAD_ATTR") def test_load_attr_getattribute_overridden(self): def get_items(): class C: @@ -743,6 +748,7 @@ def write(items): opname = "LOAD_ATTR_GETATTRIBUTE_OVERRIDDEN" self.assert_races_do_not_crash(opname, get_items, read, write) + @requires_specialization_of("LOAD_ATTR") def test_load_attr_instance_value(self): def get_items(): class C: @@ -766,6 +772,7 @@ def write(items): opname = "LOAD_ATTR_INSTANCE_VALUE" self.assert_races_do_not_crash(opname, get_items, read, write) + @requires_specialization_of("LOAD_ATTR") def test_load_attr_method_lazy_dict(self): def get_items(): class C(Exception): @@ -795,6 +802,7 @@ def write(items): opname = "LOAD_ATTR_METHOD_LAZY_DICT" self.assert_races_do_not_crash(opname, get_items, read, write) + @requires_specialization_of("LOAD_ATTR") def test_load_attr_method_no_dict(self): def get_items(): class C: @@ -825,6 +833,7 @@ def write(items): opname = "LOAD_ATTR_METHOD_NO_DICT" self.assert_races_do_not_crash(opname, get_items, read, write) + @requires_specialization_of("LOAD_ATTR") def test_load_attr_method_with_values(self): def get_items(): class C: @@ -854,6 +863,7 @@ def write(items): opname = "LOAD_ATTR_METHOD_WITH_VALUES" self.assert_races_do_not_crash(opname, get_items, read, write) + @requires_specialization_of("LOAD_ATTR") def test_load_attr_module(self): def get_items(): items = [] @@ -878,6 +888,7 @@ def write(items): opname = "LOAD_ATTR_MODULE" self.assert_races_do_not_crash(opname, get_items, read, write) + @requires_specialization_of("LOAD_ATTR") def test_load_attr_property(self): def get_items(): class C: @@ -907,6 +918,7 @@ def write(items): opname = "LOAD_ATTR_PROPERTY" self.assert_races_do_not_crash(opname, get_items, read, write) + @requires_specialization_of("LOAD_ATTR") def test_load_attr_with_hint(self): def get_items(): class C: @@ -933,6 +945,7 @@ def 
write(items): opname = "LOAD_ATTR_WITH_HINT" self.assert_races_do_not_crash(opname, get_items, read, write) + @requires_specialization_of("LOAD_GLOBAL") def test_load_global_module(self): def get_items(): items = [] @@ -954,6 +967,7 @@ def write(items): opname, get_items, read, write, check_items=True ) + @requires_specialization_of("STORE_ATTR") def test_store_attr_instance_value(self): def get_items(): class C: @@ -976,6 +990,7 @@ def write(items): opname = "STORE_ATTR_INSTANCE_VALUE" self.assert_races_do_not_crash(opname, get_items, read, write) + @requires_specialization_of("STORE_ATTR") def test_store_attr_with_hint(self): def get_items(): class C: @@ -1001,6 +1016,7 @@ def write(items): opname = "STORE_ATTR_WITH_HINT" self.assert_races_do_not_crash(opname, get_items, read, write) + @requires_specialization_of("STORE_SUBSCR") def test_store_subscr_list_int(self): def get_items(): items = [] @@ -1024,6 +1040,7 @@ def write(items): opname = "STORE_SUBSCR_LIST_INT" self.assert_races_do_not_crash(opname, get_items, read, write) + @requires_specialization_of("UNPACK_SEQUENCE") def test_unpack_sequence_list(self): def get_items(): items = [] @@ -1050,7 +1067,7 @@ def write(items): class C: pass -@requires_specialization +@requires_specialization_of("LOAD_ATTR") class TestInstanceDict(unittest.TestCase): def setUp(self): diff --git a/Lib/test/test_type_cache.py b/Lib/test/test_type_cache.py index 66abe73f8d766d..1d431cbecfeec1 100644 --- a/Lib/test/test_type_cache.py +++ b/Lib/test/test_type_cache.py @@ -2,7 +2,7 @@ import unittest import dis from test import support -from test.support import import_helper, requires_specialization +from test.support import import_helper, requires_specialization_of try: from sys import _clear_type_cache except ImportError: @@ -110,7 +110,6 @@ class HolderSub(Holder): HolderSub.value @support.cpython_only -@requires_specialization class TypeCacheWithSpecializationTests(unittest.TestCase): def tearDown(self): _clear_type_cache() @@ 
-140,6 +139,7 @@ def _check_specialization(self, func, arg, opname, *, should_specialize): else: self.assertIn(opname, self._all_opnames(func)) + @requires_specialization_of("LOAD_ATTR") def test_class_load_attr_specialization_user_type(self): class A: def foo(self): @@ -160,6 +160,7 @@ def load_foo_2(type_): self._check_specialization(load_foo_2, A, "LOAD_ATTR", should_specialize=False) + @requires_specialization_of("LOAD_ATTR") def test_class_load_attr_specialization_static_type(self): self.assertNotEqual(type_get_version(str), 0) self.assertNotEqual(type_get_version(bytes), 0) @@ -171,6 +172,7 @@ def get_capitalize_1(type_): self.assertEqual(get_capitalize_1(str)('hello'), 'Hello') self.assertEqual(get_capitalize_1(bytes)(b'hello'), b'Hello') + @requires_specialization_of("LOAD_ATTR") def test_property_load_attr_specialization_user_type(self): class G: @property @@ -192,6 +194,7 @@ def load_x_2(instance): self._check_specialization(load_x_2, G(), "LOAD_ATTR", should_specialize=False) + @requires_specialization_of("STORE_ATTR") def test_store_attr_specialization_user_type(self): class B: __slots__ = ("bar",) @@ -211,6 +214,7 @@ def store_bar_2(type_): self._check_specialization(store_bar_2, B(), "STORE_ATTR", should_specialize=False) + @requires_specialization_of("CALL") def test_class_call_specialization_user_type(self): class F: def __init__(self): @@ -231,6 +235,7 @@ def call_class_2(type_): self._check_specialization(call_class_2, F, "CALL", should_specialize=False) + @requires_specialization_of("TO_BOOL") def test_to_bool_specialization_user_type(self): class H: pass diff --git a/Modules/_opcode.c b/Modules/_opcode.c index dc93063aee7e54..35b40c19367e91 100644 --- a/Modules/_opcode.c +++ b/Modules/_opcode.c @@ -417,11 +417,34 @@ opcode_functions[] = { {NULL, NULL, 0, NULL} }; + static int _opcode_exec(PyObject *m) { - if (PyModule_AddIntMacro(m, ENABLE_SPECIALIZATION) < 0) { - return -1; - } +#define ADD(X) \ + do { \ + if (PyModule_AddIntConstant(m, #X, 
(X)) < 0) { \ + return -1; \ + } \ + } while (0) + + ADD(ENABLE_SPECIALIZATION); + ADD(ENABLE_SPECIALIZED_BINARY_OP); + ADD(ENABLE_SPECIALIZED_BINARY_SUBSCR); + ADD(ENABLE_SPECIALIZED_CALL); + ADD(ENABLE_SPECIALIZED_CALL_KW); + ADD(ENABLE_SPECIALIZED_COMPARE_OP); + ADD(ENABLE_SPECIALIZED_CONTAINS_OP); + ADD(ENABLE_SPECIALIZED_FOR_ITER); + ADD(ENABLE_SPECIALIZED_LOAD_ATTR); + ADD(ENABLE_SPECIALIZED_LOAD_GLOBAL); + ADD(ENABLE_SPECIALIZED_LOAD_SUPER_ATTR); + ADD(ENABLE_SPECIALIZED_SEND); + ADD(ENABLE_SPECIALIZED_STORE_ATTR); + ADD(ENABLE_SPECIALIZED_STORE_SUBSCR); + ADD(ENABLE_SPECIALIZED_TO_BOOL); + ADD(ENABLE_SPECIALIZED_UNPACK_SEQUENCE); + +#undef ADD return 0; } diff --git a/Python/bytecodes.c b/Python/bytecodes.c index 9ec07d335d0c1b..5a71d549f1e810 100644 --- a/Python/bytecodes.c +++ b/Python/bytecodes.c @@ -360,7 +360,7 @@ dummy_func( }; specializing op(_SPECIALIZE_TO_BOOL, (counter/1, value -- value)) { - #if ENABLE_SPECIALIZATION + #if ENABLE_SPECIALIZED_TO_BOOL if (ADAPTIVE_COUNTER_TRIGGERS(counter)) { next_instr = this_instr; _Py_Specialize_ToBool(value, next_instr); @@ -368,7 +368,7 @@ dummy_func( } OPCODE_DEFERRED_INC(TO_BOOL); ADVANCE_ADAPTIVE_COUNTER(this_instr[1].counter); - #endif /* ENABLE_SPECIALIZATION */ + #endif /* ENABLE_SPECIALIZED_TO_BOOL */ } op(_TO_BOOL, (value -- res)) { @@ -661,7 +661,7 @@ dummy_func( }; specializing op(_SPECIALIZE_BINARY_SUBSCR, (counter/1, container, sub -- container, sub)) { - #if ENABLE_SPECIALIZATION + #if ENABLE_SPECIALIZED_BINARY_SUBSCR if (ADAPTIVE_COUNTER_TRIGGERS(counter)) { next_instr = this_instr; _Py_Specialize_BinarySubscr(container, sub, next_instr); @@ -669,7 +669,7 @@ dummy_func( } OPCODE_DEFERRED_INC(BINARY_SUBSCR); ADVANCE_ADAPTIVE_COUNTER(this_instr[1].counter); - #endif /* ENABLE_SPECIALIZATION */ + #endif /* ENABLE_SPECIALIZED_BINARY_SUBSCR */ } op(_BINARY_SUBSCR, (container, sub -- res)) { @@ -863,7 +863,7 @@ dummy_func( }; specializing op(_SPECIALIZE_STORE_SUBSCR, (counter/1, container, sub -- 
container, sub)) { - #if ENABLE_SPECIALIZATION + #if ENABLE_SPECIALIZED_STORE_SUBSCR if (ADAPTIVE_COUNTER_TRIGGERS(counter)) { next_instr = this_instr; _Py_Specialize_StoreSubscr(container, sub, next_instr); @@ -871,7 +871,7 @@ dummy_func( } OPCODE_DEFERRED_INC(STORE_SUBSCR); ADVANCE_ADAPTIVE_COUNTER(this_instr[1].counter); - #endif /* ENABLE_SPECIALIZATION */ + #endif /* ENABLE_SPECIALIZED_STORE_SUBSCR */ } op(_STORE_SUBSCR, (v, container, sub -- )) { @@ -1076,7 +1076,7 @@ dummy_func( }; specializing op(_SPECIALIZE_SEND, (counter/1, receiver, unused -- receiver, unused)) { - #if ENABLE_SPECIALIZATION + #if ENABLE_SPECIALIZED_SEND if (ADAPTIVE_COUNTER_TRIGGERS(counter)) { next_instr = this_instr; _Py_Specialize_Send(receiver, next_instr); @@ -1084,7 +1084,7 @@ dummy_func( } OPCODE_DEFERRED_INC(SEND); ADVANCE_ADAPTIVE_COUNTER(this_instr[1].counter); - #endif /* ENABLE_SPECIALIZATION */ + #endif /* ENABLE_SPECIALIZED_SEND */ } op(_SEND, (receiver, v -- receiver, retval)) { @@ -1342,7 +1342,7 @@ dummy_func( }; specializing op(_SPECIALIZE_UNPACK_SEQUENCE, (counter/1, seq -- seq)) { - #if ENABLE_SPECIALIZATION + #if ENABLE_SPECIALIZED_UNPACK_SEQUENCE if (ADAPTIVE_COUNTER_TRIGGERS(counter)) { next_instr = this_instr; _Py_Specialize_UnpackSequence(seq, next_instr, oparg); @@ -1350,7 +1350,7 @@ dummy_func( } OPCODE_DEFERRED_INC(UNPACK_SEQUENCE); ADVANCE_ADAPTIVE_COUNTER(this_instr[1].counter); - #endif /* ENABLE_SPECIALIZATION */ + #endif /* ENABLE_SPECIALIZED_UNPACK_SEQUENCE */ (void)seq; (void)counter; } @@ -1413,7 +1413,7 @@ dummy_func( }; specializing op(_SPECIALIZE_STORE_ATTR, (counter/1, owner -- owner)) { - #if ENABLE_SPECIALIZATION + #if ENABLE_SPECIALIZED_STORE_ATTR if (ADAPTIVE_COUNTER_TRIGGERS(counter)) { PyObject *name = GETITEM(FRAME_CO_NAMES, oparg); next_instr = this_instr; @@ -1422,7 +1422,7 @@ dummy_func( } OPCODE_DEFERRED_INC(STORE_ATTR); ADVANCE_ADAPTIVE_COUNTER(this_instr[1].counter); - #endif /* ENABLE_SPECIALIZATION */ + #endif /* 
ENABLE_SPECIALIZED_STORE_ATTR */ } op(_STORE_ATTR, (v, owner --)) { @@ -1530,7 +1530,7 @@ dummy_func( }; specializing op(_SPECIALIZE_LOAD_GLOBAL, (counter/1 -- )) { - #if ENABLE_SPECIALIZATION + #if ENABLE_SPECIALIZED_LOAD_GLOBAL if (ADAPTIVE_COUNTER_TRIGGERS(counter)) { PyObject *name = GETITEM(FRAME_CO_NAMES, oparg>>1); next_instr = this_instr; @@ -1539,7 +1539,7 @@ dummy_func( } OPCODE_DEFERRED_INC(LOAD_GLOBAL); ADVANCE_ADAPTIVE_COUNTER(this_instr[1].counter); - #endif /* ENABLE_SPECIALIZATION */ + #endif /* ENABLE_SPECIALIZED_LOAD_GLOBAL */ } op(_LOAD_GLOBAL, ( -- res, null if (oparg & 1))) { @@ -1863,7 +1863,7 @@ dummy_func( }; specializing op(_SPECIALIZE_LOAD_SUPER_ATTR, (counter/1, global_super_st, class_st, unused -- global_super_st, class_st, unused)) { - #if ENABLE_SPECIALIZATION + #if ENABLE_SPECIALIZED_LOAD_SUPER_ATTR int load_method = oparg & 1; if (ADAPTIVE_COUNTER_TRIGGERS(counter)) { next_instr = this_instr; @@ -1872,7 +1872,7 @@ dummy_func( } OPCODE_DEFERRED_INC(LOAD_SUPER_ATTR); ADVANCE_ADAPTIVE_COUNTER(this_instr[1].counter); - #endif /* ENABLE_SPECIALIZATION */ + #endif /* ENABLE_SPECIALIZED_LOAD_SUPER_ATTR */ } tier1 op(_LOAD_SUPER_ATTR, (global_super_st, class_st, self_st -- attr, null if (oparg & 1))) { @@ -1981,7 +1981,7 @@ dummy_func( }; specializing op(_SPECIALIZE_LOAD_ATTR, (counter/1, owner -- owner)) { - #if ENABLE_SPECIALIZATION + #if ENABLE_SPECIALIZED_LOAD_ATTR if (ADAPTIVE_COUNTER_TRIGGERS(counter)) { PyObject *name = GETITEM(FRAME_CO_NAMES, oparg>>1); next_instr = this_instr; @@ -1990,7 +1990,7 @@ dummy_func( } OPCODE_DEFERRED_INC(LOAD_ATTR); ADVANCE_ADAPTIVE_COUNTER(this_instr[1].counter); - #endif /* ENABLE_SPECIALIZATION */ + #endif /* ENABLE_SPECIALIZED_LOAD_ATTR */ } op(_LOAD_ATTR, (owner -- attr, self_or_null if (oparg & 1))) { @@ -2319,7 +2319,7 @@ dummy_func( }; specializing op(_SPECIALIZE_COMPARE_OP, (counter/1, left, right -- left, right)) { - #if ENABLE_SPECIALIZATION + #if ENABLE_SPECIALIZED_COMPARE_OP if 
(ADAPTIVE_COUNTER_TRIGGERS(counter)) { next_instr = this_instr; _Py_Specialize_CompareOp(left, right, next_instr, oparg); @@ -2327,7 +2327,7 @@ dummy_func( } OPCODE_DEFERRED_INC(COMPARE_OP); ADVANCE_ADAPTIVE_COUNTER(this_instr[1].counter); - #endif /* ENABLE_SPECIALIZATION */ + #endif /* ENABLE_SPECIALIZED_COMPARE_OP */ } op(_COMPARE_OP, (left, right -- res)) { @@ -2440,7 +2440,7 @@ dummy_func( } specializing op(_SPECIALIZE_CONTAINS_OP, (counter/1, left, right -- left, right)) { - #if ENABLE_SPECIALIZATION + #if ENABLE_SPECIALIZED_CONTAINS_OP if (ADAPTIVE_COUNTER_TRIGGERS(counter)) { next_instr = this_instr; _Py_Specialize_ContainsOp(right, next_instr); @@ -2448,7 +2448,7 @@ dummy_func( } OPCODE_DEFERRED_INC(CONTAINS_OP); ADVANCE_ADAPTIVE_COUNTER(this_instr[1].counter); - #endif /* ENABLE_SPECIALIZATION */ + #endif /* ENABLE_SPECIALIZED_CONTAINS_OP */ } macro(CONTAINS_OP) = _SPECIALIZE_CONTAINS_OP + _CONTAINS_OP; @@ -2755,7 +2755,7 @@ dummy_func( }; specializing op(_SPECIALIZE_FOR_ITER, (counter/1, iter -- iter)) { - #if ENABLE_SPECIALIZATION + #if ENABLE_SPECIALIZED_FOR_ITER if (ADAPTIVE_COUNTER_TRIGGERS(counter)) { next_instr = this_instr; _Py_Specialize_ForIter(iter, next_instr, oparg); @@ -2763,7 +2763,7 @@ dummy_func( } OPCODE_DEFERRED_INC(FOR_ITER); ADVANCE_ADAPTIVE_COUNTER(this_instr[1].counter); - #endif /* ENABLE_SPECIALIZATION */ + #endif /* ENABLE_SPECIALIZED_FOR_ITER */ } replaced op(_FOR_ITER, (iter -- iter, next)) { @@ -3224,7 +3224,7 @@ dummy_func( }; specializing op(_SPECIALIZE_CALL, (counter/1, callable, self_or_null, args[oparg] -- callable, self_or_null, args[oparg])) { - #if ENABLE_SPECIALIZATION + #if ENABLE_SPECIALIZED_CALL if (ADAPTIVE_COUNTER_TRIGGERS(counter)) { next_instr = this_instr; _Py_Specialize_Call(callable, next_instr, oparg + !PyStackRef_IsNull(self_or_null)); @@ -3232,7 +3232,7 @@ dummy_func( } OPCODE_DEFERRED_INC(CALL); ADVANCE_ADAPTIVE_COUNTER(this_instr[1].counter); - #endif /* ENABLE_SPECIALIZATION */ + #endif /* 
ENABLE_SPECIALIZED_CALL */ } op(_MAYBE_EXPAND_METHOD, (callable, self_or_null, args[oparg] -- func, maybe_self, args[oparg])) { @@ -4231,7 +4231,7 @@ dummy_func( _PUSH_FRAME; specializing op(_SPECIALIZE_CALL_KW, (counter/1, callable, self_or_null, args[oparg], kwnames -- callable, self_or_null, args[oparg], kwnames)) { - #if ENABLE_SPECIALIZATION + #if ENABLE_SPECIALIZED_CALL_KW if (ADAPTIVE_COUNTER_TRIGGERS(counter)) { next_instr = this_instr; _Py_Specialize_CallKw(callable, next_instr, oparg + !PyStackRef_IsNull(self_or_null)); @@ -4239,7 +4239,7 @@ dummy_func( } OPCODE_DEFERRED_INC(CALL_KW); ADVANCE_ADAPTIVE_COUNTER(this_instr[1].counter); - #endif /* ENABLE_SPECIALIZATION */ + #endif /* ENABLE_SPECIALIZED_CALL_KW */ } macro(CALL_KW) = @@ -4506,7 +4506,7 @@ dummy_func( } specializing op(_SPECIALIZE_BINARY_OP, (counter/1, lhs, rhs -- lhs, rhs)) { - #if ENABLE_SPECIALIZATION + #if ENABLE_SPECIALIZED_BINARY_OP if (ADAPTIVE_COUNTER_TRIGGERS(counter)) { next_instr = this_instr; _Py_Specialize_BinaryOp(lhs, rhs, next_instr, oparg, LOCALS_ARRAY); @@ -4514,7 +4514,7 @@ dummy_func( } OPCODE_DEFERRED_INC(BINARY_OP); ADVANCE_ADAPTIVE_COUNTER(this_instr[1].counter); - #endif /* ENABLE_SPECIALIZATION */ + #endif /* ENABLE_SPECIALIZED_BINARY_OP */ assert(NB_ADD <= oparg); assert(oparg <= NB_INPLACE_XOR); } diff --git a/Python/generated_cases.c.h b/Python/generated_cases.c.h index bc63235ef7f346..370ece70c3cbe9 100644 --- a/Python/generated_cases.c.h +++ b/Python/generated_cases.c.h @@ -25,7 +25,7 @@ { uint16_t counter = read_u16(&this_instr[1].cache); (void)counter; - #if ENABLE_SPECIALIZATION + #if ENABLE_SPECIALIZED_BINARY_OP if (ADAPTIVE_COUNTER_TRIGGERS(counter)) { next_instr = this_instr; _Py_Specialize_BinaryOp(lhs, rhs, next_instr, oparg, LOCALS_ARRAY); @@ -33,7 +33,7 @@ } OPCODE_DEFERRED_INC(BINARY_OP); ADVANCE_ADAPTIVE_COUNTER(this_instr[1].counter); - #endif /* ENABLE_SPECIALIZATION */ + #endif /* ENABLE_SPECIALIZED_BINARY_OP */ assert(NB_ADD <= oparg); assert(oparg 
<= NB_INPLACE_XOR); } @@ -421,7 +421,7 @@ { uint16_t counter = read_u16(&this_instr[1].cache); (void)counter; - #if ENABLE_SPECIALIZATION + #if ENABLE_SPECIALIZED_BINARY_SUBSCR if (ADAPTIVE_COUNTER_TRIGGERS(counter)) { next_instr = this_instr; _Py_Specialize_BinarySubscr(container, sub, next_instr); @@ -429,7 +429,7 @@ } OPCODE_DEFERRED_INC(BINARY_SUBSCR); ADVANCE_ADAPTIVE_COUNTER(this_instr[1].counter); - #endif /* ENABLE_SPECIALIZATION */ + #endif /* ENABLE_SPECIALIZED_BINARY_SUBSCR */ } // _BINARY_SUBSCR { @@ -845,7 +845,7 @@ { uint16_t counter = read_u16(&this_instr[1].cache); (void)counter; - #if ENABLE_SPECIALIZATION + #if ENABLE_SPECIALIZED_CALL if (ADAPTIVE_COUNTER_TRIGGERS(counter)) { next_instr = this_instr; _Py_Specialize_Call(callable, next_instr, oparg + !PyStackRef_IsNull(self_or_null)); @@ -853,7 +853,7 @@ } OPCODE_DEFERRED_INC(CALL); ADVANCE_ADAPTIVE_COUNTER(this_instr[1].counter); - #endif /* ENABLE_SPECIALIZATION */ + #endif /* ENABLE_SPECIALIZED_CALL */ } /* Skip 2 cache entries */ // _MAYBE_EXPAND_METHOD @@ -1765,7 +1765,7 @@ { uint16_t counter = read_u16(&this_instr[1].cache); (void)counter; - #if ENABLE_SPECIALIZATION + #if ENABLE_SPECIALIZED_CALL_KW if (ADAPTIVE_COUNTER_TRIGGERS(counter)) { next_instr = this_instr; _Py_Specialize_CallKw(callable, next_instr, oparg + !PyStackRef_IsNull(self_or_null)); @@ -1773,7 +1773,7 @@ } OPCODE_DEFERRED_INC(CALL_KW); ADVANCE_ADAPTIVE_COUNTER(this_instr[1].counter); - #endif /* ENABLE_SPECIALIZATION */ + #endif /* ENABLE_SPECIALIZED_CALL_KW */ } /* Skip 2 cache entries */ // _DO_CALL_KW @@ -2998,7 +2998,7 @@ { uint16_t counter = read_u16(&this_instr[1].cache); (void)counter; - #if ENABLE_SPECIALIZATION + #if ENABLE_SPECIALIZED_COMPARE_OP if (ADAPTIVE_COUNTER_TRIGGERS(counter)) { next_instr = this_instr; _Py_Specialize_CompareOp(left, right, next_instr, oparg); @@ -3006,7 +3006,7 @@ } OPCODE_DEFERRED_INC(COMPARE_OP); ADVANCE_ADAPTIVE_COUNTER(this_instr[1].counter); - #endif /* ENABLE_SPECIALIZATION */ + 
#endif /* ENABLE_SPECIALIZED_COMPARE_OP */ } // _COMPARE_OP { @@ -3167,7 +3167,7 @@ { uint16_t counter = read_u16(&this_instr[1].cache); (void)counter; - #if ENABLE_SPECIALIZATION + #if ENABLE_SPECIALIZED_CONTAINS_OP if (ADAPTIVE_COUNTER_TRIGGERS(counter)) { next_instr = this_instr; _Py_Specialize_ContainsOp(right, next_instr); @@ -3175,7 +3175,7 @@ } OPCODE_DEFERRED_INC(CONTAINS_OP); ADVANCE_ADAPTIVE_COUNTER(this_instr[1].counter); - #endif /* ENABLE_SPECIALIZATION */ + #endif /* ENABLE_SPECIALIZED_CONTAINS_OP */ } // _CONTAINS_OP left = stack_pointer[-2]; @@ -3628,7 +3628,7 @@ { uint16_t counter = read_u16(&this_instr[1].cache); (void)counter; - #if ENABLE_SPECIALIZATION + #if ENABLE_SPECIALIZED_FOR_ITER if (ADAPTIVE_COUNTER_TRIGGERS(counter)) { next_instr = this_instr; _Py_Specialize_ForIter(iter, next_instr, oparg); @@ -3636,7 +3636,7 @@ } OPCODE_DEFERRED_INC(FOR_ITER); ADVANCE_ADAPTIVE_COUNTER(this_instr[1].counter); - #endif /* ENABLE_SPECIALIZATION */ + #endif /* ENABLE_SPECIALIZED_FOR_ITER */ } // _FOR_ITER { @@ -4870,7 +4870,7 @@ { uint16_t counter = read_u16(&this_instr[1].cache); (void)counter; - #if ENABLE_SPECIALIZATION + #if ENABLE_SPECIALIZED_LOAD_ATTR if (ADAPTIVE_COUNTER_TRIGGERS(counter)) { PyObject *name = GETITEM(FRAME_CO_NAMES, oparg>>1); next_instr = this_instr; @@ -4879,7 +4879,7 @@ } OPCODE_DEFERRED_INC(LOAD_ATTR); ADVANCE_ADAPTIVE_COUNTER(this_instr[1].counter); - #endif /* ENABLE_SPECIALIZATION */ + #endif /* ENABLE_SPECIALIZED_LOAD_ATTR */ } /* Skip 8 cache entries */ // _LOAD_ATTR @@ -5710,7 +5710,7 @@ { uint16_t counter = read_u16(&this_instr[1].cache); (void)counter; - #if ENABLE_SPECIALIZATION + #if ENABLE_SPECIALIZED_LOAD_GLOBAL if (ADAPTIVE_COUNTER_TRIGGERS(counter)) { PyObject *name = GETITEM(FRAME_CO_NAMES, oparg>>1); next_instr = this_instr; @@ -5719,7 +5719,7 @@ } OPCODE_DEFERRED_INC(LOAD_GLOBAL); ADVANCE_ADAPTIVE_COUNTER(this_instr[1].counter); - #endif /* ENABLE_SPECIALIZATION */ + #endif /* ENABLE_SPECIALIZED_LOAD_GLOBAL */ } 
/* Skip 1 cache entry */ /* Skip 1 cache entry */ @@ -5898,7 +5898,7 @@ { uint16_t counter = read_u16(&this_instr[1].cache); (void)counter; - #if ENABLE_SPECIALIZATION + #if ENABLE_SPECIALIZED_LOAD_SUPER_ATTR int load_method = oparg & 1; if (ADAPTIVE_COUNTER_TRIGGERS(counter)) { next_instr = this_instr; @@ -5907,7 +5907,7 @@ } OPCODE_DEFERRED_INC(LOAD_SUPER_ATTR); ADVANCE_ADAPTIVE_COUNTER(this_instr[1].counter); - #endif /* ENABLE_SPECIALIZATION */ + #endif /* ENABLE_SPECIALIZED_LOAD_SUPER_ATTR */ } // _LOAD_SUPER_ATTR self_st = stack_pointer[-1]; @@ -6620,7 +6620,7 @@ { uint16_t counter = read_u16(&this_instr[1].cache); (void)counter; - #if ENABLE_SPECIALIZATION + #if ENABLE_SPECIALIZED_SEND if (ADAPTIVE_COUNTER_TRIGGERS(counter)) { next_instr = this_instr; _Py_Specialize_Send(receiver, next_instr); @@ -6628,7 +6628,7 @@ } OPCODE_DEFERRED_INC(SEND); ADVANCE_ADAPTIVE_COUNTER(this_instr[1].counter); - #endif /* ENABLE_SPECIALIZATION */ + #endif /* ENABLE_SPECIALIZED_SEND */ } // _SEND v = stack_pointer[-1]; @@ -6853,7 +6853,7 @@ { uint16_t counter = read_u16(&this_instr[1].cache); (void)counter; - #if ENABLE_SPECIALIZATION + #if ENABLE_SPECIALIZED_STORE_ATTR if (ADAPTIVE_COUNTER_TRIGGERS(counter)) { PyObject *name = GETITEM(FRAME_CO_NAMES, oparg); next_instr = this_instr; @@ -6862,7 +6862,7 @@ } OPCODE_DEFERRED_INC(STORE_ATTR); ADVANCE_ADAPTIVE_COUNTER(this_instr[1].counter); - #endif /* ENABLE_SPECIALIZATION */ + #endif /* ENABLE_SPECIALIZED_STORE_ATTR */ } /* Skip 3 cache entries */ // _STORE_ATTR @@ -7168,7 +7168,7 @@ { uint16_t counter = read_u16(&this_instr[1].cache); (void)counter; - #if ENABLE_SPECIALIZATION + #if ENABLE_SPECIALIZED_STORE_SUBSCR if (ADAPTIVE_COUNTER_TRIGGERS(counter)) { next_instr = this_instr; _Py_Specialize_StoreSubscr(container, sub, next_instr); @@ -7176,7 +7176,7 @@ } OPCODE_DEFERRED_INC(STORE_SUBSCR); ADVANCE_ADAPTIVE_COUNTER(this_instr[1].counter); - #endif /* ENABLE_SPECIALIZATION */ + #endif /* ENABLE_SPECIALIZED_STORE_SUBSCR */ } // 
_STORE_SUBSCR v = stack_pointer[-3]; @@ -7279,7 +7279,7 @@ { uint16_t counter = read_u16(&this_instr[1].cache); (void)counter; - #if ENABLE_SPECIALIZATION + #if ENABLE_SPECIALIZED_TO_BOOL if (ADAPTIVE_COUNTER_TRIGGERS(counter)) { next_instr = this_instr; _Py_Specialize_ToBool(value, next_instr); @@ -7287,7 +7287,7 @@ } OPCODE_DEFERRED_INC(TO_BOOL); ADVANCE_ADAPTIVE_COUNTER(this_instr[1].counter); - #endif /* ENABLE_SPECIALIZATION */ + #endif /* ENABLE_SPECIALIZED_TO_BOOL */ } /* Skip 2 cache entries */ // _TO_BOOL @@ -7505,7 +7505,7 @@ { uint16_t counter = read_u16(&this_instr[1].cache); (void)counter; - #if ENABLE_SPECIALIZATION + #if ENABLE_SPECIALIZED_UNPACK_SEQUENCE if (ADAPTIVE_COUNTER_TRIGGERS(counter)) { next_instr = this_instr; _Py_Specialize_UnpackSequence(seq, next_instr, oparg); @@ -7513,7 +7513,7 @@ } OPCODE_DEFERRED_INC(UNPACK_SEQUENCE); ADVANCE_ADAPTIVE_COUNTER(this_instr[1].counter); - #endif /* ENABLE_SPECIALIZATION */ + #endif /* ENABLE_SPECIALIZED_UNPACK_SEQUENCE */ (void)seq; (void)counter; } diff --git a/Python/specialize.c b/Python/specialize.c index 7b343e5ccb7666..042c4c31844991 100644 --- a/Python/specialize.c +++ b/Python/specialize.c @@ -697,7 +697,7 @@ _Py_Specialize_LoadSuperAttr(_PyStackRef global_super_st, _PyStackRef cls_st, _P PyObject *global_super = PyStackRef_AsPyObjectBorrow(global_super_st); PyObject *cls = PyStackRef_AsPyObjectBorrow(cls_st); - assert(ENABLE_SPECIALIZATION); + assert(ENABLE_SPECIALIZED_LOAD_SUPER_ATTR); assert(_PyOpcode_Caches[LOAD_SUPER_ATTR] == INLINE_CACHE_ENTRIES_LOAD_SUPER_ATTR); _PySuperAttrCache *cache = (_PySuperAttrCache *)(instr + 1); if (global_super != (PyObject *)&PySuper_Type) { @@ -1113,7 +1113,7 @@ _Py_Specialize_LoadAttr(_PyStackRef owner_st, _Py_CODEUNIT *instr, PyObject *nam _PyAttrCache *cache = (_PyAttrCache *)(instr + 1); PyObject *owner = PyStackRef_AsPyObjectBorrow(owner_st); - assert(ENABLE_SPECIALIZATION); + assert(ENABLE_SPECIALIZED_LOAD_ATTR); assert(_PyOpcode_Caches[LOAD_ATTR] == 
INLINE_CACHE_ENTRIES_LOAD_ATTR); PyTypeObject *type = Py_TYPE(owner); bool fail; @@ -1152,7 +1152,7 @@ _Py_Specialize_StoreAttr(_PyStackRef owner_st, _Py_CODEUNIT *instr, PyObject *na { PyObject *owner = PyStackRef_AsPyObjectBorrow(owner_st); - assert(ENABLE_SPECIALIZATION); + assert(ENABLE_SPECIALIZED_STORE_ATTR); assert(_PyOpcode_Caches[STORE_ATTR] == INLINE_CACHE_ENTRIES_STORE_ATTR); _PyAttrCache *cache = (_PyAttrCache *)(instr + 1); PyTypeObject *type = Py_TYPE(owner); @@ -1430,7 +1430,7 @@ _Py_Specialize_LoadGlobal( PyObject *globals, PyObject *builtins, _Py_CODEUNIT *instr, PyObject *name) { - assert(ENABLE_SPECIALIZATION); + assert(ENABLE_SPECIALIZED_LOAD_GLOBAL); assert(_PyOpcode_Caches[LOAD_GLOBAL] == INLINE_CACHE_ENTRIES_LOAD_GLOBAL); /* Use inline cache */ _PyLoadGlobalCache *cache = (_PyLoadGlobalCache *)(instr + 1); @@ -1620,7 +1620,7 @@ _Py_Specialize_BinarySubscr( PyObject *container = PyStackRef_AsPyObjectBorrow(container_st); PyObject *sub = PyStackRef_AsPyObjectBorrow(sub_st); - assert(ENABLE_SPECIALIZATION); + assert(ENABLE_SPECIALIZED_BINARY_SUBSCR); assert(_PyOpcode_Caches[BINARY_SUBSCR] == INLINE_CACHE_ENTRIES_BINARY_SUBSCR); _PyBinarySubscrCache *cache = (_PyBinarySubscrCache *)(instr + 1); @@ -1723,7 +1723,7 @@ _Py_Specialize_StoreSubscr(_PyStackRef container_st, _PyStackRef sub_st, _Py_COD PyObject *container = PyStackRef_AsPyObjectBorrow(container_st); PyObject *sub = PyStackRef_AsPyObjectBorrow(sub_st); - assert(ENABLE_SPECIALIZATION); + assert(ENABLE_SPECIALIZED_STORE_SUBSCR); _PyStoreSubscrCache *cache = (_PyStoreSubscrCache *)(instr + 1); PyTypeObject *container_type = Py_TYPE(container); if (container_type == &PyList_Type) { @@ -2064,7 +2064,7 @@ _Py_Specialize_Call(_PyStackRef callable_st, _Py_CODEUNIT *instr, int nargs) { PyObject *callable = PyStackRef_AsPyObjectBorrow(callable_st); - assert(ENABLE_SPECIALIZATION); + assert(ENABLE_SPECIALIZED_CALL); assert(_PyOpcode_Caches[CALL] == INLINE_CACHE_ENTRIES_CALL); 
assert(_Py_OPCODE(*instr) != INSTRUMENTED_CALL); _PyCallCache *cache = (_PyCallCache *)(instr + 1); @@ -2224,7 +2224,7 @@ _Py_Specialize_BinaryOp(_PyStackRef lhs_st, _PyStackRef rhs_st, _Py_CODEUNIT *in { PyObject *lhs = PyStackRef_AsPyObjectBorrow(lhs_st); PyObject *rhs = PyStackRef_AsPyObjectBorrow(rhs_st); - assert(ENABLE_SPECIALIZATION); + assert(ENABLE_SPECIALIZED_BINARY_OP); assert(_PyOpcode_Caches[BINARY_OP] == INLINE_CACHE_ENTRIES_BINARY_OP); _PyBinaryOpCache *cache = (_PyBinaryOpCache *)(instr + 1); switch (oparg) { @@ -2334,7 +2334,7 @@ _Py_Specialize_CompareOp(_PyStackRef lhs_st, _PyStackRef rhs_st, _Py_CODEUNIT *i PyObject *lhs = PyStackRef_AsPyObjectBorrow(lhs_st); PyObject *rhs = PyStackRef_AsPyObjectBorrow(rhs_st); - assert(ENABLE_SPECIALIZATION); + assert(ENABLE_SPECIALIZED_COMPARE_OP); assert(_PyOpcode_Caches[COMPARE_OP] == INLINE_CACHE_ENTRIES_COMPARE_OP); // All of these specializations compute boolean values, so they're all valid // regardless of the fifth-lowest oparg bit. 
@@ -2398,7 +2398,7 @@ _Py_Specialize_UnpackSequence(_PyStackRef seq_st, _Py_CODEUNIT *instr, int oparg { PyObject *seq = PyStackRef_AsPyObjectBorrow(seq_st); - assert(ENABLE_SPECIALIZATION); + assert(ENABLE_SPECIALIZED_UNPACK_SEQUENCE); assert(_PyOpcode_Caches[UNPACK_SEQUENCE] == INLINE_CACHE_ENTRIES_UNPACK_SEQUENCE); _PyUnpackSequenceCache *cache = (_PyUnpackSequenceCache *)(instr + 1); @@ -2509,7 +2509,7 @@ int void _Py_Specialize_ForIter(_PyStackRef iter, _Py_CODEUNIT *instr, int oparg) { - assert(ENABLE_SPECIALIZATION); + assert(ENABLE_SPECIALIZED_FOR_ITER); assert(_PyOpcode_Caches[FOR_ITER] == INLINE_CACHE_ENTRIES_FOR_ITER); _PyForIterCache *cache = (_PyForIterCache *)(instr + 1); PyObject *iter_o = PyStackRef_AsPyObjectBorrow(iter); @@ -2554,7 +2554,7 @@ _Py_Specialize_Send(_PyStackRef receiver_st, _Py_CODEUNIT *instr) { PyObject *receiver = PyStackRef_AsPyObjectBorrow(receiver_st); - assert(ENABLE_SPECIALIZATION); + assert(ENABLE_SPECIALIZED_SEND); assert(_PyOpcode_Caches[SEND] == INLINE_CACHE_ENTRIES_SEND); _PySendCache *cache = (_PySendCache *)(instr + 1); PyTypeObject *tp = Py_TYPE(receiver); @@ -2581,7 +2581,7 @@ _Py_Specialize_Send(_PyStackRef receiver_st, _Py_CODEUNIT *instr) void _Py_Specialize_ToBool(_PyStackRef value_o, _Py_CODEUNIT *instr) { - assert(ENABLE_SPECIALIZATION); + assert(ENABLE_SPECIALIZED_TO_BOOL); assert(_PyOpcode_Caches[TO_BOOL] == INLINE_CACHE_ENTRIES_TO_BOOL); _PyToBoolCache *cache = (_PyToBoolCache *)(instr + 1); PyObject *value = PyStackRef_AsPyObjectBorrow(value_o); @@ -2698,7 +2698,7 @@ _Py_Specialize_ContainsOp(_PyStackRef value_st, _Py_CODEUNIT *instr) { PyObject *value = PyStackRef_AsPyObjectBorrow(value_st); - assert(ENABLE_SPECIALIZATION); + assert(ENABLE_SPECIALIZED_CONTAINS_OP); assert(_PyOpcode_Caches[CONTAINS_OP] == INLINE_CACHE_ENTRIES_COMPARE_OP); _PyContainsOpCache *cache = (_PyContainsOpCache *)(instr + 1); if (PyDict_CheckExact(value)) { From 50a6089a6c73b302ad991469aeb6bb893462339c Mon Sep 17 00:00:00 2001 From: 
Matt Page Date: Tue, 3 Sep 2024 20:49:47 -0700 Subject: [PATCH 09/67] Specialize BINARY_OP --- Include/internal/pycore_code.h | 6 +++--- Python/ceval_macros.h | 9 --------- Python/specialize.c | 23 +++++++++++++++-------- 3 files changed, 18 insertions(+), 20 deletions(-) diff --git a/Include/internal/pycore_code.h b/Include/internal/pycore_code.h index 72e5b8863f7a5c..6cdd20dd116fe2 100644 --- a/Include/internal/pycore_code.h +++ b/Include/internal/pycore_code.h @@ -314,10 +314,11 @@ extern int _PyLineTable_PreviousAddressRange(PyCodeAddressRange *range); /** API for executors */ extern void _PyCode_Clear_Executors(PyCodeObject *code); +#define ENABLE_SPECIALIZATION 1 + #ifdef Py_GIL_DISABLED // gh-115999 tracks progress on addressing this. -#define ENABLE_SPECIALIZATION 0 -#define ENABLE_SPECIALIZED_BINARY_OP 0 && ENABLE_SPECIALIZATION +#define ENABLE_SPECIALIZED_BINARY_OP ENABLE_SPECIALIZATION #define ENABLE_SPECIALIZED_BINARY_SUBSCR 0 && ENABLE_SPECIALIZATION #define ENABLE_SPECIALIZED_CALL 0 && ENABLE_SPECIALIZATION #define ENABLE_SPECIALIZED_CALL_KW 0 && ENABLE_SPECIALIZATION @@ -333,7 +334,6 @@ extern void _PyCode_Clear_Executors(PyCodeObject *code); #define ENABLE_SPECIALIZED_TO_BOOL 0 && ENABLE_SPECIALIZATION #define ENABLE_SPECIALIZED_UNPACK_SEQUENCE 0 && ENABLE_SPECIALIZATION #else -#define ENABLE_SPECIALIZATION 1 #define ENABLE_SPECIALIZED_BINARY_OP ENABLE_SPECIALIZATION #define ENABLE_SPECIALIZED_BINARY_SUBSCR ENABLE_SPECIALIZATION #define ENABLE_SPECIALIZED_CALL ENABLE_SPECIALIZATION diff --git a/Python/ceval_macros.h b/Python/ceval_macros.h index 387bc994870352..eee009ee6d01f3 100644 --- a/Python/ceval_macros.h +++ b/Python/ceval_macros.h @@ -300,14 +300,6 @@ GETITEM(PyObject *v, Py_ssize_t i) { #define ADAPTIVE_COUNTER_TRIGGERS(COUNTER) \ backoff_counter_triggers(forge_backoff_counter((COUNTER))) -#ifdef Py_GIL_DISABLED -#define ADVANCE_ADAPTIVE_COUNTER(COUNTER) \ - do { \ - /* gh-115999 tracks progress on addressing this. 
*/ \ - static_assert(0, "The specializing interpreter is not yet thread-safe"); \ - } while (0); -#define PAUSE_ADAPTIVE_COUNTER(COUNTER) ((void)COUNTER) -#else #define ADVANCE_ADAPTIVE_COUNTER(COUNTER) \ do { \ (COUNTER) = advance_backoff_counter((COUNTER)); \ @@ -317,7 +309,6 @@ GETITEM(PyObject *v, Py_ssize_t i) { do { \ (COUNTER) = pause_backoff_counter((COUNTER)); \ } while (0); -#endif #define UNBOUNDLOCAL_ERROR_MSG \ "cannot access local variable '%s' where it is not associated with a value" diff --git a/Python/specialize.c b/Python/specialize.c index 042c4c31844991..eba7450c1fa1b3 100644 --- a/Python/specialize.c +++ b/Python/specialize.c @@ -24,6 +24,13 @@ extern const char *_PyUOpName(int index); * ./adaptive.md */ +#ifdef Py_GIL_DISABLED +#define SET_OPCODE(instr, opcode) _Py_atomic_store_uint8_relaxed(&(instr)->op.code, (opcode)) +#else +#define SET_OPCODE(instr, opcode) (instr)->op.code = (opcode) +#endif + + #ifdef Py_STATS GCStats _py_gc_stats[NUM_GENERATIONS] = { 0 }; static PyStats _Py_stats_struct = { .gc_stats = _py_gc_stats }; @@ -2237,18 +2244,18 @@ _Py_Specialize_BinaryOp(_PyStackRef lhs_st, _PyStackRef rhs_st, _Py_CODEUNIT *in _Py_CODEUNIT next = instr[INLINE_CACHE_ENTRIES_BINARY_OP + 1]; bool to_store = (next.op.code == STORE_FAST); if (to_store && PyStackRef_AsPyObjectBorrow(locals[next.op.arg]) == lhs) { - instr->op.code = BINARY_OP_INPLACE_ADD_UNICODE; + SET_OPCODE(instr, BINARY_OP_INPLACE_ADD_UNICODE); goto success; } - instr->op.code = BINARY_OP_ADD_UNICODE; + SET_OPCODE(instr, BINARY_OP_ADD_UNICODE); goto success; } if (PyLong_CheckExact(lhs)) { - instr->op.code = BINARY_OP_ADD_INT; + SET_OPCODE(instr, BINARY_OP_ADD_INT); goto success; } if (PyFloat_CheckExact(lhs)) { - instr->op.code = BINARY_OP_ADD_FLOAT; + SET_OPCODE(instr, BINARY_OP_ADD_FLOAT); goto success; } break; @@ -2258,11 +2265,11 @@ _Py_Specialize_BinaryOp(_PyStackRef lhs_st, _PyStackRef rhs_st, _Py_CODEUNIT *in break; } if (PyLong_CheckExact(lhs)) { - instr->op.code = 
BINARY_OP_MULTIPLY_INT; + SET_OPCODE(instr, BINARY_OP_MULTIPLY_INT); goto success; } if (PyFloat_CheckExact(lhs)) { - instr->op.code = BINARY_OP_MULTIPLY_FLOAT; + SET_OPCODE(instr, BINARY_OP_MULTIPLY_FLOAT); goto success; } break; @@ -2272,11 +2279,11 @@ _Py_Specialize_BinaryOp(_PyStackRef lhs_st, _PyStackRef rhs_st, _Py_CODEUNIT *in break; } if (PyLong_CheckExact(lhs)) { - instr->op.code = BINARY_OP_SUBTRACT_INT; + SET_OPCODE(instr, BINARY_OP_SUBTRACT_INT); goto success; } if (PyFloat_CheckExact(lhs)) { - instr->op.code = BINARY_OP_SUBTRACT_FLOAT; + SET_OPCODE(instr, BINARY_OP_SUBTRACT_FLOAT); goto success; } break; From 3f1d94175e3bdab06156f6f0cc408f4fb444cacc Mon Sep 17 00:00:00 2001 From: Matt Page Date: Fri, 6 Sep 2024 14:45:19 -0700 Subject: [PATCH 10/67] Limit the amount of memory consumed by bytecode copies --- Include/internal/pycore_code.h | 19 ++--- Include/internal/pycore_interp.h | 5 ++ Objects/codeobject.c | 123 ++++++++++++++++++++++++++++--- Python/bytecodes.c | 6 +- Python/generated_cases.c.h | 12 +-- Python/specialize.c | 18 +++++ 6 files changed, 152 insertions(+), 31 deletions(-) diff --git a/Include/internal/pycore_code.h b/Include/internal/pycore_code.h index 6cdd20dd116fe2..7d9ae813806eb1 100644 --- a/Include/internal/pycore_code.h +++ b/Include/internal/pycore_code.h @@ -633,25 +633,20 @@ PyAPI_DATA(const struct _PyCode8) _Py_InitCleanup; #ifdef Py_GIL_DISABLED -extern _PyMutBytecode *_PyCode_CreateSpecializableCode(PyCodeObject *co); +extern _Py_CODEUNIT *_PyCode_GetExecutableCodeSlow(PyCodeObject *co); -/* Return bytecode that should be executed. - * Will not return NULL, but may disable specialization, in which case the - * returned bytecode should not be specialized. - * - * XXX - This is a confusing contract. - */ -static inline _PyMutBytecode * -_PyCode_GetSpecializableCode(PyCodeObject *co) +// Return the bytecode that should be executed by the current thread, creating +// a copy if necessary. 
+static inline _Py_CODEUNIT * +_PyCode_GetExecutableCode(PyCodeObject *co) { _PyCodeArray *code = _Py_atomic_load_ptr_acquire(&co->co_specialized_code); _PyThreadStateImpl *tstate = (_PyThreadStateImpl *) PyThreadState_GET(); Py_ssize_t idx = tstate->specialized_code_index; if (idx < code->size && code->entries[idx] != NULL) { - // XXX - Do we need to worry about alignment here? - return code->entries[idx]; + return (_Py_CODEUNIT *) code->entries[idx]->bytecode; } - return _PyCode_CreateSpecializableCode(co); + return _PyCode_GetExecutableCodeSlow(co); } extern int _Py_ReserveSpecializedCodeIndex(PyInterpreterState *interp); extern void _Py_ClearSpecializedCodeIndex(_PyThreadStateImpl *tstate); diff --git a/Include/internal/pycore_interp.h b/Include/internal/pycore_interp.h index caf86b44374da8..0e6702a077ab77 100644 --- a/Include/internal/pycore_interp.h +++ b/Include/internal/pycore_interp.h @@ -225,6 +225,11 @@ struct _is { struct _Py_type_id_pool type_ids; PyMutex weakref_locks[NUM_WEAKREF_LIST_LOCKS]; _PyIndexPool specialized_code_indices; + // Number of bytes available for thread-local bytecode, counts down to zero + Py_ssize_t specialized_code_bytes_free; + PyMutex specialized_code_bytes_free_mutex; + // This is monotonic; once true it will remain true + bool new_thread_local_bytecode_disabled; #endif // Per-interpreter state for the obmalloc allocator. 
For the main diff --git a/Objects/codeobject.c b/Objects/codeobject.c index 31f7acebc7bd73..9ff3c8871f188a 100644 --- a/Objects/codeobject.c +++ b/Objects/codeobject.c @@ -454,7 +454,9 @@ _PyCode_Validate(struct _PyCodeConstructor *con) extern void _PyCode_Quicken(_Py_CODEUNIT *instructions, Py_ssize_t size); #ifdef Py_GIL_DISABLED +extern void _PyCode_DisableSpecialization(_Py_CODEUNIT *instructions, Py_ssize_t size); static _PyCodeArray * _PyCodeArray_New(Py_ssize_t size); +static void release_bytes_for_specialized_code(Py_ssize_t nbytes); #endif static int @@ -534,7 +536,16 @@ init_code(PyCodeObject *co, struct _PyCodeConstructor *con) entry_point++; } co->_co_firsttraceable = entry_point; +#ifdef Py_GIL_DISABLED + if (interp->new_thread_local_bytecode_disabled) { + _PyCode_DisableSpecialization(_PyCode_CODE(co), Py_SIZE(co)); + } + else { + _PyCode_Quicken(_PyCode_CODE(co), Py_SIZE(co)); + } +#else _PyCode_Quicken(_PyCode_CODE(co), Py_SIZE(co)); +#endif notify_code_watchers(PY_CODE_EVENT_CREATE, co); return 0; } @@ -1895,9 +1906,15 @@ code_dealloc(PyCodeObject *co) #ifdef Py_GIL_DISABLED // The first element always points to the mutable bytecode at the end of // the code object, which will be freed when the code object is freed. 
+ Py_ssize_t bytes_freed = 0; for (Py_ssize_t i = 1; i < co->co_specialized_code->size; i++) { - PyMem_Free(co->co_specialized_code->entries[i]); + _PyMutBytecode *entry = co->co_specialized_code->entries[i]; + if (entry != NULL) { + PyMem_Free(entry); + bytes_freed += _PyCode_NBYTES(co); + } } + release_bytes_for_specialized_code(bytes_freed); PyMem_Free(co->co_specialized_code); #endif PyObject_Free(co); @@ -2704,12 +2721,10 @@ get_pow2_greater(Py_ssize_t initial, Py_ssize_t limit) return res; } -static _PyMutBytecode * -create_specializable_code_lock_held(PyCodeObject *co) +static _Py_CODEUNIT * +create_specializable_code_lock_held(PyCodeObject *co, Py_ssize_t idx) { _PyCodeArray *spec_code = co->co_specialized_code; - _PyThreadStateImpl *tstate = (_PyThreadStateImpl *) PyThreadState_GET(); - Py_ssize_t idx = tstate->specialized_code_index; if (idx >= spec_code->size) { Py_ssize_t new_size = get_pow2_greater(spec_code->size, idx + 1); if (!new_size) { @@ -2731,16 +2746,104 @@ create_specializable_code_lock_held(PyCodeObject *co) return NULL; } copy_code(bc, co); + assert(spec_code->entries[idx] == NULL); spec_code->entries[idx] = bc; - return bc; + return (_Py_CODEUNIT *) bc->bytecode; } -_PyMutBytecode * -_PyCode_CreateSpecializableCode(PyCodeObject *co) +static Py_ssize_t +reserve_bytes_for_specialized_code(PyCodeObject *co) { - _PyMutBytecode *result; + PyInterpreterState *interp = _PyInterpreterState_GET(); + Py_ssize_t nbytes_reserved = -1; + Py_ssize_t code_size = _PyCode_NBYTES(co); + PyMutex_LockFlags(&interp->specialized_code_bytes_free_mutex, _Py_LOCK_DONT_DETACH); + if (interp->specialized_code_bytes_free >= code_size) { + interp->specialized_code_bytes_free -= code_size; + nbytes_reserved = code_size; + } + PyMutex_Unlock(&interp->specialized_code_bytes_free_mutex); + return nbytes_reserved; +} + +static void +release_bytes_for_specialized_code(Py_ssize_t nbytes) +{ + assert(nbytes >= 0); + if (nbytes == 0) { + return; + } + PyInterpreterState 
*interp = _PyInterpreterState_GET(); + PyMutex_LockFlags(&interp->specialized_code_bytes_free_mutex, _Py_LOCK_DONT_DETACH); + interp->specialized_code_bytes_free += nbytes; + PyMutex_Unlock(&interp->specialized_code_bytes_free_mutex); +} + +static int +disable_specialization(PyObject *obj, void*) +{ + if (!PyCode_Check(obj)) { + return 1; + } + PyCodeObject *co = (PyCodeObject *) obj; + _PyCode_DisableSpecialization(_PyCode_CODE(co), Py_SIZE(co)); + return 1; +} + +static void +disable_new_thread_local_bytecode(void) +{ + PyInterpreterState *interp = _PyInterpreterState_GET(); + if (interp->new_thread_local_bytecode_disabled) { + return; + } + // Disable creation of new thread-local copies of bytecode. We disable + // further specialization of the "main" copy of the bytecode (the bytecode + // that is embedded in the code object), so that multiple threads can + // safely execute it. From this point on, threads are free to specialize + // existing thread-local copies of the bytecode (other than the main copy), + // but any attempts to create new copies of bytecode will fail, and the + // main, unspecializable copy will be used. 
+ _PyEval_StopTheWorld(interp); + interp->new_thread_local_bytecode_disabled = true; + _PyEval_StartTheWorld(interp); + PyUnstable_GC_VisitObjects(disable_specialization, NULL); + if (PyErr_WarnEx(PyExc_ResourceWarning, "Reached memory limit for thread-local bytecode", 1) < 0) { + PyErr_WriteUnraisable(NULL); + } +} + +static _Py_CODEUNIT * +get_executable_code_lock_held(PyCodeObject *co) +{ + _PyCodeArray *spec_code = co->co_specialized_code; + _PyThreadStateImpl *tstate = (_PyThreadStateImpl *) PyThreadState_GET(); + Py_ssize_t idx = tstate->specialized_code_index; + if (idx < spec_code->size && spec_code->entries[idx] != NULL) { + return (_Py_CODEUNIT *) spec_code->entries[idx]->bytecode; + } + Py_ssize_t reserved = reserve_bytes_for_specialized_code(co); + if (reserved == -1) { + disable_new_thread_local_bytecode(); + return (_Py_CODEUNIT *) spec_code->entries[0]->bytecode; + } + _Py_CODEUNIT *result = create_specializable_code_lock_held(co, idx); + if (result == NULL) { + release_bytes_for_specialized_code(reserved); + } + return result; +} + +_Py_CODEUNIT * +_PyCode_GetExecutableCodeSlow(PyCodeObject *co) +{ + PyInterpreterState *interp = _PyInterpreterState_GET(); + if (interp->new_thread_local_bytecode_disabled) { + return (_Py_CODEUNIT *) co->co_specialized_code->entries[0]->bytecode; + } + _Py_CODEUNIT *result; Py_BEGIN_CRITICAL_SECTION(co); - result = create_specializable_code_lock_held(co); + result = get_executable_code_lock_held(co); Py_END_CRITICAL_SECTION(); return result; } diff --git a/Python/bytecodes.c b/Python/bytecodes.c index 5a71d549f1e810..7af6ca2f981eb3 100644 --- a/Python/bytecodes.c +++ b/Python/bytecodes.c @@ -192,10 +192,10 @@ dummy_func( op(_LOAD_BYTECODE, (--)) { #ifdef Py_GIL_DISABLED - _PyMutBytecode *code = _PyCode_GetSpecializableCode(_PyFrame_GetCode(frame)); - if (frame->bytecode != (_Py_CODEUNIT *) code->bytecode) { + _Py_CODEUNIT *bytecode = _PyCode_GetExecutableCode(_PyFrame_GetCode(frame)); + if (frame->bytecode != 
bytecode) { int off = this_instr - frame->bytecode; - frame->bytecode = (_Py_CODEUNIT *) code->bytecode; + frame->bytecode = bytecode; frame->instr_ptr = frame->bytecode + off; this_instr = frame->instr_ptr; next_instr = frame->instr_ptr + 1; diff --git a/Python/generated_cases.c.h b/Python/generated_cases.c.h index 370ece70c3cbe9..d439cb713402a3 100644 --- a/Python/generated_cases.c.h +++ b/Python/generated_cases.c.h @@ -4492,10 +4492,10 @@ // _LOAD_BYTECODE { #ifdef Py_GIL_DISABLED - _PyMutBytecode *code = _PyCode_GetSpecializableCode(_PyFrame_GetCode(frame)); - if (frame->bytecode != (_Py_CODEUNIT *) code->bytecode) { + _Py_CODEUNIT *bytecode = _PyCode_GetExecutableCode(_PyFrame_GetCode(frame)); + if (frame->bytecode != bytecode) { int off = this_instr - frame->bytecode; - frame->bytecode = (_Py_CODEUNIT *) code->bytecode; + frame->bytecode = bytecode; frame->instr_ptr = frame->bytecode + off; this_instr = frame->instr_ptr; next_instr = frame->instr_ptr + 1; @@ -6444,10 +6444,10 @@ // _LOAD_BYTECODE { #ifdef Py_GIL_DISABLED - _PyMutBytecode *code = _PyCode_GetSpecializableCode(_PyFrame_GetCode(frame)); - if (frame->bytecode != (_Py_CODEUNIT *) code->bytecode) { + _Py_CODEUNIT *bytecode = _PyCode_GetExecutableCode(_PyFrame_GetCode(frame)); + if (frame->bytecode != bytecode) { int off = this_instr - frame->bytecode; - frame->bytecode = (_Py_CODEUNIT *) code->bytecode; + frame->bytecode = bytecode; frame->instr_ptr = frame->bytecode + off; this_instr = frame->instr_ptr; next_instr = frame->instr_ptr + 1; diff --git a/Python/specialize.c b/Python/specialize.c index eba7450c1fa1b3..f263c3b4aa9188 100644 --- a/Python/specialize.c +++ b/Python/specialize.c @@ -471,6 +471,24 @@ _PyCode_Quicken(_Py_CODEUNIT *instructions, Py_ssize_t size) #endif /* ENABLE_SPECIALIZATION */ } +#ifdef Py_GIL_DISABLED + +void +_PyCode_DisableSpecialization(_Py_CODEUNIT *instructions, Py_ssize_t size) +{ + /* The last code unit cannot have a cache, so we don't need to check it */ + for 
(Py_ssize_t i = 0; i < size-1; i++) { + int opcode = instructions[i].op.code; + int caches = _PyOpcode_Caches[opcode]; + if (caches) { + instructions[i + 1].counter = initial_unreachable_backoff_counter(); + i += caches; + } + } +} + +#endif + #define SIMPLE_FUNCTION 0 /* Common */ From 7d2eb2750d13b40ca7df7a470d8e0778d61c487f Mon Sep 17 00:00:00 2001 From: Matt Page Date: Fri, 6 Sep 2024 22:29:22 -0700 Subject: [PATCH 11/67] Make thread-local bytecode limits user configurable --- Include/cpython/initconfig.h | 1 + Include/internal/pycore_code.h | 1 + Include/internal/pycore_interp.h | 4 +-- Lib/test/test_cmd_line.py | 29 ++++++++++++++++++ Lib/test/test_embed.py | 1 + Objects/codeobject.c | 35 +++++++++++++++------- Python/initconfig.c | 50 ++++++++++++++++++++++++++++++-- Python/pylifecycle.c | 4 +++ Python/pystate.c | 1 + 9 files changed, 111 insertions(+), 15 deletions(-) diff --git a/Include/cpython/initconfig.h b/Include/cpython/initconfig.h index c2cb4e3cdd92fb..f2ba2273a857b2 100644 --- a/Include/cpython/initconfig.h +++ b/Include/cpython/initconfig.h @@ -183,6 +183,7 @@ typedef struct PyConfig { int cpu_count; #ifdef Py_GIL_DISABLED int enable_gil; + int thread_local_bytecode_limit; #endif /* --- Path configuration inputs ------------ */ diff --git a/Include/internal/pycore_code.h b/Include/internal/pycore_code.h index 7d9ae813806eb1..1eddf1f0320179 100644 --- a/Include/internal/pycore_code.h +++ b/Include/internal/pycore_code.h @@ -633,6 +633,7 @@ PyAPI_DATA(const struct _PyCode8) _Py_InitCleanup; #ifdef Py_GIL_DISABLED +extern void _PyCode_InitState(PyInterpreterState *interp); extern _Py_CODEUNIT *_PyCode_GetExecutableCodeSlow(PyCodeObject *co); // Return the bytecode that should be executed by the current thread, creating diff --git a/Include/internal/pycore_interp.h b/Include/internal/pycore_interp.h index 0e6702a077ab77..4f6ed2c3f5b040 100644 --- a/Include/internal/pycore_interp.h +++ b/Include/internal/pycore_interp.h @@ -226,8 +226,8 @@ struct _is 
{ PyMutex weakref_locks[NUM_WEAKREF_LIST_LOCKS]; _PyIndexPool specialized_code_indices; // Number of bytes available for thread-local bytecode, counts down to zero - Py_ssize_t specialized_code_bytes_free; - PyMutex specialized_code_bytes_free_mutex; + Py_ssize_t thread_local_bytecode_avail; + PyMutex thread_local_bytecode_avail_mutex; // This is monotonic; once true it will remain true bool new_thread_local_bytecode_disabled; #endif diff --git a/Lib/test/test_cmd_line.py b/Lib/test/test_cmd_line.py index 35725718152c56..c2210d60a810b1 100644 --- a/Lib/test/test_cmd_line.py +++ b/Lib/test/test_cmd_line.py @@ -12,6 +12,7 @@ from test import support from test.support import os_helper from test.support import force_not_colorized +from test.support import threading_helper from test.support.script_helper import ( spawn_python, kill_python, assert_python_ok, assert_python_failure, interpreter_requires_environment @@ -1068,6 +1069,34 @@ def res2int(self, res): out = res.out.strip().decode("utf-8") return tuple(int(i) for i in out.split()) + @unittest.skipUnless(support.Py_GIL_DISABLED, + "PYTHON_THREAD_LOCAL_BC_LIMIT and -X thread_local_bc_limit" + " only supported in Py_GIL_DISABLED builds") + @threading_helper.requires_working_threading() + def test_set_thread_local_bytecode_limit(self): + code = """if 1: + import threading + def test(x, y): + return x + y + t = threading.Thread(target=test, args=(1,2)) + t.start() + t.join()""" + rc, out, err = assert_python_ok("-W", "always", "-X", "thread_local_bc_limit=1", "-c", code) + self.assertIn(b"Reached memory limit for thread-local bytecode", err) + rc, out, err = assert_python_ok("-W", "always", "-c", code, PYTHON_THREAD_LOCAL_BC_LIMIT="1") + self.assertIn(b"Reached memory limit for thread-local bytecode", err) + + @unittest.skipUnless(support.Py_GIL_DISABLED, + "PYTHON_THREAD_LOCAL_BC_LIMIT and -X thread_local_bc_limit" + " only supported in Py_GIL_DISABLED builds") + def test_invalid_thread_local_bytecode_limit(self): + 
rc, out, err = assert_python_failure("-X", "thread_local_bc_limit") + self.assertIn(b"thread_local_bc_limit=n: n is missing or invalid", err) + rc, out, err = assert_python_failure("-X", "thread_local_bc_limit=foo") + self.assertIn(b"thread_local_bc_limit=n: n is missing or invalid", err) + rc, out, err = assert_python_failure(PYTHON_THREAD_LOCAL_BC_LIMIT="foo") + self.assertIn(b"PYTHON_THREAD_LOCAL_BC_LIMIT=N: N is missing or invalid", err) + @unittest.skipIf(interpreter_requires_environment(), 'Cannot run -I tests when PYTHON env vars are required.') diff --git a/Lib/test/test_embed.py b/Lib/test/test_embed.py index 6790326a2afa47..e6fddfdc5e2582 100644 --- a/Lib/test/test_embed.py +++ b/Lib/test/test_embed.py @@ -631,6 +631,7 @@ class InitConfigTests(EmbeddingTestsMixin, unittest.TestCase): CONFIG_COMPAT['run_presite'] = None if support.Py_GIL_DISABLED: CONFIG_COMPAT['enable_gil'] = -1 + CONFIG_COMPAT['thread_local_bytecode_limit'] = GET_DEFAULT_CONFIG if MS_WINDOWS: CONFIG_COMPAT.update({ 'legacy_windows_stdio': False, diff --git a/Objects/codeobject.c b/Objects/codeobject.c index 9ff3c8871f188a..0b88db2b5a5fa9 100644 --- a/Objects/codeobject.c +++ b/Objects/codeobject.c @@ -2686,6 +2686,14 @@ _PyCode_Fini(PyInterpreterState *interp) #ifdef Py_GIL_DISABLED +void +_PyCode_InitState(PyInterpreterState *interp) +{ + int limit = interp->config.thread_local_bytecode_limit; + interp->thread_local_bytecode_avail = limit; + interp->new_thread_local_bytecode_disabled = limit == 0; +} + static _PyCodeArray * _PyCodeArray_New(Py_ssize_t size) { @@ -2757,12 +2765,15 @@ reserve_bytes_for_specialized_code(PyCodeObject *co) PyInterpreterState *interp = _PyInterpreterState_GET(); Py_ssize_t nbytes_reserved = -1; Py_ssize_t code_size = _PyCode_NBYTES(co); - PyMutex_LockFlags(&interp->specialized_code_bytes_free_mutex, _Py_LOCK_DONT_DETACH); - if (interp->specialized_code_bytes_free >= code_size) { - interp->specialized_code_bytes_free -= code_size; + 
PyMutex_LockFlags(&interp->thread_local_bytecode_avail_mutex, _Py_LOCK_DONT_DETACH); + if (interp->thread_local_bytecode_avail < 0) { nbytes_reserved = code_size; } - PyMutex_Unlock(&interp->specialized_code_bytes_free_mutex); + else if (interp->thread_local_bytecode_avail >= code_size) { + interp->thread_local_bytecode_avail -= code_size; + nbytes_reserved = code_size; + } + PyMutex_Unlock(&interp->thread_local_bytecode_avail_mutex); return nbytes_reserved; } @@ -2774,9 +2785,11 @@ release_bytes_for_specialized_code(Py_ssize_t nbytes) return; } PyInterpreterState *interp = _PyInterpreterState_GET(); - PyMutex_LockFlags(&interp->specialized_code_bytes_free_mutex, _Py_LOCK_DONT_DETACH); - interp->specialized_code_bytes_free += nbytes; - PyMutex_Unlock(&interp->specialized_code_bytes_free_mutex); + PyMutex_LockFlags(&interp->thread_local_bytecode_avail_mutex, _Py_LOCK_DONT_DETACH); + if (interp->thread_local_bytecode_avail >= 0) { + interp->thread_local_bytecode_avail += nbytes; + } + PyMutex_Unlock(&interp->thread_local_bytecode_avail_mutex); } static int @@ -2800,10 +2813,10 @@ disable_new_thread_local_bytecode(void) // Disable creation of new thread-local copies of bytecode. We disable // further specialization of the "main" copy of the bytecode (the bytecode // that is embedded in the code object), so that multiple threads can - // safely execute it. From this point on, threads are free to specialize - // existing thread-local copies of the bytecode (other than the main copy), - // but any attempts to create new copies of bytecode will fail, and the - // main, unspecializable copy will be used. + // safely execute it concurrently. From this point on, threads are free to + // specialize existing thread-local copies of the bytecode (other than the + // main copy), but any attempts to create new copies of bytecode will fail, + // and the main, unspecializable copy will be used. 
_PyEval_StopTheWorld(interp); interp->new_thread_local_bytecode_disabled = true; _PyEval_StartTheWorld(interp); diff --git a/Python/initconfig.c b/Python/initconfig.c index d93244f7f41084..311d7bf9705ff9 100644 --- a/Python/initconfig.c +++ b/Python/initconfig.c @@ -134,6 +134,7 @@ static const PyConfigSpec PYCONFIG_SPEC[] = { SPEC(dump_refs_file, WSTR_OPT, READ_ONLY, NO_SYS), #ifdef Py_GIL_DISABLED SPEC(enable_gil, INT, READ_ONLY, NO_SYS), + SPEC(thread_local_bytecode_limit, INT, READ_ONLY, NO_SYS), #endif SPEC(faulthandler, BOOL, READ_ONLY, NO_SYS), SPEC(filesystem_encoding, WSTR, READ_ONLY, NO_SYS), @@ -315,8 +316,14 @@ The following implementation-specific options are available:\n\ "\ -X showrefcount: output the total reference count and number of used\n\ memory blocks when the program finishes or after each statement in\n\ - the interactive interpreter; only works on debug builds\n\ --X tracemalloc[=N]: trace Python memory allocations; N sets a traceback limit\n\ + the interactive interpreter; only works on debug builds\n" #ifdef Py_GIL_DISABLED "-X thread_local_bc_limit=N: limit the total size of thread-local bytecode,\n\ per-interpreter, to N bytes. A value < 0 means unlimited. A value of\n\ 0 disables thread-local bytecode. 
Also PYTHON_THREAD_LOCAL_BC_LIMIT\n" #endif "\ -X tracemalloc[=N]: trace Python memory allocations; N sets a traceback limit\n \ of N frames (default: 1); also PYTHONTRACEMALLOC=N\n\ -X utf8[=0|1]: enable (1) or disable (0) UTF-8 mode; also PYTHONUTF8\n\ -X warn_default_encoding: enable opt-in EncodingWarning for 'encoding=None';\n\ @@ -400,6 +407,10 @@ static const char usage_envvars[] = #ifdef Py_STATS "PYTHONSTATS : turns on statistics gathering (-X pystats)\n" #endif +#ifdef Py_GIL_DISABLED +"PYTHON_THREAD_LOCAL_BC_LIMIT: limit the total size of thread-local bytecode\n" +" (-X thread_local_bc_limit)\n" +#endif "PYTHONTRACEMALLOC: trace Python memory allocations (-X tracemalloc)\n" "PYTHONUNBUFFERED: disable stdout/stderr buffering (-u)\n" "PYTHONUTF8 : control the UTF-8 mode (-X utf8)\n" @@ -979,6 +990,8 @@ _PyConfig_InitCompatConfig(PyConfig *config) config->cpu_count = -1; #ifdef Py_GIL_DISABLED config->enable_gil = _PyConfig_GIL_DEFAULT; + // 100 MiB + config->thread_local_bytecode_limit = 100 * (1 << 20); #endif } @@ -1862,6 +1875,34 @@ config_init_cpu_count(PyConfig *config) "n must be greater than 0"); } +static PyStatus +config_init_thread_local_bytecode_limit(PyConfig *config) +{ +#ifdef Py_GIL_DISABLED + const char *env = config_get_env(config, "PYTHON_THREAD_LOCAL_BC_LIMIT"); + if (env) { + int limit = -1; + if (_Py_str_to_int(env, &limit) < 0) { + return _PyStatus_ERR( + "PYTHON_THREAD_LOCAL_BC_LIMIT=N: N is missing or invalid"); + } + config->thread_local_bytecode_limit = limit; + } + + const wchar_t *xoption = config_get_xoption(config, L"thread_local_bc_limit"); + if (xoption) { + int limit = -1; + const wchar_t *sep = wcschr(xoption, L'='); + if (!sep || (config_wstr_to_int(sep + 1, &limit) < 0)) { + return _PyStatus_ERR( + "-X thread_local_bc_limit=n: n is missing or invalid"); + } + config->thread_local_bytecode_limit = limit; + } +#endif + return _PyStatus_OK(); +} + static PyStatus config_init_perf_profiling(PyConfig *config) { @@ -2111,6 
+2152,11 @@ config_read_complex_options(PyConfig *config) } #endif + status = config_init_thread_local_bytecode_limit(config); + if (_PyStatus_EXCEPTION(status)) { + return status; + } + return _PyStatus_OK(); } diff --git a/Python/pylifecycle.c b/Python/pylifecycle.c index 27faf723745c21..d1090b8570e970 100644 --- a/Python/pylifecycle.c +++ b/Python/pylifecycle.c @@ -907,6 +907,10 @@ pycore_interp_init(PyThreadState *tstate) goto done; } +#ifdef Py_GIL_DISABLED + _PyCode_InitState(interp); +#endif + done: /* sys.modules['sys'] contains a strong reference to the module */ Py_XDECREF(sysmod); diff --git a/Python/pystate.c b/Python/pystate.c index afc63cdd9deb2b..3a92af581a695e 100644 --- a/Python/pystate.c +++ b/Python/pystate.c @@ -644,6 +644,7 @@ init_interpreter(PyInterpreterState *interp, _PyType_InitCache(interp); #ifdef Py_GIL_DISABLED _Py_brc_init_state(interp); + _PyCode_InitState(interp); #endif llist_init(&interp->mem_free_queue.head); for (int i = 0; i < _PY_MONITORING_UNGROUPED_EVENTS; i++) { From d5476b92a41190edb5a11243d4f716e832505bd8 Mon Sep 17 00:00:00 2001 From: Matt Page Date: Sat, 7 Sep 2024 18:26:42 -0700 Subject: [PATCH 12/67] Fix a few data races when (de)instrumenting opcodes - Fix a few places where we were not using atomics to (de)instrument opcodes. - Fix a few places where we weren't using atomics to reset adaptive counters. - Remove some redundant non-atomic resets of adaptive counters that presumably snuck in as merge artifacts of https://github.com/python/cpython/pull/118064 and https://github.com/python/cpython/pull/117144 landing close together. 
--- Python/instrumentation.c | 15 ++++++++------- 1 file changed, 8 insertions(+), 7 deletions(-) diff --git a/Python/instrumentation.c b/Python/instrumentation.c index f8b77a2d369b7d..730ca5d23478c0 100644 --- a/Python/instrumentation.c +++ b/Python/instrumentation.c @@ -681,9 +681,10 @@ de_instrument_line(_Py_CODEUNIT *bytecode, _PyCoMonitoringData *monitoring, int } CHECK(original_opcode != 0); CHECK(original_opcode == _PyOpcode_Deopt[original_opcode]); - instr->op.code = original_opcode; + FT_ATOMIC_STORE_UINT8(instr->op.code, original_opcode); if (_PyOpcode_Caches[original_opcode]) { - instr[1].counter = adaptive_counter_warmup(); + FT_ATOMIC_STORE_UINT16_RELAXED(instr[1].counter.as_counter, + adaptive_counter_warmup().as_counter); } assert(instr->op.code != INSTRUMENTED_LINE); } @@ -705,9 +706,10 @@ de_instrument_per_instruction(_Py_CODEUNIT *bytecode, int original_opcode = monitoring->per_instruction_opcodes[i]; CHECK(original_opcode != 0); CHECK(original_opcode == _PyOpcode_Deopt[original_opcode]); - *opcode_ptr = original_opcode; + FT_ATOMIC_STORE_UINT8_RELAXED(*opcode_ptr, original_opcode); if (_PyOpcode_Caches[original_opcode]) { - instr[1].counter = adaptive_counter_warmup(); + FT_ATOMIC_STORE_UINT16_RELAXED(instr[1].counter.as_counter, + adaptive_counter_warmup().as_counter); } assert(*opcode_ptr != INSTRUMENTED_INSTRUCTION); assert(instr->op.code != INSTRUMENTED_INSTRUCTION); @@ -740,7 +742,6 @@ instrument(_Py_CODEUNIT *bytecode, _PyCoMonitoringData *monitoring, int i) if (_PyOpcode_Caches[deopt]) { FT_ATOMIC_STORE_UINT16_RELAXED(instr[1].counter.as_counter, adaptive_counter_warmup().as_counter); - instr[1].counter = adaptive_counter_warmup(); } } } @@ -756,7 +757,7 @@ instrument_line(_Py_CODEUNIT *bytecode, _PyCoMonitoringData *monitoring, int i) _PyCoLineInstrumentationData *lines = &monitoring->lines[i]; lines->original_opcode = _PyOpcode_Deopt[opcode]; CHECK(lines->original_opcode > 0); - *opcode_ptr = INSTRUMENTED_LINE; + 
FT_ATOMIC_STORE_UINT8_RELAXED(*opcode_ptr, INSTRUMENTED_LINE); } static void @@ -786,7 +787,7 @@ instrument_per_instruction(_Py_CODEUNIT *bytecode, monitoring->per_instruction_opcodes[i] = _PyOpcode_Deopt[opcode]; } assert(monitoring->per_instruction_opcodes[i] > 0); - *opcode_ptr = INSTRUMENTED_INSTRUCTION; + FT_ATOMIC_STORE_UINT8_RELAXED(*opcode_ptr, INSTRUMENTED_INSTRUCTION); } static void From e3b367a8351b799b6ed812332d5e756906051b94 Mon Sep 17 00:00:00 2001 From: Matt Page Date: Sat, 7 Sep 2024 19:37:55 -0700 Subject: [PATCH 13/67] Make branch taken recording thread-safe --- Python/bytecodes.c | 20 +++++--------------- Python/ceval_macros.h | 12 ++++++++++++ Python/generated_cases.c.h | 28 +++++++--------------------- 3 files changed, 24 insertions(+), 36 deletions(-) diff --git a/Python/bytecodes.c b/Python/bytecodes.c index 7af6ca2f981eb3..4af555d545300a 100644 --- a/Python/bytecodes.c +++ b/Python/bytecodes.c @@ -2618,18 +2618,14 @@ dummy_func( replaced op(_POP_JUMP_IF_FALSE, (cond -- )) { assert(PyStackRef_BoolCheck(cond)); int flag = PyStackRef_Is(cond, PyStackRef_False); - #if ENABLE_SPECIALIZATION - this_instr[1].cache = (this_instr[1].cache << 1) | flag; - #endif + RECORD_BRANCH_TAKEN(this_instr[1].cache, flag); JUMPBY(oparg * flag); } replaced op(_POP_JUMP_IF_TRUE, (cond -- )) { assert(PyStackRef_BoolCheck(cond)); int flag = PyStackRef_Is(cond, PyStackRef_True); - #if ENABLE_SPECIALIZATION - this_instr[1].cache = (this_instr[1].cache << 1) | flag; - #endif + RECORD_BRANCH_TAKEN(this_instr[1].cache, flag); JUMPBY(oparg * flag); } @@ -4598,9 +4594,7 @@ dummy_func( assert(PyStackRef_BoolCheck(cond)); int flag = PyStackRef_Is(cond, PyStackRef_True); int offset = flag * oparg; - #if ENABLE_SPECIALIZATION - this_instr[1].cache = (this_instr[1].cache << 1) | flag; - #endif + RECORD_BRANCH_TAKEN(this_instr[1].cache, flag); INSTRUMENTED_JUMP(this_instr, next_instr + offset, PY_MONITORING_EVENT_BRANCH); } @@ -4609,9 +4603,7 @@ dummy_func( 
assert(PyStackRef_BoolCheck(cond)); int flag = PyStackRef_Is(cond, PyStackRef_False); int offset = flag * oparg; - #if ENABLE_SPECIALIZATION - this_instr[1].cache = (this_instr[1].cache << 1) | flag; - #endif + RECORD_BRANCH_TAKEN(this_instr[1].cache, flag); INSTRUMENTED_JUMP(this_instr, next_instr + offset, PY_MONITORING_EVENT_BRANCH); } @@ -4626,9 +4618,7 @@ dummy_func( PyStackRef_CLOSE(value_stackref); offset = 0; } - #if ENABLE_SPECIALIZATION - this_instr[1].cache = (this_instr[1].cache << 1) | flag; - #endif + RECORD_BRANCH_TAKEN(this_instr[1].cache, flag); INSTRUMENTED_JUMP(this_instr, next_instr + offset, PY_MONITORING_EVENT_BRANCH); } diff --git a/Python/ceval_macros.h b/Python/ceval_macros.h index eee009ee6d01f3..8bb9c9624383ae 100644 --- a/Python/ceval_macros.h +++ b/Python/ceval_macros.h @@ -310,6 +310,18 @@ GETITEM(PyObject *v, Py_ssize_t i) { (COUNTER) = pause_backoff_counter((COUNTER)); \ } while (0); +#ifdef ENABLE_SPECIALIZATION +/* Multiple threads may execute these concurrently if the thread-local bytecode + * limit is reached and they all execute the main copy of the bytecode. This is + * approximate, we do not need the RMW cycle to be atomic. 
+ */ +#define RECORD_BRANCH_TAKEN(bitset, flag) \ + FT_ATOMIC_STORE_UINT16_RELAXED(bitset, \ + (FT_ATOMIC_LOAD_UINT16_RELAXED(bitset) << 1) | (flag)) +#else +#define RECORD_BRANCH_TAKEN(bitset, flag) +#endif + #define UNBOUNDLOCAL_ERROR_MSG \ "cannot access local variable '%s' where it is not associated with a value" #define UNBOUNDFREE_ERROR_MSG \ diff --git a/Python/generated_cases.c.h b/Python/generated_cases.c.h index d439cb713402a3..8ab9bdd383326a 100644 --- a/Python/generated_cases.c.h +++ b/Python/generated_cases.c.h @@ -4414,9 +4414,7 @@ assert(PyStackRef_BoolCheck(cond)); int flag = PyStackRef_Is(cond, PyStackRef_False); int offset = flag * oparg; - #if ENABLE_SPECIALIZATION - this_instr[1].cache = (this_instr[1].cache << 1) | flag; - #endif + RECORD_BRANCH_TAKEN(this_instr[1].cache, flag); INSTRUMENTED_JUMP(this_instr, next_instr + offset, PY_MONITORING_EVENT_BRANCH); DISPATCH(); } @@ -4437,9 +4435,7 @@ PyStackRef_CLOSE(value_stackref); offset = 0; } - #if ENABLE_SPECIALIZATION - this_instr[1].cache = (this_instr[1].cache << 1) | flag; - #endif + RECORD_BRANCH_TAKEN(this_instr[1].cache, flag); INSTRUMENTED_JUMP(this_instr, next_instr + offset, PY_MONITORING_EVENT_BRANCH); DISPATCH(); } @@ -4477,9 +4473,7 @@ assert(PyStackRef_BoolCheck(cond)); int flag = PyStackRef_Is(cond, PyStackRef_True); int offset = flag * oparg; - #if ENABLE_SPECIALIZATION - this_instr[1].cache = (this_instr[1].cache << 1) | flag; - #endif + RECORD_BRANCH_TAKEN(this_instr[1].cache, flag); INSTRUMENTED_JUMP(this_instr, next_instr + offset, PY_MONITORING_EVENT_BRANCH); DISPATCH(); } @@ -6217,9 +6211,7 @@ cond = stack_pointer[-1]; assert(PyStackRef_BoolCheck(cond)); int flag = PyStackRef_Is(cond, PyStackRef_False); - #if ENABLE_SPECIALIZATION - this_instr[1].cache = (this_instr[1].cache << 1) | flag; - #endif + RECORD_BRANCH_TAKEN(this_instr[1].cache, flag); JUMPBY(oparg * flag); stack_pointer += -1; assert(WITHIN_STACK_BOUNDS()); @@ -6251,9 +6243,7 @@ { 
assert(PyStackRef_BoolCheck(cond)); int flag = PyStackRef_Is(cond, PyStackRef_True); - #if ENABLE_SPECIALIZATION - this_instr[1].cache = (this_instr[1].cache << 1) | flag; - #endif + RECORD_BRANCH_TAKEN(this_instr[1].cache, flag); JUMPBY(oparg * flag); } stack_pointer += -1; @@ -6286,9 +6276,7 @@ { assert(PyStackRef_BoolCheck(cond)); int flag = PyStackRef_Is(cond, PyStackRef_False); - #if ENABLE_SPECIALIZATION - this_instr[1].cache = (this_instr[1].cache << 1) | flag; - #endif + RECORD_BRANCH_TAKEN(this_instr[1].cache, flag); JUMPBY(oparg * flag); } stack_pointer += -1; @@ -6306,9 +6294,7 @@ cond = stack_pointer[-1]; assert(PyStackRef_BoolCheck(cond)); int flag = PyStackRef_Is(cond, PyStackRef_True); - #if ENABLE_SPECIALIZATION - this_instr[1].cache = (this_instr[1].cache << 1) | flag; - #endif + RECORD_BRANCH_TAKEN(this_instr[1].cache, flag); JUMPBY(oparg * flag); stack_pointer += -1; assert(WITHIN_STACK_BOUNDS()); From b2375bf9678dc4b362f9271968b57680f4aff962 Mon Sep 17 00:00:00 2001 From: Matt Page Date: Mon, 9 Sep 2024 14:30:37 -0700 Subject: [PATCH 14/67] Lock thread-local bytecode when specializing --- Include/internal/pycore_code.h | 6 +++++- Objects/codeobject.c | 26 ++++++++++++++++++++++++++ Python/bytecodes.c | 3 ++- Python/generated_cases.c.h | 3 ++- Python/specialize.c | 18 ++++++++++++++++-- 5 files changed, 51 insertions(+), 5 deletions(-) diff --git a/Include/internal/pycore_code.h b/Include/internal/pycore_code.h index 1eddf1f0320179..aa8bcd413ee737 100644 --- a/Include/internal/pycore_code.h +++ b/Include/internal/pycore_code.h @@ -369,7 +369,8 @@ extern void _Py_Specialize_Call(_PyStackRef callable, _Py_CODEUNIT *instr, int nargs); extern void _Py_Specialize_CallKw(_PyStackRef callable, _Py_CODEUNIT *instr, int nargs); -extern void _Py_Specialize_BinaryOp(_PyStackRef lhs, _PyStackRef rhs, _Py_CODEUNIT *instr, +extern void _Py_Specialize_BinaryOp(PyCodeObject *code, _PyStackRef lhs, + _PyStackRef rhs, _Py_CODEUNIT *instr, int oparg, _PyStackRef 
*locals); extern void _Py_Specialize_CompareOp(_PyStackRef lhs, _PyStackRef rhs, _Py_CODEUNIT *instr, int oparg); @@ -649,6 +650,9 @@ _PyCode_GetExecutableCode(PyCodeObject *co) } return _PyCode_GetExecutableCodeSlow(co); } + +extern void _PyCode_LockTLBC(PyCodeObject *co); +extern void _PyCode_UnlockTLBC(PyCodeObject *co); extern int _Py_ReserveSpecializedCodeIndex(PyInterpreterState *interp); extern void _Py_ClearSpecializedCodeIndex(_PyThreadStateImpl *tstate); #endif diff --git a/Objects/codeobject.c b/Objects/codeobject.c index 0b88db2b5a5fa9..14ae9f2a29616e 100644 --- a/Objects/codeobject.c +++ b/Objects/codeobject.c @@ -2861,4 +2861,30 @@ _PyCode_GetExecutableCodeSlow(PyCodeObject *co) return result; } +static inline _PyMutBytecode * +get_tlbc(PyCodeObject *co) +{ + _PyCodeArray *code = _Py_atomic_load_ptr_acquire(&co->co_specialized_code); + _PyThreadStateImpl *tstate = (_PyThreadStateImpl *) PyThreadState_GET(); + Py_ssize_t idx = tstate->specialized_code_index; + assert(idx >= 0 && idx < code->size); + return code->entries[idx]; +} + +void +_PyCode_LockTLBC(PyCodeObject *co) +{ + _PyMutBytecode *tlbc = get_tlbc(co); + assert(tlbc != NULL); + PyMutex_LockFlags(&tlbc->mutex, _PY_LOCK_DETACH); +} + +void +_PyCode_UnlockTLBC(PyCodeObject *co) +{ + _PyMutBytecode *tlbc = get_tlbc(co); + assert(tlbc != NULL); + PyMutex_Unlock(&tlbc->mutex); +} + #endif diff --git a/Python/bytecodes.c b/Python/bytecodes.c index 4af555d545300a..b8db346c7aaa61 100644 --- a/Python/bytecodes.c +++ b/Python/bytecodes.c @@ -4505,7 +4505,8 @@ dummy_func( #if ENABLE_SPECIALIZED_BINARY_OP if (ADAPTIVE_COUNTER_TRIGGERS(counter)) { next_instr = this_instr; - _Py_Specialize_BinaryOp(lhs, rhs, next_instr, oparg, LOCALS_ARRAY); + _Py_Specialize_BinaryOp(_PyFrame_GetCode(frame), lhs, rhs, + next_instr, oparg, LOCALS_ARRAY); DISPATCH_SAME_OPARG(); } OPCODE_DEFERRED_INC(BINARY_OP); diff --git a/Python/generated_cases.c.h b/Python/generated_cases.c.h index 8ab9bdd383326a..775ab548b976e0 100644 
--- a/Python/generated_cases.c.h +++ b/Python/generated_cases.c.h @@ -28,7 +28,8 @@ #if ENABLE_SPECIALIZED_BINARY_OP if (ADAPTIVE_COUNTER_TRIGGERS(counter)) { next_instr = this_instr; - _Py_Specialize_BinaryOp(lhs, rhs, next_instr, oparg, LOCALS_ARRAY); + _Py_Specialize_BinaryOp(_PyFrame_GetCode(frame), lhs, rhs, + next_instr, oparg, LOCALS_ARRAY); DISPATCH_SAME_OPARG(); } OPCODE_DEFERRED_INC(BINARY_OP); diff --git a/Python/specialize.c b/Python/specialize.c index f263c3b4aa9188..972adbcdadf3c2 100644 --- a/Python/specialize.c +++ b/Python/specialize.c @@ -26,8 +26,19 @@ extern const char *_PyUOpName(int index); #ifdef Py_GIL_DISABLED #define SET_OPCODE(instr, opcode) _Py_atomic_store_uint8_relaxed(&(instr)->op.code, (opcode)) +#define LOCK_TLBC_RETURN_IF_INSTRUMENTED(code, instr) \ + do { \ + _PyCode_LockTLBC(code); \ + if ((instr)->op.code >= MIN_INSTRUMENTED_OPCODE) { \ + _PyCode_UnlockTLBC(code); \ + return; \ + } \ + } while (0) +#define UNLOCK_TLBC(code) _PyCode_UnlockTLBC(code) #else #define SET_OPCODE(instr, opcode) (instr)->op.code = (opcode) +#define LOCK_TLBC_RETURN_IF_INSTRUMENTED(code, instr) (void) (code) +#define UNLOCK_TLBC(code) (void) (code) #endif @@ -2244,13 +2255,14 @@ binary_op_fail_kind(int oparg, PyObject *lhs, PyObject *rhs) #endif // Py_STATS void -_Py_Specialize_BinaryOp(_PyStackRef lhs_st, _PyStackRef rhs_st, _Py_CODEUNIT *instr, - int oparg, _PyStackRef *locals) +_Py_Specialize_BinaryOp(PyCodeObject *code, _PyStackRef lhs_st, _PyStackRef rhs_st, + _Py_CODEUNIT *instr, int oparg, _PyStackRef *locals) { PyObject *lhs = PyStackRef_AsPyObjectBorrow(lhs_st); PyObject *rhs = PyStackRef_AsPyObjectBorrow(rhs_st); assert(ENABLE_SPECIALIZED_BINARY_OP); assert(_PyOpcode_Caches[BINARY_OP] == INLINE_CACHE_ENTRIES_BINARY_OP); + LOCK_TLBC_RETURN_IF_INSTRUMENTED(code, instr); _PyBinaryOpCache *cache = (_PyBinaryOpCache *)(instr + 1); switch (oparg) { case NB_ADD: @@ -2310,10 +2322,12 @@ _Py_Specialize_BinaryOp(_PyStackRef lhs_st, _PyStackRef rhs_st, 
_Py_CODEUNIT *in STAT_INC(BINARY_OP, failure); instr->op.code = BINARY_OP; cache->counter = adaptive_counter_backoff(cache->counter); + UNLOCK_TLBC(code); return; success: STAT_INC(BINARY_OP, success); cache->counter = adaptive_counter_cooldown(); + UNLOCK_TLBC(code); } From 2707f8e5d854b63f886472813bbab04ea4a55ee0 Mon Sep 17 00:00:00 2001 From: Matt Page Date: Mon, 9 Sep 2024 14:35:33 -0700 Subject: [PATCH 15/67] Load bytecode on RESUME_CHECK --- Include/internal/pycore_opcode_metadata.h | 3 +- Include/internal/pycore_uop_ids.h | 48 +++++++++++------------ Python/bytecodes.c | 6 ++- Python/generated_cases.c.h | 34 +++++++++++----- 4 files changed, 55 insertions(+), 36 deletions(-) diff --git a/Include/internal/pycore_opcode_metadata.h b/Include/internal/pycore_opcode_metadata.h index 51479afae3833d..9dd945956eeb3a 100644 --- a/Include/internal/pycore_opcode_metadata.h +++ b/Include/internal/pycore_opcode_metadata.h @@ -1180,7 +1180,7 @@ const struct opcode_metadata _PyOpcode_opcode_metadata[264] = { [RERAISE] = { true, INSTR_FMT_IB, HAS_ARG_FLAG | HAS_ERROR_FLAG | HAS_ERROR_NO_POP_FLAG | HAS_ESCAPES_FLAG }, [RESERVED] = { true, INSTR_FMT_IX, 0 }, [RESUME] = { true, INSTR_FMT_IB, HAS_ARG_FLAG | HAS_EVAL_BREAK_FLAG | HAS_ERROR_FLAG | HAS_ERROR_NO_POP_FLAG | HAS_ESCAPES_FLAG }, - [RESUME_CHECK] = { true, INSTR_FMT_IX, HAS_DEOPT_FLAG }, + [RESUME_CHECK] = { true, INSTR_FMT_IX, HAS_DEOPT_FLAG | HAS_ESCAPES_FLAG }, [RETURN_CONST] = { true, INSTR_FMT_IB, HAS_ARG_FLAG | HAS_CONST_FLAG }, [RETURN_GENERATOR] = { true, INSTR_FMT_IX, HAS_ERROR_FLAG | HAS_ERROR_NO_POP_FLAG | HAS_ESCAPES_FLAG }, [RETURN_VALUE] = { true, INSTR_FMT_IX, 0 }, @@ -1378,7 +1378,6 @@ _PyOpcode_macro_expansion[256] = { [POP_TOP] = { .nuops = 1, .uops = { { _POP_TOP, 0, 0 } } }, [PUSH_EXC_INFO] = { .nuops = 1, .uops = { { _PUSH_EXC_INFO, 0, 0 } } }, [PUSH_NULL] = { .nuops = 1, .uops = { { _PUSH_NULL, 0, 0 } } }, - [RESUME_CHECK] = { .nuops = 1, .uops = { { _RESUME_CHECK, 0, 0 } } }, [RETURN_CONST] = { 
.nuops = 2, .uops = { { _LOAD_CONST, 0, 0 }, { _RETURN_VALUE, 0, 0 } } }, [RETURN_GENERATOR] = { .nuops = 1, .uops = { { _RETURN_GENERATOR, 0, 0 } } }, [RETURN_VALUE] = { .nuops = 1, .uops = { { _RETURN_VALUE, 0, 0 } } }, diff --git a/Include/internal/pycore_uop_ids.h b/Include/internal/pycore_uop_ids.h index 19582d85e5dd25..736a91f32d8a0b 100644 --- a/Include/internal/pycore_uop_ids.h +++ b/Include/internal/pycore_uop_ids.h @@ -245,42 +245,42 @@ extern "C" { #define _PY_FRAME_KW 449 #define _QUICKEN_RESUME 450 #define _REPLACE_WITH_TRUE 451 -#define _RESUME_CHECK RESUME_CHECK +#define _RESUME_CHECK 452 #define _RETURN_GENERATOR RETURN_GENERATOR #define _RETURN_VALUE RETURN_VALUE -#define _SAVE_RETURN_OFFSET 452 -#define _SEND 453 -#define _SEND_GEN_FRAME 454 +#define _SAVE_RETURN_OFFSET 453 +#define _SEND 454 +#define _SEND_GEN_FRAME 455 #define _SETUP_ANNOTATIONS SETUP_ANNOTATIONS #define _SET_ADD SET_ADD #define _SET_FUNCTION_ATTRIBUTE SET_FUNCTION_ATTRIBUTE #define _SET_UPDATE SET_UPDATE -#define _START_EXECUTOR 455 -#define _STORE_ATTR 456 -#define _STORE_ATTR_INSTANCE_VALUE 457 -#define _STORE_ATTR_SLOT 458 -#define _STORE_ATTR_WITH_HINT 459 +#define _START_EXECUTOR 456 +#define _STORE_ATTR 457 +#define _STORE_ATTR_INSTANCE_VALUE 458 +#define _STORE_ATTR_SLOT 459 +#define _STORE_ATTR_WITH_HINT 460 #define _STORE_DEREF STORE_DEREF -#define _STORE_FAST 460 -#define _STORE_FAST_0 461 -#define _STORE_FAST_1 462 -#define _STORE_FAST_2 463 -#define _STORE_FAST_3 464 -#define _STORE_FAST_4 465 -#define _STORE_FAST_5 466 -#define _STORE_FAST_6 467 -#define _STORE_FAST_7 468 +#define _STORE_FAST 461 +#define _STORE_FAST_0 462 +#define _STORE_FAST_1 463 +#define _STORE_FAST_2 464 +#define _STORE_FAST_3 465 +#define _STORE_FAST_4 466 +#define _STORE_FAST_5 467 +#define _STORE_FAST_6 468 +#define _STORE_FAST_7 469 #define _STORE_FAST_LOAD_FAST STORE_FAST_LOAD_FAST #define _STORE_FAST_STORE_FAST STORE_FAST_STORE_FAST #define _STORE_GLOBAL STORE_GLOBAL #define _STORE_NAME 
STORE_NAME -#define _STORE_SLICE 469 -#define _STORE_SUBSCR 470 +#define _STORE_SLICE 470 +#define _STORE_SUBSCR 471 #define _STORE_SUBSCR_DICT STORE_SUBSCR_DICT #define _STORE_SUBSCR_LIST_INT STORE_SUBSCR_LIST_INT #define _SWAP SWAP -#define _TIER2_RESUME_CHECK 471 -#define _TO_BOOL 472 +#define _TIER2_RESUME_CHECK 472 +#define _TO_BOOL 473 #define _TO_BOOL_BOOL TO_BOOL_BOOL #define _TO_BOOL_INT TO_BOOL_INT #define _TO_BOOL_LIST TO_BOOL_LIST @@ -290,14 +290,14 @@ extern "C" { #define _UNARY_NEGATIVE UNARY_NEGATIVE #define _UNARY_NOT UNARY_NOT #define _UNPACK_EX UNPACK_EX -#define _UNPACK_SEQUENCE 473 +#define _UNPACK_SEQUENCE 474 #define _UNPACK_SEQUENCE_LIST UNPACK_SEQUENCE_LIST #define _UNPACK_SEQUENCE_TUPLE UNPACK_SEQUENCE_TUPLE #define _UNPACK_SEQUENCE_TWO_TUPLE UNPACK_SEQUENCE_TWO_TUPLE #define _WITH_EXCEPT_START WITH_EXCEPT_START #define _YIELD_VALUE YIELD_VALUE #define __DO_CALL_FUNCTION_EX _DO_CALL_FUNCTION_EX -#define MAX_UOP_ID 473 +#define MAX_UOP_ID 474 #ifdef __cplusplus } diff --git a/Python/bytecodes.c b/Python/bytecodes.c index b8db346c7aaa61..83b8a159ab59da 100644 --- a/Python/bytecodes.c +++ b/Python/bytecodes.c @@ -210,7 +210,7 @@ dummy_func( _QUICKEN_RESUME + _CHECK_PERIODIC_IF_NOT_YIELD_FROM; - inst(RESUME_CHECK, (--)) { + op(_RESUME_CHECK, (--)) { #if defined(__EMSCRIPTEN__) DEOPT_IF(_Py_emscripten_signal_clock == 0); _Py_emscripten_signal_clock -= Py_EMSCRIPTEN_SIGNAL_HANDLING; @@ -221,6 +221,10 @@ dummy_func( DEOPT_IF(eval_breaker != version); } + macro(RESUME_CHECK) = + _LOAD_BYTECODE + + _RESUME_CHECK; + op(_MONITOR_RESUME, (--)) { _PyFrame_SetStackPointer(frame, stack_pointer); int err = _Py_call_instrumentation( diff --git a/Python/generated_cases.c.h b/Python/generated_cases.c.h index 775ab548b976e0..41555eff5d7abf 100644 --- a/Python/generated_cases.c.h +++ b/Python/generated_cases.c.h @@ -6479,18 +6479,34 @@ } TARGET(RESUME_CHECK) { - frame->instr_ptr = next_instr; + _Py_CODEUNIT *this_instr = frame->instr_ptr = next_instr; 
next_instr += 1; INSTRUCTION_STATS(RESUME_CHECK); static_assert(0 == 0, "incorrect cache size"); - #if defined(__EMSCRIPTEN__) - DEOPT_IF(_Py_emscripten_signal_clock == 0, RESUME); - _Py_emscripten_signal_clock -= Py_EMSCRIPTEN_SIGNAL_HANDLING; - #endif - uintptr_t eval_breaker = _Py_atomic_load_uintptr_relaxed(&tstate->eval_breaker); - uintptr_t version = FT_ATOMIC_LOAD_UINTPTR_ACQUIRE(_PyFrame_GetCode(frame)->_co_instrumentation_version); - assert((version & _PY_EVAL_EVENTS_MASK) == 0); - DEOPT_IF(eval_breaker != version, RESUME); + // _LOAD_BYTECODE + { + #ifdef Py_GIL_DISABLED + _Py_CODEUNIT *bytecode = _PyCode_GetExecutableCode(_PyFrame_GetCode(frame)); + if (frame->bytecode != bytecode) { + int off = this_instr - frame->bytecode; + frame->bytecode = bytecode; + frame->instr_ptr = frame->bytecode + off; + this_instr = frame->instr_ptr; + next_instr = frame->instr_ptr + 1; + } + #endif + } + // _RESUME_CHECK + { + #if defined(__EMSCRIPTEN__) + DEOPT_IF(_Py_emscripten_signal_clock == 0, RESUME); + _Py_emscripten_signal_clock -= Py_EMSCRIPTEN_SIGNAL_HANDLING; + #endif + uintptr_t eval_breaker = _Py_atomic_load_uintptr_relaxed(&tstate->eval_breaker); + uintptr_t version = FT_ATOMIC_LOAD_UINTPTR_ACQUIRE(_PyFrame_GetCode(frame)->_co_instrumentation_version); + assert((version & _PY_EVAL_EVENTS_MASK) == 0); + DEOPT_IF(eval_breaker != version, RESUME); + } DISPATCH(); } From 3fdcb288e56e45b6681ffdf85fb095a58207fc49 Mon Sep 17 00:00:00 2001 From: Matt Page Date: Mon, 9 Sep 2024 15:01:55 -0700 Subject: [PATCH 16/67] Load tlbc on generator.throw() --- Python/ceval.c | 10 +++++++++- 1 file changed, 9 insertions(+), 1 deletion(-) diff --git a/Python/ceval.c b/Python/ceval.c index 9d2fe891c15a20..5407347a39391d 100644 --- a/Python/ceval.c +++ b/Python/ceval.c @@ -808,7 +808,15 @@ _PyEval_EvalFrameDefault(PyThreadState *tstate, _PyInterpreterFrame *frame, int } /* Because this avoids the RESUME, * we need to update instrumentation */ - /* TODO(mpage) - Need to handle this */ 
+#ifdef Py_GIL_DISABLED + /* Load thread-local bytecode */ + _Py_CODEUNIT *bytecode = _PyCode_GetExecutableCode(_PyFrame_GetCode(frame)); + if (frame->bytecode != bytecode) { + int off = frame->instr_ptr - frame->bytecode; + frame->bytecode = bytecode; + frame->instr_ptr = frame->bytecode + off; + } +#endif _Py_Instrument(_PyFrame_GetCode(frame), tstate->interp); monitor_throw(tstate, frame, frame->instr_ptr); /* TO DO -- Monitor throw entry. */ From 4a55ce5924d2dd9c193325497413d3143f116b42 Mon Sep 17 00:00:00 2001 From: Matt Page Date: Mon, 9 Sep 2024 15:21:27 -0700 Subject: [PATCH 17/67] Use tlbc instead of thread_local_bytecode --- Include/cpython/initconfig.h | 2 +- Include/internal/pycore_interp.h | 6 +++--- Lib/test/test_cmd_line.py | 20 +++++++++--------- Lib/test/test_embed.py | 2 +- Objects/codeobject.c | 36 ++++++++++++++++---------------- Python/initconfig.c | 28 ++++++++++++------------- 6 files changed, 47 insertions(+), 47 deletions(-) diff --git a/Include/cpython/initconfig.h b/Include/cpython/initconfig.h index f2ba2273a857b2..8b4ad95ed9f89c 100644 --- a/Include/cpython/initconfig.h +++ b/Include/cpython/initconfig.h @@ -183,7 +183,7 @@ typedef struct PyConfig { int cpu_count; #ifdef Py_GIL_DISABLED int enable_gil; - int thread_local_bytecode_limit; + int tlbc_limit; #endif /* --- Path configuration inputs ------------ */ diff --git a/Include/internal/pycore_interp.h b/Include/internal/pycore_interp.h index 4f6ed2c3f5b040..3935d9b885238f 100644 --- a/Include/internal/pycore_interp.h +++ b/Include/internal/pycore_interp.h @@ -226,10 +226,10 @@ struct _is { PyMutex weakref_locks[NUM_WEAKREF_LIST_LOCKS]; _PyIndexPool specialized_code_indices; // Number of bytes available for thread-local bytecode, counts down to zero - Py_ssize_t thread_local_bytecode_avail; - PyMutex thread_local_bytecode_avail_mutex; + Py_ssize_t tlbc_avail; + PyMutex tlbc_avail_mutex; // This is monotonic; once true it will remain true - bool new_thread_local_bytecode_disabled; + 
bool new_tlbc_disabled; #endif // Per-interpreter state for the obmalloc allocator. For the main diff --git a/Lib/test/test_cmd_line.py b/Lib/test/test_cmd_line.py index c2210d60a810b1..0ee81126ca6725 100644 --- a/Lib/test/test_cmd_line.py +++ b/Lib/test/test_cmd_line.py @@ -1070,7 +1070,7 @@ def res2int(self, res): return tuple(int(i) for i in out.split()) @unittest.skipUnless(support.Py_GIL_DISABLED, - "PYTHON_THREAD_LOCAL_BC_LIMIT and -X thread_local_bc_limit" + "PYTHON_TLBC_LIMIT and -X tlbc_limit" " only supported in Py_GIL_DISABLED builds") @threading_helper.requires_working_threading() def test_set_thread_local_bytecode_limit(self): @@ -1081,21 +1081,21 @@ def test(x, y): t = threading.Thread(target=test, args=(1,2)) t.start() t.join()""" - rc, out, err = assert_python_ok("-W", "always", "-X", "thread_local_bc_limit=1", "-c", code) + rc, out, err = assert_python_ok("-W", "always", "-X", "tlbc_limit=1", "-c", code) self.assertIn(b"Reached memory limit for thread-local bytecode", err) - rc, out, err = assert_python_ok("-W", "always", "-c", code, PYTHON_THREAD_LOCAL_BC_LIMIT="1") + rc, out, err = assert_python_ok("-W", "always", "-c", code, PYTHON_TLBC_LIMIT="1") self.assertIn(b"Reached memory limit for thread-local bytecode", err) @unittest.skipUnless(support.Py_GIL_DISABLED, - "PYTHON_THREAD_LOCAL_BC_LIMIT and -X thread_local_bc_limit" + "PYTHON_TLBC_LIMIT and -X tlbc_limit" " only supported in Py_GIL_DISABLED builds") def test_invalid_thread_local_bytecode_limit(self): - rc, out, err = assert_python_failure("-X", "thread_local_bc_limit") - self.assertIn(b"thread_local_bc_limit=n: n is missing or invalid", err) - rc, out, err = assert_python_failure("-X", "thread_local_bc_limit=foo") - self.assertIn(b"thread_local_bc_limit=n: n is missing or invalid", err) - rc, out, err = assert_python_failure(PYTHON_THREAD_LOCAL_BC_LIMIT="foo") - self.assertIn(b"PYTHON_THREAD_LOCAL_BC_LIMIT=N: N is missing or invalid", err) + rc, out, err = assert_python_failure("-X", 
"tlbc_limit") + self.assertIn(b"tlbc_limit=n: n is missing or invalid", err) + rc, out, err = assert_python_failure("-X", "tlbc_limit=foo") + self.assertIn(b"tlbc_limit=n: n is missing or invalid", err) + rc, out, err = assert_python_failure(PYTHON_TLBC_LIMIT="foo") + self.assertIn(b"PYTHON_TLBC_LIMIT=N: N is missing or invalid", err) @unittest.skipIf(interpreter_requires_environment(), diff --git a/Lib/test/test_embed.py b/Lib/test/test_embed.py index e6fddfdc5e2582..385e08bed10cd3 100644 --- a/Lib/test/test_embed.py +++ b/Lib/test/test_embed.py @@ -631,7 +631,7 @@ class InitConfigTests(EmbeddingTestsMixin, unittest.TestCase): CONFIG_COMPAT['run_presite'] = None if support.Py_GIL_DISABLED: CONFIG_COMPAT['enable_gil'] = -1 - CONFIG_COMPAT['thread_local_bytecode_limit'] = GET_DEFAULT_CONFIG + CONFIG_COMPAT['tlbc_limit'] = GET_DEFAULT_CONFIG if MS_WINDOWS: CONFIG_COMPAT.update({ 'legacy_windows_stdio': False, diff --git a/Objects/codeobject.c b/Objects/codeobject.c index 14ae9f2a29616e..52b6fb6512c051 100644 --- a/Objects/codeobject.c +++ b/Objects/codeobject.c @@ -537,7 +537,7 @@ init_code(PyCodeObject *co, struct _PyCodeConstructor *con) } co->_co_firsttraceable = entry_point; #ifdef Py_GIL_DISABLED - if (interp->new_thread_local_bytecode_disabled) { + if (interp->new_tlbc_disabled) { _PyCode_DisableSpecialization(_PyCode_CODE(co), Py_SIZE(co)); } else { @@ -2689,9 +2689,9 @@ _PyCode_Fini(PyInterpreterState *interp) void _PyCode_InitState(PyInterpreterState *interp) { - int limit = interp->config.thread_local_bytecode_limit; - interp->thread_local_bytecode_avail = limit; - interp->new_thread_local_bytecode_disabled = limit == 0; + int limit = interp->config.tlbc_limit; + interp->tlbc_avail = limit; + interp->new_tlbc_disabled = limit == 0; } static _PyCodeArray * @@ -2765,15 +2765,15 @@ reserve_bytes_for_specialized_code(PyCodeObject *co) PyInterpreterState *interp = _PyInterpreterState_GET(); Py_ssize_t nbytes_reserved = -1; Py_ssize_t code_size = 
_PyCode_NBYTES(co); - PyMutex_LockFlags(&interp->thread_local_bytecode_avail_mutex, _Py_LOCK_DONT_DETACH); - if (interp->thread_local_bytecode_avail < 0) { + PyMutex_LockFlags(&interp->tlbc_avail_mutex, _Py_LOCK_DONT_DETACH); + if (interp->tlbc_avail < 0) { nbytes_reserved = code_size; } - else if (interp->thread_local_bytecode_avail >= code_size) { - interp->thread_local_bytecode_avail -= code_size; + else if (interp->tlbc_avail >= code_size) { + interp->tlbc_avail -= code_size; nbytes_reserved = code_size; } - PyMutex_Unlock(&interp->thread_local_bytecode_avail_mutex); + PyMutex_Unlock(&interp->tlbc_avail_mutex); return nbytes_reserved; } @@ -2785,11 +2785,11 @@ release_bytes_for_specialized_code(Py_ssize_t nbytes) return; } PyInterpreterState *interp = _PyInterpreterState_GET(); - PyMutex_LockFlags(&interp->thread_local_bytecode_avail_mutex, _Py_LOCK_DONT_DETACH); - if (interp->thread_local_bytecode_avail >= 0) { - interp->thread_local_bytecode_avail += nbytes; + PyMutex_LockFlags(&interp->tlbc_avail_mutex, _Py_LOCK_DONT_DETACH); + if (interp->tlbc_avail >= 0) { + interp->tlbc_avail += nbytes; } - PyMutex_Unlock(&interp->thread_local_bytecode_avail_mutex); + PyMutex_Unlock(&interp->tlbc_avail_mutex); } static int @@ -2804,10 +2804,10 @@ disable_specialization(PyObject *obj, void*) } static void -disable_new_thread_local_bytecode(void) +disable_new_tlbc(void) { PyInterpreterState *interp = _PyInterpreterState_GET(); - if (interp->new_thread_local_bytecode_disabled) { + if (interp->new_tlbc_disabled) { return; } // Disable creation of new thread-local copies of bytecode. We disable @@ -2818,7 +2818,7 @@ disable_new_thread_local_bytecode(void) // main copy), but any attempts to create new copies of bytecode will fail, // and the main, unspecializable copy will be used. 
_PyEval_StopTheWorld(interp); - interp->new_thread_local_bytecode_disabled = true; + interp->new_tlbc_disabled = true; _PyEval_StartTheWorld(interp); PyUnstable_GC_VisitObjects(disable_specialization, NULL); if (PyErr_WarnEx(PyExc_ResourceWarning, "Reached memory limit for thread-local bytecode", 1) < 0) { @@ -2837,7 +2837,7 @@ get_executable_code_lock_held(PyCodeObject *co) } Py_ssize_t reserved = reserve_bytes_for_specialized_code(co); if (reserved == -1) { - disable_new_thread_local_bytecode(); + disable_new_tlbc(); return (_Py_CODEUNIT *) spec_code->entries[0]->bytecode; } _Py_CODEUNIT *result = create_specializable_code_lock_held(co, idx); @@ -2851,7 +2851,7 @@ _Py_CODEUNIT * _PyCode_GetExecutableCodeSlow(PyCodeObject *co) { PyInterpreterState *interp = _PyInterpreterState_GET(); - if (interp->new_thread_local_bytecode_disabled) { + if (interp->new_tlbc_disabled) { return (_Py_CODEUNIT *) co->co_specialized_code->entries[0]->bytecode; } _Py_CODEUNIT *result; diff --git a/Python/initconfig.c b/Python/initconfig.c index 311d7bf9705ff9..1b7a9489626702 100644 --- a/Python/initconfig.c +++ b/Python/initconfig.c @@ -134,7 +134,7 @@ static const PyConfigSpec PYCONFIG_SPEC[] = { SPEC(dump_refs_file, WSTR_OPT, READ_ONLY, NO_SYS), #ifdef Py_GIL_DISABLED SPEC(enable_gil, INT, READ_ONLY, NO_SYS), - SPEC(thread_local_bytecode_limit, INT), + SPEC(tlbc_limit, INT, READ_ONLY, NO_SYS), #endif SPEC(faulthandler, BOOL, READ_ONLY, NO_SYS), SPEC(filesystem_encoding, WSTR, READ_ONLY, NO_SYS), @@ -318,9 +318,9 @@ The following implementation-specific options are available:\n\ memory blocks when the program finishes or after each statement in\n\ the interactive interpreter; only works on debug builds\n" #ifdef Py_GIL_DISABLED -"-X thread_local_bc_limit=N: limit the total size of thread-local bytecode,\n\ +"-X tlbc_limit=N: limit the total size of thread-local bytecode,\n\ per-interpreter, to N bytes. A value < 0 means unlimited. A value of\n\ - 0 disables thread-local bytecode. 
Also PYTHON_THREAD_LOCAL_BC_LIMIT\n" + 0 disables thread-local bytecode. Also PYTHON_TLBC_LIMIT\n" #endif "\ -X tracemalloc[=N]: trace Python memory allocations; N sets a traceback limit\n \ @@ -408,8 +408,8 @@ static const char usage_envvars[] = "PYTHONSTATS : turns on statistics gathering (-X pystats)\n" #endif #ifdef Py_GIL_DISABLED -"PYTHON_THREAD_LOCAL_BC_LIMT: limit the total size of thread-local bytecode\n" -" (-X thread-local-bc-limit)\n" +"PYTHON_TLBC_LIMIT: limit the total size of thread-local bytecode\n" +" (-X tlbc-limit)\n" #endif "PYTHONTRACEMALLOC: trace Python memory allocations (-X tracemalloc)\n" "PYTHONUNBUFFERED: disable stdout/stderr buffering (-u)\n" @@ -991,7 +991,7 @@ _PyConfig_InitCompatConfig(PyConfig *config) #ifdef Py_GIL_DISABLED config->enable_gil = _PyConfig_GIL_DEFAULT; // 100 MiB - config->thread_local_bytecode_limit = 100 * (1 << 20); + config->tlbc_limit = 100 * (1 << 20); #endif } @@ -1876,28 +1876,28 @@ config_init_cpu_count(PyConfig *config) } static PyStatus -config_init_thread_local_bytecode_limit(PyConfig *config) +config_init_tlbc_limit(PyConfig *config) { #ifdef Py_GIL_DISABLED - const char *env = config_get_env(config, "PYTHON_THREAD_LOCAL_BC_LIMIT"); + const char *env = config_get_env(config, "PYTHON_TLBC_LIMIT"); if (env) { int limit = -1; if (_Py_str_to_int(env, &limit) < 0) { return _PyStatus_ERR( - "PYTHON_THREAD_LOCAL_BC_LIMIT=N: N is missing or invalid"); + "PYTHON_TLBC_LIMIT=N: N is missing or invalid"); } - config->thread_local_bytecode_limit = limit; + config->tlbc_limit = limit; } - const wchar_t *xoption = config_get_xoption(config, L"thread_local_bc_limit"); + const wchar_t *xoption = config_get_xoption(config, L"tlbc_limit"); if (xoption) { int limit = -1; const wchar_t *sep = wcschr(xoption, L'='); if (!sep || (config_wstr_to_int(sep + 1, &limit) < 0)) { return _PyStatus_ERR( - "-X thread_local_bc_limit=n: n is missing or invalid"); + "-X tlbc_limit=n: n is missing or invalid"); } - 
config->thread_local_bytecode_limit = limit; + config->tlbc_limit = limit; } return _PyStatus_OK(); #endif @@ -2152,7 +2152,7 @@ config_read_complex_options(PyConfig *config) } #endif - status = config_init_thread_local_bytecode_limit(config); + status = config_init_tlbc_limit(config); if (_PyStatus_EXCEPTION(status)) { return status; } From 8b3ff60c1d5e97c8e0b3abdbfd663929a4d0a943 Mon Sep 17 00:00:00 2001 From: Matt Page Date: Mon, 9 Sep 2024 15:48:36 -0700 Subject: [PATCH 18/67] Use tlbc everywhere --- Include/cpython/code.h | 2 +- Include/internal/pycore_code.h | 8 +-- Include/internal/pycore_interp.h | 2 +- Include/internal/pycore_tstate.h | 4 +- Objects/codeobject.c | 87 ++++++++++++++++++-------------- Python/instrumentation.c | 24 ++++----- Python/pystate.c | 8 +-- Python/specialize.c | 17 ------- 8 files changed, 74 insertions(+), 78 deletions(-) diff --git a/Include/cpython/code.h b/Include/cpython/code.h index 4e8b364ea1ee67..9c41d7521d7cef 100644 --- a/Include/cpython/code.h +++ b/Include/cpython/code.h @@ -84,7 +84,7 @@ typedef struct { } _PyCodeArray; #define _PyCode_DEF_THREAD_LOCAL_BYTECODE() \ - _PyCodeArray *co_specialized_code; \ + _PyCodeArray *co_tlbc; \ PyMutex co_code_adaptive_mutex; #else #define _PyCode_DEF_THREAD_LOCAL_BYTECODE() diff --git a/Include/internal/pycore_code.h b/Include/internal/pycore_code.h index aa8bcd413ee737..1014d03669725f 100644 --- a/Include/internal/pycore_code.h +++ b/Include/internal/pycore_code.h @@ -642,9 +642,9 @@ extern _Py_CODEUNIT *_PyCode_GetExecutableCodeSlow(PyCodeObject *co); static inline _Py_CODEUNIT * _PyCode_GetExecutableCode(PyCodeObject *co) { - _PyCodeArray *code = _Py_atomic_load_ptr_acquire(&co->co_specialized_code); + _PyCodeArray *code = _Py_atomic_load_ptr_acquire(&co->co_tlbc); _PyThreadStateImpl *tstate = (_PyThreadStateImpl *) PyThreadState_GET(); - Py_ssize_t idx = tstate->specialized_code_index; + Py_ssize_t idx = tstate->tlbc_index; if (idx < code->size && code->entries[idx] != NULL) { 
return (_Py_CODEUNIT *) code->entries[idx]->bytecode; } @@ -653,8 +653,8 @@ _PyCode_GetExecutableCode(PyCodeObject *co) extern void _PyCode_LockTLBC(PyCodeObject *co); extern void _PyCode_UnlockTLBC(PyCodeObject *co); -extern int _Py_ReserveSpecializedCodeIndex(PyInterpreterState *interp); -extern void _Py_ClearSpecializedCodeIndex(_PyThreadStateImpl *tstate); +extern int _Py_ReserveTLBCIndex(PyInterpreterState *interp); +extern void _Py_ClearTLBCIndex(_PyThreadStateImpl *tstate); #endif #ifdef __cplusplus diff --git a/Include/internal/pycore_interp.h b/Include/internal/pycore_interp.h index 3935d9b885238f..b3fe6824a3a914 100644 --- a/Include/internal/pycore_interp.h +++ b/Include/internal/pycore_interp.h @@ -224,7 +224,7 @@ struct _is { struct _brc_state brc; // biased reference counting state struct _Py_type_id_pool type_ids; PyMutex weakref_locks[NUM_WEAKREF_LIST_LOCKS]; - _PyIndexPool specialized_code_indices; + _PyIndexPool tlbc_indices; // Number of bytes available for thread-local bytecode, counts down to zero Py_ssize_t tlbc_avail; PyMutex tlbc_avail_mutex; diff --git a/Include/internal/pycore_tstate.h b/Include/internal/pycore_tstate.h index e977c8afcea361..83f99bb10571f3 100644 --- a/Include/internal/pycore_tstate.h +++ b/Include/internal/pycore_tstate.h @@ -42,8 +42,8 @@ typedef struct _PyThreadStateImpl { int is_finalized; } types; - // Index to use to retrieve specialized bytecode for this thread - Py_ssize_t specialized_code_index; + // Index to use to retrieve thread-local bytecode for this thread + Py_ssize_t tlbc_index; #endif #if defined(Py_REF_DEBUG) && defined(Py_GIL_DISABLED) diff --git a/Objects/codeobject.c b/Objects/codeobject.c index 52b6fb6512c051..73601464ad90af 100644 --- a/Objects/codeobject.c +++ b/Objects/codeobject.c @@ -456,7 +456,7 @@ extern void _PyCode_Quicken(_Py_CODEUNIT *instructions, Py_ssize_t size); #ifdef Py_GIL_DISABLED extern void _PyCode_DisableSpecialization(_Py_CODEUNIT *instructions, Py_ssize_t size); static 
_PyCodeArray * _PyCodeArray_New(Py_ssize_t size); -static void release_bytes_for_specialized_code(Py_ssize_t nbytes); +static void release_bytes_for_tlbc(Py_ssize_t nbytes); #endif static int @@ -523,12 +523,12 @@ init_code(PyCodeObject *co, struct _PyCodeConstructor *con) memcpy(_PyCode_CODE(co), PyBytes_AS_STRING(con->code), PyBytes_GET_SIZE(con->code)); #ifdef Py_GIL_DISABLED - co->co_specialized_code = _PyCodeArray_New(INITIAL_SPECIALIZED_CODE_SIZE); - if (co->co_specialized_code == NULL) { + co->co_tlbc = _PyCodeArray_New(INITIAL_SPECIALIZED_CODE_SIZE); + if (co->co_tlbc == NULL) { return -1; } - co->co_specialized_code->entries[0] = (_PyMutBytecode *) &co->co_code_adaptive_mutex; - co->co_specialized_code->entries[0]->mutex = (PyMutex){0}; + co->co_tlbc->entries[0] = (_PyMutBytecode *) &co->co_code_adaptive_mutex; + co->co_tlbc->entries[0]->mutex = (PyMutex){0}; #endif int entry_point = 0; while (entry_point < Py_SIZE(co) && @@ -1907,15 +1907,15 @@ code_dealloc(PyCodeObject *co) // The first element always points to the mutable bytecode at the end of // the code object, which will be freed when the code object is freed. 
Py_ssize_t bytes_freed = 0; - for (Py_ssize_t i = 1; i < co->co_specialized_code->size; i++) { - _PyMutBytecode *entry = co->co_specialized_code->entries[i]; + for (Py_ssize_t i = 1; i < co->co_tlbc->size; i++) { + _PyMutBytecode *entry = co->co_tlbc->entries[i]; if (entry != NULL) { PyMem_Free(entry); bytes_freed += _PyCode_NBYTES(co); } } - release_bytes_for_specialized_code(bytes_freed); - PyMem_Free(co->co_specialized_code); + release_bytes_for_tlbc(bytes_freed); + PyMem_Free(co->co_tlbc); #endif PyObject_Free(co); } @@ -2680,7 +2680,7 @@ _PyCode_Fini(PyInterpreterState *interp) _Py_hashtable_destroy(state->constants); state->constants = NULL; } - _PyIndexPool_Fini(&interp->specialized_code_indices); + _PyIndexPool_Fini(&interp->tlbc_indices); #endif } @@ -2694,6 +2694,19 @@ _PyCode_InitState(PyInterpreterState *interp) interp->new_tlbc_disabled = limit == 0; } +int +_Py_ReserveTLBCIndex(PyInterpreterState *interp) +{ + return _PyIndexPool_AllocIndex(&interp->tlbc_indices); +} + +void +_Py_ClearTLBCIndex(_PyThreadStateImpl *tstate) +{ + PyInterpreterState *interp = ((PyThreadState*) tstate)->interp; + _PyIndexPool_FreeIndex(&interp->tlbc_indices, tstate->tlbc_index); +} + static _PyCodeArray * _PyCodeArray_New(Py_ssize_t size) { @@ -2730,23 +2743,23 @@ get_pow2_greater(Py_ssize_t initial, Py_ssize_t limit) } static _Py_CODEUNIT * -create_specializable_code_lock_held(PyCodeObject *co, Py_ssize_t idx) +create_tlbc_lock_held(PyCodeObject *co, Py_ssize_t idx) { - _PyCodeArray *spec_code = co->co_specialized_code; - if (idx >= spec_code->size) { - Py_ssize_t new_size = get_pow2_greater(spec_code->size, idx + 1); + _PyCodeArray *tlbc = co->co_tlbc; + if (idx >= tlbc->size) { + Py_ssize_t new_size = get_pow2_greater(tlbc->size, idx + 1); if (!new_size) { PyErr_NoMemory(); return NULL; } - _PyCodeArray *new_spec_code = _PyCodeArray_New(new_size); - if (new_spec_code == NULL) { + _PyCodeArray *new_tlbc = _PyCodeArray_New(new_size); + if (new_tlbc == NULL) { return NULL; 
} - memcpy(new_spec_code->entries, spec_code->entries, spec_code->size * sizeof(void*)); - _Py_atomic_store_ptr_release(&co->co_specialized_code, new_spec_code); - _PyMem_FreeDelayed(spec_code); - spec_code = new_spec_code; + memcpy(new_tlbc->entries, tlbc->entries, tlbc->size * sizeof(void*)); + _Py_atomic_store_ptr_release(&co->co_tlbc, new_tlbc); + _PyMem_FreeDelayed(tlbc); + tlbc = new_tlbc; } _PyMutBytecode *bc = PyMem_Calloc(1, sizeof(_PyMutBytecode) + _PyCode_NBYTES(co)); if (bc == NULL) { @@ -2754,13 +2767,13 @@ create_specializable_code_lock_held(PyCodeObject *co, Py_ssize_t idx) return NULL; } copy_code(bc, co); - assert(spec_code->entries[idx] == NULL); - spec_code->entries[idx] = bc; + assert(tlbc->entries[idx] == NULL); + tlbc->entries[idx] = bc; return (_Py_CODEUNIT *) bc->bytecode; } static Py_ssize_t -reserve_bytes_for_specialized_code(PyCodeObject *co) +reserve_bytes_for_tlbc(PyCodeObject *co) { PyInterpreterState *interp = _PyInterpreterState_GET(); Py_ssize_t nbytes_reserved = -1; @@ -2778,7 +2791,7 @@ reserve_bytes_for_specialized_code(PyCodeObject *co) } static void -release_bytes_for_specialized_code(Py_ssize_t nbytes) +release_bytes_for_tlbc(Py_ssize_t nbytes) { assert(nbytes >= 0); if (nbytes == 0) { @@ -2827,22 +2840,22 @@ disable_new_tlbc(void) } static _Py_CODEUNIT * -get_executable_code_lock_held(PyCodeObject *co) +get_tlbc_lock_held(PyCodeObject *co) { - _PyCodeArray *spec_code = co->co_specialized_code; + _PyCodeArray *tlbc = co->co_tlbc; _PyThreadStateImpl *tstate = (_PyThreadStateImpl *) PyThreadState_GET(); - Py_ssize_t idx = tstate->specialized_code_index; - if (idx < spec_code->size && spec_code->entries[idx] != NULL) { - return (_Py_CODEUNIT *) spec_code->entries[idx]->bytecode; + Py_ssize_t idx = tstate->tlbc_index; + if (idx < tlbc->size && tlbc->entries[idx] != NULL) { + return (_Py_CODEUNIT *) tlbc->entries[idx]->bytecode; } - Py_ssize_t reserved = reserve_bytes_for_specialized_code(co); + Py_ssize_t reserved = 
reserve_bytes_for_tlbc(co); if (reserved == -1) { disable_new_tlbc(); - return (_Py_CODEUNIT *) spec_code->entries[0]->bytecode; + return (_Py_CODEUNIT *) tlbc->entries[0]->bytecode; } - _Py_CODEUNIT *result = create_specializable_code_lock_held(co, idx); + _Py_CODEUNIT *result = create_tlbc_lock_held(co, idx); if (result == NULL) { - release_bytes_for_specialized_code(reserved); + release_bytes_for_tlbc(reserved); } return result; } @@ -2852,11 +2865,11 @@ _PyCode_GetExecutableCodeSlow(PyCodeObject *co) { PyInterpreterState *interp = _PyInterpreterState_GET(); if (interp->new_tlbc_disabled) { - return (_Py_CODEUNIT *) co->co_specialized_code->entries[0]->bytecode; + return (_Py_CODEUNIT *) co->co_tlbc->entries[0]->bytecode; } _Py_CODEUNIT *result; Py_BEGIN_CRITICAL_SECTION(co); - result = get_executable_code_lock_held(co); + result = get_tlbc_lock_held(co); Py_END_CRITICAL_SECTION(); return result; } @@ -2864,9 +2877,9 @@ _PyCode_GetExecutableCodeSlow(PyCodeObject *co) static inline _PyMutBytecode * get_tlbc(PyCodeObject *co) { - _PyCodeArray *code = _Py_atomic_load_ptr_acquire(&co->co_specialized_code); + _PyCodeArray *code = _Py_atomic_load_ptr_acquire(&co->co_tlbc); _PyThreadStateImpl *tstate = (_PyThreadStateImpl *) PyThreadState_GET(); - Py_ssize_t idx = tstate->specialized_code_index; + Py_ssize_t idx = tstate->tlbc_index; assert(idx >= 0 && idx < code->size); return code->entries[idx]; } diff --git a/Python/instrumentation.c b/Python/instrumentation.c index 730ca5d23478c0..cbebd7d9923ab9 100644 --- a/Python/instrumentation.c +++ b/Python/instrumentation.c @@ -46,18 +46,18 @@ #define UNLOCK_CODE() Py_END_CRITICAL_SECTION() -#define MODIFY_BYTECODE(code, func, args...) 
\ - do { \ - PyCodeObject *co = (code); \ - for (Py_ssize_t i = 0; i < code->co_specialized_code->size; i++) { \ - _PyMutBytecode *mb = co->co_specialized_code->entries[i]; \ - if (mb == NULL) { \ - continue; \ - } \ - PyMutex_LockFlags(&mb->mutex, _Py_LOCK_DONT_DETACH); \ - (func)((_Py_CODEUNIT *) mb->bytecode, args); \ - PyMutex_Unlock(&mb->mutex); \ - } \ +#define MODIFY_BYTECODE(code, func, args...) \ + do { \ + PyCodeObject *co = (code); \ + for (Py_ssize_t i = 0; i < code->co_tlbc->size; i++) { \ + _PyMutBytecode *mb = co->co_tlbc->entries[i]; \ + if (mb == NULL) { \ + continue; \ + } \ + PyMutex_LockFlags(&mb->mutex, _Py_LOCK_DONT_DETACH); \ + (func)((_Py_CODEUNIT *) mb->bytecode, args); \ + PyMutex_Unlock(&mb->mutex); \ + } \ } while (0) #else diff --git a/Python/pystate.c b/Python/pystate.c index 3a92af581a695e..fb55644cb5f4d7 100644 --- a/Python/pystate.c +++ b/Python/pystate.c @@ -1551,8 +1551,8 @@ new_threadstate(PyInterpreterState *interp, int whence) PyMem_RawFree(new_tstate); return NULL; } - Py_ssize_t code_idx = _Py_ReserveSpecializedCodeIndex(interp); - if (code_idx < 0) { + Py_ssize_t tlbc_idx = _Py_ReserveTLBCIndex(interp); + if (tlbc_idx < 0) { PyMem_RawFree(new_tstate); return NULL; } @@ -1598,7 +1598,7 @@ new_threadstate(PyInterpreterState *interp, int whence) #ifdef Py_GIL_DISABLED // Must be called with lock unlocked to avoid lock ordering deadlocks. _Py_qsbr_register(tstate, interp, qsbr_idx); - tstate->specialized_code_index = code_idx; + tstate->tlbc_index = tlbc_idx; #endif return (PyThreadState *)tstate; @@ -1753,7 +1753,7 @@ PyThreadState_Clear(PyThreadState *tstate) // Release our thread-local copies of the bytecode for reuse by another // thread - _Py_ClearSpecializedCodeIndex((_PyThreadStateImpl *)tstate); + _Py_ClearTLBCIndex((_PyThreadStateImpl *)tstate); #endif // Merge our queue of pointers to be freed into the interpreter queue. 
diff --git a/Python/specialize.c b/Python/specialize.c index 972adbcdadf3c2..e3ed068ace31ec 100644 --- a/Python/specialize.c +++ b/Python/specialize.c @@ -2759,23 +2759,6 @@ _Py_Specialize_ContainsOp(_PyStackRef value_st, _Py_CODEUNIT *instr) cache->counter = adaptive_counter_cooldown(); } -#ifdef Py_GIL_DISABLED - -int -_Py_ReserveSpecializedCodeIndex(PyInterpreterState *interp) -{ - return _PyIndexPool_AllocIndex(&interp->specialized_code_indices); -} - -void -_Py_ClearSpecializedCodeIndex(_PyThreadStateImpl *tstate) -{ - PyInterpreterState *interp = ((PyThreadState*) tstate)->interp; - _PyIndexPool_FreeIndex(&interp->specialized_code_indices, tstate->specialized_code_index); -} - -#endif - /* Code init cleanup. * CALL_ALLOC_AND_ENTER_INIT will set up * the frame to execute the EXIT_INIT_CHECK From 862afa181e2222106cf0a516a1513453bc59e86b Mon Sep 17 00:00:00 2001 From: Matt Page Date: Mon, 9 Sep 2024 16:13:18 -0700 Subject: [PATCH 19/67] Explicitly manage tlbc state --- Include/internal/pycore_code.h | 15 ++++++++++ Include/internal/pycore_interp.h | 5 ++-- Objects/codeobject.c | 48 ++++++++++++++++++++++++-------- 3 files changed, 54 insertions(+), 14 deletions(-) diff --git a/Include/internal/pycore_code.h b/Include/internal/pycore_code.h index 1014d03669725f..25692da7e27f8c 100644 --- a/Include/internal/pycore_code.h +++ b/Include/internal/pycore_code.h @@ -634,6 +634,21 @@ PyAPI_DATA(const struct _PyCode8) _Py_InitCleanup; #ifdef Py_GIL_DISABLED +typedef enum { + // No limit on the amount of memory consumed by thread-local bytecode. + // Terminal state. + _PY_TLBC_UNLIMITED = 0, + + // The total amount of memory consumed by thread-local bytecode must be + // <= PyInterpreterState::tlbc_limit. State transitions to _PY_TLBC_DISABLED + // when the limit is reached. + _PY_TLBC_LIMITED = 1, + + // New thread-local bytecode is disabled. Previously allocated copies + // may still be used. Terminal state. 
+ _PY_TLBC_DISABLED = 2, +} _Py_TLBC_State; + extern void _PyCode_InitState(PyInterpreterState *interp); extern _Py_CODEUNIT *_PyCode_GetExecutableCodeSlow(PyCodeObject *co); diff --git a/Include/internal/pycore_interp.h b/Include/internal/pycore_interp.h index b3fe6824a3a914..314b5dc0153060 100644 --- a/Include/internal/pycore_interp.h +++ b/Include/internal/pycore_interp.h @@ -225,11 +225,10 @@ struct _is { struct _Py_type_id_pool type_ids; PyMutex weakref_locks[NUM_WEAKREF_LIST_LOCKS]; _PyIndexPool tlbc_indices; - // Number of bytes available for thread-local bytecode, counts down to zero + // Number of bytes available for thread-local bytecode, counts down to zero. Py_ssize_t tlbc_avail; PyMutex tlbc_avail_mutex; - // This is monotonic; once true it will remain true - bool new_tlbc_disabled; + _Py_TLBC_State tlbc_state; #endif // Per-interpreter state for the obmalloc allocator. For the main diff --git a/Objects/codeobject.c b/Objects/codeobject.c index 73601464ad90af..c1c3bb5c8f8dfd 100644 --- a/Objects/codeobject.c +++ b/Objects/codeobject.c @@ -537,7 +537,7 @@ init_code(PyCodeObject *co, struct _PyCodeConstructor *con) } co->_co_firsttraceable = entry_point; #ifdef Py_GIL_DISABLED - if (interp->new_tlbc_disabled) { + if (interp->tlbc_state == _PY_TLBC_DISABLED) { _PyCode_DisableSpecialization(_PyCode_CODE(co), Py_SIZE(co)); } else { @@ -2690,8 +2690,18 @@ void _PyCode_InitState(PyInterpreterState *interp) { int limit = interp->config.tlbc_limit; - interp->tlbc_avail = limit; - interp->new_tlbc_disabled = limit == 0; + if (limit < 0) { + interp->tlbc_avail = -1; + interp->tlbc_state = _PY_TLBC_UNLIMITED; + } + else if (limit == 0) { + interp->tlbc_avail = 0; + interp->tlbc_state = _PY_TLBC_DISABLED; + } + else { + interp->tlbc_avail = limit; + interp->tlbc_state = _PY_TLBC_LIMITED; + } } int @@ -2776,15 +2786,31 @@ static Py_ssize_t reserve_bytes_for_tlbc(PyCodeObject *co) { PyInterpreterState *interp = _PyInterpreterState_GET(); - Py_ssize_t nbytes_reserved 
= -1; Py_ssize_t code_size = _PyCode_NBYTES(co); PyMutex_LockFlags(&interp->tlbc_avail_mutex, _Py_LOCK_DONT_DETACH); - if (interp->tlbc_avail < 0) { + Py_ssize_t nbytes_reserved; + switch (interp->tlbc_state) { + case _PY_TLBC_UNLIMITED: { nbytes_reserved = code_size; + break; + } + case _PY_TLBC_LIMITED: { + if (interp->tlbc_avail >= code_size) { + nbytes_reserved = code_size; + interp->tlbc_avail -= code_size; + } + else { + nbytes_reserved = -1; + } + break; + } + case _PY_TLBC_DISABLED: { + nbytes_reserved = -1; + break; + } + default: { + Py_UNREACHABLE(); } - else if (interp->tlbc_avail >= code_size) { - interp->tlbc_avail -= code_size; - nbytes_reserved = code_size; } PyMutex_Unlock(&interp->tlbc_avail_mutex); return nbytes_reserved; @@ -2820,7 +2846,7 @@ static void disable_new_tlbc(void) { PyInterpreterState *interp = _PyInterpreterState_GET(); - if (interp->new_tlbc_disabled) { + if (interp->tlbc_state == _PY_TLBC_DISABLED) { return; } // Disable creation of new thread-local copies of bytecode. We disable @@ -2831,7 +2857,7 @@ disable_new_tlbc(void) // main copy), but any attempts to create new copies of bytecode will fail, // and the main, unspecializable copy will be used. 
_PyEval_StopTheWorld(interp); - interp->new_tlbc_disabled = true; + interp->tlbc_state = _PY_TLBC_DISABLED; _PyEval_StartTheWorld(interp); PyUnstable_GC_VisitObjects(disable_specialization, NULL); if (PyErr_WarnEx(PyExc_ResourceWarning, "Reached memory limit for thread-local bytecode", 1) < 0) { @@ -2864,7 +2890,7 @@ _Py_CODEUNIT * _PyCode_GetExecutableCodeSlow(PyCodeObject *co) { PyInterpreterState *interp = _PyInterpreterState_GET(); - if (interp->new_tlbc_disabled) { + if (interp->tlbc_state == _PY_TLBC_DISABLED) { return (_Py_CODEUNIT *) co->co_tlbc->entries[0]->bytecode; } _Py_CODEUNIT *result; From 0b4d9522652e1f2e9be8a80bbb7d2c4b65b5591a Mon Sep 17 00:00:00 2001 From: Matt Page Date: Mon, 9 Sep 2024 16:51:45 -0700 Subject: [PATCH 20/67] Refactor API for fetching tlbc --- Include/internal/pycore_code.h | 33 ++++++++++++++++++++++++++++----- Objects/codeobject.c | 6 +++--- 2 files changed, 31 insertions(+), 8 deletions(-) diff --git a/Include/internal/pycore_code.h b/Include/internal/pycore_code.h index 25692da7e27f8c..a1d8338f75aafd 100644 --- a/Include/internal/pycore_code.h +++ b/Include/internal/pycore_code.h @@ -650,12 +650,11 @@ typedef enum { } _Py_TLBC_State; extern void _PyCode_InitState(PyInterpreterState *interp); -extern _Py_CODEUNIT *_PyCode_GetExecutableCodeSlow(PyCodeObject *co); -// Return the bytecode that should be executed by the current thread, creating -// a copy if necessary. +// Return a pointer to the thread-local bytecode for the current thread, if it +// exists. 
static inline _Py_CODEUNIT * -_PyCode_GetExecutableCode(PyCodeObject *co) +_PyCode_GetTLBCFast(PyCodeObject *co) { _PyCodeArray *code = _Py_atomic_load_ptr_acquire(&co->co_tlbc); _PyThreadStateImpl *tstate = (_PyThreadStateImpl *) PyThreadState_GET(); @@ -663,7 +662,31 @@ _PyCode_GetExecutableCode(PyCodeObject *co) if (idx < code->size && code->entries[idx] != NULL) { return (_Py_CODEUNIT *) code->entries[idx]->bytecode; } - return _PyCode_GetExecutableCodeSlow(co); + return NULL; +} + +// Return a pointer to the thread-local bytecode for the current thread, creating +// it if it doesn't exist. +// +// On error, NULL is returned, new thread-local bytecode is disabled, and +// specialization is disabled for the "main" copy of the bytecode (the bytecode +// embedded in the code object) for all code objects. +extern _Py_CODEUNIT *_PyCode_GetTLBCSlow(PyCodeObject *co); + +// Return the bytecode that should be executed by the current thread, creating +// a copy if necessary. +static inline _Py_CODEUNIT * +_PyCode_GetExecutableCode(PyCodeObject *co) +{ + _Py_CODEUNIT *res = _PyCode_GetTLBCFast(co); + if (res != NULL) { + return res; + } + res = _PyCode_GetTLBCSlow(co); + if (res != NULL) { + return res; + } + return _PyCode_CODE(co); } extern void _PyCode_LockTLBC(PyCodeObject *co); diff --git a/Objects/codeobject.c b/Objects/codeobject.c index c1c3bb5c8f8dfd..dd2c33e7454a1d 100644 --- a/Objects/codeobject.c +++ b/Objects/codeobject.c @@ -2877,7 +2877,7 @@ get_tlbc_lock_held(PyCodeObject *co) Py_ssize_t reserved = reserve_bytes_for_tlbc(co); if (reserved == -1) { disable_new_tlbc(); - return (_Py_CODEUNIT *) tlbc->entries[0]->bytecode; + return NULL; } _Py_CODEUNIT *result = create_tlbc_lock_held(co, idx); if (result == NULL) { @@ -2887,11 +2887,11 @@ get_tlbc_lock_held(PyCodeObject *co) } _Py_CODEUNIT * -_PyCode_GetExecutableCodeSlow(PyCodeObject *co) +_PyCode_GetTLBCSlow(PyCodeObject *co) { PyInterpreterState *interp = _PyInterpreterState_GET(); if (interp->tlbc_state 
== _PY_TLBC_DISABLED) { - return (_Py_CODEUNIT *) co->co_tlbc->entries[0]->bytecode; + return NULL; } _Py_CODEUNIT *result; Py_BEGIN_CRITICAL_SECTION(co); From 7795e99a3220421f91a953eafb0d416aff203b4e Mon Sep 17 00:00:00 2001 From: Matt Page Date: Mon, 9 Sep 2024 22:44:37 -0700 Subject: [PATCH 21/67] Add unit tests --- Lib/test/test_tlbc.py | 197 ++++++++++++++++++++++++++++++++++++ Modules/_testinternalcapi.c | 44 ++++++++ 2 files changed, 241 insertions(+) create mode 100644 Lib/test/test_tlbc.py diff --git a/Lib/test/test_tlbc.py b/Lib/test/test_tlbc.py new file mode 100644 index 00000000000000..adcab24215756b --- /dev/null +++ b/Lib/test/test_tlbc.py @@ -0,0 +1,197 @@ +"""Tests for thread-local bytecode.""" +import dis +import textwrap +import unittest + +from test import support +from test.support import cpython_only, import_helper, requires_specialization_of +from test.support.script_helper import assert_python_ok +from test.support.threading_helper import requires_working_threading + +# Skip this test if the _testinternalcapi module isn't available +_testinternalcapi = import_helper.import_module("_testinternalcapi") + + +@cpython_only +@requires_working_threading() +@unittest.skipUnless(support.Py_GIL_DISABLED, "only in free-threaded builds") +class TLBCTests(unittest.TestCase): + @requires_specialization_of("BINARY_OP") + def test_new_threads_start_with_unspecialized_code(self): + code = textwrap.dedent(""" + import dis + import queue + import threading + + from _testinternalcapi import get_tlbc + + def all_opnames(bc): + return {i.opname for i in dis._get_instructions_bytes(bc)} + + def f(a, b, q=None): + if q is not None: + q.put(get_tlbc(f)) + return a + b + + for _ in range(100): + # specialize + f(1, 2) + + q = queue.Queue() + t = threading.Thread(target=f, args=('a', 'b', q)) + t.start() + t.join() + + assert "BINARY_OP_ADD_INT" in all_opnames(get_tlbc(f)) + assert "BINARY_OP_ADD_INT" not in all_opnames(q.get()) + """) + assert_python_ok("-X", 
"tlbc_limit=-1", "-c", code) + + @requires_specialization_of("BINARY_OP") + def test_threads_specialize_independently(self): + code = textwrap.dedent(""" + import dis + import queue + import threading + + from _testinternalcapi import get_tlbc + + def all_opnames(bc): + return {i.opname for i in dis._get_instructions_bytes(bc)} + + def f(a, b): + return a + b + + def g(a, b, q=None): + for _ in range(100): + f(a, b) + if q is not None: + q.put(get_tlbc(f)) + + # specialize in main thread + g(1, 2) + + # specialize in other thread + q = queue.Queue() + t = threading.Thread(target=g, args=('a', 'b', q)) + t.start() + t.join() + + assert "BINARY_OP_ADD_INT" in all_opnames(get_tlbc(f)) + t_opnames = all_opnames(q.get()) + assert "BINARY_OP_ADD_INT" not in t_opnames + assert "BINARY_OP_ADD_UNICODE" in t_opnames + """) + assert_python_ok("-X", "tlbc_limit=-1", "-c", code) + + def test_reuse_tlbc_across_threads_different_lifetimes(self): + code = textwrap.dedent(""" + import queue + import threading + + from _testinternalcapi import get_tlbc_id + + def f(a, b, q=None): + if q is not None: + q.put(get_tlbc_id(f)) + return a + b + + q = queue.Queue() + tlbc_ids = [] + for _ in range(3): + t = threading.Thread(target=f, args=('a', 'b', q)) + t.start() + t.join() + tlbc_ids.append(q.get()) + + assert tlbc_ids[0] == tlbc_ids[1] + assert tlbc_ids[1] == tlbc_ids[2] + """) + assert_python_ok("-X", "tlbc_limit=-1", "-c", code) + + def test_no_tlbc_if_tlbc_disabled(self): + code = textwrap.dedent(""" + import queue + import threading + + from _testinternalcapi import get_tlbc + + def f(a, b, q=None): + if q is not None: + q.put(get_tlbc(f)) + return a + b + + q = queue.Queue() + threads = [] + for _ in range(3): + t = threading.Thread(target=f, args=('a', 'b', q)) + t.start() + threads.append(t) + + tlbcs = [] + for t in threads: + t.join() + tlbcs.append(q.get()) + + assert get_tlbc(f) is not None + assert tlbcs[0] is None + assert tlbcs[1] is None + assert tlbcs[2] is None + """) 
+ assert_python_ok("-X", "tlbc_limit=0", "-c", code) + + def test_no_specialization_if_tlbc_disabled(self): + code = textwrap.dedent(""" + import dis + import queue + import threading + + from _testinternalcapi import get_tlbc + + def all_opnames(f): + bc = get_tlbc(f) + return {i.opname for i in dis._get_instructions_bytes(bc)} + + def f(a, b): + return a + b + + for _ in range(100): + f(1, 2) + + assert "BINARY_OP_ADD_INT" not in all_opnames(f) + """) + assert_python_ok("-X", "tlbc_limit=0", "-c", code) + + def test_generator_throw(self): + code = textwrap.dedent(""" + import queue + import threading + + from _testinternalcapi import get_tlbc_id + + def g(): + try: + yield + except: + yield get_tlbc_id(g) + + def f(q): + gen = g() + next(gen) + q.put(gen.throw(ValueError)) + + q = queue.Queue() + t = threading.Thread(target=f, args=(q,)) + t.start() + t.join() + + gen = g() + next(gen) + main_id = gen.throw(ValueError) + assert main_id != q.get() + """) + assert_python_ok("-X", "tlbc_limit=-1", "-c", code) + + +if __name__ == "__main__": + unittest.main() diff --git a/Modules/_testinternalcapi.c b/Modules/_testinternalcapi.c index 0451688a46c75f..514777e6830b08 100644 --- a/Modules/_testinternalcapi.c +++ b/Modules/_testinternalcapi.c @@ -14,6 +14,7 @@ #include "pycore_bitutils.h" // _Py_bswap32() #include "pycore_bytesobject.h" // _PyBytes_Find() #include "pycore_ceval.h" // _PyEval_AddPendingCall() +#include "pycore_code.h" // _PyCode_GetTLBCFast() #include "pycore_compile.h" // _PyCompile_CodeGen() #include "pycore_context.h" // _PyContext_NewHamtForTests() #include "pycore_dict.h" // _PyManagedDictPointer_GetValues() @@ -1963,6 +1964,47 @@ get_py_thread_id(PyObject *self, PyObject *Py_UNUSED(ignored)) Py_BUILD_ASSERT(sizeof(unsigned long long) >= sizeof(tid)); return PyLong_FromUnsignedLongLong(tid); } + +static PyCodeObject * +get_code(PyObject *obj) +{ + if (PyCode_Check(obj)) { + return (PyCodeObject *) obj; + } + else if (PyFunction_Check(obj)) { + return 
(PyCodeObject *) PyFunction_GetCode(obj); + } + return (PyCodeObject *) PyErr_Format(PyExc_TypeError, + "expected function or code object, got %s", Py_TYPE(obj)->tp_name); +} + +static PyObject * +get_tlbc(PyObject *Py_UNUSED(module), PyObject *obj) +{ + PyCodeObject *code = get_code(obj); + if (code == NULL) { + return NULL; + } + _Py_CODEUNIT *bc = _PyCode_GetTLBCFast(code); + if (bc == NULL) { + Py_RETURN_NONE; + } + return PyBytes_FromStringAndSize((const char *) bc, _PyCode_NBYTES(code)); +} + +static PyObject * +get_tlbc_id(PyObject *Py_UNUSED(module), PyObject *obj) +{ + PyCodeObject *code = get_code(obj); + if (code == NULL) { + return NULL; + } + _Py_CODEUNIT *bc = _PyCode_GetTLBCFast(code); + if (bc == NULL) { + Py_RETURN_NONE; + } + return PyLong_FromVoidPtr(bc); +} #endif static PyObject * @@ -2136,6 +2178,8 @@ static PyMethodDef module_functions[] = { #ifdef Py_GIL_DISABLED {"py_thread_id", get_py_thread_id, METH_NOARGS}, + {"get_tlbc", get_tlbc, METH_O, NULL}, + {"get_tlbc_id", get_tlbc_id, METH_O, NULL}, #endif {"suppress_immortalization", suppress_immortalization, METH_O}, {"get_immortalize_deferred", get_immortalize_deferred, METH_NOARGS}, From 693a4cc96313506bc6678ba9123ba22ab711170d Mon Sep 17 00:00:00 2001 From: Matt Page Date: Mon, 9 Sep 2024 23:04:09 -0700 Subject: [PATCH 22/67] Fix initconfig in default build --- Python/initconfig.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/Python/initconfig.c b/Python/initconfig.c index 1b7a9489626702..53e120d4d3868f 100644 --- a/Python/initconfig.c +++ b/Python/initconfig.c @@ -1900,6 +1900,8 @@ config_init_tlbc_limit(PyConfig *config) config->tlbc_limit = limit; } return _PyStatus_OK(); +#else + return _PyStatus_OK(); #endif } From b43531e1de1392cc7c7fc980f2354379cef05823 Mon Sep 17 00:00:00 2001 From: Matt Page Date: Mon, 9 Sep 2024 23:07:10 -0700 Subject: [PATCH 23/67] Fix instrumentation in default build --- Python/instrumentation.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff 
--git a/Python/instrumentation.c b/Python/instrumentation.c index cbebd7d9923ab9..b84660b84fe168 100644 --- a/Python/instrumentation.c +++ b/Python/instrumentation.c @@ -64,7 +64,7 @@ #define LOCK_CODE(code) #define UNLOCK_CODE() -#define MODIFY_BYTECODE(code, func, args...) (func)(_PyCode_CODE((code), __VA_ARGS__) +#define MODIFY_BYTECODE(code, func, args...) (func)(_PyCode_CODE(code), args) #endif From 9025f431aceeb44b1f8e532502d0554b2708818b Mon Sep 17 00:00:00 2001 From: Matt Page Date: Tue, 10 Sep 2024 11:12:09 -0700 Subject: [PATCH 24/67] Synchronize bytecode modifications between specialization and instrumentation using atomics --- Include/cpython/code.h | 10 ++----- Include/internal/pycore_code.h | 7 ++--- Objects/codeobject.c | 48 +++++++---------------------- Python/bytecodes.c | 3 +- Python/generated_cases.c.h | 3 +- Python/instrumentation.c | 8 ++--- Python/specialize.c | 55 +++++++++++++++++----------------- 7 files changed, 46 insertions(+), 88 deletions(-) diff --git a/Include/cpython/code.h b/Include/cpython/code.h index 9c41d7521d7cef..721999e7ab5e8f 100644 --- a/Include/cpython/code.h +++ b/Include/cpython/code.h @@ -70,22 +70,16 @@ typedef struct { #ifdef Py_GIL_DISABLED -typedef struct { - PyMutex mutex; - char bytecode[]; -} _PyMutBytecode; - /* Each thread specializes a thread-local copy of the bytecode in free-threaded * builds. These copies are stored on the code object in a `_PyCodeArray`. 
*/ typedef struct { Py_ssize_t size; - _PyMutBytecode *entries[]; + char *entries[]; } _PyCodeArray; #define _PyCode_DEF_THREAD_LOCAL_BYTECODE() \ - _PyCodeArray *co_tlbc; \ - PyMutex co_code_adaptive_mutex; + _PyCodeArray *co_tlbc; #else #define _PyCode_DEF_THREAD_LOCAL_BYTECODE() #endif diff --git a/Include/internal/pycore_code.h b/Include/internal/pycore_code.h index a1d8338f75aafd..2526a39ddce79f 100644 --- a/Include/internal/pycore_code.h +++ b/Include/internal/pycore_code.h @@ -369,8 +369,7 @@ extern void _Py_Specialize_Call(_PyStackRef callable, _Py_CODEUNIT *instr, int nargs); extern void _Py_Specialize_CallKw(_PyStackRef callable, _Py_CODEUNIT *instr, int nargs); -extern void _Py_Specialize_BinaryOp(PyCodeObject *code, _PyStackRef lhs, - _PyStackRef rhs, _Py_CODEUNIT *instr, +extern void _Py_Specialize_BinaryOp(_PyStackRef lhs, _PyStackRef rhs, _Py_CODEUNIT *instr, int oparg, _PyStackRef *locals); extern void _Py_Specialize_CompareOp(_PyStackRef lhs, _PyStackRef rhs, _Py_CODEUNIT *instr, int oparg); @@ -660,7 +659,7 @@ _PyCode_GetTLBCFast(PyCodeObject *co) _PyThreadStateImpl *tstate = (_PyThreadStateImpl *) PyThreadState_GET(); Py_ssize_t idx = tstate->tlbc_index; if (idx < code->size && code->entries[idx] != NULL) { - return (_Py_CODEUNIT *) code->entries[idx]->bytecode; + return (_Py_CODEUNIT *) code->entries[idx]; } return NULL; } @@ -689,8 +688,6 @@ _PyCode_GetExecutableCode(PyCodeObject *co) return _PyCode_CODE(co); } -extern void _PyCode_LockTLBC(PyCodeObject *co); -extern void _PyCode_UnlockTLBC(PyCodeObject *co); extern int _Py_ReserveTLBCIndex(PyInterpreterState *interp); extern void _Py_ClearTLBCIndex(_PyThreadStateImpl *tstate); #endif diff --git a/Objects/codeobject.c b/Objects/codeobject.c index dd2c33e7454a1d..b711e6ee1739f7 100644 --- a/Objects/codeobject.c +++ b/Objects/codeobject.c @@ -527,8 +527,7 @@ init_code(PyCodeObject *co, struct _PyCodeConstructor *con) if (co->co_tlbc == NULL) { return -1; } - co->co_tlbc->entries[0] = 
(_PyMutBytecode *) &co->co_code_adaptive_mutex; - co->co_tlbc->entries[0]->mutex = (PyMutex){0}; + co->co_tlbc->entries[0] = co->co_code_adaptive; #endif int entry_point = 0; while (entry_point < Py_SIZE(co) && @@ -1908,7 +1907,7 @@ code_dealloc(PyCodeObject *co) // the code object, which will be freed when the code object is freed. Py_ssize_t bytes_freed = 0; for (Py_ssize_t i = 1; i < co->co_tlbc->size; i++) { - _PyMutBytecode *entry = co->co_tlbc->entries[i]; + char *entry = co->co_tlbc->entries[i]; if (entry != NULL) { PyMem_Free(entry); bytes_freed += _PyCode_NBYTES(co); @@ -2720,7 +2719,7 @@ _Py_ClearTLBCIndex(_PyThreadStateImpl *tstate) static _PyCodeArray * _PyCodeArray_New(Py_ssize_t size) { - _PyCodeArray *arr = PyMem_Calloc(1, sizeof(_PyCodeArray) + sizeof(_PyMutBytecode*) * size); + _PyCodeArray *arr = PyMem_Calloc(1, sizeof(_PyCodeArray) + sizeof(void*) * size); if (arr == NULL) { PyErr_NoMemory(); return NULL; @@ -2730,14 +2729,13 @@ _PyCodeArray_New(Py_ssize_t size) } static void -copy_code(_PyMutBytecode *dst, PyCodeObject *co) +copy_code(_Py_CODEUNIT *dst, PyCodeObject *co) { int code_len = Py_SIZE(co); - _Py_CODEUNIT *dst_bytecode = (_Py_CODEUNIT *) dst->bytecode; for (int i = 0; i < code_len; i += _PyInstruction_GetLength(co, i)) { - dst_bytecode[i] = _Py_GetBaseCodeUnit(co, i); + dst[i] = _Py_GetBaseCodeUnit(co, i); } - _PyCode_Quicken(dst_bytecode, code_len); + _PyCode_Quicken(dst, code_len); } static Py_ssize_t @@ -2771,15 +2769,15 @@ create_tlbc_lock_held(PyCodeObject *co, Py_ssize_t idx) _PyMem_FreeDelayed(tlbc); tlbc = new_tlbc; } - _PyMutBytecode *bc = PyMem_Calloc(1, sizeof(_PyMutBytecode) + _PyCode_NBYTES(co)); + char *bc = PyMem_Calloc(1, _PyCode_NBYTES(co)); if (bc == NULL) { PyErr_NoMemory(); return NULL; } - copy_code(bc, co); + copy_code((_Py_CODEUNIT *) bc, co); assert(tlbc->entries[idx] == NULL); tlbc->entries[idx] = bc; - return (_Py_CODEUNIT *) bc->bytecode; + return (_Py_CODEUNIT *) bc; } static Py_ssize_t @@ -2872,7 +2870,7 @@ 
get_tlbc_lock_held(PyCodeObject *co) _PyThreadStateImpl *tstate = (_PyThreadStateImpl *) PyThreadState_GET(); Py_ssize_t idx = tstate->tlbc_index; if (idx < tlbc->size && tlbc->entries[idx] != NULL) { - return (_Py_CODEUNIT *) tlbc->entries[idx]->bytecode; + return (_Py_CODEUNIT *) tlbc->entries[idx]; } Py_ssize_t reserved = reserve_bytes_for_tlbc(co); if (reserved == -1) { @@ -2900,30 +2898,4 @@ _PyCode_GetTLBCSlow(PyCodeObject *co) return result; } -static inline _PyMutBytecode * -get_tlbc(PyCodeObject *co) -{ - _PyCodeArray *code = _Py_atomic_load_ptr_acquire(&co->co_tlbc); - _PyThreadStateImpl *tstate = (_PyThreadStateImpl *) PyThreadState_GET(); - Py_ssize_t idx = tstate->tlbc_index; - assert(idx >= 0 && idx < code->size); - return code->entries[idx]; -} - -void -_PyCode_LockTLBC(PyCodeObject *co) -{ - _PyMutBytecode *tlbc = get_tlbc(co); - assert(tlbc != NULL); - PyMutex_LockFlags(&tlbc->mutex, _PY_LOCK_DETACH); -} - -void -_PyCode_UnlockTLBC(PyCodeObject *co) -{ - _PyMutBytecode *tlbc = get_tlbc(co); - assert(tlbc != NULL); - PyMutex_Unlock(&tlbc->mutex); -} - #endif diff --git a/Python/bytecodes.c b/Python/bytecodes.c index 83b8a159ab59da..e9ba7ea2b61de8 100644 --- a/Python/bytecodes.c +++ b/Python/bytecodes.c @@ -4509,8 +4509,7 @@ dummy_func( #if ENABLE_SPECIALIZED_BINARY_OP if (ADAPTIVE_COUNTER_TRIGGERS(counter)) { next_instr = this_instr; - _Py_Specialize_BinaryOp(_PyFrame_GetCode(frame), lhs, rhs, - next_instr, oparg, LOCALS_ARRAY); + _Py_Specialize_BinaryOp(lhs, rhs, next_instr, oparg, LOCALS_ARRAY); DISPATCH_SAME_OPARG(); } OPCODE_DEFERRED_INC(BINARY_OP); diff --git a/Python/generated_cases.c.h b/Python/generated_cases.c.h index 41555eff5d7abf..14d188c98b18e7 100644 --- a/Python/generated_cases.c.h +++ b/Python/generated_cases.c.h @@ -28,8 +28,7 @@ #if ENABLE_SPECIALIZED_BINARY_OP if (ADAPTIVE_COUNTER_TRIGGERS(counter)) { next_instr = this_instr; - _Py_Specialize_BinaryOp(_PyFrame_GetCode(frame), lhs, rhs, - next_instr, oparg, LOCALS_ARRAY); + 
_Py_Specialize_BinaryOp(lhs, rhs, next_instr, oparg, LOCALS_ARRAY); DISPATCH_SAME_OPARG(); } OPCODE_DEFERRED_INC(BINARY_OP); diff --git a/Python/instrumentation.c b/Python/instrumentation.c index b84660b84fe168..6a6fa820291b37 100644 --- a/Python/instrumentation.c +++ b/Python/instrumentation.c @@ -50,13 +50,11 @@ do { \ PyCodeObject *co = (code); \ for (Py_ssize_t i = 0; i < code->co_tlbc->size; i++) { \ - _PyMutBytecode *mb = co->co_tlbc->entries[i]; \ - if (mb == NULL) { \ + char *bc = co->co_tlbc->entries[i]; \ + if (bc == NULL) { \ continue; \ } \ - PyMutex_LockFlags(&mb->mutex, _Py_LOCK_DONT_DETACH); \ - (func)((_Py_CODEUNIT *) mb->bytecode, args); \ - PyMutex_Unlock(&mb->mutex); \ + (func)((_Py_CODEUNIT *) bc, args); \ } \ } while (0) diff --git a/Python/specialize.c b/Python/specialize.c index e3ed068ace31ec..6d4a6734a7f297 100644 --- a/Python/specialize.c +++ b/Python/specialize.c @@ -25,23 +25,23 @@ extern const char *_PyUOpName(int index); */ #ifdef Py_GIL_DISABLED -#define SET_OPCODE(instr, opcode) _Py_atomic_store_uint8_relaxed(&(instr)->op.code, (opcode)) -#define LOCK_TLBC_RETURN_IF_INSTRUMENTED(code, instr) \ - do { \ - _PyCode_LockTLBC(code); \ - if ((instr)->op.code >= MIN_INSTRUMENTED_OPCODE) { \ - _PyCode_UnlockTLBC(code); \ - return; \ - } \ - } while (0) -#define UNLOCK_TLBC(code) _PyCode_UnlockTLBC(code) +#define SET_OPCODE_OR_RETURN(instr, opcode) \ + do { \ + uint8_t old_op = _Py_atomic_load_uint8_relaxed(&(instr)->op.code); \ + if (old_op >= MIN_INSTRUMENTED_OPCODE) { \ + /* Lost race with instrumentation */ \ + return; \ + } \ + if (!_Py_atomic_compare_exchange_uint8(&(instr)->op.code, &old_op, (opcode))) { \ + /* Lost race with instrumentation */ \ + assert(old_op >= MIN_INSTRUMENTED_OPCODE); \ + return; \ + } \ + } while (0) #else -#define SET_OPCODE(instr, opcode) (instr)->op.code = (opcode) -#define LOCK_TLBC_RETURN_IF_INSTRUMENTED(code, instr) (void) (code) -#define UNLOCK_TLBC(code) (void) (code) +#define SET_OPCODE_OR_RETURN(instr, 
opcode) (instr)->op.code = (opcode) #endif - #ifdef Py_STATS GCStats _py_gc_stats[NUM_GENERATIONS] = { 0 }; static PyStats _Py_stats_struct = { .gc_stats = _py_gc_stats }; @@ -2255,15 +2255,15 @@ binary_op_fail_kind(int oparg, PyObject *lhs, PyObject *rhs) #endif // Py_STATS void -_Py_Specialize_BinaryOp(PyCodeObject *code, _PyStackRef lhs_st, _PyStackRef rhs_st, - _Py_CODEUNIT *instr, int oparg, _PyStackRef *locals) +_Py_Specialize_BinaryOp(_PyStackRef lhs_st, _PyStackRef rhs_st, _Py_CODEUNIT *instr, + int oparg, _PyStackRef *locals) { PyObject *lhs = PyStackRef_AsPyObjectBorrow(lhs_st); PyObject *rhs = PyStackRef_AsPyObjectBorrow(rhs_st); assert(ENABLE_SPECIALIZED_BINARY_OP); assert(_PyOpcode_Caches[BINARY_OP] == INLINE_CACHE_ENTRIES_BINARY_OP); - LOCK_TLBC_RETURN_IF_INSTRUMENTED(code, instr); _PyBinaryOpCache *cache = (_PyBinaryOpCache *)(instr + 1); + uint8_t specialized_op; switch (oparg) { case NB_ADD: case NB_INPLACE_ADD: @@ -2274,18 +2274,18 @@ _Py_Specialize_BinaryOp(PyCodeObject *code, _PyStackRef lhs_st, _PyStackRef rhs_ _Py_CODEUNIT next = instr[INLINE_CACHE_ENTRIES_BINARY_OP + 1]; bool to_store = (next.op.code == STORE_FAST); if (to_store && PyStackRef_AsPyObjectBorrow(locals[next.op.arg]) == lhs) { - SET_OPCODE(instr, BINARY_OP_INPLACE_ADD_UNICODE); + specialized_op = BINARY_OP_INPLACE_ADD_UNICODE; goto success; } - SET_OPCODE(instr, BINARY_OP_ADD_UNICODE); + specialized_op = BINARY_OP_ADD_UNICODE; goto success; } if (PyLong_CheckExact(lhs)) { - SET_OPCODE(instr, BINARY_OP_ADD_INT); + specialized_op = BINARY_OP_ADD_INT; goto success; } if (PyFloat_CheckExact(lhs)) { - SET_OPCODE(instr, BINARY_OP_ADD_FLOAT); + specialized_op = BINARY_OP_ADD_FLOAT; goto success; } break; @@ -2295,11 +2295,11 @@ _Py_Specialize_BinaryOp(PyCodeObject *code, _PyStackRef lhs_st, _PyStackRef rhs_ break; } if (PyLong_CheckExact(lhs)) { - SET_OPCODE(instr, BINARY_OP_MULTIPLY_INT); + specialized_op = BINARY_OP_MULTIPLY_INT; goto success; } if (PyFloat_CheckExact(lhs)) { - 
SET_OPCODE(instr, BINARY_OP_MULTIPLY_FLOAT); + specialized_op = BINARY_OP_MULTIPLY_FLOAT; goto success; } break; @@ -2309,25 +2309,24 @@ _Py_Specialize_BinaryOp(PyCodeObject *code, _PyStackRef lhs_st, _PyStackRef rhs_ break; } if (PyLong_CheckExact(lhs)) { - SET_OPCODE(instr, BINARY_OP_SUBTRACT_INT); + specialized_op = BINARY_OP_SUBTRACT_INT; goto success; } if (PyFloat_CheckExact(lhs)) { - SET_OPCODE(instr, BINARY_OP_SUBTRACT_FLOAT); + specialized_op = BINARY_OP_SUBTRACT_FLOAT; goto success; } break; } SPECIALIZATION_FAIL(BINARY_OP, binary_op_fail_kind(oparg, lhs, rhs)); STAT_INC(BINARY_OP, failure); - instr->op.code = BINARY_OP; + SET_OPCODE_OR_RETURN(instr, BINARY_OP); cache->counter = adaptive_counter_backoff(cache->counter); - UNLOCK_TLBC(code); return; success: STAT_INC(BINARY_OP, success); + SET_OPCODE_OR_RETURN(instr, specialized_op); cache->counter = adaptive_counter_cooldown(); - UNLOCK_TLBC(code); } From c44c7d902cf4a4c7326510d7888d0727de267931 Mon Sep 17 00:00:00 2001 From: Matt Page Date: Tue, 10 Sep 2024 13:10:02 -0700 Subject: [PATCH 25/67] Add a high-level comment --- Include/cpython/code.h | 4 +++- Objects/codeobject.c | 23 +++++++++++++++++++++++ 2 files changed, 26 insertions(+), 1 deletion(-) diff --git a/Include/cpython/code.h b/Include/cpython/code.h index 721999e7ab5e8f..0b354c38b2c398 100644 --- a/Include/cpython/code.h +++ b/Include/cpython/code.h @@ -71,7 +71,9 @@ typedef struct { #ifdef Py_GIL_DISABLED /* Each thread specializes a thread-local copy of the bytecode in free-threaded - * builds. These copies are stored on the code object in a `_PyCodeArray`. + * builds. These copies are stored on the code object in a `_PyCodeArray`. The + * first entry in the array always points to the "main" copy of the bytecode + * that is stored at the end of the code object. 
*/ typedef struct { Py_ssize_t size; diff --git a/Objects/codeobject.c b/Objects/codeobject.c index b711e6ee1739f7..21df02b2fc56b3 100644 --- a/Objects/codeobject.c +++ b/Objects/codeobject.c @@ -2685,6 +2685,29 @@ _PyCode_Fini(PyInterpreterState *interp) #ifdef Py_GIL_DISABLED +// Thread-local bytecode (TLBC) +// +// Each thread specializes a thread-local copy of the bytecode, created on the +// first RESUME, in free-threaded builds. All copies of the bytecode for a code +// object are stored in the `co_tlbc` array. Threads reserve a globally unique +// index identifying its copy of the bytecode in all `co_tlbc` arrays at thread +// creation and release the index at thread destruction. The first entry in +// every `co_tlbc` array always points to the "main" copy of the bytecode that +// is stored at the end of the code object. This ensures that no bytecode is +// copied for programs that do not use threads. +// +// The total amount of memory consumed by thread-local bytecode can be limited +// at runtime by setting either `-X tlbc_limit` or `PYTHON_TLBC_LIMIT`. When +// the limit is reached, no new copies of thread-local bytecode can be created +// and specialization is disabled for the "main" copy of the bytecode (the bytecode +// at index 0 of the `co_tlbc` array). Threads can continue to specialize +// existing thread-local copies of the bytecode (other than the "main" copy). +// All other execution will use the unspecialized, "main" copy of the bytecode. +// +// Concurrent modifications to the bytecode made by the specializing interpreter +// and instrumentation use atomics, with specialization taking care not to +// overwrite an instruction that was instrumented concurrently. 
+ void _PyCode_InitState(PyInterpreterState *interp) { From e2a6656c20e66880a7d73febccf75f638229eba7 Mon Sep 17 00:00:00 2001 From: Matt Page Date: Tue, 10 Sep 2024 13:23:23 -0700 Subject: [PATCH 26/67] Fix unused variable warning in default build --- Python/bytecodes.c | 2 ++ Python/generated_cases.c.h | 6 ++++++ 2 files changed, 8 insertions(+) diff --git a/Python/bytecodes.c b/Python/bytecodes.c index e9ba7ea2b61de8..bdc02183e807b2 100644 --- a/Python/bytecodes.c +++ b/Python/bytecodes.c @@ -201,6 +201,8 @@ dummy_func( next_instr = frame->instr_ptr + 1; } + #else + (void)this_instr; #endif } diff --git a/Python/generated_cases.c.h b/Python/generated_cases.c.h index 14d188c98b18e7..52901893db95c0 100644 --- a/Python/generated_cases.c.h +++ b/Python/generated_cases.c.h @@ -4494,6 +4494,8 @@ this_instr = frame->instr_ptr; next_instr = frame->instr_ptr + 1; } + #else + (void)this_instr; #endif } // _MAYBE_INSTRUMENT @@ -6438,6 +6440,8 @@ this_instr = frame->instr_ptr; next_instr = frame->instr_ptr + 1; } + #else + (void)this_instr; #endif } // _MAYBE_INSTRUMENT @@ -6493,6 +6497,8 @@ this_instr = frame->instr_ptr; next_instr = frame->instr_ptr + 1; } + #else + (void)this_instr; #endif } // _RESUME_CHECK From e6513d1a945f63826ace6f21c75e11bdec1a1774 Mon Sep 17 00:00:00 2001 From: Matt Page Date: Tue, 10 Sep 2024 13:39:19 -0700 Subject: [PATCH 27/67] Fix test_config in free-threaded builds --- Lib/test/test_capi/test_config.py | 1 + 1 file changed, 1 insertion(+) diff --git a/Lib/test/test_capi/test_config.py b/Lib/test/test_capi/test_config.py index 01637e1cb7b6e5..7e7b7a60aa21f0 100644 --- a/Lib/test/test_capi/test_config.py +++ b/Lib/test/test_capi/test_config.py @@ -100,6 +100,7 @@ def test_config_get(self): options.append(("run_presite", str | None, None)) if sysconfig.get_config_var('Py_GIL_DISABLED'): options.append(("enable_gil", int, None)) + options.append(("tlbc_limit", int, None)) if support.MS_WINDOWS: options.extend(( ("legacy_windows_stdio", bool, None), 
From a18396fa60ea54b04e3a375c951fa27e0288f31a Mon Sep 17 00:00:00 2001 From: Matt Page Date: Tue, 10 Sep 2024 14:02:11 -0700 Subject: [PATCH 28/67] Fix formatting --- Include/internal/pycore_code.h | 3 +- Include/internal/pycore_frame.h | 7 +++- Include/internal/pycore_interp.h | 3 +- Modules/_opcode.c | 1 - Modules/_testinternalcapi.c | 12 +++--- Objects/codeobject.c | 68 +++++++++++++++++--------------- Objects/frameobject.c | 6 +-- Objects/typeobject.c | 4 +- Python/ceval.c | 3 +- Python/ceval_macros.h | 6 +-- Python/frame.c | 3 +- Python/index_pool.c | 4 +- Python/instrumentation.c | 35 ++++++++-------- Python/specialize.c | 32 ++++++++------- 14 files changed, 101 insertions(+), 86 deletions(-) diff --git a/Include/internal/pycore_code.h b/Include/internal/pycore_code.h index 2526a39ddce79f..0d9f33a34b3a91 100644 --- a/Include/internal/pycore_code.h +++ b/Include/internal/pycore_code.h @@ -639,7 +639,8 @@ typedef enum { _PY_TLBC_UNLIMITED = 0, // The total amount of memory consumed by thread-local bytecode must be - // <= PyInterpreterState::tlbc_limit. State transitions to _PY_TLBC_DISABLED + // <= PyInterpreterState::tlbc_limit. State transitions to + // _PY_TLBC_DISABLED // when the limit is reached. 
_PY_TLBC_LIMITED = 1, diff --git a/Include/internal/pycore_frame.h b/Include/internal/pycore_frame.h index 7688e227ef2144..9bf3ce764b7fff 100644 --- a/Include/internal/pycore_frame.h +++ b/Include/internal/pycore_frame.h @@ -86,7 +86,9 @@ static inline PyCodeObject *_PyFrame_GetCode(_PyInterpreterFrame *f) { return (PyCodeObject *)f->f_executable; } -static inline _Py_CODEUNIT *_PyFrame_GetBytecode(_PyInterpreterFrame *f) { +static inline _Py_CODEUNIT * +_PyFrame_GetBytecode(_PyInterpreterFrame *f) +{ #ifdef Py_GIL_DISABLED return f->bytecode; #else @@ -228,7 +230,8 @@ _PyFrame_IsIncomplete(_PyInterpreterFrame *frame) return true; } return frame->owner != FRAME_OWNED_BY_GENERATOR && - frame->instr_ptr < _PyFrame_GetBytecode(frame) + _PyFrame_GetCode(frame)->_co_firsttraceable; + frame->instr_ptr < _PyFrame_GetBytecode(frame) + + _PyFrame_GetCode(frame)->_co_firsttraceable; } static inline _PyInterpreterFrame * diff --git a/Include/internal/pycore_interp.h b/Include/internal/pycore_interp.h index 314b5dc0153060..9bfed0cd28f1a4 100644 --- a/Include/internal/pycore_interp.h +++ b/Include/internal/pycore_interp.h @@ -225,7 +225,8 @@ struct _is { struct _Py_type_id_pool type_ids; PyMutex weakref_locks[NUM_WEAKREF_LIST_LOCKS]; _PyIndexPool tlbc_indices; - // Number of bytes available for thread-local bytecode, counts down to zero. + // Number of bytes available for thread-local bytecode, counts down to + // zero. 
Py_ssize_t tlbc_avail; PyMutex tlbc_avail_mutex; _Py_TLBC_State tlbc_state; diff --git a/Modules/_opcode.c b/Modules/_opcode.c index 35b40c19367e91..23fc7d797a0b18 100644 --- a/Modules/_opcode.c +++ b/Modules/_opcode.c @@ -417,7 +417,6 @@ opcode_functions[] = { {NULL, NULL, 0, NULL} }; - static int _opcode_exec(PyObject *m) { #define ADD(X) \ diff --git a/Modules/_testinternalcapi.c b/Modules/_testinternalcapi.c index 514777e6830b08..f4c3af7ed43b76 100644 --- a/Modules/_testinternalcapi.c +++ b/Modules/_testinternalcapi.c @@ -1969,13 +1969,14 @@ static PyCodeObject * get_code(PyObject *obj) { if (PyCode_Check(obj)) { - return (PyCodeObject *) obj; + return (PyCodeObject *)obj; } else if (PyFunction_Check(obj)) { - return (PyCodeObject *) PyFunction_GetCode(obj); + return (PyCodeObject *)PyFunction_GetCode(obj); } - return (PyCodeObject *) PyErr_Format(PyExc_TypeError, - "expected function or code object, got %s", Py_TYPE(obj)->tp_name); + return (PyCodeObject *)PyErr_Format( + PyExc_TypeError, "expected function or code object, got %s", + Py_TYPE(obj)->tp_name); } static PyObject * @@ -1989,7 +1990,7 @@ get_tlbc(PyObject *Py_UNUSED(module), PyObject *obj) if (bc == NULL) { Py_RETURN_NONE; } - return PyBytes_FromStringAndSize((const char *) bc, _PyCode_NBYTES(code)); + return PyBytes_FromStringAndSize((const char *)bc, _PyCode_NBYTES(code)); } static PyObject * @@ -2090,7 +2091,6 @@ identify_type_slot_wrappers(PyObject *self, PyObject *Py_UNUSED(ignored)) return _PyType_GetSlotWrapperNames(); } - static PyMethodDef module_functions[] = { {"get_configs", get_configs, METH_NOARGS}, {"get_recursion_depth", get_recursion_depth, METH_NOARGS}, diff --git a/Objects/codeobject.c b/Objects/codeobject.c index 21df02b2fc56b3..837c0a154c40c4 100644 --- a/Objects/codeobject.c +++ b/Objects/codeobject.c @@ -2699,14 +2699,15 @@ _PyCode_Fini(PyInterpreterState *interp) // The total amount of memory consumed by thread-local bytecode can be limited // at runtime by setting either `-X 
tlbc_limit` or `PYTHON_TLBC_LIMIT`. When // the limit is reached, no new copies of thread-local bytecode can be created -// and specialization is disabled for the "main" copy of the bytecode (the bytecode -// at index 0 of the `co_tlbc` array). Threads can continue to specialize -// existing thread-local copies of the bytecode (other than the "main" copy). -// All other execution will use the unspecialized, "main" copy of the bytecode. +// and specialization is disabled for the "main" copy of the bytecode (the +// bytecode at index 0 of the `co_tlbc` array). Threads can continue to +// specialize existing thread-local copies of the bytecode (other than the +// "main" copy). All other execution will use the unspecialized, "main" copy of +// the bytecode. // -// Concurrent modifications to the bytecode made by the specializing interpreter -// and instrumentation use atomics, with specialization taking care not to -// overwrite an instruction that was instrumented concurrently. +// Concurrent modifications to the bytecode made by the specializing +// interpreter and instrumentation use atomics, with specialization taking care +// not to overwrite an instruction that was instrumented concurrently. 
void _PyCode_InitState(PyInterpreterState *interp) @@ -2735,14 +2736,15 @@ _Py_ReserveTLBCIndex(PyInterpreterState *interp) void _Py_ClearTLBCIndex(_PyThreadStateImpl *tstate) { - PyInterpreterState *interp = ((PyThreadState*) tstate)->interp; + PyInterpreterState *interp = ((PyThreadState *)tstate)->interp; _PyIndexPool_FreeIndex(&interp->tlbc_indices, tstate->tlbc_index); } static _PyCodeArray * _PyCodeArray_New(Py_ssize_t size) { - _PyCodeArray *arr = PyMem_Calloc(1, sizeof(_PyCodeArray) + sizeof(void*) * size); + _PyCodeArray *arr = + PyMem_Calloc(1, sizeof(_PyCodeArray) + sizeof(void *) * size); if (arr == NULL) { PyErr_NoMemory(); return NULL; @@ -2787,7 +2789,7 @@ create_tlbc_lock_held(PyCodeObject *co, Py_ssize_t idx) if (new_tlbc == NULL) { return NULL; } - memcpy(new_tlbc->entries, tlbc->entries, tlbc->size * sizeof(void*)); + memcpy(new_tlbc->entries, tlbc->entries, tlbc->size * sizeof(void *)); _Py_atomic_store_ptr_release(&co->co_tlbc, new_tlbc); _PyMem_FreeDelayed(tlbc); tlbc = new_tlbc; @@ -2811,27 +2813,27 @@ reserve_bytes_for_tlbc(PyCodeObject *co) PyMutex_LockFlags(&interp->tlbc_avail_mutex, _Py_LOCK_DONT_DETACH); Py_ssize_t nbytes_reserved; switch (interp->tlbc_state) { - case _PY_TLBC_UNLIMITED: { - nbytes_reserved = code_size; - break; - } - case _PY_TLBC_LIMITED: { - if (interp->tlbc_avail >= code_size) { + case _PY_TLBC_UNLIMITED: { nbytes_reserved = code_size; - interp->tlbc_avail -= code_size; + break; } - else { + case _PY_TLBC_LIMITED: { + if (interp->tlbc_avail >= code_size) { + nbytes_reserved = code_size; + interp->tlbc_avail -= code_size; + } + else { + nbytes_reserved = -1; + } + break; + } + case _PY_TLBC_DISABLED: { nbytes_reserved = -1; + break; + } + default: { + Py_UNREACHABLE(); } - break; - } - case _PY_TLBC_DISABLED: { - nbytes_reserved = -1; - break; - } - default: { - Py_UNREACHABLE(); - } } PyMutex_Unlock(&interp->tlbc_avail_mutex); return nbytes_reserved; @@ -2853,12 +2855,12 @@ release_bytes_for_tlbc(Py_ssize_t nbytes) } 
static int -disable_specialization(PyObject *obj, void*) +disable_specialization(PyObject *obj, void *) { if (!PyCode_Check(obj)) { return 1; } - PyCodeObject *co = (PyCodeObject *) obj; + PyCodeObject *co = (PyCodeObject *)obj; _PyCode_DisableSpecialization(_PyCode_CODE(co), Py_SIZE(co)); return 1; } @@ -2881,7 +2883,9 @@ disable_new_tlbc(void) interp->tlbc_state = _PY_TLBC_DISABLED; _PyEval_StartTheWorld(interp); PyUnstable_GC_VisitObjects(disable_specialization, NULL); - if (PyErr_WarnEx(PyExc_ResourceWarning, "Reached memory limit for thread-local bytecode", 1) < 0) { + if (PyErr_WarnEx(PyExc_ResourceWarning, + "Reached memory limit for thread-local bytecode", + 1) < 0) { PyErr_WriteUnraisable(NULL); } } @@ -2890,10 +2894,10 @@ static _Py_CODEUNIT * get_tlbc_lock_held(PyCodeObject *co) { _PyCodeArray *tlbc = co->co_tlbc; - _PyThreadStateImpl *tstate = (_PyThreadStateImpl *) PyThreadState_GET(); + _PyThreadStateImpl *tstate = (_PyThreadStateImpl *)PyThreadState_GET(); Py_ssize_t idx = tstate->tlbc_index; if (idx < tlbc->size && tlbc->entries[idx] != NULL) { - return (_Py_CODEUNIT *) tlbc->entries[idx]; + return (_Py_CODEUNIT *)tlbc->entries[idx]; } Py_ssize_t reserved = reserve_bytes_for_tlbc(co); if (reserved == -1) { diff --git a/Objects/frameobject.c b/Objects/frameobject.c index 1387cdcaa3fced..fef6cadbe58132 100644 --- a/Objects/frameobject.c +++ b/Objects/frameobject.c @@ -1865,9 +1865,9 @@ frame_init_get_vars(_PyInterpreterFrame *frame) // here: PyCodeObject *co = _PyFrame_GetCode(frame); int lasti = _PyInterpreterFrame_LASTI(frame); - if (!(lasti < 0 && _PyFrame_GetBytecode(frame)->op.code == COPY_FREE_VARS - && PyFunction_Check(frame->f_funcobj))) - { + if (!(lasti < 0 && + _PyFrame_GetBytecode(frame)->op.code == COPY_FREE_VARS && + PyFunction_Check(frame->f_funcobj))) { /* Free vars are initialized */ return; } diff --git a/Objects/typeobject.c b/Objects/typeobject.c index 4f1f5c8295a966..d9dc9accc65b54 100644 --- a/Objects/typeobject.c +++ 
b/Objects/typeobject.c @@ -11547,8 +11547,8 @@ super_descr_get(PyObject *self, PyObject *obj, PyObject *type) } static int -super_init_without_args(_PyInterpreterFrame *cframe, - PyTypeObject **type_p, PyObject **obj_p) +super_init_without_args(_PyInterpreterFrame *cframe, PyTypeObject **type_p, + PyObject **obj_p) { PyCodeObject *co = _PyFrame_GetCode(cframe); if (co->co_argcount == 0) { diff --git a/Python/ceval.c b/Python/ceval.c index 5407347a39391d..75531657a129b7 100644 --- a/Python/ceval.c +++ b/Python/ceval.c @@ -810,7 +810,8 @@ _PyEval_EvalFrameDefault(PyThreadState *tstate, _PyInterpreterFrame *frame, int * we need to update instrumentation */ #ifdef Py_GIL_DISABLED /* Load thread-local bytecode */ - _Py_CODEUNIT *bytecode = _PyCode_GetExecutableCode(_PyFrame_GetCode(frame)); + _Py_CODEUNIT *bytecode = + _PyCode_GetExecutableCode(_PyFrame_GetCode(frame)); if (frame->bytecode != bytecode) { int off = frame->instr_ptr - frame->bytecode; frame->bytecode = bytecode; diff --git a/Python/ceval_macros.h b/Python/ceval_macros.h index 8bb9c9624383ae..a6a8cfc7ab2337 100644 --- a/Python/ceval_macros.h +++ b/Python/ceval_macros.h @@ -315,9 +315,9 @@ GETITEM(PyObject *v, Py_ssize_t i) { * limit is reached and they all execute the main copy of the bytecode. This is * approximate, we do not need the RMW cycle to be atomic. */ -#define RECORD_BRANCH_TAKEN(bitset, flag) \ - FT_ATOMIC_STORE_UINT16_RELAXED(bitset, \ - (FT_ATOMIC_LOAD_UINT16_RELAXED(bitset) << 1) | (flag)) +#define RECORD_BRANCH_TAKEN(bitset, flag) \ + FT_ATOMIC_STORE_UINT16_RELAXED( \ + bitset, (FT_ATOMIC_LOAD_UINT16_RELAXED(bitset) << 1) | (flag)) #else #define RECORD_BRANCH_TAKEN(bitset, flag) #endif diff --git a/Python/frame.c b/Python/frame.c index 80d2c8e864e47b..6f0105723b465a 100644 --- a/Python/frame.c +++ b/Python/frame.c @@ -63,7 +63,8 @@ take_ownership(PyFrameObject *f, _PyInterpreterFrame *frame) // This may be a newly-created generator or coroutine frame. 
Since it's // dead anyways, just pretend that the first RESUME ran: PyCodeObject *code = _PyFrame_GetCode(frame); - frame->instr_ptr = _PyFrame_GetBytecode(frame) + code->_co_firsttraceable + 1; + frame->instr_ptr = + _PyFrame_GetBytecode(frame) + code->_co_firsttraceable + 1; } assert(!_PyFrame_IsIncomplete(frame)); assert(f->f_back == NULL); diff --git a/Python/index_pool.c b/Python/index_pool.c index ecc55935416268..927c57838cf3aa 100644 --- a/Python/index_pool.c +++ b/Python/index_pool.c @@ -45,13 +45,13 @@ parent(Py_ssize_t i) static inline Py_ssize_t left_child(Py_ssize_t i) { - return 2*i + 1; + return 2 * i + 1; } static inline Py_ssize_t right_child(Py_ssize_t i) { - return 2*i + 2; + return 2 * i + 2; } static void diff --git a/Python/instrumentation.c b/Python/instrumentation.c index 6a6fa820291b37..0cb84a4d66ca02 100644 --- a/Python/instrumentation.c +++ b/Python/instrumentation.c @@ -46,16 +46,16 @@ #define UNLOCK_CODE() Py_END_CRITICAL_SECTION() -#define MODIFY_BYTECODE(code, func, args...) \ - do { \ - PyCodeObject *co = (code); \ - for (Py_ssize_t i = 0; i < code->co_tlbc->size; i++) { \ - char *bc = co->co_tlbc->entries[i]; \ - if (bc == NULL) { \ - continue; \ - } \ - (func)((_Py_CODEUNIT *) bc, args); \ - } \ +#define MODIFY_BYTECODE(code, func, args...) 
\ + do { \ + PyCodeObject *co = (code); \ + for (Py_ssize_t i = 0; i < code->co_tlbc->size; i++) { \ + char *bc = co->co_tlbc->entries[i]; \ + if (bc == NULL) { \ + continue; \ + } \ + (func)((_Py_CODEUNIT *)bc, args); \ + } \ } while (0) #else @@ -599,7 +599,8 @@ _Py_CODEUNIT _Py_GetBaseCodeUnit(PyCodeObject *code, int i) { _Py_CODEUNIT *src_instr = _PyCode_CODE(code) + i; - _Py_CODEUNIT inst = {.cache = FT_ATOMIC_LOAD_UINT16_RELAXED(*(uint16_t *)src_instr)}; + _Py_CODEUNIT inst = { + .cache = FT_ATOMIC_LOAD_UINT16_RELAXED(*(uint16_t *)src_instr)}; int opcode = inst.op.code; if (opcode < MIN_INSTRUMENTED_OPCODE) { inst.op.code = _PyOpcode_Deopt[opcode]; @@ -635,7 +636,8 @@ _Py_GetBaseCodeUnit(PyCodeObject *code, int i) } static void -de_instrument(_Py_CODEUNIT *bytecode, _PyCoMonitoringData *monitoring, int i, int event) +de_instrument(_Py_CODEUNIT *bytecode, _PyCoMonitoringData *monitoring, int i, + int event) { assert(event != PY_MONITORING_EVENT_INSTRUCTION); assert(event != PY_MONITORING_EVENT_LINE); @@ -665,7 +667,8 @@ de_instrument(_Py_CODEUNIT *bytecode, _PyCoMonitoringData *monitoring, int i, in } static void -de_instrument_line(_Py_CODEUNIT *bytecode, _PyCoMonitoringData *monitoring, int i) +de_instrument_line(_Py_CODEUNIT *bytecode, _PyCoMonitoringData *monitoring, + int i) { _Py_CODEUNIT *instr = &bytecode[i]; int opcode = instr->op.code; @@ -713,7 +716,6 @@ de_instrument_per_instruction(_Py_CODEUNIT *bytecode, assert(instr->op.code != INSTRUMENTED_INSTRUCTION); } - static void instrument(_Py_CODEUNIT *bytecode, _PyCoMonitoringData *monitoring, int i) { @@ -738,8 +740,9 @@ instrument(_Py_CODEUNIT *bytecode, _PyCoMonitoringData *monitoring, int i) assert(instrumented); FT_ATOMIC_STORE_UINT8_RELAXED(*opcode_ptr, instrumented); if (_PyOpcode_Caches[deopt]) { - FT_ATOMIC_STORE_UINT16_RELAXED(instr[1].counter.as_counter, - adaptive_counter_warmup().as_counter); + FT_ATOMIC_STORE_UINT16_RELAXED( + instr[1].counter.as_counter, + 
adaptive_counter_warmup().as_counter); } } } diff --git a/Python/specialize.c b/Python/specialize.c index 6d4a6734a7f297..b5caffcaea098b 100644 --- a/Python/specialize.c +++ b/Python/specialize.c @@ -25,19 +25,20 @@ extern const char *_PyUOpName(int index); */ #ifdef Py_GIL_DISABLED -#define SET_OPCODE_OR_RETURN(instr, opcode) \ - do { \ - uint8_t old_op = _Py_atomic_load_uint8_relaxed(&(instr)->op.code); \ - if (old_op >= MIN_INSTRUMENTED_OPCODE) { \ - /* Lost race with instrumentation */ \ - return; \ - } \ - if (!_Py_atomic_compare_exchange_uint8(&(instr)->op.code, &old_op, (opcode))) { \ - /* Lost race with instrumentation */ \ - assert(old_op >= MIN_INSTRUMENTED_OPCODE); \ - return; \ - } \ - } while (0) +#define SET_OPCODE_OR_RETURN(instr, opcode) \ + do { \ + uint8_t old_op = _Py_atomic_load_uint8_relaxed(&(instr)->op.code); \ + if (old_op >= MIN_INSTRUMENTED_OPCODE) { \ + /* Lost race with instrumentation */ \ + return; \ + } \ + if (!_Py_atomic_compare_exchange_uint8(&(instr)->op.code, &old_op, \ + (opcode))) { \ + /* Lost race with instrumentation */ \ + assert(old_op >= MIN_INSTRUMENTED_OPCODE); \ + return; \ + } \ + } while (0) #else #define SET_OPCODE_OR_RETURN(instr, opcode) (instr)->op.code = (opcode) #endif @@ -488,11 +489,12 @@ void _PyCode_DisableSpecialization(_Py_CODEUNIT *instructions, Py_ssize_t size) { /* The last code unit cannot have a cache, so we don't need to check it */ - for (Py_ssize_t i = 0; i < size-1; i++) { + for (Py_ssize_t i = 0; i < size - 1; i++) { int opcode = instructions[i].op.code; int caches = _PyOpcode_Caches[opcode]; if (caches) { - instructions[i + 1].counter = initial_unreachable_backoff_counter(); + instructions[i + 1].counter = + initial_unreachable_backoff_counter(); i += caches; } } From 81fe1a213f2943e777cc0c4355e65dda52d8f117 Mon Sep 17 00:00:00 2001 From: Matt Page Date: Tue, 10 Sep 2024 14:46:01 -0700 Subject: [PATCH 29/67] Remove comment --- Python/instrumentation.c | 2 -- 1 file changed, 2 deletions(-) diff 
--git a/Python/instrumentation.c b/Python/instrumentation.c index 0cb84a4d66ca02..9a7cc027553f4f 100644 --- a/Python/instrumentation.c +++ b/Python/instrumentation.c @@ -18,8 +18,6 @@ #include "pycore_pyerrors.h" #include "pycore_pystate.h" // _PyInterpreterState_GET() -// TODO(mpage) - Document how we keep everything in sync - /* Uncomment this to dump debugging output when assertions fail */ // #define INSTRUMENT_DEBUG 1 From 837645e31166e141149cc7a984ab832ac73d4896 Mon Sep 17 00:00:00 2001 From: Matt Page Date: Tue, 10 Sep 2024 16:53:29 -0700 Subject: [PATCH 30/67] Fix data race in _PyInstruction_GetLength Read the opcode atomically, the interpreter may be specializing it --- Python/instrumentation.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/Python/instrumentation.c b/Python/instrumentation.c index 9a7cc027553f4f..18b77193fc4c52 100644 --- a/Python/instrumentation.c +++ b/Python/instrumentation.c @@ -327,7 +327,8 @@ _PyInstruction_GetLength(PyCodeObject *code, int offset) { ASSERT_WORLD_STOPPED_OR_LOCKED(code); - int opcode = _PyCode_CODE(code)[offset].op.code; + int opcode = + FT_ATOMIC_LOAD_UINT8_RELAXED(_PyCode_CODE(code)[offset].op.code); assert(opcode != 0); assert(opcode != RESERVED); if (opcode == INSTRUMENTED_LINE) { From f13e132eed50dba5f7e0382dd68dc30b5d27eb4a Mon Sep 17 00:00:00 2001 From: Matt Page Date: Wed, 11 Sep 2024 15:11:28 -0700 Subject: [PATCH 31/67] Fix tier2 optimizer uops that uses `this_instr` are not eligible to be included in traces. Restore `RESUME_CHECK` to its original implemenation and inline reloading bytecode, which does not need to reset `this` since nothing else in `RESUME_CHECK` uses it. 
--- Include/internal/pycore_opcode_metadata.h | 1 + Include/internal/pycore_uop_ids.h | 48 +++++++++++------------ Include/internal/pycore_uop_metadata.h | 2 +- Programs/test_frozenmain.h | 14 +++---- Python/bytecodes.c | 19 ++++++--- Python/executor_cases.c.h | 12 ++++++ Python/generated_cases.c.h | 46 ++++++++++------------ 7 files changed, 78 insertions(+), 64 deletions(-) diff --git a/Include/internal/pycore_opcode_metadata.h b/Include/internal/pycore_opcode_metadata.h index 9dd945956eeb3a..a20c55c3a90607 100644 --- a/Include/internal/pycore_opcode_metadata.h +++ b/Include/internal/pycore_opcode_metadata.h @@ -1378,6 +1378,7 @@ _PyOpcode_macro_expansion[256] = { [POP_TOP] = { .nuops = 1, .uops = { { _POP_TOP, 0, 0 } } }, [PUSH_EXC_INFO] = { .nuops = 1, .uops = { { _PUSH_EXC_INFO, 0, 0 } } }, [PUSH_NULL] = { .nuops = 1, .uops = { { _PUSH_NULL, 0, 0 } } }, + [RESUME_CHECK] = { .nuops = 1, .uops = { { _RESUME_CHECK, 0, 0 } } }, [RETURN_CONST] = { .nuops = 2, .uops = { { _LOAD_CONST, 0, 0 }, { _RETURN_VALUE, 0, 0 } } }, [RETURN_GENERATOR] = { .nuops = 1, .uops = { { _RETURN_GENERATOR, 0, 0 } } }, [RETURN_VALUE] = { .nuops = 1, .uops = { { _RETURN_VALUE, 0, 0 } } }, diff --git a/Include/internal/pycore_uop_ids.h b/Include/internal/pycore_uop_ids.h index 736a91f32d8a0b..19582d85e5dd25 100644 --- a/Include/internal/pycore_uop_ids.h +++ b/Include/internal/pycore_uop_ids.h @@ -245,42 +245,42 @@ extern "C" { #define _PY_FRAME_KW 449 #define _QUICKEN_RESUME 450 #define _REPLACE_WITH_TRUE 451 -#define _RESUME_CHECK 452 +#define _RESUME_CHECK RESUME_CHECK #define _RETURN_GENERATOR RETURN_GENERATOR #define _RETURN_VALUE RETURN_VALUE -#define _SAVE_RETURN_OFFSET 453 -#define _SEND 454 -#define _SEND_GEN_FRAME 455 +#define _SAVE_RETURN_OFFSET 452 +#define _SEND 453 +#define _SEND_GEN_FRAME 454 #define _SETUP_ANNOTATIONS SETUP_ANNOTATIONS #define _SET_ADD SET_ADD #define _SET_FUNCTION_ATTRIBUTE SET_FUNCTION_ATTRIBUTE #define _SET_UPDATE SET_UPDATE -#define _START_EXECUTOR 456 
-#define _STORE_ATTR 457 -#define _STORE_ATTR_INSTANCE_VALUE 458 -#define _STORE_ATTR_SLOT 459 -#define _STORE_ATTR_WITH_HINT 460 +#define _START_EXECUTOR 455 +#define _STORE_ATTR 456 +#define _STORE_ATTR_INSTANCE_VALUE 457 +#define _STORE_ATTR_SLOT 458 +#define _STORE_ATTR_WITH_HINT 459 #define _STORE_DEREF STORE_DEREF -#define _STORE_FAST 461 -#define _STORE_FAST_0 462 -#define _STORE_FAST_1 463 -#define _STORE_FAST_2 464 -#define _STORE_FAST_3 465 -#define _STORE_FAST_4 466 -#define _STORE_FAST_5 467 -#define _STORE_FAST_6 468 -#define _STORE_FAST_7 469 +#define _STORE_FAST 460 +#define _STORE_FAST_0 461 +#define _STORE_FAST_1 462 +#define _STORE_FAST_2 463 +#define _STORE_FAST_3 464 +#define _STORE_FAST_4 465 +#define _STORE_FAST_5 466 +#define _STORE_FAST_6 467 +#define _STORE_FAST_7 468 #define _STORE_FAST_LOAD_FAST STORE_FAST_LOAD_FAST #define _STORE_FAST_STORE_FAST STORE_FAST_STORE_FAST #define _STORE_GLOBAL STORE_GLOBAL #define _STORE_NAME STORE_NAME -#define _STORE_SLICE 470 -#define _STORE_SUBSCR 471 +#define _STORE_SLICE 469 +#define _STORE_SUBSCR 470 #define _STORE_SUBSCR_DICT STORE_SUBSCR_DICT #define _STORE_SUBSCR_LIST_INT STORE_SUBSCR_LIST_INT #define _SWAP SWAP -#define _TIER2_RESUME_CHECK 472 -#define _TO_BOOL 473 +#define _TIER2_RESUME_CHECK 471 +#define _TO_BOOL 472 #define _TO_BOOL_BOOL TO_BOOL_BOOL #define _TO_BOOL_INT TO_BOOL_INT #define _TO_BOOL_LIST TO_BOOL_LIST @@ -290,14 +290,14 @@ extern "C" { #define _UNARY_NEGATIVE UNARY_NEGATIVE #define _UNARY_NOT UNARY_NOT #define _UNPACK_EX UNPACK_EX -#define _UNPACK_SEQUENCE 474 +#define _UNPACK_SEQUENCE 473 #define _UNPACK_SEQUENCE_LIST UNPACK_SEQUENCE_LIST #define _UNPACK_SEQUENCE_TUPLE UNPACK_SEQUENCE_TUPLE #define _UNPACK_SEQUENCE_TWO_TUPLE UNPACK_SEQUENCE_TWO_TUPLE #define _WITH_EXCEPT_START WITH_EXCEPT_START #define _YIELD_VALUE YIELD_VALUE #define __DO_CALL_FUNCTION_EX _DO_CALL_FUNCTION_EX -#define MAX_UOP_ID 474 +#define MAX_UOP_ID 473 #ifdef __cplusplus } diff --git 
a/Include/internal/pycore_uop_metadata.h b/Include/internal/pycore_uop_metadata.h index bd64df66d0c90d..6bcbd354eb55c8 100644 --- a/Include/internal/pycore_uop_metadata.h +++ b/Include/internal/pycore_uop_metadata.h @@ -22,7 +22,7 @@ const uint16_t _PyUop_Flags[MAX_UOP_ID+1] = { [_NOP] = HAS_PURE_FLAG, [_CHECK_PERIODIC] = HAS_EVAL_BREAK_FLAG | HAS_ERROR_FLAG | HAS_ESCAPES_FLAG, [_CHECK_PERIODIC_IF_NOT_YIELD_FROM] = HAS_ARG_FLAG | HAS_EVAL_BREAK_FLAG | HAS_ERROR_FLAG | HAS_ESCAPES_FLAG, - [_RESUME_CHECK] = HAS_DEOPT_FLAG, + [_RESUME_CHECK] = HAS_DEOPT_FLAG | HAS_ESCAPES_FLAG, [_LOAD_FAST_CHECK] = HAS_ARG_FLAG | HAS_LOCAL_FLAG | HAS_ERROR_FLAG | HAS_ESCAPES_FLAG, [_LOAD_FAST_0] = HAS_LOCAL_FLAG | HAS_PURE_FLAG, [_LOAD_FAST_1] = HAS_LOCAL_FLAG | HAS_PURE_FLAG, diff --git a/Programs/test_frozenmain.h b/Programs/test_frozenmain.h index 661ce867c1ce00..624d9c0b653ad7 100644 --- a/Programs/test_frozenmain.h +++ b/Programs/test_frozenmain.h @@ -12,26 +12,26 @@ unsigned char M_test_frozenmain[] = { 0,0,111,6,88,2,31,0,79,6,88,6,12,0,79,7, 88,5,88,6,2,0,0,0,12,0,47,4,49,1,0,0, 0,0,0,0,29,0,72,22,0,0,9,0,29,0,100,1, - 41,8,233,0,0,0,0,78,218,18,70,114,111,122,101,110, - 32,72,101,108,108,111,32,87,111,114,108,100,218,8,115,121, + 41,8,233,0,0,0,0,78,122,18,70,114,111,122,101,110, + 32,72,101,108,108,111,32,87,111,114,108,100,122,8,115,121, 115,46,97,114,103,118,218,6,99,111,110,102,105,103,41,5, 218,12,112,114,111,103,114,97,109,95,110,97,109,101,218,10, 101,120,101,99,117,116,97,98,108,101,218,15,117,115,101,95, 101,110,118,105,114,111,110,109,101,110,116,218,17,99,111,110, 102,105,103,117,114,101,95,99,95,115,116,100,105,111,218,14, - 98,117,102,102,101,114,101,100,95,115,116,100,105,111,218,7, - 99,111,110,102,105,103,32,218,2,58,32,41,7,218,3,115, + 98,117,102,102,101,114,101,100,95,115,116,100,105,111,122,7, + 99,111,110,102,105,103,32,122,2,58,32,41,7,218,3,115, 121,115,218,17,95,116,101,115,116,105,110,116,101,114,110,97, 
108,99,97,112,105,218,5,112,114,105,110,116,218,4,97,114, 103,118,218,11,103,101,116,95,99,111,110,102,105,103,115,114, - 5,0,0,0,218,3,107,101,121,169,0,243,0,0,0,0, + 3,0,0,0,218,3,107,101,121,169,0,243,0,0,0,0, 218,18,116,101,115,116,95,102,114,111,122,101,110,109,97,105, - 110,46,112,121,218,8,60,109,111,100,117,108,101,62,114,22, + 110,46,112,121,218,8,60,109,111,100,117,108,101,62,114,18, 0,0,0,1,0,0,0,115,94,0,0,0,240,3,1,1, 1,243,8,0,1,11,219,0,24,225,0,5,208,6,26,212, 0,27,217,0,5,128,106,144,35,151,40,145,40,212,0,27, 216,9,26,215,9,38,210,9,38,211,9,40,168,24,209,9, 50,128,6,243,2,6,12,2,128,67,241,14,0,5,10,136, 71,144,67,144,53,152,2,152,54,160,35,153,59,152,45,208, - 10,40,214,4,41,242,15,6,12,2,114,20,0,0,0, + 10,40,214,4,41,242,15,6,12,2,114,16,0,0,0, }; diff --git a/Python/bytecodes.c b/Python/bytecodes.c index bdc02183e807b2..3ef67d5491250c 100644 --- a/Python/bytecodes.c +++ b/Python/bytecodes.c @@ -199,7 +199,6 @@ dummy_func( frame->instr_ptr = frame->bytecode + off; this_instr = frame->instr_ptr; next_instr = frame->instr_ptr + 1; - } #else (void)this_instr; @@ -212,7 +211,7 @@ dummy_func( _QUICKEN_RESUME + _CHECK_PERIODIC_IF_NOT_YIELD_FROM; - op(_RESUME_CHECK, (--)) { + inst(RESUME_CHECK, (--)) { #if defined(__EMSCRIPTEN__) DEOPT_IF(_Py_emscripten_signal_clock == 0); _Py_emscripten_signal_clock -= Py_EMSCRIPTEN_SIGNAL_HANDLING; @@ -221,12 +220,20 @@ dummy_func( uintptr_t version = FT_ATOMIC_LOAD_UINTPTR_ACQUIRE(_PyFrame_GetCode(frame)->_co_instrumentation_version); assert((version & _PY_EVAL_EVENTS_MASK) == 0); DEOPT_IF(eval_breaker != version); + #ifdef Py_GIL_DISABLED + _Py_CODEUNIT *bytecode = _PyCode_GetExecutableCode(_PyFrame_GetCode(frame)); + if (frame->bytecode != bytecode) { + /* Avoid using this_instr here so that _RESUME_CHECK can be included + in traces. 
+ */ + int off = frame->instr_ptr - frame->bytecode; + frame->bytecode = bytecode; + frame->instr_ptr = frame->bytecode + off; + next_instr = frame->instr_ptr + 1; + } + #endif } - macro(RESUME_CHECK) = - _LOAD_BYTECODE + - _RESUME_CHECK; - op(_MONITOR_RESUME, (--)) { _PyFrame_SetStackPointer(frame, stack_pointer); int err = _Py_call_instrumentation( diff --git a/Python/executor_cases.c.h b/Python/executor_cases.c.h index 1ca87325f8db2d..8da7625780323a 100644 --- a/Python/executor_cases.c.h +++ b/Python/executor_cases.c.h @@ -54,6 +54,18 @@ UOP_STAT_INC(uopcode, miss); JUMP_TO_JUMP_TARGET(); } + #ifdef Py_GIL_DISABLED + _Py_CODEUNIT *bytecode = _PyCode_GetExecutableCode(_PyFrame_GetCode(frame)); + if (frame->bytecode != bytecode) { + /* Avoid using this_instr here so that _RESUME_CHECK can be included + in traces. + */ + int off = frame->instr_ptr - frame->bytecode; + frame->bytecode = bytecode; + frame->instr_ptr = frame->bytecode + off; + next_instr = frame->instr_ptr + 1; + } + #endif break; } diff --git a/Python/generated_cases.c.h b/Python/generated_cases.c.h index 52901893db95c0..8b705df70a5232 100644 --- a/Python/generated_cases.c.h +++ b/Python/generated_cases.c.h @@ -6482,36 +6482,30 @@ } TARGET(RESUME_CHECK) { - _Py_CODEUNIT *this_instr = frame->instr_ptr = next_instr; + frame->instr_ptr = next_instr; next_instr += 1; INSTRUCTION_STATS(RESUME_CHECK); static_assert(0 == 0, "incorrect cache size"); - // _LOAD_BYTECODE - { - #ifdef Py_GIL_DISABLED - _Py_CODEUNIT *bytecode = _PyCode_GetExecutableCode(_PyFrame_GetCode(frame)); - if (frame->bytecode != bytecode) { - int off = this_instr - frame->bytecode; - frame->bytecode = bytecode; - frame->instr_ptr = frame->bytecode + off; - this_instr = frame->instr_ptr; - next_instr = frame->instr_ptr + 1; - } - #else - (void)this_instr; - #endif - } - // _RESUME_CHECK - { - #if defined(__EMSCRIPTEN__) - DEOPT_IF(_Py_emscripten_signal_clock == 0, RESUME); - _Py_emscripten_signal_clock -= Py_EMSCRIPTEN_SIGNAL_HANDLING; - 
#endif - uintptr_t eval_breaker = _Py_atomic_load_uintptr_relaxed(&tstate->eval_breaker); - uintptr_t version = FT_ATOMIC_LOAD_UINTPTR_ACQUIRE(_PyFrame_GetCode(frame)->_co_instrumentation_version); - assert((version & _PY_EVAL_EVENTS_MASK) == 0); - DEOPT_IF(eval_breaker != version, RESUME); + #if defined(__EMSCRIPTEN__) + DEOPT_IF(_Py_emscripten_signal_clock == 0, RESUME); + _Py_emscripten_signal_clock -= Py_EMSCRIPTEN_SIGNAL_HANDLING; + #endif + uintptr_t eval_breaker = _Py_atomic_load_uintptr_relaxed(&tstate->eval_breaker); + uintptr_t version = FT_ATOMIC_LOAD_UINTPTR_ACQUIRE(_PyFrame_GetCode(frame)->_co_instrumentation_version); + assert((version & _PY_EVAL_EVENTS_MASK) == 0); + DEOPT_IF(eval_breaker != version, RESUME); + #ifdef Py_GIL_DISABLED + _Py_CODEUNIT *bytecode = _PyCode_GetExecutableCode(_PyFrame_GetCode(frame)); + if (frame->bytecode != bytecode) { + /* Avoid using this_instr here so that _RESUME_CHECK can be included + in traces. + */ + int off = frame->instr_ptr - frame->bytecode; + frame->bytecode = bytecode; + frame->instr_ptr = frame->bytecode + off; + next_instr = frame->instr_ptr + 1; } + #endif DISPATCH(); } From 942f62898182d61cc889214de715391f06df6a16 Mon Sep 17 00:00:00 2001 From: Matt Page Date: Wed, 11 Sep 2024 15:39:25 -0700 Subject: [PATCH 32/67] Use __VA_ARGS__ for macros Named argument lists apparently aren't supported on windows --- Python/instrumentation.c | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/Python/instrumentation.c b/Python/instrumentation.c index 18b77193fc4c52..1d22bc6695ac92 100644 --- a/Python/instrumentation.c +++ b/Python/instrumentation.c @@ -44,7 +44,7 @@ #define UNLOCK_CODE() Py_END_CRITICAL_SECTION() -#define MODIFY_BYTECODE(code, func, args...) \ +#define MODIFY_BYTECODE(code, func, ...) 
\ do { \ PyCodeObject *co = (code); \ for (Py_ssize_t i = 0; i < code->co_tlbc->size; i++) { \ @@ -52,7 +52,7 @@ if (bc == NULL) { \ continue; \ } \ - (func)((_Py_CODEUNIT *)bc, args); \ + (func)((_Py_CODEUNIT *)bc, __VA_ARGS__); \ } \ } while (0) @@ -60,7 +60,8 @@ #define LOCK_CODE(code) #define UNLOCK_CODE() -#define MODIFY_BYTECODE(code, func, args...) (func)(_PyCode_CODE(code), args) +#define MODIFY_BYTECODE(code, func, ...) \ + (func)(_PyCode_CODE(code), __VA_ARGS__) #endif From 66cb24d82d6557cf07725f8bacf720dcc0134fab Mon Sep 17 00:00:00 2001 From: Matt Page Date: Wed, 11 Sep 2024 15:46:02 -0700 Subject: [PATCH 33/67] Update vcxproj files to include newly added files Hopefully I did this correctly. --- PCbuild/_freeze_module.vcxproj | 1 + PCbuild/_freeze_module.vcxproj.filters | 3 +++ PCbuild/pythoncore.vcxproj | 2 ++ PCbuild/pythoncore.vcxproj.filters | 6 ++++++ 4 files changed, 12 insertions(+) diff --git a/PCbuild/_freeze_module.vcxproj b/PCbuild/_freeze_module.vcxproj index 743e6e2a66a8f1..65ca5136587f8c 100644 --- a/PCbuild/_freeze_module.vcxproj +++ b/PCbuild/_freeze_module.vcxproj @@ -222,6 +222,7 @@ + diff --git a/PCbuild/_freeze_module.vcxproj.filters b/PCbuild/_freeze_module.vcxproj.filters index 0887a47917a931..b11791ef95efce 100644 --- a/PCbuild/_freeze_module.vcxproj.filters +++ b/PCbuild/_freeze_module.vcxproj.filters @@ -232,6 +232,9 @@ Source Files + + Source Files + Source Files diff --git a/PCbuild/pythoncore.vcxproj b/PCbuild/pythoncore.vcxproj index 6399eac313db29..4c818d6938b7aa 100644 --- a/PCbuild/pythoncore.vcxproj +++ b/PCbuild/pythoncore.vcxproj @@ -252,6 +252,7 @@ + @@ -611,6 +612,7 @@ + diff --git a/PCbuild/pythoncore.vcxproj.filters b/PCbuild/pythoncore.vcxproj.filters index 23f2e9c8bc0eb7..00692bd6497c28 100644 --- a/PCbuild/pythoncore.vcxproj.filters +++ b/PCbuild/pythoncore.vcxproj.filters @@ -678,6 +678,9 @@ Include\internal + + Include\internal + Include\internal @@ -1373,6 +1376,9 @@ Python + + Python + Python From 
ad12bd42d7cd91d008cde3236a238a5f7f60bd56 Mon Sep 17 00:00:00 2001 From: Matt Page Date: Wed, 11 Sep 2024 16:12:55 -0700 Subject: [PATCH 34/67] Mark unused params --- Objects/codeobject.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Objects/codeobject.c b/Objects/codeobject.c index 837c0a154c40c4..9be04087d5e127 100644 --- a/Objects/codeobject.c +++ b/Objects/codeobject.c @@ -2855,7 +2855,7 @@ release_bytes_for_tlbc(Py_ssize_t nbytes) } static int -disable_specialization(PyObject *obj, void *) +disable_specialization(PyObject *obj, void *Py_UNUSED(arg)) { if (!PyCode_Check(obj)) { return 1; From 1bbbbbc8dbbcbb4b0efc6b0a3067b1bdbfe289d3 Mon Sep 17 00:00:00 2001 From: Matt Page Date: Thu, 12 Sep 2024 15:09:08 -0700 Subject: [PATCH 35/67] Keep tier2 and the JIT disabled in free-threaded builds --- Lib/test/test_capi/test_opt.py | 7 ++++++- Python/bytecodes.c | 2 +- Python/generated_cases.c.h | 2 +- 3 files changed, 8 insertions(+), 3 deletions(-) diff --git a/Lib/test/test_capi/test_opt.py b/Lib/test/test_capi/test_opt.py index f1ab72180d714d..c352325ff3d08a 100644 --- a/Lib/test/test_capi/test_opt.py +++ b/Lib/test/test_capi/test_opt.py @@ -7,7 +7,8 @@ import _opcode -from test.support import script_helper, requires_specialization, import_helper +from test.support import (script_helper, requires_specialization, + import_helper, Py_GIL_DISABLED) _testinternalcapi = import_helper.import_module("_testinternalcapi") @@ -34,6 +35,7 @@ def clear_executors(func): @requires_specialization +@unittest.skipIf(Py_GIL_DISABLED, "optimizer not yet supported in free-threaded builds") @unittest.skipUnless(hasattr(_testinternalcapi, "get_optimizer"), "Requires optimizer infrastructure") class TestOptimizerAPI(unittest.TestCase): @@ -138,6 +140,7 @@ def get_opnames(ex): @requires_specialization +@unittest.skipIf(Py_GIL_DISABLED, "optimizer not yet supported in free-threaded builds") @unittest.skipUnless(hasattr(_testinternalcapi, "get_optimizer"), "Requires 
optimizer infrastructure") class TestExecutorInvalidation(unittest.TestCase): @@ -219,6 +222,7 @@ def f(): @requires_specialization +@unittest.skipIf(Py_GIL_DISABLED, "optimizer not yet supported in free-threaded builds") @unittest.skipUnless(hasattr(_testinternalcapi, "get_optimizer"), "Requires optimizer infrastructure") @unittest.skipIf(os.getenv("PYTHON_UOPS_OPTIMIZE") == "0", "Needs uop optimizer to run.") @@ -586,6 +590,7 @@ def testfunc(n): @requires_specialization +@unittest.skipIf(Py_GIL_DISABLED, "optimizer not yet supported in free-threaded builds") @unittest.skipUnless(hasattr(_testinternalcapi, "get_optimizer"), "Requires optimizer infrastructure") @unittest.skipIf(os.getenv("PYTHON_UOPS_OPTIMIZE") == "0", "Needs uop optimizer to run.") diff --git a/Python/bytecodes.c b/Python/bytecodes.c index 3ef67d5491250c..5c04d0e452be13 100644 --- a/Python/bytecodes.c +++ b/Python/bytecodes.c @@ -2558,7 +2558,7 @@ dummy_func( assert(oparg <= INSTR_OFFSET()); JUMPBY(-oparg); #ifdef _Py_TIER2 - #if ENABLE_SPECIALIZATION + #if ENABLE_SPECIALIZATION && !Py_GIL_DISABLED _Py_BackoffCounter counter = this_instr[1].counter; if (backoff_counter_triggers(counter) && this_instr->op.code == JUMP_BACKWARD) { _Py_CODEUNIT *start = this_instr; diff --git a/Python/generated_cases.c.h b/Python/generated_cases.c.h index 8b705df70a5232..05930c385c514c 100644 --- a/Python/generated_cases.c.h +++ b/Python/generated_cases.c.h @@ -4754,7 +4754,7 @@ assert(oparg <= INSTR_OFFSET()); JUMPBY(-oparg); #ifdef _Py_TIER2 - #if ENABLE_SPECIALIZATION + #if ENABLE_SPECIALIZATION && !Py_GIL_DISABLED _Py_BackoffCounter counter = this_instr[1].counter; if (backoff_counter_triggers(counter) && this_instr->op.code == JUMP_BACKWARD) { _Py_CODEUNIT *start = this_instr; From e63e403641eccdb118dce4b0efaf42198adbc4e9 Mon Sep 17 00:00:00 2001 From: Matt Page Date: Thu, 12 Sep 2024 18:50:06 -0700 Subject: [PATCH 36/67] Only allow enabling/disabling tlbc --- Include/cpython/initconfig.h | 2 +- 
Include/internal/pycore_ceval.h | 15 ++++ Include/internal/pycore_code.h | 48 ++-------- Include/internal/pycore_interp.h | 5 -- Lib/test/test_capi/test_config.py | 2 +- Lib/test/test_cmd_line.py | 34 ++++--- Lib/test/test_embed.py | 2 +- Lib/test/test_tlbc.py | 12 +-- Objects/codeobject.c | 141 ++---------------------------- Python/bytecodes.c | 4 +- Python/ceval.c | 2 +- Python/executor_cases.c.h | 2 +- Python/generated_cases.c.h | 6 +- Python/initconfig.c | 37 ++++---- Python/pylifecycle.c | 4 - Python/pystate.c | 1 - 16 files changed, 83 insertions(+), 234 deletions(-) diff --git a/Include/cpython/initconfig.h b/Include/cpython/initconfig.h index 8b4ad95ed9f89c..f69c586a4f96f3 100644 --- a/Include/cpython/initconfig.h +++ b/Include/cpython/initconfig.h @@ -183,7 +183,7 @@ typedef struct PyConfig { int cpu_count; #ifdef Py_GIL_DISABLED int enable_gil; - int tlbc_limit; + int tlbc_enabled; #endif /* --- Path configuration inputs ------------ */ diff --git a/Include/internal/pycore_ceval.h b/Include/internal/pycore_ceval.h index e4af731be0e87f..4141c0f80b0da7 100644 --- a/Include/internal/pycore_ceval.h +++ b/Include/internal/pycore_ceval.h @@ -177,6 +177,21 @@ _PyEval_IsGILEnabled(PyThreadState *tstate) extern int _PyEval_EnableGILTransient(PyThreadState *tstate); extern int _PyEval_EnableGILPermanent(PyThreadState *tstate); extern int _PyEval_DisableGIL(PyThreadState *state); + + +static inline _Py_CODEUNIT * +_PyEval_GetExecutableCode(PyCodeObject *co) +{ + _Py_CODEUNIT *bc = _PyCode_GetTLBCFast(co); + if (bc != NULL) { + return bc; + } + if (!_PyInterpreterState_GET()->config.tlbc_enabled) { + return _PyCode_CODE(co); + } + return _PyCode_GetTLBC(co); +} + #endif extern void _PyEval_DeactivateOpCache(void); diff --git a/Include/internal/pycore_code.h b/Include/internal/pycore_code.h index 0d9f33a34b3a91..2df170e2b25ab9 100644 --- a/Include/internal/pycore_code.h +++ b/Include/internal/pycore_code.h @@ -633,24 +633,6 @@ PyAPI_DATA(const struct _PyCode8) 
_Py_InitCleanup; #ifdef Py_GIL_DISABLED -typedef enum { - // No limit on the amount of memory consumed by thread-local bytecode. - // Terminal state. - _PY_TLBC_UNLIMITED = 0, - - // The total amount of memory consumed by thread-local bytecode must be - // <= PyInterpreterState::tlbc_limit. State transitions to - // _PY_TLBC_DISABLED - // when the limit is reached. - _PY_TLBC_LIMITED = 1, - - // New thread-local bytecode is disabled. Previously allocated copies - // may still be used. Terminal state. - _PY_TLBC_DISABLED = 2, -} _Py_TLBC_State; - -extern void _PyCode_InitState(PyInterpreterState *interp); - // Return a pointer to the thread-local bytecode for the current thread, if it // exists. static inline _Py_CODEUNIT * @@ -665,31 +647,15 @@ _PyCode_GetTLBCFast(PyCodeObject *co) return NULL; } -// Return a pointer to the thread-local bytecode for the current thread, creating -// it if it doesn't exist. -// -// On error, NULL is returned, new thread-local bytecode is disabled, and -// specialization is disabled for the "main" copy of the bytecode (the bytecode -// embedded in the code object) for all code objects. -extern _Py_CODEUNIT *_PyCode_GetTLBCSlow(PyCodeObject *co); - -// Return the bytecode that should be executed by the current thread, creating -// a copy if necessary. -static inline _Py_CODEUNIT * -_PyCode_GetExecutableCode(PyCodeObject *co) -{ - _Py_CODEUNIT *res = _PyCode_GetTLBCFast(co); - if (res != NULL) { - return res; - } - res = _PyCode_GetTLBCSlow(co); - if (res != NULL) { - return res; - } - return _PyCode_CODE(co); -} +// Return a pointer to the thread-local bytecode for the current thread, +// creating it if necessary. 
+extern _Py_CODEUNIT *_PyCode_GetTLBC(PyCodeObject *co); +// Reserve an index for the current thread into thread-local bytecode +// arrays extern int _Py_ReserveTLBCIndex(PyInterpreterState *interp); + +// Release the current thread's index into thread-local bytecode arrays extern void _Py_ClearTLBCIndex(_PyThreadStateImpl *tstate); #endif diff --git a/Include/internal/pycore_interp.h b/Include/internal/pycore_interp.h index 9bfed0cd28f1a4..8080d894e77f95 100644 --- a/Include/internal/pycore_interp.h +++ b/Include/internal/pycore_interp.h @@ -225,11 +225,6 @@ struct _is { struct _Py_type_id_pool type_ids; PyMutex weakref_locks[NUM_WEAKREF_LIST_LOCKS]; _PyIndexPool tlbc_indices; - // Number of bytes available for thread-local bytecode, counts down to - // zero. - Py_ssize_t tlbc_avail; - PyMutex tlbc_avail_mutex; - _Py_TLBC_State tlbc_state; #endif // Per-interpreter state for the obmalloc allocator. For the main diff --git a/Lib/test/test_capi/test_config.py b/Lib/test/test_capi/test_config.py index 7e7b7a60aa21f0..b323f4e7df5d58 100644 --- a/Lib/test/test_capi/test_config.py +++ b/Lib/test/test_capi/test_config.py @@ -100,7 +100,7 @@ def test_config_get(self): options.append(("run_presite", str | None, None)) if sysconfig.get_config_var('Py_GIL_DISABLED'): options.append(("enable_gil", int, None)) - options.append(("tlbc_limit", int, None)) + options.append(("tlbc_enabled", int, None)) if support.MS_WINDOWS: options.extend(( ("legacy_windows_stdio", bool, None), diff --git a/Lib/test/test_cmd_line.py b/Lib/test/test_cmd_line.py index 0ee81126ca6725..f088e5b8b5089c 100644 --- a/Lib/test/test_cmd_line.py +++ b/Lib/test/test_cmd_line.py @@ -1070,10 +1070,10 @@ def res2int(self, res): return tuple(int(i) for i in out.split()) @unittest.skipUnless(support.Py_GIL_DISABLED, - "PYTHON_TLBC_LIMIT and -X tlbc_limit" + "PYTHON_TLBC and -X tlbc" " only supported in Py_GIL_DISABLED builds") @threading_helper.requires_working_threading() - def 
test_set_thread_local_bytecode_limit(self): + def test_disable_thread_local_bytecode(self): code = """if 1: import threading def test(x, y): @@ -1081,21 +1081,27 @@ def test(x, y): t = threading.Thread(target=test, args=(1,2)) t.start() t.join()""" - rc, out, err = assert_python_ok("-W", "always", "-X", "tlbc_limit=1", "-c", code) - self.assertIn(b"Reached memory limit for thread-local bytecode", err) - rc, out, err = assert_python_ok("-W", "always", "-c", code, PYTHON_TLBC_LIMIT="1") - self.assertIn(b"Reached memory limit for thread-local bytecode", err) + assert_python_ok("-W", "always", "-X", "tlbc=0", "-c", code) + assert_python_ok("-W", "always", "-c", code, PYTHON_TLBC="0") @unittest.skipUnless(support.Py_GIL_DISABLED, - "PYTHON_TLBC_LIMIT and -X tlbc_limit" + "PYTHON_TLBC and -X tlbc" " only supported in Py_GIL_DISABLED builds") - def test_invalid_thread_local_bytecode_limit(self): - rc, out, err = assert_python_failure("-X", "tlbc_limit") - self.assertIn(b"tlbc_limit=n: n is missing or invalid", err) - rc, out, err = assert_python_failure("-X", "tlbc_limit=foo") - self.assertIn(b"tlbc_limit=n: n is missing or invalid", err) - rc, out, err = assert_python_failure(PYTHON_TLBC_LIMIT="foo") - self.assertIn(b"PYTHON_TLBC_LIMIT=N: N is missing or invalid", err) + def test_invalid_thread_local_bytecode(self): + rc, out, err = assert_python_failure("-X", "tlbc") + self.assertIn(b"tlbc=n: n is missing or invalid", err) + rc, out, err = assert_python_failure("-X", "tlbc=foo") + self.assertIn(b"tlbc=n: n is missing or invalid", err) + rc, out, err = assert_python_failure("-X", "tlbc=-1") + self.assertIn(b"tlbc=n: n is missing or invalid", err) + rc, out, err = assert_python_failure("-X", "tlbc=2") + self.assertIn(b"tlbc=n: n is missing or invalid", err) + rc, out, err = assert_python_failure(PYTHON_TLBC="foo") + self.assertIn(b"PYTHON_TLBC=N: N is missing or invalid", err) + rc, out, err = assert_python_failure(PYTHON_TLBC="-1") + self.assertIn(b"PYTHON_TLBC=N: N is 
missing or invalid", err) + rc, out, err = assert_python_failure(PYTHON_TLBC="2") + self.assertIn(b"PYTHON_TLBC=N: N is missing or invalid", err) @unittest.skipIf(interpreter_requires_environment(), diff --git a/Lib/test/test_embed.py b/Lib/test/test_embed.py index 385e08bed10cd3..637a8591b5bb2e 100644 --- a/Lib/test/test_embed.py +++ b/Lib/test/test_embed.py @@ -631,7 +631,7 @@ class InitConfigTests(EmbeddingTestsMixin, unittest.TestCase): CONFIG_COMPAT['run_presite'] = None if support.Py_GIL_DISABLED: CONFIG_COMPAT['enable_gil'] = -1 - CONFIG_COMPAT['tlbc_limit'] = GET_DEFAULT_CONFIG + CONFIG_COMPAT['tlbc_enabled'] = GET_DEFAULT_CONFIG if MS_WINDOWS: CONFIG_COMPAT.update({ 'legacy_windows_stdio': False, diff --git a/Lib/test/test_tlbc.py b/Lib/test/test_tlbc.py index adcab24215756b..16a98faa6783d7 100644 --- a/Lib/test/test_tlbc.py +++ b/Lib/test/test_tlbc.py @@ -45,7 +45,7 @@ def f(a, b, q=None): assert "BINARY_OP_ADD_INT" in all_opnames(get_tlbc(f)) assert "BINARY_OP_ADD_INT" not in all_opnames(q.get()) """) - assert_python_ok("-X", "tlbc_limit=-1", "-c", code) + assert_python_ok("-X", "tlbc=1", "-c", code) @requires_specialization_of("BINARY_OP") def test_threads_specialize_independently(self): @@ -82,7 +82,7 @@ def g(a, b, q=None): assert "BINARY_OP_ADD_INT" not in t_opnames assert "BINARY_OP_ADD_UNICODE" in t_opnames """) - assert_python_ok("-X", "tlbc_limit=-1", "-c", code) + assert_python_ok("-X", "tlbc=1", "-c", code) def test_reuse_tlbc_across_threads_different_lifetimes(self): code = textwrap.dedent(""" @@ -107,7 +107,7 @@ def f(a, b, q=None): assert tlbc_ids[0] == tlbc_ids[1] assert tlbc_ids[1] == tlbc_ids[2] """) - assert_python_ok("-X", "tlbc_limit=-1", "-c", code) + assert_python_ok("-X", "tlbc=1", "-c", code) def test_no_tlbc_if_tlbc_disabled(self): code = textwrap.dedent(""" @@ -138,7 +138,7 @@ def f(a, b, q=None): assert tlbcs[1] is None assert tlbcs[2] is None """) - assert_python_ok("-X", "tlbc_limit=0", "-c", code) + assert_python_ok("-X", 
"tlbc=0", "-c", code) def test_no_specialization_if_tlbc_disabled(self): code = textwrap.dedent(""" @@ -160,7 +160,7 @@ def f(a, b): assert "BINARY_OP_ADD_INT" not in all_opnames(f) """) - assert_python_ok("-X", "tlbc_limit=0", "-c", code) + assert_python_ok("-X", "tlbc=0", "-c", code) def test_generator_throw(self): code = textwrap.dedent(""" @@ -190,7 +190,7 @@ def f(q): main_id = gen.throw(ValueError) assert main_id != q.get() """) - assert_python_ok("-X", "tlbc_limit=-1", "-c", code) + assert_python_ok("-X", "tlbc=1", "-c", code) if __name__ == "__main__": diff --git a/Objects/codeobject.c b/Objects/codeobject.c index 9be04087d5e127..82f8ce1f1d250f 100644 --- a/Objects/codeobject.c +++ b/Objects/codeobject.c @@ -456,7 +456,6 @@ extern void _PyCode_Quicken(_Py_CODEUNIT *instructions, Py_ssize_t size); #ifdef Py_GIL_DISABLED extern void _PyCode_DisableSpecialization(_Py_CODEUNIT *instructions, Py_ssize_t size); static _PyCodeArray * _PyCodeArray_New(Py_ssize_t size); -static void release_bytes_for_tlbc(Py_ssize_t nbytes); #endif static int @@ -536,11 +535,11 @@ init_code(PyCodeObject *co, struct _PyCodeConstructor *con) } co->_co_firsttraceable = entry_point; #ifdef Py_GIL_DISABLED - if (interp->tlbc_state == _PY_TLBC_DISABLED) { - _PyCode_DisableSpecialization(_PyCode_CODE(co), Py_SIZE(co)); + if (interp->config.tlbc_enabled) { + _PyCode_Quicken(_PyCode_CODE(co), Py_SIZE(co)); } else { - _PyCode_Quicken(_PyCode_CODE(co), Py_SIZE(co)); + _PyCode_DisableSpecialization(_PyCode_CODE(co), Py_SIZE(co)); } #else _PyCode_Quicken(_PyCode_CODE(co), Py_SIZE(co)); @@ -1905,15 +1904,12 @@ code_dealloc(PyCodeObject *co) #ifdef Py_GIL_DISABLED // The first element always points to the mutable bytecode at the end of // the code object, which will be freed when the code object is freed. 
- Py_ssize_t bytes_freed = 0; for (Py_ssize_t i = 1; i < co->co_tlbc->size; i++) { char *entry = co->co_tlbc->entries[i]; if (entry != NULL) { PyMem_Free(entry); - bytes_freed += _PyCode_NBYTES(co); } } - release_bytes_for_tlbc(bytes_freed); PyMem_Free(co->co_tlbc); #endif PyObject_Free(co); @@ -2696,37 +2692,14 @@ _PyCode_Fini(PyInterpreterState *interp) // is stored at the end of the code object. This ensures that no bytecode is // copied for programs that do not use threads. // -// The total amount of memory consumed by thread-local bytecode can be limited -// at runtime by setting either `-X tlbc_limit` or `PYTHON_TLBC_LIMIT`. When -// the limit is reached, no new copies of thread-local bytecode can be created -// and specialization is disabled for the "main" copy of the bytecode (the -// bytecode at index 0 of the `co_tlbc` array). Threads can continue to -// specialize existing thread-local copies of the bytecode (other than the -// "main" copy). All other execution will use the unspecialized, "main" copy of -// the bytecode. +// Thread-local bytecode can be disabled at runtime by providing either `-X +// tlbc=0` or `PYTHON_TLBC=0`. Disabling thread-local bytecode also disables +// specialization. // // Concurrent modifications to the bytecode made by the specializing // interpreter and instrumentation use atomics, with specialization taking care // not to overwrite an instruction that was instrumented concurrently. 
-void -_PyCode_InitState(PyInterpreterState *interp) -{ - int limit = interp->config.tlbc_limit; - if (limit < 0) { - interp->tlbc_avail = -1; - interp->tlbc_state = _PY_TLBC_UNLIMITED; - } - else if (limit == 0) { - interp->tlbc_avail = 0; - interp->tlbc_state = _PY_TLBC_DISABLED; - } - else { - interp->tlbc_avail = limit; - interp->tlbc_state = _PY_TLBC_LIMITED; - } -} - int _Py_ReserveTLBCIndex(PyInterpreterState *interp) { @@ -2805,91 +2778,6 @@ create_tlbc_lock_held(PyCodeObject *co, Py_ssize_t idx) return (_Py_CODEUNIT *) bc; } -static Py_ssize_t -reserve_bytes_for_tlbc(PyCodeObject *co) -{ - PyInterpreterState *interp = _PyInterpreterState_GET(); - Py_ssize_t code_size = _PyCode_NBYTES(co); - PyMutex_LockFlags(&interp->tlbc_avail_mutex, _Py_LOCK_DONT_DETACH); - Py_ssize_t nbytes_reserved; - switch (interp->tlbc_state) { - case _PY_TLBC_UNLIMITED: { - nbytes_reserved = code_size; - break; - } - case _PY_TLBC_LIMITED: { - if (interp->tlbc_avail >= code_size) { - nbytes_reserved = code_size; - interp->tlbc_avail -= code_size; - } - else { - nbytes_reserved = -1; - } - break; - } - case _PY_TLBC_DISABLED: { - nbytes_reserved = -1; - break; - } - default: { - Py_UNREACHABLE(); - } - } - PyMutex_Unlock(&interp->tlbc_avail_mutex); - return nbytes_reserved; -} - -static void -release_bytes_for_tlbc(Py_ssize_t nbytes) -{ - assert(nbytes >= 0); - if (nbytes == 0) { - return; - } - PyInterpreterState *interp = _PyInterpreterState_GET(); - PyMutex_LockFlags(&interp->tlbc_avail_mutex, _Py_LOCK_DONT_DETACH); - if (interp->tlbc_avail >= 0) { - interp->tlbc_avail += nbytes; - } - PyMutex_Unlock(&interp->tlbc_avail_mutex); -} - -static int -disable_specialization(PyObject *obj, void *Py_UNUSED(arg)) -{ - if (!PyCode_Check(obj)) { - return 1; - } - PyCodeObject *co = (PyCodeObject *)obj; - _PyCode_DisableSpecialization(_PyCode_CODE(co), Py_SIZE(co)); - return 1; -} - -static void -disable_new_tlbc(void) -{ - PyInterpreterState *interp = _PyInterpreterState_GET(); - if 
(interp->tlbc_state == _PY_TLBC_DISABLED) { - return; - } - // Disable creation of new thread-local copies of bytecode. We disable - // further specialization of the "main" copy of the bytecode (the bytecode - // that is embedded in the code object), so that multiple threads can - // safely execute it concurrently. From this point on, threads are free to - // specialize existing thread-local copies of the bytecode (other than the - // main copy), but any attempts to create new copies of bytecode will fail, - // and the main, unspecializable copy will be used. - _PyEval_StopTheWorld(interp); - interp->tlbc_state = _PY_TLBC_DISABLED; - _PyEval_StartTheWorld(interp); - PyUnstable_GC_VisitObjects(disable_specialization, NULL); - if (PyErr_WarnEx(PyExc_ResourceWarning, - "Reached memory limit for thread-local bytecode", - 1) < 0) { - PyErr_WriteUnraisable(NULL); - } -} - static _Py_CODEUNIT * get_tlbc_lock_held(PyCodeObject *co) { @@ -2899,25 +2787,12 @@ get_tlbc_lock_held(PyCodeObject *co) if (idx < tlbc->size && tlbc->entries[idx] != NULL) { return (_Py_CODEUNIT *)tlbc->entries[idx]; } - Py_ssize_t reserved = reserve_bytes_for_tlbc(co); - if (reserved == -1) { - disable_new_tlbc(); - return NULL; - } - _Py_CODEUNIT *result = create_tlbc_lock_held(co, idx); - if (result == NULL) { - release_bytes_for_tlbc(reserved); - } - return result; + return create_tlbc_lock_held(co, idx); } _Py_CODEUNIT * -_PyCode_GetTLBCSlow(PyCodeObject *co) +_PyCode_GetTLBC(PyCodeObject *co) { - PyInterpreterState *interp = _PyInterpreterState_GET(); - if (interp->tlbc_state == _PY_TLBC_DISABLED) { - return NULL; - } _Py_CODEUNIT *result; Py_BEGIN_CRITICAL_SECTION(co); result = get_tlbc_lock_held(co); diff --git a/Python/bytecodes.c b/Python/bytecodes.c index 5c04d0e452be13..80d41989791678 100644 --- a/Python/bytecodes.c +++ b/Python/bytecodes.c @@ -192,7 +192,7 @@ dummy_func( op(_LOAD_BYTECODE, (--)) { #ifdef Py_GIL_DISABLED - _Py_CODEUNIT *bytecode = 
_PyCode_GetExecutableCode(_PyFrame_GetCode(frame)); + _Py_CODEUNIT *bytecode = _PyEval_GetExecutableCode(_PyFrame_GetCode(frame)); if (frame->bytecode != bytecode) { int off = this_instr - frame->bytecode; frame->bytecode = bytecode; @@ -221,7 +221,7 @@ dummy_func( assert((version & _PY_EVAL_EVENTS_MASK) == 0); DEOPT_IF(eval_breaker != version); #ifdef Py_GIL_DISABLED - _Py_CODEUNIT *bytecode = _PyCode_GetExecutableCode(_PyFrame_GetCode(frame)); + _Py_CODEUNIT *bytecode = _PyEval_GetExecutableCode(_PyFrame_GetCode(frame)); if (frame->bytecode != bytecode) { /* Avoid using this_instr here so that _RESUME_CHECK can be included in traces. diff --git a/Python/ceval.c b/Python/ceval.c index 75531657a129b7..0176508844bc2a 100644 --- a/Python/ceval.c +++ b/Python/ceval.c @@ -811,7 +811,7 @@ _PyEval_EvalFrameDefault(PyThreadState *tstate, _PyInterpreterFrame *frame, int #ifdef Py_GIL_DISABLED /* Load thread-local bytecode */ _Py_CODEUNIT *bytecode = - _PyCode_GetExecutableCode(_PyFrame_GetCode(frame)); + _PyEval_GetExecutableCode(_PyFrame_GetCode(frame)); if (frame->bytecode != bytecode) { int off = frame->instr_ptr - frame->bytecode; frame->bytecode = bytecode; diff --git a/Python/executor_cases.c.h b/Python/executor_cases.c.h index 8da7625780323a..c5d52c984a84f1 100644 --- a/Python/executor_cases.c.h +++ b/Python/executor_cases.c.h @@ -55,7 +55,7 @@ JUMP_TO_JUMP_TARGET(); } #ifdef Py_GIL_DISABLED - _Py_CODEUNIT *bytecode = _PyCode_GetExecutableCode(_PyFrame_GetCode(frame)); + _Py_CODEUNIT *bytecode = _PyEval_GetExecutableCode(_PyFrame_GetCode(frame)); if (frame->bytecode != bytecode) { /* Avoid using this_instr here so that _RESUME_CHECK can be included in traces. 
diff --git a/Python/generated_cases.c.h b/Python/generated_cases.c.h index 05930c385c514c..d5ce4910a37deb 100644 --- a/Python/generated_cases.c.h +++ b/Python/generated_cases.c.h @@ -4486,7 +4486,7 @@ // _LOAD_BYTECODE { #ifdef Py_GIL_DISABLED - _Py_CODEUNIT *bytecode = _PyCode_GetExecutableCode(_PyFrame_GetCode(frame)); + _Py_CODEUNIT *bytecode = _PyEval_GetExecutableCode(_PyFrame_GetCode(frame)); if (frame->bytecode != bytecode) { int off = this_instr - frame->bytecode; frame->bytecode = bytecode; @@ -6432,7 +6432,7 @@ // _LOAD_BYTECODE { #ifdef Py_GIL_DISABLED - _Py_CODEUNIT *bytecode = _PyCode_GetExecutableCode(_PyFrame_GetCode(frame)); + _Py_CODEUNIT *bytecode = _PyEval_GetExecutableCode(_PyFrame_GetCode(frame)); if (frame->bytecode != bytecode) { int off = this_instr - frame->bytecode; frame->bytecode = bytecode; @@ -6495,7 +6495,7 @@ assert((version & _PY_EVAL_EVENTS_MASK) == 0); DEOPT_IF(eval_breaker != version, RESUME); #ifdef Py_GIL_DISABLED - _Py_CODEUNIT *bytecode = _PyCode_GetExecutableCode(_PyFrame_GetCode(frame)); + _Py_CODEUNIT *bytecode = _PyEval_GetExecutableCode(_PyFrame_GetCode(frame)); if (frame->bytecode != bytecode) { /* Avoid using this_instr here so that _RESUME_CHECK can be included in traces. 
diff --git a/Python/initconfig.c b/Python/initconfig.c index 53e120d4d3868f..840a787b796d83 100644 --- a/Python/initconfig.c +++ b/Python/initconfig.c @@ -134,7 +134,7 @@ static const PyConfigSpec PYCONFIG_SPEC[] = { SPEC(dump_refs_file, WSTR_OPT, READ_ONLY, NO_SYS), #ifdef Py_GIL_DISABLED SPEC(enable_gil, INT, READ_ONLY, NO_SYS), - SPEC(tlbc_limit, INT, READ_ONLY, NO_SYS), + SPEC(tlbc_enabled, INT, READ_ONLY, NO_SYS), #endif SPEC(faulthandler, BOOL, READ_ONLY, NO_SYS), SPEC(filesystem_encoding, WSTR, READ_ONLY, NO_SYS), @@ -318,9 +318,8 @@ The following implementation-specific options are available:\n\ memory blocks when the program finishes or after each statement in\n\ the interactive interpreter; only works on debug builds\n" #ifdef Py_GIL_DISABLED -"-X tlbc_limit=N: limit the total size of thread-local bytecode,\n\ - per-interpreter, to N bytes. A value < 0 means unlimited. A value of\n\ - 0 disables thread-local bytecode. Also PYTHON_TLBC_LIMIT\n" +"-X tlbc=[0|1]: enable (1) or disable (0) thread-local bytecode. 
Also\n\ + PYTHON_TLBC\n" #endif "\ -X tracemalloc[=N]: trace Python memory allocations; N sets a traceback limit\n \ @@ -408,8 +407,7 @@ static const char usage_envvars[] = "PYTHONSTATS : turns on statistics gathering (-X pystats)\n" #endif #ifdef Py_GIL_DISABLED -"PYTHON_TLBC_LIMIT: limit the total size of thread-local bytecode\n" -" (-X tlbc-limit)\n" +"PYTHON_TLBC : when set to 0, disables thread-local bytecode (-X tlbc)\n" #endif "PYTHONTRACEMALLOC: trace Python memory allocations (-X tracemalloc)\n" "PYTHONUNBUFFERED: disable stdout/stderr buffering (-u)\n" @@ -990,8 +988,7 @@ _PyConfig_InitCompatConfig(PyConfig *config) config->cpu_count = -1; #ifdef Py_GIL_DISABLED config->enable_gil = _PyConfig_GIL_DEFAULT; - // 100 MiB - config->tlbc_limit = 100 * (1 << 20); + config->tlbc_enabled = 1; #endif } @@ -1876,28 +1873,28 @@ config_init_cpu_count(PyConfig *config) } static PyStatus -config_init_tlbc_limit(PyConfig *config) +config_init_tlbc(PyConfig *config) { #ifdef Py_GIL_DISABLED - const char *env = config_get_env(config, "PYTHON_TLBC_LIMIT"); + const char *env = config_get_env(config, "PYTHON_TLBC"); if (env) { - int limit = -1; - if (_Py_str_to_int(env, &limit) < 0) { + int enabled; + if (_Py_str_to_int(env, &enabled) < 0 || (enabled < 0) || (enabled > 1)) { return _PyStatus_ERR( - "PYTHON_TLBC_LIMIT=N: N is missing or invalid"); + "PYTHON_TLBC=N: N is missing or invalid"); } - config->tlbc_limit = limit; + config->tlbc_enabled = enabled; } - const wchar_t *xoption = config_get_xoption(config, L"tlbc_limit"); + const wchar_t *xoption = config_get_xoption(config, L"tlbc"); if (xoption) { - int limit = -1; + int enabled; const wchar_t *sep = wcschr(xoption, L'='); - if (!sep || (config_wstr_to_int(sep + 1, &limit) < 0)) { + if (!sep || (config_wstr_to_int(sep + 1, &enabled) < 0) || (enabled < 0) || (enabled > 1)) { return _PyStatus_ERR( - "-X tlbc_limit=n: n is missing or invalid"); + "-X tlbc=n: n is missing or invalid"); } - config->tlbc_limit = limit; + 
config->tlbc_enabled = enabled; } return _PyStatus_OK(); #else @@ -2154,7 +2151,7 @@ config_read_complex_options(PyConfig *config) } #endif - status = config_init_tlbc_limit(config); + status = config_init_tlbc(config); if (_PyStatus_EXCEPTION(status)) { return status; } diff --git a/Python/pylifecycle.c b/Python/pylifecycle.c index d1090b8570e970..27faf723745c21 100644 --- a/Python/pylifecycle.c +++ b/Python/pylifecycle.c @@ -907,10 +907,6 @@ pycore_interp_init(PyThreadState *tstate) goto done; } -#ifdef Py_GIL_DISABLED - _PyCode_InitState(interp); -#endif - done: /* sys.modules['sys'] contains a strong reference to the module */ Py_XDECREF(sysmod); diff --git a/Python/pystate.c b/Python/pystate.c index fb55644cb5f4d7..1b54a096cf7c2c 100644 --- a/Python/pystate.c +++ b/Python/pystate.c @@ -644,7 +644,6 @@ init_interpreter(PyInterpreterState *interp, _PyType_InitCache(interp); #ifdef Py_GIL_DISABLED _Py_brc_init_state(interp); - _PyCode_InitState(interp); #endif llist_init(&interp->mem_free_queue.head); for (int i = 0; i < _PY_MONITORING_UNGROUPED_EVENTS; i++) { From 8b97771fb39e25e74f915455bd79f1f6a5310b79 Mon Sep 17 00:00:00 2001 From: Matt Page Date: Fri, 13 Sep 2024 12:23:42 -0700 Subject: [PATCH 37/67] Update libpython for gdb --- Tools/gdb/libpython.py | 11 ++++++++++- 1 file changed, 10 insertions(+), 1 deletion(-) diff --git a/Tools/gdb/libpython.py b/Tools/gdb/libpython.py index cf03788d037a81..c13dd59b232ef6 100755 --- a/Tools/gdb/libpython.py +++ b/Tools/gdb/libpython.py @@ -77,6 +77,10 @@ def _managed_dict_offset(): else: return -3 * _sizeof_void_p() +def _interp_frame_has_bytecode(): + interp_frame = gdb.lookup_type("_PyInterpreterFrame") + return any(field.name == "bytecode" for field in interp_frame.fields()) + Py_TPFLAGS_INLINE_VALUES = (1 << 2) Py_TPFLAGS_MANAGED_DICT = (1 << 4) @@ -105,6 +109,8 @@ def _managed_dict_offset(): UNABLE_READ_INFO_PYTHON_FRAME = 'Unable to read information on python frame' EVALFRAME = '_PyEval_EvalFrameDefault' 
+INTERP_FRAME_HAS_BYTECODE = _interp_frame_has_bytecode() + class NullPyObjectPtr(RuntimeError): pass @@ -1082,7 +1088,10 @@ def _f_nlocalsplus(self): def _f_lasti(self): codeunit_p = gdb.lookup_type("_Py_CODEUNIT").pointer() instr_ptr = self._gdbval["instr_ptr"] - first_instr = self._f_code().field("co_code_adaptive").cast(codeunit_p) + if INTERP_FRAME_HAS_BYTECODE: + first_instr = self._gdbval["bytecode"].cast(codeunit_p) + else: + first_instr = self._f_code().field("co_code_adaptive").cast(codeunit_p) return int(instr_ptr - first_instr) def is_shim(self): From 6d4fe7354356fe2841e7600e88850595839ca3fc Mon Sep 17 00:00:00 2001 From: Matt Page Date: Fri, 13 Sep 2024 15:30:58 -0700 Subject: [PATCH 38/67] Handle out of memory errors --- Include/internal/pycore_opcode_metadata.h | 2 +- Include/internal/pycore_uop_metadata.h | 2 +- Python/bytecodes.c | 2 ++ Python/ceval.c | 3 +++ Python/executor_cases.c.h | 1 + Python/generated_cases.c.h | 3 +++ 6 files changed, 11 insertions(+), 2 deletions(-) diff --git a/Include/internal/pycore_opcode_metadata.h b/Include/internal/pycore_opcode_metadata.h index a20c55c3a90607..a7b19ee158bcd2 100644 --- a/Include/internal/pycore_opcode_metadata.h +++ b/Include/internal/pycore_opcode_metadata.h @@ -1180,7 +1180,7 @@ const struct opcode_metadata _PyOpcode_opcode_metadata[264] = { [RERAISE] = { true, INSTR_FMT_IB, HAS_ARG_FLAG | HAS_ERROR_FLAG | HAS_ERROR_NO_POP_FLAG | HAS_ESCAPES_FLAG }, [RESERVED] = { true, INSTR_FMT_IX, 0 }, [RESUME] = { true, INSTR_FMT_IB, HAS_ARG_FLAG | HAS_EVAL_BREAK_FLAG | HAS_ERROR_FLAG | HAS_ERROR_NO_POP_FLAG | HAS_ESCAPES_FLAG }, - [RESUME_CHECK] = { true, INSTR_FMT_IX, HAS_DEOPT_FLAG | HAS_ESCAPES_FLAG }, + [RESUME_CHECK] = { true, INSTR_FMT_IX, HAS_DEOPT_FLAG | HAS_ERROR_FLAG | HAS_ESCAPES_FLAG }, [RETURN_CONST] = { true, INSTR_FMT_IB, HAS_ARG_FLAG | HAS_CONST_FLAG }, [RETURN_GENERATOR] = { true, INSTR_FMT_IX, HAS_ERROR_FLAG | HAS_ERROR_NO_POP_FLAG | HAS_ESCAPES_FLAG }, [RETURN_VALUE] = { true, INSTR_FMT_IX, 
0 }, diff --git a/Include/internal/pycore_uop_metadata.h b/Include/internal/pycore_uop_metadata.h index 6bcbd354eb55c8..60ba65a44c9bdd 100644 --- a/Include/internal/pycore_uop_metadata.h +++ b/Include/internal/pycore_uop_metadata.h @@ -22,7 +22,7 @@ const uint16_t _PyUop_Flags[MAX_UOP_ID+1] = { [_NOP] = HAS_PURE_FLAG, [_CHECK_PERIODIC] = HAS_EVAL_BREAK_FLAG | HAS_ERROR_FLAG | HAS_ESCAPES_FLAG, [_CHECK_PERIODIC_IF_NOT_YIELD_FROM] = HAS_ARG_FLAG | HAS_EVAL_BREAK_FLAG | HAS_ERROR_FLAG | HAS_ESCAPES_FLAG, - [_RESUME_CHECK] = HAS_DEOPT_FLAG | HAS_ESCAPES_FLAG, + [_RESUME_CHECK] = HAS_DEOPT_FLAG | HAS_ERROR_FLAG | HAS_ESCAPES_FLAG, [_LOAD_FAST_CHECK] = HAS_ARG_FLAG | HAS_LOCAL_FLAG | HAS_ERROR_FLAG | HAS_ESCAPES_FLAG, [_LOAD_FAST_0] = HAS_LOCAL_FLAG | HAS_PURE_FLAG, [_LOAD_FAST_1] = HAS_LOCAL_FLAG | HAS_PURE_FLAG, diff --git a/Python/bytecodes.c b/Python/bytecodes.c index f9cd82c360a93d..efe988b293b28c 100644 --- a/Python/bytecodes.c +++ b/Python/bytecodes.c @@ -193,6 +193,7 @@ dummy_func( op(_LOAD_BYTECODE, (--)) { #ifdef Py_GIL_DISABLED _Py_CODEUNIT *bytecode = _PyEval_GetExecutableCode(_PyFrame_GetCode(frame)); + ERROR_IF(bytecode == NULL, error); if (frame->bytecode != bytecode) { int off = this_instr - frame->bytecode; frame->bytecode = bytecode; @@ -222,6 +223,7 @@ dummy_func( DEOPT_IF(eval_breaker != version); #ifdef Py_GIL_DISABLED _Py_CODEUNIT *bytecode = _PyEval_GetExecutableCode(_PyFrame_GetCode(frame)); + ERROR_IF(bytecode == NULL, error); if (frame->bytecode != bytecode) { /* Avoid using this_instr here so that _RESUME_CHECK can be included in traces. 
diff --git a/Python/ceval.c b/Python/ceval.c index fa79062b1723f8..b9495b5312b4de 100644 --- a/Python/ceval.c +++ b/Python/ceval.c @@ -812,6 +812,9 @@ _PyEval_EvalFrameDefault(PyThreadState *tstate, _PyInterpreterFrame *frame, int /* Load thread-local bytecode */ _Py_CODEUNIT *bytecode = _PyEval_GetExecutableCode(_PyFrame_GetCode(frame)); + if (bytecode == NULL) { + goto error; + } if (frame->bytecode != bytecode) { int off = frame->instr_ptr - frame->bytecode; frame->bytecode = bytecode; diff --git a/Python/executor_cases.c.h b/Python/executor_cases.c.h index 5ea67bd3038a14..a12b1f4dfbb250 100644 --- a/Python/executor_cases.c.h +++ b/Python/executor_cases.c.h @@ -56,6 +56,7 @@ } #ifdef Py_GIL_DISABLED _Py_CODEUNIT *bytecode = _PyEval_GetExecutableCode(_PyFrame_GetCode(frame)); + if (bytecode == NULL) JUMP_TO_ERROR(); if (frame->bytecode != bytecode) { /* Avoid using this_instr here so that _RESUME_CHECK can be included in traces. diff --git a/Python/generated_cases.c.h b/Python/generated_cases.c.h index 6cd0d9154bc6f1..cf79900c876656 100644 --- a/Python/generated_cases.c.h +++ b/Python/generated_cases.c.h @@ -4470,6 +4470,7 @@ { #ifdef Py_GIL_DISABLED _Py_CODEUNIT *bytecode = _PyEval_GetExecutableCode(_PyFrame_GetCode(frame)); + if (bytecode == NULL) goto error; if (frame->bytecode != bytecode) { int off = this_instr - frame->bytecode; frame->bytecode = bytecode; @@ -6415,6 +6416,7 @@ { #ifdef Py_GIL_DISABLED _Py_CODEUNIT *bytecode = _PyEval_GetExecutableCode(_PyFrame_GetCode(frame)); + if (bytecode == NULL) goto error; if (frame->bytecode != bytecode) { int off = this_instr - frame->bytecode; frame->bytecode = bytecode; @@ -6478,6 +6480,7 @@ DEOPT_IF(eval_breaker != version, RESUME); #ifdef Py_GIL_DISABLED _Py_CODEUNIT *bytecode = _PyEval_GetExecutableCode(_PyFrame_GetCode(frame)); + if (bytecode == NULL) goto error; if (frame->bytecode != bytecode) { /* Avoid using this_instr here so that _RESUME_CHECK can be included in traces. 
From b10478268c1363a44b23005bb46e846e4f388268 Mon Sep 17 00:00:00 2001 From: Matt Page Date: Tue, 17 Sep 2024 15:32:59 -0700 Subject: [PATCH 39/67] Fix warnings on windows --- Include/cpython/code.h | 2 +- Include/internal/pycore_code.h | 4 +++- Objects/codeobject.c | 11 ++++++----- Python/ceval.c | 2 +- 4 files changed, 11 insertions(+), 8 deletions(-) diff --git a/Include/cpython/code.h b/Include/cpython/code.h index 0b354c38b2c398..741bc062a846ae 100644 --- a/Include/cpython/code.h +++ b/Include/cpython/code.h @@ -77,7 +77,7 @@ typedef struct { */ typedef struct { Py_ssize_t size; - char *entries[]; + char *entries[1]; } _PyCodeArray; #define _PyCode_DEF_THREAD_LOCAL_BYTECODE() \ diff --git a/Include/internal/pycore_code.h b/Include/internal/pycore_code.h index 2df170e2b25ab9..5514991d29fa5d 100644 --- a/Include/internal/pycore_code.h +++ b/Include/internal/pycore_code.h @@ -653,7 +653,9 @@ extern _Py_CODEUNIT *_PyCode_GetTLBC(PyCodeObject *co); // Reserve an index for the current thread into thread-local bytecode // arrays -extern int _Py_ReserveTLBCIndex(PyInterpreterState *interp); +// +// Returns the reserved index or -1 on error. +extern Py_ssize_t _Py_ReserveTLBCIndex(PyInterpreterState *interp); // Release the current thread's index into thread-local bytecode arrays extern void _Py_ClearTLBCIndex(_PyThreadStateImpl *tstate); diff --git a/Objects/codeobject.c b/Objects/codeobject.c index 82f8ce1f1d250f..2b051e221489eb 100644 --- a/Objects/codeobject.c +++ b/Objects/codeobject.c @@ -2700,7 +2700,7 @@ _PyCode_Fini(PyInterpreterState *interp) // interpreter and instrumentation use atomics, with specialization taking care // not to overwrite an instruction that was instrumented concurrently. 
-int +Py_ssize_t _Py_ReserveTLBCIndex(PyInterpreterState *interp) { return _PyIndexPool_AllocIndex(&interp->tlbc_indices); @@ -2716,8 +2716,8 @@ _Py_ClearTLBCIndex(_PyThreadStateImpl *tstate) static _PyCodeArray * _PyCodeArray_New(Py_ssize_t size) { - _PyCodeArray *arr = - PyMem_Calloc(1, sizeof(_PyCodeArray) + sizeof(void *) * size); + _PyCodeArray *arr = PyMem_Calloc( + 1, offsetof(_PyCodeArray, entries) + sizeof(void *) * size); if (arr == NULL) { PyErr_NoMemory(); return NULL; @@ -2729,8 +2729,9 @@ _PyCodeArray_New(Py_ssize_t size) static void copy_code(_Py_CODEUNIT *dst, PyCodeObject *co) { - int code_len = Py_SIZE(co); - for (int i = 0; i < code_len; i += _PyInstruction_GetLength(co, i)) { + Py_ssize_t code_len = Py_SIZE(co); + for (Py_ssize_t i = 0; i < code_len; + i += _PyInstruction_GetLength(co, i)) { dst[i] = _Py_GetBaseCodeUnit(co, i); } _PyCode_Quicken(dst, code_len); diff --git a/Python/ceval.c b/Python/ceval.c index b9495b5312b4de..469dfee07ca601 100644 --- a/Python/ceval.c +++ b/Python/ceval.c @@ -816,7 +816,7 @@ _PyEval_EvalFrameDefault(PyThreadState *tstate, _PyInterpreterFrame *frame, int goto error; } if (frame->bytecode != bytecode) { - int off = frame->instr_ptr - frame->bytecode; + ptrdiff_t off = frame->instr_ptr - frame->bytecode; frame->bytecode = bytecode; frame->instr_ptr = frame->bytecode + off; } From deb52167c1eefc3992f8a814d5feae92ba862d35 Mon Sep 17 00:00:00 2001 From: Matt Page Date: Tue, 17 Sep 2024 18:47:36 -0700 Subject: [PATCH 40/67] Fix another warning --- Objects/codeobject.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Objects/codeobject.c b/Objects/codeobject.c index 2b051e221489eb..1524c6fb4f0dc1 100644 --- a/Objects/codeobject.c +++ b/Objects/codeobject.c @@ -2729,7 +2729,7 @@ _PyCodeArray_New(Py_ssize_t size) static void copy_code(_Py_CODEUNIT *dst, PyCodeObject *co) { - Py_ssize_t code_len = Py_SIZE(co); + int code_len = (int) Py_SIZE(co); for (Py_ssize_t i = 0; i < code_len; i += 
_PyInstruction_GetLength(co, i)) { dst[i] = _Py_GetBaseCodeUnit(co, i); From 2f11cc781d891e2e9f9b793d8658680e79022586 Mon Sep 17 00:00:00 2001 From: Matt Page Date: Tue, 17 Sep 2024 21:38:12 -0700 Subject: [PATCH 41/67] Ugh actually fix it --- Objects/codeobject.c | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/Objects/codeobject.c b/Objects/codeobject.c index 1524c6fb4f0dc1..dacfe1c7a889ef 100644 --- a/Objects/codeobject.c +++ b/Objects/codeobject.c @@ -2730,8 +2730,7 @@ static void copy_code(_Py_CODEUNIT *dst, PyCodeObject *co) { int code_len = (int) Py_SIZE(co); - for (Py_ssize_t i = 0; i < code_len; - i += _PyInstruction_GetLength(co, i)) { + for (int i = 0; i < code_len; i += _PyInstruction_GetLength(co, i)) { dst[i] = _Py_GetBaseCodeUnit(co, i); } _PyCode_Quicken(dst, code_len); From 04f1ac3d55e9f1e3526c2ec8af7e3a95895cc4ff Mon Sep 17 00:00:00 2001 From: Matt Page Date: Wed, 25 Sep 2024 12:11:10 -0700 Subject: [PATCH 42/67] Add high-level comment about index pools --- Include/internal/pycore_index_pool.h | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/Include/internal/pycore_index_pool.h b/Include/internal/pycore_index_pool.h index e393c52aecf7f2..721cc6a8075e3b 100644 --- a/Include/internal/pycore_index_pool.h +++ b/Include/internal/pycore_index_pool.h @@ -13,6 +13,11 @@ extern "C" { #ifdef Py_GIL_DISABLED +// This contains code for allocating unique indices in an array. It is used by +// the free-threaded build to assign each thread a globally unique index into +// each code object's thread-local bytecode array. + +// A min-heap of indices typedef struct _PyIndexHeap { Py_ssize_t *values; @@ -23,6 +28,8 @@ typedef struct _PyIndexHeap { Py_ssize_t capacity; } _PyIndexHeap; +// An unbounded pool of indices. Indices are allocated starting from 0. They +// may be released back to the pool once they are no longer in use. 
typedef struct _PyIndexPool { PyMutex mutex; From 7c9da242ef59824697e500ab0bab80420f516c22 Mon Sep 17 00:00:00 2001 From: Matt Page Date: Fri, 27 Sep 2024 15:29:00 -0700 Subject: [PATCH 43/67] Exclude tlbc from refleak counts TLBC is intended to be reused across threads with different lifetimes, so may appear as a "leak" depending on the order in which threads execute code objects. They are freed when the code object is destroyed, which typically occurs when the runtime is finalized. --- Lib/test/libregrtest/refleak.py | 6 ++++++ Modules/_testinternalcapi.c | 28 ++++++++++++++++++++++++++++ 2 files changed, 34 insertions(+) diff --git a/Lib/test/libregrtest/refleak.py b/Lib/test/libregrtest/refleak.py index fa447a4336a399..2e495ee2a1540f 100644 --- a/Lib/test/libregrtest/refleak.py +++ b/Lib/test/libregrtest/refleak.py @@ -145,6 +145,12 @@ def get_pooled_int(value): # Use an internal-only keyword argument that mypy doesn't know yet _only_immortal=True) # type: ignore[call-arg] alloc_after = getallocatedblocks() - interned_immortal_after + if support.Py_GIL_DISABLED: + # Ignore any thread-local bytecode that was allocated. These will be + # released when the code object is destroyed, typically at runtime + # shutdown + import _testinternalcapi + alloc_after -= _testinternalcapi.get_tlbc_blocks() rc_after = gettotalrefcount() fd_after = fd_count() diff --git a/Modules/_testinternalcapi.c b/Modules/_testinternalcapi.c index 6170d127385152..cb8429db692c95 100644 --- a/Modules/_testinternalcapi.c +++ b/Modules/_testinternalcapi.c @@ -2006,6 +2006,33 @@ get_tlbc_id(PyObject *Py_UNUSED(module), PyObject *obj) } return PyLong_FromVoidPtr(bc); } + +static int +count_tlbc_blocks(PyObject *obj, Py_ssize_t *count) +{ + if (PyCode_Check(obj)) { + _PyCodeArray *tlbc = ((PyCodeObject *)obj)->co_tlbc; + // First entry always points to the bytecode at the end of the code + // object. Exclude it from the count as it is allocated as part of + // creating the code object. 
+ for (Py_ssize_t i = 1; i < tlbc->size; i++) { + if (tlbc->entries[i] != NULL) { + (*count)++; + } + } + } + return 1; +} + +// Return the total number of thread-local bytecode copies, excluding the +// copies that are embedded in the code object. +static PyObject * +get_tlbc_blocks(PyObject *Py_UNUSED(module), PyObject *Py_UNUSED(ignored)) +{ + Py_ssize_t count = 0; + PyUnstable_GC_VisitObjects((gcvisitobjects_t) count_tlbc_blocks, &count); + return PyLong_FromSsize_t(count); +} #endif static PyObject * @@ -2180,6 +2207,7 @@ static PyMethodDef module_functions[] = { {"py_thread_id", get_py_thread_id, METH_NOARGS}, {"get_tlbc", get_tlbc, METH_O, NULL}, {"get_tlbc_id", get_tlbc_id, METH_O, NULL}, + {"get_tlbc_blocks", get_tlbc_blocks, METH_NOARGS}, #endif {"suppress_immortalization", suppress_immortalization, METH_O}, {"get_immortalize_deferred", get_immortalize_deferred, METH_NOARGS}, From ad180d183c9f1bbe6689df21d35a4f1375574ef4 Mon Sep 17 00:00:00 2001 From: Matt Page Date: Fri, 27 Sep 2024 17:53:49 -0700 Subject: [PATCH 44/67] Regen files --- Include/internal/pycore_uop_ids.h | 72 +++++++++++++++---------------- 1 file changed, 36 insertions(+), 36 deletions(-) diff --git a/Include/internal/pycore_uop_ids.h b/Include/internal/pycore_uop_ids.h index 5a14c83b8e76d7..0f06073c32c81e 100644 --- a/Include/internal/pycore_uop_ids.h +++ b/Include/internal/pycore_uop_ids.h @@ -223,65 +223,65 @@ extern "C" { #define _LOAD_SUPER_ATTR_METHOD LOAD_SUPER_ATTR_METHOD #define _MAKE_CELL MAKE_CELL #define _MAKE_FUNCTION MAKE_FUNCTION -#define _MAKE_WARM 439 +#define _MAKE_WARM 440 #define _MAP_ADD MAP_ADD #define _MATCH_CLASS MATCH_CLASS #define _MATCH_KEYS MATCH_KEYS #define _MATCH_MAPPING MATCH_MAPPING #define _MATCH_SEQUENCE MATCH_SEQUENCE -#define _MAYBE_EXPAND_METHOD 440 -#define _MONITOR_CALL 441 -#define _MONITOR_JUMP_BACKWARD 442 -#define _MONITOR_RESUME 443 +#define _MAYBE_EXPAND_METHOD 441 +#define _MONITOR_CALL 442 +#define _MONITOR_JUMP_BACKWARD 443 +#define 
_MONITOR_RESUME 444 #define _NOP NOP #define _POP_EXCEPT POP_EXCEPT -#define _POP_JUMP_IF_FALSE 444 -#define _POP_JUMP_IF_TRUE 445 +#define _POP_JUMP_IF_FALSE 445 +#define _POP_JUMP_IF_TRUE 446 #define _POP_TOP POP_TOP -#define _POP_TOP_LOAD_CONST_INLINE_BORROW 446 +#define _POP_TOP_LOAD_CONST_INLINE_BORROW 447 #define _PUSH_EXC_INFO PUSH_EXC_INFO -#define _PUSH_FRAME 447 +#define _PUSH_FRAME 448 #define _PUSH_NULL PUSH_NULL -#define _PY_FRAME_GENERAL 448 -#define _PY_FRAME_KW 449 -#define _QUICKEN_RESUME 450 -#define _REPLACE_WITH_TRUE 451 +#define _PY_FRAME_GENERAL 449 +#define _PY_FRAME_KW 450 +#define _QUICKEN_RESUME 451 +#define _REPLACE_WITH_TRUE 452 #define _RESUME_CHECK RESUME_CHECK #define _RETURN_GENERATOR RETURN_GENERATOR #define _RETURN_VALUE RETURN_VALUE -#define _SAVE_RETURN_OFFSET 452 -#define _SEND 453 -#define _SEND_GEN_FRAME 454 +#define _SAVE_RETURN_OFFSET 453 +#define _SEND 454 +#define _SEND_GEN_FRAME 455 #define _SETUP_ANNOTATIONS SETUP_ANNOTATIONS #define _SET_ADD SET_ADD #define _SET_FUNCTION_ATTRIBUTE SET_FUNCTION_ATTRIBUTE #define _SET_UPDATE SET_UPDATE -#define _START_EXECUTOR 455 -#define _STORE_ATTR 456 -#define _STORE_ATTR_INSTANCE_VALUE 457 -#define _STORE_ATTR_SLOT 458 -#define _STORE_ATTR_WITH_HINT 459 +#define _START_EXECUTOR 456 +#define _STORE_ATTR 457 +#define _STORE_ATTR_INSTANCE_VALUE 458 +#define _STORE_ATTR_SLOT 459 +#define _STORE_ATTR_WITH_HINT 460 #define _STORE_DEREF STORE_DEREF -#define _STORE_FAST 460 -#define _STORE_FAST_0 461 -#define _STORE_FAST_1 462 -#define _STORE_FAST_2 463 -#define _STORE_FAST_3 464 -#define _STORE_FAST_4 465 -#define _STORE_FAST_5 466 -#define _STORE_FAST_6 467 -#define _STORE_FAST_7 468 +#define _STORE_FAST 461 +#define _STORE_FAST_0 462 +#define _STORE_FAST_1 463 +#define _STORE_FAST_2 464 +#define _STORE_FAST_3 465 +#define _STORE_FAST_4 466 +#define _STORE_FAST_5 467 +#define _STORE_FAST_6 468 +#define _STORE_FAST_7 469 #define _STORE_FAST_LOAD_FAST STORE_FAST_LOAD_FAST #define 
_STORE_FAST_STORE_FAST STORE_FAST_STORE_FAST #define _STORE_GLOBAL STORE_GLOBAL #define _STORE_NAME STORE_NAME -#define _STORE_SLICE 469 -#define _STORE_SUBSCR 470 +#define _STORE_SLICE 470 +#define _STORE_SUBSCR 471 #define _STORE_SUBSCR_DICT STORE_SUBSCR_DICT #define _STORE_SUBSCR_LIST_INT STORE_SUBSCR_LIST_INT #define _SWAP SWAP -#define _TIER2_RESUME_CHECK 471 -#define _TO_BOOL 472 +#define _TIER2_RESUME_CHECK 472 +#define _TO_BOOL 473 #define _TO_BOOL_BOOL TO_BOOL_BOOL #define _TO_BOOL_INT TO_BOOL_INT #define _TO_BOOL_LIST TO_BOOL_LIST @@ -291,14 +291,14 @@ extern "C" { #define _UNARY_NEGATIVE UNARY_NEGATIVE #define _UNARY_NOT UNARY_NOT #define _UNPACK_EX UNPACK_EX -#define _UNPACK_SEQUENCE 473 +#define _UNPACK_SEQUENCE 474 #define _UNPACK_SEQUENCE_LIST UNPACK_SEQUENCE_LIST #define _UNPACK_SEQUENCE_TUPLE UNPACK_SEQUENCE_TUPLE #define _UNPACK_SEQUENCE_TWO_TUPLE UNPACK_SEQUENCE_TWO_TUPLE #define _WITH_EXCEPT_START WITH_EXCEPT_START #define _YIELD_VALUE YIELD_VALUE #define __DO_CALL_FUNCTION_EX _DO_CALL_FUNCTION_EX -#define MAX_UOP_ID 473 +#define MAX_UOP_ID 474 #ifdef __cplusplus } From 95d22644c85da15a87902ce7f4e43f71354d026f Mon Sep 17 00:00:00 2001 From: Matt Page Date: Mon, 30 Sep 2024 10:36:13 -0700 Subject: [PATCH 45/67] Move `get_tlbc_blocks` into the sys module --- Lib/test/libregrtest/refleak.py | 5 ++--- Modules/_testinternalcapi.c | 28 ------------------------ Python/clinic/sysmodule.c.h | 38 ++++++++++++++++++++++++++++++++- Python/sysmodule.c | 36 +++++++++++++++++++++++++++++++ 4 files changed, 75 insertions(+), 32 deletions(-) diff --git a/Lib/test/libregrtest/refleak.py b/Lib/test/libregrtest/refleak.py index 2e495ee2a1540f..b9d0e95081ded6 100644 --- a/Lib/test/libregrtest/refleak.py +++ b/Lib/test/libregrtest/refleak.py @@ -145,12 +145,11 @@ def get_pooled_int(value): # Use an internal-only keyword argument that mypy doesn't know yet _only_immortal=True) # type: ignore[call-arg] alloc_after = getallocatedblocks() - interned_immortal_after - if 
support.Py_GIL_DISABLED: + if _get_tlbc_blocks := getattr(sys, "_get_tlbc_blocks", None): # Ignore any thread-local bytecode that was allocated. These will be # released when the code object is destroyed, typically at runtime # shutdown - import _testinternalcapi - alloc_after -= _testinternalcapi.get_tlbc_blocks() + alloc_after -= _get_tlbc_blocks() rc_after = gettotalrefcount() fd_after = fd_count() diff --git a/Modules/_testinternalcapi.c b/Modules/_testinternalcapi.c index cb8429db692c95..6170d127385152 100644 --- a/Modules/_testinternalcapi.c +++ b/Modules/_testinternalcapi.c @@ -2006,33 +2006,6 @@ get_tlbc_id(PyObject *Py_UNUSED(module), PyObject *obj) } return PyLong_FromVoidPtr(bc); } - -static int -count_tlbc_blocks(PyObject *obj, Py_ssize_t *count) -{ - if (PyCode_Check(obj)) { - _PyCodeArray *tlbc = ((PyCodeObject *)obj)->co_tlbc; - // First entry always points to the bytecode at the end of the code - // object. Exclude it from the count as it is allocated as part of - // creating the code object. - for (Py_ssize_t i = 1; i < tlbc->size; i++) { - if (tlbc->entries[i] != NULL) { - (*count)++; - } - } - } - return 1; -} - -// Return the total number of thread-local bytecode copies, excluding the -// copies that are embedded in the code object. 
-static PyObject * -get_tlbc_blocks(PyObject *Py_UNUSED(module), PyObject *Py_UNUSED(ignored)) -{ - Py_ssize_t count = 0; - PyUnstable_GC_VisitObjects((gcvisitobjects_t) count_tlbc_blocks, &count); - return PyLong_FromSsize_t(count); -} #endif static PyObject * @@ -2207,7 +2180,6 @@ static PyMethodDef module_functions[] = { {"py_thread_id", get_py_thread_id, METH_NOARGS}, {"get_tlbc", get_tlbc, METH_O, NULL}, {"get_tlbc_id", get_tlbc_id, METH_O, NULL}, - {"get_tlbc_blocks", get_tlbc_blocks, METH_NOARGS}, #endif {"suppress_immortalization", suppress_immortalization, METH_O}, {"get_immortalize_deferred", get_immortalize_deferred, METH_NOARGS}, diff --git a/Python/clinic/sysmodule.c.h b/Python/clinic/sysmodule.c.h index 8277d286cf51ef..dd205bf203a457 100644 --- a/Python/clinic/sysmodule.c.h +++ b/Python/clinic/sysmodule.c.h @@ -1571,6 +1571,38 @@ sys__is_gil_enabled(PyObject *module, PyObject *Py_UNUSED(ignored)) return return_value; } +#if defined(Py_GIL_DISABLED) + +PyDoc_STRVAR(sys__get_tlbc_blocks__doc__, +"_get_tlbc_blocks($module, /)\n" +"--\n" +"\n" +"Return the total number of thread-local bytecode copies, excluding the copies that are embedded in the code object."); + +#define SYS__GET_TLBC_BLOCKS_METHODDEF \ + {"_get_tlbc_blocks", (PyCFunction)sys__get_tlbc_blocks, METH_NOARGS, sys__get_tlbc_blocks__doc__}, + +static Py_ssize_t +sys__get_tlbc_blocks_impl(PyObject *module); + +static PyObject * +sys__get_tlbc_blocks(PyObject *module, PyObject *Py_UNUSED(ignored)) +{ + PyObject *return_value = NULL; + Py_ssize_t _return_value; + + _return_value = sys__get_tlbc_blocks_impl(module); + if ((_return_value == -1) && PyErr_Occurred()) { + goto exit; + } + return_value = PyLong_FromSsize_t(_return_value); + +exit: + return return_value; +} + +#endif /* defined(Py_GIL_DISABLED) */ + #ifndef SYS_GETWINDOWSVERSION_METHODDEF #define SYS_GETWINDOWSVERSION_METHODDEF #endif /* !defined(SYS_GETWINDOWSVERSION_METHODDEF) */ @@ -1614,4 +1646,8 @@ sys__is_gil_enabled(PyObject 
*module, PyObject *Py_UNUSED(ignored)) #ifndef SYS_GETANDROIDAPILEVEL_METHODDEF #define SYS_GETANDROIDAPILEVEL_METHODDEF #endif /* !defined(SYS_GETANDROIDAPILEVEL_METHODDEF) */ -/*[clinic end generated code: output=9cc9069aef1482bc input=a9049054013a1b77]*/ + +#ifndef SYS__GET_TLBC_BLOCKS_METHODDEF + #define SYS__GET_TLBC_BLOCKS_METHODDEF +#endif /* !defined(SYS__GET_TLBC_BLOCKS_METHODDEF) */ +/*[clinic end generated code: output=fca6c27bfc0c17ac input=a9049054013a1b77]*/ diff --git a/Python/sysmodule.c b/Python/sysmodule.c index ac343a8048e008..9fd194019544dd 100644 --- a/Python/sysmodule.c +++ b/Python/sysmodule.c @@ -2442,6 +2442,41 @@ sys__is_gil_enabled_impl(PyObject *module) #endif } +#ifdef Py_GIL_DISABLED +static int +count_tlbc_blocks(PyObject *obj, Py_ssize_t *count) +{ + if (PyCode_Check(obj)) { + _PyCodeArray *tlbc = ((PyCodeObject *)obj)->co_tlbc; + // First entry always points to the bytecode at the end of the code + // object. Exclude it from the count as it is allocated as part of + // creating the code object. + for (Py_ssize_t i = 1; i < tlbc->size; i++) { + if (tlbc->entries[i] != NULL) { + (*count)++; + } + } + } + return 1; +} + +/*[clinic input] +sys._get_tlbc_blocks -> Py_ssize_t + +Return the total number of thread-local bytecode copies, excluding the copies that are embedded in the code object. 
+[clinic start generated code]*/ + +static Py_ssize_t +sys__get_tlbc_blocks_impl(PyObject *module) +/*[clinic end generated code: output=4b4e350583cbd643 input=37c14e47d8905a95]*/ +{ + Py_ssize_t count = 0; + PyUnstable_GC_VisitObjects((gcvisitobjects_t) count_tlbc_blocks, &count); + return count; +} +#endif /* Py_GIL_DISABLED */ + + static PerfMapState perf_map_state; @@ -2617,6 +2652,7 @@ static PyMethodDef sys_methods[] = { #endif SYS__GET_CPU_COUNT_CONFIG_METHODDEF SYS__IS_GIL_ENABLED_METHODDEF + SYS__GET_TLBC_BLOCKS_METHODDEF {NULL, NULL} // sentinel }; From 2cc5830a28048165bd2710394283c31f38ef642c Mon Sep 17 00:00:00 2001 From: Matt Page Date: Thu, 10 Oct 2024 18:20:41 -0700 Subject: [PATCH 46/67] Work around `this_instr` now being const --- Python/bytecodes.c | 6 ++--- Python/executor_cases.c.h | 11 ++++++++++ Python/generated_cases.c.h | 45 +++++++++++++++++++++++++++++++------- 3 files changed, 50 insertions(+), 12 deletions(-) diff --git a/Python/bytecodes.c b/Python/bytecodes.c index 87e00aad6dc1d8..6c57340f87ae42 100644 --- a/Python/bytecodes.c +++ b/Python/bytecodes.c @@ -210,11 +210,9 @@ dummy_func( int off = this_instr - frame->bytecode; frame->bytecode = bytecode; frame->instr_ptr = frame->bytecode + off; - this_instr = frame->instr_ptr; - next_instr = frame->instr_ptr + 1; + next_instr = frame->instr_ptr; + DISPATCH(); } - #else - (void)this_instr; #endif } diff --git a/Python/executor_cases.c.h b/Python/executor_cases.c.h index 840bd5e2a37c61..4c4c9de073018e 100644 --- a/Python/executor_cases.c.h +++ b/Python/executor_cases.c.h @@ -59,6 +59,17 @@ JUMP_TO_JUMP_TARGET(); } #ifdef Py_GIL_DISABLED + // Work around a bug in the cases_generator logic that inserts code + // to save and restore the stack pointer. 
Without splitting these + // lines the cases_generator will insert code to save the stack + // pointer before the `#ifdef Py_GIL_DISABLED` and will insert code + // to clear the stack pointer immediately after the call to + // `_PyEval_GetExecutableCode` below. As a result, the stack + // pointer won't properly be cleared in default (with-gil) + // builds. By putting the declaration and assignment on separate + // lines, we cause the cases_generator to correctly insert the code + // to save and clear the stack pointer immediately before and after + // the call to _PyEval_GetExectableCode. _Py_CODEUNIT *bytecode; _PyFrame_SetStackPointer(frame, stack_pointer); bytecode = _PyEval_GetExecutableCode(_PyFrame_GetCode(frame)); diff --git a/Python/generated_cases.c.h b/Python/generated_cases.c.h index 1f2e6edea4c95e..b1f47f33da0e98 100644 --- a/Python/generated_cases.c.h +++ b/Python/generated_cases.c.h @@ -4831,6 +4831,17 @@ // _LOAD_BYTECODE { #ifdef Py_GIL_DISABLED + // Work around a bug in the cases_generator logic that inserts code + // to save and restore the stack pointer. Without splitting these + // lines the cases_generator will insert code to save the stack + // pointer before the `#ifdef Py_GIL_DISABLED` and will insert code + // to clear the stack pointer immediately after the call to + // `_PyEval_GetExecutableCode` below. As a result, the stack + // pointer won't properly be cleared in default (with-gil) + // builds. By putting the declaration and assignment on separate + // lines, we cause the cases_generator to correctly insert the code + // to save and clear the stack pointer immediately before and after + // the call to _PyEval_GetExectableCode. 
_Py_CODEUNIT *bytecode; _PyFrame_SetStackPointer(frame, stack_pointer); bytecode = _PyEval_GetExecutableCode(_PyFrame_GetCode(frame)); @@ -4840,11 +4851,9 @@ int off = this_instr - frame->bytecode; frame->bytecode = bytecode; frame->instr_ptr = frame->bytecode + off; - this_instr = frame->instr_ptr; - next_instr = frame->instr_ptr + 1; + next_instr = frame->instr_ptr; + DISPATCH(); } - #else - (void)this_instr; #endif } // _MAYBE_INSTRUMENT @@ -6906,6 +6915,17 @@ // _LOAD_BYTECODE { #ifdef Py_GIL_DISABLED + // Work around a bug in the cases_generator logic that inserts code + // to save and restore the stack pointer. Without splitting these + // lines the cases_generator will insert code to save the stack + // pointer before the `#ifdef Py_GIL_DISABLED` and will insert code + // to clear the stack pointer immediately after the call to + // `_PyEval_GetExecutableCode` below. As a result, the stack + // pointer won't properly be cleared in default (with-gil) + // builds. By putting the declaration and assignment on separate + // lines, we cause the cases_generator to correctly insert the code + // to save and clear the stack pointer immediately before and after + // the call to _PyEval_GetExectableCode. _Py_CODEUNIT *bytecode; _PyFrame_SetStackPointer(frame, stack_pointer); bytecode = _PyEval_GetExecutableCode(_PyFrame_GetCode(frame)); @@ -6915,11 +6935,9 @@ int off = this_instr - frame->bytecode; frame->bytecode = bytecode; frame->instr_ptr = frame->bytecode + off; - this_instr = frame->instr_ptr; - next_instr = frame->instr_ptr + 1; + next_instr = frame->instr_ptr; + DISPATCH(); } - #else - (void)this_instr; #endif } // _MAYBE_INSTRUMENT @@ -6977,6 +6995,17 @@ assert((version & _PY_EVAL_EVENTS_MASK) == 0); DEOPT_IF(eval_breaker != version, RESUME); #ifdef Py_GIL_DISABLED + // Work around a bug in the cases_generator logic that inserts code + // to save and restore the stack pointer. 
Without splitting these + // lines the cases_generator will insert code to save the stack + // pointer before the `#ifdef Py_GIL_DISABLED` and will insert code + // to clear the stack pointer immediately after the call to + // `_PyEval_GetExecutableCode` below. As a result, the stack + // pointer won't properly be cleared in default (with-gil) + // builds. By putting the declaration and assignment on separate + // lines, we cause the cases_generator to correctly insert the code + // to save and clear the stack pointer immediately before and after + // the call to _PyEval_GetExectableCode. _Py_CODEUNIT *bytecode; _PyFrame_SetStackPointer(frame, stack_pointer); bytecode = _PyEval_GetExecutableCode(_PyFrame_GetCode(frame)); From 96ec1262c71007055d89cfc0fa33fbeaee839400 Mon Sep 17 00:00:00 2001 From: Matt Page Date: Thu, 10 Oct 2024 20:16:10 -0700 Subject: [PATCH 47/67] Make RESUME_CHECK cheaper --- Include/internal/pycore_opcode_metadata.h | 2 +- Include/internal/pycore_uop_metadata.h | 2 +- Python/bytecodes.c | 10 +++++----- Python/executor_cases.c.h | 11 +++++++---- Python/generated_cases.c.h | 12 ++++++------ 5 files changed, 20 insertions(+), 17 deletions(-) diff --git a/Include/internal/pycore_opcode_metadata.h b/Include/internal/pycore_opcode_metadata.h index 42202ee5515418..1a390d84b8d042 100644 --- a/Include/internal/pycore_opcode_metadata.h +++ b/Include/internal/pycore_opcode_metadata.h @@ -1186,7 +1186,7 @@ const struct opcode_metadata _PyOpcode_opcode_metadata[266] = { [RERAISE] = { true, INSTR_FMT_IB, HAS_ARG_FLAG | HAS_ERROR_FLAG | HAS_ERROR_NO_POP_FLAG | HAS_ESCAPES_FLAG }, [RESERVED] = { true, INSTR_FMT_IX, 0 }, [RESUME] = { true, INSTR_FMT_IB, HAS_ARG_FLAG | HAS_EVAL_BREAK_FLAG | HAS_ERROR_FLAG | HAS_ERROR_NO_POP_FLAG | HAS_ESCAPES_FLAG }, - [RESUME_CHECK] = { true, INSTR_FMT_IX, HAS_DEOPT_FLAG | HAS_ERROR_FLAG | HAS_ESCAPES_FLAG }, + [RESUME_CHECK] = { true, INSTR_FMT_IX, HAS_DEOPT_FLAG | HAS_ESCAPES_FLAG }, [RETURN_CONST] = { true, INSTR_FMT_IB, 
HAS_ARG_FLAG | HAS_CONST_FLAG }, [RETURN_GENERATOR] = { true, INSTR_FMT_IX, HAS_ERROR_FLAG | HAS_ESCAPES_FLAG }, [RETURN_VALUE] = { true, INSTR_FMT_IX, 0 }, diff --git a/Include/internal/pycore_uop_metadata.h b/Include/internal/pycore_uop_metadata.h index 94d7e199bcff74..749fd62c015dd3 100644 --- a/Include/internal/pycore_uop_metadata.h +++ b/Include/internal/pycore_uop_metadata.h @@ -22,7 +22,7 @@ const uint16_t _PyUop_Flags[MAX_UOP_ID+1] = { [_NOP] = HAS_PURE_FLAG, [_CHECK_PERIODIC] = HAS_EVAL_BREAK_FLAG | HAS_ERROR_FLAG | HAS_ESCAPES_FLAG, [_CHECK_PERIODIC_IF_NOT_YIELD_FROM] = HAS_ARG_FLAG | HAS_EVAL_BREAK_FLAG | HAS_ERROR_FLAG | HAS_ESCAPES_FLAG, - [_RESUME_CHECK] = HAS_DEOPT_FLAG | HAS_ERROR_FLAG | HAS_ESCAPES_FLAG, + [_RESUME_CHECK] = HAS_DEOPT_FLAG | HAS_ESCAPES_FLAG, [_LOAD_FAST_CHECK] = HAS_ARG_FLAG | HAS_LOCAL_FLAG | HAS_ERROR_FLAG | HAS_ESCAPES_FLAG, [_LOAD_FAST_0] = HAS_LOCAL_FLAG | HAS_PURE_FLAG, [_LOAD_FAST_1] = HAS_LOCAL_FLAG | HAS_PURE_FLAG, diff --git a/Python/bytecodes.c b/Python/bytecodes.c index 6c57340f87ae42..729e7a4a6762a5 100644 --- a/Python/bytecodes.c +++ b/Python/bytecodes.c @@ -202,7 +202,7 @@ dummy_func( // builds. By putting the declaration and assignment on separate // lines, we cause the cases_generator to correctly insert the code // to save and clear the stack pointer immediately before and after - // the call to _PyEval_GetExectableCode. + // the call to `_PyEval_GetExecutableCode`. _Py_CODEUNIT *bytecode; bytecode = _PyEval_GetExecutableCode(_PyFrame_GetCode(frame)); ERROR_IF(bytecode == NULL, error); @@ -237,15 +237,15 @@ dummy_func( // lines the cases_generator will insert code to save the stack // pointer before the `#ifdef Py_GIL_DISABLED` and will insert code // to clear the stack pointer immediately after the call to - // `_PyEval_GetExecutableCode` below. As a result, the stack + // `_PyCode_GetTLBCFast` below. As a result, the stack // pointer won't properly be cleared in default (with-gil) // builds. 
By putting the declaration and assignment on separate // lines, we cause the cases_generator to correctly insert the code // to save and clear the stack pointer immediately before and after - // the call to _PyEval_GetExectableCode. + // the call to `_PyCode_GetTLBCFast`. _Py_CODEUNIT *bytecode; - bytecode = _PyEval_GetExecutableCode(_PyFrame_GetCode(frame)); - ERROR_IF(bytecode == NULL, error); + bytecode = _PyCode_GetTLBCFast(_PyFrame_GetCode(frame)); + DEOPT_IF(bytecode == NULL); if (frame->bytecode != bytecode) { /* Avoid using this_instr here so that _RESUME_CHECK can be included in traces. diff --git a/Python/executor_cases.c.h b/Python/executor_cases.c.h index 4c4c9de073018e..31a6e9b177229a 100644 --- a/Python/executor_cases.c.h +++ b/Python/executor_cases.c.h @@ -64,17 +64,20 @@ // lines the cases_generator will insert code to save the stack // pointer before the `#ifdef Py_GIL_DISABLED` and will insert code // to clear the stack pointer immediately after the call to - // `_PyEval_GetExecutableCode` below. As a result, the stack + // `_PyCode_GetTLBCFast` below. As a result, the stack // pointer won't properly be cleared in default (with-gil) // builds. By putting the declaration and assignment on separate // lines, we cause the cases_generator to correctly insert the code // to save and clear the stack pointer immediately before and after - // the call to _PyEval_GetExectableCode. + // the call to `_PyCode_GetTLBCFast`. _Py_CODEUNIT *bytecode; _PyFrame_SetStackPointer(frame, stack_pointer); - bytecode = _PyEval_GetExecutableCode(_PyFrame_GetCode(frame)); + bytecode = _PyCode_GetTLBCFast(_PyFrame_GetCode(frame)); stack_pointer = _PyFrame_GetStackPointer(frame); - if (bytecode == NULL) JUMP_TO_ERROR(); + if (bytecode == NULL) { + UOP_STAT_INC(uopcode, miss); + JUMP_TO_JUMP_TARGET(); + } if (frame->bytecode != bytecode) { /* Avoid using this_instr here so that _RESUME_CHECK can be included in traces. 
diff --git a/Python/generated_cases.c.h b/Python/generated_cases.c.h index b1f47f33da0e98..fe8e1f504f9324 100644 --- a/Python/generated_cases.c.h +++ b/Python/generated_cases.c.h @@ -4841,7 +4841,7 @@ // builds. By putting the declaration and assignment on separate // lines, we cause the cases_generator to correctly insert the code // to save and clear the stack pointer immediately before and after - // the call to _PyEval_GetExectableCode. + // the call to `_PyEval_GetExecutableCode`. _Py_CODEUNIT *bytecode; _PyFrame_SetStackPointer(frame, stack_pointer); bytecode = _PyEval_GetExecutableCode(_PyFrame_GetCode(frame)); @@ -6925,7 +6925,7 @@ // builds. By putting the declaration and assignment on separate // lines, we cause the cases_generator to correctly insert the code // to save and clear the stack pointer immediately before and after - // the call to _PyEval_GetExectableCode. + // the call to `_PyEval_GetExecutableCode`. _Py_CODEUNIT *bytecode; _PyFrame_SetStackPointer(frame, stack_pointer); bytecode = _PyEval_GetExecutableCode(_PyFrame_GetCode(frame)); @@ -7000,17 +7000,17 @@ // lines the cases_generator will insert code to save the stack // pointer before the `#ifdef Py_GIL_DISABLED` and will insert code // to clear the stack pointer immediately after the call to - // `_PyEval_GetExecutableCode` below. As a result, the stack + // `_PyCode_GetTLBCFast` below. As a result, the stack // pointer won't properly be cleared in default (with-gil) // builds. By putting the declaration and assignment on separate // lines, we cause the cases_generator to correctly insert the code // to save and clear the stack pointer immediately before and after - // the call to _PyEval_GetExectableCode. + // the call to `_PyCode_GetTLBCFast`. 
_Py_CODEUNIT *bytecode; _PyFrame_SetStackPointer(frame, stack_pointer); - bytecode = _PyEval_GetExecutableCode(_PyFrame_GetCode(frame)); + bytecode = _PyCode_GetTLBCFast(_PyFrame_GetCode(frame)); stack_pointer = _PyFrame_GetStackPointer(frame); - if (bytecode == NULL) goto error; + DEOPT_IF(bytecode == NULL, RESUME); if (frame->bytecode != bytecode) { /* Avoid using this_instr here so that _RESUME_CHECK can be included in traces. From 5ecebd9f849432d17b08e6c8910517f9bfaa661a Mon Sep 17 00:00:00 2001 From: Matt Page Date: Thu, 10 Oct 2024 22:19:26 -0700 Subject: [PATCH 48/67] Pass tstate to _PyCode_GetTLBCFast --- Include/internal/pycore_ceval.h | 4 ++-- Include/internal/pycore_code.h | 5 ++--- Modules/_testinternalcapi.c | 4 ++-- Python/bytecodes.c | 4 ++-- Python/ceval.c | 2 +- Python/executor_cases.c.h | 2 +- Python/generated_cases.c.h | 6 +++--- 7 files changed, 13 insertions(+), 14 deletions(-) diff --git a/Include/internal/pycore_ceval.h b/Include/internal/pycore_ceval.h index 42bf918234a5e3..94a642baad8539 100644 --- a/Include/internal/pycore_ceval.h +++ b/Include/internal/pycore_ceval.h @@ -180,9 +180,9 @@ extern int _PyEval_DisableGIL(PyThreadState *state); static inline _Py_CODEUNIT * -_PyEval_GetExecutableCode(PyCodeObject *co) +_PyEval_GetExecutableCode(PyThreadState *tstate, PyCodeObject *co) { - _Py_CODEUNIT *bc = _PyCode_GetTLBCFast(co); + _Py_CODEUNIT *bc = _PyCode_GetTLBCFast(tstate, co); if (bc != NULL) { return bc; } diff --git a/Include/internal/pycore_code.h b/Include/internal/pycore_code.h index 5514991d29fa5d..6f8899ddd2425e 100644 --- a/Include/internal/pycore_code.h +++ b/Include/internal/pycore_code.h @@ -636,11 +636,10 @@ PyAPI_DATA(const struct _PyCode8) _Py_InitCleanup; // Return a pointer to the thread-local bytecode for the current thread, if it // exists. 
static inline _Py_CODEUNIT * -_PyCode_GetTLBCFast(PyCodeObject *co) +_PyCode_GetTLBCFast(PyThreadState *tstate, PyCodeObject *co) { _PyCodeArray *code = _Py_atomic_load_ptr_acquire(&co->co_tlbc); - _PyThreadStateImpl *tstate = (_PyThreadStateImpl *) PyThreadState_GET(); - Py_ssize_t idx = tstate->tlbc_index; + Py_ssize_t idx = ((_PyThreadStateImpl*) tstate)->tlbc_index; if (idx < code->size && code->entries[idx] != NULL) { return (_Py_CODEUNIT *) code->entries[idx]; } diff --git a/Modules/_testinternalcapi.c b/Modules/_testinternalcapi.c index 6170d127385152..60218915b2387e 100644 --- a/Modules/_testinternalcapi.c +++ b/Modules/_testinternalcapi.c @@ -1986,7 +1986,7 @@ get_tlbc(PyObject *Py_UNUSED(module), PyObject *obj) if (code == NULL) { return NULL; } - _Py_CODEUNIT *bc = _PyCode_GetTLBCFast(code); + _Py_CODEUNIT *bc = _PyCode_GetTLBCFast(PyThreadState_GET(), code); if (bc == NULL) { Py_RETURN_NONE; } @@ -2000,7 +2000,7 @@ get_tlbc_id(PyObject *Py_UNUSED(module), PyObject *obj) if (code == NULL) { return NULL; } - _Py_CODEUNIT *bc = _PyCode_GetTLBCFast(code); + _Py_CODEUNIT *bc = _PyCode_GetTLBCFast(PyThreadState_GET(), code); if (bc == NULL) { Py_RETURN_NONE; } diff --git a/Python/bytecodes.c b/Python/bytecodes.c index 729e7a4a6762a5..0cd877f63fa207 100644 --- a/Python/bytecodes.c +++ b/Python/bytecodes.c @@ -204,7 +204,7 @@ dummy_func( // to save and clear the stack pointer immediately before and after // the call to `_PyEval_GetExecutableCode`. _Py_CODEUNIT *bytecode; - bytecode = _PyEval_GetExecutableCode(_PyFrame_GetCode(frame)); + bytecode = _PyEval_GetExecutableCode(tstate, _PyFrame_GetCode(frame)); ERROR_IF(bytecode == NULL, error); if (frame->bytecode != bytecode) { int off = this_instr - frame->bytecode; @@ -244,7 +244,7 @@ dummy_func( // to save and clear the stack pointer immediately before and after // the call to `_PyCode_GetTLBCFast`. 
_Py_CODEUNIT *bytecode; - bytecode = _PyCode_GetTLBCFast(_PyFrame_GetCode(frame)); + bytecode = _PyCode_GetTLBCFast(tstate, _PyFrame_GetCode(frame)); DEOPT_IF(bytecode == NULL); if (frame->bytecode != bytecode) { /* Avoid using this_instr here so that _RESUME_CHECK can be included diff --git a/Python/ceval.c b/Python/ceval.c index b5ca27dc27d4d6..58342e77042f9b 100644 --- a/Python/ceval.c +++ b/Python/ceval.c @@ -820,7 +820,7 @@ _PyEval_EvalFrameDefault(PyThreadState *tstate, _PyInterpreterFrame *frame, int #ifdef Py_GIL_DISABLED /* Load thread-local bytecode */ _Py_CODEUNIT *bytecode = - _PyEval_GetExecutableCode(_PyFrame_GetCode(frame)); + _PyEval_GetExecutableCode(tstate, _PyFrame_GetCode(frame)); if (bytecode == NULL) { goto error; } diff --git a/Python/executor_cases.c.h b/Python/executor_cases.c.h index 31a6e9b177229a..d790de529b74b3 100644 --- a/Python/executor_cases.c.h +++ b/Python/executor_cases.c.h @@ -72,7 +72,7 @@ // the call to `_PyCode_GetTLBCFast`. _Py_CODEUNIT *bytecode; _PyFrame_SetStackPointer(frame, stack_pointer); - bytecode = _PyCode_GetTLBCFast(_PyFrame_GetCode(frame)); + bytecode = _PyCode_GetTLBCFast(tstate, _PyFrame_GetCode(frame)); stack_pointer = _PyFrame_GetStackPointer(frame); if (bytecode == NULL) { UOP_STAT_INC(uopcode, miss); diff --git a/Python/generated_cases.c.h b/Python/generated_cases.c.h index fe8e1f504f9324..613550d398e593 100644 --- a/Python/generated_cases.c.h +++ b/Python/generated_cases.c.h @@ -4844,7 +4844,7 @@ // the call to `_PyEval_GetExecutableCode`. _Py_CODEUNIT *bytecode; _PyFrame_SetStackPointer(frame, stack_pointer); - bytecode = _PyEval_GetExecutableCode(_PyFrame_GetCode(frame)); + bytecode = _PyEval_GetExecutableCode(tstate, _PyFrame_GetCode(frame)); stack_pointer = _PyFrame_GetStackPointer(frame); if (bytecode == NULL) goto error; if (frame->bytecode != bytecode) { @@ -6928,7 +6928,7 @@ // the call to `_PyEval_GetExecutableCode`. 
_Py_CODEUNIT *bytecode; _PyFrame_SetStackPointer(frame, stack_pointer); - bytecode = _PyEval_GetExecutableCode(_PyFrame_GetCode(frame)); + bytecode = _PyEval_GetExecutableCode(tstate, _PyFrame_GetCode(frame)); stack_pointer = _PyFrame_GetStackPointer(frame); if (bytecode == NULL) goto error; if (frame->bytecode != bytecode) { @@ -7008,7 +7008,7 @@ // the call to `_PyCode_GetTLBCFast`. _Py_CODEUNIT *bytecode; _PyFrame_SetStackPointer(frame, stack_pointer); - bytecode = _PyCode_GetTLBCFast(_PyFrame_GetCode(frame)); + bytecode = _PyCode_GetTLBCFast(tstate, _PyFrame_GetCode(frame)); stack_pointer = _PyFrame_GetStackPointer(frame); DEOPT_IF(bytecode == NULL, RESUME); if (frame->bytecode != bytecode) { From 815b2fed5bdab3acb463092bf574da4bb381712e Mon Sep 17 00:00:00 2001 From: Matt Page Date: Thu, 10 Oct 2024 22:20:28 -0700 Subject: [PATCH 49/67] Rename test_tlbc.py to test_thread_local_bytecode.py --- Lib/test/{test_tlbc.py => test_thread_local_bytecode.py} | 0 1 file changed, 0 insertions(+), 0 deletions(-) rename Lib/test/{test_tlbc.py => test_thread_local_bytecode.py} (100%) diff --git a/Lib/test/test_tlbc.py b/Lib/test/test_thread_local_bytecode.py similarity index 100% rename from Lib/test/test_tlbc.py rename to Lib/test/test_thread_local_bytecode.py From fb90d2376cc7a037208a8bd04af164d6fca1b347 Mon Sep 17 00:00:00 2001 From: Matt Page Date: Fri, 11 Oct 2024 00:06:49 -0700 Subject: [PATCH 50/67] Remove per-family defines for specialization --- Include/internal/pycore_code.h | 38 +++------------ Lib/test/support/__init__.py | 15 ++---- Lib/test/test_dis.py | 18 +++---- Lib/test/test_monitoring.py | 4 +- Lib/test/test_opcache.py | 25 ++-------- Lib/test/test_thread_local_bytecode.py | 6 +-- Lib/test/test_type_cache.py | 9 +--- Modules/_opcode.c | 31 +++--------- Python/bytecodes.c | 66 +++++++++++++------------- Python/ceval.c | 2 + Python/ceval_macros.h | 2 +- Python/generated_cases.c.h | 66 +++++++++++++------------- Python/specialize.c | 32 ++++++------- 13 files 
changed, 121 insertions(+), 193 deletions(-) diff --git a/Include/internal/pycore_code.h b/Include/internal/pycore_code.h index 6f8899ddd2425e..033baeb915b962 100644 --- a/Include/internal/pycore_code.h +++ b/Include/internal/pycore_code.h @@ -314,41 +314,17 @@ extern int _PyLineTable_PreviousAddressRange(PyCodeAddressRange *range); /** API for executors */ extern void _PyCode_Clear_Executors(PyCodeObject *code); -#define ENABLE_SPECIALIZATION 1 #ifdef Py_GIL_DISABLED // gh-115999 tracks progress on addressing this. -#define ENABLE_SPECIALIZED_BINARY_OP ENABLE_SPECIALIZATION -#define ENABLE_SPECIALIZED_BINARY_SUBSCR 0 && ENABLE_SPECIALIZATION -#define ENABLE_SPECIALIZED_CALL 0 && ENABLE_SPECIALIZATION -#define ENABLE_SPECIALIZED_CALL_KW 0 && ENABLE_SPECIALIZATION -#define ENABLE_SPECIALIZED_COMPARE_OP 0 && ENABLE_SPECIALIZATION -#define ENABLE_SPECIALIZED_CONTAINS_OP 0 && ENABLE_SPECIALIZATION -#define ENABLE_SPECIALIZED_FOR_ITER 0 && ENABLE_SPECIALIZATION -#define ENABLE_SPECIALIZED_LOAD_ATTR 0 && ENABLE_SPECIALIZATION -#define ENABLE_SPECIALIZED_LOAD_GLOBAL 0 && ENABLE_SPECIALIZATION -#define ENABLE_SPECIALIZED_LOAD_SUPER_ATTR 0 && ENABLE_SPECIALIZATION -#define ENABLE_SPECIALIZED_SEND 0 && ENABLE_SPECIALIZATION -#define ENABLE_SPECIALIZED_STORE_ATTR 0 && ENABLE_SPECIALIZATION -#define ENABLE_SPECIALIZED_STORE_SUBSCR 0 && ENABLE_SPECIALIZATION -#define ENABLE_SPECIALIZED_TO_BOOL 0 && ENABLE_SPECIALIZATION -#define ENABLE_SPECIALIZED_UNPACK_SEQUENCE 0 && ENABLE_SPECIALIZATION +#define ENABLE_SPECIALIZATION 0 +// Use this to enable specialization families once they are thread-safe. All +// uses will be replaced with ENABLE_SPECIALIZATION once all families are +// thread-safe. 
+#define ENABLE_SPECIALIZATION_FT 1 #else -#define ENABLE_SPECIALIZED_BINARY_OP ENABLE_SPECIALIZATION -#define ENABLE_SPECIALIZED_BINARY_SUBSCR ENABLE_SPECIALIZATION -#define ENABLE_SPECIALIZED_CALL ENABLE_SPECIALIZATION -#define ENABLE_SPECIALIZED_CALL_KW ENABLE_SPECIALIZATION -#define ENABLE_SPECIALIZED_COMPARE_OP ENABLE_SPECIALIZATION -#define ENABLE_SPECIALIZED_CONTAINS_OP ENABLE_SPECIALIZATION -#define ENABLE_SPECIALIZED_FOR_ITER ENABLE_SPECIALIZATION -#define ENABLE_SPECIALIZED_LOAD_ATTR ENABLE_SPECIALIZATION -#define ENABLE_SPECIALIZED_LOAD_GLOBAL ENABLE_SPECIALIZATION -#define ENABLE_SPECIALIZED_LOAD_SUPER_ATTR ENABLE_SPECIALIZATION -#define ENABLE_SPECIALIZED_SEND ENABLE_SPECIALIZATION -#define ENABLE_SPECIALIZED_STORE_ATTR ENABLE_SPECIALIZATION -#define ENABLE_SPECIALIZED_STORE_SUBSCR ENABLE_SPECIALIZATION -#define ENABLE_SPECIALIZED_TO_BOOL ENABLE_SPECIALIZATION -#define ENABLE_SPECIALIZED_UNPACK_SEQUENCE ENABLE_SPECIALIZATION +#define ENABLE_SPECIALIZATION 1 +#define ENABLE_SPECIALIZATION_FT ENABLE_SPECIALIZATION #endif /* Specialization functions */ diff --git a/Lib/test/support/__init__.py b/Lib/test/support/__init__.py index 7f9446f5132e26..6587558be39b1b 100644 --- a/Lib/test/support/__init__.py +++ b/Lib/test/support/__init__.py @@ -1302,18 +1302,9 @@ def requires_specialization(test): _opcode.ENABLE_SPECIALIZATION, "requires specialization")(test) -def requires_specialization_of(*ops): - missing_ops = [] - is_enabled = True - for op in ops: - is_op_specialized = getattr(_opcode, f"ENABLE_SPECIALIZED_{op}") - if not is_op_specialized: - missing_ops.append(op) - is_enabled = is_enabled and is_op_specialized - reason = f"requires specialized {', '.join(missing_ops)}" - def f(test): - return unittest.skipUnless(is_enabled, reason)(test) - return f +def requires_specialization_ft(test): + return unittest.skipUnless( + _opcode.ENABLE_SPECIALIZATION_FT, "requires specialization")(test) 
#======================================================================= diff --git a/Lib/test/test_dis.py b/Lib/test/test_dis.py index c59b83a0e742d5..ab69ad38ac2404 100644 --- a/Lib/test/test_dis.py +++ b/Lib/test/test_dis.py @@ -10,8 +10,8 @@ import types import unittest from test.support import (captured_stdout, requires_debug_ranges, - requires_specialization, - requires_specialization_of, cpython_only) + requires_specialization, requires_specialization_ft, + cpython_only) from test.support.bytecode_helper import BytecodeTestCase import opcode @@ -1231,8 +1231,8 @@ def test_super_instructions(self): self.do_disassembly_compare(got, dis_load_test_quickened_code) @cpython_only - @requires_specialization_of("BINARY_OP") - def test_binary_op_specialize(self): + @requires_specialization_ft + def test_binary_specialize(self): binary_op_quicken = """\ 0 RESUME_CHECK 0 @@ -1252,7 +1252,7 @@ def test_binary_op_specialize(self): self.do_disassembly_compare(got, binary_op_quicken % "BINARY_OP_ADD_UNICODE 0 (+)") @cpython_only - @requires_specialization_of("BINARY_SUBSCR") + @requires_specialization def test_binary_subscr_specialize(self): binary_subscr_quicken = """\ 0 RESUME_CHECK 0 @@ -1273,7 +1273,7 @@ def test_binary_subscr_specialize(self): self.do_disassembly_compare(got, binary_subscr_quicken % "BINARY_SUBSCR_DICT") @cpython_only - @requires_specialization_of("LOAD_ATTR") + @requires_specialization def test_load_attr_specialize(self): load_attr_quicken = """\ 0 RESUME_CHECK 0 @@ -1288,7 +1288,7 @@ def test_load_attr_specialize(self): self.do_disassembly_compare(got, load_attr_quicken) @cpython_only - @requires_specialization_of("CALL") + @requires_specialization def test_call_specialize(self): call_quicken = """\ 0 RESUME_CHECK 0 @@ -1305,7 +1305,7 @@ def test_call_specialize(self): self.do_disassembly_compare(got, call_quicken) @cpython_only - @requires_specialization_of("FOR_ITER", "LOAD_GLOBAL") + @requires_specialization def test_loop_quicken(self): # Loop can 
trigger a quicken where the loop is located self.code_quicken(loop_test, 4) @@ -1314,7 +1314,7 @@ def test_loop_quicken(self): self.do_disassembly_compare(got, expected) @cpython_only - @requires_specialization_of("COMPARE_OP", "FOR_ITER") + @requires_specialization def test_loop_with_conditional_at_end_is_quickened(self): def for_loop_true(x): for i in range(10): diff --git a/Lib/test/test_monitoring.py b/Lib/test/test_monitoring.py index 8096d1e0b6be58..1b06816214e7d6 100644 --- a/Lib/test/test_monitoring.py +++ b/Lib/test/test_monitoring.py @@ -11,7 +11,7 @@ import unittest import test.support -from test.support import requires_specialization_of, script_helper +from test.support import requires_specialization, script_helper from test.support.import_helper import import_module _testcapi = test.support.import_helper.import_module("_testcapi") @@ -1045,7 +1045,7 @@ def func(): ) self.assertEqual(events[0], ("throw", IndexError)) - @requires_specialization_of("CALL") + @requires_specialization def test_no_unwind_for_shim_frame(self): class B: diff --git a/Lib/test/test_opcache.py b/Lib/test/test_opcache.py index 6674ab9d0b5d96..acf8158b0d0ea1 100644 --- a/Lib/test/test_opcache.py +++ b/Lib/test/test_opcache.py @@ -4,7 +4,7 @@ import threading import types import unittest -from test.support import threading_helper, check_impl_detail, requires_specialization_of +from test.support import threading_helper, check_impl_detail, requires_specialization from test.support.import_helper import import_module # Skip this module on other interpreters, it is cpython specific: @@ -515,7 +515,7 @@ def f(x, y): f() @disabling_optimizer - @requires_specialization_of("CALL") + @requires_specialization def test_assign_init_code(self): class MyClass: def __init__(self): @@ -539,6 +539,7 @@ def count_args(self, *args): @threading_helper.requires_working_threading() +@requires_specialization class TestRacesDoNotCrash(TestBase): # Careful with these. 
Bigger numbers have a higher chance of catching bugs, # but you can also burn through a *ton* of type/dict/function versions: @@ -580,7 +581,6 @@ def assert_races_do_not_crash( for writer in writers: writer.join() - @requires_specialization_of("BINARY_SUBSCR") def test_binary_subscr_getitem(self): def get_items(): class C: @@ -610,7 +610,6 @@ def write(items): opname = "BINARY_SUBSCR_GETITEM" self.assert_races_do_not_crash(opname, get_items, read, write) - @requires_specialization_of("BINARY_SUBSCR") def test_binary_subscr_list_int(self): def get_items(): items = [] @@ -634,7 +633,6 @@ def write(items): opname = "BINARY_SUBSCR_LIST_INT" self.assert_races_do_not_crash(opname, get_items, read, write) - @requires_specialization_of("FOR_ITER") def test_for_iter_gen(self): def get_items(): def g(): @@ -666,7 +664,6 @@ def write(items): opname = "FOR_ITER_GEN" self.assert_races_do_not_crash(opname, get_items, read, write) - @requires_specialization_of("FOR_ITER") def test_for_iter_list(self): def get_items(): items = [] @@ -688,7 +685,6 @@ def write(items): opname = "FOR_ITER_LIST" self.assert_races_do_not_crash(opname, get_items, read, write) - @requires_specialization_of("LOAD_ATTR") def test_load_attr_class(self): def get_items(): class C: @@ -718,7 +714,6 @@ def write(items): opname = "LOAD_ATTR_CLASS" self.assert_races_do_not_crash(opname, get_items, read, write) - @requires_specialization_of("LOAD_ATTR") def test_load_attr_getattribute_overridden(self): def get_items(): class C: @@ -748,7 +743,6 @@ def write(items): opname = "LOAD_ATTR_GETATTRIBUTE_OVERRIDDEN" self.assert_races_do_not_crash(opname, get_items, read, write) - @requires_specialization_of("LOAD_ATTR") def test_load_attr_instance_value(self): def get_items(): class C: @@ -772,7 +766,6 @@ def write(items): opname = "LOAD_ATTR_INSTANCE_VALUE" self.assert_races_do_not_crash(opname, get_items, read, write) - @requires_specialization_of("LOAD_ATTR") def test_load_attr_method_lazy_dict(self): def get_items(): 
class C(Exception): @@ -802,7 +795,6 @@ def write(items): opname = "LOAD_ATTR_METHOD_LAZY_DICT" self.assert_races_do_not_crash(opname, get_items, read, write) - @requires_specialization_of("LOAD_ATTR") def test_load_attr_method_no_dict(self): def get_items(): class C: @@ -833,7 +825,6 @@ def write(items): opname = "LOAD_ATTR_METHOD_NO_DICT" self.assert_races_do_not_crash(opname, get_items, read, write) - @requires_specialization_of("LOAD_ATTR") def test_load_attr_method_with_values(self): def get_items(): class C: @@ -863,7 +854,6 @@ def write(items): opname = "LOAD_ATTR_METHOD_WITH_VALUES" self.assert_races_do_not_crash(opname, get_items, read, write) - @requires_specialization_of("LOAD_ATTR") def test_load_attr_module(self): def get_items(): items = [] @@ -888,7 +878,6 @@ def write(items): opname = "LOAD_ATTR_MODULE" self.assert_races_do_not_crash(opname, get_items, read, write) - @requires_specialization_of("LOAD_ATTR") def test_load_attr_property(self): def get_items(): class C: @@ -918,7 +907,6 @@ def write(items): opname = "LOAD_ATTR_PROPERTY" self.assert_races_do_not_crash(opname, get_items, read, write) - @requires_specialization_of("LOAD_ATTR") def test_load_attr_with_hint(self): def get_items(): class C: @@ -945,7 +933,6 @@ def write(items): opname = "LOAD_ATTR_WITH_HINT" self.assert_races_do_not_crash(opname, get_items, read, write) - @requires_specialization_of("LOAD_GLOBAL") def test_load_global_module(self): def get_items(): items = [] @@ -967,7 +954,6 @@ def write(items): opname, get_items, read, write, check_items=True ) - @requires_specialization_of("STORE_ATTR") def test_store_attr_instance_value(self): def get_items(): class C: @@ -990,7 +976,6 @@ def write(items): opname = "STORE_ATTR_INSTANCE_VALUE" self.assert_races_do_not_crash(opname, get_items, read, write) - @requires_specialization_of("STORE_ATTR") def test_store_attr_with_hint(self): def get_items(): class C: @@ -1016,7 +1001,6 @@ def write(items): opname = "STORE_ATTR_WITH_HINT" 
self.assert_races_do_not_crash(opname, get_items, read, write) - @requires_specialization_of("STORE_SUBSCR") def test_store_subscr_list_int(self): def get_items(): items = [] @@ -1040,7 +1024,6 @@ def write(items): opname = "STORE_SUBSCR_LIST_INT" self.assert_races_do_not_crash(opname, get_items, read, write) - @requires_specialization_of("UNPACK_SEQUENCE") def test_unpack_sequence_list(self): def get_items(): items = [] @@ -1067,7 +1050,7 @@ def write(items): class C: pass -@requires_specialization_of("LOAD_ATTR") +@requires_specialization class TestInstanceDict(unittest.TestCase): def setUp(self): diff --git a/Lib/test/test_thread_local_bytecode.py b/Lib/test/test_thread_local_bytecode.py index 16a98faa6783d7..fd4e5ffbe467db 100644 --- a/Lib/test/test_thread_local_bytecode.py +++ b/Lib/test/test_thread_local_bytecode.py @@ -4,7 +4,7 @@ import unittest from test import support -from test.support import cpython_only, import_helper, requires_specialization_of +from test.support import cpython_only, import_helper, requires_specialization_ft from test.support.script_helper import assert_python_ok from test.support.threading_helper import requires_working_threading @@ -16,7 +16,7 @@ @requires_working_threading() @unittest.skipUnless(support.Py_GIL_DISABLED, "only in free-threaded builds") class TLBCTests(unittest.TestCase): - @requires_specialization_of("BINARY_OP") + @requires_specialization_ft def test_new_threads_start_with_unspecialized_code(self): code = textwrap.dedent(""" import dis @@ -47,7 +47,7 @@ def f(a, b, q=None): """) assert_python_ok("-X", "tlbc=1", "-c", code) - @requires_specialization_of("BINARY_OP") + @requires_specialization_ft def test_threads_specialize_independently(self): code = textwrap.dedent(""" import dis diff --git a/Lib/test/test_type_cache.py b/Lib/test/test_type_cache.py index 1d431cbecfeec1..66abe73f8d766d 100644 --- a/Lib/test/test_type_cache.py +++ b/Lib/test/test_type_cache.py @@ -2,7 +2,7 @@ import unittest import dis from test 
import support -from test.support import import_helper, requires_specialization_of +from test.support import import_helper, requires_specialization try: from sys import _clear_type_cache except ImportError: @@ -110,6 +110,7 @@ class HolderSub(Holder): HolderSub.value @support.cpython_only +@requires_specialization class TypeCacheWithSpecializationTests(unittest.TestCase): def tearDown(self): _clear_type_cache() @@ -139,7 +140,6 @@ def _check_specialization(self, func, arg, opname, *, should_specialize): else: self.assertIn(opname, self._all_opnames(func)) - @requires_specialization_of("LOAD_ATTR") def test_class_load_attr_specialization_user_type(self): class A: def foo(self): @@ -160,7 +160,6 @@ def load_foo_2(type_): self._check_specialization(load_foo_2, A, "LOAD_ATTR", should_specialize=False) - @requires_specialization_of("LOAD_ATTR") def test_class_load_attr_specialization_static_type(self): self.assertNotEqual(type_get_version(str), 0) self.assertNotEqual(type_get_version(bytes), 0) @@ -172,7 +171,6 @@ def get_capitalize_1(type_): self.assertEqual(get_capitalize_1(str)('hello'), 'Hello') self.assertEqual(get_capitalize_1(bytes)(b'hello'), b'Hello') - @requires_specialization_of("LOAD_ATTR") def test_property_load_attr_specialization_user_type(self): class G: @property @@ -194,7 +192,6 @@ def load_x_2(instance): self._check_specialization(load_x_2, G(), "LOAD_ATTR", should_specialize=False) - @requires_specialization_of("STORE_ATTR") def test_store_attr_specialization_user_type(self): class B: __slots__ = ("bar",) @@ -214,7 +211,6 @@ def store_bar_2(type_): self._check_specialization(store_bar_2, B(), "STORE_ATTR", should_specialize=False) - @requires_specialization_of("CALL") def test_class_call_specialization_user_type(self): class F: def __init__(self): @@ -235,7 +231,6 @@ def call_class_2(type_): self._check_specialization(call_class_2, F, "CALL", should_specialize=False) - @requires_specialization_of("TO_BOOL") def 
test_to_bool_specialization_user_type(self): class H: pass diff --git a/Modules/_opcode.c b/Modules/_opcode.c index 23fc7d797a0b18..7ccf7af6bf908f 100644 --- a/Modules/_opcode.c +++ b/Modules/_opcode.c @@ -419,31 +419,12 @@ opcode_functions[] = { static int _opcode_exec(PyObject *m) { -#define ADD(X) \ - do { \ - if (PyModule_AddIntConstant(m, #X, (X)) < 0) { \ - return -1; \ - } \ - } while (0) - - ADD(ENABLE_SPECIALIZATION); - ADD(ENABLE_SPECIALIZED_BINARY_OP); - ADD(ENABLE_SPECIALIZED_BINARY_SUBSCR); - ADD(ENABLE_SPECIALIZED_CALL); - ADD(ENABLE_SPECIALIZED_CALL_KW); - ADD(ENABLE_SPECIALIZED_COMPARE_OP); - ADD(ENABLE_SPECIALIZED_CONTAINS_OP); - ADD(ENABLE_SPECIALIZED_FOR_ITER); - ADD(ENABLE_SPECIALIZED_LOAD_ATTR); - ADD(ENABLE_SPECIALIZED_LOAD_GLOBAL); - ADD(ENABLE_SPECIALIZED_LOAD_SUPER_ATTR); - ADD(ENABLE_SPECIALIZED_SEND); - ADD(ENABLE_SPECIALIZED_STORE_ATTR); - ADD(ENABLE_SPECIALIZED_STORE_SUBSCR); - ADD(ENABLE_SPECIALIZED_TO_BOOL); - ADD(ENABLE_SPECIALIZED_UNPACK_SEQUENCE); - -#undef ADD + if (PyModule_AddIntMacro(m, ENABLE_SPECIALIZATION) < 0) { + return -1; + } + if (PyModule_AddIntMacro(m, ENABLE_SPECIALIZATION_FT) < 0) { + return -1; + } return 0; } diff --git a/Python/bytecodes.c b/Python/bytecodes.c index 0cd877f63fa207..e8a89430ab3d4b 100644 --- a/Python/bytecodes.c +++ b/Python/bytecodes.c @@ -168,11 +168,11 @@ dummy_func( } op(_QUICKEN_RESUME, (--)) { - #if ENABLE_SPECIALIZATION + #if ENABLE_SPECIALIZATION_FT if (tstate->tracing == 0 && this_instr->op.code == RESUME) { FT_ATOMIC_STORE_UINT8_RELAXED(this_instr->op.code, RESUME_CHECK); } - #endif /* ENABLE_SPECIALIZATION */ + #endif /* ENABLE_SPECIALIZATION_FT */ } tier1 op(_MAYBE_INSTRUMENT, (--)) { @@ -404,7 +404,7 @@ dummy_func( }; specializing op(_SPECIALIZE_TO_BOOL, (counter/1, value -- value)) { - #if ENABLE_SPECIALIZED_TO_BOOL + #if ENABLE_SPECIALIZATION if (ADAPTIVE_COUNTER_TRIGGERS(counter)) { next_instr = this_instr; _Py_Specialize_ToBool(value, next_instr); @@ -412,7 +412,7 @@ dummy_func( } 
OPCODE_DEFERRED_INC(TO_BOOL); ADVANCE_ADAPTIVE_COUNTER(this_instr[1].counter); - #endif /* ENABLE_SPECIALIZED_TO_BOOL */ + #endif /* ENABLE_SPECIALIZATION */ } op(_TO_BOOL, (value -- res)) { @@ -717,7 +717,7 @@ dummy_func( }; specializing op(_SPECIALIZE_BINARY_SUBSCR, (counter/1, container, sub -- container, sub)) { - #if ENABLE_SPECIALIZED_BINARY_SUBSCR + #if ENABLE_SPECIALIZATION assert(frame->stackpointer == NULL); if (ADAPTIVE_COUNTER_TRIGGERS(counter)) { next_instr = this_instr; @@ -726,7 +726,7 @@ dummy_func( } OPCODE_DEFERRED_INC(BINARY_SUBSCR); ADVANCE_ADAPTIVE_COUNTER(this_instr[1].counter); - #endif /* ENABLE_SPECIALIZED_BINARY_SUBSCR */ + #endif /* ENABLE_SPECIALIZATION */ } op(_BINARY_SUBSCR, (container, sub -- res)) { @@ -923,7 +923,7 @@ dummy_func( }; specializing op(_SPECIALIZE_STORE_SUBSCR, (counter/1, container, sub -- container, sub)) { - #if ENABLE_SPECIALIZED_STORE_SUBSCR + #if ENABLE_SPECIALIZATION if (ADAPTIVE_COUNTER_TRIGGERS(counter)) { next_instr = this_instr; _Py_Specialize_StoreSubscr(container, sub, next_instr); @@ -931,7 +931,7 @@ dummy_func( } OPCODE_DEFERRED_INC(STORE_SUBSCR); ADVANCE_ADAPTIVE_COUNTER(this_instr[1].counter); - #endif /* ENABLE_SPECIALIZED_STORE_SUBSCR */ + #endif /* ENABLE_SPECIALIZATION */ } op(_STORE_SUBSCR, (v, container, sub -- )) { @@ -1127,7 +1127,7 @@ dummy_func( }; specializing op(_SPECIALIZE_SEND, (counter/1, receiver, unused -- receiver, unused)) { - #if ENABLE_SPECIALIZED_SEND + #if ENABLE_SPECIALIZATION if (ADAPTIVE_COUNTER_TRIGGERS(counter)) { next_instr = this_instr; _Py_Specialize_Send(receiver, next_instr); @@ -1135,7 +1135,7 @@ dummy_func( } OPCODE_DEFERRED_INC(SEND); ADVANCE_ADAPTIVE_COUNTER(this_instr[1].counter); - #endif /* ENABLE_SPECIALIZED_SEND */ + #endif /* ENABLE_SPECIALIZATION */ } op(_SEND, (receiver, v -- receiver, retval)) { @@ -1400,7 +1400,7 @@ dummy_func( }; specializing op(_SPECIALIZE_UNPACK_SEQUENCE, (counter/1, seq -- seq)) { - #if ENABLE_SPECIALIZED_UNPACK_SEQUENCE + #if 
ENABLE_SPECIALIZATION if (ADAPTIVE_COUNTER_TRIGGERS(counter)) { next_instr = this_instr; _Py_Specialize_UnpackSequence(seq, next_instr, oparg); @@ -1408,7 +1408,7 @@ dummy_func( } OPCODE_DEFERRED_INC(UNPACK_SEQUENCE); ADVANCE_ADAPTIVE_COUNTER(this_instr[1].counter); - #endif /* ENABLE_SPECIALIZED_UNPACK_SEQUENCE */ + #endif /* ENABLE_SPECIALIZATION */ (void)seq; (void)counter; } @@ -1471,7 +1471,7 @@ dummy_func( }; specializing op(_SPECIALIZE_STORE_ATTR, (counter/1, owner -- owner)) { - #if ENABLE_SPECIALIZED_STORE_ATTR + #if ENABLE_SPECIALIZATION if (ADAPTIVE_COUNTER_TRIGGERS(counter)) { PyObject *name = GETITEM(FRAME_CO_NAMES, oparg); next_instr = this_instr; @@ -1480,7 +1480,7 @@ dummy_func( } OPCODE_DEFERRED_INC(STORE_ATTR); ADVANCE_ADAPTIVE_COUNTER(this_instr[1].counter); - #endif /* ENABLE_SPECIALIZED_STORE_ATTR */ + #endif /* ENABLE_SPECIALIZATION */ } op(_STORE_ATTR, (v, owner --)) { @@ -1588,7 +1588,7 @@ dummy_func( }; specializing op(_SPECIALIZE_LOAD_GLOBAL, (counter/1 -- )) { - #if ENABLE_SPECIALIZED_LOAD_GLOBAL + #if ENABLE_SPECIALIZATION if (ADAPTIVE_COUNTER_TRIGGERS(counter)) { PyObject *name = GETITEM(FRAME_CO_NAMES, oparg>>1); next_instr = this_instr; @@ -1597,7 +1597,7 @@ dummy_func( } OPCODE_DEFERRED_INC(LOAD_GLOBAL); ADVANCE_ADAPTIVE_COUNTER(this_instr[1].counter); - #endif /* ENABLE_SPECIALIZED_LOAD_GLOBAL */ + #endif /* ENABLE_SPECIALIZATION */ } // res[1] because we need a pointer to res to pass it to _PyEval_LoadGlobalStackRef @@ -1938,7 +1938,7 @@ dummy_func( }; specializing op(_SPECIALIZE_LOAD_SUPER_ATTR, (counter/1, global_super_st, class_st, unused -- global_super_st, class_st, unused)) { - #if ENABLE_SPECIALIZED_LOAD_SUPER_ATTR + #if ENABLE_SPECIALIZATION int load_method = oparg & 1; if (ADAPTIVE_COUNTER_TRIGGERS(counter)) { next_instr = this_instr; @@ -1947,7 +1947,7 @@ dummy_func( } OPCODE_DEFERRED_INC(LOAD_SUPER_ATTR); ADVANCE_ADAPTIVE_COUNTER(this_instr[1].counter); - #endif /* ENABLE_SPECIALIZED_LOAD_SUPER_ATTR */ + #endif /* 
ENABLE_SPECIALIZATION */ } tier1 op(_LOAD_SUPER_ATTR, (global_super_st, class_st, self_st -- attr, null if (oparg & 1))) { @@ -2061,7 +2061,7 @@ dummy_func( }; specializing op(_SPECIALIZE_LOAD_ATTR, (counter/1, owner -- owner)) { - #if ENABLE_SPECIALIZED_LOAD_ATTR + #if ENABLE_SPECIALIZATION if (ADAPTIVE_COUNTER_TRIGGERS(counter)) { PyObject *name = GETITEM(FRAME_CO_NAMES, oparg>>1); next_instr = this_instr; @@ -2070,7 +2070,7 @@ dummy_func( } OPCODE_DEFERRED_INC(LOAD_ATTR); ADVANCE_ADAPTIVE_COUNTER(this_instr[1].counter); - #endif /* ENABLE_SPECIALIZED_LOAD_ATTR */ + #endif /* ENABLE_SPECIALIZATION */ } op(_LOAD_ATTR, (owner -- attr, self_or_null if (oparg & 1))) { @@ -2401,7 +2401,7 @@ dummy_func( }; specializing op(_SPECIALIZE_COMPARE_OP, (counter/1, left, right -- left, right)) { - #if ENABLE_SPECIALIZED_COMPARE_OP + #if ENABLE_SPECIALIZATION if (ADAPTIVE_COUNTER_TRIGGERS(counter)) { next_instr = this_instr; _Py_Specialize_CompareOp(left, right, next_instr, oparg); @@ -2409,7 +2409,7 @@ dummy_func( } OPCODE_DEFERRED_INC(COMPARE_OP); ADVANCE_ADAPTIVE_COUNTER(this_instr[1].counter); - #endif /* ENABLE_SPECIALIZED_COMPARE_OP */ + #endif /* ENABLE_SPECIALIZATION */ } op(_COMPARE_OP, (left, right -- res)) { @@ -2528,7 +2528,7 @@ dummy_func( } specializing op(_SPECIALIZE_CONTAINS_OP, (counter/1, left, right -- left, right)) { - #if ENABLE_SPECIALIZED_CONTAINS_OP + #if ENABLE_SPECIALIZATION if (ADAPTIVE_COUNTER_TRIGGERS(counter)) { next_instr = this_instr; _Py_Specialize_ContainsOp(right, next_instr); @@ -2536,7 +2536,7 @@ dummy_func( } OPCODE_DEFERRED_INC(CONTAINS_OP); ADVANCE_ADAPTIVE_COUNTER(this_instr[1].counter); - #endif /* ENABLE_SPECIALIZED_CONTAINS_OP */ + #endif /* ENABLE_SPECIALIZATION */ } macro(CONTAINS_OP) = _SPECIALIZE_CONTAINS_OP + _CONTAINS_OP; @@ -2633,7 +2633,7 @@ dummy_func( assert(oparg <= INSTR_OFFSET()); JUMPBY(-oparg); #ifdef _Py_TIER2 - #if ENABLE_SPECIALIZATION && !Py_GIL_DISABLED + #if ENABLE_SPECIALIZATION _Py_BackoffCounter counter = 
this_instr[1].counter; if (backoff_counter_triggers(counter) && this_instr->op.code == JUMP_BACKWARD) { _Py_CODEUNIT *start = this_instr; @@ -2854,7 +2854,7 @@ dummy_func( }; specializing op(_SPECIALIZE_FOR_ITER, (counter/1, iter -- iter)) { - #if ENABLE_SPECIALIZED_FOR_ITER + #if ENABLE_SPECIALIZATION if (ADAPTIVE_COUNTER_TRIGGERS(counter)) { next_instr = this_instr; _Py_Specialize_ForIter(iter, next_instr, oparg); @@ -2862,7 +2862,7 @@ dummy_func( } OPCODE_DEFERRED_INC(FOR_ITER); ADVANCE_ADAPTIVE_COUNTER(this_instr[1].counter); - #endif /* ENABLE_SPECIALIZED_FOR_ITER */ + #endif /* ENABLE_SPECIALIZATION */ } replaced op(_FOR_ITER, (iter -- iter, next)) { @@ -3330,7 +3330,7 @@ dummy_func( }; specializing op(_SPECIALIZE_CALL, (counter/1, callable[1], self_or_null[1], args[oparg] -- callable[1], self_or_null[1], args[oparg])) { - #if ENABLE_SPECIALIZED_CALL + #if ENABLE_SPECIALIZATION if (ADAPTIVE_COUNTER_TRIGGERS(counter)) { next_instr = this_instr; _Py_Specialize_Call(callable[0], next_instr, oparg + !PyStackRef_IsNull(self_or_null[0])); @@ -3338,7 +3338,7 @@ dummy_func( } OPCODE_DEFERRED_INC(CALL); ADVANCE_ADAPTIVE_COUNTER(this_instr[1].counter); - #endif /* ENABLE_SPECIALIZED_CALL */ + #endif /* ENABLE_SPECIALIZATION */ } op(_MAYBE_EXPAND_METHOD, (callable[1], self_or_null[1], args[oparg] -- func[1], maybe_self[1], args[oparg])) { @@ -4361,7 +4361,7 @@ dummy_func( _PUSH_FRAME; specializing op(_SPECIALIZE_CALL_KW, (counter/1, callable[1], self_or_null[1], args[oparg], kwnames -- callable[1], self_or_null[1], args[oparg], kwnames)) { - #if ENABLE_SPECIALIZED_CALL_KW + #if ENABLE_SPECIALIZATION if (ADAPTIVE_COUNTER_TRIGGERS(counter)) { next_instr = this_instr; _Py_Specialize_CallKw(callable[0], next_instr, oparg + !PyStackRef_IsNull(self_or_null[0])); @@ -4369,7 +4369,7 @@ dummy_func( } OPCODE_DEFERRED_INC(CALL_KW); ADVANCE_ADAPTIVE_COUNTER(this_instr[1].counter); - #endif /* ENABLE_SPECIALIZED_CALL_KW */ + #endif /* ENABLE_SPECIALIZATION */ } macro(CALL_KW) = @@ 
-4631,7 +4631,7 @@ dummy_func( } specializing op(_SPECIALIZE_BINARY_OP, (counter/1, lhs, rhs -- lhs, rhs)) { - #if ENABLE_SPECIALIZED_BINARY_OP + #if ENABLE_SPECIALIZATION_FT if (ADAPTIVE_COUNTER_TRIGGERS(counter)) { next_instr = this_instr; _Py_Specialize_BinaryOp(lhs, rhs, next_instr, oparg, LOCALS_ARRAY); @@ -4639,7 +4639,7 @@ dummy_func( } OPCODE_DEFERRED_INC(BINARY_OP); ADVANCE_ADAPTIVE_COUNTER(this_instr[1].counter); - #endif /* ENABLE_SPECIALIZED_BINARY_OP */ + #endif /* ENABLE_SPECIALIZATION_FT */ assert(NB_ADD <= oparg); assert(oparg <= NB_INPLACE_XOR); } diff --git a/Python/ceval.c b/Python/ceval.c index 58342e77042f9b..77d01c0bbab072 100644 --- a/Python/ceval.c +++ b/Python/ceval.c @@ -1034,6 +1034,8 @@ _PyEval_EvalFrameDefault(PyThreadState *tstate, _PyInterpreterFrame *frame, int #undef ENABLE_SPECIALIZATION #define ENABLE_SPECIALIZATION 0 +#undef ENABLE_SPECIALIZATION_FT +#define ENABLE_SPECIALIZATION_FT 0 #ifdef Py_DEBUG #define DPRINTF(level, ...) \ diff --git a/Python/ceval_macros.h b/Python/ceval_macros.h index 826c62190d3418..62499cb6198388 100644 --- a/Python/ceval_macros.h +++ b/Python/ceval_macros.h @@ -311,7 +311,7 @@ GETITEM(PyObject *v, Py_ssize_t i) { (COUNTER) = pause_backoff_counter((COUNTER)); \ } while (0); -#ifdef ENABLE_SPECIALIZATION +#ifdef ENABLE_SPECIALIZATION_FT /* Multiple threads may execute these concurrently if the thread-local bytecode * limit is reached and they all execute the main copy of the bytecode. This is * approximate, we do not need the RMW cycle to be atomic. 
diff --git a/Python/generated_cases.c.h b/Python/generated_cases.c.h index 613550d398e593..737b30b934258d 100644 --- a/Python/generated_cases.c.h +++ b/Python/generated_cases.c.h @@ -25,7 +25,7 @@ lhs = stack_pointer[-2]; uint16_t counter = read_u16(&this_instr[1].cache); (void)counter; - #if ENABLE_SPECIALIZED_BINARY_OP + #if ENABLE_SPECIALIZATION_FT if (ADAPTIVE_COUNTER_TRIGGERS(counter)) { next_instr = this_instr; _PyFrame_SetStackPointer(frame, stack_pointer); @@ -35,7 +35,7 @@ } OPCODE_DEFERRED_INC(BINARY_OP); ADVANCE_ADAPTIVE_COUNTER(this_instr[1].counter); - #endif /* ENABLE_SPECIALIZED_BINARY_OP */ + #endif /* ENABLE_SPECIALIZATION_FT */ assert(NB_ADD <= oparg); assert(oparg <= NB_INPLACE_XOR); } @@ -433,7 +433,7 @@ container = stack_pointer[-2]; uint16_t counter = read_u16(&this_instr[1].cache); (void)counter; - #if ENABLE_SPECIALIZED_BINARY_SUBSCR + #if ENABLE_SPECIALIZATION assert(frame->stackpointer == NULL); if (ADAPTIVE_COUNTER_TRIGGERS(counter)) { next_instr = this_instr; @@ -444,7 +444,7 @@ } OPCODE_DEFERRED_INC(BINARY_SUBSCR); ADVANCE_ADAPTIVE_COUNTER(this_instr[1].counter); - #endif /* ENABLE_SPECIALIZED_BINARY_SUBSCR */ + #endif /* ENABLE_SPECIALIZATION */ } // _BINARY_SUBSCR { @@ -872,7 +872,7 @@ callable = &stack_pointer[-2 - oparg]; uint16_t counter = read_u16(&this_instr[1].cache); (void)counter; - #if ENABLE_SPECIALIZED_CALL + #if ENABLE_SPECIALIZATION if (ADAPTIVE_COUNTER_TRIGGERS(counter)) { next_instr = this_instr; _PyFrame_SetStackPointer(frame, stack_pointer); @@ -882,7 +882,7 @@ } OPCODE_DEFERRED_INC(CALL); ADVANCE_ADAPTIVE_COUNTER(this_instr[1].counter); - #endif /* ENABLE_SPECIALIZED_CALL */ + #endif /* ENABLE_SPECIALIZATION */ } /* Skip 2 cache entries */ // _MAYBE_EXPAND_METHOD @@ -1888,7 +1888,7 @@ callable = &stack_pointer[-3 - oparg]; uint16_t counter = read_u16(&this_instr[1].cache); (void)counter; - #if ENABLE_SPECIALIZED_CALL_KW + #if ENABLE_SPECIALIZATION if (ADAPTIVE_COUNTER_TRIGGERS(counter)) { next_instr = this_instr; 
_PyFrame_SetStackPointer(frame, stack_pointer); @@ -1898,7 +1898,7 @@ } OPCODE_DEFERRED_INC(CALL_KW); ADVANCE_ADAPTIVE_COUNTER(this_instr[1].counter); - #endif /* ENABLE_SPECIALIZED_CALL_KW */ + #endif /* ENABLE_SPECIALIZATION */ } /* Skip 2 cache entries */ // _MAYBE_EXPAND_METHOD_KW @@ -3212,7 +3212,7 @@ left = stack_pointer[-2]; uint16_t counter = read_u16(&this_instr[1].cache); (void)counter; - #if ENABLE_SPECIALIZED_COMPARE_OP + #if ENABLE_SPECIALIZATION if (ADAPTIVE_COUNTER_TRIGGERS(counter)) { next_instr = this_instr; _PyFrame_SetStackPointer(frame, stack_pointer); @@ -3222,7 +3222,7 @@ } OPCODE_DEFERRED_INC(COMPARE_OP); ADVANCE_ADAPTIVE_COUNTER(this_instr[1].counter); - #endif /* ENABLE_SPECIALIZED_COMPARE_OP */ + #endif /* ENABLE_SPECIALIZATION */ } // _COMPARE_OP { @@ -3391,7 +3391,7 @@ right = stack_pointer[-1]; uint16_t counter = read_u16(&this_instr[1].cache); (void)counter; - #if ENABLE_SPECIALIZED_CONTAINS_OP + #if ENABLE_SPECIALIZATION if (ADAPTIVE_COUNTER_TRIGGERS(counter)) { next_instr = this_instr; _PyFrame_SetStackPointer(frame, stack_pointer); @@ -3401,7 +3401,7 @@ } OPCODE_DEFERRED_INC(CONTAINS_OP); ADVANCE_ADAPTIVE_COUNTER(this_instr[1].counter); - #endif /* ENABLE_SPECIALIZED_CONTAINS_OP */ + #endif /* ENABLE_SPECIALIZATION */ } // _CONTAINS_OP { @@ -3905,7 +3905,7 @@ iter = stack_pointer[-1]; uint16_t counter = read_u16(&this_instr[1].cache); (void)counter; - #if ENABLE_SPECIALIZED_FOR_ITER + #if ENABLE_SPECIALIZATION if (ADAPTIVE_COUNTER_TRIGGERS(counter)) { next_instr = this_instr; _PyFrame_SetStackPointer(frame, stack_pointer); @@ -3915,7 +3915,7 @@ } OPCODE_DEFERRED_INC(FOR_ITER); ADVANCE_ADAPTIVE_COUNTER(this_instr[1].counter); - #endif /* ENABLE_SPECIALIZED_FOR_ITER */ + #endif /* ENABLE_SPECIALIZATION */ } // _FOR_ITER { @@ -5133,7 +5133,7 @@ assert(oparg <= INSTR_OFFSET()); JUMPBY(-oparg); #ifdef _Py_TIER2 - #if ENABLE_SPECIALIZATION && !Py_GIL_DISABLED + #if ENABLE_SPECIALIZATION _Py_BackoffCounter counter = this_instr[1].counter; 
if (backoff_counter_triggers(counter) && this_instr->op.code == JUMP_BACKWARD) { _Py_CODEUNIT *start = this_instr; @@ -5254,7 +5254,7 @@ owner = stack_pointer[-1]; uint16_t counter = read_u16(&this_instr[1].cache); (void)counter; - #if ENABLE_SPECIALIZED_LOAD_ATTR + #if ENABLE_SPECIALIZATION if (ADAPTIVE_COUNTER_TRIGGERS(counter)) { PyObject *name = GETITEM(FRAME_CO_NAMES, oparg>>1); next_instr = this_instr; @@ -5265,7 +5265,7 @@ } OPCODE_DEFERRED_INC(LOAD_ATTR); ADVANCE_ADAPTIVE_COUNTER(this_instr[1].counter); - #endif /* ENABLE_SPECIALIZED_LOAD_ATTR */ + #endif /* ENABLE_SPECIALIZATION */ } /* Skip 8 cache entries */ // _LOAD_ATTR @@ -6133,7 +6133,7 @@ { uint16_t counter = read_u16(&this_instr[1].cache); (void)counter; - #if ENABLE_SPECIALIZED_LOAD_GLOBAL + #if ENABLE_SPECIALIZATION if (ADAPTIVE_COUNTER_TRIGGERS(counter)) { PyObject *name = GETITEM(FRAME_CO_NAMES, oparg>>1); next_instr = this_instr; @@ -6144,7 +6144,7 @@ } OPCODE_DEFERRED_INC(LOAD_GLOBAL); ADVANCE_ADAPTIVE_COUNTER(this_instr[1].counter); - #endif /* ENABLE_SPECIALIZED_LOAD_GLOBAL */ + #endif /* ENABLE_SPECIALIZATION */ } /* Skip 1 cache entry */ /* Skip 1 cache entry */ @@ -6338,7 +6338,7 @@ global_super_st = stack_pointer[-3]; uint16_t counter = read_u16(&this_instr[1].cache); (void)counter; - #if ENABLE_SPECIALIZED_LOAD_SUPER_ATTR + #if ENABLE_SPECIALIZATION int load_method = oparg & 1; if (ADAPTIVE_COUNTER_TRIGGERS(counter)) { next_instr = this_instr; @@ -6349,7 +6349,7 @@ } OPCODE_DEFERRED_INC(LOAD_SUPER_ATTR); ADVANCE_ADAPTIVE_COUNTER(this_instr[1].counter); - #endif /* ENABLE_SPECIALIZED_LOAD_SUPER_ATTR */ + #endif /* ENABLE_SPECIALIZATION */ } // _LOAD_SUPER_ATTR { @@ -6959,11 +6959,11 @@ } // _QUICKEN_RESUME { - #if ENABLE_SPECIALIZATION + #if ENABLE_SPECIALIZATION_FT if (tstate->tracing == 0 && this_instr->op.code == RESUME) { FT_ATOMIC_STORE_UINT8_RELAXED(this_instr->op.code, RESUME_CHECK); } - #endif /* ENABLE_SPECIALIZATION */ + #endif /* ENABLE_SPECIALIZATION_FT */ } // 
_CHECK_PERIODIC_IF_NOT_YIELD_FROM { @@ -7138,7 +7138,7 @@ receiver = stack_pointer[-2]; uint16_t counter = read_u16(&this_instr[1].cache); (void)counter; - #if ENABLE_SPECIALIZED_SEND + #if ENABLE_SPECIALIZATION if (ADAPTIVE_COUNTER_TRIGGERS(counter)) { next_instr = this_instr; _PyFrame_SetStackPointer(frame, stack_pointer); @@ -7148,7 +7148,7 @@ } OPCODE_DEFERRED_INC(SEND); ADVANCE_ADAPTIVE_COUNTER(this_instr[1].counter); - #endif /* ENABLE_SPECIALIZED_SEND */ + #endif /* ENABLE_SPECIALIZATION */ } // _SEND { @@ -7376,7 +7376,7 @@ owner = stack_pointer[-1]; uint16_t counter = read_u16(&this_instr[1].cache); (void)counter; - #if ENABLE_SPECIALIZED_STORE_ATTR + #if ENABLE_SPECIALIZATION if (ADAPTIVE_COUNTER_TRIGGERS(counter)) { PyObject *name = GETITEM(FRAME_CO_NAMES, oparg); next_instr = this_instr; @@ -7387,7 +7387,7 @@ } OPCODE_DEFERRED_INC(STORE_ATTR); ADVANCE_ADAPTIVE_COUNTER(this_instr[1].counter); - #endif /* ENABLE_SPECIALIZED_STORE_ATTR */ + #endif /* ENABLE_SPECIALIZATION */ } /* Skip 3 cache entries */ // _STORE_ATTR @@ -7715,7 +7715,7 @@ container = stack_pointer[-2]; uint16_t counter = read_u16(&this_instr[1].cache); (void)counter; - #if ENABLE_SPECIALIZED_STORE_SUBSCR + #if ENABLE_SPECIALIZATION if (ADAPTIVE_COUNTER_TRIGGERS(counter)) { next_instr = this_instr; _PyFrame_SetStackPointer(frame, stack_pointer); @@ -7725,7 +7725,7 @@ } OPCODE_DEFERRED_INC(STORE_SUBSCR); ADVANCE_ADAPTIVE_COUNTER(this_instr[1].counter); - #endif /* ENABLE_SPECIALIZED_STORE_SUBSCR */ + #endif /* ENABLE_SPECIALIZATION */ } // _STORE_SUBSCR { @@ -7836,7 +7836,7 @@ value = stack_pointer[-1]; uint16_t counter = read_u16(&this_instr[1].cache); (void)counter; - #if ENABLE_SPECIALIZED_TO_BOOL + #if ENABLE_SPECIALIZATION if (ADAPTIVE_COUNTER_TRIGGERS(counter)) { next_instr = this_instr; _PyFrame_SetStackPointer(frame, stack_pointer); @@ -7846,7 +7846,7 @@ } OPCODE_DEFERRED_INC(TO_BOOL); ADVANCE_ADAPTIVE_COUNTER(this_instr[1].counter); - #endif /* ENABLE_SPECIALIZED_TO_BOOL */ + 
#endif /* ENABLE_SPECIALIZATION */ } /* Skip 2 cache entries */ // _TO_BOOL @@ -8072,7 +8072,7 @@ seq = stack_pointer[-1]; uint16_t counter = read_u16(&this_instr[1].cache); (void)counter; - #if ENABLE_SPECIALIZED_UNPACK_SEQUENCE + #if ENABLE_SPECIALIZATION if (ADAPTIVE_COUNTER_TRIGGERS(counter)) { next_instr = this_instr; _PyFrame_SetStackPointer(frame, stack_pointer); @@ -8082,7 +8082,7 @@ } OPCODE_DEFERRED_INC(UNPACK_SEQUENCE); ADVANCE_ADAPTIVE_COUNTER(this_instr[1].counter); - #endif /* ENABLE_SPECIALIZED_UNPACK_SEQUENCE */ + #endif /* ENABLE_SPECIALIZATION */ (void)seq; (void)counter; } diff --git a/Python/specialize.c b/Python/specialize.c index 2d5bfa3f79db6b..a3fb3d27ffe3aa 100644 --- a/Python/specialize.c +++ b/Python/specialize.c @@ -459,7 +459,7 @@ do { \ void _PyCode_Quicken(_Py_CODEUNIT *instructions, Py_ssize_t size) { - #if ENABLE_SPECIALIZATION + #if ENABLE_SPECIALIZATION_FT int opcode = 0; /* The last code unit cannot have a cache, so we don't need to check it */ for (Py_ssize_t i = 0; i < size-1; i++) { @@ -484,7 +484,7 @@ _PyCode_Quicken(_Py_CODEUNIT *instructions, Py_ssize_t size) i += caches; } } - #endif /* ENABLE_SPECIALIZATION */ + #endif /* ENABLE_SPECIALIZATION_FT */ } #ifdef Py_GIL_DISABLED @@ -739,7 +739,7 @@ _Py_Specialize_LoadSuperAttr(_PyStackRef global_super_st, _PyStackRef cls_st, _P PyObject *global_super = PyStackRef_AsPyObjectBorrow(global_super_st); PyObject *cls = PyStackRef_AsPyObjectBorrow(cls_st); - assert(ENABLE_SPECIALIZED_LOAD_SUPER_ATTR); + assert(ENABLE_SPECIALIZATION); assert(_PyOpcode_Caches[LOAD_SUPER_ATTR] == INLINE_CACHE_ENTRIES_LOAD_SUPER_ATTR); _PySuperAttrCache *cache = (_PySuperAttrCache *)(instr + 1); if (global_super != (PyObject *)&PySuper_Type) { @@ -1155,7 +1155,7 @@ _Py_Specialize_LoadAttr(_PyStackRef owner_st, _Py_CODEUNIT *instr, PyObject *nam _PyAttrCache *cache = (_PyAttrCache *)(instr + 1); PyObject *owner = PyStackRef_AsPyObjectBorrow(owner_st); - assert(ENABLE_SPECIALIZED_LOAD_ATTR); + 
assert(ENABLE_SPECIALIZATION); assert(_PyOpcode_Caches[LOAD_ATTR] == INLINE_CACHE_ENTRIES_LOAD_ATTR); PyTypeObject *type = Py_TYPE(owner); bool fail; @@ -1194,7 +1194,7 @@ _Py_Specialize_StoreAttr(_PyStackRef owner_st, _Py_CODEUNIT *instr, PyObject *na { PyObject *owner = PyStackRef_AsPyObjectBorrow(owner_st); - assert(ENABLE_SPECIALIZED_STORE_ATTR); + assert(ENABLE_SPECIALIZATION); assert(_PyOpcode_Caches[STORE_ATTR] == INLINE_CACHE_ENTRIES_STORE_ATTR); _PyAttrCache *cache = (_PyAttrCache *)(instr + 1); PyTypeObject *type = Py_TYPE(owner); @@ -1472,7 +1472,7 @@ _Py_Specialize_LoadGlobal( PyObject *globals, PyObject *builtins, _Py_CODEUNIT *instr, PyObject *name) { - assert(ENABLE_SPECIALIZED_LOAD_GLOBAL); + assert(ENABLE_SPECIALIZATION); assert(_PyOpcode_Caches[LOAD_GLOBAL] == INLINE_CACHE_ENTRIES_LOAD_GLOBAL); /* Use inline cache */ _PyLoadGlobalCache *cache = (_PyLoadGlobalCache *)(instr + 1); @@ -1662,7 +1662,7 @@ _Py_Specialize_BinarySubscr( PyObject *container = PyStackRef_AsPyObjectBorrow(container_st); PyObject *sub = PyStackRef_AsPyObjectBorrow(sub_st); - assert(ENABLE_SPECIALIZED_BINARY_SUBSCR); + assert(ENABLE_SPECIALIZATION); assert(_PyOpcode_Caches[BINARY_SUBSCR] == INLINE_CACHE_ENTRIES_BINARY_SUBSCR); _PyBinarySubscrCache *cache = (_PyBinarySubscrCache *)(instr + 1); @@ -1765,7 +1765,7 @@ _Py_Specialize_StoreSubscr(_PyStackRef container_st, _PyStackRef sub_st, _Py_COD PyObject *container = PyStackRef_AsPyObjectBorrow(container_st); PyObject *sub = PyStackRef_AsPyObjectBorrow(sub_st); - assert(ENABLE_SPECIALIZED_STORE_SUBSCR); + assert(ENABLE_SPECIALIZATION); _PyStoreSubscrCache *cache = (_PyStoreSubscrCache *)(instr + 1); PyTypeObject *container_type = Py_TYPE(container); if (container_type == &PyList_Type) { @@ -2106,7 +2106,7 @@ _Py_Specialize_Call(_PyStackRef callable_st, _Py_CODEUNIT *instr, int nargs) { PyObject *callable = PyStackRef_AsPyObjectBorrow(callable_st); - assert(ENABLE_SPECIALIZED_CALL); + assert(ENABLE_SPECIALIZATION); 
assert(_PyOpcode_Caches[CALL] == INLINE_CACHE_ENTRIES_CALL); assert(_Py_OPCODE(*instr) != INSTRUMENTED_CALL); _PyCallCache *cache = (_PyCallCache *)(instr + 1); @@ -2266,7 +2266,7 @@ _Py_Specialize_BinaryOp(_PyStackRef lhs_st, _PyStackRef rhs_st, _Py_CODEUNIT *in { PyObject *lhs = PyStackRef_AsPyObjectBorrow(lhs_st); PyObject *rhs = PyStackRef_AsPyObjectBorrow(rhs_st); - assert(ENABLE_SPECIALIZED_BINARY_OP); + assert(ENABLE_SPECIALIZATION_FT); assert(_PyOpcode_Caches[BINARY_OP] == INLINE_CACHE_ENTRIES_BINARY_OP); _PyBinaryOpCache *cache = (_PyBinaryOpCache *)(instr + 1); uint8_t specialized_op; @@ -2378,7 +2378,7 @@ _Py_Specialize_CompareOp(_PyStackRef lhs_st, _PyStackRef rhs_st, _Py_CODEUNIT *i PyObject *lhs = PyStackRef_AsPyObjectBorrow(lhs_st); PyObject *rhs = PyStackRef_AsPyObjectBorrow(rhs_st); - assert(ENABLE_SPECIALIZED_COMPARE_OP); + assert(ENABLE_SPECIALIZATION); assert(_PyOpcode_Caches[COMPARE_OP] == INLINE_CACHE_ENTRIES_COMPARE_OP); // All of these specializations compute boolean values, so they're all valid // regardless of the fifth-lowest oparg bit. 
@@ -2442,7 +2442,7 @@ _Py_Specialize_UnpackSequence(_PyStackRef seq_st, _Py_CODEUNIT *instr, int oparg { PyObject *seq = PyStackRef_AsPyObjectBorrow(seq_st); - assert(ENABLE_SPECIALIZED_UNPACK_SEQUENCE); + assert(ENABLE_SPECIALIZATION); assert(_PyOpcode_Caches[UNPACK_SEQUENCE] == INLINE_CACHE_ENTRIES_UNPACK_SEQUENCE); _PyUnpackSequenceCache *cache = (_PyUnpackSequenceCache *)(instr + 1); @@ -2553,7 +2553,7 @@ int void _Py_Specialize_ForIter(_PyStackRef iter, _Py_CODEUNIT *instr, int oparg) { - assert(ENABLE_SPECIALIZED_FOR_ITER); + assert(ENABLE_SPECIALIZATION); assert(_PyOpcode_Caches[FOR_ITER] == INLINE_CACHE_ENTRIES_FOR_ITER); _PyForIterCache *cache = (_PyForIterCache *)(instr + 1); PyObject *iter_o = PyStackRef_AsPyObjectBorrow(iter); @@ -2598,7 +2598,7 @@ _Py_Specialize_Send(_PyStackRef receiver_st, _Py_CODEUNIT *instr) { PyObject *receiver = PyStackRef_AsPyObjectBorrow(receiver_st); - assert(ENABLE_SPECIALIZED_SEND); + assert(ENABLE_SPECIALIZATION); assert(_PyOpcode_Caches[SEND] == INLINE_CACHE_ENTRIES_SEND); _PySendCache *cache = (_PySendCache *)(instr + 1); PyTypeObject *tp = Py_TYPE(receiver); @@ -2625,7 +2625,7 @@ _Py_Specialize_Send(_PyStackRef receiver_st, _Py_CODEUNIT *instr) void _Py_Specialize_ToBool(_PyStackRef value_o, _Py_CODEUNIT *instr) { - assert(ENABLE_SPECIALIZED_TO_BOOL); + assert(ENABLE_SPECIALIZATION); assert(_PyOpcode_Caches[TO_BOOL] == INLINE_CACHE_ENTRIES_TO_BOOL); _PyToBoolCache *cache = (_PyToBoolCache *)(instr + 1); PyObject *value = PyStackRef_AsPyObjectBorrow(value_o); @@ -2742,7 +2742,7 @@ _Py_Specialize_ContainsOp(_PyStackRef value_st, _Py_CODEUNIT *instr) { PyObject *value = PyStackRef_AsPyObjectBorrow(value_st); - assert(ENABLE_SPECIALIZED_CONTAINS_OP); + assert(ENABLE_SPECIALIZATION); assert(_PyOpcode_Caches[CONTAINS_OP] == INLINE_CACHE_ENTRIES_COMPARE_OP); _PyContainsOpCache *cache = (_PyContainsOpCache *)(instr + 1); if (PyDict_CheckExact(value)) { From 4e424140daac45e736956810c58dba1c734f0250 Mon Sep 17 00:00:00 2001 From: 
Matt Page Date: Sat, 12 Oct 2024 23:39:43 -0700 Subject: [PATCH 51/67] Replace bytecode pointer with tlbc_index This uses less space --- Include/internal/pycore_frame.h | 42 ++++++++-- Include/internal/pycore_opcode_metadata.h | 2 +- Include/internal/pycore_tstate.h | 1 - Include/internal/pycore_uop_metadata.h | 2 +- Objects/frameobject.c | 7 +- Python/bytecodes.c | 57 ++++---------- Python/ceval.c | 20 ++--- Python/executor_cases.c.h | 27 +------ Python/generated_cases.c.h | 95 +++++++---------------- 9 files changed, 98 insertions(+), 155 deletions(-) diff --git a/Include/internal/pycore_frame.h b/Include/internal/pycore_frame.h index 5f29c1ef9490e7..8a0f7ea387d7d5 100644 --- a/Include/internal/pycore_frame.h +++ b/Include/internal/pycore_frame.h @@ -69,7 +69,8 @@ typedef struct _PyInterpreterFrame { PyFrameObject *frame_obj; /* Strong reference, may be NULL. Only valid if not on C stack */ _Py_CODEUNIT *instr_ptr; /* Instruction currently executing (or about to begin) */ #ifdef Py_GIL_DISABLED - _Py_CODEUNIT *bytecode; + /* Index of thread-local bytecode containing instr_ptr. 
*/ + Py_ssize_t tlbc_index; #endif _PyStackRef *stackpointer; uint16_t return_offset; /* Only relevant during a function call */ @@ -91,7 +92,9 @@ static inline _Py_CODEUNIT * _PyFrame_GetBytecode(_PyInterpreterFrame *f) { #ifdef Py_GIL_DISABLED - return f->bytecode; + PyCodeObject *co = _PyFrame_GetCode(f); + assert(f->tlbc_index >= 0 && f->tlbc_index < co->co_tlbc->size); + return (_Py_CODEUNIT *)co->co_tlbc->entries[f->tlbc_index]; #else return _PyCode_CODE(_PyFrame_GetCode(f)); #endif @@ -157,13 +160,32 @@ static inline void _PyFrame_Copy(_PyInterpreterFrame *src, _PyInterpreterFrame * #endif } +#ifdef Py_GIL_DISABLED +static inline void +_PyFrame_InitializeTLBC(PyThreadState *tstate, _PyInterpreterFrame *frame, + PyCodeObject *code) +{ + _Py_CODEUNIT *tlbc = _PyCode_GetTLBCFast(tstate, code); + if (tlbc == NULL) { + // No thread-local bytecode exists for this thread yet, use the main + // thread's copy. It will be created on the first RESUME. + frame->instr_ptr = _PyCode_CODE(code); + frame->tlbc_index = 0; + } + else { + frame->instr_ptr = tlbc; + frame->tlbc_index = ((_PyThreadStateImpl *)tstate)->tlbc_index; + } +} +#endif + /* Consumes reference to func and locals. Does not initialize frame->previous, which happens when frame is linked into the frame stack. 
*/ static inline void _PyFrame_Initialize( - _PyInterpreterFrame *frame, _PyStackRef func, + PyThreadState *tstate, _PyInterpreterFrame *frame, _PyStackRef func, PyObject *locals, PyCodeObject *code, int null_locals_from, _PyInterpreterFrame *previous) { frame->previous = previous; @@ -175,7 +197,12 @@ _PyFrame_Initialize( frame->f_locals = locals; frame->stackpointer = frame->localsplus + code->co_nlocalsplus; frame->frame_obj = NULL; +#ifdef Py_GIL_DISABLED + _PyFrame_InitializeTLBC(tstate, frame, code); +#else + (void)tstate; frame->instr_ptr = _PyCode_CODE(code); +#endif frame->return_offset = 0; frame->owner = FRAME_OWNED_BY_THREAD; @@ -184,7 +211,6 @@ _PyFrame_Initialize( } #ifdef Py_GIL_DISABLED - frame->bytecode = frame->instr_ptr; // On GIL disabled, we walk the entire stack in GC. Since stacktop // is not always in sync with the real stack pointer, we have // no choice but to traverse the entire stack. @@ -330,7 +356,8 @@ _PyFrame_PushUnchecked(PyThreadState *tstate, _PyStackRef func, int null_locals_ _PyInterpreterFrame *new_frame = (_PyInterpreterFrame *)tstate->datastack_top; tstate->datastack_top += code->co_framesize; assert(tstate->datastack_top < tstate->datastack_limit); - _PyFrame_Initialize(new_frame, func, NULL, code, null_locals_from, previous); + _PyFrame_Initialize(tstate, new_frame, func, NULL, code, null_locals_from, + previous); return new_frame; } @@ -354,12 +381,15 @@ _PyFrame_PushTrampolineUnchecked(PyThreadState *tstate, PyCodeObject *code, int assert(stackdepth <= code->co_stacksize); frame->stackpointer = frame->localsplus + code->co_nlocalsplus + stackdepth; frame->frame_obj = NULL; +#ifdef Py_GIL_DISABLED + _PyFrame_InitializeTLBC(tstate, frame, code); +#else frame->instr_ptr = _PyCode_CODE(code); +#endif frame->owner = FRAME_OWNED_BY_THREAD; frame->return_offset = 0; #ifdef Py_GIL_DISABLED - frame->bytecode = frame->instr_ptr; assert(code->co_nlocalsplus == 0); for (int i = 0; i < code->co_stacksize; i++) { frame->localsplus[i] = 
PyStackRef_NULL; diff --git a/Include/internal/pycore_opcode_metadata.h b/Include/internal/pycore_opcode_metadata.h index 1a390d84b8d042..8fec45b1e8d5c3 100644 --- a/Include/internal/pycore_opcode_metadata.h +++ b/Include/internal/pycore_opcode_metadata.h @@ -1186,7 +1186,7 @@ const struct opcode_metadata _PyOpcode_opcode_metadata[266] = { [RERAISE] = { true, INSTR_FMT_IB, HAS_ARG_FLAG | HAS_ERROR_FLAG | HAS_ERROR_NO_POP_FLAG | HAS_ESCAPES_FLAG }, [RESERVED] = { true, INSTR_FMT_IX, 0 }, [RESUME] = { true, INSTR_FMT_IB, HAS_ARG_FLAG | HAS_EVAL_BREAK_FLAG | HAS_ERROR_FLAG | HAS_ERROR_NO_POP_FLAG | HAS_ESCAPES_FLAG }, - [RESUME_CHECK] = { true, INSTR_FMT_IX, HAS_DEOPT_FLAG | HAS_ESCAPES_FLAG }, + [RESUME_CHECK] = { true, INSTR_FMT_IX, HAS_DEOPT_FLAG }, [RETURN_CONST] = { true, INSTR_FMT_IB, HAS_ARG_FLAG | HAS_CONST_FLAG }, [RETURN_GENERATOR] = { true, INSTR_FMT_IX, HAS_ERROR_FLAG | HAS_ESCAPES_FLAG }, [RETURN_VALUE] = { true, INSTR_FMT_IX, 0 }, diff --git a/Include/internal/pycore_tstate.h b/Include/internal/pycore_tstate.h index 847de79a1fbb1a..19953b2fe6a329 100644 --- a/Include/internal/pycore_tstate.h +++ b/Include/internal/pycore_tstate.h @@ -52,7 +52,6 @@ typedef struct _PyThreadStateImpl { } _PyThreadStateImpl; - #ifdef __cplusplus } #endif diff --git a/Include/internal/pycore_uop_metadata.h b/Include/internal/pycore_uop_metadata.h index 749fd62c015dd3..890aa7815886bf 100644 --- a/Include/internal/pycore_uop_metadata.h +++ b/Include/internal/pycore_uop_metadata.h @@ -22,7 +22,7 @@ const uint16_t _PyUop_Flags[MAX_UOP_ID+1] = { [_NOP] = HAS_PURE_FLAG, [_CHECK_PERIODIC] = HAS_EVAL_BREAK_FLAG | HAS_ERROR_FLAG | HAS_ESCAPES_FLAG, [_CHECK_PERIODIC_IF_NOT_YIELD_FROM] = HAS_ARG_FLAG | HAS_EVAL_BREAK_FLAG | HAS_ERROR_FLAG | HAS_ESCAPES_FLAG, - [_RESUME_CHECK] = HAS_DEOPT_FLAG | HAS_ESCAPES_FLAG, + [_RESUME_CHECK] = HAS_DEOPT_FLAG, [_LOAD_FAST_CHECK] = HAS_ARG_FLAG | HAS_LOCAL_FLAG | HAS_ERROR_FLAG | HAS_ESCAPES_FLAG, [_LOAD_FAST_0] = HAS_LOCAL_FLAG | HAS_PURE_FLAG, 
[_LOAD_FAST_1] = HAS_LOCAL_FLAG | HAS_PURE_FLAG, diff --git a/Objects/frameobject.c b/Objects/frameobject.c index d4931b0eb4a087..2a43a174ccab02 100644 --- a/Objects/frameobject.c +++ b/Objects/frameobject.c @@ -1804,10 +1804,11 @@ PyTypeObject PyFrame_Type = { }; static void -init_frame(_PyInterpreterFrame *frame, PyFunctionObject *func, PyObject *locals) +init_frame(PyThreadState *tstate, _PyInterpreterFrame *frame, + PyFunctionObject *func, PyObject *locals) { PyCodeObject *code = (PyCodeObject *)func->func_code; - _PyFrame_Initialize(frame, PyStackRef_FromPyObjectNew(func), + _PyFrame_Initialize(tstate, frame, PyStackRef_FromPyObjectNew(func), Py_XNewRef(locals), code, 0, NULL); } @@ -1858,7 +1859,7 @@ PyFrame_New(PyThreadState *tstate, PyCodeObject *code, Py_DECREF(func); return NULL; } - init_frame((_PyInterpreterFrame *)f->_f_frame_data, func, locals); + init_frame(tstate, (_PyInterpreterFrame *)f->_f_frame_data, func, locals); f->f_frame = (_PyInterpreterFrame *)f->_f_frame_data; f->f_frame->owner = FRAME_OWNED_BY_FRAME_OBJECT; // This frame needs to be "complete", so pretend that the first RESUME ran: diff --git a/Python/bytecodes.c b/Python/bytecodes.c index e8a89430ab3d4b..44ca1239c92843 100644 --- a/Python/bytecodes.c +++ b/Python/bytecodes.c @@ -192,28 +192,20 @@ dummy_func( op(_LOAD_BYTECODE, (--)) { #ifdef Py_GIL_DISABLED - // Work around a bug in the cases_generator logic that inserts code - // to save and restore the stack pointer. Without splitting these - // lines the cases_generator will insert code to save the stack - // pointer before the `#ifdef Py_GIL_DISABLED` and will insert code - // to clear the stack pointer immediately after the call to - // `_PyEval_GetExecutableCode` below. As a result, the stack - // pointer won't properly be cleared in default (with-gil) - // builds. 
By putting the declaration and assignment on separate - lines, we cause the cases_generator to correctly insert the code - to save and clear the stack pointer immediately before and after - the call to `_PyEval_GetExecutableCode`. - _Py_CODEUNIT *bytecode; - bytecode = _PyEval_GetExecutableCode(tstate, _PyFrame_GetCode(frame)); - ERROR_IF(bytecode == NULL, error); - if (frame->bytecode != bytecode) { - int off = this_instr - frame->bytecode; - frame->bytecode = bytecode; - frame->instr_ptr = frame->bytecode + off; + if (frame->tlbc_index != + ((_PyThreadStateImpl *)tstate)->tlbc_index) { + _Py_CODEUNIT *bytecode = + _PyEval_GetExecutableCode(tstate, _PyFrame_GetCode(frame)); + ERROR_IF(bytecode == NULL, error); + int off = this_instr - _PyFrame_GetBytecode(frame); + frame->tlbc_index = ((_PyThreadStateImpl *)tstate)->tlbc_index; + frame->instr_ptr = bytecode + off; + // Make sure this_instr gets reset correctly for any uops that + // follow next_instr = frame->instr_ptr; DISPATCH(); } - #endif +#endif } macro(RESUME) = @@ -232,30 +224,9 @@ dummy_func( assert((version & _PY_EVAL_EVENTS_MASK) == 0); DEOPT_IF(eval_breaker != version); #ifdef Py_GIL_DISABLED - // Work around a bug in the cases_generator logic that inserts code - // to save and restore the stack pointer. Without splitting these - // lines the cases_generator will insert code to save the stack - // pointer before the `#ifdef Py_GIL_DISABLED` and will insert code - // to clear the stack pointer immediately after the call to - // `_PyCode_GetTLBCFast` below. As a result, the stack - // pointer won't properly be cleared in default (with-gil) - // builds. By putting the declaration and assignment on separate - // lines, we cause the cases_generator to correctly insert the code - // to save and clear the stack pointer immediately before and after - // the call to `_PyCode_GetTLBCFast`. 
- _Py_CODEUNIT *bytecode; - bytecode = _PyCode_GetTLBCFast(tstate, _PyFrame_GetCode(frame)); - DEOPT_IF(bytecode == NULL); - if (frame->bytecode != bytecode) { - /* Avoid using this_instr here so that _RESUME_CHECK can be included - in traces. - */ - int off = frame->instr_ptr - frame->bytecode; - frame->bytecode = bytecode; - frame->instr_ptr = frame->bytecode + off; - next_instr = frame->instr_ptr + 1; - } - #endif + DEOPT_IF(frame->tlbc_index != + ((_PyThreadStateImpl *)tstate)->tlbc_index); +#endif } op(_MONITOR_RESUME, (--)) { diff --git a/Python/ceval.c b/Python/ceval.c index 77d01c0bbab072..9362648ee4e3b7 100644 --- a/Python/ceval.c +++ b/Python/ceval.c @@ -819,15 +819,15 @@ _PyEval_EvalFrameDefault(PyThreadState *tstate, _PyInterpreterFrame *frame, int * we need to update instrumentation */ #ifdef Py_GIL_DISABLED /* Load thread-local bytecode */ - _Py_CODEUNIT *bytecode = - _PyEval_GetExecutableCode(tstate, _PyFrame_GetCode(frame)); - if (bytecode == NULL) { - goto error; - } - if (frame->bytecode != bytecode) { - ptrdiff_t off = frame->instr_ptr - frame->bytecode; - frame->bytecode = bytecode; - frame->instr_ptr = frame->bytecode + off; + if (frame->tlbc_index != ((_PyThreadStateImpl *)tstate)->tlbc_index) { + _Py_CODEUNIT *bytecode = + _PyEval_GetExecutableCode(tstate, _PyFrame_GetCode(frame)); + if (bytecode == NULL) { + goto error; + } + ptrdiff_t off = frame->instr_ptr - _PyFrame_GetBytecode(frame); + frame->tlbc_index = ((_PyThreadStateImpl *)tstate)->tlbc_index; + frame->instr_ptr = bytecode + off; } #endif _Py_Instrument(_PyFrame_GetCode(frame), tstate->interp); @@ -1751,7 +1751,7 @@ _PyEvalFramePushAndInit(PyThreadState *tstate, _PyStackRef func, if (frame == NULL) { goto fail; } - _PyFrame_Initialize(frame, func, locals, code, 0, previous); + _PyFrame_Initialize(tstate, frame, func, locals, code, 0, previous); if (initialize_locals(tstate, func_obj, frame->localsplus, args, argcount, kwnames)) { assert(frame->owner == FRAME_OWNED_BY_THREAD); 
clear_thread_frame(tstate, frame); diff --git a/Python/executor_cases.c.h b/Python/executor_cases.c.h index d790de529b74b3..5882c472405668 100644 --- a/Python/executor_cases.c.h +++ b/Python/executor_cases.c.h @@ -59,34 +59,11 @@ JUMP_TO_JUMP_TARGET(); } #ifdef Py_GIL_DISABLED - // Work around a bug in the cases_generator logic that inserts code - // to save and restore the stack pointer. Without splitting these - // lines the cases_generator will insert code to save the stack - // pointer before the `#ifdef Py_GIL_DISABLED` and will insert code - // to clear the stack pointer immediately after the call to - // `_PyCode_GetTLBCFast` below. As a result, the stack - // pointer won't properly be cleared in default (with-gil) - // builds. By putting the declaration and assignment on separate - // lines, we cause the cases_generator to correctly insert the code - // to save and clear the stack pointer immediately before and after - // the call to `_PyCode_GetTLBCFast`. - _Py_CODEUNIT *bytecode; - _PyFrame_SetStackPointer(frame, stack_pointer); - bytecode = _PyCode_GetTLBCFast(tstate, _PyFrame_GetCode(frame)); - stack_pointer = _PyFrame_GetStackPointer(frame); - if (bytecode == NULL) { + if (frame->tlbc_index != + ((_PyThreadStateImpl *)tstate)->tlbc_index) { UOP_STAT_INC(uopcode, miss); JUMP_TO_JUMP_TARGET(); } - if (frame->bytecode != bytecode) { - /* Avoid using this_instr here so that _RESUME_CHECK can be included - in traces. - */ - int off = frame->instr_ptr - frame->bytecode; - frame->bytecode = bytecode; - frame->instr_ptr = frame->bytecode + off; - next_instr = frame->instr_ptr + 1; - } #endif break; } diff --git a/Python/generated_cases.c.h b/Python/generated_cases.c.h index 737b30b934258d..a64c4428b70290 100644 --- a/Python/generated_cases.c.h +++ b/Python/generated_cases.c.h @@ -4831,26 +4831,20 @@ // _LOAD_BYTECODE { #ifdef Py_GIL_DISABLED - // Work around a bug in the cases_generator logic that inserts code - // to save and restore the stack pointer. 
Without splitting these - // lines the cases_generator will insert code to save the stack - // pointer before the `#ifdef Py_GIL_DISABLED` and will insert code - // to clear the stack pointer immediately after the call to - // `_PyEval_GetExecutableCode` below. As a result, the stack - // pointer won't properly be cleared in default (with-gil) - // builds. By putting the declaration and assignment on separate - // lines, we cause the cases_generator to correctly insert the code - // to save and clear the stack pointer immediately before and after - // the call to `_PyEval_GetExecutableCode`. - _Py_CODEUNIT *bytecode; - _PyFrame_SetStackPointer(frame, stack_pointer); - bytecode = _PyEval_GetExecutableCode(tstate, _PyFrame_GetCode(frame)); - stack_pointer = _PyFrame_GetStackPointer(frame); - if (bytecode == NULL) goto error; - if (frame->bytecode != bytecode) { - int off = this_instr - frame->bytecode; - frame->bytecode = bytecode; - frame->instr_ptr = frame->bytecode + off; + if (frame->tlbc_index != + ((_PyThreadStateImpl *)tstate)->tlbc_index) { + _PyFrame_SetStackPointer(frame, stack_pointer); + _Py_CODEUNIT *bytecode = + _PyEval_GetExecutableCode(tstate, _PyFrame_GetCode(frame)); + stack_pointer = _PyFrame_GetStackPointer(frame); + if (bytecode == NULL) goto error; + _PyFrame_SetStackPointer(frame, stack_pointer); + int off = this_instr - _PyFrame_GetBytecode(frame); + stack_pointer = _PyFrame_GetStackPointer(frame); + frame->tlbc_index = ((_PyThreadStateImpl *)tstate)->tlbc_index; + frame->instr_ptr = bytecode + off; + // Make sure this_instr gets reset correctly for any uops that + // follow next_instr = frame->instr_ptr; DISPATCH(); } @@ -6915,26 +6909,20 @@ // _LOAD_BYTECODE { #ifdef Py_GIL_DISABLED - // Work around a bug in the cases_generator logic that inserts code - // to save and restore the stack pointer. 
Without splitting these - // lines the cases_generator will insert code to save the stack - // pointer before the `#ifdef Py_GIL_DISABLED` and will insert code - // to clear the stack pointer immediately after the call to - // `_PyEval_GetExecutableCode` below. As a result, the stack - // pointer won't properly be cleared in default (with-gil) - // builds. By putting the declaration and assignment on separate - // lines, we cause the cases_generator to correctly insert the code - // to save and clear the stack pointer immediately before and after - // the call to `_PyEval_GetExecutableCode`. - _Py_CODEUNIT *bytecode; - _PyFrame_SetStackPointer(frame, stack_pointer); - bytecode = _PyEval_GetExecutableCode(tstate, _PyFrame_GetCode(frame)); - stack_pointer = _PyFrame_GetStackPointer(frame); - if (bytecode == NULL) goto error; - if (frame->bytecode != bytecode) { - int off = this_instr - frame->bytecode; - frame->bytecode = bytecode; - frame->instr_ptr = frame->bytecode + off; + if (frame->tlbc_index != + ((_PyThreadStateImpl *)tstate)->tlbc_index) { + _PyFrame_SetStackPointer(frame, stack_pointer); + _Py_CODEUNIT *bytecode = + _PyEval_GetExecutableCode(tstate, _PyFrame_GetCode(frame)); + stack_pointer = _PyFrame_GetStackPointer(frame); + if (bytecode == NULL) goto error; + _PyFrame_SetStackPointer(frame, stack_pointer); + int off = this_instr - _PyFrame_GetBytecode(frame); + stack_pointer = _PyFrame_GetStackPointer(frame); + frame->tlbc_index = ((_PyThreadStateImpl *)tstate)->tlbc_index; + frame->instr_ptr = bytecode + off; + // Make sure this_instr gets reset correctly for any uops that + // follow next_instr = frame->instr_ptr; DISPATCH(); } @@ -6995,31 +6983,8 @@ assert((version & _PY_EVAL_EVENTS_MASK) == 0); DEOPT_IF(eval_breaker != version, RESUME); #ifdef Py_GIL_DISABLED - // Work around a bug in the cases_generator logic that inserts code - // to save and restore the stack pointer. 
Without splitting these - // lines the cases_generator will insert code to save the stack - // pointer before the `#ifdef Py_GIL_DISABLED` and will insert code - // to clear the stack pointer immediately after the call to - // `_PyCode_GetTLBCFast` below. As a result, the stack - // pointer won't properly be cleared in default (with-gil) - // builds. By putting the declaration and assignment on separate - // lines, we cause the cases_generator to correctly insert the code - // to save and clear the stack pointer immediately before and after - // the call to `_PyCode_GetTLBCFast`. - _Py_CODEUNIT *bytecode; - _PyFrame_SetStackPointer(frame, stack_pointer); - bytecode = _PyCode_GetTLBCFast(tstate, _PyFrame_GetCode(frame)); - stack_pointer = _PyFrame_GetStackPointer(frame); - DEOPT_IF(bytecode == NULL, RESUME); - if (frame->bytecode != bytecode) { - /* Avoid using this_instr here so that _RESUME_CHECK can be included - in traces. - */ - int off = frame->instr_ptr - frame->bytecode; - frame->bytecode = bytecode; - frame->instr_ptr = frame->bytecode + off; - next_instr = frame->instr_ptr + 1; - } + DEOPT_IF(frame->tlbc_index != + ((_PyThreadStateImpl *)tstate)->tlbc_index, RESUME); #endif DISPATCH(); } From 814e4ca4e54165a7081b1633eeaee1d5e078ee47 Mon Sep 17 00:00:00 2001 From: Matt Page Date: Sun, 13 Oct 2024 20:36:23 -0700 Subject: [PATCH 52/67] Add a test verifying that we clean up tlbc when the code object is destroyed --- Lib/test/test_tlbc.py | 235 ++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 235 insertions(+) create mode 100644 Lib/test/test_tlbc.py diff --git a/Lib/test/test_tlbc.py b/Lib/test/test_tlbc.py new file mode 100644 index 00000000000000..231d32ee6dc5be --- /dev/null +++ b/Lib/test/test_tlbc.py @@ -0,0 +1,235 @@ +"""Tests for thread-local bytecode.""" +import dis +import textwrap +import unittest + +from test import support +from test.support import cpython_only, import_helper, requires_specialization +from test.support.script_helper 
import assert_python_ok +from test.support.threading_helper import requires_working_threading + +# Skip this test if the _testinternalcapi module isn't available +_testinternalcapi = import_helper.import_module("_testinternalcapi") + + +@cpython_only +@requires_working_threading() +@unittest.skipUnless(support.Py_GIL_DISABLED, "only in free-threaded builds") +class TLBCTests(unittest.TestCase): + @requires_specialization + def test_new_threads_start_with_unspecialized_code(self): + code = textwrap.dedent(""" + import dis + import queue + import threading + + from _testinternalcapi import get_tlbc + + def all_opnames(bc): + return {i.opname for i in dis._get_instructions_bytes(bc)} + + def f(a, b, q=None): + if q is not None: + q.put(get_tlbc(f)) + return a + b + + for _ in range(100): + # specialize + f(1, 2) + + q = queue.Queue() + t = threading.Thread(target=f, args=('a', 'b', q)) + t.start() + t.join() + + assert "BINARY_OP_ADD_INT" in all_opnames(get_tlbc(f)) + assert "BINARY_OP_ADD_INT" not in all_opnames(q.get()) + """) + assert_python_ok("-X", "tlbc=1", "-c", code) + + @requires_specialization + def test_threads_specialize_independently(self): + code = textwrap.dedent(""" + import dis + import queue + import threading + + from _testinternalcapi import get_tlbc + + def all_opnames(bc): + return {i.opname for i in dis._get_instructions_bytes(bc)} + + def f(a, b): + return a + b + + def g(a, b, q=None): + for _ in range(100): + f(a, b) + if q is not None: + q.put(get_tlbc(f)) + + # specialize in main thread + g(1, 2) + + # specialize in other thread + q = queue.Queue() + t = threading.Thread(target=g, args=('a', 'b', q)) + t.start() + t.join() + + assert "BINARY_OP_ADD_INT" in all_opnames(get_tlbc(f)) + t_opnames = all_opnames(q.get()) + assert "BINARY_OP_ADD_INT" not in t_opnames + assert "BINARY_OP_ADD_UNICODE" in t_opnames + """) + assert_python_ok("-X", "tlbc=1", "-c", code) + + def test_reuse_tlbc_across_threads_different_lifetimes(self): + code = 
textwrap.dedent(""" + import queue + import threading + + from _testinternalcapi import get_tlbc_id + + def f(a, b, q=None): + if q is not None: + q.put(get_tlbc_id(f)) + return a + b + + q = queue.Queue() + tlbc_ids = [] + for _ in range(3): + t = threading.Thread(target=f, args=('a', 'b', q)) + t.start() + t.join() + tlbc_ids.append(q.get()) + + assert tlbc_ids[0] == tlbc_ids[1] + assert tlbc_ids[1] == tlbc_ids[2] + """) + assert_python_ok("-X", "tlbc=1", "-c", code) + + def test_no_tlbc_if_tlbc_disabled(self): + code = textwrap.dedent(""" + import queue + import threading + + from _testinternalcapi import get_tlbc + + def f(a, b, q=None): + if q is not None: + q.put(get_tlbc(f)) + return a + b + + q = queue.Queue() + threads = [] + for _ in range(3): + t = threading.Thread(target=f, args=('a', 'b', q)) + t.start() + threads.append(t) + + tlbcs = [] + for t in threads: + t.join() + tlbcs.append(q.get()) + + assert get_tlbc(f) is not None + assert tlbcs[0] is None + assert tlbcs[1] is None + assert tlbcs[2] is None + """) + assert_python_ok("-X", "tlbc=0", "-c", code) + + def test_no_specialization_if_tlbc_disabled(self): + code = textwrap.dedent(""" + import dis + import queue + import threading + + from _testinternalcapi import get_tlbc + + def all_opnames(f): + bc = get_tlbc(f) + return {i.opname for i in dis._get_instructions_bytes(bc)} + + def f(a, b): + return a + b + + for _ in range(100): + f(1, 2) + + assert "BINARY_OP_ADD_INT" not in all_opnames(f) + """) + assert_python_ok("-X", "tlbc=0", "-c", code) + + def test_generator_throw(self): + code = textwrap.dedent(""" + import queue + import threading + + from _testinternalcapi import get_tlbc_id + + def g(): + try: + yield + except: + yield get_tlbc_id(g) + + def f(q): + gen = g() + next(gen) + q.put(gen.throw(ValueError)) + + q = queue.Queue() + t = threading.Thread(target=f, args=(q,)) + t.start() + t.join() + + gen = g() + next(gen) + main_id = gen.throw(ValueError) + assert main_id != q.get() + """) + 
assert_python_ok("-X", "tlbc=1", "-c", code) + + def test_tlbc_cleanup(self): + code = textwrap.dedent(""" + import gc + import sys + import threading + + def f(barrier, callee): + barrier.wait() + return callee() + + # Define callee dynamically so that the module body's constants don't + # hold a strong reference to the code object. + ns = {} + exec('def func(): return 42', globals=ns) + callee = ns.pop('func') + + # Create 5 copies of callee's bytecode + threads = [] + barrier = threading.Barrier(5) + for _ in range(barrier.parties): + t = threading.Thread(target=f, args=(barrier, callee)) + t.start() + threads.append(t) + for t in threads: + t.join() + + # Destroy the only reference to callee's code object. All the tlbc + # copies should be destroyed when the code object is destroyed in the + # call to gc.collect below. + before = sys._get_tlbc_blocks() + callee.__code__ = f.__code__ + gc.collect() + after = sys._get_tlbc_blocks() + assert (before - after) == len(threads) + """) + assert_python_ok("-X", "tlbc=1", "-c", code) + + + +if __name__ == "__main__": + unittest.main() From cb8a7749031cb4bd45f39942820fd469f0c644bd Mon Sep 17 00:00:00 2001 From: Matt Page Date: Sun, 13 Oct 2024 21:59:45 -0700 Subject: [PATCH 53/67] Fix indentation --- Python/bytecodes.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/Python/bytecodes.c b/Python/bytecodes.c index 44ca1239c92843..6b10a7acfe4a91 100644 --- a/Python/bytecodes.c +++ b/Python/bytecodes.c @@ -205,7 +205,7 @@ dummy_func( next_instr = frame->instr_ptr; DISPATCH(); } -#endif + #endif } macro(RESUME) = @@ -226,7 +226,7 @@ dummy_func( #ifdef Py_GIL_DISABLED DEOPT_IF(frame->tlbc_index != ((_PyThreadStateImpl *)tstate)->tlbc_index); -#endif + #endif } op(_MONITOR_RESUME, (--)) { From 0f8a55b151f2bb96ff553829a6486402859c7801 Mon Sep 17 00:00:00 2001 From: Matt Page Date: Sun, 13 Oct 2024 22:03:33 -0700 Subject: [PATCH 54/67] Clarify comment --- Include/internal/pycore_frame.h | 5 +++-- 1 file 
changed, 3 insertions(+), 2 deletions(-) diff --git a/Include/internal/pycore_frame.h b/Include/internal/pycore_frame.h index 8a0f7ea387d7d5..e9f608c39cc21f 100644 --- a/Include/internal/pycore_frame.h +++ b/Include/internal/pycore_frame.h @@ -167,8 +167,9 @@ _PyFrame_InitializeTLBC(PyThreadState *tstate, _PyInterpreterFrame *frame, { _Py_CODEUNIT *tlbc = _PyCode_GetTLBCFast(tstate, code); if (tlbc == NULL) { - // No thread-local bytecode exists for this thread yet, use the main - // thread's copy. It will be created on the first RESUME. + // No thread-local bytecode exists for this thread yet; use the main + // thread's copy, deferring thread-local bytecode creation to the + // execution of RESUME. frame->instr_ptr = _PyCode_CODE(code); frame->tlbc_index = 0; } From 70ce0fe7a0ab8c8661b47171e8fce07ddd7b0668 Mon Sep 17 00:00:00 2001 From: Matt Page Date: Sun, 13 Oct 2024 22:28:45 -0700 Subject: [PATCH 55/67] Fix TSAN --- Include/internal/pycore_frame.h | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/Include/internal/pycore_frame.h b/Include/internal/pycore_frame.h index e9f608c39cc21f..42e75d43b90dd4 100644 --- a/Include/internal/pycore_frame.h +++ b/Include/internal/pycore_frame.h @@ -93,8 +93,9 @@ _PyFrame_GetBytecode(_PyInterpreterFrame *f) { #ifdef Py_GIL_DISABLED PyCodeObject *co = _PyFrame_GetCode(f); - assert(f->tlbc_index >= 0 && f->tlbc_index < co->co_tlbc->size); - return (_Py_CODEUNIT *)co->co_tlbc->entries[f->tlbc_index]; + _PyCodeArray *tlbc = _Py_atomic_load_ptr_acquire(&co->co_tlbc); + assert(f->tlbc_index >= 0 && f->tlbc_index < tlbc->size); + return (_Py_CODEUNIT *)tlbc->entries[f->tlbc_index]; #else return _PyCode_CODE(_PyFrame_GetCode(f)); #endif From f512353e861da3d5b5f1017b17ae5c8bb158dbd9 Mon Sep 17 00:00:00 2001 From: Matt Page Date: Sun, 13 Oct 2024 22:35:01 -0700 Subject: [PATCH 56/67] Add test for cleaning up tlbc in correct place, not old emacs buffer --- Lib/test/test_thread_local_bytecode.py | 37 
++++++++++++++++++++++++++ 1 file changed, 37 insertions(+) diff --git a/Lib/test/test_thread_local_bytecode.py b/Lib/test/test_thread_local_bytecode.py index fd4e5ffbe467db..08f7f4e4263c44 100644 --- a/Lib/test/test_thread_local_bytecode.py +++ b/Lib/test/test_thread_local_bytecode.py @@ -192,6 +192,43 @@ def f(q): """) assert_python_ok("-X", "tlbc=1", "-c", code) + def test_tlbc_cleanup(self): + code = textwrap.dedent(""" + import gc + import sys + import threading + + def f(barrier, callee): + barrier.wait() + return callee() + + # Define callee dynamically so that the module body's constants don't + # hold a strong reference to the code object. + ns = {} + exec('def func(): return 42', globals=ns) + callee = ns.pop('func') + + # Create 5 copies of callee's bytecode + threads = [] + barrier = threading.Barrier(5) + for _ in range(barrier.parties): + t = threading.Thread(target=f, args=(barrier, callee)) + t.start() + threads.append(t) + for t in threads: + t.join() + + # Destroy the only reference to callee's code object. All the tlbc + # copies should be destroyed when the code object is destroyed in the + # call to gc.collect below. 
+ before = sys._get_tlbc_blocks() + callee.__code__ = f.__code__ + gc.collect() + after = sys._get_tlbc_blocks() + assert (before - after) == len(threads) + """) + assert_python_ok("-X", "tlbc=1", "-c", code) + if __name__ == "__main__": unittest.main() From 4be2b1fc3dc52620266250f974667cfbc6420264 Mon Sep 17 00:00:00 2001 From: Matt Page Date: Sun, 13 Oct 2024 22:36:26 -0700 Subject: [PATCH 57/67] Remove test_tlbc.py --- Lib/test/test_tlbc.py | 235 ------------------------------------------ 1 file changed, 235 deletions(-) delete mode 100644 Lib/test/test_tlbc.py diff --git a/Lib/test/test_tlbc.py b/Lib/test/test_tlbc.py deleted file mode 100644 index 231d32ee6dc5be..00000000000000 --- a/Lib/test/test_tlbc.py +++ /dev/null @@ -1,235 +0,0 @@ -"""Tests for thread-local bytecode.""" -import dis -import textwrap -import unittest - -from test import support -from test.support import cpython_only, import_helper, requires_specialization -from test.support.script_helper import assert_python_ok -from test.support.threading_helper import requires_working_threading - -# Skip this test if the _testinternalcapi module isn't available -_testinternalcapi = import_helper.import_module("_testinternalcapi") - - -@cpython_only -@requires_working_threading() -@unittest.skipUnless(support.Py_GIL_DISABLED, "only in free-threaded builds") -class TLBCTests(unittest.TestCase): - @requires_specialization - def test_new_threads_start_with_unspecialized_code(self): - code = textwrap.dedent(""" - import dis - import queue - import threading - - from _testinternalcapi import get_tlbc - - def all_opnames(bc): - return {i.opname for i in dis._get_instructions_bytes(bc)} - - def f(a, b, q=None): - if q is not None: - q.put(get_tlbc(f)) - return a + b - - for _ in range(100): - # specialize - f(1, 2) - - q = queue.Queue() - t = threading.Thread(target=f, args=('a', 'b', q)) - t.start() - t.join() - - assert "BINARY_OP_ADD_INT" in all_opnames(get_tlbc(f)) - assert "BINARY_OP_ADD_INT" not in 
all_opnames(q.get()) - """) - assert_python_ok("-X", "tlbc=1", "-c", code) - - @requires_specialization - def test_threads_specialize_independently(self): - code = textwrap.dedent(""" - import dis - import queue - import threading - - from _testinternalcapi import get_tlbc - - def all_opnames(bc): - return {i.opname for i in dis._get_instructions_bytes(bc)} - - def f(a, b): - return a + b - - def g(a, b, q=None): - for _ in range(100): - f(a, b) - if q is not None: - q.put(get_tlbc(f)) - - # specialize in main thread - g(1, 2) - - # specialize in other thread - q = queue.Queue() - t = threading.Thread(target=g, args=('a', 'b', q)) - t.start() - t.join() - - assert "BINARY_OP_ADD_INT" in all_opnames(get_tlbc(f)) - t_opnames = all_opnames(q.get()) - assert "BINARY_OP_ADD_INT" not in t_opnames - assert "BINARY_OP_ADD_UNICODE" in t_opnames - """) - assert_python_ok("-X", "tlbc=1", "-c", code) - - def test_reuse_tlbc_across_threads_different_lifetimes(self): - code = textwrap.dedent(""" - import queue - import threading - - from _testinternalcapi import get_tlbc_id - - def f(a, b, q=None): - if q is not None: - q.put(get_tlbc_id(f)) - return a + b - - q = queue.Queue() - tlbc_ids = [] - for _ in range(3): - t = threading.Thread(target=f, args=('a', 'b', q)) - t.start() - t.join() - tlbc_ids.append(q.get()) - - assert tlbc_ids[0] == tlbc_ids[1] - assert tlbc_ids[1] == tlbc_ids[2] - """) - assert_python_ok("-X", "tlbc=1", "-c", code) - - def test_no_tlbc_if_tlbc_disabled(self): - code = textwrap.dedent(""" - import queue - import threading - - from _testinternalcapi import get_tlbc - - def f(a, b, q=None): - if q is not None: - q.put(get_tlbc(f)) - return a + b - - q = queue.Queue() - threads = [] - for _ in range(3): - t = threading.Thread(target=f, args=('a', 'b', q)) - t.start() - threads.append(t) - - tlbcs = [] - for t in threads: - t.join() - tlbcs.append(q.get()) - - assert get_tlbc(f) is not None - assert tlbcs[0] is None - assert tlbcs[1] is None - assert 
tlbcs[2] is None - """) - assert_python_ok("-X", "tlbc=0", "-c", code) - - def test_no_specialization_if_tlbc_disabled(self): - code = textwrap.dedent(""" - import dis - import queue - import threading - - from _testinternalcapi import get_tlbc - - def all_opnames(f): - bc = get_tlbc(f) - return {i.opname for i in dis._get_instructions_bytes(bc)} - - def f(a, b): - return a + b - - for _ in range(100): - f(1, 2) - - assert "BINARY_OP_ADD_INT" not in all_opnames(f) - """) - assert_python_ok("-X", "tlbc=0", "-c", code) - - def test_generator_throw(self): - code = textwrap.dedent(""" - import queue - import threading - - from _testinternalcapi import get_tlbc_id - - def g(): - try: - yield - except: - yield get_tlbc_id(g) - - def f(q): - gen = g() - next(gen) - q.put(gen.throw(ValueError)) - - q = queue.Queue() - t = threading.Thread(target=f, args=(q,)) - t.start() - t.join() - - gen = g() - next(gen) - main_id = gen.throw(ValueError) - assert main_id != q.get() - """) - assert_python_ok("-X", "tlbc=1", "-c", code) - - def test_tlbc_cleanup(self): - code = textwrap.dedent(""" - import gc - import sys - import threading - - def f(barrier, callee): - barrier.wait() - return callee() - - # Define callee dynamically so that the module body's constants don't - # hold a strong reference to the code object. - ns = {} - exec('def func(): return 42', globals=ns) - callee = ns.pop('func') - - # Create 5 copies of callee's bytecode - threads = [] - barrier = threading.Barrier(5) - for _ in range(barrier.parties): - t = threading.Thread(target=f, args=(barrier, callee)) - t.start() - threads.append(t) - for t in threads: - t.join() - - # Destroy the only reference to callee's code object. All the tlbc - # copies should be destroyed when the code object is destroyed in the - # call to gc.collect below. 
- before = sys._get_tlbc_blocks() - callee.__code__ = f.__code__ - gc.collect() - after = sys._get_tlbc_blocks() - assert (before - after) == len(threads) - """) - assert_python_ok("-X", "tlbc=1", "-c", code) - - - -if __name__ == "__main__": - unittest.main() From ab6222ce472db49da859e98c849e916255919f83 Mon Sep 17 00:00:00 2001 From: Matt Page Date: Thu, 17 Oct 2024 13:40:31 -0700 Subject: [PATCH 58/67] Use int32_t instead of Py_ssize_t for tlbc indices --- Include/internal/pycore_code.h | 4 ++-- Include/internal/pycore_frame.h | 2 +- Include/internal/pycore_index_pool.h | 8 ++++---- Include/internal/pycore_tstate.h | 2 +- Objects/codeobject.c | 4 ++-- Python/index_pool.c | 20 ++++++++++---------- Python/pystate.c | 2 +- 7 files changed, 21 insertions(+), 21 deletions(-) diff --git a/Include/internal/pycore_code.h b/Include/internal/pycore_code.h index 033baeb915b962..4530ab94ce1038 100644 --- a/Include/internal/pycore_code.h +++ b/Include/internal/pycore_code.h @@ -615,7 +615,7 @@ static inline _Py_CODEUNIT * _PyCode_GetTLBCFast(PyThreadState *tstate, PyCodeObject *co) { _PyCodeArray *code = _Py_atomic_load_ptr_acquire(&co->co_tlbc); - Py_ssize_t idx = ((_PyThreadStateImpl*) tstate)->tlbc_index; + int32_t idx = ((_PyThreadStateImpl*) tstate)->tlbc_index; if (idx < code->size && code->entries[idx] != NULL) { return (_Py_CODEUNIT *) code->entries[idx]; } @@ -630,7 +630,7 @@ extern _Py_CODEUNIT *_PyCode_GetTLBC(PyCodeObject *co); // arrays // // Returns the reserved index or -1 on error. 
-extern Py_ssize_t _Py_ReserveTLBCIndex(PyInterpreterState *interp); +extern int32_t _Py_ReserveTLBCIndex(PyInterpreterState *interp); // Release the current thread's index into thread-local bytecode arrays extern void _Py_ClearTLBCIndex(_PyThreadStateImpl *tstate); diff --git a/Include/internal/pycore_frame.h b/Include/internal/pycore_frame.h index 42e75d43b90dd4..8c0100390d036e 100644 --- a/Include/internal/pycore_frame.h +++ b/Include/internal/pycore_frame.h @@ -70,7 +70,7 @@ typedef struct _PyInterpreterFrame { _Py_CODEUNIT *instr_ptr; /* Instruction currently executing (or about to begin) */ #ifdef Py_GIL_DISABLED /* Index of thread-local bytecode containing instr_ptr. */ - Py_ssize_t tlbc_index; + int32_t tlbc_index; #endif _PyStackRef *stackpointer; uint16_t return_offset; /* Only relevant during a function call */ diff --git a/Include/internal/pycore_index_pool.h b/Include/internal/pycore_index_pool.h index 721cc6a8075e3b..e81bfd4d6ed03d 100644 --- a/Include/internal/pycore_index_pool.h +++ b/Include/internal/pycore_index_pool.h @@ -19,7 +19,7 @@ extern "C" { // A min-heap of indices typedef struct _PyIndexHeap { - Py_ssize_t *values; + int32_t *values; // Number of items stored in values Py_ssize_t size; @@ -37,14 +37,14 @@ typedef struct _PyIndexPool { _PyIndexHeap free_indices; // Next index to allocate if no free indices are available - Py_ssize_t next_index; + int32_t next_index; } _PyIndexPool; // Allocate the smallest available index. Returns -1 on error. 
-extern Py_ssize_t _PyIndexPool_AllocIndex(_PyIndexPool *indices); +extern int32_t _PyIndexPool_AllocIndex(_PyIndexPool *indices); // Release `index` back to the pool -extern void _PyIndexPool_FreeIndex(_PyIndexPool *indices, Py_ssize_t index); +extern void _PyIndexPool_FreeIndex(_PyIndexPool *indices, int32_t index); extern void _PyIndexPool_Fini(_PyIndexPool *indices); diff --git a/Include/internal/pycore_tstate.h b/Include/internal/pycore_tstate.h index 19953b2fe6a329..867d4baf0970da 100644 --- a/Include/internal/pycore_tstate.h +++ b/Include/internal/pycore_tstate.h @@ -43,7 +43,7 @@ typedef struct _PyThreadStateImpl { } refcounts; // Index to use to retrieve thread-local bytecode for this thread - Py_ssize_t tlbc_index; + int32_t tlbc_index; #endif #if defined(Py_REF_DEBUG) && defined(Py_GIL_DISABLED) diff --git a/Objects/codeobject.c b/Objects/codeobject.c index 219e7735e9aa9a..f263b7b630bcea 100644 --- a/Objects/codeobject.c +++ b/Objects/codeobject.c @@ -2721,7 +2721,7 @@ _PyCode_Fini(PyInterpreterState *interp) // interpreter and instrumentation use atomics, with specialization taking care // not to overwrite an instruction that was instrumented concurrently. 
-Py_ssize_t +int32_t _Py_ReserveTLBCIndex(PyInterpreterState *interp) { return _PyIndexPool_AllocIndex(&interp->tlbc_indices); @@ -2804,7 +2804,7 @@ get_tlbc_lock_held(PyCodeObject *co) { _PyCodeArray *tlbc = co->co_tlbc; _PyThreadStateImpl *tstate = (_PyThreadStateImpl *)PyThreadState_GET(); - Py_ssize_t idx = tstate->tlbc_index; + int32_t idx = tstate->tlbc_index; if (idx < tlbc->size && tlbc->entries[idx] != NULL) { return (_Py_CODEUNIT *)tlbc->entries[idx]; } diff --git a/Python/index_pool.c b/Python/index_pool.c index 927c57838cf3aa..09787ffabcac7e 100644 --- a/Python/index_pool.c +++ b/Python/index_pool.c @@ -8,9 +8,9 @@ #ifdef Py_GIL_DISABLED static inline void -swap(Py_ssize_t *values, Py_ssize_t i, Py_ssize_t j) +swap(int32_t *values, Py_ssize_t i, Py_ssize_t j) { - Py_ssize_t tmp = values[i]; + int32_t tmp = values[i]; values[i] = values[j]; values[j] = tmp; } @@ -43,19 +43,19 @@ parent(Py_ssize_t i) } static inline Py_ssize_t -left_child(Py_ssize_t i) +left_child(int32_t i) { return 2 * i + 1; } static inline Py_ssize_t -right_child(Py_ssize_t i) +right_child(int32_t i) { return 2 * i + 2; } static void -heap_add(_PyIndexHeap *heap, Py_ssize_t val) +heap_add(_PyIndexHeap *heap, int32_t val) { assert(heap->size < heap->capacity); // Add val to end @@ -86,12 +86,12 @@ heap_min_child(_PyIndexHeap *heap, Py_ssize_t i) return -1; } -static Py_ssize_t +static int32_t heap_pop(_PyIndexHeap *heap) { assert(heap->size > 0); // Pop smallest and replace with the last element - Py_ssize_t result = heap->values[0]; + int32_t result = heap->values[0]; heap->values[0] = heap->values[heap->size - 1]; heap->size--; // Sift down @@ -148,11 +148,11 @@ heap_fini(_PyIndexHeap *heap) #define LOCK_POOL(pool) PyMutex_LockFlags(&pool->mutex, _Py_LOCK_DONT_DETACH) #define UNLOCK_POOL(pool) PyMutex_Unlock(&pool->mutex) -Py_ssize_t +int32_t _PyIndexPool_AllocIndex(_PyIndexPool *pool) { LOCK_POOL(pool); - Py_ssize_t index; + int32_t index; _PyIndexHeap *free_indices = 
&pool->free_indices; if (free_indices->size == 0) { // No free indices. Make sure the heap can always store all of the @@ -177,7 +177,7 @@ _PyIndexPool_AllocIndex(_PyIndexPool *pool) } void -_PyIndexPool_FreeIndex(_PyIndexPool *pool, Py_ssize_t index) +_PyIndexPool_FreeIndex(_PyIndexPool *pool, int32_t index) { LOCK_POOL(pool); heap_add(&pool->free_indices, index); diff --git a/Python/pystate.c b/Python/pystate.c index 8dbbd563d70124..f990a5403aabe6 100644 --- a/Python/pystate.c +++ b/Python/pystate.c @@ -1515,7 +1515,7 @@ new_threadstate(PyInterpreterState *interp, int whence) PyMem_RawFree(new_tstate); return NULL; } - Py_ssize_t tlbc_idx = _Py_ReserveTLBCIndex(interp); + int32_t tlbc_idx = _Py_ReserveTLBCIndex(interp); if (tlbc_idx < 0) { PyMem_RawFree(new_tstate); return NULL; From 6bbb22082cca959ef592097a0e367644b363d769 Mon Sep 17 00:00:00 2001 From: Matt Page Date: Thu, 17 Oct 2024 13:44:34 -0700 Subject: [PATCH 59/67] Use _PyCode_CODE instead of PyFrame_GetBytecode in super_init_without_args --- Objects/typeobject.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/Objects/typeobject.c b/Objects/typeobject.c index c949550eefa836..4654a57ffb4d75 100644 --- a/Objects/typeobject.c +++ b/Objects/typeobject.c @@ -11618,8 +11618,8 @@ super_init_without_args(_PyInterpreterFrame *cframe, PyTypeObject **type_p, if (_PyInterpreterFrame_LASTI(cframe) >= 0) { // MAKE_CELL and COPY_FREE_VARS have no quickened forms, so no need // to use _PyOpcode_Deopt here: - assert(_PyFrame_GetBytecode(cframe)[0].op.code == MAKE_CELL || - _PyFrame_GetBytecode(cframe)[0].op.code == COPY_FREE_VARS); + assert(_PyCode_CODE(co)[0].op.code == MAKE_CELL || + _PyCode_CODE(co)[0].op.code == COPY_FREE_VARS); assert(PyCell_Check(firstarg)); firstarg = PyCell_GET(firstarg); } From 4580e3c9d05672dddd190f040c5f1232870632b0 Mon Sep 17 00:00:00 2001 From: Matt Page Date: Thu, 17 Oct 2024 13:50:17 -0700 Subject: [PATCH 60/67] Update comment --- Python/ceval_macros.h | 7 ++++--- 1 
file changed, 4 insertions(+), 3 deletions(-) diff --git a/Python/ceval_macros.h b/Python/ceval_macros.h index 966d8daa62179d..5df55813a0ddeb 100644 --- a/Python/ceval_macros.h +++ b/Python/ceval_macros.h @@ -312,9 +312,10 @@ GETITEM(PyObject *v, Py_ssize_t i) { } while (0); #ifdef ENABLE_SPECIALIZATION_FT -/* Multiple threads may execute these concurrently if the thread-local bytecode - * limit is reached and they all execute the main copy of the bytecode. This is - * approximate, we do not need the RMW cycle to be atomic. +/* Multiple threads may execute these concurrently if thread-local bytecode is + * disabled and they all execute the main copy of the bytecode. Specialization + * is disabled in that case so the value is unused, but the RMW cycle should be + * free of data races. */ #define RECORD_BRANCH_TAKEN(bitset, flag) \ FT_ATOMIC_STORE_UINT16_RELAXED( \ From b992f446e991aa5247673f2d331b1ca44fc7b0d8 Mon Sep 17 00:00:00 2001 From: Matt Page Date: Thu, 17 Oct 2024 15:30:07 -0700 Subject: [PATCH 61/67] Consolidate _PyCode_{Quicken,DisableSpecialization} into _PyCode_InitCounters --- Objects/codeobject.c | 16 ++++++---------- Python/specialize.c | 36 +++++++++++++----------------------- 2 files changed, 19 insertions(+), 33 deletions(-) diff --git a/Objects/codeobject.c b/Objects/codeobject.c index f263b7b630bcea..517270b1acde12 100644 --- a/Objects/codeobject.c +++ b/Objects/codeobject.c @@ -452,10 +452,10 @@ _PyCode_Validate(struct _PyCodeConstructor *con) return 0; } -extern void _PyCode_Quicken(_Py_CODEUNIT *instructions, Py_ssize_t size); +extern void +_PyCode_InitCounters(_Py_CODEUNIT *instructions, Py_ssize_t size, int enable); #ifdef Py_GIL_DISABLED -extern void _PyCode_DisableSpecialization(_Py_CODEUNIT *instructions, Py_ssize_t size); static _PyCodeArray * _PyCodeArray_New(Py_ssize_t size); #endif @@ -536,14 +536,10 @@ init_code(PyCodeObject *co, struct _PyCodeConstructor *con) } co->_co_firsttraceable = entry_point; #ifdef Py_GIL_DISABLED - if 
(interp->config.tlbc_enabled) { - _PyCode_Quicken(_PyCode_CODE(co), Py_SIZE(co)); - } - else { - _PyCode_DisableSpecialization(_PyCode_CODE(co), Py_SIZE(co)); - } + _PyCode_InitCounters(_PyCode_CODE(co), Py_SIZE(co), + interp->config.tlbc_enabled); #else - _PyCode_Quicken(_PyCode_CODE(co), Py_SIZE(co)); + _PyCode_InitCounters(_PyCode_CODE(co), Py_SIZE(co), 1); #endif notify_code_watchers(PY_CODE_EVENT_CREATE, co); return 0; @@ -2754,7 +2750,7 @@ copy_code(_Py_CODEUNIT *dst, PyCodeObject *co) for (int i = 0; i < code_len; i += _PyInstruction_GetLength(co, i)) { dst[i] = _Py_GetBaseCodeUnit(co, i); } - _PyCode_Quicken(dst, code_len); + _PyCode_InitCounters(dst, code_len, 1); } static Py_ssize_t diff --git a/Python/specialize.c b/Python/specialize.c index a3fb3d27ffe3aa..8c8ca712561ae6 100644 --- a/Python/specialize.c +++ b/Python/specialize.c @@ -455,11 +455,20 @@ do { \ # define SPECIALIZATION_FAIL(opcode, kind) ((void)0) #endif -// Initialize warmup counters and insert superinstructions. This cannot fail. +// Initialize warmup counters. This cannot fail. 
void -_PyCode_Quicken(_Py_CODEUNIT *instructions, Py_ssize_t size) +_PyCode_InitCounters(_Py_CODEUNIT *instructions, Py_ssize_t size, int enable) { #if ENABLE_SPECIALIZATION_FT + _Py_BackoffCounter jump_counter, adaptive_counter; + if (enable) { + jump_counter = initial_jump_backoff_counter(); + adaptive_counter = adaptive_counter_warmup(); + } + else { + jump_counter = initial_unreachable_backoff_counter(); + adaptive_counter = initial_unreachable_backoff_counter(); + } int opcode = 0; /* The last code unit cannot have a cache, so we don't need to check it */ for (Py_ssize_t i = 0; i < size-1; i++) { @@ -469,7 +478,7 @@ _PyCode_Quicken(_Py_CODEUNIT *instructions, Py_ssize_t size) // The initial value depends on the opcode switch (opcode) { case JUMP_BACKWARD: - instructions[i + 1].counter = initial_jump_backoff_counter(); + instructions[i + 1].counter = jump_counter; break; case POP_JUMP_IF_FALSE: case POP_JUMP_IF_TRUE: @@ -478,7 +487,7 @@ _PyCode_Quicken(_Py_CODEUNIT *instructions, Py_ssize_t size) instructions[i + 1].cache = 0x5555; // Alternating 0, 1 bits break; default: - instructions[i + 1].counter = adaptive_counter_warmup(); + instructions[i + 1].counter = adaptive_counter; break; } i += caches; @@ -487,25 +496,6 @@ _PyCode_Quicken(_Py_CODEUNIT *instructions, Py_ssize_t size) #endif /* ENABLE_SPECIALIZATION_FT */ } -#ifdef Py_GIL_DISABLED - -void -_PyCode_DisableSpecialization(_Py_CODEUNIT *instructions, Py_ssize_t size) -{ - /* The last code unit cannot have a cache, so we don't need to check it */ - for (Py_ssize_t i = 0; i < size - 1; i++) { - int opcode = instructions[i].op.code; - int caches = _PyOpcode_Caches[opcode]; - if (caches) { - instructions[i + 1].counter = - initial_unreachable_backoff_counter(); - i += caches; - } - } -} - -#endif - #define SIMPLE_FUNCTION 0 /* Common */ From 5b7658cbbf6df0e86ce7a3143b7c8354a2b9f295 Mon Sep 17 00:00:00 2001 From: Matt Page Date: Fri, 18 Oct 2024 09:47:26 -0700 Subject: [PATCH 62/67] Fix incorrect types --- 
Python/index_pool.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/Python/index_pool.c b/Python/index_pool.c index 09787ffabcac7e..526eccff74af00 100644 --- a/Python/index_pool.c +++ b/Python/index_pool.c @@ -43,13 +43,13 @@ parent(Py_ssize_t i) } static inline Py_ssize_t -left_child(int32_t i) +left_child(Py_ssize_t i) { return 2 * i + 1; } static inline Py_ssize_t -right_child(int32_t i) +right_child(Py_ssize_t i) { return 2 * i + 2; } @@ -121,7 +121,7 @@ heap_ensure_capacity(_PyIndexHeap *heap, Py_ssize_t limit) if (!new_capacity) { return -1; } - Py_ssize_t *new_values = PyMem_RawCalloc(new_capacity, sizeof(Py_ssize_t)); + int32_t *new_values = PyMem_RawCalloc(new_capacity, sizeof(int32_t)); if (new_values == NULL) { return -1; } From bec5bce4fd46b78b1b661ed991bcd4fc5adb9b48 Mon Sep 17 00:00:00 2001 From: Matt Page Date: Fri, 18 Oct 2024 09:57:05 -0700 Subject: [PATCH 63/67] Add command-line tests for enabling TLBC --- Lib/test/test_cmd_line.py | 17 +++++++++++++++++ 1 file changed, 17 insertions(+) diff --git a/Lib/test/test_cmd_line.py b/Lib/test/test_cmd_line.py index f088e5b8b5089c..64bf310d569c14 100644 --- a/Lib/test/test_cmd_line.py +++ b/Lib/test/test_cmd_line.py @@ -1084,6 +1084,23 @@ def test(x, y): assert_python_ok("-W", "always", "-X", "tlbc=0", "-c", code) assert_python_ok("-W", "always", "-c", code, PYTHON_TLBC="0") + @unittest.skipUnless(support.Py_GIL_DISABLED, + "PYTHON_TLBC and -X tlbc" + " only supported in Py_GIL_DISABLED builds") + @threading_helper.requires_working_threading() + def test_enable_thread_local_bytecode(self): + code = """if 1: + import threading + def test(x, y): + return x + y + t = threading.Thread(target=test, args=(1,2)) + t.start() + t.join()""" + # The functionality of thread-local bytecode is tested more extensively + # in test_thread_local_bytecode + assert_python_ok("-W", "always", "-X", "tlbc=1", "-c", code) + assert_python_ok("-W", "always", "-c", code, PYTHON_TLBC="1") + 
@unittest.skipUnless(support.Py_GIL_DISABLED, "PYTHON_TLBC and -X tlbc" " only supported in Py_GIL_DISABLED builds") From c9054b7ab44e59ec5de1709c95aeb8eb2231877d Mon Sep 17 00:00:00 2001 From: Matt Page Date: Fri, 18 Oct 2024 15:03:33 -0700 Subject: [PATCH 64/67] Update libpython.py for tlbc_index --- Tools/gdb/libpython.py | 22 +++++++++++++++++----- 1 file changed, 17 insertions(+), 5 deletions(-) diff --git a/Tools/gdb/libpython.py b/Tools/gdb/libpython.py index ae24ee83df9fe5..ed254152d7da41 100755 --- a/Tools/gdb/libpython.py +++ b/Tools/gdb/libpython.py @@ -77,9 +77,9 @@ def _managed_dict_offset(): else: return -3 * _sizeof_void_p() -def _interp_frame_has_bytecode(): +def _interp_frame_has_tlbc_index(): interp_frame = gdb.lookup_type("_PyInterpreterFrame") - return any(field.name == "bytecode" for field in interp_frame.fields()) + return any(field.name == "tlbc_index" for field in interp_frame.fields()) Py_TPFLAGS_INLINE_VALUES = (1 << 2) @@ -109,7 +109,7 @@ def _interp_frame_has_bytecode(): UNABLE_READ_INFO_PYTHON_FRAME = 'Unable to read information on python frame' EVALFRAME = '_PyEval_EvalFrameDefault' -INTERP_FRAME_HAS_BYTECODE = _interp_frame_has_bytecode() +INTERP_FRAME_HAS_TLBC_INDEX = _interp_frame_has_tlbc_index() class NullPyObjectPtr(RuntimeError): pass @@ -699,6 +699,16 @@ def parse_location_table(firstlineno, linetable): yield addr, end_addr, line addr = end_addr + +class PyCodeArrayPtr: + def __init__(self, gdbval): + self._gdbval = gdbval + + def get_entry(self, index): + assert (index >= 0) and (index < self._gdbval["size"]) + return self._gdbval["entries"][index] + + class PyCodeObjectPtr(PyObjectPtr): """ Class wrapping a gdb.Value that's a PyCodeObject* i.e. 
a instance @@ -1091,8 +1101,10 @@ def _f_nlocalsplus(self): def _f_lasti(self): codeunit_p = gdb.lookup_type("_Py_CODEUNIT").pointer() instr_ptr = self._gdbval["instr_ptr"] - if INTERP_FRAME_HAS_BYTECODE: - first_instr = self._gdbval["bytecode"].cast(codeunit_p) + if INTERP_FRAME_HAS_TLBC_INDEX: + tlbc_index = self._gdbval["tlbc_index"] + code_arr = PyCodeArrayPtr(self._f_code().field("co_tlbc")) + first_instr = code_arr.get_entry(tlbc_index).cast(codeunit_p) else: first_instr = self._f_code().field("co_code_adaptive").cast(codeunit_p) return int(instr_ptr - first_instr) From 1a48ab2827ad0f4db61c1aefed8164ddb5df1644 Mon Sep 17 00:00:00 2001 From: Matt Page Date: Fri, 18 Oct 2024 22:18:58 -0700 Subject: [PATCH 65/67] Avoid special casing in _PyEval_GetExecutableCode It's cleaner to assign all threads the index of the main copy of the bytecode when tlbc is disabled rather than adding a special case in _PyEval_GetExecutableCode. --- Include/internal/pycore_ceval.h | 3 --- Lib/test/test_thread_local_bytecode.py | 19 ++++++++++--------- Objects/codeobject.c | 13 ++++++++++--- 3 files changed, 20 insertions(+), 15 deletions(-) diff --git a/Include/internal/pycore_ceval.h b/Include/internal/pycore_ceval.h index 38d43dec401f29..a3cdda9b1fa6a0 100644 --- a/Include/internal/pycore_ceval.h +++ b/Include/internal/pycore_ceval.h @@ -186,9 +186,6 @@ _PyEval_GetExecutableCode(PyThreadState *tstate, PyCodeObject *co) if (bc != NULL) { return bc; } - if (!_PyInterpreterState_GET()->config.tlbc_enabled) { - return _PyCode_CODE(co); - } return _PyCode_GetTLBC(co); } diff --git a/Lib/test/test_thread_local_bytecode.py b/Lib/test/test_thread_local_bytecode.py index 08f7f4e4263c44..604056ed7de1ba 100644 --- a/Lib/test/test_thread_local_bytecode.py +++ b/Lib/test/test_thread_local_bytecode.py @@ -109,16 +109,16 @@ def f(a, b, q=None): """) assert_python_ok("-X", "tlbc=1", "-c", code) - def test_no_tlbc_if_tlbc_disabled(self): + def test_no_copies_if_tlbc_disabled(self): code = 
textwrap.dedent(""" import queue import threading - from _testinternalcapi import get_tlbc + from _testinternalcapi import get_tlbc_id def f(a, b, q=None): if q is not None: - q.put(get_tlbc(f)) + q.put(get_tlbc_id(f)) return a + b q = queue.Queue() @@ -128,15 +128,16 @@ def f(a, b, q=None): t.start() threads.append(t) - tlbcs = [] + tlbc_ids = [] for t in threads: t.join() - tlbcs.append(q.get()) + tlbc_ids.append(q.get()) - assert get_tlbc(f) is not None - assert tlbcs[0] is None - assert tlbcs[1] is None - assert tlbcs[2] is None + main_tlbc_id = get_tlbc_id(f) + assert main_tlbc_id is not None + assert tlbc_ids[0] == main_tlbc_id + assert tlbc_ids[1] == main_tlbc_id + assert tlbc_ids[2] == main_tlbc_id """) assert_python_ok("-X", "tlbc=0", "-c", code) diff --git a/Objects/codeobject.c b/Objects/codeobject.c index 517270b1acde12..6d5fad0ea1a1ae 100644 --- a/Objects/codeobject.c +++ b/Objects/codeobject.c @@ -2711,7 +2711,8 @@ _PyCode_Fini(PyInterpreterState *interp) // // Thread-local bytecode can be disabled at runtime by providing either `-X // tlbc=0` or `PYTHON_TLBC=0`. Disabling thread-local bytecode also disables -// specialization. +// specialization. All threads share the main copy of the bytecode when +// thread-local bytecode is disabled. 
// // Concurrent modifications to the bytecode made by the specializing // interpreter and instrumentation use atomics, with specialization taking care @@ -2720,14 +2721,20 @@ _PyCode_Fini(PyInterpreterState *interp) int32_t _Py_ReserveTLBCIndex(PyInterpreterState *interp) { - return _PyIndexPool_AllocIndex(&interp->tlbc_indices); + if (interp->config.tlbc_enabled) { + return _PyIndexPool_AllocIndex(&interp->tlbc_indices); + } + // All threads share the main copy of the bytecode when TLBC is disabled + return 0; } void _Py_ClearTLBCIndex(_PyThreadStateImpl *tstate) { PyInterpreterState *interp = ((PyThreadState *)tstate)->interp; - _PyIndexPool_FreeIndex(&interp->tlbc_indices, tstate->tlbc_index); + if (interp->config.tlbc_enabled) { + _PyIndexPool_FreeIndex(&interp->tlbc_indices, tstate->tlbc_index); + } } static _PyCodeArray * From c10749537025eee20b5b673a05632f23b0f69bbf Mon Sep 17 00:00:00 2001 From: Matt Page Date: Wed, 23 Oct 2024 16:53:40 -0700 Subject: [PATCH 66/67] Clear TLBC when other caches are cleared --- Include/internal/pycore_code.h | 5 ++ Include/internal/pycore_gc.h | 4 + Lib/test/libregrtest/refleak.py | 5 -- Objects/codeobject.c | 135 ++++++++++++++++++++++++++++++++ Python/gc_free_threading.c | 12 ++- Python/sysmodule.c | 5 ++ 6 files changed, 158 insertions(+), 8 deletions(-) diff --git a/Include/internal/pycore_code.h b/Include/internal/pycore_code.h index 4530ab94ce1038..a0acf76db6f04d 100644 --- a/Include/internal/pycore_code.h +++ b/Include/internal/pycore_code.h @@ -634,6 +634,11 @@ extern int32_t _Py_ReserveTLBCIndex(PyInterpreterState *interp); // Release the current thread's index into thread-local bytecode arrays extern void _Py_ClearTLBCIndex(_PyThreadStateImpl *tstate); + +// Free all TLBC copies not associated with live threads. +// +// Returns 0 on success or -1 on error. 
+extern int _Py_ClearUnusedTLBC(PyInterpreterState *interp); #endif #ifdef __cplusplus diff --git a/Include/internal/pycore_gc.h b/Include/internal/pycore_gc.h index cf96f661e6cd7e..20cad938d44bbc 100644 --- a/Include/internal/pycore_gc.h +++ b/Include/internal/pycore_gc.h @@ -397,6 +397,10 @@ extern int _PyGC_VisitStackRef(union _PyStackRef *ref, visitproc visit, void *ar } \ } while (0) +#ifdef Py_GIL_DISABLED +extern void _PyGC_VisitObjectsWorldStopped(PyInterpreterState *interp, + gcvisitobjects_t callback, void *arg); +#endif #ifdef __cplusplus } diff --git a/Lib/test/libregrtest/refleak.py b/Lib/test/libregrtest/refleak.py index b9d0e95081ded6..fa447a4336a399 100644 --- a/Lib/test/libregrtest/refleak.py +++ b/Lib/test/libregrtest/refleak.py @@ -145,11 +145,6 @@ def get_pooled_int(value): # Use an internal-only keyword argument that mypy doesn't know yet _only_immortal=True) # type: ignore[call-arg] alloc_after = getallocatedblocks() - interned_immortal_after - if _get_tlbc_blocks := getattr(sys, "_get_tlbc_blocks", None): - # Ignore any thread-local bytecode that was allocated. 
These will be - # released when the code object is destroyed, typically at runtime - # shutdown - alloc_after -= _get_tlbc_blocks() rc_after = gettotalrefcount() fd_after = fd_count() diff --git a/Objects/codeobject.c b/Objects/codeobject.c index 6d5fad0ea1a1ae..030aa245030d13 100644 --- a/Objects/codeobject.c +++ b/Objects/codeobject.c @@ -10,6 +10,7 @@ #include "pycore_initconfig.h" // _PyStatus_OK() #include "pycore_interp.h" // PyInterpreterState.co_extra_freefuncs #include "pycore_object.h" // _PyObject_SetDeferredRefcount +#include "pycore_object_stack.h" #include "pycore_opcode_metadata.h" // _PyOpcode_Deopt, _PyOpcode_Caches #include "pycore_opcode_utils.h" // RESUME_AT_FUNC_START #include "pycore_pymem.h" // _PyMem_FreeDelayed @@ -2824,4 +2825,138 @@ _PyCode_GetTLBC(PyCodeObject *co) return result; } +// My kingdom for a bitset +struct flag_set { + uint8_t *flags; + Py_ssize_t size; +}; + +static inline int +flag_is_set(struct flag_set *flags, Py_ssize_t idx) +{ + assert(idx >= 0); + return (idx < flags->size) && flags->flags[idx]; +} + +// Set the flag for each tlbc index in use +static int +get_indices_in_use(PyInterpreterState *interp, struct flag_set *in_use) +{ + assert(interp->stoptheworld.world_stopped); + assert(in_use->flags == NULL); + int32_t max_index = 0; + for (PyThreadState *p = interp->threads.head; p != NULL; p = p->next) { + int32_t idx = ((_PyThreadStateImpl *) p)->tlbc_index; + if (idx > max_index) { + max_index = idx; + } + } + in_use->size = (size_t) max_index + 1; + in_use->flags = PyMem_Calloc(in_use->size, sizeof(*in_use->flags)); + if (in_use->flags == NULL) { + return -1; + } + for (PyThreadState *p = interp->threads.head; p != NULL; p = p->next) { + in_use->flags[((_PyThreadStateImpl *) p)->tlbc_index] = 1; + } + return 0; +} + +struct get_code_args { + _PyObjectStack code_objs; + struct flag_set indices_in_use; + int err; +}; + +static void +clear_get_code_args(struct get_code_args *args) +{ + if (args->indices_in_use.flags != 
NULL) { + PyMem_Free(args->indices_in_use.flags); + args->indices_in_use.flags = NULL; + } + _PyObjectStack_Clear(&args->code_objs); +} + +static inline int +is_bytecode_unused(_PyCodeArray *tlbc, Py_ssize_t idx, + struct flag_set *indices_in_use) +{ + assert(idx > 0 && idx < tlbc->size); + return tlbc->entries[idx] != NULL && !flag_is_set(indices_in_use, idx); +} + +static int +get_code_with_unused_tlbc(PyObject *obj, struct get_code_args *args) +{ + if (!PyCode_Check(obj)) { + return 1; + } + PyCodeObject *co = (PyCodeObject *) obj; + _PyCodeArray *tlbc = co->co_tlbc; + // The first index always points at the main copy of the bytecode embedded + // in the code object. + for (Py_ssize_t i = 1; i < tlbc->size; i++) { + if (is_bytecode_unused(tlbc, i, &args->indices_in_use)) { + if (_PyObjectStack_Push(&args->code_objs, obj) < 0) { + args->err = -1; + return 0; + } + return 1; + } + } + return 1; +} + +static void +free_unused_bytecode(PyCodeObject *co, struct flag_set *indices_in_use) +{ + _PyCodeArray *tlbc = co->co_tlbc; + // The first index always points at the main copy of the bytecode embedded + // in the code object. + for (Py_ssize_t i = 1; i < tlbc->size; i++) { + if (is_bytecode_unused(tlbc, i, indices_in_use)) { + PyMem_Free(tlbc->entries[i]); + tlbc->entries[i] = NULL; + } + } +} + +int +_Py_ClearUnusedTLBC(PyInterpreterState *interp) +{ + struct get_code_args args = { + .code_objs = {NULL}, + .indices_in_use = {NULL, 0}, + .err = 0, + }; + _PyEval_StopTheWorld(interp); + // Collect in-use tlbc indices + if (get_indices_in_use(interp, &args.indices_in_use) < 0) { + goto err; + } + // Collect code objects that have bytecode not in use by any thread + _PyGC_VisitObjectsWorldStopped( + interp, (gcvisitobjects_t)get_code_with_unused_tlbc, &args); + if (args.err < 0) { + goto err; + } + // Free unused bytecode. This must happen outside of gc_visit_heaps; it is + // unsafe to allocate or free any mimalloc managed memory when it's + // running. 
+ PyObject *obj; + while ((obj = _PyObjectStack_Pop(&args.code_objs)) != NULL) { + free_unused_bytecode((PyCodeObject*) obj, &args.indices_in_use); + } + _PyEval_StartTheWorld(interp); + clear_get_code_args(&args); + return 0; + +err: + _PyEval_StartTheWorld(interp); + clear_get_code_args(&args); + PyErr_NoMemory(); + return -1; +} + #endif diff --git a/Python/gc_free_threading.c b/Python/gc_free_threading.c index 8558d4555a9a3a..6e49ee4a7026db 100644 --- a/Python/gc_free_threading.c +++ b/Python/gc_free_threading.c @@ -1962,16 +1962,22 @@ custom_visitor_wrapper(const mi_heap_t *heap, const mi_heap_area_t *area, } void -PyUnstable_GC_VisitObjects(gcvisitobjects_t callback, void *arg) +_PyGC_VisitObjectsWorldStopped(PyInterpreterState *interp, + gcvisitobjects_t callback, void *arg) { - PyInterpreterState *interp = _PyInterpreterState_GET(); struct custom_visitor_args wrapper = { .callback = callback, .arg = arg, }; + gc_visit_heaps(interp, &custom_visitor_wrapper, &wrapper.base); +} +void +PyUnstable_GC_VisitObjects(gcvisitobjects_t callback, void *arg) +{ + PyInterpreterState *interp = _PyInterpreterState_GET(); _PyEval_StopTheWorld(interp); - gc_visit_heaps(interp, &custom_visitor_wrapper, &wrapper.base); + _PyGC_VisitObjectsWorldStopped(interp, callback, arg); _PyEval_StartTheWorld(interp); } diff --git a/Python/sysmodule.c b/Python/sysmodule.c index 079e9d3158cf50..bb368a6c170178 100644 --- a/Python/sysmodule.c +++ b/Python/sysmodule.c @@ -2175,6 +2175,11 @@ sys__clear_internal_caches_impl(PyObject *module) #ifdef _Py_TIER2 PyInterpreterState *interp = _PyInterpreterState_GET(); _Py_Executors_InvalidateAll(interp, 0); +#endif +#ifdef Py_GIL_DISABLED + if (_Py_ClearUnusedTLBC(_PyInterpreterState_GET()) < 0) { + return NULL; + } #endif PyType_ClearCache(); Py_RETURN_NONE; From 07f91409a6a4a933b26b23eb79e41da4d6c77918 Mon Sep 17 00:00:00 2001 From: Matt Page Date: Wed, 23 Oct 2024 21:09:40 -0700 Subject: [PATCH 67/67] Remove _get_tlbc_blocks --- 
Lib/test/test_thread_local_bytecode.py | 37 ------------------------- Python/clinic/sysmodule.c.h | 38 +------------------------- Python/sysmodule.c | 36 ------------------------ 3 files changed, 1 insertion(+), 110 deletions(-) diff --git a/Lib/test/test_thread_local_bytecode.py b/Lib/test/test_thread_local_bytecode.py index 604056ed7de1ba..7a8809c5ae7697 100644 --- a/Lib/test/test_thread_local_bytecode.py +++ b/Lib/test/test_thread_local_bytecode.py @@ -193,43 +193,6 @@ def f(q): """) assert_python_ok("-X", "tlbc=1", "-c", code) - def test_tlbc_cleanup(self): - code = textwrap.dedent(""" - import gc - import sys - import threading - - def f(barrier, callee): - barrier.wait() - return callee() - - # Define callee dynamically so that the module body's constants don't - # hold a strong reference to the code object. - ns = {} - exec('def func(): return 42', globals=ns) - callee = ns.pop('func') - - # Create 5 copies of callee's bytecode - threads = [] - barrier = threading.Barrier(5) - for _ in range(barrier.parties): - t = threading.Thread(target=f, args=(barrier, callee)) - t.start() - threads.append(t) - for t in threads: - t.join() - - # Destroy the only reference to callee's code object. All the tlbc - # copies should be destroyed when the code object is destroyed in the - # call to gc.collect below. 
- before = sys._get_tlbc_blocks() - callee.__code__ = f.__code__ - gc.collect() - after = sys._get_tlbc_blocks() - assert (before - after) == len(threads) - """) - assert_python_ok("-X", "tlbc=1", "-c", code) - if __name__ == "__main__": unittest.main() diff --git a/Python/clinic/sysmodule.c.h b/Python/clinic/sysmodule.c.h index dd205bf203a457..8277d286cf51ef 100644 --- a/Python/clinic/sysmodule.c.h +++ b/Python/clinic/sysmodule.c.h @@ -1571,38 +1571,6 @@ sys__is_gil_enabled(PyObject *module, PyObject *Py_UNUSED(ignored)) return return_value; } -#if defined(Py_GIL_DISABLED) - -PyDoc_STRVAR(sys__get_tlbc_blocks__doc__, -"_get_tlbc_blocks($module, /)\n" -"--\n" -"\n" -"Return the total number of thread-local bytecode copies, excluding the copies that are embedded in the code object."); - -#define SYS__GET_TLBC_BLOCKS_METHODDEF \ - {"_get_tlbc_blocks", (PyCFunction)sys__get_tlbc_blocks, METH_NOARGS, sys__get_tlbc_blocks__doc__}, - -static Py_ssize_t -sys__get_tlbc_blocks_impl(PyObject *module); - -static PyObject * -sys__get_tlbc_blocks(PyObject *module, PyObject *Py_UNUSED(ignored)) -{ - PyObject *return_value = NULL; - Py_ssize_t _return_value; - - _return_value = sys__get_tlbc_blocks_impl(module); - if ((_return_value == -1) && PyErr_Occurred()) { - goto exit; - } - return_value = PyLong_FromSsize_t(_return_value); - -exit: - return return_value; -} - -#endif /* defined(Py_GIL_DISABLED) */ - #ifndef SYS_GETWINDOWSVERSION_METHODDEF #define SYS_GETWINDOWSVERSION_METHODDEF #endif /* !defined(SYS_GETWINDOWSVERSION_METHODDEF) */ @@ -1646,8 +1614,4 @@ sys__get_tlbc_blocks(PyObject *module, PyObject *Py_UNUSED(ignored)) #ifndef SYS_GETANDROIDAPILEVEL_METHODDEF #define SYS_GETANDROIDAPILEVEL_METHODDEF #endif /* !defined(SYS_GETANDROIDAPILEVEL_METHODDEF) */ - -#ifndef SYS__GET_TLBC_BLOCKS_METHODDEF - #define SYS__GET_TLBC_BLOCKS_METHODDEF -#endif /* !defined(SYS__GET_TLBC_BLOCKS_METHODDEF) */ -/*[clinic end generated code: output=fca6c27bfc0c17ac input=a9049054013a1b77]*/ 
+/*[clinic end generated code: output=9cc9069aef1482bc input=a9049054013a1b77]*/ diff --git a/Python/sysmodule.c b/Python/sysmodule.c index bb368a6c170178..97dc6dc82b3f8b 100644 --- a/Python/sysmodule.c +++ b/Python/sysmodule.c @@ -2448,41 +2448,6 @@ sys__is_gil_enabled_impl(PyObject *module) #endif } -#ifdef Py_GIL_DISABLED -static int -count_tlbc_blocks(PyObject *obj, Py_ssize_t *count) -{ - if (PyCode_Check(obj)) { - _PyCodeArray *tlbc = ((PyCodeObject *)obj)->co_tlbc; - // First entry always points to the bytecode at the end of the code - // object. Exclude it from the count as it is allocated as part of - // creating the code object. - for (Py_ssize_t i = 1; i < tlbc->size; i++) { - if (tlbc->entries[i] != NULL) { - (*count)++; - } - } - } - return 1; -} - -/*[clinic input] -sys._get_tlbc_blocks -> Py_ssize_t - -Return the total number of thread-local bytecode copies, excluding the copies that are embedded in the code object. -[clinic start generated code]*/ - -static Py_ssize_t -sys__get_tlbc_blocks_impl(PyObject *module) -/*[clinic end generated code: output=4b4e350583cbd643 input=37c14e47d8905a95]*/ -{ - Py_ssize_t count = 0; - PyUnstable_GC_VisitObjects((gcvisitobjects_t) count_tlbc_blocks, &count); - return count; -} -#endif /* Py_GIL_DISABLED */ - - static PerfMapState perf_map_state; @@ -2658,7 +2623,6 @@ static PyMethodDef sys_methods[] = { #endif SYS__GET_CPU_COUNT_CONFIG_METHODDEF SYS__IS_GIL_ENABLED_METHODDEF - SYS__GET_TLBC_BLOCKS_METHODDEF {NULL, NULL} // sentinel };