From 2e95c5ba3bf7e5004c7e2304afda4a8f8e2443a7 Mon Sep 17 00:00:00 2001 From: mpage Date: Mon, 4 Nov 2024 11:13:32 -0800 Subject: [PATCH] gh-115999: Implement thread-local bytecode and enable specialization for `BINARY_OP` (#123926) Each thread specializes a thread-local copy of the bytecode, created on the first RESUME, in free-threaded builds. All copies of the bytecode for a code object are stored in the co_tlbc array on the code object. Each thread reserves a globally unique index identifying its copy of the bytecode in all co_tlbc arrays at thread creation and releases the index at thread destruction. The first entry in every co_tlbc array always points to the "main" copy of the bytecode that is stored at the end of the code object. This ensures that no bytecode is copied for programs that do not use threads. Thread-local bytecode can be disabled at runtime by providing either -X tlbc=0 or PYTHON_TLBC=0. Disabling thread-local bytecode also disables specialization. Concurrent modifications to the bytecode made by the specializing interpreter and instrumentation use atomics, with specialization taking care not to overwrite an instruction that was instrumented concurrently. 
--- Include/cpython/code.h | 19 ++ Include/cpython/initconfig.h | 1 + Include/internal/pycore_ceval.h | 12 + Include/internal/pycore_code.h | 41 ++++ Include/internal/pycore_frame.h | 56 ++++- Include/internal/pycore_gc.h | 4 + Include/internal/pycore_index_pool.h | 56 +++++ Include/internal/pycore_interp.h | 2 + Include/internal/pycore_tstate.h | 4 +- Include/internal/pycore_uop_ids.h | 123 +++++----- Include/internal/pycore_uop_metadata.h | 2 +- Lib/test/support/__init__.py | 5 + Lib/test/test_capi/test_config.py | 1 + Lib/test/test_capi/test_opt.py | 7 +- Lib/test/test_cmd_line.py | 52 ++++ Lib/test/test_dis.py | 8 +- Lib/test/test_embed.py | 1 + Lib/test/test_sys.py | 14 +- Lib/test/test_thread_local_bytecode.py | 198 ++++++++++++++++ Makefile.pre.in | 2 + Modules/_opcode.c | 3 + Modules/_testinternalcapi.c | 46 +++- Objects/codeobject.c | 313 ++++++++++++++++++++++++- Objects/frameobject.c | 14 +- Objects/typeobject.c | 7 +- PCbuild/_freeze_module.vcxproj | 1 + PCbuild/_freeze_module.vcxproj.filters | 3 + PCbuild/pythoncore.vcxproj | 2 + PCbuild/pythoncore.vcxproj.filters | 6 + Python/bytecodes.c | 68 +++--- Python/ceval.c | 23 +- Python/ceval_macros.h | 22 +- Python/executor_cases.c.h | 23 +- Python/frame.c | 3 +- Python/gc_free_threading.c | 12 +- Python/generated_cases.c.h | 100 +++++--- Python/index_pool.c | 193 +++++++++++++++ Python/initconfig.c | 49 +++- Python/instrumentation.c | 159 +++++++------ Python/optimizer_cases.c.h | 2 + Python/pystate.c | 10 + Python/specialize.c | 68 ++++-- Python/sysmodule.c | 5 + Tools/gdb/libpython.py | 23 +- 44 files changed, 1509 insertions(+), 254 deletions(-) create mode 100644 Include/internal/pycore_index_pool.h create mode 100644 Lib/test/test_thread_local_bytecode.py create mode 100644 Python/index_pool.c diff --git a/Include/cpython/code.h b/Include/cpython/code.h index 2561b2b88baacc..370f1d259abe0f 100644 --- a/Include/cpython/code.h +++ b/Include/cpython/code.h @@ -72,6 +72,24 @@ typedef struct { uint8_t 
*per_instruction_tools; } _PyCoMonitoringData; +#ifdef Py_GIL_DISABLED + +/* Each thread specializes a thread-local copy of the bytecode in free-threaded + * builds. These copies are stored on the code object in a `_PyCodeArray`. The + * first entry in the array always points to the "main" copy of the bytecode + * that is stored at the end of the code object. + */ +typedef struct { + Py_ssize_t size; + char *entries[1]; +} _PyCodeArray; + +#define _PyCode_DEF_THREAD_LOCAL_BYTECODE() \ + _PyCodeArray *co_tlbc; +#else +#define _PyCode_DEF_THREAD_LOCAL_BYTECODE() +#endif + // To avoid repeating ourselves in deepfreeze.py, all PyCodeObject members are // defined in this macro: #define _PyCode_DEF(SIZE) { \ @@ -138,6 +156,7 @@ typedef struct { Type is a void* to keep the format private in codeobject.c to force \ people to go through the proper APIs. */ \ void *co_extra; \ + _PyCode_DEF_THREAD_LOCAL_BYTECODE() \ char co_code_adaptive[(SIZE)]; \ } diff --git a/Include/cpython/initconfig.h b/Include/cpython/initconfig.h index c2cb4e3cdd92fb..f69c586a4f96f3 100644 --- a/Include/cpython/initconfig.h +++ b/Include/cpython/initconfig.h @@ -183,6 +183,7 @@ typedef struct PyConfig { int cpu_count; #ifdef Py_GIL_DISABLED int enable_gil; + int tlbc_enabled; #endif /* --- Path configuration inputs ------------ */ diff --git a/Include/internal/pycore_ceval.h b/Include/internal/pycore_ceval.h index 411bbff106dd69..80bd19a887871c 100644 --- a/Include/internal/pycore_ceval.h +++ b/Include/internal/pycore_ceval.h @@ -174,6 +174,18 @@ _PyEval_IsGILEnabled(PyThreadState *tstate) extern int _PyEval_EnableGILTransient(PyThreadState *tstate); extern int _PyEval_EnableGILPermanent(PyThreadState *tstate); extern int _PyEval_DisableGIL(PyThreadState *state); + + +static inline _Py_CODEUNIT * +_PyEval_GetExecutableCode(PyThreadState *tstate, PyCodeObject *co) +{ + _Py_CODEUNIT *bc = _PyCode_GetTLBCFast(tstate, co); + if (bc != NULL) { + return bc; + } + return _PyCode_GetTLBC(co); +} + #endif 
extern void _PyEval_DeactivateOpCache(void); diff --git a/Include/internal/pycore_code.h b/Include/internal/pycore_code.h index 57e0a14bb9b5bd..a0acf76db6f04d 100644 --- a/Include/internal/pycore_code.h +++ b/Include/internal/pycore_code.h @@ -11,6 +11,7 @@ extern "C" { #include "pycore_stackref.h" // _PyStackRef #include "pycore_lock.h" // PyMutex #include "pycore_backoff.h" // _Py_BackoffCounter +#include "pycore_tstate.h" // _PyThreadStateImpl /* Each instruction in a code object is a fixed-width value, @@ -313,11 +314,17 @@ extern int _PyLineTable_PreviousAddressRange(PyCodeAddressRange *range); /** API for executors */ extern void _PyCode_Clear_Executors(PyCodeObject *code); + #ifdef Py_GIL_DISABLED // gh-115999 tracks progress on addressing this. #define ENABLE_SPECIALIZATION 0 +// Use this to enable specialization families once they are thread-safe. All +// uses will be replaced with ENABLE_SPECIALIZATION once all families are +// thread-safe. +#define ENABLE_SPECIALIZATION_FT 1 #else #define ENABLE_SPECIALIZATION 1 +#define ENABLE_SPECIALIZATION_FT ENABLE_SPECIALIZATION #endif /* Specialization functions */ @@ -600,6 +607,40 @@ struct _PyCode8 _PyCode_DEF(8); PyAPI_DATA(const struct _PyCode8) _Py_InitCleanup; +#ifdef Py_GIL_DISABLED + +// Return a pointer to the thread-local bytecode for the current thread, if it +// exists. +static inline _Py_CODEUNIT * +_PyCode_GetTLBCFast(PyThreadState *tstate, PyCodeObject *co) +{ + _PyCodeArray *code = _Py_atomic_load_ptr_acquire(&co->co_tlbc); + int32_t idx = ((_PyThreadStateImpl*) tstate)->tlbc_index; + if (idx < code->size && code->entries[idx] != NULL) { + return (_Py_CODEUNIT *) code->entries[idx]; + } + return NULL; +} + +// Return a pointer to the thread-local bytecode for the current thread, +// creating it if necessary. +extern _Py_CODEUNIT *_PyCode_GetTLBC(PyCodeObject *co); + +// Reserve an index for the current thread into thread-local bytecode +// arrays +// +// Returns the reserved index or -1 on error. 
+extern int32_t _Py_ReserveTLBCIndex(PyInterpreterState *interp); + +// Release the current thread's index into thread-local bytecode arrays +extern void _Py_ClearTLBCIndex(_PyThreadStateImpl *tstate); + +// Free all TLBC copies not associated with live threads. +// +// Returns 0 on success or -1 on error. +extern int _Py_ClearUnusedTLBC(PyInterpreterState *interp); +#endif + #ifdef __cplusplus } #endif diff --git a/Include/internal/pycore_frame.h b/Include/internal/pycore_frame.h index c9ac3819d0390b..8c0100390d036e 100644 --- a/Include/internal/pycore_frame.h +++ b/Include/internal/pycore_frame.h @@ -68,6 +68,10 @@ typedef struct _PyInterpreterFrame { PyObject *f_locals; /* Strong reference, may be NULL. Only valid if not on C stack */ PyFrameObject *frame_obj; /* Strong reference, may be NULL. Only valid if not on C stack */ _Py_CODEUNIT *instr_ptr; /* Instruction currently executing (or about to begin) */ +#ifdef Py_GIL_DISABLED + /* Index of thread-local bytecode containing instr_ptr. 
*/ + int32_t tlbc_index; +#endif _PyStackRef *stackpointer; uint16_t return_offset; /* Only relevant during a function call */ char owner; @@ -76,7 +80,7 @@ typedef struct _PyInterpreterFrame { } _PyInterpreterFrame; #define _PyInterpreterFrame_LASTI(IF) \ - ((int)((IF)->instr_ptr - _PyCode_CODE(_PyFrame_GetCode(IF)))) + ((int)((IF)->instr_ptr - _PyFrame_GetBytecode((IF)))) static inline PyCodeObject *_PyFrame_GetCode(_PyInterpreterFrame *f) { PyObject *executable = PyStackRef_AsPyObjectBorrow(f->f_executable); @@ -84,6 +88,19 @@ static inline PyCodeObject *_PyFrame_GetCode(_PyInterpreterFrame *f) { return (PyCodeObject *)executable; } +static inline _Py_CODEUNIT * +_PyFrame_GetBytecode(_PyInterpreterFrame *f) +{ +#ifdef Py_GIL_DISABLED + PyCodeObject *co = _PyFrame_GetCode(f); + _PyCodeArray *tlbc = _Py_atomic_load_ptr_acquire(&co->co_tlbc); + assert(f->tlbc_index >= 0 && f->tlbc_index < tlbc->size); + return (_Py_CODEUNIT *)tlbc->entries[f->tlbc_index]; +#else + return _PyCode_CODE(_PyFrame_GetCode(f)); +#endif +} + static inline PyFunctionObject *_PyFrame_GetFunction(_PyInterpreterFrame *f) { PyObject *func = PyStackRef_AsPyObjectBorrow(f->f_funcobj); assert(PyFunction_Check(func)); @@ -144,13 +161,33 @@ static inline void _PyFrame_Copy(_PyInterpreterFrame *src, _PyInterpreterFrame * #endif } +#ifdef Py_GIL_DISABLED +static inline void +_PyFrame_InitializeTLBC(PyThreadState *tstate, _PyInterpreterFrame *frame, + PyCodeObject *code) +{ + _Py_CODEUNIT *tlbc = _PyCode_GetTLBCFast(tstate, code); + if (tlbc == NULL) { + // No thread-local bytecode exists for this thread yet; use the main + // thread's copy, deferring thread-local bytecode creation to the + // execution of RESUME. + frame->instr_ptr = _PyCode_CODE(code); + frame->tlbc_index = 0; + } + else { + frame->instr_ptr = tlbc; + frame->tlbc_index = ((_PyThreadStateImpl *)tstate)->tlbc_index; + } +} +#endif + /* Consumes reference to func and locals. 
Does not initialize frame->previous, which happens when frame is linked into the frame stack. */ static inline void _PyFrame_Initialize( - _PyInterpreterFrame *frame, _PyStackRef func, + PyThreadState *tstate, _PyInterpreterFrame *frame, _PyStackRef func, PyObject *locals, PyCodeObject *code, int null_locals_from, _PyInterpreterFrame *previous) { frame->previous = previous; @@ -162,7 +199,12 @@ _PyFrame_Initialize( frame->f_locals = locals; frame->stackpointer = frame->localsplus + code->co_nlocalsplus; frame->frame_obj = NULL; +#ifdef Py_GIL_DISABLED + _PyFrame_InitializeTLBC(tstate, frame, code); +#else + (void)tstate; frame->instr_ptr = _PyCode_CODE(code); +#endif frame->return_offset = 0; frame->owner = FRAME_OWNED_BY_THREAD; @@ -224,7 +266,8 @@ _PyFrame_IsIncomplete(_PyInterpreterFrame *frame) return true; } return frame->owner != FRAME_OWNED_BY_GENERATOR && - frame->instr_ptr < _PyCode_CODE(_PyFrame_GetCode(frame)) + _PyFrame_GetCode(frame)->_co_firsttraceable; + frame->instr_ptr < _PyFrame_GetBytecode(frame) + + _PyFrame_GetCode(frame)->_co_firsttraceable; } static inline _PyInterpreterFrame * @@ -315,7 +358,8 @@ _PyFrame_PushUnchecked(PyThreadState *tstate, _PyStackRef func, int null_locals_ _PyInterpreterFrame *new_frame = (_PyInterpreterFrame *)tstate->datastack_top; tstate->datastack_top += code->co_framesize; assert(tstate->datastack_top < tstate->datastack_limit); - _PyFrame_Initialize(new_frame, func, NULL, code, null_locals_from, previous); + _PyFrame_Initialize(tstate, new_frame, func, NULL, code, null_locals_from, + previous); return new_frame; } @@ -339,7 +383,11 @@ _PyFrame_PushTrampolineUnchecked(PyThreadState *tstate, PyCodeObject *code, int assert(stackdepth <= code->co_stacksize); frame->stackpointer = frame->localsplus + code->co_nlocalsplus + stackdepth; frame->frame_obj = NULL; +#ifdef Py_GIL_DISABLED + _PyFrame_InitializeTLBC(tstate, frame, code); +#else frame->instr_ptr = _PyCode_CODE(code); +#endif frame->owner = FRAME_OWNED_BY_THREAD; 
frame->return_offset = 0; diff --git a/Include/internal/pycore_gc.h b/Include/internal/pycore_gc.h index b85957df5a6b9f..38a1c56c09d9db 100644 --- a/Include/internal/pycore_gc.h +++ b/Include/internal/pycore_gc.h @@ -389,6 +389,10 @@ extern int _PyGC_VisitStackRef(union _PyStackRef *ref, visitproc visit, void *ar } \ } while (0) +#ifdef Py_GIL_DISABLED +extern void _PyGC_VisitObjectsWorldStopped(PyInterpreterState *interp, + gcvisitobjects_t callback, void *arg); +#endif #ifdef __cplusplus } diff --git a/Include/internal/pycore_index_pool.h b/Include/internal/pycore_index_pool.h new file mode 100644 index 00000000000000..e81bfd4d6ed03d --- /dev/null +++ b/Include/internal/pycore_index_pool.h @@ -0,0 +1,56 @@ +#ifndef Py_INTERNAL_INDEX_POOL_H +#define Py_INTERNAL_INDEX_POOL_H + +#include "Python.h" + +#ifdef __cplusplus +extern "C" { +#endif + +#ifndef Py_BUILD_CORE +# error "this header requires Py_BUILD_CORE define" +#endif + +#ifdef Py_GIL_DISABLED + +// This contains code for allocating unique indices in an array. It is used by +// the free-threaded build to assign each thread a globally unique index into +// each code object's thread-local bytecode array. + +// A min-heap of indices +typedef struct _PyIndexHeap { + int32_t *values; + + // Number of items stored in values + Py_ssize_t size; + + // Maximum number of items that can be stored in values + Py_ssize_t capacity; +} _PyIndexHeap; + +// An unbounded pool of indices. Indices are allocated starting from 0. They +// may be released back to the pool once they are no longer in use. +typedef struct _PyIndexPool { + PyMutex mutex; + + // Min heap of indices available for allocation + _PyIndexHeap free_indices; + + // Next index to allocate if no free indices are available + int32_t next_index; +} _PyIndexPool; + +// Allocate the smallest available index. Returns -1 on error. 
+extern int32_t _PyIndexPool_AllocIndex(_PyIndexPool *indices); + +// Release `index` back to the pool +extern void _PyIndexPool_FreeIndex(_PyIndexPool *indices, int32_t index); + +extern void _PyIndexPool_Fini(_PyIndexPool *indices); + +#endif // Py_GIL_DISABLED + +#ifdef __cplusplus +} +#endif +#endif // !Py_INTERNAL_INDEX_POOL_H diff --git a/Include/internal/pycore_interp.h b/Include/internal/pycore_interp.h index 36cd71e5a007d5..9e3b4299693bbc 100644 --- a/Include/internal/pycore_interp.h +++ b/Include/internal/pycore_interp.h @@ -26,6 +26,7 @@ extern "C" { #include "pycore_genobject.h" // _PyGen_FetchStopIterationValue #include "pycore_global_objects.h"// struct _Py_interp_cached_objects #include "pycore_import.h" // struct _import_state +#include "pycore_index_pool.h" // _PyIndexPool #include "pycore_instruments.h" // _PY_MONITORING_EVENTS #include "pycore_list.h" // struct _Py_list_state #include "pycore_mimalloc.h" // struct _mimalloc_interp_state @@ -222,6 +223,7 @@ struct _is { struct _brc_state brc; // biased reference counting state struct _Py_unique_id_pool unique_ids; // object ids for per-thread refcounts PyMutex weakref_locks[NUM_WEAKREF_LIST_LOCKS]; + _PyIndexPool tlbc_indices; #endif // Per-interpreter state for the obmalloc allocator. For the main diff --git a/Include/internal/pycore_tstate.h b/Include/internal/pycore_tstate.h index e0e7d5ebf0912c..b8bea72baeaaf5 100644 --- a/Include/internal/pycore_tstate.h +++ b/Include/internal/pycore_tstate.h @@ -42,6 +42,9 @@ typedef struct _PyThreadStateImpl { int is_finalized; } refcounts; + // Index to use to retrieve thread-local bytecode for this thread + int32_t tlbc_index; + // When >1, code objects do not immortalize their non-string constants. 
int suppress_co_const_immortalization; #endif @@ -52,7 +55,6 @@ typedef struct _PyThreadStateImpl { } _PyThreadStateImpl; - #ifdef __cplusplus } #endif diff --git a/Include/internal/pycore_uop_ids.h b/Include/internal/pycore_uop_ids.h index de628d240d1c07..55416d2aae1e1a 100644 --- a/Include/internal/pycore_uop_ids.h +++ b/Include/internal/pycore_uop_ids.h @@ -193,106 +193,107 @@ extern "C" { #define _LOAD_ATTR_SLOT_1 423 #define _LOAD_ATTR_WITH_HINT 424 #define _LOAD_BUILD_CLASS LOAD_BUILD_CLASS +#define _LOAD_BYTECODE 425 #define _LOAD_COMMON_CONSTANT LOAD_COMMON_CONSTANT #define _LOAD_CONST LOAD_CONST #define _LOAD_CONST_IMMORTAL LOAD_CONST_IMMORTAL -#define _LOAD_CONST_INLINE 425 -#define _LOAD_CONST_INLINE_BORROW 426 -#define _LOAD_CONST_INLINE_BORROW_WITH_NULL 427 -#define _LOAD_CONST_INLINE_WITH_NULL 428 +#define _LOAD_CONST_INLINE 426 +#define _LOAD_CONST_INLINE_BORROW 427 +#define _LOAD_CONST_INLINE_BORROW_WITH_NULL 428 +#define _LOAD_CONST_INLINE_WITH_NULL 429 #define _LOAD_DEREF LOAD_DEREF -#define _LOAD_FAST 429 -#define _LOAD_FAST_0 430 -#define _LOAD_FAST_1 431 -#define _LOAD_FAST_2 432 -#define _LOAD_FAST_3 433 -#define _LOAD_FAST_4 434 -#define _LOAD_FAST_5 435 -#define _LOAD_FAST_6 436 -#define _LOAD_FAST_7 437 +#define _LOAD_FAST 430 +#define _LOAD_FAST_0 431 +#define _LOAD_FAST_1 432 +#define _LOAD_FAST_2 433 +#define _LOAD_FAST_3 434 +#define _LOAD_FAST_4 435 +#define _LOAD_FAST_5 436 +#define _LOAD_FAST_6 437 +#define _LOAD_FAST_7 438 #define _LOAD_FAST_AND_CLEAR LOAD_FAST_AND_CLEAR #define _LOAD_FAST_CHECK LOAD_FAST_CHECK #define _LOAD_FAST_LOAD_FAST LOAD_FAST_LOAD_FAST #define _LOAD_FROM_DICT_OR_DEREF LOAD_FROM_DICT_OR_DEREF #define _LOAD_FROM_DICT_OR_GLOBALS LOAD_FROM_DICT_OR_GLOBALS -#define _LOAD_GLOBAL 438 -#define _LOAD_GLOBAL_BUILTINS 439 -#define _LOAD_GLOBAL_BUILTINS_FROM_KEYS 440 -#define _LOAD_GLOBAL_MODULE 441 -#define _LOAD_GLOBAL_MODULE_FROM_KEYS 442 +#define _LOAD_GLOBAL 439 +#define _LOAD_GLOBAL_BUILTINS 440 +#define 
_LOAD_GLOBAL_BUILTINS_FROM_KEYS 441 +#define _LOAD_GLOBAL_MODULE 442 +#define _LOAD_GLOBAL_MODULE_FROM_KEYS 443 #define _LOAD_LOCALS LOAD_LOCALS #define _LOAD_NAME LOAD_NAME -#define _LOAD_SMALL_INT 443 -#define _LOAD_SMALL_INT_0 444 -#define _LOAD_SMALL_INT_1 445 -#define _LOAD_SMALL_INT_2 446 -#define _LOAD_SMALL_INT_3 447 +#define _LOAD_SMALL_INT 444 +#define _LOAD_SMALL_INT_0 445 +#define _LOAD_SMALL_INT_1 446 +#define _LOAD_SMALL_INT_2 447 +#define _LOAD_SMALL_INT_3 448 #define _LOAD_SPECIAL LOAD_SPECIAL #define _LOAD_SUPER_ATTR_ATTR LOAD_SUPER_ATTR_ATTR #define _LOAD_SUPER_ATTR_METHOD LOAD_SUPER_ATTR_METHOD -#define _MAKE_CALLARGS_A_TUPLE 448 +#define _MAKE_CALLARGS_A_TUPLE 449 #define _MAKE_CELL MAKE_CELL #define _MAKE_FUNCTION MAKE_FUNCTION -#define _MAKE_WARM 449 +#define _MAKE_WARM 450 #define _MAP_ADD MAP_ADD #define _MATCH_CLASS MATCH_CLASS #define _MATCH_KEYS MATCH_KEYS #define _MATCH_MAPPING MATCH_MAPPING #define _MATCH_SEQUENCE MATCH_SEQUENCE -#define _MAYBE_EXPAND_METHOD 450 -#define _MAYBE_EXPAND_METHOD_KW 451 -#define _MONITOR_CALL 452 -#define _MONITOR_JUMP_BACKWARD 453 -#define _MONITOR_RESUME 454 +#define _MAYBE_EXPAND_METHOD 451 +#define _MAYBE_EXPAND_METHOD_KW 452 +#define _MONITOR_CALL 453 +#define _MONITOR_JUMP_BACKWARD 454 +#define _MONITOR_RESUME 455 #define _NOP NOP #define _POP_EXCEPT POP_EXCEPT -#define _POP_JUMP_IF_FALSE 455 -#define _POP_JUMP_IF_TRUE 456 +#define _POP_JUMP_IF_FALSE 456 +#define _POP_JUMP_IF_TRUE 457 #define _POP_TOP POP_TOP -#define _POP_TOP_LOAD_CONST_INLINE_BORROW 457 +#define _POP_TOP_LOAD_CONST_INLINE_BORROW 458 #define _PUSH_EXC_INFO PUSH_EXC_INFO -#define _PUSH_FRAME 458 +#define _PUSH_FRAME 459 #define _PUSH_NULL PUSH_NULL -#define _PY_FRAME_GENERAL 459 -#define _PY_FRAME_KW 460 -#define _QUICKEN_RESUME 461 -#define _REPLACE_WITH_TRUE 462 +#define _PY_FRAME_GENERAL 460 +#define _PY_FRAME_KW 461 +#define _QUICKEN_RESUME 462 +#define _REPLACE_WITH_TRUE 463 #define _RESUME_CHECK RESUME_CHECK #define 
_RETURN_GENERATOR RETURN_GENERATOR #define _RETURN_VALUE RETURN_VALUE -#define _SAVE_RETURN_OFFSET 463 -#define _SEND 464 -#define _SEND_GEN_FRAME 465 +#define _SAVE_RETURN_OFFSET 464 +#define _SEND 465 +#define _SEND_GEN_FRAME 466 #define _SETUP_ANNOTATIONS SETUP_ANNOTATIONS #define _SET_ADD SET_ADD #define _SET_FUNCTION_ATTRIBUTE SET_FUNCTION_ATTRIBUTE #define _SET_UPDATE SET_UPDATE -#define _START_EXECUTOR 466 -#define _STORE_ATTR 467 -#define _STORE_ATTR_INSTANCE_VALUE 468 -#define _STORE_ATTR_SLOT 469 -#define _STORE_ATTR_WITH_HINT 470 +#define _START_EXECUTOR 467 +#define _STORE_ATTR 468 +#define _STORE_ATTR_INSTANCE_VALUE 469 +#define _STORE_ATTR_SLOT 470 +#define _STORE_ATTR_WITH_HINT 471 #define _STORE_DEREF STORE_DEREF -#define _STORE_FAST 471 -#define _STORE_FAST_0 472 -#define _STORE_FAST_1 473 -#define _STORE_FAST_2 474 -#define _STORE_FAST_3 475 -#define _STORE_FAST_4 476 -#define _STORE_FAST_5 477 -#define _STORE_FAST_6 478 -#define _STORE_FAST_7 479 +#define _STORE_FAST 472 +#define _STORE_FAST_0 473 +#define _STORE_FAST_1 474 +#define _STORE_FAST_2 475 +#define _STORE_FAST_3 476 +#define _STORE_FAST_4 477 +#define _STORE_FAST_5 478 +#define _STORE_FAST_6 479 +#define _STORE_FAST_7 480 #define _STORE_FAST_LOAD_FAST STORE_FAST_LOAD_FAST #define _STORE_FAST_STORE_FAST STORE_FAST_STORE_FAST #define _STORE_GLOBAL STORE_GLOBAL #define _STORE_NAME STORE_NAME -#define _STORE_SLICE 480 -#define _STORE_SUBSCR 481 +#define _STORE_SLICE 481 +#define _STORE_SUBSCR 482 #define _STORE_SUBSCR_DICT STORE_SUBSCR_DICT #define _STORE_SUBSCR_LIST_INT STORE_SUBSCR_LIST_INT #define _SWAP SWAP -#define _TIER2_RESUME_CHECK 482 -#define _TO_BOOL 483 +#define _TIER2_RESUME_CHECK 483 +#define _TO_BOOL 484 #define _TO_BOOL_BOOL TO_BOOL_BOOL #define _TO_BOOL_INT TO_BOOL_INT #define _TO_BOOL_LIST TO_BOOL_LIST @@ -302,13 +303,13 @@ extern "C" { #define _UNARY_NEGATIVE UNARY_NEGATIVE #define _UNARY_NOT UNARY_NOT #define _UNPACK_EX UNPACK_EX -#define _UNPACK_SEQUENCE 484 +#define 
_UNPACK_SEQUENCE 485 #define _UNPACK_SEQUENCE_LIST UNPACK_SEQUENCE_LIST #define _UNPACK_SEQUENCE_TUPLE UNPACK_SEQUENCE_TUPLE #define _UNPACK_SEQUENCE_TWO_TUPLE UNPACK_SEQUENCE_TWO_TUPLE #define _WITH_EXCEPT_START WITH_EXCEPT_START #define _YIELD_VALUE YIELD_VALUE -#define MAX_UOP_ID 484 +#define MAX_UOP_ID 485 #ifdef __cplusplus } diff --git a/Include/internal/pycore_uop_metadata.h b/Include/internal/pycore_uop_metadata.h index 4cfdecec78b0db..ade297201f0ac2 100644 --- a/Include/internal/pycore_uop_metadata.h +++ b/Include/internal/pycore_uop_metadata.h @@ -289,7 +289,7 @@ const uint16_t _PyUop_Flags[MAX_UOP_ID+1] = { [_FATAL_ERROR] = 0, [_CHECK_VALIDITY_AND_SET_IP] = HAS_DEOPT_FLAG, [_DEOPT] = 0, - [_ERROR_POP_N] = HAS_ARG_FLAG, + [_ERROR_POP_N] = HAS_ARG_FLAG | HAS_ESCAPES_FLAG, [_TIER2_RESUME_CHECK] = HAS_DEOPT_FLAG, }; diff --git a/Lib/test/support/__init__.py b/Lib/test/support/__init__.py index 7c1ef42a4970d7..2ad267e3e08f0f 100644 --- a/Lib/test/support/__init__.py +++ b/Lib/test/support/__init__.py @@ -1274,6 +1274,11 @@ def requires_specialization(test): _opcode.ENABLE_SPECIALIZATION, "requires specialization")(test) +def requires_specialization_ft(test): + return unittest.skipUnless( + _opcode.ENABLE_SPECIALIZATION_FT, "requires specialization")(test) + + #======================================================================= # Check for the presence of docstrings. 
diff --git a/Lib/test/test_capi/test_config.py b/Lib/test/test_capi/test_config.py index 71fb9ae45c7c30..77730ad2f32085 100644 --- a/Lib/test/test_capi/test_config.py +++ b/Lib/test/test_capi/test_config.py @@ -100,6 +100,7 @@ def test_config_get(self): options.append(("run_presite", str | None, None)) if sysconfig.get_config_var('Py_GIL_DISABLED'): options.append(("enable_gil", int, None)) + options.append(("tlbc_enabled", int, None)) if support.MS_WINDOWS: options.extend(( ("legacy_windows_stdio", bool, None), diff --git a/Lib/test/test_capi/test_opt.py b/Lib/test/test_capi/test_opt.py index f1ab72180d714d..c352325ff3d08a 100644 --- a/Lib/test/test_capi/test_opt.py +++ b/Lib/test/test_capi/test_opt.py @@ -7,7 +7,8 @@ import _opcode -from test.support import script_helper, requires_specialization, import_helper +from test.support import (script_helper, requires_specialization, + import_helper, Py_GIL_DISABLED) _testinternalcapi = import_helper.import_module("_testinternalcapi") @@ -34,6 +35,7 @@ def clear_executors(func): @requires_specialization +@unittest.skipIf(Py_GIL_DISABLED, "optimizer not yet supported in free-threaded builds") @unittest.skipUnless(hasattr(_testinternalcapi, "get_optimizer"), "Requires optimizer infrastructure") class TestOptimizerAPI(unittest.TestCase): @@ -138,6 +140,7 @@ def get_opnames(ex): @requires_specialization +@unittest.skipIf(Py_GIL_DISABLED, "optimizer not yet supported in free-threaded builds") @unittest.skipUnless(hasattr(_testinternalcapi, "get_optimizer"), "Requires optimizer infrastructure") class TestExecutorInvalidation(unittest.TestCase): @@ -219,6 +222,7 @@ def f(): @requires_specialization +@unittest.skipIf(Py_GIL_DISABLED, "optimizer not yet supported in free-threaded builds") @unittest.skipUnless(hasattr(_testinternalcapi, "get_optimizer"), "Requires optimizer infrastructure") @unittest.skipIf(os.getenv("PYTHON_UOPS_OPTIMIZE") == "0", "Needs uop optimizer to run.") @@ -586,6 +590,7 @@ def testfunc(n): 
@requires_specialization +@unittest.skipIf(Py_GIL_DISABLED, "optimizer not yet supported in free-threaded builds") @unittest.skipUnless(hasattr(_testinternalcapi, "get_optimizer"), "Requires optimizer infrastructure") @unittest.skipIf(os.getenv("PYTHON_UOPS_OPTIMIZE") == "0", "Needs uop optimizer to run.") diff --git a/Lib/test/test_cmd_line.py b/Lib/test/test_cmd_line.py index eca9adf9a7dcbc..634efda354407f 100644 --- a/Lib/test/test_cmd_line.py +++ b/Lib/test/test_cmd_line.py @@ -12,6 +12,7 @@ from test import support from test.support import os_helper from test.support import force_not_colorized +from test.support import threading_helper from test.support.script_helper import ( spawn_python, kill_python, assert_python_ok, assert_python_failure, interpreter_requires_environment @@ -1068,6 +1069,57 @@ def res2int(self, res): out = res.out.strip().decode("utf-8") return tuple(int(i) for i in out.split()) + @unittest.skipUnless(support.Py_GIL_DISABLED, + "PYTHON_TLBC and -X tlbc" + " only supported in Py_GIL_DISABLED builds") + @threading_helper.requires_working_threading() + def test_disable_thread_local_bytecode(self): + code = """if 1: + import threading + def test(x, y): + return x + y + t = threading.Thread(target=test, args=(1,2)) + t.start() + t.join()""" + assert_python_ok("-W", "always", "-X", "tlbc=0", "-c", code) + assert_python_ok("-W", "always", "-c", code, PYTHON_TLBC="0") + + @unittest.skipUnless(support.Py_GIL_DISABLED, + "PYTHON_TLBC and -X tlbc" + " only supported in Py_GIL_DISABLED builds") + @threading_helper.requires_working_threading() + def test_enable_thread_local_bytecode(self): + code = """if 1: + import threading + def test(x, y): + return x + y + t = threading.Thread(target=test, args=(1,2)) + t.start() + t.join()""" + # The functionality of thread-local bytecode is tested more extensively + # in test_thread_local_bytecode + assert_python_ok("-W", "always", "-X", "tlbc=1", "-c", code) + assert_python_ok("-W", "always", "-c", code, 
PYTHON_TLBC="1") + + @unittest.skipUnless(support.Py_GIL_DISABLED, + "PYTHON_TLBC and -X tlbc" + " only supported in Py_GIL_DISABLED builds") + def test_invalid_thread_local_bytecode(self): + rc, out, err = assert_python_failure("-X", "tlbc") + self.assertIn(b"tlbc=n: n is missing or invalid", err) + rc, out, err = assert_python_failure("-X", "tlbc=foo") + self.assertIn(b"tlbc=n: n is missing or invalid", err) + rc, out, err = assert_python_failure("-X", "tlbc=-1") + self.assertIn(b"tlbc=n: n is missing or invalid", err) + rc, out, err = assert_python_failure("-X", "tlbc=2") + self.assertIn(b"tlbc=n: n is missing or invalid", err) + rc, out, err = assert_python_failure(PYTHON_TLBC="foo") + self.assertIn(b"PYTHON_TLBC=N: N is missing or invalid", err) + rc, out, err = assert_python_failure(PYTHON_TLBC="-1") + self.assertIn(b"PYTHON_TLBC=N: N is missing or invalid", err) + rc, out, err = assert_python_failure(PYTHON_TLBC="2") + self.assertIn(b"PYTHON_TLBC=N: N is missing or invalid", err) + @unittest.skipIf(interpreter_requires_environment(), 'Cannot run -I tests when PYTHON env vars are required.') diff --git a/Lib/test/test_dis.py b/Lib/test/test_dis.py index 3c6570afa50d45..a991c67fca46be 100644 --- a/Lib/test/test_dis.py +++ b/Lib/test/test_dis.py @@ -10,7 +10,8 @@ import types import unittest from test.support import (captured_stdout, requires_debug_ranges, - requires_specialization, cpython_only) + requires_specialization, requires_specialization_ft, + cpython_only) from test.support.bytecode_helper import BytecodeTestCase import opcode @@ -1261,7 +1262,7 @@ def test_super_instructions(self): self.do_disassembly_compare(got, dis_load_test_quickened_code) @cpython_only - @requires_specialization + @requires_specialization_ft def test_binary_specialize(self): binary_op_quicken = """\ 0 RESUME_CHECK 0 @@ -1281,6 +1282,9 @@ def test_binary_specialize(self): got = self.get_disassembly(co_unicode, adaptive=True) self.do_disassembly_compare(got, binary_op_quicken % 
"BINARY_OP_ADD_UNICODE 0 (+)") + @cpython_only + @requires_specialization + def test_binary_subscr_specialize(self): binary_subscr_quicken = """\ 0 RESUME_CHECK 0 diff --git a/Lib/test/test_embed.py b/Lib/test/test_embed.py index 5e886b6c8c38ec..bf861ef06ee2d3 100644 --- a/Lib/test/test_embed.py +++ b/Lib/test/test_embed.py @@ -644,6 +644,7 @@ class InitConfigTests(EmbeddingTestsMixin, unittest.TestCase): CONFIG_COMPAT['run_presite'] = None if support.Py_GIL_DISABLED: CONFIG_COMPAT['enable_gil'] = -1 + CONFIG_COMPAT['tlbc_enabled'] = GET_DEFAULT_CONFIG if MS_WINDOWS: CONFIG_COMPAT.update({ 'legacy_windows_stdio': False, diff --git a/Lib/test/test_sys.py b/Lib/test/test_sys.py index c0862d7d15f39e..d839893d2c657e 100644 --- a/Lib/test/test_sys.py +++ b/Lib/test/test_sys.py @@ -1094,7 +1094,14 @@ def test_getallocatedblocks(self): # While we could imagine a Python session where the number of # multiple buffer objects would exceed the sharing of references, # it is unlikely to happen in a normal test run. - self.assertLess(a, sys.gettotalrefcount()) + # + # In free-threaded builds each code object owns an array of + # pointers to copies of the bytecode. When the number of + # code objects is a large fraction of the total number of + # references, this can cause the total number of allocated + # blocks to exceed the total number of references. 
+ if not support.Py_GIL_DISABLED: + self.assertLess(a, sys.gettotalrefcount()) except AttributeError: # gettotalrefcount() not available pass @@ -1613,7 +1620,10 @@ class C(object): pass def func(): return sys._getframe() x = func() - INTERPRETER_FRAME = '9PhcP' + if support.Py_GIL_DISABLED: + INTERPRETER_FRAME = '10PhcP' + else: + INTERPRETER_FRAME = '9PhcP' check(x, size('3PiccPP' + INTERPRETER_FRAME + 'P')) # function def func(): pass diff --git a/Lib/test/test_thread_local_bytecode.py b/Lib/test/test_thread_local_bytecode.py new file mode 100644 index 00000000000000..7a8809c5ae7697 --- /dev/null +++ b/Lib/test/test_thread_local_bytecode.py @@ -0,0 +1,198 @@ +"""Tests for thread-local bytecode.""" +import dis +import textwrap +import unittest + +from test import support +from test.support import cpython_only, import_helper, requires_specialization_ft +from test.support.script_helper import assert_python_ok +from test.support.threading_helper import requires_working_threading + +# Skip this test if the _testinternalcapi module isn't available +_testinternalcapi = import_helper.import_module("_testinternalcapi") + + +@cpython_only +@requires_working_threading() +@unittest.skipUnless(support.Py_GIL_DISABLED, "only in free-threaded builds") +class TLBCTests(unittest.TestCase): + @requires_specialization_ft + def test_new_threads_start_with_unspecialized_code(self): + code = textwrap.dedent(""" + import dis + import queue + import threading + + from _testinternalcapi import get_tlbc + + def all_opnames(bc): + return {i.opname for i in dis._get_instructions_bytes(bc)} + + def f(a, b, q=None): + if q is not None: + q.put(get_tlbc(f)) + return a + b + + for _ in range(100): + # specialize + f(1, 2) + + q = queue.Queue() + t = threading.Thread(target=f, args=('a', 'b', q)) + t.start() + t.join() + + assert "BINARY_OP_ADD_INT" in all_opnames(get_tlbc(f)) + assert "BINARY_OP_ADD_INT" not in all_opnames(q.get()) + """) + assert_python_ok("-X", "tlbc=1", "-c", code) + + 
@requires_specialization_ft + def test_threads_specialize_independently(self): + code = textwrap.dedent(""" + import dis + import queue + import threading + + from _testinternalcapi import get_tlbc + + def all_opnames(bc): + return {i.opname for i in dis._get_instructions_bytes(bc)} + + def f(a, b): + return a + b + + def g(a, b, q=None): + for _ in range(100): + f(a, b) + if q is not None: + q.put(get_tlbc(f)) + + # specialize in main thread + g(1, 2) + + # specialize in other thread + q = queue.Queue() + t = threading.Thread(target=g, args=('a', 'b', q)) + t.start() + t.join() + + assert "BINARY_OP_ADD_INT" in all_opnames(get_tlbc(f)) + t_opnames = all_opnames(q.get()) + assert "BINARY_OP_ADD_INT" not in t_opnames + assert "BINARY_OP_ADD_UNICODE" in t_opnames + """) + assert_python_ok("-X", "tlbc=1", "-c", code) + + def test_reuse_tlbc_across_threads_different_lifetimes(self): + code = textwrap.dedent(""" + import queue + import threading + + from _testinternalcapi import get_tlbc_id + + def f(a, b, q=None): + if q is not None: + q.put(get_tlbc_id(f)) + return a + b + + q = queue.Queue() + tlbc_ids = [] + for _ in range(3): + t = threading.Thread(target=f, args=('a', 'b', q)) + t.start() + t.join() + tlbc_ids.append(q.get()) + + assert tlbc_ids[0] == tlbc_ids[1] + assert tlbc_ids[1] == tlbc_ids[2] + """) + assert_python_ok("-X", "tlbc=1", "-c", code) + + def test_no_copies_if_tlbc_disabled(self): + code = textwrap.dedent(""" + import queue + import threading + + from _testinternalcapi import get_tlbc_id + + def f(a, b, q=None): + if q is not None: + q.put(get_tlbc_id(f)) + return a + b + + q = queue.Queue() + threads = [] + for _ in range(3): + t = threading.Thread(target=f, args=('a', 'b', q)) + t.start() + threads.append(t) + + tlbc_ids = [] + for t in threads: + t.join() + tlbc_ids.append(q.get()) + + main_tlbc_id = get_tlbc_id(f) + assert main_tlbc_id is not None + assert tlbc_ids[0] == main_tlbc_id + assert tlbc_ids[1] == main_tlbc_id + assert tlbc_ids[2] == 
main_tlbc_id + """) + assert_python_ok("-X", "tlbc=0", "-c", code) + + def test_no_specialization_if_tlbc_disabled(self): + code = textwrap.dedent(""" + import dis + import queue + import threading + + from _testinternalcapi import get_tlbc + + def all_opnames(f): + bc = get_tlbc(f) + return {i.opname for i in dis._get_instructions_bytes(bc)} + + def f(a, b): + return a + b + + for _ in range(100): + f(1, 2) + + assert "BINARY_OP_ADD_INT" not in all_opnames(f) + """) + assert_python_ok("-X", "tlbc=0", "-c", code) + + def test_generator_throw(self): + code = textwrap.dedent(""" + import queue + import threading + + from _testinternalcapi import get_tlbc_id + + def g(): + try: + yield + except: + yield get_tlbc_id(g) + + def f(q): + gen = g() + next(gen) + q.put(gen.throw(ValueError)) + + q = queue.Queue() + t = threading.Thread(target=f, args=(q,)) + t.start() + t.join() + + gen = g() + next(gen) + main_id = gen.throw(ValueError) + assert main_id != q.get() + """) + assert_python_ok("-X", "tlbc=1", "-c", code) + + +if __name__ == "__main__": + unittest.main() diff --git a/Makefile.pre.in b/Makefile.pre.in index 1a9191ec0ce48f..c650ecaf7be137 100644 --- a/Makefile.pre.in +++ b/Makefile.pre.in @@ -460,6 +460,7 @@ PYTHON_OBJS= \ Python/hashtable.o \ Python/import.o \ Python/importdl.o \ + Python/index_pool.o \ Python/initconfig.o \ Python/interpconfig.o \ Python/instrumentation.o \ @@ -1228,6 +1229,7 @@ PYTHON_HEADERS= \ $(srcdir)/Include/internal/pycore_hashtable.h \ $(srcdir)/Include/internal/pycore_import.h \ $(srcdir)/Include/internal/pycore_importdl.h \ + $(srcdir)/Include/internal/pycore_index_pool.h \ $(srcdir)/Include/internal/pycore_initconfig.h \ $(srcdir)/Include/internal/pycore_instruments.h \ $(srcdir)/Include/internal/pycore_instruction_sequence.h \ diff --git a/Modules/_opcode.c b/Modules/_opcode.c index dc93063aee7e54..7ccf7af6bf908f 100644 --- a/Modules/_opcode.c +++ b/Modules/_opcode.c @@ -422,6 +422,9 @@ _opcode_exec(PyObject *m) { if 
(PyModule_AddIntMacro(m, ENABLE_SPECIALIZATION) < 0) { return -1; } + if (PyModule_AddIntMacro(m, ENABLE_SPECIALIZATION_FT) < 0) { + return -1; + } return 0; } diff --git a/Modules/_testinternalcapi.c b/Modules/_testinternalcapi.c index eb98b433c6c6af..883f32599fbc99 100644 --- a/Modules/_testinternalcapi.c +++ b/Modules/_testinternalcapi.c @@ -14,6 +14,7 @@ #include "pycore_bitutils.h" // _Py_bswap32() #include "pycore_bytesobject.h" // _PyBytes_Find() #include "pycore_ceval.h" // _PyEval_AddPendingCall() +#include "pycore_code.h" // _PyCode_GetTLBCFast() #include "pycore_compile.h" // _PyCompile_CodeGen() #include "pycore_context.h" // _PyContext_NewHamtForTests() #include "pycore_dict.h" // _PyManagedDictPointer_GetValues() @@ -1963,6 +1964,48 @@ get_py_thread_id(PyObject *self, PyObject *Py_UNUSED(ignored)) Py_BUILD_ASSERT(sizeof(unsigned long long) >= sizeof(tid)); return PyLong_FromUnsignedLongLong(tid); } + +static PyCodeObject * +get_code(PyObject *obj) +{ + if (PyCode_Check(obj)) { + return (PyCodeObject *)obj; + } + else if (PyFunction_Check(obj)) { + return (PyCodeObject *)PyFunction_GetCode(obj); + } + return (PyCodeObject *)PyErr_Format( + PyExc_TypeError, "expected function or code object, got %s", + Py_TYPE(obj)->tp_name); +} + +static PyObject * +get_tlbc(PyObject *Py_UNUSED(module), PyObject *obj) +{ + PyCodeObject *code = get_code(obj); + if (code == NULL) { + return NULL; + } + _Py_CODEUNIT *bc = _PyCode_GetTLBCFast(PyThreadState_GET(), code); + if (bc == NULL) { + Py_RETURN_NONE; + } + return PyBytes_FromStringAndSize((const char *)bc, _PyCode_NBYTES(code)); +} + +static PyObject * +get_tlbc_id(PyObject *Py_UNUSED(module), PyObject *obj) +{ + PyCodeObject *code = get_code(obj); + if (code == NULL) { + return NULL; + } + _Py_CODEUNIT *bc = _PyCode_GetTLBCFast(PyThreadState_GET(), code); + if (bc == NULL) { + Py_RETURN_NONE; + } + return PyLong_FromVoidPtr(bc); +} #endif static PyObject * @@ -2022,7 +2065,6 @@ identify_type_slot_wrappers(PyObject 
*self, PyObject *Py_UNUSED(ignored)) return _PyType_GetSlotWrapperNames(); } - static PyMethodDef module_functions[] = { {"get_configs", get_configs, METH_NOARGS}, {"get_recursion_depth", get_recursion_depth, METH_NOARGS}, @@ -2110,6 +2152,8 @@ static PyMethodDef module_functions[] = { #ifdef Py_GIL_DISABLED {"py_thread_id", get_py_thread_id, METH_NOARGS}, + {"get_tlbc", get_tlbc, METH_O, NULL}, + {"get_tlbc_id", get_tlbc_id, METH_O, NULL}, #endif #ifdef _Py_TIER2 {"uop_symbols_test", _Py_uop_symbols_test, METH_NOARGS}, diff --git a/Objects/codeobject.c b/Objects/codeobject.c index 775ea7aca824c4..1cf9740af9a209 100644 --- a/Objects/codeobject.c +++ b/Objects/codeobject.c @@ -6,17 +6,22 @@ #include "pycore_code.h" // _PyCodeConstructor #include "pycore_frame.h" // FRAME_SPECIALS_SIZE #include "pycore_hashtable.h" // _Py_hashtable_t +#include "pycore_index_pool.h" // _PyIndexPool #include "pycore_initconfig.h" // _PyStatus_OK() #include "pycore_interp.h" // PyInterpreterState.co_extra_freefuncs #include "pycore_object.h" // _PyObject_SetDeferredRefcount +#include "pycore_object_stack.h" #include "pycore_opcode_metadata.h" // _PyOpcode_Deopt, _PyOpcode_Caches #include "pycore_opcode_utils.h" // RESUME_AT_FUNC_START +#include "pycore_pymem.h" // _PyMem_FreeDelayed #include "pycore_pystate.h" // _PyInterpreterState_GET() #include "pycore_setobject.h" // _PySet_NextEntry() #include "pycore_tuple.h" // _PyTuple_ITEMS() #include "pycore_uniqueid.h" // _PyObject_AssignUniqueId() #include "clinic/codeobject.c.h" +#define INITIAL_SPECIALIZED_CODE_SIZE 16 + static const char * code_event_name(PyCodeEvent event) { switch (event) { @@ -440,9 +445,15 @@ _PyCode_Validate(struct _PyCodeConstructor *con) return 0; } -extern void _PyCode_Quicken(PyCodeObject *code); +extern void +_PyCode_Quicken(_Py_CODEUNIT *instructions, Py_ssize_t size, PyObject *consts, + int enable_counters); -static void +#ifdef Py_GIL_DISABLED +static _PyCodeArray * _PyCodeArray_New(Py_ssize_t size); +#endif 
+ +static int init_code(PyCodeObject *co, struct _PyCodeConstructor *con) { int nlocalsplus = (int)PyTuple_GET_SIZE(con->localsplusnames); @@ -505,14 +516,27 @@ init_code(PyCodeObject *co, struct _PyCodeConstructor *con) memcpy(_PyCode_CODE(co), PyBytes_AS_STRING(con->code), PyBytes_GET_SIZE(con->code)); +#ifdef Py_GIL_DISABLED + co->co_tlbc = _PyCodeArray_New(INITIAL_SPECIALIZED_CODE_SIZE); + if (co->co_tlbc == NULL) { + return -1; + } + co->co_tlbc->entries[0] = co->co_code_adaptive; +#endif int entry_point = 0; while (entry_point < Py_SIZE(co) && _PyCode_CODE(co)[entry_point].op.code != RESUME) { entry_point++; } co->_co_firsttraceable = entry_point; - _PyCode_Quicken(co); +#ifdef Py_GIL_DISABLED + _PyCode_Quicken(_PyCode_CODE(co), Py_SIZE(co), co->co_consts, + interp->config.tlbc_enabled); +#else + _PyCode_Quicken(_PyCode_CODE(co), Py_SIZE(co), co->co_consts, 1); +#endif notify_code_watchers(PY_CODE_EVENT_CREATE, co); + return 0; } static int @@ -667,7 +691,12 @@ _PyCode_New(struct _PyCodeConstructor *con) PyErr_NoMemory(); return NULL; } - init_code(co, con); + + if (init_code(co, con) < 0) { + Py_DECREF(co); + return NULL; + } + #ifdef Py_GIL_DISABLED co->_co_unique_id = _PyObject_AssignUniqueId((PyObject *)co); _PyObject_GC_TRACK(co); @@ -1871,6 +1900,17 @@ code_dealloc(PyCodeObject *co) PyObject_ClearWeakRefs((PyObject*)co); } free_monitoring_data(co->_co_monitoring); +#ifdef Py_GIL_DISABLED + // The first element always points to the mutable bytecode at the end of + // the code object, which will be freed when the code object is freed. 
+ for (Py_ssize_t i = 1; i < co->co_tlbc->size; i++) { + char *entry = co->co_tlbc->entries[i]; + if (entry != NULL) { + PyMem_Free(entry); + } + } + PyMem_Free(co->co_tlbc); +#endif PyObject_Free(co); } @@ -2646,5 +2686,270 @@ _PyCode_Fini(PyInterpreterState *interp) _Py_hashtable_destroy(state->constants); state->constants = NULL; } + _PyIndexPool_Fini(&interp->tlbc_indices); #endif } + +#ifdef Py_GIL_DISABLED + +// Thread-local bytecode (TLBC) +// +// Each thread specializes a thread-local copy of the bytecode, created on the +// first RESUME, in free-threaded builds. All copies of the bytecode for a code +// object are stored in the `co_tlbc` array. Threads reserve a globally unique +// index identifying its copy of the bytecode in all `co_tlbc` arrays at thread +// creation and release the index at thread destruction. The first entry in +// every `co_tlbc` array always points to the "main" copy of the bytecode that +// is stored at the end of the code object. This ensures that no bytecode is +// copied for programs that do not use threads. +// +// Thread-local bytecode can be disabled at runtime by providing either `-X +// tlbc=0` or `PYTHON_TLBC=0`. Disabling thread-local bytecode also disables +// specialization. All threads share the main copy of the bytecode when +// thread-local bytecode is disabled. +// +// Concurrent modifications to the bytecode made by the specializing +// interpreter and instrumentation use atomics, with specialization taking care +// not to overwrite an instruction that was instrumented concurrently. 
+ +int32_t +_Py_ReserveTLBCIndex(PyInterpreterState *interp) +{ + if (interp->config.tlbc_enabled) { + return _PyIndexPool_AllocIndex(&interp->tlbc_indices); + } + // All threads share the main copy of the bytecode when TLBC is disabled + return 0; +} + +void +_Py_ClearTLBCIndex(_PyThreadStateImpl *tstate) +{ + PyInterpreterState *interp = ((PyThreadState *)tstate)->interp; + if (interp->config.tlbc_enabled) { + _PyIndexPool_FreeIndex(&interp->tlbc_indices, tstate->tlbc_index); + } +} + +static _PyCodeArray * +_PyCodeArray_New(Py_ssize_t size) +{ + _PyCodeArray *arr = PyMem_Calloc( + 1, offsetof(_PyCodeArray, entries) + sizeof(void *) * size); + if (arr == NULL) { + PyErr_NoMemory(); + return NULL; + } + arr->size = size; + return arr; +} + +static void +copy_code(_Py_CODEUNIT *dst, PyCodeObject *co) +{ + int code_len = (int) Py_SIZE(co); + for (int i = 0; i < code_len; i += _PyInstruction_GetLength(co, i)) { + dst[i] = _Py_GetBaseCodeUnit(co, i); + } + _PyCode_Quicken(dst, code_len, co->co_consts, 1); +} + +static Py_ssize_t +get_pow2_greater(Py_ssize_t initial, Py_ssize_t limit) +{ + // initial must be a power of two + assert(!(initial & (initial - 1))); + Py_ssize_t res = initial; + while (res && res < limit) { + res <<= 1; + } + return res; +} + +static _Py_CODEUNIT * +create_tlbc_lock_held(PyCodeObject *co, Py_ssize_t idx) +{ + _PyCodeArray *tlbc = co->co_tlbc; + if (idx >= tlbc->size) { + Py_ssize_t new_size = get_pow2_greater(tlbc->size, idx + 1); + if (!new_size) { + PyErr_NoMemory(); + return NULL; + } + _PyCodeArray *new_tlbc = _PyCodeArray_New(new_size); + if (new_tlbc == NULL) { + return NULL; + } + memcpy(new_tlbc->entries, tlbc->entries, tlbc->size * sizeof(void *)); + _Py_atomic_store_ptr_release(&co->co_tlbc, new_tlbc); + _PyMem_FreeDelayed(tlbc); + tlbc = new_tlbc; + } + char *bc = PyMem_Calloc(1, _PyCode_NBYTES(co)); + if (bc == NULL) { + PyErr_NoMemory(); + return NULL; + } + copy_code((_Py_CODEUNIT *) bc, co); + assert(tlbc->entries[idx] == 
NULL); + tlbc->entries[idx] = bc; + return (_Py_CODEUNIT *) bc; +} + +static _Py_CODEUNIT * +get_tlbc_lock_held(PyCodeObject *co) +{ + _PyCodeArray *tlbc = co->co_tlbc; + _PyThreadStateImpl *tstate = (_PyThreadStateImpl *)PyThreadState_GET(); + int32_t idx = tstate->tlbc_index; + if (idx < tlbc->size && tlbc->entries[idx] != NULL) { + return (_Py_CODEUNIT *)tlbc->entries[idx]; + } + return create_tlbc_lock_held(co, idx); +} + +_Py_CODEUNIT * +_PyCode_GetTLBC(PyCodeObject *co) +{ + _Py_CODEUNIT *result; + Py_BEGIN_CRITICAL_SECTION(co); + result = get_tlbc_lock_held(co); + Py_END_CRITICAL_SECTION(); + return result; +} + +// My kingdom for a bitset +struct flag_set { + uint8_t *flags; + Py_ssize_t size; +}; + +static inline int +flag_is_set(struct flag_set *flags, Py_ssize_t idx) +{ + assert(idx >= 0); + return (idx < flags->size) && flags->flags[idx]; +} + +// Set the flag for each tlbc index in use +static int +get_indices_in_use(PyInterpreterState *interp, struct flag_set *in_use) +{ + assert(interp->stoptheworld.world_stopped); + assert(in_use->flags == NULL); + int32_t max_index = 0; + for (PyThreadState *p = interp->threads.head; p != NULL; p = p->next) { + int32_t idx = ((_PyThreadStateImpl *) p)->tlbc_index; + if (idx > max_index) { + max_index = idx; + } + } + in_use->size = (size_t) max_index + 1; + in_use->flags = PyMem_Calloc(in_use->size, sizeof(*in_use->flags)); + if (in_use->flags == NULL) { + return -1; + } + for (PyThreadState *p = interp->threads.head; p != NULL; p = p->next) { + in_use->flags[((_PyThreadStateImpl *) p)->tlbc_index] = 1; + } + return 0; +} + +struct get_code_args { + _PyObjectStack code_objs; + struct flag_set indices_in_use; + int err; +}; + +static void +clear_get_code_args(struct get_code_args *args) +{ + if (args->indices_in_use.flags != NULL) { + PyMem_Free(args->indices_in_use.flags); + args->indices_in_use.flags = NULL; + } + _PyObjectStack_Clear(&args->code_objs); +} + +static inline int +is_bytecode_unused(_PyCodeArray 
*tlbc, Py_ssize_t idx, + struct flag_set *indices_in_use) +{ + assert(idx > 0 && idx < tlbc->size); + return tlbc->entries[idx] != NULL && !flag_is_set(indices_in_use, idx); +} + +static int +get_code_with_unused_tlbc(PyObject *obj, struct get_code_args *args) +{ + if (!PyCode_Check(obj)) { + return 1; + } + PyCodeObject *co = (PyCodeObject *) obj; + _PyCodeArray *tlbc = co->co_tlbc; + // The first index always points at the main copy of the bytecode embedded + // in the code object. + for (Py_ssize_t i = 1; i < tlbc->size; i++) { + if (is_bytecode_unused(tlbc, i, &args->indices_in_use)) { + if (_PyObjectStack_Push(&args->code_objs, obj) < 0) { + args->err = -1; + return 0; + } + return 1; + } + } + return 1; +} + +static void +free_unused_bytecode(PyCodeObject *co, struct flag_set *indices_in_use) +{ + _PyCodeArray *tlbc = co->co_tlbc; + // The first index always points at the main copy of the bytecode embedded + // in the code object. + for (Py_ssize_t i = 1; i < tlbc->size; i++) { + if (is_bytecode_unused(tlbc, i, indices_in_use)) { + PyMem_Free(tlbc->entries[i]); + tlbc->entries[i] = NULL; + } + } +} + +int +_Py_ClearUnusedTLBC(PyInterpreterState *interp) +{ + struct get_code_args args = { + .code_objs = {NULL}, + .indices_in_use = {NULL, 0}, + .err = 0, + }; + _PyEval_StopTheWorld(interp); + // Collect in-use tlbc indices + if (get_indices_in_use(interp, &args.indices_in_use) < 0) { + goto err; + } + // Collect code objects that have bytecode not in use by any thread + _PyGC_VisitObjectsWorldStopped( + interp, (gcvisitobjects_t)get_code_with_unused_tlbc, &args); + if (args.err < 0) { + goto err; + } + // Free unused bytecode. This must happen outside of gc_visit_heaps; it is + // unsafe to allocate or free any mimalloc managed memory when it's + // running. 
+ PyObject *obj; + while ((obj = _PyObjectStack_Pop(&args.code_objs)) != NULL) { + free_unused_bytecode((PyCodeObject*) obj, &args.indices_in_use); + } + _PyEval_StartTheWorld(interp); + clear_get_code_args(&args); + return 0; + +err: + _PyEval_StartTheWorld(interp); + clear_get_code_args(&args); + PyErr_NoMemory(); + return -1; +} + +#endif diff --git a/Objects/frameobject.c b/Objects/frameobject.c index 55394afa523213..c743c254848d3a 100644 --- a/Objects/frameobject.c +++ b/Objects/frameobject.c @@ -1651,7 +1651,7 @@ frame_setlineno(PyFrameObject *f, PyObject* p_new_lineno, void *Py_UNUSED(ignore } /* Finally set the new lasti and return OK. */ f->f_lineno = 0; - f->f_frame->instr_ptr = _PyCode_CODE(code) + best_addr; + f->f_frame->instr_ptr = _PyFrame_GetBytecode(f->f_frame) + best_addr; return 0; } @@ -1867,10 +1867,11 @@ PyTypeObject PyFrame_Type = { }; static void -init_frame(_PyInterpreterFrame *frame, PyFunctionObject *func, PyObject *locals) +init_frame(PyThreadState *tstate, _PyInterpreterFrame *frame, + PyFunctionObject *func, PyObject *locals) { PyCodeObject *code = (PyCodeObject *)func->func_code; - _PyFrame_Initialize(frame, PyStackRef_FromPyObjectNew(func), + _PyFrame_Initialize(tstate, frame, PyStackRef_FromPyObjectNew(func), Py_XNewRef(locals), code, 0, NULL); } @@ -1922,7 +1923,7 @@ PyFrame_New(PyThreadState *tstate, PyCodeObject *code, Py_DECREF(func); return NULL; } - init_frame((_PyInterpreterFrame *)f->_f_frame_data, func, locals); + init_frame(tstate, (_PyInterpreterFrame *)f->_f_frame_data, func, locals); f->f_frame = (_PyInterpreterFrame *)f->_f_frame_data; f->f_frame->owner = FRAME_OWNED_BY_FRAME_OBJECT; // This frame needs to be "complete", so pretend that the first RESUME ran: @@ -1941,7 +1942,8 @@ frame_init_get_vars(_PyInterpreterFrame *frame) // here: PyCodeObject *co = _PyFrame_GetCode(frame); int lasti = _PyInterpreterFrame_LASTI(frame); - if (!(lasti < 0 && _PyCode_CODE(co)->op.code == COPY_FREE_VARS + if (!(lasti < 0 + && 
_PyFrame_GetBytecode(frame)->op.code == COPY_FREE_VARS && PyStackRef_FunctionCheck(frame->f_funcobj))) { /* Free vars are initialized */ @@ -1957,7 +1959,7 @@ frame_init_get_vars(_PyInterpreterFrame *frame) frame->localsplus[offset + i] = PyStackRef_FromPyObjectNew(o); } // COPY_FREE_VARS doesn't have inline CACHEs, either: - frame->instr_ptr = _PyCode_CODE(_PyFrame_GetCode(frame)); + frame->instr_ptr = _PyFrame_GetBytecode(frame); } diff --git a/Objects/typeobject.c b/Objects/typeobject.c index b4a11195613d74..40225313a8a33b 100644 --- a/Objects/typeobject.c +++ b/Objects/typeobject.c @@ -11638,9 +11638,10 @@ super_descr_get(PyObject *self, PyObject *obj, PyObject *type) } static int -super_init_without_args(_PyInterpreterFrame *cframe, PyCodeObject *co, - PyTypeObject **type_p, PyObject **obj_p) +super_init_without_args(_PyInterpreterFrame *cframe, PyTypeObject **type_p, + PyObject **obj_p) { + PyCodeObject *co = _PyFrame_GetCode(cframe); if (co->co_argcount == 0) { PyErr_SetString(PyExc_RuntimeError, "super(): no arguments"); @@ -11740,7 +11741,7 @@ super_init_impl(PyObject *self, PyTypeObject *type, PyObject *obj) { "super(): no current frame"); return -1; } - int res = super_init_without_args(frame, _PyFrame_GetCode(frame), &type, &obj); + int res = super_init_without_args(frame, &type, &obj); if (res < 0) { return -1; diff --git a/PCbuild/_freeze_module.vcxproj b/PCbuild/_freeze_module.vcxproj index a3c2d32c454e04..51b493f8a84c6f 100644 --- a/PCbuild/_freeze_module.vcxproj +++ b/PCbuild/_freeze_module.vcxproj @@ -222,6 +222,7 @@ + diff --git a/PCbuild/_freeze_module.vcxproj.filters b/PCbuild/_freeze_module.vcxproj.filters index 91b1d75fb8df5e..09a5f4d30ef490 100644 --- a/PCbuild/_freeze_module.vcxproj.filters +++ b/PCbuild/_freeze_module.vcxproj.filters @@ -232,6 +232,9 @@ Source Files + + Source Files + Source Files diff --git a/PCbuild/pythoncore.vcxproj b/PCbuild/pythoncore.vcxproj index a4881e9256e4dd..f840e7fd61f985 100644 --- 
a/PCbuild/pythoncore.vcxproj +++ b/PCbuild/pythoncore.vcxproj @@ -255,6 +255,7 @@ + @@ -614,6 +615,7 @@ + diff --git a/PCbuild/pythoncore.vcxproj.filters b/PCbuild/pythoncore.vcxproj.filters index 740790cc5e1119..a930cd0b0b10c6 100644 --- a/PCbuild/pythoncore.vcxproj.filters +++ b/PCbuild/pythoncore.vcxproj.filters @@ -687,6 +687,9 @@ Include\internal + + Include\internal + Include\internal @@ -1373,6 +1376,9 @@ Python + + Python + Python diff --git a/Python/bytecodes.c b/Python/bytecodes.c index fa98af12c69aef..2c78cb9931733d 100644 --- a/Python/bytecodes.c +++ b/Python/bytecodes.c @@ -168,11 +168,11 @@ dummy_func( } op(_QUICKEN_RESUME, (--)) { - #if ENABLE_SPECIALIZATION + #if ENABLE_SPECIALIZATION_FT if (tstate->tracing == 0 && this_instr->op.code == RESUME) { FT_ATOMIC_STORE_UINT8_RELAXED(this_instr->op.code, RESUME_CHECK); } - #endif /* ENABLE_SPECIALIZATION */ + #endif /* ENABLE_SPECIALIZATION_FT */ } tier1 op(_MAYBE_INSTRUMENT, (--)) { @@ -190,7 +190,26 @@ dummy_func( } } + op(_LOAD_BYTECODE, (--)) { + #ifdef Py_GIL_DISABLED + if (frame->tlbc_index != + ((_PyThreadStateImpl *)tstate)->tlbc_index) { + _Py_CODEUNIT *bytecode = + _PyEval_GetExecutableCode(tstate, _PyFrame_GetCode(frame)); + ERROR_IF(bytecode == NULL, error); + int off = this_instr - _PyFrame_GetBytecode(frame); + frame->tlbc_index = ((_PyThreadStateImpl *)tstate)->tlbc_index; + frame->instr_ptr = bytecode + off; + // Make sure this_instr gets reset correctly for any uops that + // follow + next_instr = frame->instr_ptr; + DISPATCH(); + } + #endif + } + macro(RESUME) = + _LOAD_BYTECODE + _MAYBE_INSTRUMENT + _QUICKEN_RESUME + _CHECK_PERIODIC_IF_NOT_YIELD_FROM; @@ -204,6 +223,10 @@ dummy_func( uintptr_t version = FT_ATOMIC_LOAD_UINTPTR_ACQUIRE(_PyFrame_GetCode(frame)->_co_instrumentation_version); assert((version & _PY_EVAL_EVENTS_MASK) == 0); DEOPT_IF(eval_breaker != version); + #ifdef Py_GIL_DISABLED + DEOPT_IF(frame->tlbc_index != + ((_PyThreadStateImpl *)tstate)->tlbc_index); + #endif } 
op(_MONITOR_RESUME, (--)) { @@ -217,6 +240,7 @@ dummy_func( } macro(INSTRUMENTED_RESUME) = + _LOAD_BYTECODE + _MAYBE_INSTRUMENT + _CHECK_PERIODIC_IF_NOT_YIELD_FROM + _MONITOR_RESUME; @@ -682,8 +706,8 @@ dummy_func( }; specializing op(_SPECIALIZE_BINARY_SUBSCR, (counter/1, container, sub -- container, sub)) { - assert(frame->stackpointer == NULL); #if ENABLE_SPECIALIZATION + assert(frame->stackpointer == NULL); if (ADAPTIVE_COUNTER_TRIGGERS(counter)) { next_instr = this_instr; _Py_Specialize_BinarySubscr(container, sub, next_instr); @@ -1236,7 +1260,7 @@ dummy_func( if (oparg) { PyObject *lasti = PyStackRef_AsPyObjectBorrow(values[0]); if (PyLong_Check(lasti)) { - frame->instr_ptr = _PyCode_CODE(_PyFrame_GetCode(frame)) + PyLong_AsLong(lasti); + frame->instr_ptr = _PyFrame_GetBytecode(frame) + PyLong_AsLong(lasti); assert(!_PyErr_Occurred(tstate)); } else { @@ -2671,9 +2695,7 @@ dummy_func( assert(PyStackRef_BoolCheck(cond)); int flag = PyStackRef_Is(cond, PyStackRef_False); DEAD(cond); - #if ENABLE_SPECIALIZATION - this_instr[1].cache = (this_instr[1].cache << 1) | flag; - #endif + RECORD_BRANCH_TAKEN(this_instr[1].cache, flag); JUMPBY(oparg * flag); } @@ -2681,9 +2703,7 @@ dummy_func( assert(PyStackRef_BoolCheck(cond)); int flag = PyStackRef_Is(cond, PyStackRef_True); DEAD(cond); - #if ENABLE_SPECIALIZATION - this_instr[1].cache = (this_instr[1].cache << 1) | flag; - #endif + RECORD_BRANCH_TAKEN(this_instr[1].cache, flag); JUMPBY(oparg * flag); } @@ -3697,7 +3717,7 @@ dummy_func( op(_CREATE_INIT_FRAME, (init[1], self[1], args[oparg] -- init_frame: _PyInterpreterFrame *)) { _PyInterpreterFrame *shim = _PyFrame_PushTrampolineUnchecked( tstate, (PyCodeObject *)&_Py_InitCleanup, 1, frame); - assert(_PyCode_CODE(_PyFrame_GetCode(shim))[0].op.code == EXIT_INIT_CHECK); + assert(_PyFrame_GetBytecode(shim)[0].op.code == EXIT_INIT_CHECK); /* Push self onto stack of shim */ shim->localsplus[0] = PyStackRef_DUP(self[0]); DEAD(init); @@ -4593,7 +4613,7 @@ dummy_func( } 
specializing op(_SPECIALIZE_BINARY_OP, (counter/1, lhs, rhs -- lhs, rhs)) { - #if ENABLE_SPECIALIZATION + #if ENABLE_SPECIALIZATION_FT if (ADAPTIVE_COUNTER_TRIGGERS(counter)) { next_instr = this_instr; _Py_Specialize_BinaryOp(lhs, rhs, next_instr, oparg, LOCALS_ARRAY); @@ -4601,7 +4621,7 @@ dummy_func( } OPCODE_DEFERRED_INC(BINARY_OP); ADVANCE_ADAPTIVE_COUNTER(this_instr[1].counter); - #endif /* ENABLE_SPECIALIZATION */ + #endif /* ENABLE_SPECIALIZATION_FT */ assert(NB_ADD <= oparg); assert(oparg <= NB_INPLACE_XOR); } @@ -4632,7 +4652,7 @@ dummy_func( int original_opcode = 0; if (tstate->tracing) { PyCodeObject *code = _PyFrame_GetCode(frame); - original_opcode = code->_co_monitoring->lines[(int)(this_instr - _PyCode_CODE(code))].original_opcode; + original_opcode = code->_co_monitoring->lines[(int)(this_instr - _PyFrame_GetBytecode(frame))].original_opcode; next_instr = this_instr; } else { original_opcode = _Py_call_instrumentation_line( @@ -4687,9 +4707,7 @@ dummy_func( assert(PyStackRef_BoolCheck(cond)); int flag = PyStackRef_Is(cond, PyStackRef_True); int offset = flag * oparg; - #if ENABLE_SPECIALIZATION - this_instr[1].cache = (this_instr[1].cache << 1) | flag; - #endif + RECORD_BRANCH_TAKEN(this_instr[1].cache, flag); INSTRUMENTED_JUMP(this_instr, next_instr + offset, PY_MONITORING_EVENT_BRANCH); } @@ -4698,9 +4716,7 @@ dummy_func( assert(PyStackRef_BoolCheck(cond)); int flag = PyStackRef_Is(cond, PyStackRef_False); int offset = flag * oparg; - #if ENABLE_SPECIALIZATION - this_instr[1].cache = (this_instr[1].cache << 1) | flag; - #endif + RECORD_BRANCH_TAKEN(this_instr[1].cache, flag); INSTRUMENTED_JUMP(this_instr, next_instr + offset, PY_MONITORING_EVENT_BRANCH); } @@ -4715,9 +4731,7 @@ dummy_func( PyStackRef_CLOSE(value_stackref); offset = 0; } - #if ENABLE_SPECIALIZATION - this_instr[1].cache = (this_instr[1].cache << 1) | flag; - #endif + RECORD_BRANCH_TAKEN(this_instr[1].cache, flag); INSTRUMENTED_JUMP(this_instr, next_instr + offset, 
PY_MONITORING_EVENT_BRANCH); } @@ -4815,7 +4829,7 @@ dummy_func( tier2 op(_EXIT_TRACE, (exit_p/4 --)) { _PyExitData *exit = (_PyExitData *)exit_p; PyCodeObject *code = _PyFrame_GetCode(frame); - _Py_CODEUNIT *target = _PyCode_CODE(code) + exit->target; + _Py_CODEUNIT *target = _PyFrame_GetBytecode(frame) + exit->target; #if defined(Py_DEBUG) && !defined(_Py_JIT) OPT_HIST(trace_uop_execution_counter, trace_run_length_hist); if (lltrace >= 2) { @@ -4823,7 +4837,7 @@ dummy_func( _PyUOpPrint(&next_uop[-1]); printf(", exit %u, temp %d, target %d -> %s]\n", exit - current_executor->exits, exit->temperature.value_and_backoff, - (int)(target - _PyCode_CODE(code)), + (int)(target - _PyFrame_GetBytecode(frame)), _PyOpcode_OpName[target->op.code]); } #endif @@ -4933,7 +4947,7 @@ dummy_func( _PyUOpPrint(&next_uop[-1]); printf(", exit %u, temp %d, target %d -> %s]\n", exit - current_executor->exits, exit->temperature.value_and_backoff, - (int)(target - _PyCode_CODE(_PyFrame_GetCode(frame))), + (int)(target - _PyFrame_GetBytecode(frame)), _PyOpcode_OpName[target->op.code]); } #endif @@ -4995,7 +5009,7 @@ dummy_func( } tier2 op(_ERROR_POP_N, (target/2, unused[oparg] --)) { - frame->instr_ptr = ((_Py_CODEUNIT *)_PyFrame_GetCode(frame)->co_code_adaptive) + target; + frame->instr_ptr = _PyFrame_GetBytecode(frame) + target; SYNC_SP(); GOTO_UNWIND(); } diff --git a/Python/ceval.c b/Python/ceval.c index beee5325cd6259..9a608f06966688 100644 --- a/Python/ceval.c +++ b/Python/ceval.c @@ -189,7 +189,7 @@ lltrace_instruction(_PyInterpreterFrame *frame, dump_stack(frame, stack_pointer); const char *opname = _PyOpcode_OpName[opcode]; assert(opname != NULL); - int offset = (int)(next_instr - _PyCode_CODE(_PyFrame_GetCode(frame))); + int offset = (int)(next_instr - _PyFrame_GetBytecode(frame)); if (OPCODE_HAS_ARG((int)_PyOpcode_Deopt[opcode])) { printf("%d: %s %d\n", offset * 2, opname, oparg); } @@ -841,6 +841,19 @@ _PyEval_EvalFrameDefault(PyThreadState *tstate, _PyInterpreterFrame *frame, 
int } /* Because this avoids the RESUME, * we need to update instrumentation */ +#ifdef Py_GIL_DISABLED + /* Load thread-local bytecode */ + if (frame->tlbc_index != ((_PyThreadStateImpl *)tstate)->tlbc_index) { + _Py_CODEUNIT *bytecode = + _PyEval_GetExecutableCode(tstate, _PyFrame_GetCode(frame)); + if (bytecode == NULL) { + goto error; + } + ptrdiff_t off = frame->instr_ptr - _PyFrame_GetBytecode(frame); + frame->tlbc_index = ((_PyThreadStateImpl *)tstate)->tlbc_index; + frame->instr_ptr = bytecode + off; + } +#endif _Py_Instrument(_PyFrame_GetCode(frame), tstate->interp); monitor_throw(tstate, frame, frame->instr_ptr); /* TO DO -- Monitor throw entry. */ @@ -983,7 +996,7 @@ _PyEval_EvalFrameDefault(PyThreadState *tstate, _PyInterpreterFrame *frame, int Python main loop. */ PyObject *exc = _PyErr_GetRaisedException(tstate); PUSH(PyStackRef_FromPyObjectSteal(exc)); - next_instr = _PyCode_CODE(_PyFrame_GetCode(frame)) + handler; + next_instr = _PyFrame_GetBytecode(frame) + handler; if (monitor_handled(tstate, frame, next_instr, exc) < 0) { goto exception_unwind; @@ -1045,6 +1058,8 @@ _PyEval_EvalFrameDefault(PyThreadState *tstate, _PyInterpreterFrame *frame, int #undef ENABLE_SPECIALIZATION #define ENABLE_SPECIALIZATION 0 +#undef ENABLE_SPECIALIZATION_FT +#define ENABLE_SPECIALIZATION_FT 0 #ifdef Py_DEBUG #define DPRINTF(level, ...) 
\ @@ -1139,7 +1154,7 @@ _PyEval_EvalFrameDefault(PyThreadState *tstate, _PyInterpreterFrame *frame, int goto goto_to_tier1; exit_to_tier1: assert(next_uop[-1].format == UOP_FORMAT_TARGET); - next_instr = next_uop[-1].target + _PyCode_CODE(_PyFrame_GetCode(frame)); + next_instr = next_uop[-1].target + _PyFrame_GetBytecode(frame); goto_to_tier1: #ifdef Py_DEBUG if (lltrace >= 2) { @@ -1764,7 +1779,7 @@ _PyEvalFramePushAndInit(PyThreadState *tstate, _PyStackRef func, if (frame == NULL) { goto fail; } - _PyFrame_Initialize(frame, func, locals, code, 0, previous); + _PyFrame_Initialize(tstate, frame, func, locals, code, 0, previous); if (initialize_locals(tstate, func_obj, frame->localsplus, args, argcount, kwnames)) { assert(frame->owner == FRAME_OWNED_BY_THREAD); clear_thread_frame(tstate, frame); diff --git a/Python/ceval_macros.h b/Python/ceval_macros.h index 6674c4ccf9f693..5df55813a0ddeb 100644 --- a/Python/ceval_macros.h +++ b/Python/ceval_macros.h @@ -151,7 +151,7 @@ GETITEM(PyObject *v, Py_ssize_t i) { /* Code access macros */ /* The integer overflow is checked by an assertion below. */ -#define INSTR_OFFSET() ((int)(next_instr - _PyCode_CODE(_PyFrame_GetCode(frame)))) +#define INSTR_OFFSET() ((int)(next_instr - _PyFrame_GetBytecode(frame))) #define NEXTOPARG() do { \ _Py_CODEUNIT word = {.cache = FT_ATOMIC_LOAD_UINT16_RELAXED(*(uint16_t*)next_instr)}; \ opcode = word.op.code; \ @@ -301,14 +301,6 @@ GETITEM(PyObject *v, Py_ssize_t i) { #define ADAPTIVE_COUNTER_TRIGGERS(COUNTER) \ backoff_counter_triggers(forge_backoff_counter((COUNTER))) -#ifdef Py_GIL_DISABLED -#define ADVANCE_ADAPTIVE_COUNTER(COUNTER) \ - do { \ - /* gh-115999 tracks progress on addressing this. 
*/ \ - static_assert(0, "The specializing interpreter is not yet thread-safe"); \ - } while (0); -#define PAUSE_ADAPTIVE_COUNTER(COUNTER) ((void)COUNTER) -#else #define ADVANCE_ADAPTIVE_COUNTER(COUNTER) \ do { \ (COUNTER) = advance_backoff_counter((COUNTER)); \ @@ -318,6 +310,18 @@ GETITEM(PyObject *v, Py_ssize_t i) { do { \ (COUNTER) = pause_backoff_counter((COUNTER)); \ } while (0); + +#ifdef ENABLE_SPECIALIZATION_FT +/* Multiple threads may execute these concurrently if thread-local bytecode is + * disabled and they all execute the main copy of the bytecode. Specialization + * is disabled in that case so the value is unused, but the RMW cycle should be + * free of data races. + */ +#define RECORD_BRANCH_TAKEN(bitset, flag) \ + FT_ATOMIC_STORE_UINT16_RELAXED( \ + bitset, (FT_ATOMIC_LOAD_UINT16_RELAXED(bitset) << 1) | (flag)) +#else +#define RECORD_BRANCH_TAKEN(bitset, flag) #endif #define UNBOUNDLOCAL_ERROR_MSG \ diff --git a/Python/executor_cases.c.h b/Python/executor_cases.c.h index ff4a0a52a0b445..9fac4e881b81e2 100644 --- a/Python/executor_cases.c.h +++ b/Python/executor_cases.c.h @@ -41,6 +41,8 @@ /* _QUICKEN_RESUME is not a viable micro-op for tier 2 because it uses the 'this_instr' variable */ + /* _LOAD_BYTECODE is not a viable micro-op for tier 2 because it uses the 'this_instr' variable */ + case _RESUME_CHECK: { #if defined(__EMSCRIPTEN__) if (_Py_emscripten_signal_clock == 0) { @@ -56,6 +58,13 @@ UOP_STAT_INC(uopcode, miss); JUMP_TO_JUMP_TARGET(); } + #ifdef Py_GIL_DISABLED + if (frame->tlbc_index != + ((_PyThreadStateImpl *)tstate)->tlbc_index) { + UOP_STAT_INC(uopcode, miss); + JUMP_TO_JUMP_TARGET(); + } + #endif break; } @@ -4480,8 +4489,8 @@ _PyFrame_SetStackPointer(frame, stack_pointer); _PyInterpreterFrame *shim = _PyFrame_PushTrampolineUnchecked( tstate, (PyCodeObject *)&_Py_InitCleanup, 1, frame); + assert(_PyFrame_GetBytecode(shim)[0].op.code == EXIT_INIT_CHECK); stack_pointer = _PyFrame_GetStackPointer(frame); - 
assert(_PyCode_CODE(_PyFrame_GetCode(shim))[0].op.code == EXIT_INIT_CHECK); /* Push self onto stack of shim */ shim->localsplus[0] = PyStackRef_DUP(self[0]); _PyFrame_SetStackPointer(frame, stack_pointer); @@ -5683,7 +5692,9 @@ PyObject *exit_p = (PyObject *)CURRENT_OPERAND(); _PyExitData *exit = (_PyExitData *)exit_p; PyCodeObject *code = _PyFrame_GetCode(frame); - _Py_CODEUNIT *target = _PyCode_CODE(code) + exit->target; + _PyFrame_SetStackPointer(frame, stack_pointer); + _Py_CODEUNIT *target = _PyFrame_GetBytecode(frame) + exit->target; + stack_pointer = _PyFrame_GetStackPointer(frame); #if defined(Py_DEBUG) && !defined(_Py_JIT) OPT_HIST(trace_uop_execution_counter, trace_run_length_hist); if (lltrace >= 2) { @@ -5692,7 +5703,7 @@ _PyUOpPrint(&next_uop[-1]); printf(", exit %u, temp %d, target %d -> %s]\n", exit - current_executor->exits, exit->temperature.value_and_backoff, - (int)(target - _PyCode_CODE(code)), + (int)(target - _PyFrame_GetBytecode(frame)), _PyOpcode_OpName[target->op.code]); stack_pointer = _PyFrame_GetStackPointer(frame); } @@ -5878,7 +5889,7 @@ _PyUOpPrint(&next_uop[-1]); printf(", exit %u, temp %d, target %d -> %s]\n", exit - current_executor->exits, exit->temperature.value_and_backoff, - (int)(target - _PyCode_CODE(_PyFrame_GetCode(frame))), + (int)(target - _PyFrame_GetBytecode(frame)), _PyOpcode_OpName[target->op.code]); stack_pointer = _PyFrame_GetStackPointer(frame); } @@ -5956,9 +5967,11 @@ case _ERROR_POP_N: { oparg = CURRENT_OPARG(); uint32_t target = (uint32_t)CURRENT_OPERAND(); - frame->instr_ptr = ((_Py_CODEUNIT *)_PyFrame_GetCode(frame)->co_code_adaptive) + target; stack_pointer += -oparg; assert(WITHIN_STACK_BOUNDS()); + _PyFrame_SetStackPointer(frame, stack_pointer); + frame->instr_ptr = _PyFrame_GetBytecode(frame) + target; + stack_pointer = _PyFrame_GetStackPointer(frame); GOTO_UNWIND(); break; } diff --git a/Python/frame.c b/Python/frame.c index 35e6c2d0a93333..9a865e57d97cc6 100644 --- a/Python/frame.c +++ b/Python/frame.c 
@@ -63,7 +63,8 @@ take_ownership(PyFrameObject *f, _PyInterpreterFrame *frame) // This may be a newly-created generator or coroutine frame. Since it's // dead anyways, just pretend that the first RESUME ran: PyCodeObject *code = _PyFrame_GetCode(frame); - frame->instr_ptr = _PyCode_CODE(code) + code->_co_firsttraceable + 1; + frame->instr_ptr = + _PyFrame_GetBytecode(frame) + code->_co_firsttraceable + 1; } assert(!_PyFrame_IsIncomplete(frame)); assert(f->f_back == NULL); diff --git a/Python/gc_free_threading.c b/Python/gc_free_threading.c index 1969ed608ea524..986d80c18d36c8 100644 --- a/Python/gc_free_threading.c +++ b/Python/gc_free_threading.c @@ -1953,16 +1953,22 @@ custom_visitor_wrapper(const mi_heap_t *heap, const mi_heap_area_t *area, } void -PyUnstable_GC_VisitObjects(gcvisitobjects_t callback, void *arg) +_PyGC_VisitObjectsWorldStopped(PyInterpreterState *interp, + gcvisitobjects_t callback, void *arg) { - PyInterpreterState *interp = _PyInterpreterState_GET(); struct custom_visitor_args wrapper = { .callback = callback, .arg = arg, }; + gc_visit_heaps(interp, &custom_visitor_wrapper, &wrapper.base); +} +void +PyUnstable_GC_VisitObjects(gcvisitobjects_t callback, void *arg) +{ + PyInterpreterState *interp = _PyInterpreterState_GET(); _PyEval_StopTheWorld(interp); - gc_visit_heaps(interp, &custom_visitor_wrapper, &wrapper.base); + _PyGC_VisitObjectsWorldStopped(interp, callback, arg); _PyEval_StartTheWorld(interp); } diff --git a/Python/generated_cases.c.h b/Python/generated_cases.c.h index 632cbc7790a4d8..eff246f1997276 100644 --- a/Python/generated_cases.c.h +++ b/Python/generated_cases.c.h @@ -25,7 +25,7 @@ lhs = stack_pointer[-2]; uint16_t counter = read_u16(&this_instr[1].cache); (void)counter; - #if ENABLE_SPECIALIZATION + #if ENABLE_SPECIALIZATION_FT if (ADAPTIVE_COUNTER_TRIGGERS(counter)) { next_instr = this_instr; _PyFrame_SetStackPointer(frame, stack_pointer); @@ -35,7 +35,7 @@ } OPCODE_DEFERRED_INC(BINARY_OP); 
ADVANCE_ADAPTIVE_COUNTER(this_instr[1].counter); - #endif /* ENABLE_SPECIALIZATION */ + #endif /* ENABLE_SPECIALIZATION_FT */ assert(NB_ADD <= oparg); assert(oparg <= NB_INPLACE_XOR); } @@ -435,8 +435,8 @@ container = stack_pointer[-2]; uint16_t counter = read_u16(&this_instr[1].cache); (void)counter; - assert(frame->stackpointer == NULL); #if ENABLE_SPECIALIZATION + assert(frame->stackpointer == NULL); if (ADAPTIVE_COUNTER_TRIGGERS(counter)) { next_instr = this_instr; _PyFrame_SetStackPointer(frame, stack_pointer); @@ -1066,8 +1066,8 @@ _PyFrame_SetStackPointer(frame, stack_pointer); _PyInterpreterFrame *shim = _PyFrame_PushTrampolineUnchecked( tstate, (PyCodeObject *)&_Py_InitCleanup, 1, frame); + assert(_PyFrame_GetBytecode(shim)[0].op.code == EXIT_INIT_CHECK); stack_pointer = _PyFrame_GetStackPointer(frame); - assert(_PyCode_CODE(_PyFrame_GetCode(shim))[0].op.code == EXIT_INIT_CHECK); /* Push self onto stack of shim */ shim->localsplus[0] = PyStackRef_DUP(self[0]); _PyFrame_SetStackPointer(frame, stack_pointer); @@ -4711,7 +4711,9 @@ int original_opcode = 0; if (tstate->tracing) { PyCodeObject *code = _PyFrame_GetCode(frame); - original_opcode = code->_co_monitoring->lines[(int)(this_instr - _PyCode_CODE(code))].original_opcode; + _PyFrame_SetStackPointer(frame, stack_pointer); + original_opcode = code->_co_monitoring->lines[(int)(this_instr - _PyFrame_GetBytecode(frame))].original_opcode; + stack_pointer = _PyFrame_GetStackPointer(frame); next_instr = this_instr; } else { _PyFrame_SetStackPointer(frame, stack_pointer); @@ -4759,9 +4761,7 @@ assert(PyStackRef_BoolCheck(cond)); int flag = PyStackRef_Is(cond, PyStackRef_False); int offset = flag * oparg; - #if ENABLE_SPECIALIZATION - this_instr[1].cache = (this_instr[1].cache << 1) | flag; - #endif + RECORD_BRANCH_TAKEN(this_instr[1].cache, flag); INSTRUMENTED_JUMP(this_instr, next_instr + offset, PY_MONITORING_EVENT_BRANCH); DISPATCH(); } @@ -4782,9 +4782,7 @@ PyStackRef_CLOSE(value_stackref); offset = 0; } - 
#if ENABLE_SPECIALIZATION - this_instr[1].cache = (this_instr[1].cache << 1) | flag; - #endif + RECORD_BRANCH_TAKEN(this_instr[1].cache, flag); INSTRUMENTED_JUMP(this_instr, next_instr + offset, PY_MONITORING_EVENT_BRANCH); DISPATCH(); } @@ -4822,9 +4820,7 @@ assert(PyStackRef_BoolCheck(cond)); int flag = PyStackRef_Is(cond, PyStackRef_True); int offset = flag * oparg; - #if ENABLE_SPECIALIZATION - this_instr[1].cache = (this_instr[1].cache << 1) | flag; - #endif + RECORD_BRANCH_TAKEN(this_instr[1].cache, flag); INSTRUMENTED_JUMP(this_instr, next_instr + offset, PY_MONITORING_EVENT_BRANCH); DISPATCH(); } @@ -4834,6 +4830,28 @@ (void)this_instr; next_instr += 1; INSTRUCTION_STATS(INSTRUMENTED_RESUME); + // _LOAD_BYTECODE + { + #ifdef Py_GIL_DISABLED + if (frame->tlbc_index != + ((_PyThreadStateImpl *)tstate)->tlbc_index) { + _PyFrame_SetStackPointer(frame, stack_pointer); + _Py_CODEUNIT *bytecode = + _PyEval_GetExecutableCode(tstate, _PyFrame_GetCode(frame)); + stack_pointer = _PyFrame_GetStackPointer(frame); + if (bytecode == NULL) goto error; + _PyFrame_SetStackPointer(frame, stack_pointer); + int off = this_instr - _PyFrame_GetBytecode(frame); + stack_pointer = _PyFrame_GetStackPointer(frame); + frame->tlbc_index = ((_PyThreadStateImpl *)tstate)->tlbc_index; + frame->instr_ptr = bytecode + off; + // Make sure this_instr gets reset correctley for any uops that + // follow + next_instr = frame->instr_ptr; + DISPATCH(); + } + #endif + } // _MAYBE_INSTRUMENT { if (tstate->tracing == 0) { @@ -6646,9 +6664,7 @@ cond = stack_pointer[-1]; assert(PyStackRef_BoolCheck(cond)); int flag = PyStackRef_Is(cond, PyStackRef_False); - #if ENABLE_SPECIALIZATION - this_instr[1].cache = (this_instr[1].cache << 1) | flag; - #endif + RECORD_BRANCH_TAKEN(this_instr[1].cache, flag); JUMPBY(oparg * flag); stack_pointer += -1; assert(WITHIN_STACK_BOUNDS()); @@ -6680,9 +6696,7 @@ cond = b; assert(PyStackRef_BoolCheck(cond)); int flag = PyStackRef_Is(cond, PyStackRef_True); - #if 
ENABLE_SPECIALIZATION - this_instr[1].cache = (this_instr[1].cache << 1) | flag; - #endif + RECORD_BRANCH_TAKEN(this_instr[1].cache, flag); JUMPBY(oparg * flag); } stack_pointer += -1; @@ -6715,9 +6729,7 @@ cond = b; assert(PyStackRef_BoolCheck(cond)); int flag = PyStackRef_Is(cond, PyStackRef_False); - #if ENABLE_SPECIALIZATION - this_instr[1].cache = (this_instr[1].cache << 1) | flag; - #endif + RECORD_BRANCH_TAKEN(this_instr[1].cache, flag); JUMPBY(oparg * flag); } stack_pointer += -1; @@ -6735,9 +6747,7 @@ cond = stack_pointer[-1]; assert(PyStackRef_BoolCheck(cond)); int flag = PyStackRef_Is(cond, PyStackRef_True); - #if ENABLE_SPECIALIZATION - this_instr[1].cache = (this_instr[1].cache << 1) | flag; - #endif + RECORD_BRANCH_TAKEN(this_instr[1].cache, flag); JUMPBY(oparg * flag); stack_pointer += -1; assert(WITHIN_STACK_BOUNDS()); @@ -6832,7 +6842,11 @@ if (oparg) { PyObject *lasti = PyStackRef_AsPyObjectBorrow(values[0]); if (PyLong_Check(lasti)) { - frame->instr_ptr = _PyCode_CODE(_PyFrame_GetCode(frame)) + PyLong_AsLong(lasti); + stack_pointer += -1; + assert(WITHIN_STACK_BOUNDS()); + _PyFrame_SetStackPointer(frame, stack_pointer); + frame->instr_ptr = _PyFrame_GetBytecode(frame) + PyLong_AsLong(lasti); + stack_pointer = _PyFrame_GetStackPointer(frame); assert(!_PyErr_Occurred(tstate)); } else { @@ -6844,6 +6858,8 @@ Py_DECREF(exc); goto error; } + stack_pointer += 1; + assert(WITHIN_STACK_BOUNDS()); } assert(exc && PyExceptionInstance_Check(exc)); stack_pointer += -1; @@ -6871,6 +6887,28 @@ PREDICTED(RESUME); _Py_CODEUNIT* const this_instr = next_instr - 1; (void)this_instr; + // _LOAD_BYTECODE + { + #ifdef Py_GIL_DISABLED + if (frame->tlbc_index != + ((_PyThreadStateImpl *)tstate)->tlbc_index) { + _PyFrame_SetStackPointer(frame, stack_pointer); + _Py_CODEUNIT *bytecode = + _PyEval_GetExecutableCode(tstate, _PyFrame_GetCode(frame)); + stack_pointer = _PyFrame_GetStackPointer(frame); + if (bytecode == NULL) goto error; + _PyFrame_SetStackPointer(frame, 
stack_pointer); + int off = this_instr - _PyFrame_GetBytecode(frame); + stack_pointer = _PyFrame_GetStackPointer(frame); + frame->tlbc_index = ((_PyThreadStateImpl *)tstate)->tlbc_index; + frame->instr_ptr = bytecode + off; + // Make sure this_instr gets reset correctley for any uops that + // follow + next_instr = frame->instr_ptr; + DISPATCH(); + } + #endif + } // _MAYBE_INSTRUMENT { if (tstate->tracing == 0) { @@ -6890,11 +6928,11 @@ } // _QUICKEN_RESUME { - #if ENABLE_SPECIALIZATION + #if ENABLE_SPECIALIZATION_FT if (tstate->tracing == 0 && this_instr->op.code == RESUME) { FT_ATOMIC_STORE_UINT8_RELAXED(this_instr->op.code, RESUME_CHECK); } - #endif /* ENABLE_SPECIALIZATION */ + #endif /* ENABLE_SPECIALIZATION_FT */ } // _CHECK_PERIODIC_IF_NOT_YIELD_FROM { @@ -6925,6 +6963,10 @@ uintptr_t version = FT_ATOMIC_LOAD_UINTPTR_ACQUIRE(_PyFrame_GetCode(frame)->_co_instrumentation_version); assert((version & _PY_EVAL_EVENTS_MASK) == 0); DEOPT_IF(eval_breaker != version, RESUME); + #ifdef Py_GIL_DISABLED + DEOPT_IF(frame->tlbc_index != + ((_PyThreadStateImpl *)tstate)->tlbc_index, RESUME); + #endif DISPATCH(); } diff --git a/Python/index_pool.c b/Python/index_pool.c new file mode 100644 index 00000000000000..526eccff74af00 --- /dev/null +++ b/Python/index_pool.c @@ -0,0 +1,193 @@ +#include + +#include "Python.h" + +#include "pycore_index_pool.h" +#include "pycore_lock.h" + +#ifdef Py_GIL_DISABLED + +static inline void +swap(int32_t *values, Py_ssize_t i, Py_ssize_t j) +{ + int32_t tmp = values[i]; + values[i] = values[j]; + values[j] = tmp; +} + +static bool +heap_try_swap(_PyIndexHeap *heap, Py_ssize_t i, Py_ssize_t j) +{ + if (i < 0 || i >= heap->size) { + return 0; + } + if (j < 0 || j >= heap->size) { + return 0; + } + if (i <= j) { + if (heap->values[i] <= heap->values[j]) { + return 0; + } + } + else if (heap->values[j] <= heap->values[i]) { + return 0; + } + swap(heap->values, i, j); + return 1; +} + +static inline Py_ssize_t +parent(Py_ssize_t i) +{ + return (i - 
1) / 2; +} + +static inline Py_ssize_t +left_child(Py_ssize_t i) +{ + return 2 * i + 1; +} + +static inline Py_ssize_t +right_child(Py_ssize_t i) +{ + return 2 * i + 2; +} + +static void +heap_add(_PyIndexHeap *heap, int32_t val) +{ + assert(heap->size < heap->capacity); + // Add val to end + heap->values[heap->size] = val; + heap->size++; + // Sift up + for (Py_ssize_t cur = heap->size - 1; cur > 0; cur = parent(cur)) { + if (!heap_try_swap(heap, cur, parent(cur))) { + break; + } + } +} + +static Py_ssize_t +heap_min_child(_PyIndexHeap *heap, Py_ssize_t i) +{ + if (left_child(i) < heap->size) { + if (right_child(i) < heap->size) { + Py_ssize_t lval = heap->values[left_child(i)]; + Py_ssize_t rval = heap->values[right_child(i)]; + return lval < rval ? left_child(i) : right_child(i); + } + return left_child(i); + } + else if (right_child(i) < heap->size) { + return right_child(i); + } + return -1; +} + +static int32_t +heap_pop(_PyIndexHeap *heap) +{ + assert(heap->size > 0); + // Pop smallest and replace with the last element + int32_t result = heap->values[0]; + heap->values[0] = heap->values[heap->size - 1]; + heap->size--; + // Sift down + for (Py_ssize_t cur = 0; cur < heap->size;) { + Py_ssize_t min_child = heap_min_child(heap, cur); + if (min_child > -1 && heap_try_swap(heap, cur, min_child)) { + cur = min_child; + } + else { + break; + } + } + return result; +} + +static int +heap_ensure_capacity(_PyIndexHeap *heap, Py_ssize_t limit) +{ + assert(limit > 0); + if (heap->capacity > limit) { + return 0; + } + Py_ssize_t new_capacity = heap->capacity ? 
heap->capacity : 1024; + while (new_capacity && new_capacity < limit) { + new_capacity <<= 1; + } + if (!new_capacity) { + return -1; + } + int32_t *new_values = PyMem_RawCalloc(new_capacity, sizeof(int32_t)); + if (new_values == NULL) { + return -1; + } + if (heap->values != NULL) { + memcpy(new_values, heap->values, heap->capacity); + PyMem_RawFree(heap->values); + } + heap->values = new_values; + heap->capacity = new_capacity; + return 0; +} + +static void +heap_fini(_PyIndexHeap *heap) +{ + if (heap->values != NULL) { + PyMem_RawFree(heap->values); + heap->values = NULL; + } + heap->size = -1; + heap->capacity = -1; +} + +#define LOCK_POOL(pool) PyMutex_LockFlags(&pool->mutex, _Py_LOCK_DONT_DETACH) +#define UNLOCK_POOL(pool) PyMutex_Unlock(&pool->mutex) + +int32_t +_PyIndexPool_AllocIndex(_PyIndexPool *pool) +{ + LOCK_POOL(pool); + int32_t index; + _PyIndexHeap *free_indices = &pool->free_indices; + if (free_indices->size == 0) { + // No free indices. Make sure the heap can always store all of the + // indices that have been allocated to avoid having to allocate memory + // (which can fail) when freeing an index. Freeing indices happens when + // threads are being destroyed, which makes error handling awkward / + // impossible. This arrangement shifts handling of allocation failures + // to when indices are allocated, which happens at thread creation, + // where we are better equipped to deal with failure. 
+ if (heap_ensure_capacity(free_indices, pool->next_index + 1) < 0) { + UNLOCK_POOL(pool); + PyErr_NoMemory(); + return -1; + } + index = pool->next_index++; + } + else { + index = heap_pop(free_indices); + } + UNLOCK_POOL(pool); + return index; +} + +void +_PyIndexPool_FreeIndex(_PyIndexPool *pool, int32_t index) +{ + LOCK_POOL(pool); + heap_add(&pool->free_indices, index); + UNLOCK_POOL(pool); +} + +void +_PyIndexPool_Fini(_PyIndexPool *pool) +{ + heap_fini(&pool->free_indices); +} + +#endif // Py_GIL_DISABLED diff --git a/Python/initconfig.c b/Python/initconfig.c index c142438b02bfd9..438f8a5c1cf1ce 100644 --- a/Python/initconfig.c +++ b/Python/initconfig.c @@ -134,6 +134,7 @@ static const PyConfigSpec PYCONFIG_SPEC[] = { SPEC(dump_refs_file, WSTR_OPT, READ_ONLY, NO_SYS), #ifdef Py_GIL_DISABLED SPEC(enable_gil, INT, READ_ONLY, NO_SYS), + SPEC(tlbc_enabled, INT, READ_ONLY, NO_SYS), #endif SPEC(faulthandler, BOOL, READ_ONLY, NO_SYS), SPEC(filesystem_encoding, WSTR, READ_ONLY, NO_SYS), @@ -315,8 +316,13 @@ The following implementation-specific options are available:\n\ "\ -X showrefcount: output the total reference count and number of used\n\ memory blocks when the program finishes or after each statement in\n\ - the interactive interpreter; only works on debug builds\n\ --X tracemalloc[=N]: trace Python memory allocations; N sets a traceback limit\n\ + the interactive interpreter; only works on debug builds\n" +#ifdef Py_GIL_DISABLED +"-X tlbc=[0|1]: enable (1) or disable (0) thread-local bytecode. 
Also\n\ + PYTHON_TLBC\n" +#endif +"\ +-X tracemalloc[=N]: trace Python memory allocations; N sets a traceback limit\n \ of N frames (default: 1); also PYTHONTRACEMALLOC=N\n\ -X utf8[=0|1]: enable (1) or disable (0) UTF-8 mode; also PYTHONUTF8\n\ -X warn_default_encoding: enable opt-in EncodingWarning for 'encoding=None';\n\ @@ -400,6 +406,9 @@ static const char usage_envvars[] = #ifdef Py_STATS "PYTHONSTATS : turns on statistics gathering (-X pystats)\n" #endif +#ifdef Py_GIL_DISABLED +"PYTHON_TLBC : when set to 0, disables thread-local bytecode (-X tlbc)\n" +#endif "PYTHONTRACEMALLOC: trace Python memory allocations (-X tracemalloc)\n" "PYTHONUNBUFFERED: disable stdout/stderr buffering (-u)\n" "PYTHONUTF8 : control the UTF-8 mode (-X utf8)\n" @@ -979,6 +988,7 @@ _PyConfig_InitCompatConfig(PyConfig *config) config->cpu_count = -1; #ifdef Py_GIL_DISABLED config->enable_gil = _PyConfig_GIL_DEFAULT; + config->tlbc_enabled = 1; #endif } @@ -1862,6 +1872,36 @@ config_init_cpu_count(PyConfig *config) "n must be greater than 0"); } +static PyStatus +config_init_tlbc(PyConfig *config) +{ +#ifdef Py_GIL_DISABLED + const char *env = config_get_env(config, "PYTHON_TLBC"); + if (env) { + int enabled; + if (_Py_str_to_int(env, &enabled) < 0 || (enabled < 0) || (enabled > 1)) { + return _PyStatus_ERR( + "PYTHON_TLBC=N: N is missing or invalid"); + } + config->tlbc_enabled = enabled; + } + + const wchar_t *xoption = config_get_xoption(config, L"tlbc"); + if (xoption) { + int enabled; + const wchar_t *sep = wcschr(xoption, L'='); + if (!sep || (config_wstr_to_int(sep + 1, &enabled) < 0) || (enabled < 0) || (enabled > 1)) { + return _PyStatus_ERR( + "-X tlbc=n: n is missing or invalid"); + } + config->tlbc_enabled = enabled; + } + return _PyStatus_OK(); +#else + return _PyStatus_OK(); +#endif +} + static PyStatus config_init_perf_profiling(PyConfig *config) { @@ -2111,6 +2151,11 @@ config_read_complex_options(PyConfig *config) } #endif + status = config_init_tlbc(config); + if 
(_PyStatus_EXCEPTION(status)) { + return status; + } + return _PyStatus_OK(); } diff --git a/Python/instrumentation.c b/Python/instrumentation.c index d4568764117563..87c2addaf809eb 100644 --- a/Python/instrumentation.c +++ b/Python/instrumentation.c @@ -44,10 +44,24 @@ #define UNLOCK_CODE() Py_END_CRITICAL_SECTION() +#define MODIFY_BYTECODE(code, func, ...) \ + do { \ + PyCodeObject *co = (code); \ + for (Py_ssize_t i = 0; i < code->co_tlbc->size; i++) { \ + char *bc = co->co_tlbc->entries[i]; \ + if (bc == NULL) { \ + continue; \ + } \ + (func)((_Py_CODEUNIT *)bc, __VA_ARGS__); \ + } \ + } while (0) + #else #define LOCK_CODE(code) #define UNLOCK_CODE() +#define MODIFY_BYTECODE(code, func, ...) \ + (func)(_PyCode_CODE(code), __VA_ARGS__) #endif @@ -309,7 +323,8 @@ _PyInstruction_GetLength(PyCodeObject *code, int offset) { ASSERT_WORLD_STOPPED_OR_LOCKED(code); - int opcode = _PyCode_CODE(code)[offset].op.code; + int opcode = + FT_ATOMIC_LOAD_UINT8_RELAXED(_PyCode_CODE(code)[offset].op.code); assert(opcode != 0); assert(opcode != RESERVED); if (opcode == INSTRUMENTED_LINE) { @@ -578,7 +593,9 @@ sanity_check_instrumentation(PyCodeObject *code) _Py_CODEUNIT _Py_GetBaseCodeUnit(PyCodeObject *code, int i) { - _Py_CODEUNIT inst = _PyCode_CODE(code)[i]; + _Py_CODEUNIT *src_instr = _PyCode_CODE(code) + i; + _Py_CODEUNIT inst = { + .cache = FT_ATOMIC_LOAD_UINT16_RELAXED(*(uint16_t *)src_instr)}; int opcode = inst.op.code; if (opcode < MIN_INSTRUMENTED_OPCODE) { inst.op.code = _PyOpcode_Deopt[opcode]; @@ -614,21 +631,22 @@ _Py_GetBaseCodeUnit(PyCodeObject *code, int i) } static void -de_instrument(PyCodeObject *code, int i, int event) +de_instrument(_Py_CODEUNIT *bytecode, _PyCoMonitoringData *monitoring, int i, + int event) { assert(event != PY_MONITORING_EVENT_INSTRUCTION); assert(event != PY_MONITORING_EVENT_LINE); - _Py_CODEUNIT *instr = &_PyCode_CODE(code)[i]; + _Py_CODEUNIT *instr = &bytecode[i]; uint8_t *opcode_ptr = &instr->op.code; int opcode = *opcode_ptr; 
assert(opcode != ENTER_EXECUTOR); if (opcode == INSTRUMENTED_LINE) { - opcode_ptr = &code->_co_monitoring->lines[i].original_opcode; + opcode_ptr = &monitoring->lines[i].original_opcode; opcode = *opcode_ptr; } if (opcode == INSTRUMENTED_INSTRUCTION) { - opcode_ptr = &code->_co_monitoring->per_instruction_opcodes[i]; + opcode_ptr = &monitoring->per_instruction_opcodes[i]; opcode = *opcode_ptr; } int deinstrumented = DE_INSTRUMENT[opcode]; @@ -644,65 +662,68 @@ de_instrument(PyCodeObject *code, int i, int event) } static void -de_instrument_line(PyCodeObject *code, int i) +de_instrument_line(_Py_CODEUNIT *bytecode, _PyCoMonitoringData *monitoring, + int i) { - _Py_CODEUNIT *instr = &_PyCode_CODE(code)[i]; + _Py_CODEUNIT *instr = &bytecode[i]; int opcode = instr->op.code; if (opcode != INSTRUMENTED_LINE) { return; } - _PyCoLineInstrumentationData *lines = &code->_co_monitoring->lines[i]; + _PyCoLineInstrumentationData *lines = &monitoring->lines[i]; int original_opcode = lines->original_opcode; if (original_opcode == INSTRUMENTED_INSTRUCTION) { - lines->original_opcode = code->_co_monitoring->per_instruction_opcodes[i]; + lines->original_opcode = monitoring->per_instruction_opcodes[i]; } CHECK(original_opcode != 0); CHECK(original_opcode == _PyOpcode_Deopt[original_opcode]); - instr->op.code = original_opcode; + FT_ATOMIC_STORE_UINT8(instr->op.code, original_opcode); if (_PyOpcode_Caches[original_opcode]) { - instr[1].counter = adaptive_counter_warmup(); + FT_ATOMIC_STORE_UINT16_RELAXED(instr[1].counter.value_and_backoff, + adaptive_counter_warmup().value_and_backoff); } assert(instr->op.code != INSTRUMENTED_LINE); } static void -de_instrument_per_instruction(PyCodeObject *code, int i) +de_instrument_per_instruction(_Py_CODEUNIT *bytecode, + _PyCoMonitoringData *monitoring, int i) { - _Py_CODEUNIT *instr = &_PyCode_CODE(code)[i]; + _Py_CODEUNIT *instr = &bytecode[i]; uint8_t *opcode_ptr = &instr->op.code; int opcode = *opcode_ptr; if (opcode == INSTRUMENTED_LINE) { - 
opcode_ptr = &code->_co_monitoring->lines[i].original_opcode; + opcode_ptr = &monitoring->lines[i].original_opcode; opcode = *opcode_ptr; } if (opcode != INSTRUMENTED_INSTRUCTION) { return; } - int original_opcode = code->_co_monitoring->per_instruction_opcodes[i]; + int original_opcode = monitoring->per_instruction_opcodes[i]; CHECK(original_opcode != 0); CHECK(original_opcode == _PyOpcode_Deopt[original_opcode]); - *opcode_ptr = original_opcode; + FT_ATOMIC_STORE_UINT8_RELAXED(*opcode_ptr, original_opcode); if (_PyOpcode_Caches[original_opcode]) { - instr[1].counter = adaptive_counter_warmup(); + FT_ATOMIC_STORE_UINT16_RELAXED(instr[1].counter.value_and_backoff, + adaptive_counter_warmup().value_and_backoff); } assert(*opcode_ptr != INSTRUMENTED_INSTRUCTION); assert(instr->op.code != INSTRUMENTED_INSTRUCTION); } - static void -instrument(PyCodeObject *code, int i) +instrument(_Py_CODEUNIT *bytecode, _PyCoMonitoringData *monitoring, int i) { - _Py_CODEUNIT *instr = &_PyCode_CODE(code)[i]; + _Py_CODEUNIT *instr = &bytecode[i]; uint8_t *opcode_ptr = &instr->op.code; int opcode =*opcode_ptr; if (opcode == INSTRUMENTED_LINE) { - _PyCoLineInstrumentationData *lines = &code->_co_monitoring->lines[i]; + _PyCoLineInstrumentationData *lines = &monitoring->lines[i]; opcode_ptr = &lines->original_opcode; opcode = *opcode_ptr; } if (opcode == INSTRUMENTED_INSTRUCTION) { - opcode_ptr = &code->_co_monitoring->per_instruction_opcodes[i]; + opcode_ptr = &monitoring->per_instruction_opcodes[i]; opcode = *opcode_ptr; CHECK(opcode != INSTRUMENTED_INSTRUCTION && opcode != INSTRUMENTED_LINE); CHECK(opcode == _PyOpcode_Deopt[opcode]); @@ -716,52 +737,52 @@ instrument(PyCodeObject *code, int i) if (_PyOpcode_Caches[deopt]) { FT_ATOMIC_STORE_UINT16_RELAXED(instr[1].counter.value_and_backoff, adaptive_counter_warmup().value_and_backoff); - instr[1].counter = adaptive_counter_warmup(); } } } static void -instrument_line(PyCodeObject *code, int i) +instrument_line(_Py_CODEUNIT *bytecode, 
_PyCoMonitoringData *monitoring, int i) { - uint8_t *opcode_ptr = &_PyCode_CODE(code)[i].op.code; + uint8_t *opcode_ptr = &bytecode[i].op.code; int opcode = *opcode_ptr; if (opcode == INSTRUMENTED_LINE) { return; } - _PyCoLineInstrumentationData *lines = &code->_co_monitoring->lines[i]; + _PyCoLineInstrumentationData *lines = &monitoring->lines[i]; lines->original_opcode = _PyOpcode_Deopt[opcode]; CHECK(lines->original_opcode > 0); - *opcode_ptr = INSTRUMENTED_LINE; + FT_ATOMIC_STORE_UINT8_RELAXED(*opcode_ptr, INSTRUMENTED_LINE); } static void -instrument_per_instruction(PyCodeObject *code, int i) +instrument_per_instruction(_Py_CODEUNIT *bytecode, + _PyCoMonitoringData *monitoring, int i) { - _Py_CODEUNIT *instr = &_PyCode_CODE(code)[i]; + _Py_CODEUNIT *instr = &bytecode[i]; uint8_t *opcode_ptr = &instr->op.code; int opcode = *opcode_ptr; if (opcode == INSTRUMENTED_LINE) { - _PyCoLineInstrumentationData *lines = &code->_co_monitoring->lines[i]; + _PyCoLineInstrumentationData *lines = &monitoring->lines[i]; opcode_ptr = &lines->original_opcode; opcode = *opcode_ptr; } if (opcode == INSTRUMENTED_INSTRUCTION) { - assert(code->_co_monitoring->per_instruction_opcodes[i] > 0); + assert(monitoring->per_instruction_opcodes[i] > 0); return; } CHECK(opcode != 0); if (is_instrumented(opcode)) { - code->_co_monitoring->per_instruction_opcodes[i] = opcode; + monitoring->per_instruction_opcodes[i] = opcode; } else { assert(opcode != 0); assert(_PyOpcode_Deopt[opcode] != 0); assert(_PyOpcode_Deopt[opcode] != RESUME); - code->_co_monitoring->per_instruction_opcodes[i] = _PyOpcode_Deopt[opcode]; + monitoring->per_instruction_opcodes[i] = _PyOpcode_Deopt[opcode]; } - assert(code->_co_monitoring->per_instruction_opcodes[i] > 0); - *opcode_ptr = INSTRUMENTED_INSTRUCTION; + assert(monitoring->per_instruction_opcodes[i] > 0); + FT_ATOMIC_STORE_UINT8_RELAXED(*opcode_ptr, INSTRUMENTED_INSTRUCTION); } static void @@ -773,19 +794,19 @@ remove_tools(PyCodeObject * code, int offset, int 
event, int tools) assert(PY_MONITORING_IS_INSTRUMENTED_EVENT(event)); assert(opcode_has_event(_Py_GetBaseCodeUnit(code, offset).op.code)); _PyCoMonitoringData *monitoring = code->_co_monitoring; + bool should_de_instrument; if (monitoring && monitoring->tools) { monitoring->tools[offset] &= ~tools; - if (monitoring->tools[offset] == 0) { - de_instrument(code, offset, event); - } + should_de_instrument = (monitoring->tools[offset] == 0); } else { /* Single tool */ uint8_t single_tool = code->_co_monitoring->active_monitors.tools[event]; assert(_Py_popcount32(single_tool) <= 1); - if (((single_tool & tools) == single_tool)) { - de_instrument(code, offset, event); - } + should_de_instrument = ((single_tool & tools) == single_tool); + } + if (should_de_instrument) { + MODIFY_BYTECODE(code, de_instrument, monitoring, offset, event); } } @@ -804,22 +825,23 @@ remove_line_tools(PyCodeObject * code, int offset, int tools) { ASSERT_WORLD_STOPPED_OR_LOCKED(code); - assert(code->_co_monitoring); - if (code->_co_monitoring->line_tools) + _PyCoMonitoringData *monitoring = code->_co_monitoring; + assert(monitoring); + bool should_de_instrument; + if (monitoring->line_tools) { - uint8_t *toolsptr = &code->_co_monitoring->line_tools[offset]; + uint8_t *toolsptr = &monitoring->line_tools[offset]; *toolsptr &= ~tools; - if (*toolsptr == 0 ) { - de_instrument_line(code, offset); - } + should_de_instrument = (*toolsptr == 0); } else { /* Single tool */ - uint8_t single_tool = code->_co_monitoring->active_monitors.tools[PY_MONITORING_EVENT_LINE]; + uint8_t single_tool = monitoring->active_monitors.tools[PY_MONITORING_EVENT_LINE]; assert(_Py_popcount32(single_tool) <= 1); - if (((single_tool & tools) == single_tool)) { - de_instrument_line(code, offset); - } + should_de_instrument = ((single_tool & tools) == single_tool); + } + if (should_de_instrument) { + MODIFY_BYTECODE(code, de_instrument_line, monitoring, offset); } } @@ -841,7 +863,7 @@ add_tools(PyCodeObject * code, int offset, 
int event, int tools) assert(_Py_popcount32(tools) == 1); assert(tools_is_subset_for_event(code, event, tools)); } - instrument(code, offset); + MODIFY_BYTECODE(code, instrument, code->_co_monitoring, offset); } static void @@ -858,7 +880,7 @@ add_line_tools(PyCodeObject * code, int offset, int tools) /* Single tool */ assert(_Py_popcount32(tools) == 1); } - instrument_line(code, offset); + MODIFY_BYTECODE(code, instrument_line, code->_co_monitoring, offset); } @@ -876,7 +898,7 @@ add_per_instruction_tools(PyCodeObject * code, int offset, int tools) /* Single tool */ assert(_Py_popcount32(tools) == 1); } - instrument_per_instruction(code, offset); + MODIFY_BYTECODE(code, instrument_per_instruction, code->_co_monitoring, offset); } @@ -885,21 +907,22 @@ remove_per_instruction_tools(PyCodeObject * code, int offset, int tools) { ASSERT_WORLD_STOPPED_OR_LOCKED(code); + _PyCoMonitoringData *monitoring = code->_co_monitoring; assert(code->_co_monitoring); + bool should_de_instrument; if (code->_co_monitoring->per_instruction_tools) { uint8_t *toolsptr = &code->_co_monitoring->per_instruction_tools[offset]; *toolsptr &= ~tools; - if (*toolsptr == 0) { - de_instrument_per_instruction(code, offset); - } + should_de_instrument = (*toolsptr == 0); } else { /* Single tool */ uint8_t single_tool = code->_co_monitoring->active_monitors.tools[PY_MONITORING_EVENT_INSTRUCTION]; assert(_Py_popcount32(single_tool) <= 1); - if (((single_tool & tools) == single_tool)) { - de_instrument_per_instruction(code, offset); - } + should_de_instrument = ((single_tool & tools) == single_tool); + } + if (should_de_instrument) { + MODIFY_BYTECODE(code, de_instrument_per_instruction, monitoring, offset); } } @@ -1087,7 +1110,7 @@ call_instrumentation_vector( PyCodeObject *code = _PyFrame_GetCode(frame); assert(args[1] == NULL); args[1] = (PyObject *)code; - int offset = (int)(instr - _PyCode_CODE(code)); + int offset = (int)(instr - _PyFrame_GetBytecode(frame)); /* Offset visible to user should be 
the offset in bytes, as that is the * convention for APIs involving code offsets. */ int bytes_offset = offset * (int)sizeof(_Py_CODEUNIT); @@ -1173,8 +1196,7 @@ _Py_call_instrumentation_jump( assert(event == PY_MONITORING_EVENT_JUMP || event == PY_MONITORING_EVENT_BRANCH); assert(frame->instr_ptr == instr); - PyCodeObject *code = _PyFrame_GetCode(frame); - int to = (int)(target - _PyCode_CODE(code)); + int to = (int)(target - _PyFrame_GetBytecode(frame)); PyObject *to_obj = PyLong_FromLong(to * (int)sizeof(_Py_CODEUNIT)); if (to_obj == NULL) { return NULL; @@ -1240,7 +1262,8 @@ _Py_call_instrumentation_line(PyThreadState *tstate, _PyInterpreterFrame* frame, PyCodeObject *code = _PyFrame_GetCode(frame); assert(tstate->tracing == 0); assert(debug_check_sanity(tstate->interp, code)); - int i = (int)(instr - _PyCode_CODE(code)); + _Py_CODEUNIT *bytecode = _PyFrame_GetBytecode(frame); + int i = (int)(instr - bytecode); _PyCoMonitoringData *monitoring = code->_co_monitoring; _PyCoLineInstrumentationData *line_data = &monitoring->lines[i]; @@ -1256,10 +1279,10 @@ _Py_call_instrumentation_line(PyThreadState *tstate, _PyInterpreterFrame* frame, line = compute_line(code, i, line_delta); assert(line >= 0); assert(prev != NULL); - int prev_index = (int)(prev - _PyCode_CODE(code)); + int prev_index = (int)(prev - bytecode); int prev_line = _Py_Instrumentation_GetLine(code, prev_index); if (prev_line == line) { - int prev_opcode = _PyCode_CODE(code)[prev_index].op.code; + int prev_opcode = bytecode[prev_index].op.code; /* RESUME and INSTRUMENTED_RESUME are needed for the operation of * instrumentation, so must never be hidden by an INSTRUMENTED_LINE. 
*/ @@ -1359,7 +1382,7 @@ int _Py_call_instrumentation_instruction(PyThreadState *tstate, _PyInterpreterFrame* frame, _Py_CODEUNIT *instr) { PyCodeObject *code = _PyFrame_GetCode(frame); - int offset = (int)(instr - _PyCode_CODE(code)); + int offset = (int)(instr - _PyFrame_GetBytecode(frame)); _PyCoMonitoringData *instrumentation_data = code->_co_monitoring; assert(instrumentation_data->per_instruction_opcodes); int next_opcode = instrumentation_data->per_instruction_opcodes[offset]; diff --git a/Python/optimizer_cases.c.h b/Python/optimizer_cases.c.h index 0a7e44ef78dda9..54821b23716eeb 100644 --- a/Python/optimizer_cases.c.h +++ b/Python/optimizer_cases.c.h @@ -17,6 +17,8 @@ /* _QUICKEN_RESUME is not a viable micro-op for tier 2 */ + /* _LOAD_BYTECODE is not a viable micro-op for tier 2 */ + case _RESUME_CHECK: { break; } diff --git a/Python/pystate.c b/Python/pystate.c index 36b31f3b9e4200..ded5fde9c4bb51 100644 --- a/Python/pystate.c +++ b/Python/pystate.c @@ -1513,6 +1513,11 @@ new_threadstate(PyInterpreterState *interp, int whence) PyMem_RawFree(new_tstate); return NULL; } + int32_t tlbc_idx = _Py_ReserveTLBCIndex(interp); + if (tlbc_idx < 0) { + PyMem_RawFree(new_tstate); + return NULL; + } #endif /* We serialize concurrent creation to protect global state. */ @@ -1555,6 +1560,7 @@ new_threadstate(PyInterpreterState *interp, int whence) #ifdef Py_GIL_DISABLED // Must be called with lock unlocked to avoid lock ordering deadlocks. _Py_qsbr_register(tstate, interp, qsbr_idx); + tstate->tlbc_index = tlbc_idx; #endif return (PyThreadState *)tstate; @@ -1706,6 +1712,10 @@ PyThreadState_Clear(PyThreadState *tstate) // Remove ourself from the biased reference counting table of threads. _Py_brc_remove_thread(tstate); + + // Release our thread-local copies of the bytecode for reuse by another + // thread + _Py_ClearTLBCIndex((_PyThreadStateImpl *)tstate); #endif // Merge our queue of pointers to be freed into the interpreter queue. 
diff --git a/Python/specialize.c b/Python/specialize.c index ae47809305a300..86cb997ca2ced3 100644 --- a/Python/specialize.c +++ b/Python/specialize.c @@ -24,6 +24,25 @@ extern const char *_PyUOpName(int index); * ./adaptive.md */ +#ifdef Py_GIL_DISABLED +#define SET_OPCODE_OR_RETURN(instr, opcode) \ + do { \ + uint8_t old_op = _Py_atomic_load_uint8_relaxed(&(instr)->op.code); \ + if (old_op >= MIN_INSTRUMENTED_OPCODE) { \ + /* Lost race with instrumentation */ \ + return; \ + } \ + if (!_Py_atomic_compare_exchange_uint8(&(instr)->op.code, &old_op, \ + (opcode))) { \ + /* Lost race with instrumentation */ \ + assert(old_op >= MIN_INSTRUMENTED_OPCODE); \ + return; \ + } \ + } while (0) +#else +#define SET_OPCODE_OR_RETURN(instr, opcode) (instr)->op.code = (opcode) +#endif + #ifdef Py_STATS GCStats _py_gc_stats[NUM_GENERATIONS] = { 0 }; static PyStats _Py_stats_struct = { .gc_stats = _py_gc_stats }; @@ -436,16 +455,25 @@ do { \ # define SPECIALIZATION_FAIL(opcode, kind) ((void)0) #endif -// Initialize warmup counters and insert superinstructions. This cannot fail. +// Initialize warmup counters and optimize instructions. This cannot fail. 
void -_PyCode_Quicken(PyCodeObject *code) +_PyCode_Quicken(_Py_CODEUNIT *instructions, Py_ssize_t size, PyObject *consts, + int enable_counters) { - #if ENABLE_SPECIALIZATION + #if ENABLE_SPECIALIZATION_FT + _Py_BackoffCounter jump_counter, adaptive_counter; + if (enable_counters) { + jump_counter = initial_jump_backoff_counter(); + adaptive_counter = adaptive_counter_warmup(); + } + else { + jump_counter = initial_unreachable_backoff_counter(); + adaptive_counter = initial_unreachable_backoff_counter(); + } int opcode = 0; int oparg = 0; - _Py_CODEUNIT *instructions = _PyCode_CODE(code); /* The last code unit cannot have a cache, so we don't need to check it */ - for (int i = 0; i < Py_SIZE(code)-1; i++) { + for (Py_ssize_t i = 0; i < size-1; i++) { opcode = instructions[i].op.code; int caches = _PyOpcode_Caches[opcode]; oparg = (oparg << 8) | instructions[i].op.arg; @@ -453,7 +481,7 @@ _PyCode_Quicken(PyCodeObject *code) // The initial value depends on the opcode switch (opcode) { case JUMP_BACKWARD: - instructions[i + 1].counter = initial_jump_backoff_counter(); + instructions[i + 1].counter = jump_counter; break; case POP_JUMP_IF_FALSE: case POP_JUMP_IF_TRUE: @@ -462,7 +490,7 @@ _PyCode_Quicken(PyCodeObject *code) instructions[i + 1].cache = 0x5555; // Alternating 0, 1 bits break; default: - instructions[i + 1].counter = adaptive_counter_warmup(); + instructions[i + 1].counter = adaptive_counter; break; } i += caches; @@ -471,7 +499,7 @@ _PyCode_Quicken(PyCodeObject *code) /* We can't do this in the bytecode compiler as * marshalling can intern strings and make them immortal. 
*/ - PyObject *obj = PyTuple_GET_ITEM(code->co_consts, oparg); + PyObject *obj = PyTuple_GET_ITEM(consts, oparg); if (_Py_IsImmortal(obj)) { instructions[i].op.code = LOAD_CONST_IMMORTAL; } @@ -480,7 +508,7 @@ _PyCode_Quicken(PyCodeObject *code) oparg = 0; } } - #endif /* ENABLE_SPECIALIZATION */ + #endif /* ENABLE_SPECIALIZATION_FT */ } #define SIMPLE_FUNCTION 0 @@ -2243,9 +2271,10 @@ _Py_Specialize_BinaryOp(_PyStackRef lhs_st, _PyStackRef rhs_st, _Py_CODEUNIT *in { PyObject *lhs = PyStackRef_AsPyObjectBorrow(lhs_st); PyObject *rhs = PyStackRef_AsPyObjectBorrow(rhs_st); - assert(ENABLE_SPECIALIZATION); + assert(ENABLE_SPECIALIZATION_FT); assert(_PyOpcode_Caches[BINARY_OP] == INLINE_CACHE_ENTRIES_BINARY_OP); _PyBinaryOpCache *cache = (_PyBinaryOpCache *)(instr + 1); + uint8_t specialized_op; switch (oparg) { case NB_ADD: case NB_INPLACE_ADD: @@ -2256,18 +2285,18 @@ _Py_Specialize_BinaryOp(_PyStackRef lhs_st, _PyStackRef rhs_st, _Py_CODEUNIT *in _Py_CODEUNIT next = instr[INLINE_CACHE_ENTRIES_BINARY_OP + 1]; bool to_store = (next.op.code == STORE_FAST); if (to_store && PyStackRef_AsPyObjectBorrow(locals[next.op.arg]) == lhs) { - instr->op.code = BINARY_OP_INPLACE_ADD_UNICODE; + specialized_op = BINARY_OP_INPLACE_ADD_UNICODE; goto success; } - instr->op.code = BINARY_OP_ADD_UNICODE; + specialized_op = BINARY_OP_ADD_UNICODE; goto success; } if (PyLong_CheckExact(lhs)) { - instr->op.code = BINARY_OP_ADD_INT; + specialized_op = BINARY_OP_ADD_INT; goto success; } if (PyFloat_CheckExact(lhs)) { - instr->op.code = BINARY_OP_ADD_FLOAT; + specialized_op = BINARY_OP_ADD_FLOAT; goto success; } break; @@ -2277,11 +2306,11 @@ _Py_Specialize_BinaryOp(_PyStackRef lhs_st, _PyStackRef rhs_st, _Py_CODEUNIT *in break; } if (PyLong_CheckExact(lhs)) { - instr->op.code = BINARY_OP_MULTIPLY_INT; + specialized_op = BINARY_OP_MULTIPLY_INT; goto success; } if (PyFloat_CheckExact(lhs)) { - instr->op.code = BINARY_OP_MULTIPLY_FLOAT; + specialized_op = BINARY_OP_MULTIPLY_FLOAT; goto success; } 
break; @@ -2291,22 +2320,23 @@ _Py_Specialize_BinaryOp(_PyStackRef lhs_st, _PyStackRef rhs_st, _Py_CODEUNIT *in break; } if (PyLong_CheckExact(lhs)) { - instr->op.code = BINARY_OP_SUBTRACT_INT; + specialized_op = BINARY_OP_SUBTRACT_INT; goto success; } if (PyFloat_CheckExact(lhs)) { - instr->op.code = BINARY_OP_SUBTRACT_FLOAT; + specialized_op = BINARY_OP_SUBTRACT_FLOAT; goto success; } break; } SPECIALIZATION_FAIL(BINARY_OP, binary_op_fail_kind(oparg, lhs, rhs)); STAT_INC(BINARY_OP, failure); - instr->op.code = BINARY_OP; + SET_OPCODE_OR_RETURN(instr, BINARY_OP); cache->counter = adaptive_counter_backoff(cache->counter); return; success: STAT_INC(BINARY_OP, success); + SET_OPCODE_OR_RETURN(instr, specialized_op); cache->counter = adaptive_counter_cooldown(); } diff --git a/Python/sysmodule.c b/Python/sysmodule.c index a4abd7c3c45709..a086bb979efa9c 100644 --- a/Python/sysmodule.c +++ b/Python/sysmodule.c @@ -2174,6 +2174,11 @@ sys__clear_internal_caches_impl(PyObject *module) #ifdef _Py_TIER2 PyInterpreterState *interp = _PyInterpreterState_GET(); _Py_Executors_InvalidateAll(interp, 0); +#endif +#ifdef Py_GIL_DISABLED + if (_Py_ClearUnusedTLBC(_PyInterpreterState_GET()) < 0) { + return NULL; + } #endif PyType_ClearCache(); Py_RETURN_NONE; diff --git a/Tools/gdb/libpython.py b/Tools/gdb/libpython.py index 946af4be1a7589..ed254152d7da41 100755 --- a/Tools/gdb/libpython.py +++ b/Tools/gdb/libpython.py @@ -77,6 +77,10 @@ def _managed_dict_offset(): else: return -3 * _sizeof_void_p() +def _interp_frame_has_tlbc_index(): + interp_frame = gdb.lookup_type("_PyInterpreterFrame") + return any(field.name == "tlbc_index" for field in interp_frame.fields()) + Py_TPFLAGS_INLINE_VALUES = (1 << 2) Py_TPFLAGS_MANAGED_DICT = (1 << 4) @@ -105,6 +109,8 @@ def _managed_dict_offset(): UNABLE_READ_INFO_PYTHON_FRAME = 'Unable to read information on python frame' EVALFRAME = '_PyEval_EvalFrameDefault' +INTERP_FRAME_HAS_TLBC_INDEX = _interp_frame_has_tlbc_index() + class 
NullPyObjectPtr(RuntimeError): pass @@ -693,6 +699,16 @@ def parse_location_table(firstlineno, linetable): yield addr, end_addr, line addr = end_addr + +class PyCodeArrayPtr: + def __init__(self, gdbval): + self._gdbval = gdbval + + def get_entry(self, index): + assert (index >= 0) and (index < self._gdbval["size"]) + return self._gdbval["entries"][index] + + class PyCodeObjectPtr(PyObjectPtr): """ Class wrapping a gdb.Value that's a PyCodeObject* i.e. a instance @@ -1085,7 +1101,12 @@ def _f_nlocalsplus(self): def _f_lasti(self): codeunit_p = gdb.lookup_type("_Py_CODEUNIT").pointer() instr_ptr = self._gdbval["instr_ptr"] - first_instr = self._f_code().field("co_code_adaptive").cast(codeunit_p) + if INTERP_FRAME_HAS_TLBC_INDEX: + tlbc_index = self._gdbval["tlbc_index"] + code_arr = PyCodeArrayPtr(self._f_code().field("co_tlbc")) + first_instr = code_arr.get_entry(tlbc_index).cast(codeunit_p) + else: + first_instr = self._f_code().field("co_code_adaptive").cast(codeunit_p) return int(instr_ptr - first_instr) def is_shim(self):