From 8c387b812ab056efdeeeedcd46d715650c6eb567 Mon Sep 17 00:00:00 2001 From: Mark Shannon Date: Thu, 8 Feb 2024 13:29:32 +0000 Subject: [PATCH 1/2] Improve clarity, and performance by a little, of backedge threshold handling. --- Include/cpython/optimizer.h | 10 +++++-- Include/internal/pycore_interp.h | 6 ++-- Python/bytecodes.c | 29 ++++++++++--------- Python/generated_cases.c.h | 29 ++++++++++--------- Python/optimizer.c | 48 ++++++++++++++++++++++---------- Python/pylifecycle.c | 4 +-- Python/pystate.c | 10 ++----- 7 files changed, 81 insertions(+), 55 deletions(-) diff --git a/Include/cpython/optimizer.h b/Include/cpython/optimizer.h index 5a9ccaea3b2209..a65baf3300fb64 100644 --- a/Include/cpython/optimizer.h +++ b/Include/cpython/optimizer.h @@ -70,6 +70,8 @@ typedef struct { PyAPI_FUNC(int) PyUnstable_Replace_Executor(PyCodeObject *code, _Py_CODEUNIT *instr, _PyExecutorObject *executor); +_PyOptimizerObject *_Py_SetOptimizer(PyInterpreterState *interp, _PyOptimizerObject* optimizer); + PyAPI_FUNC(void) PyUnstable_SetOptimizer(_PyOptimizerObject* optimizer); PyAPI_FUNC(_PyOptimizerObject *) PyUnstable_GetOptimizer(void); @@ -79,8 +81,6 @@ PyAPI_FUNC(_PyExecutorObject *) PyUnstable_GetExecutor(PyCodeObject *code, int o int _PyOptimizer_Optimize(struct _PyInterpreterFrame *frame, _Py_CODEUNIT *start, PyObject **stack_pointer); -extern _PyOptimizerObject _PyOptimizer_Default; - void _Py_ExecutorInit(_PyExecutorObject *, _PyBloomFilter *); void _Py_ExecutorClear(_PyExecutorObject *); void _Py_BloomFilter_Init(_PyBloomFilter *); @@ -95,7 +95,11 @@ PyAPI_FUNC(PyObject *)PyUnstable_Optimizer_NewUOpOptimizer(void); #define OPTIMIZER_BITS_IN_COUNTER 4 /* Minimum of 16 additional executions before retry */ -#define MINIMUM_TIER2_BACKOFF 4 +#define MIN_TIER2_BACKOFF 4 +#define MAX_TIER2_BACKOFF (15 - OPTIMIZER_BITS_IN_COUNTER) +#define OPTIMIZER_BITS_MASK ((1 << OPTIMIZER_BITS_IN_COUNTER) - 1) +/* A value <= UINT16_MAX but large enough that when shifted is > UINT16_MAX 
*/ +#define OPTIMIZER_UNREACHABLE_THRESHOLD UINT16_MAX #define _Py_MAX_ALLOWED_BUILTINS_MODIFICATIONS 3 #define _Py_MAX_ALLOWED_GLOBALS_MODIFICATIONS 6 diff --git a/Include/internal/pycore_interp.h b/Include/internal/pycore_interp.h index f7c332ed747cfa..6e7809991d7461 100644 --- a/Include/internal/pycore_interp.h +++ b/Include/internal/pycore_interp.h @@ -238,8 +238,10 @@ struct _is { struct callable_cache callable_cache; _PyOptimizerObject *optimizer; _PyExecutorObject *executor_list_head; - uint16_t optimizer_resume_threshold; - uint16_t optimizer_backedge_threshold; + /* These values are shifted and offset to speed up check in JUMP_BACKWARD */ + uint32_t optimizer_resume_threshold; + uint32_t optimizer_backedge_threshold; + uint32_t next_func_version; _rare_events rare_events; PyDict_WatchCallback builtins_dict_watcher; diff --git a/Python/bytecodes.c b/Python/bytecodes.c index 6fb4d719e43991..0891a0dbc01b6a 100644 --- a/Python/bytecodes.c +++ b/Python/bytecodes.c @@ -2318,13 +2318,16 @@ dummy_func( assert(oparg <= INSTR_OFFSET()); JUMPBY(-oparg); #if ENABLE_SPECIALIZATION - this_instr[1].cache += (1 << OPTIMIZER_BITS_IN_COUNTER); + uint16_t counter = this_instr[1].cache; + this_instr[1].cache = counter + (1 << OPTIMIZER_BITS_IN_COUNTER); /* We are using unsigned values, but we really want signed values, so - * do the 2s complement comparison manually */ - uint16_t ucounter = this_instr[1].cache + (1 << 15); - uint16_t threshold = tstate->interp->optimizer_backedge_threshold + (1 << 15); + * do the 2s complement adjustment manually */ + uint32_t offset_counter = counter ^ (1 << 15); + uint32_t threshold = tstate->interp->optimizer_backedge_threshold; + assert((threshold & OPTIMIZER_BITS_MASK) == 0); + // Use '>=' not '>' so that the optimizer/backoff bits do not affect the result. 
// Double-check that the opcode isn't instrumented or something: - if (ucounter > threshold && this_instr->op.code == JUMP_BACKWARD) { + if (offset_counter >= threshold && this_instr->op.code == JUMP_BACKWARD) { OPT_STAT_INC(attempts); _Py_CODEUNIT *start = this_instr; /* Back up over EXTENDED_ARGs so optimizer sees the whole instruction */ @@ -2338,18 +2341,18 @@ dummy_func( // Rewind and enter the executor: assert(start->op.code == ENTER_EXECUTOR); next_instr = start; - this_instr[1].cache &= ((1 << OPTIMIZER_BITS_IN_COUNTER) - 1); + this_instr[1].cache &= OPTIMIZER_BITS_MASK; } else { - int backoff = this_instr[1].cache & ((1 << OPTIMIZER_BITS_IN_COUNTER) - 1); - if (backoff < MINIMUM_TIER2_BACKOFF) { - backoff = MINIMUM_TIER2_BACKOFF; + int backoff = this_instr[1].cache & OPTIMIZER_BITS_MASK; + backoff++; + if (backoff < MIN_TIER2_BACKOFF) { + backoff = MIN_TIER2_BACKOFF; } - else if (backoff < 15 - OPTIMIZER_BITS_IN_COUNTER) { - backoff++; + else if (backoff > MAX_TIER2_BACKOFF) { + backoff = MAX_TIER2_BACKOFF; } - assert(backoff <= 15 - OPTIMIZER_BITS_IN_COUNTER); - this_instr[1].cache = ((1 << 16) - ((1 << OPTIMIZER_BITS_IN_COUNTER) << backoff)) | backoff; + this_instr[1].cache = ((UINT16_MAX << OPTIMIZER_BITS_IN_COUNTER) << backoff) | backoff; } } #endif /* ENABLE_SPECIALIZATION */ diff --git a/Python/generated_cases.c.h b/Python/generated_cases.c.h index 16f1db30620d72..9f37cb1e7d5e97 100644 --- a/Python/generated_cases.c.h +++ b/Python/generated_cases.c.h @@ -3274,13 +3274,16 @@ assert(oparg <= INSTR_OFFSET()); JUMPBY(-oparg); #if ENABLE_SPECIALIZATION - this_instr[1].cache += (1 << OPTIMIZER_BITS_IN_COUNTER); + uint16_t counter = this_instr[1].cache; + this_instr[1].cache = counter + (1 << OPTIMIZER_BITS_IN_COUNTER); /* We are using unsigned values, but we really want signed values, so - * do the 2s complement comparison manually */ - uint16_t ucounter = this_instr[1].cache + (1 << 15); - uint16_t threshold = tstate->interp->optimizer_backedge_threshold 
+ (1 << 15); + * do the 2s complement adjustment manually */ + uint32_t offset_counter = counter ^ (1 << 15); + uint32_t threshold = tstate->interp->optimizer_backedge_threshold; + assert((threshold & OPTIMIZER_BITS_MASK) == 0); + // Use '>=' not '>' so that the optimizer/backoff bits do not affect the result. // Double-check that the opcode isn't instrumented or something: - if (ucounter > threshold && this_instr->op.code == JUMP_BACKWARD) { + if (offset_counter >= threshold && this_instr->op.code == JUMP_BACKWARD) { OPT_STAT_INC(attempts); _Py_CODEUNIT *start = this_instr; /* Back up over EXTENDED_ARGs so optimizer sees the whole instruction */ @@ -3294,18 +3297,18 @@ // Rewind and enter the executor: assert(start->op.code == ENTER_EXECUTOR); next_instr = start; - this_instr[1].cache &= ((1 << OPTIMIZER_BITS_IN_COUNTER) - 1); + this_instr[1].cache &= OPTIMIZER_BITS_MASK; } else { - int backoff = this_instr[1].cache & ((1 << OPTIMIZER_BITS_IN_COUNTER) - 1); - if (backoff < MINIMUM_TIER2_BACKOFF) { - backoff = MINIMUM_TIER2_BACKOFF; + int backoff = this_instr[1].cache & OPTIMIZER_BITS_MASK; + backoff++; + if (backoff < MIN_TIER2_BACKOFF) { + backoff = MIN_TIER2_BACKOFF; } - else if (backoff < 15 - OPTIMIZER_BITS_IN_COUNTER) { - backoff++; + else if (backoff > MAX_TIER2_BACKOFF) { + backoff = MAX_TIER2_BACKOFF; } - assert(backoff <= 15 - OPTIMIZER_BITS_IN_COUNTER); - this_instr[1].cache = ((1 << 16) - ((1 << OPTIMIZER_BITS_IN_COUNTER) << backoff)) | backoff; + this_instr[1].cache = ((UINT16_MAX << OPTIMIZER_BITS_IN_COUNTER) << backoff) | backoff; } } #endif /* ENABLE_SPECIALIZATION */ diff --git a/Python/optimizer.c b/Python/optimizer.c index d71ca0aef0e11a..edef03447de392 100644 --- a/Python/optimizer.c +++ b/Python/optimizer.c @@ -115,6 +115,9 @@ never_optimize( _PyExecutorObject **exec, int Py_UNUSED(stack_entries)) { + /* Although it should be benign for this to be called, + * it shouldn't happen, so fail in debug builds. 
*/ + assert(0 && "never optimize should never be called"); return 0; } @@ -126,13 +129,19 @@ PyTypeObject _PyDefaultOptimizer_Type = { .tp_flags = Py_TPFLAGS_DEFAULT | Py_TPFLAGS_DISALLOW_INSTANTIATION, }; -_PyOptimizerObject _PyOptimizer_Default = { +static _PyOptimizerObject _PyOptimizer_Default = { PyObject_HEAD_INIT(&_PyDefaultOptimizer_Type) .optimize = never_optimize, - .resume_threshold = INT16_MAX, - .backedge_threshold = INT16_MAX, + .resume_threshold = OPTIMIZER_UNREACHABLE_THRESHOLD, + .backedge_threshold = OPTIMIZER_UNREACHABLE_THRESHOLD, }; +static uint32_t +shift_and_offset_threshold(uint16_t threshold) +{ + return (threshold << OPTIMIZER_BITS_IN_COUNTER) + (1 << 15); +} + _PyOptimizerObject * PyUnstable_GetOptimizer(void) { @@ -140,24 +149,33 @@ PyUnstable_GetOptimizer(void) if (interp->optimizer == &_PyOptimizer_Default) { return NULL; } - assert(interp->optimizer_backedge_threshold == interp->optimizer->backedge_threshold); - assert(interp->optimizer_resume_threshold == interp->optimizer->resume_threshold); + assert(interp->optimizer_backedge_threshold == + shift_and_offset_threshold(interp->optimizer->backedge_threshold)); + assert(interp->optimizer_resume_threshold == + shift_and_offset_threshold(interp->optimizer->resume_threshold)); Py_INCREF(interp->optimizer); return interp->optimizer; } -void -PyUnstable_SetOptimizer(_PyOptimizerObject *optimizer) +_PyOptimizerObject * +_Py_SetOptimizer(PyInterpreterState *interp, _PyOptimizerObject *optimizer) { - PyInterpreterState *interp = _PyInterpreterState_GET(); if (optimizer == NULL) { optimizer = &_PyOptimizer_Default; } _PyOptimizerObject *old = interp->optimizer; Py_INCREF(optimizer); interp->optimizer = optimizer; - interp->optimizer_backedge_threshold = optimizer->backedge_threshold; - interp->optimizer_resume_threshold = optimizer->resume_threshold; + interp->optimizer_backedge_threshold = shift_and_offset_threshold(optimizer->backedge_threshold); + interp->optimizer_resume_threshold = 
shift_and_offset_threshold(optimizer->resume_threshold); + return old; +} + +void +PyUnstable_SetOptimizer(_PyOptimizerObject *optimizer) +{ + PyInterpreterState *interp = _PyInterpreterState_GET(); + _PyOptimizerObject *old = _Py_SetOptimizer(interp, optimizer); Py_DECREF(old); } @@ -865,10 +883,10 @@ PyUnstable_Optimizer_NewUOpOptimizer(void) return NULL; } opt->optimize = uop_optimize; - opt->resume_threshold = INT16_MAX; - // Need at least 3 iterations to settle specializations. - // A few lower bits of the counter are reserved for other flags. - opt->backedge_threshold = 16 << OPTIMIZER_BITS_IN_COUNTER; + opt->resume_threshold = OPTIMIZER_UNREACHABLE_THRESHOLD; + // Need a few iterations to settle specializations, + // and to amortize the cost of optimization. + opt->backedge_threshold = 16; return (PyObject *)opt; } @@ -955,7 +973,7 @@ PyUnstable_Optimizer_NewCounter(void) return NULL; } opt->base.optimize = counter_optimize; - opt->base.resume_threshold = INT16_MAX; + opt->base.resume_threshold = OPTIMIZER_UNREACHABLE_THRESHOLD; opt->base.backedge_threshold = 0; opt->count = 0; return (PyObject *)opt; diff --git a/Python/pylifecycle.c b/Python/pylifecycle.c index 0cac7109340129..de843362bb74c4 100644 --- a/Python/pylifecycle.c +++ b/Python/pylifecycle.c @@ -1627,8 +1627,8 @@ finalize_modules(PyThreadState *tstate) // Invalidate all executors and turn off tier 2 optimizer _Py_Executors_InvalidateAll(interp); - Py_XDECREF(interp->optimizer); - interp->optimizer = &_PyOptimizer_Default; + _PyOptimizerObject *old = _Py_SetOptimizer(interp, NULL); + Py_XDECREF(old); // Stop watching __builtin__ modifications PyDict_Unwatch(0, interp->builtins); diff --git a/Python/pystate.c b/Python/pystate.c index e77e5bfa7e2df8..f8264c6e32f212 100644 --- a/Python/pystate.c +++ b/Python/pystate.c @@ -622,9 +622,7 @@ init_interpreter(PyInterpreterState *interp, } interp->sys_profile_initialized = false; interp->sys_trace_initialized = false; - interp->optimizer = 
&_PyOptimizer_Default; - interp->optimizer_backedge_threshold = _PyOptimizer_Default.backedge_threshold; - interp->optimizer_resume_threshold = _PyOptimizer_Default.backedge_threshold; + _Py_SetOptimizer(interp, NULL); interp->next_func_version = 1; interp->executor_list_head = NULL; if (interp != &runtime->_main_interpreter) { @@ -777,10 +775,8 @@ interpreter_clear(PyInterpreterState *interp, PyThreadState *tstate) tstate->_status.cleared = 0; } - Py_CLEAR(interp->optimizer); - interp->optimizer = &_PyOptimizer_Default; - interp->optimizer_backedge_threshold = _PyOptimizer_Default.backedge_threshold; - interp->optimizer_resume_threshold = _PyOptimizer_Default.backedge_threshold; + _PyOptimizerObject *old = _Py_SetOptimizer(interp, NULL); + Py_DECREF(old); /* It is possible that any of the objects below have a finalizer that runs Python code or otherwise relies on a thread state From 1a9d648297ab91de6316bc48eba76c99d42f4edf Mon Sep 17 00:00:00 2001 From: Mark Shannon Date: Fri, 9 Feb 2024 14:39:45 +0000 Subject: [PATCH 2/2] Update Python/pystate.c Co-authored-by: Erlend E. Aasland --- Python/pystate.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Python/pystate.c b/Python/pystate.c index f8264c6e32f212..e99e0b0cf370ae 100644 --- a/Python/pystate.c +++ b/Python/pystate.c @@ -622,7 +622,7 @@ init_interpreter(PyInterpreterState *interp, } interp->sys_profile_initialized = false; interp->sys_trace_initialized = false; - _Py_SetOptimizer(interp, NULL); + (void)_Py_SetOptimizer(interp, NULL); interp->next_func_version = 1; interp->executor_list_head = NULL; if (interp != &runtime->_main_interpreter) {