From 572e191b11b9462a61318008c197979126cf3c60 Mon Sep 17 00:00:00 2001
From: Sam Gross
Date: Tue, 5 Dec 2023 13:42:50 -0500
Subject: [PATCH] gh-112532: Use separate mimalloc heaps for GC objects

In `--disable-gil` builds, we now use four separate heaps in
anticipation of using mimalloc to find GC objects when the GIL is
disabled. To support this, we also make a few changes to mimalloc:

* Heap and mi_tld_t initialization is split from allocation. This
  allows us to have a per-PyThreadState mi_tld_t, which is important
  for preserving interpreter isolation, since the same OS thread may
  run in multiple interpreters (using different PyThreadStates).

* The pool of abandoned segments is refactored into its own struct.
  This allows us to use different pools for different interpreters so
  that we can preserve interpreter isolation.

* Heap abandoning (mi_heap_collect_ex) can now be called from a
  different thread than the one that created the heap. This is
  necessary because we may clear and delete the containing
  PyThreadStates from a different thread during finalization and after
  fork().
---
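Notes (illustrative, not part of the commit):

Segregating GC objects into their own heaps is what would let a future
collector enumerate GC objects by walking mimalloc heaps rather than a
linked list. A minimal sketch, assuming the public mi_heap_visit_blocks()
API from mimalloc.h and a hypothetical visit_gc_object() callback:

    static bool
    visit_block(const mi_heap_t *heap, const mi_heap_area_t *area,
                void *block, size_t block_size, void *arg)
    {
        if (block != NULL) {
            visit_gc_object(block, block_size, arg);  // hypothetical visitor
        }
        return true;  // continue the heap walk
    }

    static void
    visit_gc_heaps(_PyThreadStateImpl *tstate, void *arg)
    {
        // Only the two GC heaps can contain GC-tracked objects, so the
        // walk never scans PyMem_Malloc() or non-GC object memory.
        mi_heap_visit_blocks(&tstate->mimalloc.heaps[_Py_MIMALLOC_HEAP_GC],
                             true, &visit_block, arg);
        mi_heap_visit_blocks(&tstate->mimalloc.heaps[_Py_MIMALLOC_HEAP_GC_PRE],
                             true, &visit_block, arg);
    }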
 Include/internal/mimalloc/mimalloc/internal.h | 24 ++---
 Include/internal/mimalloc/mimalloc/types.h    | 40 ++++++++
 Include/internal/pycore_interp.h              |  5 +
 Include/internal/pycore_mimalloc.h            | 33 +++++++
 Include/internal/pycore_tstate.h              |  7 +-
 Lib/test/test_import/__init__.py              |  4 +-
 Objects/mimalloc/heap.c                       | 26 +++--
 Objects/mimalloc/init.c                       | 27 +++--
 Objects/mimalloc/segment.c                    | 98 ++++++++-----------
 Objects/obmalloc.c                            | 36 +++++++
 Python/pystate.c                              | 62 ++++++++++++
 11 files changed, 263 insertions(+), 99 deletions(-)

diff --git a/Include/internal/mimalloc/mimalloc/internal.h b/Include/internal/mimalloc/mimalloc/internal.h
index f076bc6a40f9773..4dcbe8857b8a81b 100644
--- a/Include/internal/mimalloc/mimalloc/internal.h
+++ b/Include/internal/mimalloc/mimalloc/internal.h
@@ -23,23 +23,6 @@ terms of the MIT license. A copy of the license can be found in the file
 #define mi_trace_message(...)
 #endif

-#define MI_CACHE_LINE 64
-#if defined(_MSC_VER)
-#pragma warning(disable:4127)   // suppress constant conditional warning (due to MI_SECURE paths)
-#pragma warning(disable:26812)  // unscoped enum warning
-#define mi_decl_noinline        __declspec(noinline)
-#define mi_decl_thread          __declspec(thread)
-#define mi_decl_cache_align     __declspec(align(MI_CACHE_LINE))
-#elif (defined(__GNUC__) && (__GNUC__ >= 3)) || defined(__clang__) // includes clang and icc
-#define mi_decl_noinline        __attribute__((noinline))
-#define mi_decl_thread          __thread
-#define mi_decl_cache_align     __attribute__((aligned(MI_CACHE_LINE)))
-#else
-#define mi_decl_noinline
-#define mi_decl_thread          __thread  // hope for the best :-)
-#define mi_decl_cache_align
-#endif
-
 #if defined(__EMSCRIPTEN__) && !defined(__wasi__)
 #define __wasi__
 #endif
@@ -85,6 +68,8 @@ mi_threadid_t _mi_thread_id(void) mi_attr_noexcept;
 mi_heap_t*    _mi_heap_main_get(void);    // statically allocated main backing heap
 void          _mi_thread_done(mi_heap_t* heap);
 void          _mi_thread_data_collect(void);
+void          _mi_tld_init(mi_tld_t* tld, mi_heap_t* bheap);
+
 // os.c
 void          _mi_os_init(void);  // called from process init
@@ -130,6 +115,7 @@ void       _mi_segment_map_allocated_at(const mi_segment_t* segment);
 void       _mi_segment_map_freed_at(const mi_segment_t* segment);

 // "segment.c"
+extern mi_abandoned_pool_t _mi_abandoned_default;  // global abandoned pool
 mi_page_t* _mi_segment_page_alloc(mi_heap_t* heap, size_t block_size, size_t page_alignment, mi_segments_tld_t* tld, mi_os_tld_t* os_tld);
 void       _mi_segment_page_free(mi_page_t* page, bool force, mi_segments_tld_t* tld);
 void       _mi_segment_page_abandon(mi_page_t* page, mi_segments_tld_t* tld);
@@ -144,7 +130,7 @@ void       _mi_segment_huge_page_reset(mi_segment_t* segment, mi_page_t* page, m
 uint8_t*   _mi_segment_page_start(const mi_segment_t* segment, const mi_page_t* page, size_t* page_size);  // page start for any page
 void       _mi_abandoned_reclaim_all(mi_heap_t* heap, mi_segments_tld_t* tld);
-void       _mi_abandoned_await_readers(void);
+void       _mi_abandoned_await_readers(mi_abandoned_pool_t *pool);
 void       _mi_abandoned_collect(mi_heap_t* heap, bool force, mi_segments_tld_t* tld);

 // "page.c"
@@ -170,8 +156,10 @@ size_t     _mi_bin_size(uint8_t bin);  // for stats
 uint8_t    _mi_bin(size_t size);       // for stats

 // "heap.c"
+void       _mi_heap_init_ex(mi_heap_t* heap, mi_tld_t* tld, mi_arena_id_t arena_id);
 void       _mi_heap_destroy_pages(mi_heap_t* heap);
 void       _mi_heap_collect_abandon(mi_heap_t* heap);
+void       _mi_heap_absorb(mi_heap_t* heap, mi_heap_t* from);
 void       _mi_heap_set_default_direct(mi_heap_t* heap);
 bool       _mi_heap_memid_is_suitable(mi_heap_t* heap, mi_memid_t memid);
 void       _mi_heap_unsafe_destroy_all(void);
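NOTE: the new _mi_tld_init()/_mi_heap_init_ex() entry points separate
initialization from allocation, so callers can embed mi_tld_t and mi_heap_t
in memory they already own (as _PyThreadStateImpl does below). A minimal
sketch, assuming a hypothetical caller-owned container:

    typedef struct {
        mi_tld_t tld;
        mi_heap_t heap;   // doubles as the backing heap
    } my_alloc_state;     // hypothetical container

    static void
    my_alloc_state_init(my_alloc_state *s)
    {
        _mi_tld_init(&s->tld, &s->heap);   // wires heap_backing; no allocation
        _mi_heap_init_ex(&s->heap, &s->tld, _mi_arena_id_none());
    }

Because s->heap is the backing heap here, _mi_heap_init_ex() seeds a fresh
random state for it instead of splitting from an existing backing heap.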
diff --git a/Include/internal/mimalloc/mimalloc/types.h b/Include/internal/mimalloc/mimalloc/types.h
index 7616f37e4b978f7..ab41b1ce990827f 100644
--- a/Include/internal/mimalloc/mimalloc/types.h
+++ b/Include/internal/mimalloc/mimalloc/types.h
@@ -33,6 +33,23 @@ terms of the MIT license. A copy of the license can be found in the file
 #define MI_MAX_ALIGN_SIZE 16  // sizeof(max_align_t)
 #endif

+#define MI_CACHE_LINE 64
+#if defined(_MSC_VER)
+#pragma warning(disable:4127)   // suppress constant conditional warning (due to MI_SECURE paths)
+#pragma warning(disable:26812)  // unscoped enum warning
+#define mi_decl_noinline        __declspec(noinline)
+#define mi_decl_thread          __declspec(thread)
+#define mi_decl_cache_align     __declspec(align(MI_CACHE_LINE))
+#elif (defined(__GNUC__) && (__GNUC__ >= 3)) || defined(__clang__) // includes clang and icc
+#define mi_decl_noinline        __attribute__((noinline))
+#define mi_decl_thread          __thread
+#define mi_decl_cache_align     __attribute__((aligned(MI_CACHE_LINE)))
+#else
+#define mi_decl_noinline
+#define mi_decl_thread          __thread  // hope for the best :-)
+#define mi_decl_cache_align
+#endif
+
 // ------------------------------------------------------
 // Variants
 // ------------------------------------------------------
@@ -445,6 +462,28 @@ typedef struct mi_segment_s {
   mi_slice_t slices[MI_SLICES_PER_SEGMENT+1];  // one more for huge blocks with large alignment
 } mi_segment_t;

+typedef uintptr_t mi_tagged_segment_t;
+
+// Segments unowned by any thread are put in a shared pool
+typedef struct mi_abandoned_pool_s {
+  // This is a list of visited abandoned pages that were full at the time.
+  // this list migrates to `abandoned` when that becomes NULL. The use of
+  // this list reduces contention and the rate at which segments are visited.
+  mi_decl_cache_align _Atomic(mi_segment_t*) abandoned_visited;  // = NULL
+
+  // The abandoned page list (tagged as it supports pop)
+  mi_decl_cache_align _Atomic(mi_tagged_segment_t) abandoned;  // = NULL
+
+  // Maintain these for debug purposes (these counts may be a bit off)
+  mi_decl_cache_align _Atomic(size_t) abandoned_count;
+  mi_decl_cache_align _Atomic(size_t) abandoned_visited_count;
+
+  // We also maintain a count of current readers of the abandoned list
+  // in order to prevent resetting/decommitting segment memory if it might
+  // still be read.
+  mi_decl_cache_align _Atomic(size_t) abandoned_readers;  // = 0
+} mi_abandoned_pool_t;
+
 // ------------------------------------------------------
 // Heaps
@@ -654,6 +693,7 @@ typedef struct mi_segments_tld_s {
   size_t               peak_size;  // peak size of all segments
   mi_stats_t*          stats;      // points to tld stats
   mi_os_tld_t*         os;         // points to os stats
+  mi_abandoned_pool_t* abandoned;  // pool of abandoned segments
 } mi_segments_tld_t;

 // Thread local data
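NOTE: the `abandoned` field above is the head of a lock-free stack. A pop
must dereference the old head, so a plain pointer CAS would be exposed to
the A-B-A problem; mi_tagged_segment_t instead spends the low, always-zero
alignment bits of the segment pointer on a counter that changes on every
update. A condensed sketch of the encoding (the real helpers are
mi_tagged_segment_ptr()/mi_tagged_segment() in segment.c below):

    // Segments are MI_SEGMENT_SIZE-aligned, so the low bits are free.
    // Thread A loads head = (S1 | tag 5). Thread B pops S1, pushes S2,
    // then pushes S1 back: the head is now (S1 | tag 7). Thread A's CAS
    // against (S1 | tag 5) fails even though the pointer part matches.
    static mi_tagged_segment_t
    tagged_make(mi_segment_t *p, mi_tagged_segment_t prev)
    {
        mi_tagged_segment_t tag = ((prev & MI_SEGMENT_MASK) + 1) & MI_SEGMENT_MASK;
        return (uintptr_t)p | tag;
    }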
diff --git a/Include/internal/pycore_interp.h b/Include/internal/pycore_interp.h
index 04d7a6a615e370f..4512b1edb4b9b35 100644
--- a/Include/internal/pycore_interp.h
+++ b/Include/internal/pycore_interp.h
@@ -27,6 +27,7 @@ extern "C" {
 #include "pycore_import.h"        // struct _import_state
 #include "pycore_instruments.h"   // _PY_MONITORING_EVENTS
 #include "pycore_list.h"          // struct _Py_list_state
+#include "pycore_mimalloc.h"      // struct _mimalloc_interp_state
 #include "pycore_object_state.h"  // struct _py_object_state
 #include "pycore_obmalloc.h"      // struct _obmalloc_state
 #include "pycore_tstate.h"        // _PyThreadStateImpl
@@ -166,6 +167,10 @@ struct _is {
     struct _warnings_runtime_state warnings;
     struct atexit_state atexit;

+#if defined(Py_GIL_DISABLED)
+    struct _mimalloc_interp_state mimalloc;
+#endif
+
     struct _obmalloc_state obmalloc;

     PyObject *audit_hooks;

diff --git a/Include/internal/pycore_mimalloc.h b/Include/internal/pycore_mimalloc.h
index c29dc82a42762aa..1687ee05b7e2be6 100644
--- a/Include/internal/pycore_mimalloc.h
+++ b/Include/internal/pycore_mimalloc.h
@@ -9,11 +9,44 @@
 #  error "pycore_mimalloc.h must be included before mimalloc.h"
 #endif

+#define _Py_MIMALLOC_HEAP_MEM     0  // PyMem_Malloc() and friends
+#define _Py_MIMALLOC_HEAP_OBJECT  1  // non-GC objects
+#define _Py_MIMALLOC_HEAP_GC      2  // GC objects without pre-header
+#define _Py_MIMALLOC_HEAP_GC_PRE  3  // GC objects with pre-header
+#define _Py_MIMALLOC_HEAP_COUNT   4
+
 #include "pycore_pymem.h"
 #define MI_DEBUG_UNINIT   PYMEM_CLEANBYTE
 #define MI_DEBUG_FREED    PYMEM_DEADBYTE
 #define MI_DEBUG_PADDING  PYMEM_FORBIDDENBYTE
+#ifdef Py_DEBUG
+#  define MI_DEBUG 1
+#else
+#  define MI_DEBUG 0
+#endif

 #include "mimalloc.h"
+#include "mimalloc/types.h"
+#include "mimalloc/internal.h"
+
+struct _mimalloc_interp_state {
+#ifdef Py_GIL_DISABLED
+    // When exiting, threads place any segments with live blocks in this
+    // shared pool for other threads to claim and reuse.
+    mi_abandoned_pool_t abandoned_pool;
+#else
+    char _unused;  // empty structs are not allowed
+#endif
+};
+
+struct _mimalloc_thread_state {
+#ifdef Py_GIL_DISABLED
+    mi_heap_t *current_object_heap;
+    mi_heap_t heaps[_Py_MIMALLOC_HEAP_COUNT];
+    mi_tld_t tld;
+#else
+    char _unused;  // empty structs are not allowed
+#endif
+};

 #endif  // Py_INTERNAL_MIMALLOC_H
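NOTE: current_object_heap is an indirection so that object allocation
sites never need to know which heap is active. A minimal sketch of how a
GC allocation path might temporarily retarget it (the pattern the
tstate_mimalloc_bind() comment in pystate.c below refers to); the helper
name is hypothetical and this assumes the object domain is routed to the
mimalloc functions in Objects/obmalloc.c:

    static PyObject *
    gc_alloc_sketch(_PyThreadStateImpl *tstate, size_t size)
    {
        struct _mimalloc_thread_state *m = &tstate->mimalloc;
        mi_heap_t *prev = m->current_object_heap;
        m->current_object_heap = &m->heaps[_Py_MIMALLOC_HEAP_GC];
        PyObject *op = (PyObject *)PyObject_Malloc(size);  // lands in the GC heap
        m->current_object_heap = prev;  // restore the default object heap
        return op;
    }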
diff --git a/Include/internal/pycore_tstate.h b/Include/internal/pycore_tstate.h
index 17f3e8656417735..856ddd5e7e5ff07 100644
--- a/Include/internal/pycore_tstate.h
+++ b/Include/internal/pycore_tstate.h
@@ -8,6 +8,8 @@ extern "C" {
 #  error "this header requires Py_BUILD_CORE define"
 #endif

+#include "pycore_mimalloc.h"  // struct _mimalloc_thread_state
+

 // Every PyThreadState is actually allocated as a _PyThreadStateImpl. The
 // PyThreadState fields are exposed as part of the C API, although most fields
@@ -16,7 +18,10 @@ typedef struct _PyThreadStateImpl {
     // semi-public fields are in PyThreadState.
     PyThreadState base;

-    // TODO: add private fields here
+#ifdef Py_GIL_DISABLED
+    struct _mimalloc_thread_state mimalloc;
+#endif
+
 } _PyThreadStateImpl;

diff --git a/Lib/test/test_import/__init__.py b/Lib/test/test_import/__init__.py
index 48c0a43f29e27f8..701e585aa5432fb 100644
--- a/Lib/test/test_import/__init__.py
+++ b/Lib/test/test_import/__init__.py
@@ -26,7 +26,8 @@ from test.support import os_helper
 from test.support import (
     STDLIB_DIR, swap_attr, swap_item, cpython_only, is_emscripten,
-    is_wasi, run_in_subinterp, run_in_subinterp_with_config, Py_TRACE_REFS)
+    is_wasi, run_in_subinterp, run_in_subinterp_with_config, Py_TRACE_REFS,
+    Py_GIL_DISABLED)
 from test.support.import_helper import (
     forget, make_legacy_pyc, unlink, unload, ready_to_import,
     DirsOnSysPath, CleanImport)
@@ -2018,6 +2019,7 @@ def parse(cls, text):
         return self

+@unittest.skipIf(Py_GIL_DISABLED, "test deallocates objects from a different interpreter")
 @requires_singlephase_init
 class SinglephaseInitTests(unittest.TestCase):

diff --git a/Objects/mimalloc/heap.c b/Objects/mimalloc/heap.c
index 4eb622ed4bad765..c2c18d7fae288f1 100644
--- a/Objects/mimalloc/heap.c
+++ b/Objects/mimalloc/heap.c
@@ -206,18 +206,28 @@ mi_heap_t* mi_heap_get_backing(void) {
   return bheap;
 }

-mi_decl_nodiscard mi_heap_t* mi_heap_new_in_arena(mi_arena_id_t arena_id) {
-  mi_heap_t* bheap = mi_heap_get_backing();
-  mi_heap_t* heap = mi_heap_malloc_tp(bheap, mi_heap_t);  // todo: OS allocate in secure mode?
-  if (heap == NULL) return NULL;
+void _mi_heap_init_ex(mi_heap_t* heap, mi_tld_t* tld, mi_arena_id_t arena_id)
+{
   _mi_memcpy_aligned(heap, &_mi_heap_empty, sizeof(mi_heap_t));
-  heap->tld = bheap->tld;
+  heap->tld = tld;
   heap->thread_id = _mi_thread_id();
   heap->arena_id = arena_id;
-  _mi_random_split(&bheap->random, &heap->random);
+  if (heap == tld->heap_backing) {
+    _mi_random_init(&heap->random);
+  }
+  else {
+    _mi_random_split(&tld->heap_backing->random, &heap->random);
+  }
   heap->cookie = _mi_heap_random_next(heap) | 1;
   heap->keys[0] = _mi_heap_random_next(heap);
   heap->keys[1] = _mi_heap_random_next(heap);
+}
+
+mi_decl_nodiscard mi_heap_t* mi_heap_new_in_arena(mi_arena_id_t arena_id) {
+  mi_heap_t* bheap = mi_heap_get_backing();
+  mi_heap_t* heap = mi_heap_malloc_tp(bheap, mi_heap_t);  // todo: OS allocate in secure mode?
+  if (heap == NULL) return NULL;
+  _mi_heap_init_ex(heap, bheap->tld, arena_id);
   heap->no_reclaim = true;  // don't reclaim abandoned pages or otherwise destroy is unsafe
   // push on the thread local heaps list
   heap->next = heap->tld->heaps;
@@ -383,7 +393,7 @@ void _mi_heap_unsafe_destroy_all(void) {
 ----------------------------------------------------------- */

 // Transfer the pages from one heap to the other
-static void mi_heap_absorb(mi_heap_t* heap, mi_heap_t* from) {
+void _mi_heap_absorb(mi_heap_t* heap, mi_heap_t* from) {
   mi_assert_internal(heap!=NULL);
   if (from==NULL || from->page_count == 0) return;
@@ -426,7 +436,7 @@ void mi_heap_delete(mi_heap_t* heap)

   if (!mi_heap_is_backing(heap)) {
     // tranfer still used pages to the backing heap
-    mi_heap_absorb(heap->tld->heap_backing, heap);
+    _mi_heap_absorb(heap->tld->heap_backing, heap);
   }
   else {
     // the backing heap abandons its pages
diff --git a/Objects/mimalloc/init.c b/Objects/mimalloc/init.c
index 7dfa76577371171..7f85d5905649d7d 100644
--- a/Objects/mimalloc/init.c
+++ b/Objects/mimalloc/init.c
@@ -148,7 +148,7 @@ extern mi_heap_t _mi_heap_main;
 static mi_tld_t tld_main = {
   0, false,
   &_mi_heap_main, & _mi_heap_main,
-  { MI_SEGMENT_SPAN_QUEUES_EMPTY, 0, 0, 0, 0, &tld_main.stats, &tld_main.os },  // segments
+  { MI_SEGMENT_SPAN_QUEUES_EMPTY, 0, 0, 0, 0, &tld_main.stats, &tld_main.os, &_mi_abandoned_default },  // segments
   { 0, &tld_main.stats },  // os
   { MI_STATS_NULL }        // stats
 };
@@ -297,24 +297,21 @@ static bool _mi_heap_init(void) {
     mi_thread_data_t* td = mi_thread_data_zalloc();
     if (td == NULL) return false;

-    mi_tld_t*  tld = &td->tld;
-    mi_heap_t* heap = &td->heap;
+    _mi_tld_init(&td->tld, &td->heap);
+    _mi_heap_init_ex(&td->heap, &td->tld, _mi_arena_id_none());
+    _mi_heap_set_default_direct(&td->heap);
+  }
+  return false;
+}
+
+void _mi_tld_init(mi_tld_t* tld, mi_heap_t* bheap) {
     _mi_memcpy_aligned(tld, &tld_empty, sizeof(*tld));
-    _mi_memcpy_aligned(heap, &_mi_heap_empty, sizeof(*heap));
-    heap->thread_id = _mi_thread_id();
-    _mi_random_init(&heap->random);
-    heap->cookie  = _mi_heap_random_next(heap) | 1;
-    heap->keys[0] = _mi_heap_random_next(heap);
-    heap->keys[1] = _mi_heap_random_next(heap);
-    heap->tld = tld;
-    tld->heap_backing = heap;
-    tld->heaps = heap;
     tld->segments.stats = &tld->stats;
     tld->segments.os = &tld->os;
+    tld->segments.abandoned = &_mi_abandoned_default;
     tld->os.stats = &tld->stats;
-    _mi_heap_set_default_direct(heap);
-  }
-  return false;
+    tld->heap_backing = bheap;
+    tld->heaps = bheap;
 }

 // Free the thread local default heap  (called from `mi_thread_done`)
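NOTE: because abandoned segments live in a pool rather than in any
thread's heap, a surviving thread can later adopt them. A minimal sketch,
assuming a thread that wants to take over everything currently in its
interpreter's pool (the pool is wired into tld->segments.abandoned by
tstate_mimalloc_bind() in pystate.c below); the helper is hypothetical:

    static void
    adopt_abandoned_segments(_PyThreadStateImpl *tstate)
    {
        // Reclaim every pooled segment into this thread's backing
        // ("mem") heap using the pool referenced by its own tld.
        mi_heap_t *backing = &tstate->mimalloc.heaps[_Py_MIMALLOC_HEAP_MEM];
        _mi_abandoned_reclaim_all(backing, &tstate->mimalloc.tld.segments);
    }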
diff --git a/Objects/mimalloc/segment.c b/Objects/mimalloc/segment.c
index 033e0f97c36c14e..0fa4f16a36617a3 100644
--- a/Objects/mimalloc/segment.c
+++ b/Objects/mimalloc/segment.c
@@ -395,7 +395,7 @@ static void mi_segment_os_free(mi_segment_t* segment, mi_segments_tld_t* tld) {
   const size_t size = mi_segment_size(segment);
   const size_t csize = _mi_commit_mask_committed_size(&segment->commit_mask, size);

-  _mi_abandoned_await_readers();  // wait until safe to free
+  _mi_abandoned_await_readers(tld->abandoned);  // wait until safe to free
   _mi_arena_free(segment, mi_segment_size(segment), csize, segment->memid, tld->stats);
 }
@@ -1059,7 +1059,6 @@ would be spread among all other segments in the arenas.
 // Use the bottom 20-bits (on 64-bit) of the aligned segment pointers
 // to put in a tag that increments on update to avoid the A-B-A problem.
 #define MI_TAGGED_MASK   MI_SEGMENT_MASK
-typedef uintptr_t mi_tagged_segment_t;

 static mi_segment_t* mi_tagged_segment_ptr(mi_tagged_segment_t ts) {
   return (mi_segment_t*)(ts & ~MI_TAGGED_MASK);
 }
@@ -1071,55 +1070,40 @@ static mi_tagged_segment_t mi_tagged_segment(mi_segment_t* segment, mi_tagged_se
   return ((uintptr_t)segment | tag);
 }

-// This is a list of visited abandoned pages that were full at the time.
-// this list migrates to `abandoned` when that becomes NULL. The use of
-// this list reduces contention and the rate at which segments are visited.
-static mi_decl_cache_align _Atomic(mi_segment_t*) abandoned_visited;  // = NULL
-
-// The abandoned page list (tagged as it supports pop)
-static mi_decl_cache_align _Atomic(mi_tagged_segment_t) abandoned;  // = NULL
-
-// Maintain these for debug purposes (these counts may be a bit off)
-static mi_decl_cache_align _Atomic(size_t) abandoned_count;
-static mi_decl_cache_align _Atomic(size_t) abandoned_visited_count;
-
-// We also maintain a count of current readers of the abandoned list
-// in order to prevent resetting/decommitting segment memory if it might
-// still be read.
-static mi_decl_cache_align _Atomic(size_t) abandoned_readers;  // = 0
+mi_abandoned_pool_t _mi_abandoned_default;

 // Push on the visited list
-static void mi_abandoned_visited_push(mi_segment_t* segment) {
+static void mi_abandoned_visited_push(mi_abandoned_pool_t *pool, mi_segment_t* segment) {
   mi_assert_internal(segment->thread_id == 0);
   mi_assert_internal(mi_atomic_load_ptr_relaxed(mi_segment_t,&segment->abandoned_next) == NULL);
   mi_assert_internal(segment->next == NULL);
   mi_assert_internal(segment->used > 0);
-  mi_segment_t* anext = mi_atomic_load_ptr_relaxed(mi_segment_t, &abandoned_visited);
+  mi_segment_t* anext = mi_atomic_load_ptr_relaxed(mi_segment_t, &pool->abandoned_visited);
   do {
     mi_atomic_store_ptr_release(mi_segment_t, &segment->abandoned_next, anext);
-  } while (!mi_atomic_cas_ptr_weak_release(mi_segment_t, &abandoned_visited, &anext, segment));
-  mi_atomic_increment_relaxed(&abandoned_visited_count);
+  } while (!mi_atomic_cas_ptr_weak_release(mi_segment_t, &pool->abandoned_visited, &anext, segment));
+  mi_atomic_increment_relaxed(&pool->abandoned_visited_count);
 }

 // Move the visited list to the abandoned list.
-static bool mi_abandoned_visited_revisit(void)
+static bool mi_abandoned_visited_revisit(mi_abandoned_pool_t *pool)
 {
   // quick check if the visited list is empty
-  if (mi_atomic_load_ptr_relaxed(mi_segment_t, &abandoned_visited) == NULL) return false;
+  if (mi_atomic_load_ptr_relaxed(mi_segment_t, &pool->abandoned_visited) == NULL) return false;

   // grab the whole visited list
-  mi_segment_t* first = mi_atomic_exchange_ptr_acq_rel(mi_segment_t, &abandoned_visited, NULL);
+  mi_segment_t* first = mi_atomic_exchange_ptr_acq_rel(mi_segment_t, &pool->abandoned_visited, NULL);
   if (first == NULL) return false;

   // first try to swap directly if the abandoned list happens to be NULL
   mi_tagged_segment_t afirst;
-  mi_tagged_segment_t ts = mi_atomic_load_relaxed(&abandoned);
+  mi_tagged_segment_t ts = mi_atomic_load_relaxed(&pool->abandoned);
   if (mi_tagged_segment_ptr(ts)==NULL) {
-    size_t count = mi_atomic_load_relaxed(&abandoned_visited_count);
+    size_t count = mi_atomic_load_relaxed(&pool->abandoned_visited_count);
     afirst = mi_tagged_segment(first, ts);
-    if (mi_atomic_cas_strong_acq_rel(&abandoned, &ts, afirst)) {
-      mi_atomic_add_relaxed(&abandoned_count, count);
-      mi_atomic_sub_relaxed(&abandoned_visited_count, count);
+    if (mi_atomic_cas_strong_acq_rel(&pool->abandoned, &ts, afirst)) {
+      mi_atomic_add_relaxed(&pool->abandoned_count, count);
+      mi_atomic_sub_relaxed(&pool->abandoned_visited_count, count);
       return true;
     }
   }
@@ -1133,51 +1117,52 @@ static bool mi_abandoned_visited_revisit(void)

   // and atomically prepend to the abandoned list
   // (no need to increase the readers as we don't access the abandoned segments)
-  mi_tagged_segment_t anext = mi_atomic_load_relaxed(&abandoned);
+  mi_tagged_segment_t anext = mi_atomic_load_relaxed(&pool->abandoned);
   size_t count;
   do {
-    count = mi_atomic_load_relaxed(&abandoned_visited_count);
+    count = mi_atomic_load_relaxed(&pool->abandoned_visited_count);
     mi_atomic_store_ptr_release(mi_segment_t, &last->abandoned_next, mi_tagged_segment_ptr(anext));
     afirst = mi_tagged_segment(first, anext);
-  } while (!mi_atomic_cas_weak_release(&abandoned, &anext, afirst));
-  mi_atomic_add_relaxed(&abandoned_count, count);
-  mi_atomic_sub_relaxed(&abandoned_visited_count, count);
+  } while (!mi_atomic_cas_weak_release(&pool->abandoned, &anext, afirst));
+  mi_atomic_add_relaxed(&pool->abandoned_count, count);
+  mi_atomic_sub_relaxed(&pool->abandoned_visited_count, count);
   return true;
 }

 // Push on the abandoned list.
-static void mi_abandoned_push(mi_segment_t* segment) {
+static void mi_abandoned_push(mi_abandoned_pool_t* pool, mi_segment_t* segment) {
   mi_assert_internal(segment->thread_id == 0);
   mi_assert_internal(mi_atomic_load_ptr_relaxed(mi_segment_t, &segment->abandoned_next) == NULL);
   mi_assert_internal(segment->next == NULL);
   mi_assert_internal(segment->used > 0);
   mi_tagged_segment_t next;
-  mi_tagged_segment_t ts = mi_atomic_load_relaxed(&abandoned);
+  mi_tagged_segment_t ts = mi_atomic_load_relaxed(&pool->abandoned);
   do {
     mi_atomic_store_ptr_release(mi_segment_t, &segment->abandoned_next, mi_tagged_segment_ptr(ts));
     next = mi_tagged_segment(segment, ts);
-  } while (!mi_atomic_cas_weak_release(&abandoned, &ts, next));
-  mi_atomic_increment_relaxed(&abandoned_count);
+  } while (!mi_atomic_cas_weak_release(&pool->abandoned, &ts, next));
+  mi_atomic_increment_relaxed(&pool->abandoned_count);
 }

 // Wait until there are no more pending reads on segments that used to be in the abandoned list
 // called for example from `arena.c` before decommitting
-void _mi_abandoned_await_readers(void) {
+void _mi_abandoned_await_readers(mi_abandoned_pool_t* pool) {
   size_t n;
   do {
-    n = mi_atomic_load_acquire(&abandoned_readers);
+    n = mi_atomic_load_acquire(&pool->abandoned_readers);
     if (n != 0) mi_atomic_yield();
   } while (n != 0);
 }

 // Pop from the abandoned list
-static mi_segment_t* mi_abandoned_pop(void) {
+static mi_segment_t* mi_abandoned_pop(mi_abandoned_pool_t* pool) {
   mi_segment_t* segment;
   // Check efficiently if it is empty (or if the visited list needs to be moved)
-  mi_tagged_segment_t ts = mi_atomic_load_relaxed(&abandoned);
+  mi_tagged_segment_t ts = mi_atomic_load_relaxed(&pool->abandoned);
   segment = mi_tagged_segment_ptr(ts);
   if mi_likely(segment == NULL) {
-    if mi_likely(!mi_abandoned_visited_revisit()) {  // try to swap in the visited list on NULL
+    if mi_likely(!mi_abandoned_visited_revisit(pool)) {  // try to swap in the visited list on NULL
       return NULL;
     }
   }
@@ -1186,20 +1171,20 @@ static mi_segment_t* mi_abandoned_pop(void) {
   // a segment to be decommitted while a read is still pending,
   // and a tagged pointer to prevent A-B-A link corruption.
   // (this is called from `region.c:_mi_mem_free` for example)
-  mi_atomic_increment_relaxed(&abandoned_readers);  // ensure no segment gets decommitted
+  mi_atomic_increment_relaxed(&pool->abandoned_readers);  // ensure no segment gets decommitted
   mi_tagged_segment_t next = 0;
-  ts = mi_atomic_load_acquire(&abandoned);
+  ts = mi_atomic_load_acquire(&pool->abandoned);
   do {
     segment = mi_tagged_segment_ptr(ts);
     if (segment != NULL) {
       mi_segment_t* anext = mi_atomic_load_ptr_relaxed(mi_segment_t, &segment->abandoned_next);
       next = mi_tagged_segment(anext, ts);  // note: reads the segment's `abandoned_next` field so should not be decommitted
     }
-  } while (segment != NULL && !mi_atomic_cas_weak_acq_rel(&abandoned, &ts, next));
-  mi_atomic_decrement_relaxed(&abandoned_readers);  // release reader lock
+  } while (segment != NULL && !mi_atomic_cas_weak_acq_rel(&pool->abandoned, &ts, next));
+  mi_atomic_decrement_relaxed(&pool->abandoned_readers);  // release reader lock
   if (segment != NULL) {
     mi_atomic_store_ptr_release(mi_segment_t, &segment->abandoned_next, NULL);
-    mi_atomic_decrement_relaxed(&abandoned_count);
+    mi_atomic_decrement_relaxed(&pool->abandoned_count);
   }
   return segment;
 }
@@ -1237,7 +1222,7 @@ static void mi_segment_abandon(mi_segment_t* segment, mi_segments_tld_t* tld) {
   segment->thread_id = 0;
   mi_atomic_store_ptr_release(mi_segment_t, &segment->abandoned_next, NULL);
   segment->abandoned_visits = 1;  // from 0 to 1 to signify it is abandoned
-  mi_abandoned_push(segment);
+  mi_abandoned_push(tld->abandoned, segment);
 }

 void _mi_segment_page_abandon(mi_page_t* page, mi_segments_tld_t* tld) {
@@ -1381,7 +1366,7 @@ static mi_segment_t* mi_segment_reclaim(mi_segment_t* segment, mi_heap_t* heap,

 void _mi_abandoned_reclaim_all(mi_heap_t* heap, mi_segments_tld_t* tld) {
   mi_segment_t* segment;
-  while ((segment = mi_abandoned_pop()) != NULL) {
+  while ((segment = mi_abandoned_pop(tld->abandoned)) != NULL) {
     mi_segment_reclaim(segment, heap, 0, NULL, tld);
   }
 }
@@ -1391,7 +1376,7 @@ static mi_segment_t* mi_segment_try_reclaim(mi_heap_t* heap, size_t needed_slice
   *reclaimed = false;
   mi_segment_t* segment;
   long max_tries = mi_option_get_clamp(mi_option_max_segment_reclaim, 8, 1024);  // limit the work to bound allocation times
-  while ((max_tries-- > 0) && ((segment = mi_abandoned_pop()) != NULL)) {
+  while ((max_tries-- > 0) && ((segment = mi_abandoned_pop(tld->abandoned)) != NULL)) {
     segment->abandoned_visits++;
     // todo: an arena exclusive heap will potentially visit many abandoned unsuitable segments
     // and push them into the visited list and use many tries. Perhaps we can skip non-suitable ones in a better way?
@@ -1418,7 +1403,7 @@ static mi_segment_t* mi_segment_try_reclaim(mi_heap_t* heap, size_t needed_slice
     else {
       // otherwise, push on the visited list so it gets not looked at too quickly again
       mi_segment_try_purge(segment, true /* force? */, tld->stats);  // force purge if needed as we may not visit soon again
-      mi_abandoned_visited_push(segment);
+      mi_abandoned_visited_push(tld->abandoned, segment);
     }
   }
   return NULL;
@@ -1428,11 +1413,12 @@ static mi_segment_t* mi_segment_try_reclaim(mi_heap_t* heap, size_t needed_slice
 void _mi_abandoned_collect(mi_heap_t* heap, bool force, mi_segments_tld_t* tld)
 {
   mi_segment_t* segment;
+  mi_abandoned_pool_t* pool = tld->abandoned;
   int max_tries = (force ? 16*1024 : 1024);  // limit latency
   if (force) {
-    mi_abandoned_visited_revisit();
+    mi_abandoned_visited_revisit(pool);
   }
-  while ((max_tries-- > 0) && ((segment = mi_abandoned_pop()) != NULL)) {
+  while ((max_tries-- > 0) && ((segment = mi_abandoned_pop(pool)) != NULL)) {
     mi_segment_check_free(segment,0,0,tld);  // try to free up pages (due to concurrent frees)
     if (segment->used == 0) {
       // free the segment (by forced reclaim) to make it available to other threads.
@@ -1444,7 +1430,7 @@ void _mi_abandoned_collect(mi_heap_t* heap, bool force, mi_segments_tld_t* tld)
       // otherwise, purge if needed and push on the visited list
       // note: forced purge can be expensive if many threads are destroyed/created as in mstress.
       mi_segment_try_purge(segment, force, tld->stats);
-      mi_abandoned_visited_push(segment);
+      mi_abandoned_visited_push(pool, segment);
     }
   }
 }

diff --git a/Objects/obmalloc.c b/Objects/obmalloc.c
index 99c95d90658b08e..883adcb1c19b6e0 100644
--- a/Objects/obmalloc.c
+++ b/Objects/obmalloc.c
@@ -88,19 +88,37 @@ _PyMem_RawFree(void *Py_UNUSED(ctx), void *ptr)
 void *
 _PyMem_MiMalloc(void *ctx, size_t size)
 {
+#ifdef Py_GIL_DISABLED
+    _PyThreadStateImpl *tstate = (_PyThreadStateImpl *)_PyThreadState_GET();
+    mi_heap_t *heap = &tstate->mimalloc.heaps[_Py_MIMALLOC_HEAP_MEM];
+    return mi_heap_malloc(heap, size);
+#else
     return mi_malloc(size);
+#endif
 }

 void *
 _PyMem_MiCalloc(void *ctx, size_t nelem, size_t elsize)
 {
+#ifdef Py_GIL_DISABLED
+    _PyThreadStateImpl *tstate = (_PyThreadStateImpl *)_PyThreadState_GET();
+    mi_heap_t *heap = &tstate->mimalloc.heaps[_Py_MIMALLOC_HEAP_MEM];
+    return mi_heap_calloc(heap, nelem, elsize);
+#else
     return mi_calloc(nelem, elsize);
+#endif
 }

 void *
 _PyMem_MiRealloc(void *ctx, void *ptr, size_t size)
 {
+#ifdef Py_GIL_DISABLED
+    _PyThreadStateImpl *tstate = (_PyThreadStateImpl *)_PyThreadState_GET();
+    mi_heap_t *heap = &tstate->mimalloc.heaps[_Py_MIMALLOC_HEAP_MEM];
+    return mi_heap_realloc(heap, ptr, size);
+#else
     return mi_realloc(ptr, size);
+#endif
 }

 void
@@ -112,20 +130,38 @@ _PyMem_MiFree(void *ctx, void *ptr)
 void *
 _PyObject_MiMalloc(void *ctx, size_t nbytes)
 {
+#ifdef Py_GIL_DISABLED
+    _PyThreadStateImpl *tstate = (_PyThreadStateImpl *)_PyThreadState_GET();
+    mi_heap_t *heap = tstate->mimalloc.current_object_heap;
+    return mi_heap_malloc(heap, nbytes);
+#else
     return mi_malloc(nbytes);
+#endif
 }

 void *
 _PyObject_MiCalloc(void *ctx, size_t nelem, size_t elsize)
 {
+#ifdef Py_GIL_DISABLED
+    _PyThreadStateImpl *tstate = (_PyThreadStateImpl *)_PyThreadState_GET();
+    mi_heap_t *heap = tstate->mimalloc.current_object_heap;
+    return mi_heap_calloc(heap, nelem, elsize);
+#else
     return mi_calloc(nelem, elsize);
+#endif
 }

 void *
 _PyObject_MiRealloc(void *ctx, void *ptr, size_t nbytes)
 {
+#ifdef Py_GIL_DISABLED
+    _PyThreadStateImpl *tstate = (_PyThreadStateImpl *)_PyThreadState_GET();
+    mi_heap_t *heap = tstate->mimalloc.current_object_heap;
+    return mi_heap_realloc(heap, ptr, nbytes);
+#else
     return mi_realloc(ptr, nbytes);
+#endif
 }

 void
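NOTE: only the allocation entry points above consult a heap; the free
paths need no Py_GIL_DISABLED branch because mi_free() recovers the owning
page (and thus heap) from the pointer itself and handles frees from a
thread other than the allocating one. Illustrative usage, assuming a
free-threaded build:

    void *p = PyMem_Malloc(64);  // served from this thread's "mem" heap
    /* ... the pointer may be handed to, and freed by, another thread ... */
    PyMem_Free(p);  // mi_free() routes the block back to its owning heap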
diff --git a/Python/pystate.c b/Python/pystate.c
index e18eb0186d00103..d1126d8446a52a0 100644
--- a/Python/pystate.c
+++ b/Python/pystate.c
@@ -236,6 +236,9 @@ tstate_is_bound(PyThreadState *tstate)
 static void bind_gilstate_tstate(PyThreadState *);
 static void unbind_gilstate_tstate(PyThreadState *);

+static void tstate_mimalloc_bind(PyThreadState *);
+static void tstate_mimalloc_clear(PyThreadState *);
+
 static void
 bind_tstate(PyThreadState *tstate)
 {
@@ -256,6 +259,9 @@ bind_tstate(PyThreadState *tstate)
     tstate->native_thread_id = PyThread_get_thread_native_id();
 #endif

+    // mimalloc state needs to be initialized from the active thread.
+    tstate_mimalloc_bind(tstate);
+
     tstate->_status.bound = 1;
 }

@@ -1533,6 +1539,8 @@ PyThreadState_Clear(PyThreadState *tstate)
         tstate->on_delete(tstate->on_delete_data);
     }

+    tstate_mimalloc_clear(tstate);
+
     tstate->_status.cleared = 1;

     // XXX Call _PyThreadStateSwap(runtime, NULL) here if "current".
@@ -2495,3 +2503,57 @@ _PyThreadState_MustExit(PyThreadState *tstate)
     }
     return 1;
 }
+
+/********************/
+/* mimalloc support */
+/********************/
+
+static void
+tstate_mimalloc_bind(PyThreadState *tstate)
+{
+#ifdef Py_GIL_DISABLED
+    struct _mimalloc_thread_state *mts = &((_PyThreadStateImpl*)tstate)->mimalloc;
+    struct _mimalloc_interp_state *mis = &tstate->interp->mimalloc;
+
+    // Initialize the mimalloc thread state. This must be called from the
+    // same thread that will use the thread state. The "mem" heap doubles as
+    // the "backing" heap.
+    mi_tld_t *tld = &mts->tld;
+    _mi_tld_init(tld, &mts->heaps[_Py_MIMALLOC_HEAP_MEM]);
+
+    // Exiting threads push any remaining in-use segments to the abandoned
+    // pool to be re-claimed later by other threads. We use per-interpreter
+    // pools to keep Python objects from different interpreters separate.
+    tld->segments.abandoned = &mis->abandoned_pool;
+
+    // Initialize each heap
+    for (Py_ssize_t i = 0; i < _Py_MIMALLOC_HEAP_COUNT; i++) {
+        _mi_heap_init_ex(&mts->heaps[i], tld, _mi_arena_id_none());
+    }
+
+    // By default, object allocations use _Py_MIMALLOC_HEAP_OBJECT.
+    // _PyObject_GC_New() and similar functions temporarily override this to
+    // use one of the GC heaps.
+    mts->current_object_heap = &mts->heaps[_Py_MIMALLOC_HEAP_OBJECT];
+#endif
+}
+
+static void
+tstate_mimalloc_clear(PyThreadState *tstate)
+{
+#ifdef Py_GIL_DISABLED
+    if (!tstate->_status.bound) {
+        // The mimalloc heaps are only initialized when the thread is bound.
+        return;
+    }
+
+    _PyThreadStateImpl *tstate_impl = (_PyThreadStateImpl *)tstate;
+    for (Py_ssize_t i = 0; i < _Py_MIMALLOC_HEAP_COUNT; i++) {
+        // Abandon all segments in use by this thread. This pushes them to
+        // a shared pool to later be reclaimed by other threads. It's important
+        // to do this before the thread state is destroyed so that objects
+        // remain visible to the GC.
+        _mi_heap_collect_abandon(&tstate_impl->mimalloc.heaps[i]);
+    }
+#endif
+}