reduce contention on page metadata lists during the sweeping phase #52943

Merged 1 commit on Jan 21, 2024.

src/gc.c (124 changes: 108 additions & 16 deletions)
@@ -21,8 +21,8 @@ int jl_n_sweepthreads;
_Atomic(int) gc_n_threads_marking;
// Number of threads sweeping
_Atomic(int) gc_n_threads_sweeping;
-// Temporary for the `ptls->page_metadata_allocd` used during parallel sweeping
-_Atomic(jl_gc_page_stack_t *) gc_allocd_scratch;
+// Temporary for the `ptls->page_metadata_allocd` used during parallel sweeping (padded to avoid false sharing)
+_Atomic(jl_gc_padded_page_stack_t *) gc_allocd_scratch;
// `tid` of mutator thread that triggered GC
_Atomic(int) gc_master_tid;
// `tid` of first GC thread
@@ -1596,8 +1596,72 @@ static void gc_pool_sync_nfree(jl_gc_pagemeta_t *pg, jl_taggedvalue_t *last) JL_
pg->nfree = nfree;
}

-void gc_sweep_wake_all(void)
+// pre-scan pages to check whether there are enough of them to make parallel sweeping worthwhile;
+// this also sweeps pages that don't need to be linearly scanned
+int gc_sweep_prescan(jl_ptls_t ptls, jl_gc_padded_page_stack_t *new_gc_allocd_scratch)
+{
+// 4MB worth of pages is worth parallelizing
+const int n_pages_worth_parallel_sweep = (int)(4 * (1 << 20) / GC_PAGE_SZ);
+int n_pages_to_scan = 0;
+gc_page_profiler_serializer_t serializer = gc_page_serializer_create();
+for (int t_i = 0; t_i < gc_n_threads; t_i++) {
+jl_ptls_t ptls2 = gc_all_tls_states[t_i];
+if (ptls2 == NULL) {
+continue;
+}
+jl_gc_page_stack_t *dest = &new_gc_allocd_scratch[ptls2->tid].stack;
+jl_gc_page_stack_t tmp;
+jl_gc_pagemeta_t *tail = NULL;
+memset(&tmp, 0, sizeof(tmp));
+while (1) {
+jl_gc_pagemeta_t *pg = pop_lf_back_nosync(&ptls2->page_metadata_allocd);
+if (pg == NULL) {
+break;
+}
+int should_scan = 1;
+if (!pg->has_marked) {
+should_scan = 0;
+}
+if (!current_sweep_full && !pg->has_young) {
+assert(!prev_sweep_full || pg->prev_nold >= pg->nold);
+if (!prev_sweep_full || pg->prev_nold == pg->nold) {
+should_scan = 0;
+}
+}
+if (should_scan) {
+if (tail == NULL) {
+tail = pg;
+}
+n_pages_to_scan++;
+push_lf_back_nosync(&tmp, pg);
+}
+else {
+gc_sweep_pool_page(&serializer, dest, &ptls2->page_metadata_buffered, pg);
+}
+if (n_pages_to_scan >= n_pages_worth_parallel_sweep) {
+break;
+}
+}
+if (tail != NULL) {
+tail->next = jl_atomic_load_relaxed(&ptls2->page_metadata_allocd.bottom);
+}
+ptls2->page_metadata_allocd = tmp;
+if (n_pages_to_scan >= n_pages_worth_parallel_sweep) {
+break;
+}
+}
+gc_page_serializer_destroy(&serializer);
+return n_pages_to_scan >= n_pages_worth_parallel_sweep;
+}
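
The early-exit threshold above is a byte budget rather than a raw page count: the prescan stops queueing pages for linear scanning once it has seen 4 MiB worth, which is deemed enough work to justify waking the sweeper threads. A minimal sketch of the arithmetic, assuming a hypothetical 16 KiB page size (the actual value is whatever `GC_PAGE_SZ` expands to in this build):

#include <stdio.h>

#define EXAMPLE_PAGE_SZ (1 << 14) /* assumed 16 KiB; stands in for GC_PAGE_SZ */

int main(void)
{
    /* same expression as in gc_sweep_prescan */
    int n_pages_worth_parallel_sweep = (int)(4 * (1 << 20) / EXAMPLE_PAGE_SZ);
    printf("%d pages\n", n_pages_worth_parallel_sweep); /* prints "256 pages" under this assumption */
    return 0;
}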

+// wake up all threads to sweep the pages
+void gc_sweep_wake_all(jl_ptls_t ptls, jl_gc_padded_page_stack_t *new_gc_allocd_scratch)
{
+int parallel_sweep_worthwhile = gc_sweep_prescan(ptls, new_gc_allocd_scratch);
+jl_atomic_store(&gc_allocd_scratch, new_gc_allocd_scratch);
+if (!parallel_sweep_worthwhile) {
+return;
+}
uv_mutex_lock(&gc_threads_lock);
for (int i = gc_first_tid; i < gc_first_tid + jl_n_markthreads; i++) {
jl_ptls_t ptls2 = gc_all_tls_states[i];
@@ -1608,6 +1672,7 @@ void gc_sweep_wake_all(void)
uv_mutex_unlock(&gc_threads_lock);
}
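
Note that `gc_allocd_scratch` is published even when the prescan decides parallel sweeping is not worthwhile: the early return above only skips waking the dedicated GC threads, and the calling thread still sweeps the remaining pages itself through `gc_sweep_pool_parallel` (see `gc_sweep_pool` below), so the scratch array must be visible either way.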

+// wait for all threads to finish sweeping
void gc_sweep_wait_for_all(void)
{
jl_atomic_store(&gc_allocd_scratch, NULL);
@@ -1616,36 +1681,58 @@ void gc_sweep_wait_for_all(void)
}
}

-void gc_sweep_pool_parallel(void)
+// sweep all pools
+void gc_sweep_pool_parallel(jl_ptls_t ptls)
{
jl_atomic_fetch_add(&gc_n_threads_sweeping, 1);
-jl_gc_page_stack_t *allocd_scratch = jl_atomic_load(&gc_allocd_scratch);
+jl_gc_padded_page_stack_t *allocd_scratch = jl_atomic_load(&gc_allocd_scratch);
if (allocd_scratch != NULL) {
gc_page_profiler_serializer_t serializer = gc_page_serializer_create();
while (1) {
int found_pg = 0;
+// sequentially walk the threads and sweep the pages
for (int t_i = 0; t_i < gc_n_threads; t_i++) {
jl_ptls_t ptls2 = gc_all_tls_states[t_i];
+// skip foreign threads that already exited
if (ptls2 == NULL) {
continue;
}
-jl_gc_page_stack_t *allocd = &allocd_scratch[t_i];
-jl_gc_pagemeta_t *pg = pop_lf_back(&ptls2->page_metadata_allocd);
+jl_gc_page_stack_t *dest = &allocd_scratch[ptls2->tid].stack;
+jl_gc_pagemeta_t *pg = try_pop_lf_back(&ptls2->page_metadata_allocd);
+// failed steal attempt
if (pg == NULL) {
continue;
}
-gc_sweep_pool_page(&serializer, allocd, &ptls2->page_metadata_buffered, pg);
+gc_sweep_pool_page(&serializer, dest, &ptls2->page_metadata_buffered, pg);
found_pg = 1;
}
if (!found_pg) {
-break;
+// check for termination
+int no_more_work = 1;
+for (int t_i = 0; t_i < gc_n_threads; t_i++) {
+jl_ptls_t ptls2 = gc_all_tls_states[t_i];
+// skip foreign threads that already exited
+if (ptls2 == NULL) {
+continue;
+}
+jl_gc_pagemeta_t *pg = jl_atomic_load_relaxed(&ptls2->page_metadata_allocd.bottom);
+if (pg != NULL) {
+no_more_work = 0;
+break;
+}
+}
+if (no_more_work) {
+break;
+}
}
+jl_cpu_pause();
}
gc_page_serializer_destroy(&serializer);
}
jl_atomic_fetch_add(&gc_n_threads_sweeping, -1);
}
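
The loop above distributes work by stealing: every sweeper walks all threads' `page_metadata_allocd` lists, pops one page per visit, and terminates only when a full pass observes every list empty. Since swept pages are pushed to the separate scratch stacks, the source lists strictly drain during this phase. Below is a self-contained sketch of the same pattern, not Julia's actual code; `lf_stack_t`, `try_pop`, `sweep_one`, and `N_LISTS` are illustrative names:

#include <stdatomic.h>
#include <stddef.h>

typedef struct node {
    struct node *next;
} node_t;

typedef struct {
    _Atomic(node_t *) bottom;
} lf_stack_t;

#define N_LISTS 4
static lf_stack_t lists[N_LISTS];

/* bounded-retry pop: repeatedly losing the CAS race is treated as a
   failed steal so the caller moves on to the next victim; skipping an
   ABA tag is safe only because nothing is pushed back onto a list
   while it drains */
static node_t *try_pop(lf_stack_t *s)
{
    for (int i = 0; i < 1024; i++) {
        node_t *old = atomic_load_explicit(&s->bottom, memory_order_relaxed);
        if (old == NULL)
            return NULL;
        if (atomic_compare_exchange_weak(&s->bottom, &old, old->next))
            return old;
    }
    return NULL; /* give up; indistinguishable from an empty list */
}

static void sweep_one(node_t *pg) { (void)pg; /* sweep the page here */ }

void sweeper(void)
{
    while (1) {
        int found = 0;
        for (int i = 0; i < N_LISTS; i++) {
            node_t *pg = try_pop(&lists[i]);
            if (pg != NULL) {
                sweep_one(pg);
                found = 1;
            }
        }
        if (!found) {
            /* termination check: every list seen empty in one pass */
            int no_more_work = 1;
            for (int i = 0; i < N_LISTS; i++) {
                if (atomic_load_explicit(&lists[i].bottom, memory_order_relaxed) != NULL) {
                    no_more_work = 0;
                    break;
                }
            }
            if (no_more_work)
                break;
        }
    }
}

As in the real code, a failed bounded pop and a genuinely empty list look the same to the caller, which is why the termination check re-reads the list heads instead of trusting `found`.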

+// free all pages (i.e. through `madvise` on Linux) that were lazily freed
void gc_free_pages(void)
{
while (1) {
@@ -1670,7 +1757,7 @@ static void gc_sweep_pool(void)

// allocate enough space to hold the end of the free list chain
// for every thread and pool size
-jl_taggedvalue_t ***pfl = (jl_taggedvalue_t ***) alloca(n_threads * JL_GC_N_POOLS * sizeof(jl_taggedvalue_t**));
+jl_taggedvalue_t ***pfl = (jl_taggedvalue_t ***) malloc_s(n_threads * JL_GC_N_POOLS * sizeof(jl_taggedvalue_t**));

// update metadata of pages that were pointed to by freelist or newpages from a pool
// i.e. pages being the current allocation target
@@ -1712,17 +1799,18 @@
}

// the actual sweeping
-jl_gc_page_stack_t *tmp = (jl_gc_page_stack_t *)alloca(n_threads * sizeof(jl_gc_page_stack_t));
-memset(tmp, 0, n_threads * sizeof(jl_gc_page_stack_t));
-jl_atomic_store(&gc_allocd_scratch, tmp);
-gc_sweep_wake_all();
-gc_sweep_pool_parallel();
+jl_gc_padded_page_stack_t *new_gc_allocd_scratch = (jl_gc_padded_page_stack_t *) malloc_s(n_threads * sizeof(jl_gc_padded_page_stack_t));
+memset(new_gc_allocd_scratch, 0, n_threads * sizeof(jl_gc_padded_page_stack_t));
+jl_ptls_t ptls = jl_current_task->ptls;
+gc_sweep_wake_all(ptls, new_gc_allocd_scratch);
+gc_sweep_pool_parallel(ptls);
gc_sweep_wait_for_all();

// reset half-pages pointers
for (int t_i = 0; t_i < n_threads; t_i++) {
jl_ptls_t ptls2 = gc_all_tls_states[t_i];
if (ptls2 != NULL) {
-ptls2->page_metadata_allocd = tmp[t_i];
+ptls2->page_metadata_allocd = new_gc_allocd_scratch[t_i].stack;
for (int i = 0; i < JL_GC_N_POOLS; i++) {
jl_gc_pool_t *p = &ptls2->heap.norm_pools[i];
p->newpages = NULL;
Expand Down Expand Up @@ -1760,6 +1848,10 @@ static void gc_sweep_pool(void)
}
}

+// cleanup
+free(pfl);
+free(new_gc_allocd_scratch);

#ifdef _P64 // only enable concurrent sweeping on 64bit
// wake thread up to sweep concurrently
if (jl_n_sweepthreads > 0) {
src/gc.h (46 changes: 45 additions & 1 deletion)
@@ -199,6 +199,23 @@ extern jl_gc_page_stack_t global_page_pool_freed;
// in the sweeping phase, which also doesn't push a node into the
// same stack after it's popped

+STATIC_INLINE void push_lf_back_nosync(jl_gc_page_stack_t *pool, jl_gc_pagemeta_t *elt) JL_NOTSAFEPOINT
+{
+jl_gc_pagemeta_t *old_back = jl_atomic_load_relaxed(&pool->bottom);
+elt->next = old_back;
+jl_atomic_store_relaxed(&pool->bottom, elt);
+}
+
+STATIC_INLINE jl_gc_pagemeta_t *pop_lf_back_nosync(jl_gc_page_stack_t *pool) JL_NOTSAFEPOINT
+{
+jl_gc_pagemeta_t *old_back = jl_atomic_load_relaxed(&pool->bottom);
+if (old_back == NULL) {
+return NULL;
+}
+jl_atomic_store_relaxed(&pool->bottom, old_back->next);
+return old_back;
+}

STATIC_INLINE void push_lf_back(jl_gc_page_stack_t *pool, jl_gc_pagemeta_t *elt) JL_NOTSAFEPOINT
{
while (1) {
@@ -211,6 +228,23 @@ STATIC_INLINE void push_lf_back(jl_gc_page_stack_t *pool, jl_gc_pagemeta_t *elt)
}
}

+#define MAX_POP_ATTEMPTS (1 << 10)
+
+STATIC_INLINE jl_gc_pagemeta_t *try_pop_lf_back(jl_gc_page_stack_t *pool) JL_NOTSAFEPOINT
+{
+for (int i = 0; i < MAX_POP_ATTEMPTS; i++) {
+jl_gc_pagemeta_t *old_back = jl_atomic_load_relaxed(&pool->bottom);
+if (old_back == NULL) {
+return NULL;
+}
+if (jl_atomic_cmpswap(&pool->bottom, &old_back, old_back->next)) {
+return old_back;
+}
+jl_cpu_pause();
+}
+return NULL;
+}
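
Two details in these helpers are deliberate. The `_nosync` variants above skip atomic read-modify-write operations entirely because `gc_sweep_prescan` runs before any sweeper thread has been woken, so each list still has a single owner. And `try_pop_lf_back` caps its retries at `MAX_POP_ATTEMPTS`, turning heavy contention on one list into a failed steal (`NULL`) so that the sweeper moves on to another thread's list rather than spinning, while the unbounded `pop_lf_back` remains for callers that must drain a stack completely.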

STATIC_INLINE jl_gc_pagemeta_t *pop_lf_back(jl_gc_page_stack_t *pool) JL_NOTSAFEPOINT
{
while (1) {
@@ -224,6 +258,16 @@ STATIC_INLINE jl_gc_pagemeta_t *pop_lf_back(jl_gc_page_stack_t *pool) JL_NOTSAFE
jl_cpu_pause();
}
}
+typedef struct {
+jl_gc_page_stack_t stack;
+// pad to 128 bytes to avoid false-sharing
+#ifdef _P64
+void *_pad[15];
+#else
+void *_pad[31];
+#endif
+} jl_gc_padded_page_stack_t;
+static_assert(sizeof(jl_gc_padded_page_stack_t) == 128, "jl_gc_padded_page_stack_t is not 128 bytes");
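
The padding matters because the scratch array is indexed by thread id and written concurrently: without it, several stack heads (one pointer each) would share a cache line, and every push by one sweeper would invalidate that line for its neighbors. Rounding each slot up to 128 bytes, two typical 64-byte lines, also keeps adjacent-line prefetchers from coupling neighboring slots. A compile-time sketch of the resulting layout; the `example_*` names are illustrative, not Julia's:

#include <assert.h>
#include <stdatomic.h>
#include <stddef.h>

typedef struct {
    _Atomic(void *) bottom; /* stands in for jl_gc_page_stack_t */
} example_stack_t;

typedef struct {
    example_stack_t stack;
    void *_pad[15]; /* 8 + 15 * 8 = 128 bytes on an LP64 build */
} example_padded_stack_t;

static_assert(sizeof(example_padded_stack_t) == 128,
              "slot must span two 64-byte cache lines");

/* one slot per thread: &slots[i] and &slots[i + 1] are 128 bytes
   apart, so no two threads' stack heads share a cache line */
static example_padded_stack_t slots[8];

static_assert(offsetof(example_padded_stack_t, stack) == 0,
              "each stack head sits at the start of its private slot");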

typedef struct {
_Atomic(size_t) n_freed_objs;
@@ -473,7 +517,7 @@ void gc_mark_finlist(jl_gc_markqueue_t *mq, arraylist_t *list, size_t start) JL_
void gc_mark_loop_serial_(jl_ptls_t ptls, jl_gc_markqueue_t *mq);
void gc_mark_loop_serial(jl_ptls_t ptls);
void gc_mark_loop_parallel(jl_ptls_t ptls, int master);
-void gc_sweep_pool_parallel(void);
+void gc_sweep_pool_parallel(jl_ptls_t ptls);
void gc_free_pages(void);
void sweep_stack_pools(void);
void jl_gc_debug_init(void);
src/scheduler.c (2 changes: 1 addition & 1 deletion)
@@ -147,7 +147,7 @@ void jl_parallel_gc_threadfun(void *arg)
gc_mark_loop_parallel(ptls, 0);
}
if (may_sweep(ptls)) { // not an else!
-gc_sweep_pool_parallel();
+gc_sweep_pool_parallel(ptls);
jl_atomic_fetch_add(&ptls->gc_sweeps_requested, -1);
}
}