Skip to content

Commit

Permalink
runtime/cgo: store M for C-created thread in pthread key
Browse files Browse the repository at this point in the history
In a C thread, it's necessary to acquire an extra M by using needm when invoking a Go function from C. However, needm and dropm are expensive because of the signal-related syscalls they make.
So, we change to not dropm while returning back to C, which means binding the extra M to the C thread until it exits, to avoid needm and dropm on each C to Go call.
Instead, we only dropm while the C thread exits, so the extra M won't leak.

When invoking a Go function from C:
Allocate a pthread variable using pthread_key_create, only once per shared object, and register a thread-exit-time destructor.
Then store the g0 of the current M as the thread-specific value of the pthread key, only once per C thread, so that the destructor puts the extra M back onto the extra M list when the C thread exits.

When returning back to C:
Skip dropm in cgocallback when the pthread key has been created, so that the extra M is reused the next time a Go function is invoked from C.

This is purely a performance optimization. The old version, in which needm & dropm happen on each cgo call, is still correct too, and we have to keep the old version on systems with cgo but without pthreads, like Windows.

This optimization is significant. The exact speedup depends on the OS and CPU, but in general a simple Go function call from a C thread can be considered about 10x faster.

For the newly added BenchmarkCGoInCThread, some benchmark results:
1. it's 28x faster, from 3395 ns/op to 121 ns/op, in darwin OS & Intel(R) Core(TM) i7-9750H CPU @ 2.60GHz
2. it's 6.5x faster, from 1495 ns/op to 230 ns/op, in Linux OS & Intel(R) Xeon(R) CPU E5-2630 0 @ 2.30GHz

Fixes #51676

Change-Id: I380702fe2f9b6b401b2d6f04b0aba990f4b9ee6c
GitHub-Last-Rev: 93dc64a
GitHub-Pull-Request: #51679
Reviewed-on: https://go-review.googlesource.com/c/go/+/392854
Reviewed-by: Ian Lance Taylor <[email protected]>
TryBot-Result: Gopher Robot <[email protected]>
Run-TryBot: thepudds <[email protected]>
Reviewed-by: Cherry Mui <[email protected]>
  • Loading branch information
doujiang24 authored and cherrymui committed Mar 24, 2023
1 parent a6c382e commit ef0dedc
Show file tree
Hide file tree
Showing 37 changed files with 760 additions and 61 deletions.
7 changes: 4 additions & 3 deletions misc/cgo/test/cgo_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -104,6 +104,7 @@ func TestThreadLock(t *testing.T) { testThreadLockFunc(t) }
func TestUnsignedInt(t *testing.T) { testUnsignedInt(t) }
func TestZeroArgCallback(t *testing.T) { testZeroArgCallback(t) }

func BenchmarkCgoCall(b *testing.B) { benchCgoCall(b) }
func BenchmarkGoString(b *testing.B) { benchGoString(b) }
func BenchmarkCGoCallback(b *testing.B) { benchCallback(b) }
func BenchmarkCgoCall(b *testing.B) { benchCgoCall(b) }
func BenchmarkGoString(b *testing.B) { benchGoString(b) }
func BenchmarkCGoCallback(b *testing.B) { benchCallback(b) }
func BenchmarkCGoInCThread(b *testing.B) { benchCGoInCthread(b) }
24 changes: 24 additions & 0 deletions misc/cgo/test/cthread_unix.c
Original file line number Diff line number Diff line change
Expand Up @@ -32,3 +32,27 @@ doAdd(int max, int nthread)
for(i=0; i<nthread; i++)
pthread_join(thread_id[i], 0);
}

/*
 * Thread entry point: p points to an int holding the number of times
 * to call the exported Go function goDummy. Always returns NULL.
 */
static void*
goDummyCallbackThread(void* p)
{
	int remaining;

	for (remaining = *(int*)p; remaining > 0; remaining--)
		goDummy();
	return NULL;
}

/*
 * Start one C thread that calls goDummy max times and wait for it to
 * finish. Returns max on success, or -1 if the thread could not be
 * created or joined.
 */
int
callGoInCThread(int max)
{
	pthread_t worker;
	int rc;

	/* &max stays valid: the thread is joined before this function returns. */
	rc = pthread_create(&worker, NULL, goDummyCallbackThread, (void*)(&max));
	if (rc == 0)
		rc = pthread_join(worker, NULL);

	return rc == 0 ? max : -1;
}
22 changes: 22 additions & 0 deletions misc/cgo/test/cthread_windows.c
Original file line number Diff line number Diff line change
Expand Up @@ -35,3 +35,25 @@ doAdd(int max, int nthread)
CloseHandle((HANDLE)thread_id[i]);
}
}

/*
 * Thread entry point: p points to an int holding the number of times
 * to call the exported Go function goDummy. Always returns 0.
 */
__stdcall
static unsigned int
goDummyCallbackThread(void* p)
{
	int remaining;

	for (remaining = *(int*)p; remaining > 0; remaining--)
		goDummy();
	return 0;
}

/*
 * Start one C thread that calls goDummy max times and wait for it to
 * finish. Returns max on success, or -1 if the thread could not be
 * created (matching the pthread implementation's failure value).
 */
int
callGoInCThread(int max)
{
	uintptr_t thread_id;

	/* &max stays valid: we wait for the thread before returning. */
	thread_id = _beginthreadex(0, 0, goDummyCallbackThread, &max, 0, 0);
	/*
	 * _beginthreadex returns 0 on failure. Without this check the wait
	 * below would be issued on a null handle and the caller would be told
	 * that all max calls ran when none did.
	 */
	if (thread_id == 0)
		return -1;
	WaitForSingleObject((HANDLE)thread_id, INFINITE);
	CloseHandle((HANDLE)thread_id);
	return max;
}
14 changes: 14 additions & 0 deletions misc/cgo/test/testx.go
Original file line number Diff line number Diff line change
Expand Up @@ -24,6 +24,7 @@ import (
/*
// threads
extern void doAdd(int, int);
extern int callGoInCThread(int);
// issue 1328
void IntoC(void);
Expand Down Expand Up @@ -146,6 +147,10 @@ func Add(x int) {
*p = 2
}

// goDummy is an intentionally empty Go function exported to C.
// The C helper goDummyCallbackThread calls it in a loop, so each call
// exercises only the C-to-Go call path itself.
//
//export goDummy
func goDummy() {
}

func testCthread(t *testing.T) {
if (runtime.GOOS == "darwin" || runtime.GOOS == "ios") && runtime.GOARCH == "arm64" {
t.Skip("the iOS exec wrapper is unable to properly handle the panic from Add")
Expand All @@ -159,6 +164,15 @@ func testCthread(t *testing.T) {
}
}

// benchCGoInCthread measures the overhead of calling from C into Go on a
// C-created thread. It asks C to start a new thread that invokes a Go
// function b.N times, and fails if C does not report b.N back.
func benchCGoInCthread(b *testing.B) {
	want := b.N
	if got := int(C.callGoInCThread(C.int(want))); got != want {
		b.Fatal("unmatch loop times")
	}
}

// issue 1328

//export BackIntoGo
Expand Down
41 changes: 35 additions & 6 deletions src/runtime/asm_386.s
Original file line number Diff line number Diff line change
Expand Up @@ -689,7 +689,20 @@ nosave:
TEXT ·cgocallback(SB),NOSPLIT,$12-12 // Frame size must match commented places below
NO_LOCAL_POINTERS

// If g is nil, Go did not create the current thread.
// Skip cgocallbackg, just dropm when fn is nil, and frame is the saved g.
// It is used to dropm while thread is exiting.
MOVL fn+0(FP), AX
CMPL AX, $0
JNE loadg
// Restore the g from frame.
get_tls(CX)
MOVL frame+4(FP), BX
MOVL BX, g(CX)
JMP dropm

loadg:
// If g is nil, Go did not create the current thread,
// or if this thread never called into Go on pthread platforms.
// Call needm to obtain one for temporary use.
// In this case, we're running on the thread stack, so there's
// lots of space, but the linker doesn't know. Hide the call from
Expand All @@ -707,9 +720,9 @@ TEXT ·cgocallback(SB),NOSPLIT,$12-12 // Frame size must match commented places
MOVL BP, savedm-4(SP) // saved copy of oldm
JMP havem
needm:
MOVL $runtime·needm(SB), AX
MOVL $runtime·needAndBindM(SB), AX
CALL AX
MOVL $0, savedm-4(SP) // dropm on return
MOVL $0, savedm-4(SP)
get_tls(CX)
MOVL g(CX), BP
MOVL g_m(BP), BP
Expand Down Expand Up @@ -784,13 +797,29 @@ havem:
MOVL 0(SP), AX
MOVL AX, (g_sched+gobuf_sp)(SI)

// If the m on entry was nil, we called needm above to borrow an m
// for the duration of the call. Since the call is over, return it with dropm.
// If the m on entry was nil, we called needm above to borrow an m,
// 1. for the duration of the call on non-pthread platforms,
// 2. or the duration of the C thread alive on pthread platforms.
// If the m on entry wasn't nil,
// 1. the thread might be a Go thread,
// 2. or it wasn't the first call from a C thread on pthread platforms,
// since then we skip dropm to reuse the m in the first call.
MOVL savedm-4(SP), DX
CMPL DX, $0
JNE 3(PC)
JNE droppedm

// Skip dropm to reuse it in the next call, when a pthread key has been created.
MOVL _cgo_pthread_key_created(SB), DX
// It means cgo is disabled when _cgo_pthread_key_created is a nil pointer, need dropm.
CMPL DX, $0
JEQ dropm
CMPL (DX), $0
JNE droppedm

dropm:
MOVL $runtime·dropm(SB), AX
CALL AX
droppedm:

// Done!
RET
Expand Down
38 changes: 33 additions & 5 deletions src/runtime/asm_amd64.s
Original file line number Diff line number Diff line change
Expand Up @@ -915,7 +915,20 @@ GLOBL zeroTLS<>(SB),RODATA,$const_tlsSize
TEXT ·cgocallback(SB),NOSPLIT,$24-24
NO_LOCAL_POINTERS

// If g is nil, Go did not create the current thread.
// Skip cgocallbackg, just dropm when fn is nil, and frame is the saved g.
// It is used to dropm while thread is exiting.
MOVQ fn+0(FP), AX
CMPQ AX, $0
JNE loadg
// Restore the g from frame.
get_tls(CX)
MOVQ frame+8(FP), BX
MOVQ BX, g(CX)
JMP dropm

loadg:
// If g is nil, Go did not create the current thread,
// or if this thread never called into Go on pthread platforms.
// Call needm to obtain one m for temporary use.
// In this case, we're running on the thread stack, so there's
// lots of space, but the linker doesn't know. Hide the call from
Expand Down Expand Up @@ -953,9 +966,9 @@ needm:
// a bad value in there, in case needm tries to use it.
XORPS X15, X15
XORQ R14, R14
MOVQ $runtime·needm<ABIInternal>(SB), AX
MOVQ $runtime·needAndBindM<ABIInternal>(SB), AX
CALL AX
MOVQ $0, savedm-8(SP) // dropm on return
MOVQ $0, savedm-8(SP)
get_tls(CX)
MOVQ g(CX), BX
MOVQ g_m(BX), BX
Expand Down Expand Up @@ -1044,11 +1057,26 @@ havem:
MOVQ 0(SP), AX
MOVQ AX, (g_sched+gobuf_sp)(SI)

// If the m on entry was nil, we called needm above to borrow an m
// for the duration of the call. Since the call is over, return it with dropm.
// If the m on entry was nil, we called needm above to borrow an m,
// 1. for the duration of the call on non-pthread platforms,
// 2. or the duration of the C thread alive on pthread platforms.
// If the m on entry wasn't nil,
// 1. the thread might be a Go thread,
// 2. or it wasn't the first call from a C thread on pthread platforms,
// since then we skip dropm to reuse the m in the first call.
MOVQ savedm-8(SP), BX
CMPQ BX, $0
JNE done

// Skip dropm to reuse it in the next call, when a pthread key has been created.
MOVQ _cgo_pthread_key_created(SB), AX
// It means cgo is disabled when _cgo_pthread_key_created is a nil pointer, need dropm.
CMPQ AX, $0
JEQ dropm
CMPQ (AX), $0
JNE done

dropm:
MOVQ $runtime·dropm(SB), AX
CALL AX
#ifdef GOOS_windows
Expand Down
37 changes: 32 additions & 5 deletions src/runtime/asm_arm.s
Original file line number Diff line number Diff line change
Expand Up @@ -630,6 +630,15 @@ nosave:
TEXT ·cgocallback(SB),NOSPLIT,$12-12
NO_LOCAL_POINTERS

// Skip cgocallbackg, just dropm when fn is nil, and frame is the saved g.
// It is used to dropm while thread is exiting.
MOVW fn+0(FP), R1
// MOVW does not update the condition flags, so test fn explicitly;
// otherwise B.NE would branch on whatever flags the caller left behind.
CMP $0, R1
B.NE loadg
// Restore the g from frame.
MOVW frame+4(FP), g
B dropm

loadg:
// Load m and g from thread-local storage.
#ifdef GOOS_openbsd
BL runtime·load_g(SB)
Expand All @@ -639,7 +648,8 @@ TEXT ·cgocallback(SB),NOSPLIT,$12-12
BL.NE runtime·load_g(SB)
#endif

// If g is nil, Go did not create the current thread.
// If g is nil, Go did not create the current thread,
// or if this thread never called into Go on pthread platforms.
// Call needm to obtain one for temporary use.
// In this case, we're running on the thread stack, so there's
// lots of space, but the linker doesn't know. Hide the call from
Expand All @@ -653,7 +663,7 @@ TEXT ·cgocallback(SB),NOSPLIT,$12-12

needm:
MOVW g, savedm-4(SP) // g is zero, so is m.
MOVW $runtime·needm(SB), R0
MOVW $runtime·needAndBindM(SB), R0
BL (R0)

// Set m->g0->sched.sp = SP, so that if a panic happens
Expand Down Expand Up @@ -724,14 +734,31 @@ havem:
MOVW savedsp-12(SP), R4 // must match frame size
MOVW R4, (g_sched+gobuf_sp)(g)

// If the m on entry was nil, we called needm above to borrow an m
// for the duration of the call. Since the call is over, return it with dropm.
// If the m on entry was nil, we called needm above to borrow an m,
// 1. for the duration of the call on non-pthread platforms,
// 2. or the duration of the C thread alive on pthread platforms.
// If the m on entry wasn't nil,
// 1. the thread might be a Go thread,
// 2. or it wasn't the first call from a C thread on pthread platforms,
// since then we skip dropm to reuse the m in the first call.
MOVW savedm-4(SP), R6
CMP $0, R6
B.NE 3(PC)
B.NE done

// Skip dropm to reuse it in the next call, when a pthread key has been created.
MOVW _cgo_pthread_key_created(SB), R6
// It means cgo is disabled when _cgo_pthread_key_created is a nil pointer, need dropm.
CMP $0, R6
B.EQ dropm
MOVW (R6), R6
CMP $0, R6
B.NE done

dropm:
MOVW $runtime·dropm(SB), R0
BL (R0)

done:
// Done!
RET

Expand Down
32 changes: 28 additions & 4 deletions src/runtime/asm_arm64.s
Original file line number Diff line number Diff line change
Expand Up @@ -1014,10 +1014,20 @@ nosave:
TEXT ·cgocallback(SB),NOSPLIT,$24-24
NO_LOCAL_POINTERS

// Skip cgocallbackg, just dropm when fn is nil, and frame is the saved g.
// It is used to dropm while thread is exiting.
MOVD fn+0(FP), R1
CBNZ R1, loadg
// Restore the g from frame.
MOVD frame+8(FP), g
B dropm

loadg:
// Load g from thread-local storage.
BL runtime·load_g(SB)

// If g is nil, Go did not create the current thread.
// If g is nil, Go did not create the current thread,
// or if this thread never called into Go on pthread platforms.
// Call needm to obtain one for temporary use.
// In this case, we're running on the thread stack, so there's
// lots of space, but the linker doesn't know. Hide the call from
Expand All @@ -1030,7 +1040,7 @@ TEXT ·cgocallback(SB),NOSPLIT,$24-24

needm:
MOVD g, savedm-8(SP) // g is zero, so is m.
MOVD $runtime·needm(SB), R0
MOVD $runtime·needAndBindM(SB), R0
BL (R0)

// Set m->g0->sched.sp = SP, so that if a panic happens
Expand Down Expand Up @@ -1111,10 +1121,24 @@ havem:
MOVD savedsp-16(SP), R4
MOVD R4, (g_sched+gobuf_sp)(g)

// If the m on entry was nil, we called needm above to borrow an m
// for the duration of the call. Since the call is over, return it with dropm.
// If the m on entry was nil, we called needm above to borrow an m,
// 1. for the duration of the call on non-pthread platforms,
// 2. or the duration of the C thread alive on pthread platforms.
// If the m on entry wasn't nil,
// 1. the thread might be a Go thread,
// 2. or it wasn't the first call from a C thread on pthread platforms,
// since then we skip dropm to reuse the m in the first call.
MOVD savedm-8(SP), R6
CBNZ R6, droppedm

// Skip dropm to reuse it in the next call, when a pthread key has been created.
MOVD _cgo_pthread_key_created(SB), R6
// It means cgo is disabled when _cgo_pthread_key_created is a nil pointer, need dropm.
CBZ R6, dropm
MOVD (R6), R6
CBNZ R6, droppedm

dropm:
MOVD $runtime·dropm(SB), R0
BL (R0)
droppedm:
Expand Down
Loading

0 comments on commit ef0dedc

Please sign in to comment.