diff --git a/base/locks.jl b/base/locks.jl
index bc7aeb61881bd..b9a33706f7ec8 100644
--- a/base/locks.jl
+++ b/base/locks.jl
@@ -23,6 +23,11 @@ function lock!(l::TatasLock)
             end
         end
         ccall(:jl_cpu_pause, Void, ())
+        # Temporary solution before we have gc transition support in codegen.
+        # This could mess up gc state when we add codegen support.
+        # Use these as a safe point
+        gc_state = ccall(:jl_gc_safe_enter, Int8, ())
+        ccall(:jl_gc_safe_leave, Void, (Int8,), gc_state)
     end
 end
 
@@ -61,6 +66,11 @@ function lock!(l::RecursiveTatasLock)
             end
         end
         ccall(:jl_cpu_pause, Void, ())
+        # Temporary solution before we have gc transition support in codegen.
+        # This could mess up gc state when we add codegen support.
+        # Use these as a safe point
+        gc_state = ccall(:jl_gc_safe_enter, Int8, ())
+        ccall(:jl_gc_safe_leave, Void, (Int8,), gc_state)
     end
 end
 
@@ -116,7 +126,11 @@ function lock!(m::Mutex)
     if m.ownertid == threadid()
         return 0
     end
+    # Temporary solution before we have gc transition support in codegen.
+    # This could mess up gc state when we add codegen support.
+    gc_state = ccall(:jl_gc_safe_enter, Int8, ())
     ccall(:uv_mutex_lock, Void, (Ptr{Void},), m.handle)
+    ccall(:jl_gc_safe_leave, Void, (Int8,), gc_state)
     m.ownertid = threadid()
     return 0
 end
diff --git a/doc/devdocs/debuggingtips.rst b/doc/devdocs/debuggingtips.rst
index 70242120e7e4e..9cacd33090faa 100644
--- a/doc/devdocs/debuggingtips.rst
+++ b/doc/devdocs/debuggingtips.rst
@@ -24,7 +24,7 @@ Similarly, if you're debugging some of julia's internals (e.g.,
 
 This is a good way to circumvent problems that arise from the order in which julia's output streams are initialized.
 
-Julia's flisp interpreter uses ``value_t*`` objects; these can be displayed
+Julia's flisp interpreter uses ``value_t`` objects; these can be displayed
 with ``call fl_print(ios_stdout, obj)``.
 
 Useful Julia variables for Inspecting
@@ -74,7 +74,7 @@ Another useful frame is ``to_function(jl_lambda_info_t *li, bool cstyle)``. The
 
    #2  0x00007ffff7928bf7 in to_function (li=0x2812060, cstyle=false) at codegen.cpp:584
    584	        abort();
-   (gdb) p jl_(jl_uncompress_ast(li, li.ast))
+   (gdb) p jl_(jl_uncompress_ast(li, li->ast))
 
 Inserting breakpoints upon certain conditions
 ---------------------------------------------
@@ -91,10 +91,31 @@ Calling a particular method
 
 ::
 
-   (gdb) break jl_apply_generic if strcmp(F->name->name, "method_to_break")==0
+   (gdb) break jl_apply_generic if strcmp((char*)(jl_symbol_name)(jl_gf_mtable(F)->name), "method_to_break")==0
 
 Since this function is used for every call, you will make everything 1000x slower if you do this.
 
+Dealing with signals
+--------------------
+
+Julia requires a few signals to function properly. The profiler uses ``SIGUSR2``
+for sampling and the garbage collector uses ``SIGSEGV`` for thread
+synchronization. If you are debugging some code that uses the profiler or
+multiple julia threads, you may want to let the debugger ignore these signals
+since they can be triggered very often during normal operations. The command to
+do this in GDB is (replace ``SIGSEGV`` with ``SIGUSR2`` or other signals you
+want to ignore)::
+
+   (gdb) handle SIGSEGV noprint nostop pass
+
+The corresponding LLDB command is (after the process is started)::
+
+   (lldb) pro hand -p true -s false -n false SIGSEGV
+
+If you are debugging a segfault with threaded code, you can set a breakpoint on
+``jl_critical_error`` (``sigdie_handler`` should also work on Linux and BSD) in
+order to only catch the actual segfault rather than the GC synchronization points.
+
 Debugging during julia's build process (bootstrap)
 --------------------------------------------------
 
diff --git a/src/ast.c b/src/ast.c
index 02f548d93f8dd..cb02fd4cf59a6 100644
--- a/src/ast.c
+++ b/src/ast.c
@@ -201,7 +201,7 @@ static jl_value_t *resolve_globals(jl_value_t *expr, jl_lambda_info_t *lam);
 
 static jl_value_t *scm_to_julia(value_t e, int expronly)
 {
-    int en = jl_gc_enable(0);
+    int en = jl_gc_enable(0); // Might GC
     jl_value_t *v;
     JL_TRY {
         v = scm_to_julia_(e, expronly);
diff --git a/src/builtins.c b/src/builtins.c
index d50f7ce29bbb3..0f4b733019fc8 100644
--- a/src/builtins.c
+++ b/src/builtins.c
@@ -194,6 +194,9 @@ JL_DLLEXPORT void jl_enter_handler(jl_handler_t *eh)
     JL_SIGATOMIC_BEGIN();
     eh->prev = jl_current_task->eh;
     eh->gcstack = jl_pgcstack;
+#ifdef JULIA_ENABLE_THREADING
+    eh->gc_state = jl_get_ptls_states()->gc_state;
+#endif
     jl_current_task->eh = eh;
     // TODO: this should really go after setjmp(). see comment in
     // ctx_switch in task.c.
diff --git a/src/debuginfo.cpp b/src/debuginfo.cpp
index d639705ea90c1..0da60cab73bb3 100644
--- a/src/debuginfo.cpp
+++ b/src/debuginfo.cpp
@@ -194,7 +194,9 @@ class JuliaJITEventListener: public JITEventListener
     virtual void NotifyFunctionEmitted(const Function &F, void *Code,
                                        size_t Size, const EmittedFunctionDetails &Details)
     {
+        int8_t gc_state = jl_gc_safe_enter();
         uv_rwlock_wrlock(&threadsafe);
+        jl_gc_safe_leave(gc_state);
 #if defined(_OS_WINDOWS_)
         create_PRUNTIME_FUNCTION((uint8_t*)Code, Size, F.getName(), (uint8_t*)Code, Size, NULL);
 #endif
@@ -205,7 +207,9 @@ class JuliaJITEventListener: public JITEventListener
 
     std::map<size_t, FuncInfo, revcomp>& getMap()
     {
+        int8_t gc_state = jl_gc_safe_enter();
         uv_rwlock_rdlock(&threadsafe);
+        jl_gc_safe_leave(gc_state);
         return info;
     }
 #endif // ifndef USE_MCJIT
@@ -225,7 +229,9 @@ class JuliaJITEventListener: public JITEventListener
     virtual void NotifyObjectEmitted(const ObjectImage &obj)
 #endif
     {
+        int8_t gc_state = jl_gc_safe_enter();
         uv_rwlock_wrlock(&threadsafe);
+        jl_gc_safe_leave(gc_state);
 #ifdef LLVM36
         object::section_iterator Section = obj.section_begin();
         object::section_iterator EndSection = obj.section_end();
@@ -458,7 +464,9 @@ class JuliaJITEventListener: public JITEventListener
 
     std::map<size_t, ObjectInfo, revcomp>& getObjectMap()
     {
+        int8_t gc_state = jl_gc_safe_enter();
         uv_rwlock_rdlock(&threadsafe);
+        jl_gc_safe_leave(gc_state);
         return objectmap;
     }
 #endif // USE_MCJIT
@@ -477,6 +485,8 @@ JL_DLLEXPORT void ORCNotifyObjectEmitted(JITEventListener *Listener,
 extern "C"
 char *jl_demangle(const char *name)
 {
+    // This function is not allowed to reference any TLS variables since
+    // it can be called from an unmanaged thread on OSX.
     const char *start = name + 6;
     const char *end = name + strlen(name);
     char *ret;
@@ -508,6 +518,8 @@ void lookup_pointer(DIContext *context, char **name, size_t *line,
                     char **inlinedat_file, size_t pointer,
                     int demangle, int *fromC)
 {
+    // This function is not allowed to reference any TLS variables since
+    // it can be called from an unmanaged thread on OSX.
     DILineInfo info, topinfo;
     DIInliningInfo inlineinfo;
     if (demangle && *name != NULL) {
@@ -629,6 +641,8 @@ void jl_getDylibFunctionInfo(char **name, char **filename, size_t *line,
                              char** inlinedat_file, size_t *inlinedat_line,
                              size_t pointer, int *fromC, int skipC, int skipInline)
 {
+    // This function is not allowed to reference any TLS variables since
+    // it can be called from an unmanaged thread on OSX.
 #ifdef _OS_WINDOWS_
     IMAGEHLP_MODULE64 ModuleInfo;
     BOOL isvalid;
@@ -838,6 +852,8 @@ void jl_getFunctionInfo(char **name, char **filename, size_t *line,
                         char **inlinedat_file, size_t *inlinedat_line,
                         size_t pointer, int *fromC, int skipC, int skipInline)
 {
+    // This function is not allowed to reference any TLS variables since
+    // it can be called from an unmanaged thread on OSX.
     *name = NULL;
     *line = -1;
     *filename = NULL;
diff --git a/src/dump.c b/src/dump.c
index e11bff55ac73a..6995c3e0ff055 100644
--- a/src/dump.c
+++ b/src/dump.c
@@ -1974,7 +1974,7 @@ JL_DLLEXPORT jl_value_t *jl_ast_rettype(jl_lambda_info_t *li, jl_value_t *ast)
     ios_mem(&src, 0);
     ios_setbuf(&src, (char*)bytes->data, jl_array_len(bytes), 0);
     src.size = jl_array_len(bytes);
-    int en = jl_gc_enable(0);
+    int en = jl_gc_enable(0); // Might GC
     jl_value_t *rt = jl_deserialize_value(&src, NULL);
     jl_gc_enable(en);
     tree_literal_values = NULL;
@@ -1994,7 +1994,7 @@ JL_DLLEXPORT jl_value_t *jl_compress_ast(jl_lambda_info_t *li, jl_value_t *ast)
     ios_mem(&dest, 0);
     jl_array_t *last_tlv = tree_literal_values;
     jl_module_t *last_tem = tree_enclosing_module;
-    int en = jl_gc_enable(0);
+    int en = jl_gc_enable(0); // Might GC
 
     if (li->module->constant_table == NULL) {
         li->module->constant_table = jl_alloc_cell_1d(0);
@@ -2038,7 +2038,7 @@ JL_DLLEXPORT jl_value_t *jl_uncompress_ast(jl_lambda_info_t *li, jl_value_t *dat
     ios_mem(&src, 0);
     ios_setbuf(&src, (char*)bytes->data, jl_array_len(bytes), 0);
     src.size = jl_array_len(bytes);
-    int en = jl_gc_enable(0);
+    int en = jl_gc_enable(0); // Might GC
     (void)jl_deserialize_value(&src, NULL); // skip ret type
     jl_value_t *v = jl_deserialize_value(&src, NULL);
     jl_gc_enable(en);
diff --git a/src/gc.c b/src/gc.c
index 1c3d35f82b94a..ee1a34c797d7b 100644
--- a/src/gc.c
+++ b/src/gc.c
@@ -34,8 +34,41 @@ extern "C" {
 #endif
 
 JL_DEFINE_MUTEX(pagealloc)
+// Protect all access to `finalizer_list`, `finalizer_list_marked` and
+// `to_finalize`.
 JL_DEFINE_MUTEX(finalizers)
 
+/**
+ * Note about GC synchronization:
+ *
+ * When entering `jl_gc_collect()`, `jl_gc_running` is atomically changed from
+ * `0` to `1` to make sure that only one thread can be running the GC. Other
+ * threads that enter `jl_gc_collect()` at the same time (or that call it later
+ * from unmanaged code) will wait in `jl_gc_collect()` until the GC is finished.
+ *
+ * Before starting the mark phase the GC thread calls `jl_gc_signal_begin()`
+ * to make sure all the threads are in a safe state for the GC. The function
+ * activates the safepoint and waits for all the threads to get ready for the
+ * GC (`gc_state != 0`). It also acquires the `finalizers` lock so that no
+ * other thread will access the finalizer lists while the GC is running.
+ *
+ * During the mark and sweep phases of the GC, the threads that are not running
+ * the GC should either be running unmanaged code (or a code section that does
+ * not contain a GC critical region, which mainly means storing to the stack or
+ * to another object) or be paused at a safepoint waiting for the GC to finish.
+ * If a thread wants to switch from running unmanaged code to running managed
+ * code, it has to perform a GC safepoint check after setting the `gc_state`
+ * flag (see `jl_gc_state_save_and_set()`). It is possible that the thread might
+ * have `gc_state == 0` in the middle of the GC transition back before entering
+ * the safepoint. This is fine since the thread won't be executing any GC
+ * critical region during that time.
+ *
+ * The finalizers are run after the GC finishes in normal mode (the `gc_state`
+ * when `jl_gc_collect` is called) with `jl_in_finalizer = 1`. (TODO:) When we
+ * have proper support of GC transition in codegen, we should execute the
+ * finalizers in unmanaged (GC safe) mode.
+ */
+
 // manipulating mark bits
 
 #define GC_CLEAN 0 // freshly allocated
@@ -313,7 +346,96 @@ NOINLINE static uintptr_t gc_get_stack_ptr(void)
 
 #include "gc-debug.c"
 
-int jl_in_gc; // referenced from switchto task.c
+// Only one thread can be doing the collection right now. That thread sets
+// `jl_gc_running` to one on entering the GC and sets it back to zero afterward.
+static volatile uint64_t jl_gc_running = 0;
+
+#ifdef JULIA_ENABLE_THREADING
+JL_DLLEXPORT volatile size_t *jl_gc_signal_page = NULL;
+
+static void jl_wait_for_gc(void)
+{
+    while (jl_gc_running) {
+        jl_cpu_pause(); // yield?
+    }
+}
+
+void jl_gc_signal_wait(void)
+{
+    int8_t state = jl_get_ptls_states()->gc_state;
+    jl_get_ptls_states()->gc_state = JL_GC_STATE_WAITING;
+    jl_wait_for_gc();
+    jl_get_ptls_states()->gc_state = state;
+}
+
+static void jl_gc_wait_for_the_world(void)
+{
+    for (int i = 0;i < jl_n_threads;i++) {
+        jl_tls_states_t *ptls = jl_all_task_states[i].ptls;
+        while (!ptls->gc_state) {
+            jl_cpu_pause(); // yield?
+        }
+    }
+}
+
+void jl_gc_signal_init(void)
+{
+    // jl_page_size isn't available yet.
+#ifdef _OS_WINDOWS_
+    jl_gc_signal_page = (size_t*)VirtualAlloc(NULL, jl_getpagesize(),
+                                              MEM_COMMIT, PAGE_READONLY);
+#else
+    jl_gc_signal_page = (size_t*)mmap(0, jl_getpagesize(), PROT_READ,
+                                      MAP_NORESERVE | MAP_PRIVATE |
+                                      MAP_ANONYMOUS, -1, 0);
+    if (jl_gc_signal_page == MAP_FAILED)
+        jl_gc_signal_page = NULL;
+#endif
+    if (jl_gc_signal_page == NULL) {
+        jl_printf(JL_STDERR, "could not allocate GC synchronization page\n");
+        abort();
+    }
+}
+
+static void jl_gc_signal_begin(void)
+{
+#ifdef __APPLE__
+    // This needs to be after setting `jl_gc_running` so that only one thread
+    // can talk to the signal handler
+    jl_mach_gc_begin();
+#endif
+#ifdef _OS_WINDOWS_
+    DWORD old_prot;
+    VirtualProtect((void*)jl_gc_signal_page, jl_page_size,
+                   PAGE_NOACCESS, &old_prot);
+#else
+    mprotect((void*)jl_gc_signal_page, jl_page_size, PROT_NONE);
+#endif
+    jl_gc_wait_for_the_world();
+    JL_LOCK_NOGC(finalizers);
+}
+
+static void jl_gc_signal_end(void)
+{
+    JL_UNLOCK_NOGC(finalizers);
+#ifdef _OS_WINDOWS_
+    DWORD old_prot;
+    VirtualProtect((void*)jl_gc_signal_page, jl_page_size,
+                   PAGE_READONLY, &old_prot);
+#else
+    mprotect((void*)jl_gc_signal_page, jl_page_size, PROT_READ);
+#endif
+#ifdef __APPLE__
+    jl_mach_gc_end();
+#endif
+}
+#else
+
+#define jl_gc_signal_begin()
+#define jl_gc_signal_end()
+
+#endif
+
 static int jl_gc_finalizers_inhibited; // don't run finalizers during codegen #11956
 
 // malloc wrappers, aligned allocation
@@ -375,12 +497,15 @@ static void jl_gc_push_arraylist(arraylist_t *list)
     jl_pgcstack = (jl_gcframe_t*)list->items;
 }
 
-// Same assumption as `jl_gc_push_arraylist`
+// Same assumption as `jl_gc_push_arraylist`. Requires the finalizers lock
+// to be held by the current thread on entry; the lock is released before the
+// finalizers are run.
 static void jl_gc_run_finalizers_in_list(arraylist_t *list)
 {
     size_t len = list->len;
     jl_value_t **items = (jl_value_t**)list->items;
     jl_gc_push_arraylist(list);
+    JL_UNLOCK_NOGC(finalizers);
     for (size_t i = 2;i < len;i += 2) {
         run_finalizer(items[i], items[i + 1]);
     }
@@ -389,8 +514,11 @@ static void jl_gc_run_finalizers_in_list(arraylist_t *list)
 
 static void run_finalizers(void)
 {
-    if (to_finalize.len == 0)
+    JL_LOCK_NOGC(finalizers);
+    if (to_finalize.len == 0) {
+        JL_UNLOCK_NOGC(finalizers);
         return;
+    }
     arraylist_t copied_list;
     memcpy(&copied_list, &to_finalize, sizeof(copied_list));
     if (to_finalize.items == to_finalize._space) {
@@ -400,6 +528,7 @@ static void run_finalizers(void)
     // empty out the first two entries for the GC frame
     arraylist_push(&copied_list, copied_list.items[0]);
     arraylist_push(&copied_list, copied_list.items[1]);
+    // This releases the finalizers lock.
     jl_gc_run_finalizers_in_list(&copied_list);
     arraylist_free(&copied_list);
 }
@@ -408,10 +537,10 @@ void jl_gc_inhibit_finalizers(int state)
 {
     // NOTE: currently only called with the codegen lock held, but might need
     // more synchronization in the future
-    if (jl_gc_finalizers_inhibited && !state && !jl_in_gc) {
-        jl_in_gc = 1;
+    if (jl_gc_finalizers_inhibited && !state && !jl_in_finalizer) {
+        jl_in_finalizer = 1;
         run_finalizers();
-        jl_in_gc = 0;
+        jl_in_finalizer = 0;
     }
     jl_gc_finalizers_inhibited = state;
 }
@@ -430,22 +559,24 @@ static void schedule_all_finalizers(arraylist_t* flist)
 
 void jl_gc_run_all_finalizers(void)
 {
+    JL_LOCK_NOGC(finalizers);
     schedule_all_finalizers(&finalizer_list);
     schedule_all_finalizers(&finalizer_list_marked);
+    JL_UNLOCK_NOGC(finalizers);
     run_finalizers();
 }
 
 JL_DLLEXPORT void jl_gc_add_finalizer(jl_value_t *v, jl_function_t *f)
 {
-    JL_LOCK(finalizers);
+    JL_LOCK_NOGC(finalizers);
     arraylist_push(&finalizer_list, (void*)v);
     arraylist_push(&finalizer_list, (void*)f);
-    JL_UNLOCK(finalizers);
+    JL_UNLOCK_NOGC(finalizers);
 }
 
 JL_DLLEXPORT void jl_finalize(jl_value_t *o)
 {
-    JL_LOCK(finalizers);
+    JL_LOCK_NOGC(finalizers);
     // Copy the finalizers into a temporary list so that code in the finalizer
     // won't change the list as we loop through them.
     // This list is also used as the GC frame when we are running the finalizers
@@ -457,10 +588,13 @@ JL_DLLEXPORT void jl_finalize(jl_value_t *o)
     // still holding a reference to the object
     finalize_object(&finalizer_list, o, &copied_list);
     finalize_object(&finalizer_list_marked, o, &copied_list);
-    JL_UNLOCK(finalizers);
     if (copied_list.len > 2) {
+        // This releases the finalizers lock.
         jl_gc_run_finalizers_in_list(&copied_list);
     }
+    else {
+        JL_UNLOCK_NOGC(finalizers);
+    }
     arraylist_free(&copied_list);
 }
 
@@ -712,7 +846,7 @@ static NOINLINE void *malloc_page(void)
     int i;
     region_t* region;
     int region_i = 0;
-    JL_LOCK(pagealloc);
+    JL_LOCK_NOGC(pagealloc);
     while(region_i < REGION_COUNT) {
         region = regions[region_i];
         if (region == NULL) {
@@ -780,7 +914,7 @@ static NOINLINE void *malloc_page(void)
 #endif
     current_pg_count++;
     max_pg_count = max_pg_count < current_pg_count ? current_pg_count : max_pg_count;
-    JL_UNLOCK(pagealloc);
+    JL_UNLOCK_NOGC(pagealloc);
     return ptr;
 }
 
@@ -830,6 +964,7 @@ static inline int maybe_collect(void)
         jl_gc_collect(0);
         return 1;
     }
+    jl_gc_safepoint();
     return 0;
 }
 
@@ -1111,6 +1246,9 @@ static inline void *__pool_alloc(pool_t* p, int osize, int end_offset)
         jl_gc_collect(0);
         //allocd_bytes += osize;
     }
+    else {
+        jl_gc_safepoint();
+    }
     gc_num.poolalloc++;
     // first try to use the freelist
     v = p->freelist;
@@ -1993,15 +2131,29 @@ static void post_mark(arraylist_t *list, int dryrun)
 }
 
 // collector entry point and control
+static volatile uint64_t jl_gc_disable_counter = 0;
 
-static int is_gc_enabled = 1;
 JL_DLLEXPORT int jl_gc_enable(int on)
 {
-    int prev = is_gc_enabled;
-    is_gc_enabled = (on!=0);
+    jl_tls_states_t *ptls = jl_get_ptls_states();
+    int prev = !ptls->disable_gc;
+    ptls->disable_gc = (on == 0);
+    if (on && !prev) {
+        // disable -> enable
+        JL_ATOMIC_FETCH_AND_ADD(jl_gc_disable_counter, -1);
+    }
+    else if (prev && !on) {
+        // enable -> disable
+        JL_ATOMIC_FETCH_AND_ADD(jl_gc_disable_counter, 1);
+        // check if the GC is running and wait for it to finish
+        jl_gc_safepoint();
+    }
     return prev;
 }
-JL_DLLEXPORT int jl_gc_is_enabled(void) { return is_gc_enabled; }
+JL_DLLEXPORT int jl_gc_is_enabled(void)
+{
+    return !jl_get_ptls_states()->disable_gc;
+}
 
 JL_DLLEXPORT int64_t jl_gc_total_bytes(void) { return total_allocd_bytes + allocd_bytes + collect_interval; }
 JL_DLLEXPORT uint64_t jl_gc_total_hrtime(void) { return total_gc_time; }
@@ -2066,24 +2218,9 @@ static int saved_mark_sp = 0;
 static int sweep_mask = GC_MARKED;
 #define MIN_SCAN_BYTES 1024*1024
 
-JL_DLLEXPORT void jl_gc_collect(int full)
+// Only one thread should be running in this function
+static void _jl_gc_collect(int full, char *stack_hi)
 {
-    if (!is_gc_enabled) return;
-    if (jl_in_gc) return;
-    char *stack_hi = (char*)gc_get_stack_ptr();
-    gc_debug_print();
-    JL_SIGATOMIC_BEGIN();
-
-#ifdef JULIA_ENABLE_THREADING
-    ti_threadgroup_barrier(tgworld, ti_tid);
-    if (ti_tid != 0) {
-        JL_SIGATOMIC_END();
-        ti_threadgroup_barrier(tgworld, ti_tid);
-        return;
-    }
-#endif
-
-    jl_in_gc = 1;
     uint64_t t0 = jl_hrtime();
     int recollect = 0;
 #if defined(GC_TIME)
@@ -2162,7 +2299,7 @@ JL_DLLEXPORT void jl_gc_collect(int full)
     int64_t estimate_freed = -1;
 
 #if defined(GC_TIME) || defined(GC_FINAL_STATS)
-    uint64_t post_time = 0, finalize_time = 0;
+    uint64_t post_time = 0;
 #endif
     if (mark_sp == 0 || sweeping) {
 #if defined(GC_TIME) || defined(GC_FINAL_STATS)
@@ -2274,26 +2411,16 @@ JL_DLLEXPORT void jl_gc_collect(int full)
             allocd_bytes_since_sweep = 0;
             jl_gc_total_freed_bytes += freed_bytes;
             freed_bytes = 0;
-
-#if defined(GC_FINAL_STATS) || defined(GC_TIME)
-            finalize_time = jl_hrtime();
-#endif
-            if (!jl_gc_finalizers_inhibited) {
-                run_finalizers();
-            }
-#if defined(GC_FINAL_STATS) || defined(GC_TIME)
-            finalize_time = jl_hrtime() - finalize_time;
-#endif
         }
 #if defined(GC_FINAL_STATS) || defined(GC_TIME)
         uint64_t sweep_pause = jl_hrtime() - sweep_t0;
 #endif
 #ifdef GC_FINAL_STATS
-        total_sweep_time += sweep_pause - finalize_time - post_time;
-        total_fin_time += finalize_time + post_time;
+        total_sweep_time += sweep_pause - post_time;
+        total_fin_time += + post_time;
 #endif
 #ifdef GC_TIME
-        jl_printf(JL_STDOUT, "GC sweep pause %.2f ms live %ld kB (freed %d kB EST %d kB [error %d] = %d%% of allocd %d kB b/r %ld/%ld) (%.2f ms in post_mark, %.2f ms in %d fin) (marked in %d inc) mask %d | next in %d kB\n", NS2MS(sweep_pause), live_bytes/1024, SAVE2/1024, estimate_freed/1024, (SAVE2 - estimate_freed), pct, SAVE3/1024, bonus/1024, SAVE/1024, NS2MS(post_time), NS2MS(finalize_time), n_finalized, inc_count, sweep_mask, -allocd_bytes/1024);
+        jl_printf(JL_STDOUT, "GC sweep pause %.2f ms live %ld kB (freed %d kB EST %d kB [error %d] = %d%% of allocd %d kB b/r %ld/%ld) (%.2f ms in post_mark) (marked in %d inc) mask %d | next in %d kB\n", NS2MS(sweep_pause), live_bytes/1024, SAVE2/1024, estimate_freed/1024, (SAVE2 - estimate_freed), pct, SAVE3/1024, bonus/1024, SAVE/1024, NS2MS(post_time), inc_count, sweep_mask, -allocd_bytes/1024);
 #endif
     }
     n_pause++;
@@ -2302,13 +2429,7 @@ JL_DLLEXPORT void jl_gc_collect(int full)
 #ifdef GC_FINAL_STATS
     max_pause = max_pause < pause ? pause : max_pause;
 #endif
-    jl_in_gc = 0;
 
-#ifdef JULIA_ENABLE_THREADING
-    ti_threadgroup_barrier(tgworld, ti_tid);
-#endif
-
-    JL_SIGATOMIC_END();
 #ifdef GC_TIME
     if (estimate_freed != SAVE2) {
         // this should not happen but it does
@@ -2317,7 +2438,54 @@ JL_DLLEXPORT void jl_gc_collect(int full)
 #endif
     if (recollect) {
         n_pause--;
-        jl_gc_collect(0);
+        _jl_gc_collect(0, stack_hi);
+    }
+}
+
+JL_DLLEXPORT void jl_gc_collect(int full)
+{
+    if (jl_gc_disable_counter)
+        return;
+    char *stack_hi = (char*)gc_get_stack_ptr();
+    gc_debug_print();
+    JL_SIGATOMIC_BEGIN();
+
+    int8_t old_state = jl_get_ptls_states()->gc_state;
+    jl_get_ptls_states()->gc_state = JL_GC_STATE_WAITING;
+    // In case multiple threads enter the GC at the same time, only allow
+    // one of them to actually run the collection. We can't just let the
+    // master thread do the GC since it might be running unmanaged code
+    // and can take an arbitrarily long time before hitting a safe point.
+    if (JL_ATOMIC_COMPARE_AND_SWAP(jl_gc_running, 0, 1) != 0) {
+#ifdef JULIA_ENABLE_THREADING
+        JL_SIGATOMIC_END();
+        jl_wait_for_gc();
+        jl_gc_state_set(old_state, JL_GC_STATE_WAITING);
+#else
+        // For single thread, GC should not call itself (in finalizers) before
+        // setting jl_gc_running to false so this should never happen.
+        assert(0 && "GC synchronization failure");
+#endif
+        return;
+    }
+    jl_gc_signal_begin();
+
+    if (!jl_gc_disable_counter)
+        _jl_gc_collect(full, stack_hi);
+
+    // Need to reset the page protection before resetting the flag since
+    // the thread will trigger a segfault immediately after returning from
+    // the signal handler.
+    jl_gc_signal_end();
+    jl_gc_running = 0;
+    JL_SIGATOMIC_END();
+    jl_gc_state_set(old_state, JL_GC_STATE_WAITING);
+
+    if (!jl_gc_finalizers_inhibited) {
+        int8_t was_in_finalizer = jl_in_finalizer;
+        jl_in_finalizer = 1;
+        run_finalizers();
+        jl_in_finalizer = was_in_finalizer;
     }
 }
 
diff --git a/src/gf.c b/src/gf.c
index 4e165226f379e..58cb31cf2a9b3 100644
--- a/src/gf.c
+++ b/src/gf.c
@@ -424,7 +424,7 @@ jl_function_t *jl_method_cache_insert(jl_methtable_t *mt, jl_tupletype_t *type,
 int jl_in_inference = 0;
 void jl_type_infer(jl_lambda_info_t *li, jl_tupletype_t *argtypes, jl_lambda_info_t *def)
 {
-    JL_LOCK(codegen);
+    JL_LOCK(codegen); // Might GC
     int last_ii = jl_in_inference;
     jl_in_inference = 1;
     if (jl_typeinf_func != NULL) {
@@ -490,7 +490,7 @@ static jl_function_t *cache_method(jl_methtable_t *mt, jl_tupletype_t *type,
                                    jl_function_t *method, jl_tupletype_t *decl,
                                    jl_svec_t *sparams, int8_t isstaged)
 {
-    JL_LOCK(codegen);
+    JL_LOCK(codegen); // Might GC
     size_t i;
     int need_guard_entries = 0;
     jl_value_t *temp=NULL;
diff --git a/src/init.c b/src/init.c
index a1a3601969a37..d7d7f37c1b6f3 100644
--- a/src/init.c
+++ b/src/init.c
@@ -528,6 +528,7 @@ void _julia_init(JL_IMAGE_SEARCH rel)
 #ifdef JULIA_ENABLE_THREADING
     // Make sure we finalize the tls callback before starting any threads.
     jl_get_ptls_states_getter();
+    jl_gc_signal_init();
 #endif
     libsupport_init();
     jl_io_loop = uv_default_loop(); // this loop will internal events (spawning process etc.),
@@ -676,6 +677,7 @@ void _julia_init(JL_IMAGE_SEARCH rel)
             jl_current_module;
     }
 
+    // This needs to be after jl_start_threads
     if (jl_options.handle_signals == JL_OPTIONS_HANDLE_SIGNALS_ON)
         jl_install_default_signal_handlers();
 
diff --git a/src/jlapi.c b/src/jlapi.c
index 4dcfb25b7f0f4..009e64651557c 100644
--- a/src/jlapi.c
+++ b/src/jlapi.c
@@ -318,6 +318,26 @@ JL_DLLEXPORT jl_value_t *(jl_typeof)(jl_value_t *v)
     return jl_typeof(v);
 }
 
+JL_DLLEXPORT int8_t (jl_gc_unsafe_enter)(void)
+{
+    return jl_gc_unsafe_enter();
+}
+
+JL_DLLEXPORT void (jl_gc_unsafe_leave)(int8_t state)
+{
+    jl_gc_unsafe_leave(state);
+}
+
+JL_DLLEXPORT int8_t (jl_gc_safe_enter)(void)
+{
+    return jl_gc_safe_enter();
+}
+
+JL_DLLEXPORT void (jl_gc_safe_leave)(int8_t state)
+{
+    jl_gc_safe_leave(state);
+}
+
 #ifdef __cplusplus
 }
 #endif
diff --git a/src/jltypes.c b/src/jltypes.c
index db316bbd50dff..7b784f118bbdd 100644
--- a/src/jltypes.c
+++ b/src/jltypes.c
@@ -1920,7 +1920,7 @@ static ssize_t lookup_type_idx(jl_typename_t *tn, jl_value_t **key, size_t n, in
 static jl_value_t *lookup_type(jl_typename_t *tn, jl_value_t **key, size_t n)
 {
     int ord = is_typekey_ordered(key, n);
-    JL_LOCK(typecache);
+    JL_LOCK(typecache); // Might GC
     ssize_t idx = lookup_type_idx(tn, key, n, ord);
     jl_value_t *t = (idx < 0) ? NULL : jl_svecref(ord ? tn->cache : tn->linearcache, idx);
     JL_UNLOCK(typecache);
@@ -2003,7 +2003,7 @@ jl_value_t *jl_cache_type_(jl_datatype_t *type)
 {
     if (is_cacheable(type)) {
         int ord = is_typekey_ordered(jl_svec_data(type->parameters), jl_svec_len(type->parameters));
-        JL_LOCK(typecache);
+        JL_LOCK(typecache); // Might GC
         ssize_t idx = lookup_type_idx(type->name, jl_svec_data(type->parameters),
                                       jl_svec_len(type->parameters), ord);
         if (idx >= 0)
diff --git a/src/julia.h b/src/julia.h
index b93c268f284b9..6b3c95bdda7b9 100644
--- a/src/julia.h
+++ b/src/julia.h
@@ -67,6 +67,20 @@ extern "C" {
 // JULIA_ENABLE_THREADING is switched on in Make.inc if JULIA_THREADS is
 // set (in Make.user)
 
+#ifdef JULIA_ENABLE_THREADING
+JL_DLLEXPORT extern volatile size_t *jl_gc_signal_page;
+STATIC_INLINE void jl_gc_safepoint(void)
+{
+    // This triggers a SegFault when we are in GC
+    // Assign it to a variable to make sure the compiler emits the load
+    // and to avoid the Clang warning for -Wunused-volatile-lvalue
+    size_t v = *jl_gc_signal_page;
+    (void)v;
+}
+#else // JULIA_ENABLE_THREADING
+#define jl_gc_safepoint()
+#endif // JULIA_ENABLE_THREADING
+
 JL_DLLEXPORT int16_t jl_threadid(void);
 JL_DLLEXPORT void *jl_threadgroup(void);
 JL_DLLEXPORT void jl_cpu_pause(void);
@@ -75,8 +89,9 @@ JL_DLLEXPORT void jl_threading_profile(void);
 #if defined(__GNUC__)
 #  define JL_ATOMIC_FETCH_AND_ADD(a,b)                                    \
        __sync_fetch_and_add(&(a), (b))
+// Returns the original value of `a`
 #  define JL_ATOMIC_COMPARE_AND_SWAP(a,b,c)                               \
-       __sync_bool_compare_and_swap(&(a), (b), (c))
+       __sync_val_compare_and_swap(&(a), (b), (c))
 #  define JL_ATOMIC_TEST_AND_SET(a)                                       \
        __sync_lock_test_and_set(&(a), 1)
 #  define JL_ATOMIC_RELEASE(a)                                            \
@@ -84,12 +99,13 @@ JL_DLLEXPORT void jl_threading_profile(void);
 #elif defined(_OS_WINDOWS_)
 #  define JL_ATOMIC_FETCH_AND_ADD(a,b)                                    \
        _InterlockedExchangeAdd((volatile LONG *)&(a), (b))
+// Returns the original value of `a`
 #  define JL_ATOMIC_COMPARE_AND_SWAP(a,b,c)                               \
        _InterlockedCompareExchange64((volatile LONG64 *)&(a), (c), (b))
 #  define JL_ATOMIC_TEST_AND_SET(a)                                       \
        _InterlockedExchange64(&(a), 1)
 #  define JL_ATOMIC_RELEASE(a)                                            \
-       _InterlockedExchange64(&(a), 0)
+       (void)_InterlockedExchange64(&(a), 0)
 #else
 #  error "No atomic operations supported."
 #endif
@@ -104,7 +120,7 @@ JL_DLLEXPORT void jl_threading_profile(void);
     extern uint64_t volatile m ## _mutex;                                 \
     extern int32_t m ## _lock_count;
 
-#define JL_LOCK(m) do {                                                 \
+#define JL_LOCK_WAIT(m, wait_ex) do {                                   \
         if (m ## _mutex == uv_thread_self()) {                          \
             ++m ## _lock_count;                                         \
         }                                                               \
@@ -112,10 +128,11 @@ JL_DLLEXPORT void jl_threading_profile(void);
             for (;;) {                                                  \
                 if (m ## _mutex == 0 &&                                 \
                     JL_ATOMIC_COMPARE_AND_SWAP(m ## _mutex, 0,          \
-                                               uv_thread_self())) {     \
+                                               uv_thread_self()) == 0) { \
                     m ## _lock_count = 1;                               \
                     break;                                              \
                 }                                                       \
+                wait_ex;                                                \
                 jl_cpu_pause();                                         \
             }                                                           \
         }                                                               \
@@ -132,10 +149,16 @@ JL_DLLEXPORT void jl_threading_profile(void);
 #else
 #define JL_DEFINE_MUTEX(m)
 #define JL_DEFINE_MUTEX_EXT(m)
-#define JL_LOCK(m) do {} while (0)
+#define JL_LOCK_WAIT(m, wait_ex) do {} while (0)
 #define JL_UNLOCK(m) do {} while (0)
 #endif
 
+// JL_LOCK is a GC safe point while JL_LOCK_NOGC is not
+// Always use JL_LOCK unless no one holding the lock can trigger a GC or GC
+// safepoint. JL_LOCK_NOGC should only be needed for GC internal locks.
+#define JL_LOCK(m) JL_LOCK_WAIT(m, jl_gc_safepoint())
+#define JL_LOCK_NOGC(m) JL_LOCK_WAIT(m, )
+#define JL_UNLOCK_NOGC(m) JL_UNLOCK(m)
 
 // core data types ------------------------------------------------------------
 
@@ -1391,6 +1414,7 @@ typedef struct _jl_handler_t {
     jl_jmp_buf eh_ctx;
     jl_gcframe_t *gcstack;
     struct _jl_handler_t *prev;
+    int8_t gc_state;
 } jl_handler_t;
 
 typedef struct _jl_task_t {
@@ -1426,6 +1450,16 @@ typedef struct _jl_task_t {
 typedef struct _jl_tls_states_t {
     jl_gcframe_t *pgcstack;
     jl_value_t *exception_in_transit;
+    // Whether it is safe to execute GC at the same time.
+#define JL_GC_STATE_WAITING 1
+    // gc_state = 1 means the thread is doing GC or is waiting for the GC to
+    //              finish.
+#define JL_GC_STATE_SAFE 2
+    // gc_state = 2 means the thread is running unmanaged code that can be
+    //              executed at the same time as the GC.
+    volatile int8_t gc_state;
+    volatile int8_t in_finalizer;
+    int8_t disable_gc;
     struct _jl_thread_heap_t *heap;
     jl_task_t *volatile current_task;
     jl_task_t *root_task;
@@ -1462,16 +1496,48 @@ JL_DLLEXPORT JL_CONST_FUNC jl_tls_states_t *(jl_get_ptls_states)(void);
 #ifndef JULIA_ENABLE_THREADING
 extern JL_DLLEXPORT jl_tls_states_t jl_tls_states;
 #define jl_get_ptls_states() (&jl_tls_states)
-#else
+STATIC_INLINE int8_t jl_gc_state_set(int8_t state, int8_t old_state) // !JULIA_ENABLE_THREADING stub: stores nothing
+{
+    (void)state;
+    return old_state; // echoes the caller-supplied previous state
+}
+STATIC_INLINE int8_t jl_gc_state_save_and_set(int8_t state) // !JULIA_ENABLE_THREADING stub: stores nothing
+{
+    (void)state;
+    return 0; // the reported previous state is always 0 in this configuration
+}
+#define jl_gc_unsafe_enter() jl_gc_state_save_and_set(0)
+#define jl_gc_unsafe_leave(state) ((void)(state)) // no-op here; argument parenthesized so any expression is safe
+#define jl_gc_safe_enter() jl_gc_state_save_and_set(JL_GC_STATE_SAFE)
+#define jl_gc_safe_leave(state) ((void)(state)) // no-op here; argument parenthesized so any expression is safe
+#else // ifndef JULIA_ENABLE_THREADING
 typedef jl_tls_states_t *(*jl_get_ptls_states_func)(void);
 JL_DLLEXPORT void jl_set_ptls_states_getter(jl_get_ptls_states_func f);
-#endif
+STATIC_INLINE int8_t jl_gc_state_set(int8_t state, int8_t old_state)
+{
+    jl_get_ptls_states()->gc_state = state;
+    // A safepoint is required when transitioning from a GC-safe region
+    // (old_state != 0) to a non GC-safe region (state == 0).
+    if (old_state && !state)
+        jl_gc_safepoint();
+    return old_state; // handed back unchanged so the caller can restore it later
+}
+STATIC_INLINE int8_t jl_gc_state_save_and_set(int8_t state)
+{
+    return jl_gc_state_set(state, jl_get_ptls_states()->gc_state); // current state is passed as old_state and returned
+}
+#define jl_gc_unsafe_enter() jl_gc_state_save_and_set(0)
+#define jl_gc_unsafe_leave(state) jl_gc_state_set((state), 0)
+#define jl_gc_safe_enter() jl_gc_state_save_and_set(JL_GC_STATE_SAFE)
+#define jl_gc_safe_leave(state) jl_gc_state_set((state), JL_GC_STATE_SAFE)
+#endif // ifndef JULIA_ENABLE_THREADING
 
 STATIC_INLINE void jl_eh_restore_state(jl_handler_t *eh)
 {
     JL_SIGATOMIC_BEGIN();
     jl_current_task->eh = eh->prev;
     jl_pgcstack = eh->gcstack;
+    jl_gc_state_save_and_set(eh->gc_state);
     JL_SIGATOMIC_END();
 }
 
diff --git a/src/julia_internal.h b/src/julia_internal.h
index 233b153955113..6ce57c36760bb 100644
--- a/src/julia_internal.h
+++ b/src/julia_internal.h
@@ -21,6 +21,7 @@ extern unsigned sig_stack_size;
 
 JL_DLLEXPORT extern int jl_lineno;
 JL_DLLEXPORT extern const char *jl_filename;
+#define jl_in_finalizer (jl_get_ptls_states()->in_finalizer)
 
 STATIC_INLINE jl_value_t *newobj(jl_value_t *type, size_t nfields)
 {
@@ -246,6 +247,8 @@ void jl_start_threads(void);
 void jl_shutdown_threading(void);
 #ifdef JULIA_ENABLE_THREADING
 jl_get_ptls_states_func jl_get_ptls_states_getter(void);
+void jl_gc_signal_init(void);
+void jl_gc_signal_wait(void);
 #endif
 
 void jl_dump_bitcode(char *fname, const char *sysimg_data, size_t sysimg_len);
@@ -282,9 +285,6 @@ JL_DLLEXPORT size_t rec_backtrace_ctx(ptrint_t *data, size_t maxsize, bt_context
 size_t rec_backtrace_ctx_dwarf(ptrint_t *data, size_t maxsize, bt_context_t ctx);
 #endif
 JL_DLLEXPORT void jl_raise_debugger(void);
-#ifdef _OS_DARWIN_
-JL_DLLEXPORT void attach_exception_port(void);
-#endif
 // Set *name and *filename to either NULL or malloc'd string
 void jl_getFunctionInfo(char **name, char **filename, size_t *line,
                         char **inlinedat_file, size_t *inlinedat_line,
@@ -436,6 +436,11 @@ int jl_array_isdefined(jl_value_t **args, int nargs);
 
 JL_DEFINE_MUTEX_EXT(codegen)
 
+#if defined(__APPLE__) && defined(JULIA_ENABLE_THREADING)
+void jl_mach_gc_begin(void);
+void jl_mach_gc_end(void);
+#endif
+
 #if defined(_OS_WINDOWS_)
 STATIC_INLINE void *jl_malloc_aligned(size_t sz, size_t align)
 {
diff --git a/src/signal-handling.c b/src/signal-handling.c
index 7dbabf669730b..88732a354d705 100644
--- a/src/signal-handling.c
+++ b/src/signal-handling.c
@@ -49,6 +49,9 @@ static void jl_critical_error(int sig, bt_context_t context, ptrint_t *bt_data,
 // what to do on a critical error
 static void jl_critical_error(int sig, bt_context_t context, ptrint_t *bt_data, size_t *bt_size)
 {
+    // This function is not allowed to reference any TLS variables.
+    // We need to explicitly pass in the TLS buffer pointer when
+    // we make `jl_filename` and `jl_lineno` thread local.
     size_t n = *bt_size;
     if (sig)
         jl_safe_printf("\nsignal (%d): %s\n", sig, strsignal(sig));
diff --git a/src/signals-mach.c b/src/signals-mach.c
index 8f32afd15356d..f2e7387f0af20 100644
--- a/src/signals-mach.c
+++ b/src/signals-mach.c
@@ -15,6 +15,37 @@
 #include <sys/_structs.h>
 #endif
 
+static void attach_exception_port(thread_port_t thread);
+
+#ifdef JULIA_ENABLE_THREADING
+JL_DEFINE_MUTEX(gc_suspend)
+// This is a copy of `jl_gc_safepoint_activated` to make it easier
+// to synchronize the GC and the signal handler
+static int jl_gc_safepoint_activated = 0;
+// low 16 bits are the thread id, the next 8 bits are the original gc_state
+static arraylist_t suspended_threads;
+void jl_mach_gc_begin(void) // raises the file-local safepoint flag that catch_exception_raise checks
+{
+    JL_LOCK_NOGC(gc_suspend); // same lock is taken by the exception handler and jl_mach_gc_end
+    jl_gc_safepoint_activated = 1;
+    JL_UNLOCK_NOGC(gc_suspend);
+}
+void jl_mach_gc_end(void) // clears the safepoint flag, then restores and resumes every recorded thread
+{
+    JL_LOCK_NOGC(gc_suspend);
+    jl_gc_safepoint_activated = 0;
+    for (size_t i = 0;i < suspended_threads.len;i++) {
+        uintptr_t item = (uintptr_t)suspended_threads.items[i];
+        int16_t tid = (int16_t)item; // low 16 bits: thread id
+        int8_t gc_state = (int8_t)(item >> 16); // bits 16..23: saved gc_state, mirrors the `<< 16` packing
+        jl_all_task_states[tid].ptls->gc_state = gc_state;
+        thread_resume(pthread_mach_thread_np(jl_all_task_states[tid].system_id));
+    }
+    suspended_threads.len = 0; // list is reused for the next GC cycle
+    JL_UNLOCK_NOGC(gc_suspend);
+}
+#endif
+
 static mach_port_t segv_port = 0;
 
 extern boolean_t exc_server(mach_msg_header_t *, mach_msg_header_t *);
@@ -36,6 +67,9 @@ void *mach_segv_listener(void *arg)
 
 static void allocate_segv_handler()
 {
+#ifdef JULIA_ENABLE_THREADING
+    arraylist_new(&suspended_threads, jl_n_threads);
+#endif
     pthread_t thread;
     pthread_attr_t attr;
     kern_return_t ret;
@@ -53,7 +87,9 @@ static void allocate_segv_handler()
         jl_error("pthread_create failed");
     }
     pthread_attr_destroy(&attr);
-    attach_exception_port();
+    for (int16_t tid = 0;tid < jl_n_threads;tid++) {
+        attach_exception_port(pthread_mach_thread_np(jl_all_task_states[tid].system_id));
+    }
 }
 
 #ifdef LIBOSXUNWIND
@@ -117,20 +153,47 @@ kern_return_t catch_exception_raise(mach_port_t            exception_port,
         return profiler_segv_handler(exception_port, thread, task, exception, code, code_count);
     }
 #endif
+    int16_t tid;
 #ifdef JULIA_ENABLE_THREADING
     jl_tls_states_t *ptls = NULL;
-    for (int16_t tid = 0;tid < jl_n_threads;tid++) {
+    for (tid = 0;tid < jl_n_threads;tid++) {
         if (pthread_mach_thread_np(jl_all_task_states[tid].system_id) == thread) {
             ptls = jl_all_task_states[tid].ptls;
             break;
         }
     }
+    if (!ptls) {
+        // We don't know about this thread, let the kernel try another handler
+        // instead. This shouldn't actually happen since we only register the
+        // handler for the threads we know about.
+        jl_safe_printf("ERROR: Exception handler triggered on unmanaged thread.\n");
+        return KERN_INVALID_ARGUMENT;
+    }
 #else
     jl_tls_states_t *ptls = &jl_tls_states;
+    tid = 0;
 #endif
     kern_return_t ret = thread_get_state(thread, x86_EXCEPTION_STATE64, (thread_state_t)&exc_state, &exc_count);
     HANDLE_MACH_ERROR("thread_get_state", ret);
     uint64_t fault_addr = exc_state.__faultvaddr;
+#ifdef JULIA_ENABLE_THREADING
+    if (fault_addr == (uintptr_t)jl_gc_signal_page) {
+        JL_LOCK_NOGC(gc_suspend);
+        if (!jl_gc_safepoint_activated) {
+            // GC is done before we get the message, do nothing and return
+            JL_UNLOCK_NOGC(gc_suspend);
+            return KERN_SUCCESS;
+        }
+        // Otherwise, set the gc state of the thread, suspend and record it
+        int8_t gc_state = ptls->gc_state;
+        ptls->gc_state = JL_GC_STATE_WAITING;
+        uintptr_t item = tid | (((uintptr_t)gc_state) << 16);
+        arraylist_push(&suspended_threads, (void*)item);
+        thread_suspend(thread);
+        JL_UNLOCK_NOGC(gc_suspend);
+        return KERN_SUCCESS;
+    }
+#endif
 #ifdef SEGV_EXCEPTION
     if (1) {
 #else
@@ -151,7 +214,7 @@ kern_return_t catch_exception_raise(mach_port_t            exception_port,
                 return KERN_INVALID_ARGUMENT; // rethrow the SEGV since it wasn't an error with writing to read-only memory
             excpt = jl_readonlymemory_exception;
         }
-        jl_throw_in_thread(0, thread, excpt);
+        jl_throw_in_thread(tid, thread, excpt);
 
         return KERN_SUCCESS;
     }
@@ -159,16 +222,16 @@ kern_return_t catch_exception_raise(mach_port_t            exception_port,
         kern_return_t ret = thread_get_state(thread, x86_THREAD_STATE64, (thread_state_t)&state, &count);
         HANDLE_MACH_ERROR("thread_get_state", ret);
         jl_critical_error(SIGSEGV, (unw_context_t*)&state,
-                          jl_bt_data, &jl_bt_size);
+                          ptls->bt_data, &ptls->bt_size);
         return KERN_INVALID_ARGUMENT;
     }
 }
 
-JL_DLLEXPORT void attach_exception_port(void)
+static void attach_exception_port(thread_port_t thread)
 {
     kern_return_t ret;
     // http://www.opensource.apple.com/source/xnu/xnu-2782.1.97/osfmk/man/thread_set_exception_ports.html
-    ret = thread_set_exception_ports(mach_thread_self(), EXC_MASK_BAD_ACCESS, segv_port, EXCEPTION_DEFAULT, MACHINE_THREAD_STATE);
+    ret = thread_set_exception_ports(thread, EXC_MASK_BAD_ACCESS, segv_port, EXCEPTION_DEFAULT, MACHINE_THREAD_STATE);
     HANDLE_MACH_ERROR("thread_set_exception_ports", ret);
 }
 
@@ -269,7 +332,7 @@ void *mach_profile_listener(void *arg)
     (void)arg;
     int i;
     const int max_size = 512;
-    attach_exception_port();
+    attach_exception_port(mach_thread_self());
 #ifdef LIBOSXUNWIND
     mach_profiler_thread = mach_thread_self();
 #endif
diff --git a/src/signals-unix.c b/src/signals-unix.c
index 94cf958648c29..1ebb8a863d853 100644
--- a/src/signals-unix.c
+++ b/src/signals-unix.c
@@ -75,6 +75,15 @@ static void segv_handler(int sig, siginfo_t *info, void *context)
     sigset_t sset;
     assert(sig == SIGSEGV);
 
+#ifdef JULIA_ENABLE_THREADING
+    if (info->si_addr == jl_gc_signal_page) {
+        sigemptyset(&sset);
+        sigaddset(&sset, SIGSEGV);
+        sigprocmask(SIG_UNBLOCK, &sset, NULL);
+        jl_gc_signal_wait();
+        return;
+    }
+#endif
     if (jl_in_jl_ || is_addr_on_stack(jl_get_ptls_states(), info->si_addr)) { // stack overflow, or restarting jl_
         sigemptyset(&sset);
         sigaddset(&sset, SIGSEGV);
diff --git a/src/signals-win.c b/src/signals-win.c
index 991a7cd188210..0d651b48c1da7 100644
--- a/src/signals-win.c
+++ b/src/signals-win.c
@@ -169,6 +169,13 @@ static LONG WINAPI _exception_handler(struct _EXCEPTION_POINTERS *ExceptionInfo,
                 jl_throw_in_ctx(jl_stackovf_exception, ExceptionInfo->ContextRecord,in_ctx&&pSetThreadStackGuarantee);
                 return EXCEPTION_CONTINUE_EXECUTION;
             case EXCEPTION_ACCESS_VIOLATION:
+#ifdef JULIA_ENABLE_THREADING
+                if (ExceptionInfo->ExceptionRecord->ExceptionInformation[1] ==
+                    (intptr_t)jl_gc_signal_page) {
+                    jl_gc_signal_wait();
+                    return EXCEPTION_CONTINUE_EXECUTION;
+                }
+#endif
                 if (ExceptionInfo->ExceptionRecord->ExceptionInformation[0] == 1) { // writing to read-only memory (e.g. mmap)
                     jl_throw_in_ctx(jl_readonlymemory_exception, ExceptionInfo->ContextRecord,in_ctx);
                     return EXCEPTION_CONTINUE_EXECUTION;
diff --git a/src/task.c b/src/task.c
index b0cf00ee7cb8c..a675279a59ddb 100644
--- a/src/task.c
+++ b/src/task.c
@@ -355,7 +355,6 @@ static void ctx_switch(jl_task_t *t, jl_jmp_buf *where)
     //JL_SIGATOMIC_END();
 }
 
-extern int jl_in_gc;
 JL_DLLEXPORT jl_value_t *jl_switchto(jl_task_t *t, jl_value_t *arg)
 {
     if (t == jl_current_task) {
@@ -368,13 +367,15 @@ JL_DLLEXPORT jl_value_t *jl_switchto(jl_task_t *t, jl_value_t *arg)
             jl_throw(t->exception);
         return t->result;
     }
-    if (jl_in_gc)
+    if (jl_in_finalizer)
         jl_error("task switch not allowed from inside gc finalizer");
+    int8_t gc_state = jl_gc_unsafe_enter();
     jl_task_arg_in_transit = arg;
     ctx_switch(t, &t->ctx);
     jl_value_t *val = jl_task_arg_in_transit;
     jl_task_arg_in_transit = jl_nothing;
     throw_if_exception_set(jl_current_task);
+    jl_gc_unsafe_leave(gc_state);
     return val;
 }
 
@@ -481,6 +482,8 @@ static int frame_info_from_ip(char **func_name,
                               char **inlinedat_file, size_t *inlinedat_line,
                               size_t ip, int skipC, int skipInline)
 {
+    // This function is not allowed to reference any TLS variables since
+    // it can be called from an unmanaged thread on OSX.
     static const char *name_unknown = "???";
     int fromC = 0;
 
@@ -758,6 +761,8 @@ JL_DLLEXPORT jl_value_t *jl_get_backtrace(void)
 //for looking up functions from gdb:
 JL_DLLEXPORT void gdblookup(ptrint_t ip)
 {
+    // This function is not allowed to reference any TLS variables since
+    // it can be called from an unmanaged thread on OSX.
     char *func_name;
     size_t line_num;
     char *file_name;
@@ -808,6 +813,7 @@ JL_DLLEXPORT void gdbbacktrace(void)
 // yield to exception handler
 void JL_NORETURN throw_internal(jl_value_t *e)
 {
+    jl_gc_unsafe_enter();
     assert(e != NULL);
     jl_exception_in_transit = e;
     if (jl_current_task->eh != NULL) {
@@ -884,8 +890,8 @@ JL_DLLEXPORT jl_task_t *jl_new_task(jl_function_t *start, size_t ssize)
     stk += pagesz;
 
     init_task(t, stk);
-    JL_GC_POP();
     jl_gc_add_finalizer((jl_value_t*)t, jl_unprotect_stack_func);
+    JL_GC_POP();
 #endif
 
     return t;
diff --git a/src/threadgroup.h b/src/threadgroup.h
index b54b5f05adea7..ef15cda0bcbec 100644
--- a/src/threadgroup.h
+++ b/src/threadgroup.h
@@ -49,4 +49,3 @@ int  ti_threadgroup_destroy(ti_threadgroup_t *tg);
 extern ti_threadgroup_t *tgworld;
 
 #endif  /* THREADGROUP_H */
-
diff --git a/src/threading.c b/src/threading.c
index 84e22c8e1adb6..2655987101961 100644
--- a/src/threading.c
+++ b/src/threading.c
@@ -134,6 +134,8 @@ JL_DLLEXPORT int jl_n_threads;     // # threads we're actually using
 jl_thread_task_state_t *jl_all_task_states;
 
 // return calling thread's ID
+// Also update the suspended_threads list in signals-mach when changing the
+// type of the thread id.
 JL_DLLEXPORT int16_t jl_threadid(void) { return ti_tid; }
 
 struct _jl_thread_heap_t *jl_mk_thread_heap(void);
@@ -228,6 +230,12 @@ void ti_threadfun(void *arg)
     while (ta->state == TI_THREAD_INIT)
         cpu_pause();
     cpu_lfence();
+
+    // Assuming the functions called below don't contain unprotected GC
+    // critical regions. In general, the following part of this function
+    // shouldn't call any managed code without calling `jl_gc_unsafe_enter`
+    // first.
+    jl_gc_state_set(JL_GC_STATE_SAFE, 0);
     uv_barrier_wait(&thread_init_done);
     // initialize this thread in the thread group
     tg = ta->tg;
@@ -250,11 +258,19 @@ void ti_threadfun(void *arg)
 #endif
 
         if (work) {
-            if (work->command == TI_THREADWORK_DONE)
+            if (work->command == TI_THREADWORK_DONE) {
                 break;
-            else if (work->command == TI_THREADWORK_RUN)
+            }
+            else if (work->command == TI_THREADWORK_RUN) {
                 // TODO: return value? reduction?
+                // TODO: before we support getting return value from
+                //       the work, and after we have proper GC transition
+                //       support in the codegen and runtime we don't need to
+                //       enter GC unsafe region when starting the work.
+                int8_t gc_state = jl_gc_unsafe_enter();
                 ti_run_fun(work->fun, work->args);
+                jl_gc_unsafe_leave(gc_state);
+            }
         }
 
 #if PROFILE_JL_THREADING
@@ -406,6 +422,7 @@ JL_DLLEXPORT void *jl_threadgroup(void) { return (void *)tgworld; }
 // and run it in all threads
 JL_DLLEXPORT jl_value_t *jl_threading_run(jl_function_t *f, jl_svec_t *args)
 {
+    // GC safe
 #if PROFILE_JL_THREADING
     uint64_t tstart = rdtsc();
 #endif
@@ -417,6 +434,7 @@ JL_DLLEXPORT jl_value_t *jl_threading_run(jl_function_t *f, jl_svec_t *args)
     JL_TYPECHK(jl_threading_run, function, (jl_value_t*)f);
     JL_TYPECHK(jl_threading_run, simplevector, (jl_value_t*)args);
 
+    int8_t gc_state = jl_gc_unsafe_enter();
     JL_GC_PUSH2(&argtypes, &fun);
     if (jl_svec_len(args) == 0)
         argtypes = (jl_tupletype_t*)jl_typeof(jl_emptytuple);
@@ -454,8 +472,10 @@ JL_DLLEXPORT jl_value_t *jl_threading_run(jl_function_t *f, jl_svec_t *args)
     user_ticks[ti_tid] += (trun - tfork);
 #endif
 
+    jl_gc_state_set(JL_GC_STATE_SAFE, 0);
     // wait for completion (TODO: nowait?)
     ti_threadgroup_join(tgworld, ti_tid);
+    jl_gc_state_set(0, JL_GC_STATE_SAFE);
 
 #if PROFILE_JL_THREADING
     uint64_t tjoin = rdtsc();
@@ -463,6 +483,7 @@ JL_DLLEXPORT jl_value_t *jl_threading_run(jl_function_t *f, jl_svec_t *args)
 #endif
 
     JL_GC_POP();
+    jl_gc_unsafe_leave(gc_state);
 
     return tw->ret;
 }
diff --git a/src/toplevel.c b/src/toplevel.c
index 73692abd3d979..4eb9f3020d04c 100644
--- a/src/toplevel.c
+++ b/src/toplevel.c
@@ -25,9 +25,9 @@ extern "C" {
 #endif
 
 // current line number in a file
-JL_DLLEXPORT int jl_lineno = 0;
+JL_DLLEXPORT int jl_lineno = 0; // need to update jl_critical_error if this is TLS
 // current file name
-JL_DLLEXPORT const char *jl_filename = "no file";
+JL_DLLEXPORT const char *jl_filename = "no file"; // need to update jl_critical_error if this is TLS
 
 jl_module_t *jl_old_base_module = NULL;
 // the Main we started with, in case it is switched
diff --git a/test/threads.jl b/test/threads.jl
index 10b7ec13b5178..9fc414d0548fb 100644
--- a/test/threads.jl
+++ b/test/threads.jl
@@ -83,3 +83,18 @@ let lock = Threads.RecursiveSpinLock()
     @test unlock!(lock) == 0
     @test unlock!(lock) == 1
 end
+
+# Make sure doing a GC while holding a lock doesn't cause a deadlock
+# PR 14190. (This is only meaningful for threading)
+function threaded_gc_locked{LockT}(::Type{LockT}) # LockT: lock type to construct and exercise
+    lock = LockT()
+    @threads for i = 1:20
+        lock!(lock)
+        gc(false) # trigger a GC while this iteration holds the lock
+        unlock!(lock)
+    end
+end
+
+threaded_gc_locked(SpinLock)
+threaded_gc_locked(Threads.RecursiveSpinLock)
+threaded_gc_locked(Mutex)