From 381937116326400c1162a710c16496152b542443 Mon Sep 17 00:00:00 2001
From: Luuk de Gram <luuk@degram.dev>
Date: Sun, 18 Jun 2023 17:50:51 +0200
Subject: [PATCH 1/9] wasm-ld: implement `--export-memory` flag

This flag allows the user to force export the memory to the host
environment. This is useful when the memory is imported from the
host but must also be exported. This is (currently) required
to pass the memory validation for runtimes when using threads.
In the future this may become an error instead.
---
 src/Compilation.zig | 3 +++
 src/link.zig        | 1 +
 src/link/Wasm.zig   | 5 +++++
 src/main.zig        | 7 +++++++
 4 files changed, 16 insertions(+)

diff --git a/src/Compilation.zig b/src/Compilation.zig
index 92de342a16bb..4e0a36d652d5 100644
--- a/src/Compilation.zig
+++ b/src/Compilation.zig
@@ -557,6 +557,7 @@ pub const InitOptions = struct {
     linker_allow_shlib_undefined: ?bool = null,
     linker_bind_global_refs_locally: ?bool = null,
     linker_import_memory: ?bool = null,
+    linker_export_memory: ?bool = null,
     linker_import_symbols: bool = false,
     linker_import_table: bool = false,
     linker_export_table: bool = false,
@@ -1463,6 +1464,7 @@ pub fn create(gpa: Allocator, options: InitOptions) !*Compilation {
             .module_definition_file = options.linker_module_definition_file,
             .sort_section = options.linker_sort_section,
             .import_memory = options.linker_import_memory orelse false,
+            .export_memory = options.linker_export_memory orelse !(options.linker_import_memory orelse false),
             .import_symbols = options.linker_import_symbols,
             .import_table = options.linker_import_table,
             .export_table = options.linker_export_table,
@@ -2324,6 +2326,7 @@ fn addNonIncrementalStuffToCacheManifest(comp: *Compilation, man: *Cache.Manifes
 
     // WASM specific stuff
     man.hash.add(comp.bin_file.options.import_memory);
+    man.hash.add(comp.bin_file.options.export_memory);
     man.hash.addOptional(comp.bin_file.options.initial_memory);
     man.hash.addOptional(comp.bin_file.options.max_memory);
     man.hash.add(comp.bin_file.options.shared_memory);
diff --git a/src/link.zig b/src/link.zig
index 9458bd6c0a37..148138a149a4 100644
--- a/src/link.zig
+++ b/src/link.zig
@@ -133,6 +133,7 @@ pub const Options = struct {
     compress_debug_sections: CompressDebugSections,
     bind_global_refs_locally: bool,
     import_memory: bool,
+    export_memory: bool,
     import_symbols: bool,
     import_table: bool,
     export_table: bool,
diff --git a/src/link/Wasm.zig b/src/link/Wasm.zig
index 97a05a6e4ac1..b35ed0d26bc2 100644
--- a/src/link/Wasm.zig
+++ b/src/link/Wasm.zig
@@ -4251,6 +4251,7 @@ fn linkWithLLD(wasm: *Wasm, comp: *Compilation, prog_node: *std.Progress.Node) !
         man.hash.addOptional(wasm.base.options.stack_size_override);
         man.hash.add(wasm.base.options.build_id);
         man.hash.add(wasm.base.options.import_memory);
+        man.hash.add(wasm.base.options.export_memory);
         man.hash.add(wasm.base.options.import_table);
         man.hash.add(wasm.base.options.export_table);
         man.hash.addOptional(wasm.base.options.initial_memory);
@@ -4338,6 +4339,10 @@ fn linkWithLLD(wasm: *Wasm, comp: *Compilation, prog_node: *std.Progress.Node) !
             try argv.append("--import-memory");
         }
 
+        if (wasm.base.options.export_memory) {
+            try argv.append("--export-memory");
+        }
+
         if (wasm.base.options.import_table) {
             assert(!wasm.base.options.export_table);
             try argv.append("--import-table");
diff --git a/src/main.zig b/src/main.zig
index 95062c1723e8..4726d3e3080d 100644
--- a/src/main.zig
+++ b/src/main.zig
@@ -544,6 +544,7 @@ const usage_build_generic =
     \\  -dead_strip                    (Darwin) remove functions and data that are unreachable by the entry point or exported symbols
     \\  -dead_strip_dylibs             (Darwin) remove dylibs that are unreachable by the entry point or exported symbols
     \\  --import-memory                (WebAssembly) import memory from the environment
+    \\  --export-memory                (WebAssembly) export memory to the host (Default unless --import-memory used)
     \\  --import-symbols               (WebAssembly) import missing symbols from the host environment
     \\  --import-table                 (WebAssembly) import function table from the host environment
     \\  --export-table                 (WebAssembly) export function table to the host environment
@@ -787,6 +788,7 @@ fn buildOutputType(
     var linker_allow_shlib_undefined: ?bool = null;
     var linker_bind_global_refs_locally: ?bool = null;
     var linker_import_memory: ?bool = null;
+    var linker_export_memory: ?bool = null;
     var linker_import_symbols: bool = false;
     var linker_import_table: bool = false;
     var linker_export_table: bool = false;
@@ -1419,6 +1421,8 @@ fn buildOutputType(
                         }
                     } else if (mem.eql(u8, arg, "--import-memory")) {
                         linker_import_memory = true;
+                    } else if (mem.eql(u8, arg, "--export-memory")) {
+                        linker_export_memory = true;
                     } else if (mem.eql(u8, arg, "--import-symbols")) {
                         linker_import_symbols = true;
                     } else if (mem.eql(u8, arg, "--import-table")) {
@@ -1982,6 +1986,8 @@ fn buildOutputType(
                     linker_bind_global_refs_locally = true;
                 } else if (mem.eql(u8, arg, "--import-memory")) {
                     linker_import_memory = true;
+                } else if (mem.eql(u8, arg, "--export-memory")) {
+                    linker_export_memory = true;
                 } else if (mem.eql(u8, arg, "--import-symbols")) {
                     linker_import_symbols = true;
                 } else if (mem.eql(u8, arg, "--import-table")) {
@@ -3113,6 +3119,7 @@ fn buildOutputType(
         .linker_allow_shlib_undefined = linker_allow_shlib_undefined,
         .linker_bind_global_refs_locally = linker_bind_global_refs_locally,
         .linker_import_memory = linker_import_memory,
+        .linker_export_memory = linker_export_memory,
         .linker_import_symbols = linker_import_symbols,
         .linker_import_table = linker_import_table,
         .linker_export_table = linker_export_table,

From 062eb6f3c0027e8a371e6865963645b70e49b84e Mon Sep 17 00:00:00 2001
From: Luuk de Gram <luuk@degram.dev>
Date: Sun, 18 Jun 2023 18:22:04 +0200
Subject: [PATCH 2/9] Compilation: allow threads for Wasm when shared-memory is
 enabled

When the user enables the linker feature 'shared-memory', we do not force
a single-threaded build. The linker already verifies all other CPU features
required for threads are enabled. This is true for both WASI and
freestanding.
---
 src/Compilation.zig | 2 +-
 src/target.zig      | 4 ----
 2 files changed, 1 insertion(+), 5 deletions(-)

diff --git a/src/Compilation.zig b/src/Compilation.zig
index 4e0a36d652d5..fe13df80976d 100644
--- a/src/Compilation.zig
+++ b/src/Compilation.zig
@@ -1029,7 +1029,7 @@ pub fn create(gpa: Allocator, options: InitOptions) !*Compilation {
 
         const include_compiler_rt = options.want_compiler_rt orelse needs_c_symbols;
 
-        const must_single_thread = target_util.isSingleThreaded(options.target);
+        const must_single_thread = options.target.isWasm() and !options.linker_shared_memory;
         const single_threaded = options.single_threaded orelse must_single_thread;
         if (must_single_thread and !single_threaded) {
             return error.TargetRequiresSingleThreaded;
diff --git a/src/target.zig b/src/target.zig
index 2d27869cf603..28833428a7e1 100644
--- a/src/target.zig
+++ b/src/target.zig
@@ -207,10 +207,6 @@ pub fn supports_fpic(target: std.Target) bool {
     return target.os.tag != .windows and target.os.tag != .uefi;
 }
 
-pub fn isSingleThreaded(target: std.Target) bool {
-    return target.isWasm();
-}
-
 /// Valgrind supports more, but Zig does not support them yet.
 pub fn hasValgrindSupport(target: std.Target) bool {
     switch (target.cpu.arch) {

From ea0d4c8377aeabe1ef588e82cbdd9aa729adbce0 Mon Sep 17 00:00:00 2001
From: Luuk de Gram <luuk@degram.dev>
Date: Mon, 19 Jun 2023 12:10:32 +0200
Subject: [PATCH 3/9] std: implement `Futex` for WebAssembly

Implements std's `Futex` for the WebAssembly target using Wasm's
`atomics` instruction set. When the `atomics` cpu feature is disabled
we emit a compile-error.
---
 lib/std/Thread/Futex.zig | 45 ++++++++++++++++++++++++++++++++++++++++
 1 file changed, 45 insertions(+)

diff --git a/lib/std/Thread/Futex.zig b/lib/std/Thread/Futex.zig
index 768442539b84..2fe4120f6855 100644
--- a/lib/std/Thread/Futex.zig
+++ b/lib/std/Thread/Futex.zig
@@ -73,6 +73,8 @@ else if (builtin.os.tag == .openbsd)
     OpenbsdImpl
 else if (builtin.os.tag == .dragonfly)
     DragonflyImpl
+else if (builtin.target.isWasm())
+    WasmImpl
 else if (std.Thread.use_pthreads)
     PosixImpl
 else
@@ -446,6 +448,49 @@ const DragonflyImpl = struct {
     }
 };
 
+const WasmImpl = struct {
+    fn wait(ptr: *const Atomic(u32), expect: u32, timeout: ?u64) error{Timeout}!void {
+        if (!comptime std.Target.wasm.featureSetHas(builtin.target.cpu.features, .atomics)) {
+            @compileError("WASI target missing cpu feature 'atomics'");
+        }
+        const to: i64 = if (timeout) |to| @intCast(i64, to) else -1;
+        const result = asm (
+            \\local.get %[ptr]
+            \\local.get %[expected]
+            \\local.get %[timeout]
+            \\memory.atomic.wait32 0
+            \\local.set %[ret]
+            : [ret] "=r" (-> u32),
+            : [ptr] "r" (&ptr.value),
+              [expected] "r" (@bitCast(i32, expect)),
+              [timeout] "r" (to),
+        );
+        switch (result) {
+            0 => {}, // ok
+            1 => {}, // expected =! loaded
+            2 => return error.Timeout,
+            else => unreachable,
+        }
+    }
+
+    fn wake(ptr: *const Atomic(u32), max_waiters: u32) void {
+        if (!comptime std.Target.wasm.featureSetHas(builtin.target.cpu.features, .atomics)) {
+            @compileError("WASI target missing cpu feature 'atomics'");
+        }
+        assert(max_waiters != 0);
+        const woken_count = asm (
+            \\local.get %[ptr]
+            \\local.get %[waiters]
+            \\memory.atomic.notify 0
+            \\local.set %[ret]
+            : [ret] "=r" (-> u32),
+            : [ptr] "r" (&ptr.value),
+              [waiters] "r" (max_waiters),
+        );
+        _ = woken_count; // can be 0 when linker flag 'shared-memory' is not enabled
+    }
+};
+
 /// Modified version of linux's futex and Go's sema to implement userspace wait queues with pthread:
 /// https://code.woboq.org/linux/linux/kernel/futex.c.html
 /// https://go.dev/src/runtime/sema.go

From a97dbdfa0b1246913ba90cd5c05ff633e9003cb9 Mon Sep 17 00:00:00 2001
From: Luuk de Gram <luuk@degram.dev>
Date: Tue, 20 Jun 2023 22:19:51 +0200
Subject: [PATCH 4/9] std: implement `Thread` `spawn` for WASI

This implements a first version to spawn a WASI-thread. For a new thread
to be created, we calculate the size required to store TLS, the new stack,
and metadata. This size is then allocated using a user-provided allocator.

After a new thread is spawned, the HOST will call into our bootstrap procedure.
This bootstrap procedure will then initialize the TLS segment and set the
newly spawned thread's TID. It will also set the stack pointer to the newly
created stack to ensure we do not clobber the main thread's stack.

When bootstrapping the thread is completed, we will call the user's
function on this new thread.
---
 lib/std/Thread.zig | 192 +++++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 192 insertions(+)

diff --git a/lib/std/Thread.zig b/lib/std/Thread.zig
index a3b469ad6f15..e7edc17a350c 100644
--- a/lib/std/Thread.zig
+++ b/lib/std/Thread.zig
@@ -28,6 +28,8 @@ else if (use_pthreads)
     PosixThreadImpl
 else if (target.os.tag == .linux)
     LinuxThreadImpl
+else if (target.os.tag == .wasi)
+    WasiThreadImpl
 else
     UnsupportedImpl;
 
@@ -266,6 +268,7 @@ pub const Id = switch (target.os.tag) {
     .freebsd,
     .openbsd,
     .haiku,
+    .wasi,
     => u32,
     .macos, .ios, .watchos, .tvos => u64,
     .windows => os.windows.DWORD,
@@ -296,6 +299,8 @@ pub const SpawnConfig = struct {
 
     /// Size in bytes of the Thread's stack
     stack_size: usize = 16 * 1024 * 1024,
+    /// The allocator to be used to allocate memory for the to-be-spawned thread
+    allocator: ?std.mem.Allocator = null,
 };
 
 pub const SpawnError = error{
@@ -733,6 +738,193 @@ const PosixThreadImpl = struct {
     }
 };
 
+const WasiThreadImpl = struct {
+    comptime {
+        // Sets the stack pointer, which is needed after creating a new thread
+        // to ensure the stack of the main thread isn't being poluted.
+        asm (
+            \\ .text
+            \\ .export_name	__set_stack_pointer, __set_stack_pointer
+            \\ .globaltype __stack_pointer, i32
+            \\ .hidden wasi_thread_start
+            \\ .globl wasi_thread_start
+            \\ .type __set_stack_pointer, @function
+            \\
+            \\ __set_stack_pointer:
+            \\	  .functype	__set_stack_pointer (i32) -> ()
+            \\    local.get 0 # The raw pointer which replaces the stack pointer
+            \\    global.set __stack_pointer
+            \\    end_function
+        );
+    }
+    thread: *WasiThread,
+
+    pub const ThreadHandle = i32;
+    threadlocal var tls_thread_id: Id = 0;
+
+    const WasiThread = struct {
+        tid: Atomic(i32) = Atomic(i32).init(0),
+        memory: []u8,
+    };
+
+    /// A meta-data structure used to bootstrap a thread
+    const Instance = struct {
+        thread: WasiThread,
+        /// Address of this `Instance`
+        base: usize,
+        /// Contains the pointer of the new __tls_base.
+        tls_base: usize,
+        /// Contains the pointer to the stack for the newly spawned thread.
+        stack_pointer: usize,
+        /// Contains the pointer to the wrapper which holds all arguments
+        /// for the callback.
+        raw_ptr: usize,
+        /// Function pointer to a wrapping function which will call the user's
+        /// function upon thread spawn. The above mentioned pointer will be passed
+        /// to this function pointer as its argument.
+        call_back: *const fn (usize) void,
+    };
+
+    fn getCurrentId() Id {
+        return tls_thread_id;
+    }
+
+    fn getHandle(self: Impl) ThreadHandle {
+        return self.thread.tid;
+    }
+
+    fn detach(self: Impl) void {
+        _ = self;
+    }
+
+    fn join(self: Impl) void {
+        _ = self;
+    }
+
+    fn spawn(config: std.Thread.SpawnConfig, comptime f: anytype, args: anytype) !WasiThreadImpl {
+        if (config.allocator == null) return error.OutOfMemory; // an allocator is required to spawn a WASI-thread
+
+        // Wrapping struct required to hold the user-provided function arguments.
+        const Wrapper = struct {
+            args: @TypeOf(args),
+            fn entry(ptr: usize) void {
+                const w = @intToPtr(*@This(), ptr);
+                @call(.auto, f, w.args);
+            }
+        };
+
+        var guard_offset: usize = undefined;
+        var stack_offset: usize = undefined;
+        var tls_offset: usize = undefined;
+        var wrapper_offset: usize = undefined;
+        var instance_offset: usize = undefined;
+
+        // Calculate the bytes we have to allocate to store all thread information, including:
+        // - The actual stack for the thread
+        // - The TLS segment
+        // - `Instance` - containing information about how to call the user's function.
+        const map_bytes = blk: {
+            var bytes: usize = std.wasm.page_size;
+            guard_offset = bytes;
+
+            bytes = std.mem.alignForward(usize, bytes, 16); // align stack to 16 bytes
+            stack_offset = bytes;
+            bytes += @max(std.wasm.page_size, config.stack_size);
+
+            bytes = std.mem.alignForward(usize, bytes, __tls_align());
+            tls_offset = bytes;
+            bytes += __tls_size();
+
+            bytes = std.mem.alignForward(usize, bytes, @alignOf(Wrapper));
+            wrapper_offset = bytes;
+            bytes += @sizeOf(Wrapper);
+
+            bytes = std.mem.alignForward(usize, bytes, @alignOf(Instance));
+            instance_offset = bytes;
+            bytes += @sizeOf(Instance);
+
+            bytes = std.mem.alignForward(usize, bytes, std.wasm.page_size);
+            break :blk bytes;
+        };
+
+        // Allocate the amount of memory required for all meta data.
+        const allocated_memory = try config.allocator.?.alloc(u8, map_bytes);
+
+        const wrapper = @ptrCast(*Wrapper, @alignCast(@alignOf(Wrapper), &allocated_memory[wrapper_offset]));
+        wrapper.* = .{ .args = args };
+
+        const instance = @ptrCast(*Instance, @alignCast(@alignOf(Instance), &allocated_memory[instance_offset]));
+        instance.* = .{
+            .thread = .{ .memory = allocated_memory },
+            .base = @ptrToInt(allocated_memory.ptr),
+            .tls_base = tls_offset,
+            .stack_pointer = stack_offset,
+            .raw_ptr = @ptrToInt(wrapper),
+            .call_back = &Wrapper.entry,
+        };
+
+        const tid = spawnWasiThread(instance);
+        // The specification says any value lower than 0 indicates an error.
+        // The values of such error are unspecified. WASI-Libc treats it as EAGAIN.
+        if (tid < 0) {
+            return error.SystemResources;
+        }
+        instance.thread.tid.store(tid, .SeqCst);
+
+        return .{ .thread = &instance.thread };
+    }
+
+    export fn wasi_thread_start(tid: i32, arg: *const Instance) void {
+        __set_stack_pointer(arg.thread.memory.ptr + arg.stack_pointer);
+        __wasm_init_tls(arg.thread.memory.ptr + arg.tls_base);
+        WasiThreadImpl.tls_thread_id = @intCast(u32, tid);
+
+        // finished bootstrapping, call user's procedure.
+        arg.call_back(arg.raw_ptr);
+    }
+
+    // Asks the host to create a new thread for us.
+    // Newly created thread wil lcall `wasi_tread_start` with the thread ID as well
+    // as the input `arg` that was provided to `spawnWasiThread`
+    const spawnWasiThread = @"thread-spawn";
+    extern "wasi" fn @"thread-spawn"(arg: *const Instance) i32;
+
+    /// Initializes the TLS data segment starting at `memory`.
+    /// This is a synthetic function, generated by the linker.
+    extern fn __wasm_init_tls(memory: [*]u8) void;
+    extern fn __set_stack_pointer(ptr: [*]u8) void;
+
+    /// Returns a pointer to the base of the TLS data segment for the current thread
+    inline fn __tls_base() [*]u8 {
+        return asm (
+            \\ .globaltype __tls_base, i32
+            \\ global.get __tls_base
+            \\ local.set %[ret]
+            : [ret] "=r" (-> [*]u8),
+        );
+    }
+
+    /// Returns the size of the TLS segment
+    inline fn __tls_size() u32 {
+        return asm volatile (
+            \\ .globaltype __tls_size, i32, immutable
+            \\ global.get __tls_size
+            \\ local.set %[ret]
+            : [ret] "=r" (-> u32),
+        );
+    }
+
+    /// Returns the alignment of the TLS segment
+    inline fn __tls_align() u32 {
+        return asm (
+            \\ .globaltype __tls_align, i32, immutable
+            \\ global.get __tls_align
+            \\ local.set %[ret]
+            : [ret] "=r" (-> u32),
+        );
+    }
+};
+
 const LinuxThreadImpl = struct {
     const linux = os.linux;
 

From 10bf58b2db0e340137228f706ee2e8a67b46a892 Mon Sep 17 00:00:00 2001
From: Luuk de Gram <luuk@degram.dev>
Date: Wed, 21 Jun 2023 21:43:11 +0200
Subject: [PATCH 5/9] store allocator & remove global assembly

We now store the original allocator that was used to allocate the
memory required for the thread. This allocator can then be used
in any cleanup functionality to ensure the memory is freed correctly.

Secondly, we now use a function to set the stack pointer instead of
generating a function using global assembly. This is a lot cleaner
and more readable.
---
 lib/std/Thread.zig | 48 +++++++++++++++++++++++++---------------------
 1 file changed, 26 insertions(+), 22 deletions(-)

diff --git a/lib/std/Thread.zig b/lib/std/Thread.zig
index e7edc17a350c..04531f97bc46 100644
--- a/lib/std/Thread.zig
+++ b/lib/std/Thread.zig
@@ -739,32 +739,24 @@ const PosixThreadImpl = struct {
 };
 
 const WasiThreadImpl = struct {
-    comptime {
-        // Sets the stack pointer, which is needed after creating a new thread
-        // to ensure the stack of the main thread isn't being poluted.
-        asm (
-            \\ .text
-            \\ .export_name	__set_stack_pointer, __set_stack_pointer
-            \\ .globaltype __stack_pointer, i32
-            \\ .hidden wasi_thread_start
-            \\ .globl wasi_thread_start
-            \\ .type __set_stack_pointer, @function
-            \\
-            \\ __set_stack_pointer:
-            \\	  .functype	__set_stack_pointer (i32) -> ()
-            \\    local.get 0 # The raw pointer which replaces the stack pointer
-            \\    global.set __stack_pointer
-            \\    end_function
-        );
-    }
     thread: *WasiThread,
 
     pub const ThreadHandle = i32;
     threadlocal var tls_thread_id: Id = 0;
 
     const WasiThread = struct {
+        /// Thread ID
         tid: Atomic(i32) = Atomic(i32).init(0),
+        /// Contains all memory which was allocated to bootstrap this thread, including:
+        /// - Guard page
+        /// - Stack
+        /// - TLS segment
+        /// - `Instance`
+        /// All memory is freed upon call to `join`
         memory: []u8,
+        /// The allocator used to allocate the thread's memory,
+        /// which is also used during `join` to ensure clean-up.
+        allocator: std.mem.Allocator,
     };
 
     /// A meta-data structure used to bootstrap a thread
@@ -790,7 +782,7 @@ const WasiThreadImpl = struct {
     }
 
     fn getHandle(self: Impl) ThreadHandle {
-        return self.thread.tid;
+        return self.thread.tid.load(.SeqCst);
     }
 
     fn detach(self: Impl) void {
@@ -813,7 +805,6 @@ const WasiThreadImpl = struct {
             }
         };
 
-        var guard_offset: usize = undefined;
         var stack_offset: usize = undefined;
         var tls_offset: usize = undefined;
         var wrapper_offset: usize = undefined;
@@ -824,8 +815,11 @@ const WasiThreadImpl = struct {
         // - The TLS segment
         // - `Instance` - containing information about how to call the user's function.
         const map_bytes = blk: {
+            // start with atleast a single page, which is used as a guard to prevent
+            // other threads clobbering our new thread.
+            // Unfortunately, WebAssembly has no notion of read-only segments, so this
+            // is only a temporary measure until the entire page is "run over".
             var bytes: usize = std.wasm.page_size;
-            guard_offset = bytes;
 
             bytes = std.mem.alignForward(usize, bytes, 16); // align stack to 16 bytes
             stack_offset = bytes;
@@ -855,7 +849,7 @@ const WasiThreadImpl = struct {
 
         const instance = @ptrCast(*Instance, @alignCast(@alignOf(Instance), &allocated_memory[instance_offset]));
         instance.* = .{
-            .thread = .{ .memory = allocated_memory },
+            .thread = .{ .memory = allocated_memory, .allocator = config.allocator.? },
             .base = @ptrToInt(allocated_memory.ptr),
             .tls_base = tls_offset,
             .stack_pointer = stack_offset,
@@ -923,6 +917,16 @@ const WasiThreadImpl = struct {
             : [ret] "=r" (-> u32),
         );
     }
+
+    /// Allows for setting the stack pointer in the WebAssembly module.
+    inline fn __set_stack_pointer(addr: [*]u8) void {
+        asm volatile (
+            \\ local.get %[ptr]
+            \\ global.set __stack_pointer
+            :
+            : [ptr] "r" (addr),
+        );
+    }
 };
 
 const LinuxThreadImpl = struct {

From 834609038c83e0122b506fadfa38b9acb7ee9068 Mon Sep 17 00:00:00 2001
From: Luuk de Gram <luuk@degram.dev>
Date: Wed, 21 Jun 2023 21:44:53 +0200
Subject: [PATCH 6/9] std: implement `join` for WASI-threads

We now reset the Thread ID to 0 and wake up the main thread listening
for the thread to finish. We use inline assembly because we must not use
the stack when resetting the thread ID, as doing so could possibly
clobber memory.

Currently, we leak the memory that was allocated for the thread.
We need to implement a way where we can clean up the memory without
using the stack (as the stack is stored inside this same memory).
---
 lib/std/Thread.zig | 67 ++++++++++++++++++++++++++++++++++++++++------
 1 file changed, 59 insertions(+), 8 deletions(-)

diff --git a/lib/std/Thread.zig b/lib/std/Thread.zig
index 04531f97bc46..29ba789ffcde 100644
--- a/lib/std/Thread.zig
+++ b/lib/std/Thread.zig
@@ -790,7 +790,40 @@ const WasiThreadImpl = struct {
     }
 
     fn join(self: Impl) void {
-        _ = self;
+        // TODO cleanup memory
+        // The memory also contains the thread's stack, which is problematic while freeing the memory
+        // defer self.thread.allocator.free(self.thread.memory);
+
+        var spin: u8 = 10;
+        while (true) {
+            const tid = self.thread.tid.load(.SeqCst);
+            if (tid == 0) {
+                break;
+            }
+
+            if (spin > 0) {
+                spin -= 1;
+                std.atomic.spinLoopHint();
+                continue;
+            }
+
+            const result = asm (
+                \\local.get %[ptr]
+                \\local.get %[expected]
+                \\i64.const -1 # infinite
+                \\memory.atomic.wait32 0
+                \\local.set %[ret]
+                : [ret] "=r" (-> u32),
+                : [ptr] "r" (&self.thread.tid.value),
+                  [expected] "r" (tid),
+            );
+            switch (result) {
+                0 => continue, // ok
+                1 => continue, // expected =! loaded
+                2 => unreachable, // timeout (infinite)
+                else => unreachable,
+            }
+        }
     }
 
     fn spawn(config: std.Thread.SpawnConfig, comptime f: anytype, args: anytype) !WasiThreadImpl {
@@ -868,25 +901,43 @@ const WasiThreadImpl = struct {
         return .{ .thread = &instance.thread };
     }
 
-    export fn wasi_thread_start(tid: i32, arg: *const Instance) void {
+    /// Bootstrap procedure, called by the HOST environment after thread creation.
+    export fn wasi_thread_start(tid: i32, arg: *Instance) void {
         __set_stack_pointer(arg.thread.memory.ptr + arg.stack_pointer);
         __wasm_init_tls(arg.thread.memory.ptr + arg.tls_base);
         WasiThreadImpl.tls_thread_id = @intCast(u32, tid);
 
-        // finished bootstrapping, call user's procedure.
+        // Finished bootstrapping, call user's procedure.
         arg.call_back(arg.raw_ptr);
+
+        // Thread finished. Reset Thread ID and wake up the main thread if needed.
+        // We use inline assembly here as we must ensure not to use the stack.
+        asm volatile (
+            \\ local.get %[ptr]
+            \\ i32.const 0
+            \\ i32.atomic.store 0
+            :
+            : [ptr] "r" (&arg.thread.tid.value),
+        );
+        asm volatile (
+            \\ local.get %[ptr]
+            \\ i32.const 1 # waiters
+            \\ memory.atomic.notify 0
+            \\ drop # no need to know the waiters
+            :
+            : [ptr] "r" (&arg.thread.tid.value),
+        );
     }
 
-    // Asks the host to create a new thread for us.
-    // Newly created thread wil lcall `wasi_tread_start` with the thread ID as well
-    // as the input `arg` that was provided to `spawnWasiThread`
+    /// Asks the host to create a new thread for us.
+    /// Newly created thread will call `wasi_tread_start` with the thread ID as well
+    /// as the input `arg` that was provided to `spawnWasiThread`
     const spawnWasiThread = @"thread-spawn";
-    extern "wasi" fn @"thread-spawn"(arg: *const Instance) i32;
+    extern "wasi" fn @"thread-spawn"(arg: *Instance) i32;
 
     /// Initializes the TLS data segment starting at `memory`.
     /// This is a synthetic function, generated by the linker.
     extern fn __wasm_init_tls(memory: [*]u8) void;
-    extern fn __set_stack_pointer(ptr: [*]u8) void;
 
     /// Returns a pointer to the base of the TLS data segment for the current thread
     inline fn __tls_base() [*]u8 {

From 622b7c47468bc4508f4cfe840e0f8c51b54949dd Mon Sep 17 00:00:00 2001
From: Luuk de Gram <luuk@degram.dev>
Date: Thu, 22 Jun 2023 19:53:07 +0200
Subject: [PATCH 7/9] free allocated memory upon call `join`

When `join` detects a thread has completed, it will free the allocated
memory of the thread. For this we must first copy the allocator. This is
required as the allocated memory holds a reference to the original
allocator. If we free the memory, we would end up with UB as the
allocator would free itself.
---
 lib/std/Thread.zig | 42 ++++++++++++++++++++++--------------------
 1 file changed, 22 insertions(+), 20 deletions(-)

diff --git a/lib/std/Thread.zig b/lib/std/Thread.zig
index 29ba789ffcde..f5479c0b4e24 100644
--- a/lib/std/Thread.zig
+++ b/lib/std/Thread.zig
@@ -762,13 +762,13 @@ const WasiThreadImpl = struct {
     /// A meta-data structure used to bootstrap a thread
     const Instance = struct {
         thread: WasiThread,
-        /// Address of this `Instance`
-        base: usize,
-        /// Contains the pointer of the new __tls_base.
-        tls_base: usize,
-        /// Contains the pointer to the stack for the newly spawned thread.
-        stack_pointer: usize,
-        /// Contains the pointer to the wrapper which holds all arguments
+        /// Contains the offset to the new __tls_base.
+        /// The offset starting from the memory's base.
+        tls_offset: usize,
+        /// Contains the offset to the stack for the newly spawned thread.
+        /// The offset is calculated starting from the memory's base.
+        stack_offset: usize,
+        /// Contains the raw pointer value to the wrapper which holds all arguments
         /// for the callback.
         raw_ptr: usize,
         /// Function pointer to a wrapping function which will call the user's
@@ -790,9 +790,12 @@ const WasiThreadImpl = struct {
     }
 
     fn join(self: Impl) void {
-        // TODO cleanup memory
-        // The memory also contains the thread's stack, which is problematic while freeing the memory
-        // defer self.thread.allocator.free(self.thread.memory);
+        defer {
+            // Create a copy of the allocator so we do not free the reference to the
+            // original allocator while freeing the memory.
+            var allocator = self.thread.allocator;
+            allocator.free(self.thread.memory);
+        }
 
         var spin: u8 = 10;
         while (true) {
@@ -808,11 +811,11 @@ const WasiThreadImpl = struct {
             }
 
             const result = asm (
-                \\local.get %[ptr]
-                \\local.get %[expected]
-                \\i64.const -1 # infinite
-                \\memory.atomic.wait32 0
-                \\local.set %[ret]
+                \\ local.get %[ptr]
+                \\ local.get %[expected]
+                \\ i64.const -1 # infinite
+                \\ memory.atomic.wait32 0
+                \\ local.set %[ret]
                 : [ret] "=r" (-> u32),
                 : [ptr] "r" (&self.thread.tid.value),
                   [expected] "r" (tid),
@@ -883,9 +886,8 @@ const WasiThreadImpl = struct {
         const instance = @ptrCast(*Instance, @alignCast(@alignOf(Instance), &allocated_memory[instance_offset]));
         instance.* = .{
             .thread = .{ .memory = allocated_memory, .allocator = config.allocator.? },
-            .base = @ptrToInt(allocated_memory.ptr),
-            .tls_base = tls_offset,
-            .stack_pointer = stack_offset,
+            .tls_offset = tls_offset,
+            .stack_offset = stack_offset,
             .raw_ptr = @ptrToInt(wrapper),
             .call_back = &Wrapper.entry,
         };
@@ -903,8 +905,8 @@ const WasiThreadImpl = struct {
 
     /// Bootstrap procedure, called by the HOST environment after thread creation.
     export fn wasi_thread_start(tid: i32, arg: *Instance) void {
-        __set_stack_pointer(arg.thread.memory.ptr + arg.stack_pointer);
-        __wasm_init_tls(arg.thread.memory.ptr + arg.tls_base);
+        __set_stack_pointer(arg.thread.memory.ptr + arg.stack_offset);
+        __wasm_init_tls(arg.thread.memory.ptr + arg.tls_offset);
         WasiThreadImpl.tls_thread_id = @intCast(u32, tid);
 
         // Finished bootstrapping, call user's procedure.

From e06ab1b0107e8a6a1720703a6df0f61f535b5e5a Mon Sep 17 00:00:00 2001
From: Luuk de Gram <luuk@degram.dev>
Date: Fri, 23 Jun 2023 19:14:55 +0200
Subject: [PATCH 8/9] std: implement `detach` for WASI-threads

When a thread is detached from the main thread, we automatically
clean up any allocated memory. For this we first reset the stack-pointer
to the original stack-pointer of the main-thread so we can safely free
the memory which also contains the thread's stack.
---
 lib/std/Thread.zig       | 87 ++++++++++++++++++++++++++++------------
 lib/std/Thread/Futex.zig |  4 +-
 2 files changed, 64 insertions(+), 27 deletions(-)

diff --git a/lib/std/Thread.zig b/lib/std/Thread.zig
index f5479c0b4e24..b8cc8c8869b6 100644
--- a/lib/std/Thread.zig
+++ b/lib/std/Thread.zig
@@ -757,6 +757,8 @@ const WasiThreadImpl = struct {
         /// The allocator used to allocate the thread's memory,
         /// which is also used during `join` to ensure clean-up.
         allocator: std.mem.Allocator,
+        /// The current state of the thread.
+        state: State = State.init(.running),
     };
 
     /// A meta-data structure used to bootstrap a thread
@@ -775,8 +777,15 @@ const WasiThreadImpl = struct {
         /// function upon thread spawn. The above mentioned pointer will be passed
         /// to this function pointer as its argument.
         call_back: *const fn (usize) void,
+        /// When a thread is in `detached` state, we must free all of its memory
+        /// upon thread completion. However, as this is done while still within
+        /// the thread, we must first jump back to the main thread's stack or else
+        /// we end up freeing the stack that we're currently using.
+        original_stack_pointer: [*]u8,
     };
 
+    const State = Atomic(enum(u8) { running, completed, detached });
+
     fn getCurrentId() Id {
         return tls_thread_id;
     }
@@ -786,7 +795,11 @@ const WasiThreadImpl = struct {
     }
 
     fn detach(self: Impl) void {
-        _ = self;
+        switch (self.thread.state.swap(.detached, .SeqCst)) {
+            .running => {},
+            .completed => self.join(),
+            .detached => unreachable,
+        }
     }
 
     fn join(self: Impl) void {
@@ -836,7 +849,7 @@ const WasiThreadImpl = struct {
         const Wrapper = struct {
             args: @TypeOf(args),
             fn entry(ptr: usize) void {
-                const w = @intToPtr(*@This(), ptr);
+                const w: *@This() = @ptrFromInt(ptr);
                 @call(.auto, f, w.args);
             }
         };
@@ -854,7 +867,7 @@ const WasiThreadImpl = struct {
             // start with atleast a single page, which is used as a guard to prevent
             // other threads clobbering our new thread.
             // Unfortunately, WebAssembly has no notion of read-only segments, so this
-            // is only a temporary measure until the entire page is "run over".
+            // is only a best effort.
             var bytes: usize = std.wasm.page_size;
 
             bytes = std.mem.alignForward(usize, bytes, 16); // align stack to 16 bytes
@@ -880,16 +893,17 @@ const WasiThreadImpl = struct {
         // Allocate the amount of memory required for all meta data.
         const allocated_memory = try config.allocator.?.alloc(u8, map_bytes);
 
-        const wrapper = @ptrCast(*Wrapper, @alignCast(@alignOf(Wrapper), &allocated_memory[wrapper_offset]));
+        const wrapper: *Wrapper = @ptrCast(@alignCast(&allocated_memory[wrapper_offset]));
         wrapper.* = .{ .args = args };
 
-        const instance = @ptrCast(*Instance, @alignCast(@alignOf(Instance), &allocated_memory[instance_offset]));
+        const instance: *Instance = @ptrCast(@alignCast(&allocated_memory[instance_offset]));
         instance.* = .{
             .thread = .{ .memory = allocated_memory, .allocator = config.allocator.? },
             .tls_offset = tls_offset,
             .stack_offset = stack_offset,
-            .raw_ptr = @ptrToInt(wrapper),
+            .raw_ptr = @intFromPtr(wrapper),
             .call_back = &Wrapper.entry,
+            .original_stack_pointer = __get_stack_pointer(),
         };
 
         const tid = spawnWasiThread(instance);
@@ -903,32 +917,46 @@ const WasiThreadImpl = struct {
         return .{ .thread = &instance.thread };
     }
 
-    /// Bootstrap procedure, called by the HOST environment after thread creation.
+    /// Bootstrap procedure, called by the host environment after thread creation.
     export fn wasi_thread_start(tid: i32, arg: *Instance) void {
         __set_stack_pointer(arg.thread.memory.ptr + arg.stack_offset);
         __wasm_init_tls(arg.thread.memory.ptr + arg.tls_offset);
-        WasiThreadImpl.tls_thread_id = @intCast(u32, tid);
+        @atomicStore(u32, &WasiThreadImpl.tls_thread_id, @intCast(tid), .SeqCst);
 
         // Finished bootstrapping, call user's procedure.
         arg.call_back(arg.raw_ptr);
 
-        // Thread finished. Reset Thread ID and wake up the main thread if needed.
-        // We use inline assembly here as we must ensure not to use the stack.
-        asm volatile (
-            \\ local.get %[ptr]
-            \\ i32.const 0
-            \\ i32.atomic.store 0
-            :
-            : [ptr] "r" (&arg.thread.tid.value),
-        );
-        asm volatile (
-            \\ local.get %[ptr]
-            \\ i32.const 1 # waiters
-            \\ memory.atomic.notify 0
-            \\ drop # no need to know the waiters
-            :
-            : [ptr] "r" (&arg.thread.tid.value),
-        );
+        switch (arg.thread.state.swap(.completed, .SeqCst)) {
+            .running => {
+                // reset the Thread ID
+                asm volatile (
+                    \\ local.get %[ptr]
+                    \\ i32.const 0
+                    \\ i32.atomic.store 0
+                    :
+                    : [ptr] "r" (&arg.thread.tid.value),
+                );
+
+                // Wake the main thread listening to this thread
+                asm volatile (
+                    \\ local.get %[ptr]
+                    \\ i32.const 1 # waiters
+                    \\ memory.atomic.notify 0
+                    \\ drop # no need to know the waiters
+                    :
+                    : [ptr] "r" (&arg.thread.tid.value),
+                );
+            },
+            .completed => unreachable,
+            .detached => {
+                // restore the original stack pointer so we can free the memory
+                // without having to worry about freeing the stack
+                __set_stack_pointer(arg.original_stack_pointer);
+                // Ensure a copy so we don't free the allocator reference itself
+                var allocator = arg.thread.allocator;
+                allocator.free(arg.thread.memory);
+            },
+        }
     }
 
     /// Asks the host to create a new thread for us.
@@ -980,6 +1008,15 @@ const WasiThreadImpl = struct {
             : [ptr] "r" (addr),
         );
     }
+
+    /// Returns the current value of the stack pointer
+    inline fn __get_stack_pointer() [*]u8 {
+        return asm (
+            \\ global.get __stack_pointer
+            \\ local.set %[stack_ptr]
+            : [stack_ptr] "=r" (-> [*]u8),
+        );
+    }
 };
 
 const LinuxThreadImpl = struct {
diff --git a/lib/std/Thread/Futex.zig b/lib/std/Thread/Futex.zig
index 2fe4120f6855..0020180bcb6d 100644
--- a/lib/std/Thread/Futex.zig
+++ b/lib/std/Thread/Futex.zig
@@ -453,7 +453,7 @@ const WasmImpl = struct {
         if (!comptime std.Target.wasm.featureSetHas(builtin.target.cpu.features, .atomics)) {
             @compileError("WASI target missing cpu feature 'atomics'");
         }
-        const to: i64 = if (timeout) |to| @intCast(i64, to) else -1;
+        const to: i64 = if (timeout) |to| @intCast(to) else -1;
         const result = asm (
             \\local.get %[ptr]
             \\local.get %[expected]
@@ -462,7 +462,7 @@ const WasmImpl = struct {
             \\local.set %[ret]
             : [ret] "=r" (-> u32),
             : [ptr] "r" (&ptr.value),
-              [expected] "r" (@bitCast(i32, expect)),
+              [expected] "r" (@as(i32, @bitCast(expect))),
               [timeout] "r" (to),
         );
         switch (result) {

From 87b8a0567b0f54415aeecd879d3f1a4e12014d22 Mon Sep 17 00:00:00 2001
From: Luuk de Gram <luuk@degram.dev>
Date: Mon, 26 Jun 2023 19:10:34 +0200
Subject: [PATCH 9/9] default to single-threaded for WebAssembly

When targeting WebAssembly, we default to a single-threaded build,
as threads are still experimental. The user can, however, enable a
multi-threaded build by specifying '-fno-single-threaded'. It is an
error to enable this flag without also enabling shared-memory.
---
 lib/std/Thread.zig  |  4 ++++
 src/Compilation.zig |  2 +-
 src/main.zig        | 27 ++++++++++++++++++++-------
 src/target.zig      |  5 +++++
 4 files changed, 30 insertions(+), 8 deletions(-)

diff --git a/lib/std/Thread.zig b/lib/std/Thread.zig
index b8cc8c8869b6..99e2feb4cf43 100644
--- a/lib/std/Thread.zig
+++ b/lib/std/Thread.zig
@@ -919,6 +919,10 @@ const WasiThreadImpl = struct {
 
     /// Bootstrap procedure, called by the host environment after thread creation.
     export fn wasi_thread_start(tid: i32, arg: *Instance) void {
+        if (builtin.single_threaded) {
+            // ensure function is not analyzed in single-threaded mode
+            return;
+        }
         __set_stack_pointer(arg.thread.memory.ptr + arg.stack_offset);
         __wasm_init_tls(arg.thread.memory.ptr + arg.tls_offset);
         @atomicStore(u32, &WasiThreadImpl.tls_thread_id, @intCast(tid), .SeqCst);
diff --git a/src/Compilation.zig b/src/Compilation.zig
index fe13df80976d..4e0a36d652d5 100644
--- a/src/Compilation.zig
+++ b/src/Compilation.zig
@@ -1029,7 +1029,7 @@ pub fn create(gpa: Allocator, options: InitOptions) !*Compilation {
 
         const include_compiler_rt = options.want_compiler_rt orelse needs_c_symbols;
 
-        const must_single_thread = options.target.isWasm() and !options.linker_shared_memory;
+        const must_single_thread = target_util.isSingleThreaded(options.target);
         const single_threaded = options.single_threaded orelse must_single_thread;
         if (must_single_thread and !single_threaded) {
             return error.TargetRequiresSingleThreaded;
diff --git a/src/main.zig b/src/main.zig
index 4726d3e3080d..1b4a93eb4552 100644
--- a/src/main.zig
+++ b/src/main.zig
@@ -2428,15 +2428,28 @@ fn buildOutputType(
             link_libcpp = true;
     }
 
-    if (target_info.target.cpu.arch.isWasm() and linker_shared_memory) {
-        if (output_mode == .Obj) {
-            fatal("shared memory is not allowed in object files", .{});
+    if (target_info.target.cpu.arch.isWasm()) blk: {
+        if (single_threaded == null) {
+            single_threaded = true;
         }
+        if (linker_shared_memory) {
+            if (output_mode == .Obj) {
+                fatal("shared memory is not allowed in object files", .{});
+            }
 
-        if (!target_info.target.cpu.features.isEnabled(@intFromEnum(std.Target.wasm.Feature.atomics)) or
-            !target_info.target.cpu.features.isEnabled(@intFromEnum(std.Target.wasm.Feature.bulk_memory)))
-        {
-            fatal("'atomics' and 'bulk-memory' features must be enabled to use shared memory", .{});
+            if (!target_info.target.cpu.features.isEnabled(@intFromEnum(std.Target.wasm.Feature.atomics)) or
+                !target_info.target.cpu.features.isEnabled(@intFromEnum(std.Target.wasm.Feature.bulk_memory)))
+            {
+                fatal("'atomics' and 'bulk-memory' features must be enabled to use shared memory", .{});
+            }
+            break :blk;
+        }
+
+        // Single-threaded is the default for WebAssembly, so only when the user specified `-fno-single-threaded`
+        // can they enable multithreaded WebAssembly builds.
+        const is_single_threaded = single_threaded.?;
+        if (!is_single_threaded) {
+            fatal("'-fno-single-threaded' requires the linker feature shared-memory to be enabled using '--shared-memory'", .{});
         }
     }
 
diff --git a/src/target.zig b/src/target.zig
index 28833428a7e1..f07dcc43d21e 100644
--- a/src/target.zig
+++ b/src/target.zig
@@ -207,6 +207,11 @@ pub fn supports_fpic(target: std.Target) bool {
     return target.os.tag != .windows and target.os.tag != .uefi;
 }
 
+pub fn isSingleThreaded(target: std.Target) bool {
+    _ = target;
+    return false;
+}
+
 /// Valgrind supports more, but Zig does not support them yet.
 pub fn hasValgrindSupport(target: std.Target) bool {
     switch (target.cpu.arch) {