From 07133795f2617a47dded898e0868fd5021bce757 Mon Sep 17 00:00:00 2001
From: Misaki Kasumi
Date: Tue, 31 Dec 2024 17:01:23 +0800
Subject: [PATCH] std.process.Child: use clone3 on x86 and x86_64

---
 lib/std/os/linux.zig        | 44 ++++++++++++++++++++++++++++++++++++++++++++
 lib/std/os/linux/x86.zig    | 39 +++++++++++++++++++++++++++++++++++++++
 lib/std/os/linux/x86_64.zig | 30 ++++++++++++++++++++++++++++++
 lib/std/process/Child.zig   | 45 +++++++++++++++++++++++++++++++++++----------
 4 files changed, 148 insertions(+), 10 deletions(-)

diff --git a/lib/std/os/linux.zig b/lib/std/os/linux.zig
index 85a98a574f66..32070e5a03e9 100644
--- a/lib/std/os/linux.zig
+++ b/lib/std/os/linux.zig
@@ -87,6 +87,50 @@ pub fn clone(
     ) callconv(.C) usize, @ptrCast(&syscall_bits.clone))(func, stack, flags, arg, ptid, tp, ctid);
 }
 
+/// Argument block for the `clone3` syscall. Declared as an `extern struct` of
+/// `u64` fields so that the in-memory layout follows the declared field order.
+pub const clone_args = extern struct {
+    flags: u64,
+    pidfd: u64,
+    child_tid: u64,
+    parent_tid: u64,
+    exit_signal: u64,
+    stack: u64,
+    stack_size: u64,
+    tls: u64,
+    set_tid: u64,
+    set_tid_size: u64,
+    cgroup: u64,
+};
+
+/// Issues the `clone3` syscall. In the child, `func(arg)` is called and its
+/// return value becomes the exit status; the child never returns from here.
+/// The parent receives the raw syscall return value, with failures encoded
+/// as a negated errno.
+///
+/// `size` is handed to the kernel as the size of `cl_args`. If the target
+/// architecture has no `syscall_bits.clone3`, no syscall is made and the
+/// encoded `E.NOSYS` is returned so that callers can fall back to `clone`.
+pub fn clone3(
+    cl_args: *const clone_args,
+    size: usize,
+    func: *const fn (arg: usize) callconv(.C) u8,
+    arg: usize,
+) usize {
+    // TODO: implement clone3 in assembly for the remaining architectures.
+    if (@hasDecl(syscall_bits, "clone3")) {
+        // Can't directly call a naked function; cast to C calling convention first.
+        return @as(*const fn (
+            cl_args: *const clone_args,
+            size: usize,
+            func: *const fn (arg: usize) callconv(.C) u8,
+            arg: usize,
+        ) callconv(.C) usize, @ptrCast(&syscall_bits.clone3))(cl_args, size, func, arg);
+    } else {
+        return @bitCast(-@as(isize, @intFromEnum(E.NOSYS)));
+    }
+}
+
 pub const ARCH = arch_bits.ARCH;
 pub const Elf_Symndx = arch_bits.Elf_Symndx;
 pub const F = arch_bits.F;
diff --git a/lib/std/os/linux/x86.zig b/lib/std/os/linux/x86.zig
index cb746e52a8dc..0c572aa5fa80 100644
--- a/lib/std/os/linux/x86.zig
+++ b/lib/std/os/linux/x86.zig
@@ -167,6 +167,45 @@ pub fn clone() callconv(.Naked) usize {
     );
 }
 
+/// `clone3` syscall wrapper. Called through the C calling convention as
+/// `fn (cl_args, size, func, arg) usize`; see `clone3` in `linux.zig`.
+/// The parent returns the raw syscall result. The child never returns: it
+/// calls `func(arg)` on a 16-byte aligned stack and passes the result to
+/// `SYS_exit`.
+pub fn clone3() callconv(.Naked) usize {
+    // On entry: 4(%esp)=cl_args, 8(%esp)=size, 12(%esp)=func, 16(%esp)=arg.
+    // func and arg are loaded into %edx and %esi before the syscall so that
+    // the child does not have to read them back from the caller's stack.
+    asm volatile (
+        \\ movl 4(%%esp),%%ecx
+        \\ movl 12(%%esp),%%edx
+        \\ pushl %%ebx
+        \\ pushl %%esi
+        \\ movl 24(%%esp),%%esi
+        \\ movl %%ecx,%%ebx
+        \\ movl 16(%%esp),%%ecx
+        \\ movl $435,%%eax // SYS_clone3
+        \\ int $128
+        \\ testl %%eax,%%eax
+        \\ jz 1f
+        \\ popl %%esi
+        \\ popl %%ebx
+        \\ retl
+        \\
+        \\1:
+        \\ .cfi_undefined %%eip
+        \\ xorl %%ebp,%%ebp
+        \\
+        \\ andl $-16,%%esp
+        \\ subl $12,%%esp
+        \\ pushl %%esi
+        \\ calll *%%edx
+        \\ movl %%eax,%%ebx
+        \\ movl $1,%%eax // SYS_exit
+        \\ int $128
+    );
+}
+
 pub fn restore() callconv(.Naked) noreturn {
     switch (@import("builtin").zig_backend) {
         .stage2_c => asm volatile (
diff --git a/lib/std/os/linux/x86_64.zig b/lib/std/os/linux/x86_64.zig
index 44a37345f0f4..c177c542d3d0 100644
--- a/lib/std/os/linux/x86_64.zig
+++ b/lib/std/os/linux/x86_64.zig
@@ -129,6 +129,36 @@ pub fn clone() callconv(.Naked) usize {
     );
 }
 
+/// `clone3` syscall wrapper. Called through the C calling convention as
+/// `fn (cl_args, size, func, arg) usize`; see `clone3` in `linux.zig`.
+/// The parent returns the raw syscall result. The child never returns: it
+/// calls `func(arg)` on a 16-byte aligned stack and passes the result to
+/// `SYS_exit`.
+pub fn clone3() callconv(.Naked) usize {
+    // On entry: %rdi=cl_args, %rsi=size, %rdx=func, %rcx=arg. The first two are
+    // already where the syscall expects them. `syscall` clobbers %rcx, so arg
+    // is parked in %r8 until the child needs it.
+    asm volatile (
+        \\ movl $435,%%eax // SYS_clone3
+        \\ movq %%rcx,%%r8
+        \\ syscall
+        \\ testq %%rax,%%rax
+        \\ jz 1f
+        \\ retq
+        \\
+        \\1: .cfi_undefined %%rip
+        \\ xorl %%ebp,%%ebp
+        \\ andq $-16,%%rsp
+        \\
+        \\ movq %%r8,%%rdi
+        \\ callq *%%rdx
+        \\ movl %%eax,%%edi
+        \\ movl $60,%%eax // SYS_exit
+        \\ syscall
+        \\
+    );
+}
+
 pub const restore = restore_rt;
 
 pub fn restore_rt() callconv(.Naked) noreturn {
diff --git a/lib/std/process/Child.zig b/lib/std/process/Child.zig
index 61c5a64318fb..c86dd181af9a 100644
--- a/lib/std/process/Child.zig
+++ b/lib/std/process/Child.zig
@@ -724,10 +724,6 @@ fn spawnPosix(self: *ChildProcess) SpawnError!void {
                 immediateExit(spawnPosixChildHelper(@intFromPtr(&child_arg)));
             }
         } else {
-            var old_mask: posix.sigset_t = undefined;
-            posix.sigprocmask(posix.SIG.SETMASK, &linux.all_mask, &old_mask);
-            defer posix.sigprocmask(posix.SIG.SETMASK, &old_mask, null);
-            child_arg.sigmask = &old_mask;
             child_arg.ret_err = null;
             // Although the stack is fixed sized, we alloc it here,
             // because stack-smashing protection may have higher overhead than allocation.
@@ -737,13 +733,42 @@ fn spawnPosix(self: *ChildProcess) SpawnError!void {
             // For simplicity, we just align it to page boundary here.
             const stack = try self.allocator.alignedAlloc(u8, mem.page_size, stack_size);
             defer self.allocator.free(stack);
-            const rc = linux.clone(spawnPosixChildHelper, @intFromPtr(stack.ptr) + stack_size, linux.CLONE.VM | linux.CLONE.VFORK | linux.SIG.CHLD, @intFromPtr(&child_arg), null, 0, null);
-            pid_result = switch (posix.errno(rc)) {
-                .SUCCESS => @intCast(rc),
-                .AGAIN => return error.SystemResources,
-                .NOMEM => return error.SystemResources,
+
+            // Prefer clone3 with CLONE_CLEAR_SIGHAND: the child starts with its signal
+            // handlers reset, so this path does not block signals around the call.
+            // (The clone() fallback below has to, see the comment there.)
+            var clone_args = mem.zeroes(linux.clone_args);
+            clone_args.flags = linux.CLONE.VM | linux.CLONE.VFORK | linux.CLONE.CLEAR_SIGHAND;
+            clone_args.exit_signal = linux.SIG.CHLD;
+            // clone3 is given the base address of the stack and its size, while
+            // clone() below is given the address one past its end.
+            clone_args.stack = @intFromPtr(stack.ptr);
+            clone_args.stack_size = stack_size;
+            var rc = linux.clone3(&clone_args, @sizeOf(linux.clone_args), spawnPosixChildHelper, @intFromPtr(&child_arg));
+            switch (posix.errno(rc)) {
+                .SUCCESS => {},
+                .AGAIN, .NOMEM => return error.SystemResources,
+                // NOSYS: clone3 is unavailable (kernel or architecture).
+                // INVAL: e.g. a kernel with clone3 that rejects CLONE_CLEAR_SIGHAND.
+                // PERM: e.g. a seccomp filter that denies clone3 but allows clone().
+                .INVAL, .NOSYS, .PERM => {
+                    // Fallback to use clone().
+                    // We need to block signals here because we share VM with child before exec.
+                    // Signal handlers may mess up our memory.
+                    var old_mask: posix.sigset_t = undefined;
+                    posix.sigprocmask(posix.SIG.SETMASK, &linux.all_mask, &old_mask);
+                    defer posix.sigprocmask(posix.SIG.SETMASK, &old_mask, null);
+                    child_arg.sigmask = &old_mask;
+                    rc = linux.clone(spawnPosixChildHelper, @intFromPtr(stack.ptr) + stack_size, linux.CLONE.VM | linux.CLONE.VFORK | linux.SIG.CHLD, @intFromPtr(&child_arg), null, 0, null);
+                    switch (posix.errno(rc)) {
+                        .SUCCESS => {},
+                        .AGAIN, .NOMEM => return error.SystemResources,
+                        else => |err| return posix.unexpectedErrno(err),
+                    }
+                },
                 else => |err| return posix.unexpectedErrno(err),
-            };
+            }
+            pid_result = @intCast(rc);
             if (child_arg.ret_err) |err| {
                 return err;
             }