diff --git a/lib/std/os/linux.zig b/lib/std/os/linux.zig
index 85a98a574f66..20906f4633c0 100644
--- a/lib/std/os/linux.zig
+++ b/lib/std/os/linux.zig
@@ -87,6 +87,46 @@ pub fn clone(
     ) callconv(.C) usize, @ptrCast(&syscall_bits.clone))(func, stack, flags, arg, ptid, tp, ctid);
 }
 
+/// Argument block for the clone3 syscall, passed by pointer to `clone3`.
+/// Every field is a u64, so there is no padding between fields.
+pub const clone_args = extern struct {
+    flags: u64,
+    pidfd: u64,
+    child_tid: u64,
+    parent_tid: u64,
+    exit_signal: u64,
+    stack: u64,
+    stack_size: u64,
+    tls: u64,
+    set_tid: u64,
+    set_tid_size: u64,
+    cgroup: u64,
+};
+
+/// Runs the clone3 syscall through the architecture's naked `clone3` stub.
+/// The caller (parent) receives the raw syscall return value. In the child the
+/// stub calls `func(arg)` and exits with the value `func` returned.
+/// Fails to compile on targets whose `syscall_bits` has no `clone3`.
+pub fn clone3(
+    cl_args: *const clone_args,
+    size: usize,
+    func: *const fn (arg: usize) callconv(.C) u8,
+    arg: usize,
+) usize {
+    // TODO: write asm for other arch.
+    if (@hasDecl(syscall_bits, "clone3")) {
+        // Can't directly call a naked function; cast to C calling convention first.
+        return @as(*const fn (
+            cl_args: *const clone_args,
+            size: usize,
+            func: *const fn (arg: usize) callconv(.C) u8,
+            arg: usize,
+        ) callconv(.C) usize, @ptrCast(&syscall_bits.clone3))(cl_args, size, func, arg);
+    } else {
+        @compileError("clone3() implementation has not been written for this target");
+    }
+}
+
 pub const ARCH = arch_bits.ARCH;
 pub const Elf_Symndx = arch_bits.Elf_Symndx;
 pub const F = arch_bits.F;
@@ -1710,24 +1750,24 @@ pub fn sigprocmask(flags: u32, noalias set: ?*const sigset_t, noalias oldset: ?*
     return syscall4(.rt_sigprocmask, flags, @intFromPtr(set), @intFromPtr(oldset), NSIG / 8);
 }
 
-pub fn sigaction(sig: u6, noalias act: ?*const Sigaction, noalias oact: ?*Sigaction) usize {
-    assert(sig >= 1);
-    assert(sig != SIG.KILL);
-    assert(sig != SIG.STOP);
-
+pub fn sigaction(sig: u8, noalias act: ?*const Sigaction, noalias oact: ?*Sigaction) usize {
     var ksa: k_sigaction = undefined;
     var oldksa: k_sigaction = undefined;
     const mask_size = @sizeOf(@TypeOf(ksa.mask));
 
     if (act) |new| {
-        const restorer_fn = if ((new.flags & SA.SIGINFO) != 0) &restore_rt else &restore;
-        ksa = k_sigaction{
-            .handler = new.handler.handler,
-            .flags = new.flags | SA.RESTORER,
-            .mask = undefined,
-            .restorer = @ptrCast(restorer_fn),
-        };
-        @memcpy(@as([*]u8, @ptrCast(&ksa.mask))[0..mask_size], @as([*]const u8, @ptrCast(&new.mask)));
+        ksa.handler = new.handler.handler;
+        if (ksa.handler == SIG.DFL or ksa.handler == SIG.IGN) {
+            // No user handler runs, so no signal-return trampoline is installed.
+            ksa.flags = new.flags;
+        } else {
+            const restorer_fn = if ((new.flags & SA.SIGINFO) != 0) &restore_rt else &restore;
+            ksa.flags = new.flags | SA.RESTORER;
+            ksa.restorer = @ptrCast(restorer_fn);
+        }
+        // Always pass the caller's mask on, even for SIG.DFL/SIG.IGN: the kernel
+        // stores it and hands it back through `oact` on a later call.
+        @memcpy(@as([*]u8, @ptrCast(&ksa.mask))[0..mask_size], @as([*]const u8, @ptrCast(&new.mask)));
     }
 
     const ksa_arg = if (act != null) @intFromPtr(&ksa) else 0;
diff --git a/lib/std/os/linux/aarch64.zig b/lib/std/os/linux/aarch64.zig
index db304a3a33b4..cb9f93465e3b 100644
--- a/lib/std/os/linux/aarch64.zig
+++ b/lib/std/os/linux/aarch64.zig
@@ -133,6 +133,28 @@ pub fn clone() callconv(.Naked) usize {
     );
 }
 
+/// Naked clone3 stub, reached through `linux.clone3` with C-ABI arguments
+/// (x0 = cl_args, x1 = size, x2 = func, x3 = arg). The parent returns the
+/// syscall result; the child calls `func(arg)` and exits with its return value.
+pub fn clone3() callconv(.Naked) usize {
+    asm volatile (
+        \\      mov x8,#435 // SYS_clone3
+        \\      svc #0
+        \\
+        \\      cbz x0,1f
+        \\      ret
+        \\
+        \\1:    .cfi_undefined lr
+        \\      mov fp, 0
+        \\      mov lr, 0
+        \\
+        \\      mov x0,x3
+        \\      blr x2
+        \\      mov x8,#93 // SYS_exit
+        \\      svc #0
+    );
+}
+
 pub const restore = restore_rt;
 
 pub fn restore_rt() callconv(.Naked) noreturn {
diff --git a/lib/std/os/linux/arm.zig b/lib/std/os/linux/arm.zig
index e9263b5cea85..c936156577a7 100644
--- a/lib/std/os/linux/arm.zig
+++ b/lib/std/os/linux/arm.zig
@@ -134,6 +134,32 @@ pub fn clone() callconv(.Naked) usize {
     );
 }
 
+/// Naked clone3 stub, reached through `linux.clone3` with C-ABI arguments
+/// (r0 = cl_args, r1 = size, r2 = func, r3 = arg). The parent returns the
+/// syscall result; the child calls `func(arg)` via a `bl`/`bx` trampoline, so
+/// that `func` returns here, and exits with its return value.
+pub fn clone3() callconv(.Naked) usize {
+    asm volatile (
+        \\    stmfd sp!,{r7}
+        \\    mov r7,#435 // SYS_clone3
+        \\    svc 0
+        \\    tst r0,r0
+        \\    beq 1f
+        \\    ldmfd sp!,{r7}
+        \\    bx lr
+        \\
+        \\    // https://github.com/llvm/llvm-project/issues/115891
+        \\1:  mov r11, #0
+        \\    mov lr, #0
+        \\    mov r0,r3
+        \\    bl 3f
+        \\    mov r7,#1 // SYS_exit
+        \\    svc 0
+        \\
+        \\3:  bx r2
+    );
+}
+
 pub fn restore() callconv(.Naked) noreturn {
     switch (@import("builtin").zig_backend) {
         .stage2_c => asm volatile (
diff --git a/lib/std/os/linux/thumb.zig b/lib/std/os/linux/thumb.zig
index a464030858b2..9ae546a2c385 100644
--- a/lib/std/os/linux/thumb.zig
+++ b/lib/std/os/linux/thumb.zig
@@ -142,6 +142,7 @@ pub fn syscall6(
 }
 
 pub const clone = @import("arm.zig").clone;
+pub const clone3 = @import("arm.zig").clone3;
 
 pub fn restore() callconv(.Naked) noreturn {
     asm volatile (
diff --git a/lib/std/os/linux/x86.zig b/lib/std/os/linux/x86.zig
index cb746e52a8dc..5b74a39244f4 100644
--- a/lib/std/os/linux/x86.zig
+++ b/lib/std/os/linux/x86.zig
@@ -167,6 +167,39 @@ pub fn clone() callconv(.Naked) usize {
     );
 }
 
+/// Naked clone3 stub, reached through `linux.clone3` with its four C-ABI
+/// arguments (cl_args, size, func, arg) on the stack. The parent returns the
+/// syscall result; the child calls `func(arg)` and exits with its return value.
+pub fn clone3() callconv(.Naked) usize {
+    asm volatile (
+        \\  pushl %%ebx
+        \\  pushl %%esi
+        \\  movl 12(%%esp),%%ebx
+        \\  movl 16(%%esp),%%ecx
+        \\  movl 20(%%esp),%%edx
+        \\  movl 24(%%esp),%%esi
+        \\  movl $435,%%eax // SYS_clone3
+        \\  int $128
+        \\  testl %%eax,%%eax
+        \\  jz 1f
+        \\  popl %%esi
+        \\  popl %%ebx
+        \\  retl
+        \\
+        \\1:
+        \\  .cfi_undefined %%eip
+        \\  xorl %%ebp,%%ebp
+        \\
+        \\  andl $-16,%%esp
+        \\  subl $12,%%esp
+        \\  pushl %%esi
+        \\  calll *%%edx
+        \\  movl %%eax,%%ebx
+        \\  movl $1,%%eax // SYS_exit
+        \\  int $128
+    );
+}
+
 pub fn restore() callconv(.Naked) noreturn {
     switch (@import("builtin").zig_backend) {
         .stage2_c => asm volatile (
diff --git a/lib/std/os/linux/x86_64.zig b/lib/std/os/linux/x86_64.zig
index 44a37345f0f4..c177c542d3d0 100644
--- a/lib/std/os/linux/x86_64.zig
+++ b/lib/std/os/linux/x86_64.zig
@@ -129,6 +129,30 @@ pub fn clone() callconv(.Naked) usize {
     );
 }
 
+/// Naked clone3 stub, reached through `linux.clone3` with C-ABI arguments
+/// (rdi = cl_args, rsi = size, rdx = func, rcx = arg). The parent returns the
+/// syscall result; the child calls `func(arg)` and exits with its return value.
+pub fn clone3() callconv(.Naked) usize {
+    asm volatile (
+        \\      movl $435,%%eax // SYS_clone3
+        \\      movq %%rcx,%%r8
+        \\      syscall
+        \\      testq %%rax,%%rax
+        \\      jz 1f
+        \\      retq
+        \\
+        \\1:    .cfi_undefined %%rip
+        \\      xorl %%ebp,%%ebp
+        \\
+        \\      movq %%r8,%%rdi
+        \\      callq *%%rdx
+        \\      movl %%eax,%%edi
+        \\      movl $60,%%eax // SYS_exit
+        \\      syscall
+        \\
+    );
+}
+
 pub const restore = restore_rt;
 
 pub fn restore_rt() callconv(.Naked) noreturn {
diff --git a/lib/std/process/Child.zig b/lib/std/process/Child.zig
index 72277c2e627b..bdf4260e50b7 100644
--- a/lib/std/process/Child.zig
+++ b/lib/std/process/Child.zig
@@ -15,6 +15,10 @@ const native_os = builtin.os.tag;
 const Allocator = std.mem.Allocator;
 const ChildProcess = @This();
 
+/// True when `spawnPosix` creates the child with clone3()/clone() rather than
+/// fork(); `err_pipe` is unused in that mode.
+const use_clone = native_os == .linux and builtin.zig_backend != .stage2_c;
+
 pub const Id = switch (native_os) {
     .windows => windows.HANDLE,
     .wasi => void,
@@ -27,6 +31,10 @@ pub const Id = switch (native_os) {
 id: Id,
 thread_handle: if (native_os == .windows) windows.HANDLE else void,
 
+/// Linux only. pidfd of the child, or null when clone3() was unavailable
+/// (older kernels). Closed and reset to null once the child has been reaped.
+pid_fd: ?posix.fd_t,
+
 allocator: mem.Allocator,
 
 /// The writing end of the child process's standard input pipe.
@@ -73,7 +81,7 @@ cwd: ?[]const u8,
 /// Once that is done, `cwd` will be deprecated in favor of this field.
 cwd_dir: ?fs.Dir = null,
 
-err_pipe: ?if (native_os == .windows) void else [2]posix.fd_t,
+err_pipe: ?if (native_os == .windows or use_clone) void else [2]posix.fd_t,
 
 expand_arg0: Arg0Expand,
 
@@ -211,6 +219,7 @@ pub fn init(argv: []const []const u8, allocator: mem.Allocator) ChildProcess {
         .argv = argv,
         .id = undefined,
         .thread_handle = undefined,
+        .pid_fd = null,
         .err_pipe = null,
         .term = null,
         .env_map = null,
@@ -289,10 +298,24 @@ pub fn killPosix(self: *ChildProcess) !Term {
         self.cleanupStreams();
         return term;
     }
-    posix.kill(self.id, posix.SIG.TERM) catch |err| switch (err) {
-        error.ProcessNotFound => return error.AlreadyTerminated,
-        else => return err,
-    };
+    if (self.pid_fd) |pid_fd| {
+        if (native_os == .linux) {
+            // Prefer the pidfd: unlike a PID it cannot refer to a recycled process.
+            switch (linux.E.init(linux.pidfd_send_signal(pid_fd, posix.SIG.TERM, null, 0))) {
+                .SUCCESS => {},
+                .SRCH => return error.AlreadyTerminated,
+                .PERM => return error.PermissionDenied,
+                else => |err| return posix.unexpectedErrno(err),
+            }
+        } else {
+            unreachable;
+        }
+    } else {
+        posix.kill(self.id, posix.SIG.TERM) catch |err| switch (err) {
+            error.ProcessNotFound => return error.AlreadyTerminated,
+            else => return err,
+        };
+    }
     self.waitUnwrapped();
     return self.term.?;
 }
@@ -303,6 +326,7 @@ pub const WaitError = SpawnError || std.os.windows.GetProcessMemoryInfoError;
 pub fn wait(self: *ChildProcess) WaitError!Term {
     const term = if (native_os == .windows) try self.waitWindows() else self.waitPosix();
     self.id = undefined;
+    self.pid_fd = null;
     return term;
 }
 
@@ -449,6 +473,38 @@ fn waitUnwrappedWindows(self: *ChildProcess) WaitError!void {
 
 fn waitUnwrapped(self: *ChildProcess) void {
     const res: posix.WaitPidResult = res: {
+        if (self.pid_fd) |pid_fd| {
+            if (native_os == .linux) {
+                var info: linux.siginfo_t = undefined;
+                var ru: linux.rusage = undefined;
+                while (true) {
+                    switch (linux.E.init(linux.syscall5(.waitid, @intFromEnum(linux.P.PIDFD), @intCast(pid_fd), @intFromPtr(&info), linux.W.EXITED, @intFromPtr(&ru)))) {
+                        .SUCCESS => break,
+                        .INTR => continue,
+                        else => unreachable,
+                    }
+                }
+                // The child is reaped; the pidfd has no further use, so release it
+                // here rather than leaking one descriptor per spawned child.
+                posix.close(pid_fd);
+                self.pid_fd = null;
+                if (self.request_resource_usage_statistics) {
+                    self.resource_usage_statistics.rusage = ru;
+                }
+                const status: u32 = @bitCast(info.fields.common.second.sigchld.status);
+                break :res posix.WaitPidResult{
+                    .pid = info.fields.common.first.piduid.pid,
+                    .status = switch (info.code) {
+                        1 => (status & 0xff) << 8, // CLD_EXITED
+                        2, 3 => status & 0x7f, // CLD_KILLED, CLD_DUMPED
+                        else => unreachable,
+                    },
+                };
+            } else {
+                unreachable;
+            }
+        }
+
         if (self.request_resource_usage_statistics) {
             switch (native_os) {
                 .linux, .macos, .ios => {
@@ -488,27 +544,10 @@ fn cleanupStreams(self: *ChildProcess) void {
 }
 
 fn cleanupAfterWait(self: *ChildProcess, status: u32) !Term {
-    if (self.err_pipe) |err_pipe| {
-        defer destroyPipe(err_pipe);
+    if (!use_clone) {
+        if (self.err_pipe) |err_pipe| {
+            defer destroyPipe(err_pipe);
 
-        if (native_os == .linux) {
-            var fd = [1]posix.pollfd{posix.pollfd{
-                .fd = err_pipe[0],
-                .events = posix.POLL.IN,
-                .revents = undefined,
-            }};
-
-            // Check if the eventfd buffer stores a non-zero value by polling
-            // it, that's the error code returned by the child process.
-            _ = posix.poll(&fd, 0) catch unreachable;
-
-            // According to eventfd(2) the descriptor is readable if the counter
-            // has a value greater than 0
-            if ((fd[0].revents & posix.POLL.IN) != 0) {
-                const err_int = try readIntFd(err_pipe[0]);
-                return @as(SpawnError, @errorCast(@errorFromInt(err_int)));
-            }
-        } else {
             // Write maxInt(ErrInt) to the write end of the err_pipe. This is after
             // waitpid, so this write is guaranteed to be after the child
             // pid potentially wrote an error. This way we can do a blocking
@@ -538,6 +577,79 @@ fn statusToTerm(status: u32) Term {
         Term{ .Unknown = status };
 }
 
+/// How the child reports a pre-exec failure: a slot in shared memory when
+/// `use_clone` is set, otherwise the write end of the error pipe.
+const RetErr = if (use_clone) ?SpawnError else posix.fd_t;
+
+/// Everything `spawnPosixChildHelper` needs, bundled so a single pointer can be
+/// passed through the `usize` argument of clone3()/clone().
+const ChildArg = struct {
+    self: *ChildProcess,
+    stdin_pipe_0: posix.fd_t,
+    stdout_pipe_1: posix.fd_t,
+    stderr_pipe_1: posix.fd_t,
+    prog_pipe_1: posix.fd_t,
+    dev_null_fd: posix.fd_t,
+    argv_buf: [:null]?[*:0]const u8,
+    envp: [*:null]const ?[*:0]const u8,
+    sigmask: ?*posix.sigset_t,
+    ret_err: RetErr,
+};
+
+/// Runs in the child: wires up stdio, cwd, ids and signal state, then execs.
+/// Only returns if something failed; the failure is recorded through
+/// `forkChildErrReport` and the returned value is the child's exit code.
+fn spawnPosixChildHelper(arg: usize) callconv(.c) u8 {
+    const child_arg: *ChildArg = @ptrFromInt(arg);
+    const prog_fileno = 3;
+
+    setUpChildIo(child_arg.self.stdin_behavior, child_arg.stdin_pipe_0, posix.STDIN_FILENO, child_arg.dev_null_fd) catch |err| return forkChildErrReport(&child_arg.ret_err, err);
+    setUpChildIo(child_arg.self.stdout_behavior, child_arg.stdout_pipe_1, posix.STDOUT_FILENO, child_arg.dev_null_fd) catch |err| return forkChildErrReport(&child_arg.ret_err, err);
+    setUpChildIo(child_arg.self.stderr_behavior, child_arg.stderr_pipe_1, posix.STDERR_FILENO, child_arg.dev_null_fd) catch |err| return forkChildErrReport(&child_arg.ret_err, err);
+
+    if (child_arg.self.cwd_dir) |cwd| {
+        posix.fchdir(cwd.fd) catch |err| return forkChildErrReport(&child_arg.ret_err, err);
+    } else if (child_arg.self.cwd) |cwd| {
+        posix.chdir(cwd) catch |err| return forkChildErrReport(&child_arg.ret_err, err);
+    }
+
+    // Must happen after fchdir above, the cwd file descriptor might be
+    // equal to prog_fileno and be clobbered by this dup2 call.
+    if (child_arg.prog_pipe_1 != -1) posix.dup2(child_arg.prog_pipe_1, prog_fileno) catch |err| return forkChildErrReport(&child_arg.ret_err, err);
+
+    if (child_arg.self.gid) |gid| {
+        posix.setregid(gid, gid) catch |err| return forkChildErrReport(&child_arg.ret_err, err);
+    }
+
+    if (child_arg.self.uid) |uid| {
+        posix.setreuid(uid, uid) catch |err| return forkChildErrReport(&child_arg.ret_err, err);
+    }
+
+    if (child_arg.self.pgid) |pid| {
+        posix.setpgid(0, pid) catch |err| return forkChildErrReport(&child_arg.ret_err, err);
+    }
+
+    if (native_os == .linux and child_arg.sigmask != null) {
+        std.debug.assert(linux.SIG.DFL == null);
+        for (1..linux.NSIG) |sig| {
+            var old_act: linux.Sigaction = undefined;
+            const new_act = mem.zeroes(posix.Sigaction);
+            // Rejected for SIGKILL/SIGSTOP; don't inspect `old_act` unless the call succeeded.
+            if (linux.E.init(linux.sigaction(@intCast(sig), &new_act, &old_act)) != .SUCCESS) continue;
+            if (old_act.handler.handler == linux.SIG.IGN) {
+                _ = linux.sigaction(@intCast(sig), &old_act, null);
+            }
+        }
+        std.debug.assert(linux.sigprocmask(linux.SIG.SETMASK, child_arg.sigmask, null) == 0);
+    }
+
+    const err = switch (child_arg.self.expand_arg0) {
+        .expand => posix.execvpeZ_expandArg0(.expand, child_arg.argv_buf.ptr[0].?, child_arg.argv_buf.ptr, child_arg.envp),
+        .no_expand => posix.execvpeZ_expandArg0(.no_expand, child_arg.argv_buf.ptr[0].?, child_arg.argv_buf.ptr, child_arg.envp),
+    };
+    return forkChildErrReport(&child_arg.ret_err, err);
+}
+
 fn spawnPosix(self: *ChildProcess) SpawnError!void {
     // The child process does need to access (one end of) these pipes. However,
     // we must initially set CLOEXEC to avoid a race condition. If another thread
@@ -639,51 +751,85 @@ fn spawnPosix(self: *ChildProcess) SpawnError!void {
     // This pipe is used to communicate errors between the time of fork
     // and execve from the child process to the parent process.
     const err_pipe = blk: {
-        if (native_os == .linux) {
-            const fd = try posix.eventfd(0, linux.EFD.CLOEXEC);
-            // There's no distinction between the readable and the writeable
-            // end with eventfd
-            break :blk [2]posix.fd_t{ fd, fd };
-        } else {
+        if (!use_clone) {
             break :blk try posix.pipe2(.{ .CLOEXEC = true });
+        } else {
+            break :blk [_]posix.fd_t{ -1, -1 };
         }
     };
     errdefer destroyPipe(err_pipe);
 
-    const pid_result = try posix.fork();
-    if (pid_result == 0) {
-        // we are the child
-        setUpChildIo(self.stdin_behavior, stdin_pipe[0], posix.STDIN_FILENO, dev_null_fd) catch |err| forkChildErrReport(err_pipe[1], err);
-        setUpChildIo(self.stdout_behavior, stdout_pipe[1], posix.STDOUT_FILENO, dev_null_fd) catch |err| forkChildErrReport(err_pipe[1], err);
-        setUpChildIo(self.stderr_behavior, stderr_pipe[1], posix.STDERR_FILENO, dev_null_fd) catch |err| forkChildErrReport(err_pipe[1], err);
-
-        if (self.cwd_dir) |cwd| {
-            posix.fchdir(cwd.fd) catch |err| forkChildErrReport(err_pipe[1], err);
-        } else if (self.cwd) |cwd| {
-            posix.chdir(cwd) catch |err| forkChildErrReport(err_pipe[1], err);
-        }
-
-        // Must happen after fchdir above, the cwd file descriptor might be
-        // equal to prog_fileno and be clobbered by this dup2 call.
-        if (prog_pipe[1] != -1) posix.dup2(prog_pipe[1], prog_fileno) catch |err| forkChildErrReport(err_pipe[1], err);
+    var child_arg = ChildArg{
+        .self = self,
+        .stdin_pipe_0 = stdin_pipe[0],
+        .stdout_pipe_1 = stdout_pipe[1],
+        .stderr_pipe_1 = stderr_pipe[1],
+        .prog_pipe_1 = prog_pipe[1],
+        .dev_null_fd = dev_null_fd,
+        .argv_buf = argv_buf,
+        .envp = envp,
+        .sigmask = null,
+        .ret_err = undefined,
+    };
 
-        if (self.gid) |gid| {
-            posix.setregid(gid, gid) catch |err| forkChildErrReport(err_pipe[1], err);
+    var pid_result: posix.pid_t = undefined;
+    if (!use_clone) {
+        child_arg.ret_err = err_pipe[1];
+        pid_result = try posix.fork();
+        if (pid_result == 0) {
+            immediateExit(spawnPosixChildHelper(@intFromPtr(&child_arg)));
         }
-
-        if (self.uid) |uid| {
-            posix.setreuid(uid, uid) catch |err| forkChildErrReport(err_pipe[1], err);
+    } else {
+        child_arg.ret_err = null;
+        // Although the stack is fixed sized, we alloc it here,
+        // because stack-smashing protection may have higher overhead than allocation.
+        const stack_size = 0x8000;
+        // On aarch64, stack address must be a multiple of 16.
+        const stack = try self.allocator.alignedAlloc(u8, 16, stack_size);
+        defer self.allocator.free(stack);
+
+        var clone_args = mem.zeroes(linux.clone_args);
+        var pid_fd: posix.fd_t = undefined;
+        clone_args.flags = linux.CLONE.VM | linux.CLONE.VFORK | linux.CLONE.CLEAR_SIGHAND | linux.CLONE.PIDFD;
+        clone_args.exit_signal = linux.SIG.CHLD;
+        clone_args.stack = @intFromPtr(stack.ptr);
+        clone_args.stack_size = stack_size;
+        clone_args.pidfd = @intFromPtr(&pid_fd);
+        var rc = linux.clone3(&clone_args, @sizeOf(linux.clone_args), spawnPosixChildHelper, @intFromPtr(&child_arg));
+        switch (linux.E.init(rc)) {
+            .SUCCESS => {
+                self.pid_fd = pid_fd;
+            },
+            .AGAIN, .NOMEM => return error.SystemResources,
+            .INVAL, .NOSYS => {
+                // Fallback to use clone().
+                // We need to block signals here because we share VM with child before exec.
+                // Signal handlers may mess up our memory.
+                var old_mask: posix.sigset_t = undefined;
+                std.debug.assert(linux.sigprocmask(linux.SIG.SETMASK, &linux.all_mask, &old_mask) == 0);
+                defer std.debug.assert(linux.sigprocmask(linux.SIG.SETMASK, &old_mask, null) == 0);
+                child_arg.sigmask = &old_mask;
+                rc = linux.clone(spawnPosixChildHelper, @intFromPtr(stack.ptr) + stack_size, linux.CLONE.VM | linux.CLONE.VFORK | linux.SIG.CHLD, @intFromPtr(&child_arg), null, 0, null);
+                switch (linux.E.init(rc)) {
+                    .SUCCESS => {},
+                    .AGAIN, .NOMEM => return error.SystemResources,
+                    else => |err| return posix.unexpectedErrno(err),
+                }
+            },
+            else => |err| return posix.unexpectedErrno(err),
         }
-
-        if (self.pgid) |pid| {
-            posix.setpgid(0, pid) catch |err| forkChildErrReport(err_pipe[1], err);
+        pid_result = @intCast(rc);
+        if (child_arg.ret_err) |err| {
+            // The child failed before exec and is exiting (CLONE_VFORK only lets
+            // us continue once it has exec'd or exited). Reap it and drop the
+            // pidfd so neither a zombie nor a descriptor is left behind.
+            _ = posix.waitpid(pid_result, 0);
+            if (self.pid_fd) |fd| {
+                posix.close(fd);
+                self.pid_fd = null;
+            }
+            return err;
         }
-
-        const err = switch (self.expand_arg0) {
-            .expand => posix.execvpeZ_expandArg0(.expand, argv_buf.ptr[0].?, argv_buf.ptr, envp),
-            .no_expand => posix.execvpeZ_expandArg0(.no_expand, argv_buf.ptr[0].?, argv_buf.ptr, envp),
-        };
-        forkChildErrReport(err_pipe[1], err);
     }
 
     // we are the parent
@@ -705,7 +851,9 @@ fn spawnPosix(self: *ChildProcess) SpawnError!void {
     }
 
     self.id = pid;
-    self.err_pipe = err_pipe;
+    if (!use_clone) {
+        self.err_pipe = err_pipe;
+    }
     self.term = null;
 
     if (self.stdin_behavior == .Pipe) {
@@ -1012,19 +1160,28 @@ fn destroyPipe(pipe: [2]posix.fd_t) void {
     if (pipe[0] != pipe[1]) posix.close(pipe[1]);
 }
 
-// Child of fork calls this to report an error to the fork parent.
-// Then the child exits.
-fn forkChildErrReport(fd: i32, err: ChildProcess.SpawnError) noreturn {
-    writeIntFd(fd, @as(ErrInt, @intFromError(err))) catch {};
+/// Terminates the process with `exitcode` without running atexit handlers.
+fn immediateExit(exitcode: u8) noreturn {
     // If we're linking libc, some naughty applications may have registered atexit handlers
     // which we really do not want to run in the fork child. I caught LLVM doing this and
     // it caused a deadlock instead of doing an exit syscall. In the words of Avril Lavigne,
     // "Why'd you have to go and make things so complicated?"
     if (builtin.link_libc) {
         // The _exit(2) function does nothing but make the exit syscall, unlike exit(3)
-        std.c._exit(1);
+        std.c._exit(exitcode);
+    }
+    posix.exit(exitcode);
+}
+
+// Child calls this to report an error to the parent, through the error pipe
+// or, with `use_clone`, the shared `RetErr` slot. Returns the child's exit code.
+fn forkChildErrReport(ret_err: *RetErr, err: ChildProcess.SpawnError) u8 {
+    if (!use_clone) {
+        writeIntFd(ret_err.*, @as(ErrInt, @intFromError(err))) catch {};
+    } else {
+        ret_err.* = err;
     }
-    posix.exit(1);
+    return 1;
 }
 
 fn writeIntFd(fd: i32, value: ErrInt) !void {