//
// Syd: rock-solid application kernel
// src/syd.rs: Main entry point
//
// Copyright (c) 2023, 2024, 2025, 2026 Ali Polatel <alip@chesswob.org>
// Proxying code is based in part upon socksns crate which is:
//     Copyright (c) 2020 Steven Engler
//     SPDX-License-Identifier: MIT
//
// SPDX-License-Identifier: GPL-3.0

//! Syd: rock-solid application kernel
//! Main entry point.

// We like clean and simple code with documentation.
// Keep in sync with lib.rs.
#![forbid(clippy::as_ptr_cast_mut)]
#![forbid(clippy::cast_ptr_alignment)]
#![deny(missing_docs)]
#![deny(clippy::arithmetic_side_effects)]
#![deny(clippy::as_underscore)]
#![deny(clippy::assertions_on_result_states)]
#![deny(clippy::borrow_as_ptr)]
#![deny(clippy::branches_sharing_code)]
#![deny(clippy::case_sensitive_file_extension_comparisons)]
#![deny(clippy::cast_lossless)]
#![deny(clippy::cast_possible_truncation)]
#![deny(clippy::cast_possible_wrap)]
#![deny(clippy::cast_precision_loss)]
#![deny(clippy::cast_sign_loss)]
#![deny(clippy::checked_conversions)]
#![deny(clippy::clear_with_drain)]
#![deny(clippy::clone_on_ref_ptr)]
#![deny(clippy::cloned_instead_of_copied)]
#![deny(clippy::cognitive_complexity)]
#![deny(clippy::collection_is_never_read)]
#![deny(clippy::copy_iterator)]
#![deny(clippy::create_dir)]
#![deny(clippy::dbg_macro)]
#![deny(clippy::debug_assert_with_mut_call)]
#![deny(clippy::decimal_literal_representation)]
#![deny(clippy::default_trait_access)]
#![deny(clippy::default_union_representation)]
#![deny(clippy::derive_partial_eq_without_eq)]
#![deny(clippy::doc_link_with_quotes)]
#![deny(clippy::doc_markdown)]
#![deny(clippy::explicit_into_iter_loop)]
#![deny(clippy::explicit_iter_loop)]
#![deny(clippy::fallible_impl_from)]
#![deny(clippy::missing_safety_doc)]
#![deny(clippy::undocumented_unsafe_blocks)]

use std::{
    env,
    env::VarError,
    ffi::OsString,
    fs::OpenOptions,
    io::{stdin, stdout, BufWriter, Write},
    os::{
        fd::{AsFd, AsRawFd, BorrowedFd, IntoRawFd, OwnedFd},
        unix::{ffi::OsStrExt, fs::OpenOptionsExt},
    },
    process::{exit, ExitCode},
    str::FromStr,
};

use data_encoding::HEXLOWER;
use libseccomp::{scmp_cmp, ScmpAction, ScmpFilterContext, ScmpSyscall};
use memchr::arch::all::is_equal;
use nix::{
    errno::Errno,
    fcntl::OFlag,
    sched::{unshare, CloneFlags},
    sys::{
        resource::Resource,
        wait::{Id, WaitPidFlag},
    },
    unistd::{fchdir, getgid, getpid, getuid, isatty, Pid},
};
use syd::{
    bins::{pty::pty_bin_main, tor::tor_bin_main},
    caps,
    compat::{set_name, set_no_new_privs, waitid, ResolveFlag, WaitStatus},
    config::*,
    confine::{
        confine_landlock_scope, confine_rlimit_zero, confine_scmp_madvise, confine_scmp_wx_syd,
        secure_getenv, ExportMode,
    },
    err::err2no,
    error,
    fd::{closeexcept, fdclone},
    fs::{format_clone_flags, format_clone_names},
    hash::{get_at_random_hex, hash, HashAlgorithm},
    hook::Supervisor,
    ignore_signals, info,
    landlock_policy::LandlockPolicy,
    log::log_init,
    lookup::safe_open_path,
    namespace::{
        ns_setup_net, ns_setup_pid, ns_setup_time, ns_setup_tor, ns_setup_user, ns_setup_uts,
    },
    path::XPathBuf,
    proc::proc_open,
    pty::pty_setup,
    rng::duprand,
    sandbox::Sandbox,
    seal::ensure_sealed,
    set_sigpipe_dfl, syd_code_name, syd_info,
    syslog::LogLevel,
    IgnoreSignalOpts,
};

// Set global allocator to GrapheneOS allocator.
//
// This item is compiled in only when all of the following hold:
// - this is not a coverage build (`coverage` cfg is unset),
// - the `prof` feature is disabled (profiling selects tcmalloc instead),
// - the target OS is not Android,
// - the target architecture is not riscv64,
// - the `target_page_size_4k` cfg is set
//   (NOTE(review): presumably emitted by the build script; confirm), and
// - pointers are 64 bits wide.
// On any other configuration this item registers no allocator.
#[cfg(all(
    not(coverage),
    not(feature = "prof"),
    not(target_os = "android"),
    not(target_arch = "riscv64"),
    target_page_size_4k,
    target_pointer_width = "64"
))]
#[global_allocator]
static GLOBAL: hardened_malloc::HardenedMalloc = hardened_malloc::HardenedMalloc;

// Set global allocator to tcmalloc if profiling is enabled.
//
// The `prof` feature gate here is the exact negation of the
// `not(feature = "prof")` condition on the hardened_malloc allocator,
// so at most one `#[global_allocator]` named `GLOBAL` is compiled in.
#[cfg(feature = "prof")]
#[global_allocator]
static GLOBAL: tcmalloc::TCMalloc = tcmalloc::TCMalloc;

syd::main! {
    use lexopt::prelude::*;

    // Get process name in argv[0] and multicall utilities:
    //    - syd-pty(1) for PTY sandboxing.
    //    - syd-tor(1) for Proxy sandboxing.
    let name = env::args_os().next();
    if let Some(name) = name {
        let name = name.as_bytes();
        if is_equal(name, b"syd-pty") {
            return Ok(pty_bin_main());
        } else if is_equal(name, b"syd-tor") {
            return Ok(tor_bin_main());
        }
    }

    // Initialize logging.
    log_init(LogLevel::Warn, Some(libc::STDERR_FILENO))?;

    // Set process name, ignore errors.
    let _ = set_name(c"syd");

    // Parse CLI options.
    //
    // Note, option parsing is POSIXly correct:
    // POSIX recommends that no more options are parsed after the first
    // positional argument. The other arguments are then all treated as
    // positional arguments.
    // See: https://pubs.opengroup.org/onlinepubs/9699919799/basedefs/V1_chap12.html#tag_12_02
    let mut parser = lexopt::Parser::from_env();
    let is_login = parser
        .bin_name()
        .map(|name| name.starts_with('-'))
        .unwrap_or(false);
    let mut is_quick = env::var_os(ENV_QUICK_BOOT).is_some();

    // Handle quick options early before reexecution for convenience.
    if !is_login {
        if let Some(raw) = parser.try_raw_args() {
            if let Some(Some(arg)) = raw.peek().map(|arg| arg.to_str()) {
                match arg {
                    "-h" | "--help" => {
                        set_sigpipe_dfl()?;
                        help();
                        return Ok(ExitCode::SUCCESS);
                    }
                    "-C" | "--check" => {
                        set_sigpipe_dfl()?;
                        syd_info(true)?;
                        return Ok(ExitCode::SUCCESS);
                    }
                    "-V" | "--version" => {
                        set_sigpipe_dfl()?;
                        syd_info(false)?;
                        return Ok(ExitCode::SUCCESS);
                    }
                    "--el" => {
                        set_sigpipe_dfl()?;
                        stdout().write_all(SYD_EL.as_bytes())?;
                        return Ok(ExitCode::SUCCESS);
                    }
                    "--sh" => {
                        set_sigpipe_dfl()?;
                        stdout().write_all(ESYD_SH.as_bytes())?;
                        return Ok(ExitCode::SUCCESS);
                    }
                    "--api" => {
                        set_sigpipe_dfl()?;
                        #[expect(clippy::disallowed_methods)]
                        let api = serde_json::to_string_pretty(&*syd::api::API_SPEC).expect("JSON");
                        stdout().write_all(api.as_bytes())?;
                        return Ok(ExitCode::SUCCESS);
                    }
                    "-q" => is_quick = true,
                    _ => {}
                }
            }
        }
    }

    // Set NO_NEW_PRIVS as early as possible.
    set_no_new_privs()?;

    // Apply a landlock(7) scope sandbox to restrict:
    // 1. ptrace(2) attach outside landlock(7).
    // 2. Signal send outside landlock(7).
    // 3. We leave path and network restrictions for Landlock
    //    to be configured by the user using Lock sandboxing.
    // 4. We do this before memfd-reexec to add an additional
    //    guard against proc(5) havoc.
    if let Err(errno) = confine_landlock_scope() {
        error!("ctx": "landlock_scope",
            "err": errno as i32,
            "msg": format!("landlock scope failed: {errno}"),
            "tip": "submit a bug report");
        return Err(errno.into());
    }

    // Guard against CVE-2019-5736:
    // Copy /proc/self/exe in an anonymous fd (created via memfd_create), seal it and re-execute it.
    // See:
    // - https://github.com/opencontainers/runc/commit/0a8e4117e7f715d5fbeef398405813ce8e88558b
    // - https://github.com/lxc/lxc/commit/6400238d08cdf1ca20d49bafb85f4e224348bf9d
    // Note: syd's procfs protections are another layer of defense against this.
    #[expect(clippy::disallowed_methods)]
    let cookie = if !is_quick {
        match env::var(ENV_RAND) {
            Ok(cookie0) => {
                // Best-effort ensure cookie0 was not tampered.
                assert_eq!(cookie0.len(), 32,
                    "PANIC: Internal environment variable {ENV_RAND} tampered by user!");
                assert!(cookie0.bytes().all(|b| b.is_ascii_hexdigit() && !b.is_ascii_uppercase()),
                    "PANIC: Internal environment variable {ENV_RAND} tampered by user!");
                let cookie1 = get_at_random_hex(false);
                env::set_var(ENV_RAND, format!("{cookie0}{cookie1}"));
                info!("ctx": "set_random_cookie",
                    "cookie": [&cookie0, &cookie1], "src": "AT_RANDOM",
                    "msg": format!("appended random cookie from AT_RANDOM {cookie0}+{cookie1}={cookie0}{cookie1} after memfd-reexec"));
            }
            Err(VarError::NotPresent) => {
                let cookie = get_at_random_hex(false);
                env::set_var(ENV_RAND, &cookie);
                info!("ctx": "set_random_cookie",
                    "cookie": &cookie, "src": "AT_RANDOM",
                    "msg": format!("set random cookie from AT_RANDOM to {cookie}"));
            }
            Err(VarError::NotUnicode(cookie)) => {
                error!("ctx": "set_random_cookie",
                    "cookie": &cookie, "src": "AT_RANDOM", "err": libc::EINVAL,
                    "msg": format!("get random cookie from {ENV_RAND} failed: {}", Errno::EINVAL));
            }
        }

        match ensure_sealed() {
            Ok(()) => env::var(ENV_RAND).unwrap(),
            Err(errno) => {
                error!("ctx": "memfd_reexec",
                    "err": errno as i32,
                    "msg": format!("reexecute self with a sealed memfd failed: {errno}"),
                    "tip": "set SYD_QUICK_BOOT and/or submit a bug report");
                return Err(errno.into());
            }
        }
    } else {
        // See seal.rs for the other branch.
        // Rest is handled in unshare/child.rs
        match env::var_os("RUST_BACKTRACE") {
            Some(val) => env::set_var("SYD_RUST_BACKTRACE", val),
            None => env::remove_var("SYD_RUST_BACKTRACE"),
        };
        if secure_getenv(ENV_SKIP_SCMP).is_none() {
            env::set_var("RUST_BACKTRACE", "0");
        }
        env::set_var(ENV_RAND, get_at_random_hex(false));
        env::var(ENV_RAND).unwrap()
    };

    // Generate unique sandbox id from AT_RANDOM bytes.
    // Allow the user to override by setting SYD_ID.
    // Panic if SYD_ID is incorrectly formatted.
    #[expect(clippy::disallowed_methods)]
    if let Some(sandbox_id) = env::var_os(ENV_ID) {
        assert_eq!(sandbox_id.len(), 128,
            "PANIC: Sandbox ID in SYD_ID environment variable isn't in correct format!");
        assert!(sandbox_id.as_bytes().iter().all(|b| b.is_ascii_hexdigit() && !b.is_ascii_uppercase()),
            "PANIC: Sandbox ID in SYD_ID environment variable isn't in correct format!");
        let machine_id = &sandbox_id.as_bytes()[..32];
        assert!(machine_id.iter().any(|&b| b != b'0'),
            "PANIC: Sandbox ID in SYD_ID environment variable isn't in correct format!");
    } else {
        let sandbox_id = HEXLOWER.encode(&hash(cookie.as_bytes(), HashAlgorithm::Sha512).unwrap());
        env::set_var(ENV_ID, &sandbox_id);
        info!("ctx": "set_sandbox_id",
            "id": &sandbox_id, "cookie": &cookie, "hash": "sha3-512",
            "msg": format!("generated syd id:{sandbox_id} from cookie:{cookie} using SHA3-512"));
    }

    // SYD_PID_FN -> Write PID file.
    if let Some(pid_fn) = env::var_os(ENV_PID_FN).map(XPathBuf::from) {
        let pid = getpid().as_raw();

        let mut pid_str = itoa::Buffer::new();
        let pid_str = pid_str.format(pid);

        let mut openopts = OpenOptions::new();
        openopts
            .mode(0o400)
            .write(true)
            .create_new(true);
        #[expect(clippy::disallowed_methods)]
        let mut pid_file = match openopts.open(&pid_fn).map(BufWriter::new) {
            Ok(pid_file) => pid_file,
            Err(error) => {
                let errno = err2no(&error);
                error!("ctx": "write_pid_file",
                    "pid_file": &pid_fn, "err": errno as i32,
                    "msg": format!("pid file create error: {error}"),
                    "tip": format!("remove file `{pid_fn}' or unset SYD_PID_FN"));
                return Err(error.into());
            }
        };

        match pid_file.write_all(pid_str.as_bytes()) {
            Ok(_) => {
                info!("ctx": "write_pid_file",
                    "msg": format!("Syd pid {pid} written to file `{pid_fn}'"),
                    "pid_file": &pid_fn);
            }
            Err(error) => {
                let errno = err2no(&error);
                error!("ctx": "write_pid_file",
                    "pid_fn": &pid_fn, "err": errno as i32,
                    "msg": format!("pid file write error: {error}"),
                    "tip": format!("remove file `{pid_fn}' or unset SYD_PID_FN"));
                return Err(error.into());
            }
        }
    }

    // Parse CLI arguments
    let mut export: Option<ExportMode> = ExportMode::from_env();
    let mut sandbox: Sandbox = Sandbox::default();
    let mut cmd_arg0: Option<OsString> = None;
    let mut cmd_argv: Vec<OsString> = vec![];

    // SYD_PROXY_{HOST,PORT,UNIX} -> proxy/ext/{host,port,unix}
    #[expect(clippy::disallowed_methods)]
    match env::var(ENV_PROXY_HOST) {
        Ok(host) => sandbox
            .config(&format!("proxy/ext/host:{host}"))
            .expect(ENV_PROXY_HOST),
        Err(env::VarError::NotPresent) => {}
        Err(error) => panic!("Invalid UTF-8 in {ENV_PROXY_HOST}: {error}"),
    };
    #[expect(clippy::disallowed_methods)]
    match env::var(ENV_PROXY_PORT) {
        Ok(port) => sandbox
            .config(&format!("proxy/ext/port:{port}"))
            .expect(ENV_PROXY_PORT),
        Err(env::VarError::NotPresent) => {}
        Err(error) => panic!("Invalid UTF-8 in {ENV_PROXY_PORT}: {error}"),
    };
    #[expect(clippy::disallowed_methods)]
    match env::var(ENV_PROXY_UNIX) {
        Ok(unix) => sandbox
            .config(&format!("proxy/ext/unix:{unix}"))
            .expect(ENV_PROXY_UNIX),
        Err(env::VarError::NotPresent) => {}
        Err(error) => panic!("Invalid UTF-8 in {ENV_PROXY_UNIX}: {error}"),
    };

    // Initialize Options.
    let mut user_parse = false;
    let user_done = if is_login
        || parser
            .try_raw_args()
            .map(|raw| raw.peek().is_none())
            .unwrap_or(true)
    {
        sandbox.parse_profile(b"user")?;
        true
    } else {
        false
    };

    // Determine default shell to execute.
    let mut is_rbash_def = false;
    #[expect(clippy::disallowed_methods)]
    let sh: Vec<_> = match env::var(ENV_SH) {
        Ok(val) => shell_words::split(&val),
        Err(VarError::NotPresent) => {
            is_rbash_def = true;
            shell_words::split(SYD_SH)
        }
        Err(error) => {
            error!("ctx": "parse_shell", "op": "get_environment",
                "msg": format!("detected invalid unicode in {ENV_SH}: {error}"),
                "tip": format!("unset {ENV_SH} environment variable"));
            return Err(error.into());
        }
    }?.into_iter().map(OsString::from).collect();
    if sh.is_empty() {
        error!("ctx": "parse_shell", "op": "split_shell",
            "msg": format!("detected empty {ENV_SH}"),
            "tip": format!("unset {ENV_SH} environment variable"));
        return Err(shell_words::ParseError.into());
    }

    // Local options handled by this function.
    while let Some(arg) = parser.next()? {
        match arg {
            /*
             * Basic options
             */
            Short('h') | Long("help") => {
                set_sigpipe_dfl()?;
                help();
                return Ok(ExitCode::SUCCESS);
            }
            Short('C') | Long("check") => {
                set_sigpipe_dfl()?;
                syd_info(true)?;
                return Ok(ExitCode::SUCCESS);
            }
            // syd -V is called often by paludis.
            // We want to keep its output short and parseable.
            Short('V') | Long("version") => {
                set_sigpipe_dfl()?;
                syd_info(false)?;
                return Ok(ExitCode::SUCCESS);
            }
            Short('v') | Long("verbose") => sandbox.increase_verbosity(),
            Long("el") => {
                set_sigpipe_dfl()?;
                stdout().write_all(SYD_EL.as_bytes())?;
                return Ok(ExitCode::SUCCESS);
            }
            Long("sh") => {
                set_sigpipe_dfl()?;
                stdout().write_all(ESYD_SH.as_bytes())?;
                return Ok(ExitCode::SUCCESS);
            }
            Long("api") => {
                set_sigpipe_dfl()?;
                #[expect(clippy::disallowed_methods)]
                let api = serde_json::to_string_pretty(&*syd::api::API_SPEC).expect("JSON");
                stdout().write_all(api.as_bytes())?;
                return Ok(ExitCode::SUCCESS);
            }
            Short('q') => {} // Ignore, must be first!

            /*
             * Sandbox options
             */
            Short('E') => {
                export = Some(
                    parser
                        .value()?
                        .parse::<String>()
                        .map(|arg| ExportMode::from_str(&arg))??,
                );
            }
            Short('m') => {
                let cmd = parser.value().map(XPathBuf::from)?;
                if sandbox.is_locked() {
                    eprintln!("Failed to execute magic command `{cmd}': sandbox locked!");
                    return Err(Errno::EPERM.into());
                } else {
                    sandbox.config(&cmd.to_string())?;
                }
            }
            Short('t') => {
                let tmout = parser.value()
                    .ok()
                    .and_then(|ostr| ostr.into_string().ok())
                    .ok_or(Errno::EINVAL)?;
                if sandbox.is_locked() {
                    eprintln!("Failed to set sandbox timeout: sandbox locked!");
                    return Err(Errno::EPERM.into());
                } else {
                    sandbox.config(&format!("timeout:{tmout}"))?;
                }
            }
            Short('x') => sandbox.parse_profile(b"trace")?,
            Short('f') => {
                // Login shell compatibility:
                // Parse user profile as necessary.
                user_parse = true;
            }
            Short('l') | Long("login") => {
                // Login shell compatibility:
                // Parse user profile as necessary.
                user_parse = true;
            }
            Short('c') => {
                // When multiple -c arguments are given,
                // only the first one is honoured and
                // the rest are ignored, consistent
                // with how bash and dash behave.
                user_parse = true;
                if cmd_argv.is_empty() {
                    cmd_argv.extend(sh.clone());
                    cmd_argv.push(OsString::from("-c"));
                    cmd_argv.push(parser.value()?);
                }
            }
            Short('P') => {
                let path = parser.value().map(XPathBuf::from)?;
                if sandbox.is_locked() {
                    eprintln!("Failed to parse config file `{path}': sandbox locked!");
                    return Err(Errno::EPERM.into());
                }
                sandbox.parse_config_file(&path)?;
            }
            /* We keep --profile for syd-1 compatibility.
             * It's undocumented. */
            Short('p') | Long("profile") => {
                let profile = parser.value()?.parse::<String>()?;
                if sandbox.is_locked() {
                    eprintln!("Failed to parse profile `{profile}': sandbox locked!");
                    return Err(Errno::EPERM.into());
                }
                sandbox.parse_profile(profile.as_bytes())?;
            }

            /*
             * Unshare options
             */
            Short('a') => cmd_arg0 = Some(parser.value()?),
            Short('e') => {
                let value = parser.value()?.parse::<String>()?;
                match value.split_once('=') {
                    Some((var, val)) => {
                        sandbox.env_add_pass(var)?;
                        if !val.is_empty() {
                            // This way we give the user the chance to pass-through
                            // denylisted environment variables e.g.
                            //      syd -eLD_LIBRARY_PATH= cmd
                            // is equivalent to
                            //      syd -eLD_LIBRARY_PATH=$LD_LIBRARY_PATH cmd
                            env::set_var(var, val);
                        }
                    }
                    None => {
                        sandbox.env_del_pass(&value)?;
                        env::remove_var(value);
                    }
                }
            }

            // Profiling options.
            #[cfg(feature = "prof")]
            Long("prof") => match parser.value()?.parse::<String>()?.as_str() {
                "cpu" => env::set_var("SYD_PROF", "cpu"),
                "mem" => env::set_var("SYD_PROF", "mem"),
                val => {
                    eprintln!("Invalid profile mode `{val}'!");
                    eprintln!("Expected exactly one of `cpu' or `mem'!");
                    help();
                    return Ok(ExitCode::FAILURE);
                }
            },

            Value(prog) => {
                cmd_argv.push(prog);
                cmd_argv.extend(parser.raw_args()?);
            }
            _ => return Err(arg.unexpected().into()),
        }
    }

    if let Some(export_mode) = export {
        // SYD_DUMP_SCMP makes setup_seccomp_parent print rules.
        // In addition per-thread filters are printed out.
        match export_mode {
            ExportMode::BerkeleyPacketFilter => env::set_var(ENV_DUMP_SCMP, "bpf"),
            ExportMode::PseudoFiltercode => env::set_var(ENV_DUMP_SCMP, "pfc"),
        }

        // Note, we do not intervene with sandbox policy here, and let
        // the user configure it through other means. This way the user
        // can dump seccomp filters for different set of options.
    } else {
        env::remove_var(ENV_DUMP_SCMP);
    }

    if user_parse && !user_done && !sandbox.is_locked() {
        sandbox.parse_profile(b"user")?;
    }

    // Prepare the command to execute, which may be a login shell.
    let mut is_rbash = env::var_os(ENV_CD).is_some();
    if cmd_argv.is_empty() {
        cmd_argv = sh;
        if cmd_arg0.is_none() {
            // Allow user to override with -a.
            cmd_arg0 = Some(OsString::from("-"));
        }
        if is_rbash_def && export.is_none() {
            is_rbash = true;
        }
    }
    let argv0 = cmd_argv.remove(0);

    // Ignore all signals except the following signals:
    // SIGALRM, SIGCHLD, SIGKILL, SIGSTOP.
    // Skip ignoring signals with default action Core,
    // if trace/allow_unsafe_prlimit:1 is set at startup.
    let mut opts = IgnoreSignalOpts::SkipIgnoreAlarm;
    if sandbox.options.allow_unsafe_prlimit() {
        opts.insert(IgnoreSignalOpts::SkipIgnoreCoreDump);
    }
    ignore_signals(opts).inspect_err(|errno| {
        error!("ctx": "ignore_signals",
            "opt": opts, "err": *errno as i32,
            "msg": format!("ignore signals failed: {errno}"),
            "tip": "check with SYD_LOG=debug and/or submit a bug report");
    })?;
    info!("ctx": "ignore_signals",
        "opt": opts, "msg": "ignored all signals for signal safety");

    // SAFETY: We cannot support NEWPID without NEWNS.
    // ie, pid namespace must have its own private /proc.
    if sandbox.options.unshare_pid() {
        sandbox.set_unshare_mount(true);
    }
    if sandbox.options.unshare_mount() {
        sandbox.set_unshare_pid(true);
    }

    let has_ns_user = sandbox.options.unshare_user();
    let has_pid_max = sandbox.options.unshare_pid() && sandbox.has_pid() && sandbox.pid_max > 0;
    let has_ns_time = sandbox.options.unshare_time();

    // Save original UID/GID to map inside new user namespace.
    let (uid, gid) = if has_ns_user {
        (Some(getuid()), Some(getgid()))
    } else {
        (None, None)
    };

    // Open /proc safely as necessary.
    // unshare/user:1 -> Need to write UID/GID mappings.
    // unshare/pid:1  -> Need to write pid_max sysctl.
    // unshare/time:1 -> Need to write time namespace offsets.
    let fd_proc = if is_rbash || has_ns_user || has_pid_max || has_ns_time {
        let fd = proc_open().inspect_err(|errno| {
            error!("ctx": "setup_namespaces", "op": "open_procfs",
                "err": *errno as i32,
                "msg": format!("open /proc filesystem failed: {errno}"),
                "tip": "mount procfs on top of /proc directory");
        })?;

        Some(fd)
    } else {
        None
    };

    // Switch to safe directory for rbash.
    #[expect(clippy::disallowed_methods)]
    if is_rbash {
        let mut pfd = XPathBuf::from_pid(Pid::this())?;
        pfd.push(b"fdinfo");

        let fd_proc = fd_proc.as_ref().unwrap();
        safe_open_path(fd_proc, &pfd, OFlag::empty(), ResolveFlag::RESOLVE_NO_XDEV).inspect_err(|errno| {
            error!("ctx": "setup_restricted_shell", "op": "open_procfs",
                "err": *errno as i32,
                "msg": format!("open /proc filesystem failed: {errno}"),
                "tip": "mount procfs on top of /proc directory");
        }).and_then(fchdir).inspect_err(|errno| {
            error!("ctx": "setup_restricted_shell", "op": "chdir_procfs",
                "err": *errno as i32,
                "msg": format!("change dir to /proc filesystem failed: {errno}"),
                "tip": "mount procfs on top of /proc directory");
        })?;
    }

    // Set up PTY sandboxing.
    let pty_child = if sandbox.has_pty()
        && isatty(stdin()).unwrap_or(false)
        && isatty(stdout()).unwrap_or(false)
    {
        let pty_debug = secure_getenv("SYD_PTY_DEBUG").is_some();
        let pty_child = pty_setup(sandbox.pty_ws_x(), sandbox.pty_ws_y(), pty_debug)?;

        let mut buf = itoa::Buffer::new();
        env::set_var(ENV_PTY_FD, buf.format(pty_child.as_raw_fd()));

        Some(pty_child)
    } else {
        env::remove_var(ENV_PTY_FD);
        None
    };

    let proxy_debug = secure_getenv("SYD_TOR_DEBUG").is_some();
    let proxy = if sandbox.has_proxy() {
        // sandbox/proxy:on implies unshare/net:1.
        sandbox.set_unshare_net(true);

        // Set up syd-tor.
        Some(ns_setup_tor(
                sandbox.proxy_ext_addr,
                sandbox.proxy_ext_port,
                sandbox.proxy_ext_unix.as_deref(),
                sandbox.proxy_repr().as_str(),
                proxy_debug)?)
    } else {
        None
    };

    // Set up Linux namespaces if requested. Note,
    // we set it up here before spawning the child so as to
    // include the Syd process into the pid namespace as well
    // such that the sandbox process and syd have the identical
    // view of /proc.
    let namespaces = sandbox.options.namespaces();
    if namespaces == 0 {
        // Drop /proc fd which may be open due to is_rbash.
        drop(fd_proc);

        // No namespace arguments passed, run normally.
        return match Supervisor::run(
            sandbox,
            pty_child,
            &argv0,
            cmd_argv,
            cmd_arg0,
        ) {
            Ok(code) => Ok(ExitCode::from(code)),
            Err(error) => {
                let errno = error.errno().unwrap_or(Errno::ENOSYS);
                error!("ctx": "run", "op": "run_supervisor",
                    "msg": format!("failed to run supervisor: {error:?}"),
                    "tip": "check with SYD_LOG=debug and/or submit a bug report");
                Ok(ExitCode::from(u8::try_from(errno as i32).unwrap_or(127)))
            }
        };
    }

    // Tell the kernel to keep the capabilities after the unshare call.
    // This is important because unshare(2) can change the user
    // namespace, which often leads to a loss of capabilities.
    caps::securebits::set_keepcaps(true)?;

    // CLONE_NEWTIME may only be used with unshare(2).
    // CloneFlags don't support CLONE_NEWTIME directly so we use retain.
    let clone_flags = CloneFlags::from_bits_retain(namespaces);
    let clone_names = format_clone_flags(clone_flags);
    let clone_types = format_clone_names(&clone_names);
    unshare(clone_flags).inspect_err(|errno| {
        error!("ctx": "unshare_namespaces",
            "ns": &clone_names, "err": *errno as i32,
            "msg": format!("unshare into {clone_types} failed: {errno}"),
            "tip": "check with SYD_LOG=debug and/or set `unshare/user:1'");
    })?;
    info!("ctx": "setup_namespaces", "op": "unshare", "ns": &clone_names,
        "msg": format!("unshared into {clone_types}"));
    drop(clone_names);
    drop(clone_types);

    // Set up user namespace.
    #[expect(clippy::disallowed_methods)]
    if has_ns_user {
        let fd_proc = fd_proc.as_ref().unwrap();
        ns_setup_user(fd_proc, uid.unwrap(), gid.unwrap(), sandbox.options.map_root())?;
    }

    // Set up PID namespace.
    #[expect(clippy::disallowed_methods)]
    if has_pid_max {
        let fd_proc = fd_proc.as_ref().unwrap();
        ns_setup_pid(fd_proc, sandbox.pid_max)?;
    }

    // Set up time namespace.
    if has_ns_time {
        #[expect(clippy::disallowed_methods)]
        let fd_proc = fd_proc.as_ref().unwrap();
        ns_setup_time(fd_proc, sandbox.boottime, sandbox.monotime)?;
    }
    drop(fd_proc); // drop /proc fd.

    // Set up network namespace.
    if sandbox.options.unshare_net() {
        ns_setup_net(proxy.as_ref(), sandbox.proxy_addr, sandbox.proxy_port, proxy_debug)?;
    }
    drop(proxy); // close syd-tor end.

    // Set up UTS namespace.
    if sandbox.options.unshare_uts() {
        ns_setup_uts(sandbox.hostname.as_deref(), sandbox.domainname.as_deref())?;
    }

    // Disable Speculative Store Bypass mitigations
    // for trace/allow_unsafe_exec_speculative:1
    let ssb = sandbox.options.allow_unsafe_exec_speculative();

    // Prepare stack for new Syd process.
    // SAFETY: Heap-allocate the clone stack to avoid caller's
    // stack frame, which may overflow RLIMIT_STACK.
    let mut stack = Vec::new();
    let stack_siz = MAIN_STACK_SIZE.try_into().or(Err(Errno::EOVERFLOW))?;
    stack.try_reserve(stack_siz).or(Err(Errno::ENOMEM))?;
    stack.resize(stack_siz, 0);

    // Bundle of everything the cloned child needs to start the supervisor.
    // It is boxed, turned into a raw pointer and handed to the clone
    // callback, which passes the fields on to `Supervisor::run`.
    struct SydChildInfo {
        // Sandbox configuration assembled from the environment and CLI options.
        sandbox: Sandbox,
        // File descriptor returned by `pty_setup`; `None` unless PTY
        // sandboxing is on and both stdin and stdout are terminals.
        pty_child: Option<OwnedFd>,
        // First element taken off the command vector.
        argv0: OsString,
        // Remaining elements of the command vector after `argv0` was removed.
        cmd_argv: Vec<OsString>,
        // Optional argv[0] override: the value of `-a`, or "-" when the
        // default shell is started without an explicit override.
        cmd_arg0: Option<OsString>,
    }

    // Run the supervisor inside child process.
    // Syd will be the first process in new namespaces.
    extern "C" fn syd_child_after_clone(arg: *mut libc::c_void) -> libc::c_int {
        // Entry point of the cloned child: runs the supervisor with
        // the values packed in the SydChildInfo behind `arg` and
        // exits with the resulting status. Never returns.

        // SAFETY: arg is the pointer the caller obtained from
        // Box::into_raw on a SydChildInfo, so reconstructing the
        // Box here takes ownership of a valid allocation.
        let child_info: Box<SydChildInfo> = unsafe { Box::from_raw(arg.cast::<SydChildInfo>()) };

        let retval = match Supervisor::run(
            child_info.sandbox,
            child_info.pty_child,
            &child_info.argv0,
            child_info.cmd_argv,
            child_info.cmd_arg0,
        ) {
            Ok(retval) => i32::from(retval),
            Err(error) => {
                // Capture errno before printing, because writing
                // the message may overwrite it.
                let errno = Errno::last();
                eprintln!("{error:?}");
                // A failed run must never be reported as success:
                // when errno is unset (zero) fall back to a generic
                // failure status instead of exiting with 0.
                match errno as i32 {
                    0 => libc::EXIT_FAILURE,
                    code => code,
                }
            }
        };
        exit(retval);
    }

    // Prepare information for the Syd child.
    let child_info = Box::new(SydChildInfo {
        sandbox,
        pty_child,
        argv0,
        cmd_argv,
        cmd_arg0,
    });
    let child_info_ptr: *mut libc::c_void = Box::into_raw(child_info) as *mut libc::c_void;

    // SAFETY: Use clone(2) with CLONE_PIDFD to avoid pid recycling.
    let result = unsafe { fdclone(
            syd_child_after_clone,
            &mut stack,
            child_info_ptr,
            CloneFlags::empty(),
            Some(libc::SIGCHLD),
        )};

    // SAFETY: Reconstruct and drop parent's copy.
    drop(unsafe { Box::from_raw(child_info_ptr as *mut SydChildInfo) });

    let (pid_fd, _) = result?;

    // SAFETY: Randomize the pid FD for hardening.
    let pid_fd_rand = duprand(pid_fd.as_raw_fd(), OFlag::O_CLOEXEC)?;
    drop(pid_fd);

    // SAFETY: duprand returns a valid FD on success.
    // Create a BorrowedFd and NOT an OwnedFd, because this fd will
    // never be explicitly closed, it will be closed on exit. Moreover,
    // attempts to close any fd will fail due to the seccomp(2) filter
    // which will be applied before wait loop.
    let pid_fd = unsafe { BorrowedFd::borrow_raw(pid_fd_rand.into_raw_fd()) };

    // Close all file descriptors but stderr and pidfd.
    // Set must be sorted because pidfd may be lower than stderr.
    #[expect(clippy::cast_sign_loss)]
    let mut set = vec![
        libc::STDERR_FILENO as libc::c_uint,
        pid_fd.as_raw_fd() as libc::c_uint,
    ];
    set.sort_unstable();
    closeexcept(&set)?;
    drop(set);

    // Confine resource limits in the new process:
    // Set nfiles, nprocs, and filesize rlimits to zero.
    // Set locks, memory lock and msgqueue rlimits to zero.
    confine_rlimit_zero(&[
        Resource::RLIMIT_FSIZE,
        Resource::RLIMIT_NOFILE,
        Resource::RLIMIT_NPROC,
        Resource::RLIMIT_LOCKS,
        Resource::RLIMIT_MEMLOCK,
        Resource::RLIMIT_MSGQUEUE,
    ])?;

    // SAFETY: Set up a Landlock sandbox to disallow all access.
    let abi = syd::landlock::ABI::new_current();
    let policy = LandlockPolicy {
        scoped_abs: true,
        scoped_sig: true,

        ..Default::default()
    };
    let _ = policy.restrict_self(abi);

    // SAFETY: Set up a seccomp filter which only allows
    // 1. write to standard error.
    // 2. waitid and exit.
    // 3. memory allocation syscalls
    // 4. signal handling syscalls
    let mut ctx = ScmpFilterContext::new(ScmpAction::KillProcess)?;

    // Enforce the NO_NEW_PRIVS functionality before
    // loading the seccomp filter into the kernel.
    ctx.set_ctl_nnp(true)?;

    // Disable Speculative Store Bypass mitigations
    // with trace/allow_unsafe_exec_speculative:1
    ctx.set_ctl_ssb(ssb)?;

    // DO NOT synchronize filter to all threads.
    // Main thread will confine itself.
    ctx.set_ctl_tsync(false)?;

    // We kill for bad system call and bad arch.
    ctx.set_act_badarch(ScmpAction::KillProcess)?;

    // Use a binary tree sorted by syscall number if possible.
    let _ = ctx.set_ctl_optimize(2);

    // SAFETY: Do NOT add supported architectures to the filter.
    // This ensures Syd can never run a non-native system call,
    // which we do not need at all.
    // seccomp_add_architectures(&mut ctx)?;

    const ALLOW_SYSCALLS: &[&str] = &[
        "exit",
        "exit_group",
        "waitid",
        "brk",
        //"madvise", advice are confined.
        "mremap",
        "munmap",
        "sigaction",
        "sigaltstack",
        "sigpending",
        "sigprocmask",
        "sigsuspend",
        "sigreturn",
        "rt_sigaction",
        "rt_sigpending",
        "rt_sigprocmask",
        "rt_sigqueueinfo",
        "rt_sigreturn",
        "rt_sigtimedwait",
        "rt_sigtimedwait_time64",
        #[cfg(feature = "prof")]
        "getpid",
        #[cfg(feature = "prof")]
        "gettid",
    ];
    for name in ALLOW_SYSCALLS.iter().chain(VDSO_SYSCALLS) {
        if let Ok(syscall) = ScmpSyscall::from_name(name) {
            ctx.add_rule(ScmpAction::Allow, syscall)?;
        }
    }

    // Allow safe madvise(2) advice.
    confine_scmp_madvise(&mut ctx)?;

    // Allow write(2) to standard error.
    if let Ok(syscall) = ScmpSyscall::from_name("write") {
        ctx.add_rule_conditional(
            ScmpAction::Allow,
            syscall,
            &[scmp_cmp!($arg0 == libc::STDERR_FILENO as u64)],
        )?;
    }

    // Prevent executable memory.
    confine_scmp_wx_syd(&mut ctx)?;

    // Load the seccomp(2) filter.
    ctx.load()?;

    // All done, start the wait loop.
    loop {
        #[expect(clippy::cast_possible_truncation)]
        #[expect(clippy::cast_sign_loss)]
        break match waitid(Id::PIDFd(pid_fd.as_fd()), WaitPidFlag::WEXITED) {
            Ok(WaitStatus::Exited(_, code)) =>
            {
                #[expect(clippy::cast_possible_truncation)]
                #[expect(clippy::cast_sign_loss)]
                Ok(ExitCode::from(code as u8))
            }
            Ok(WaitStatus::Signaled(_, signal, _)) => {
                Ok(ExitCode::from(128_u8.saturating_add(signal as u8)))
            }
            Ok(WaitStatus::StillAlive) | Err(Errno::EINTR) => continue,
            Ok(_status) => Err(Errno::EINVAL.into()),
            Err(errno) => Err(errno.into()),
        };
    }
}

/// Print the usage summary to standard output.
///
/// ANSI escape sequences are emitted only when standard output is a
/// terminal; otherwise every colour code is the empty string.
fn help() {
    // A failed terminal check is treated as "not a terminal".
    let use_color = isatty(std::io::stdout()).unwrap_or(false);
    let esc = |code: &'static str| -> &'static str {
        if use_color {
            code
        } else {
            ""
        }
    };

    let c_blue = esc("\x1b[0;1;35;95m");
    let c_bold = esc("\x1b[1m");
    let c_cyan = esc("\x1b[0;1;36;96m");
    let c_green = esc("\x1b[0;1;32;92m");
    let c_orng = esc("\x1b[0;1;34;94m");
    let c_red = esc("\x1b[0;1;31;91m");
    let c_res = esc("\x1b[0m");
    let c_yll = esc("\x1b[0;1;33;93m");

    // Banner: name, version, code name, author and license.
    println!(
        "{c_red}syd{c_res} {c_cyan}{}{c_res} ({c_orng}{}{c_res})",
        *syd::config::VERSION,
        syd_code_name()
    );
    println!("{c_yll}Rock solid application kernel{c_res}");
    println!("{c_blue}Author:{c_res} {c_yll}Ali Polatel{c_res} <{c_bold}alip@chesswob.org{c_res}>");
    println!("{c_blue}License:{c_res} {c_yll}GPL-3.0-only{c_res}");
    println!();

    // Invocation forms, each followed by a one-line description.
    let usage: [(&str, &str); 5] = [
        (
            "syd [-acefhlmpqxEPV] [--] {command [arg...]}",
            "Run a program under Syd.",
        ),
        ("syd --api", "Print syd(2) API specification."),
        ("syd --check", "Print sandboxing support information."),
        (
            "syd --el",
            "Output syd.el the Emacs Lisp implementation of syd(2) interface.",
        ),
        (
            "syd --sh",
            "Output a shell script which defines the esyd helper function.",
        ),
    ];
    for (invocation, summary) in usage {
        println!("{c_green}$ {invocation}{c_res}");
        println!("  {c_bold}{summary}{c_res}");
    }
    println!();

    // Trailer: fixed text block followed by the bug-report notice.
    print!("{SEE_EMILY_PLAY}");
    println!();
    println!("{c_orng}Send bug reports to{c_res} {c_bold}https://gitlab.exherbo.org/groups/sydbox/-/issues{c_res}");
    println!("{c_orng}Attaching poems encourages consideration tremendously.{c_res}");
}
