From 988130ab449305f9111a8e97fe99873391e66ac3 Mon Sep 17 00:00:00 2001 From: Luther Monson Date: Sun, 31 May 2026 18:16:54 -0700 Subject: [PATCH 1/7] docs(arch): native macOS runner design --- docs/arch/native-macos-runner.md | 264 +++++++++++++++++++++++++++++++ 1 file changed, 264 insertions(+) create mode 100644 docs/arch/native-macos-runner.md diff --git a/docs/arch/native-macos-runner.md b/docs/arch/native-macos-runner.md new file mode 100644 index 00000000..be181a5a --- /dev/null +++ b/docs/arch/native-macos-runner.md @@ -0,0 +1,264 @@ +# Native macOS Runner + +> **Status: proposed.** Not yet implemented. + +## Problem + +macOS jobs currently run in per-job Virtualization.framework VMs (APFS +clone-on-write from a base image). This works but has hard limits: + +- Apple restricts macOS VMs to **2 concurrent instances** per host. +- Each VM needs **4 GB+ RAM** (2 GB absolute minimum, unusable in practice). +- An 8 GB Mac mini can run at most **2 concurrent macOS jobs**. +- VM boot adds **10-15 seconds** of overhead per job. + +For repos that don't need VM-level isolation (trusted internal CI, Xcode +builds, Go tests), a native execution mode that runs the GitHub Actions +runner directly on the host would allow **4-6+ concurrent jobs** on the +same hardware with zero boot overhead. + +## Proposal + +Add a **native** macOS execution mode alongside the existing VM mode. +The mode is configured per-repo. The VM path is untouched -- this is +purely additive. + +## Config design + +A new `[runner.macos]` section controls macOS job routing. It lives under +`[runner]` (not `[vm.macos]`) because native jobs don't involve VMs. + +```toml +[runner.macos] +mode = "vm" # default mode: "vm" or "native" +max_native = 4 # max concurrent native jobs (no Apple limit applies) + +# Per-repo overrides. Repo name matches github.repos entries. +[runner.macos.repos] +php-sdk = "native" +ephemerd = "native" +# Repos not listed here inherit the top-level mode. 
+``` + +Config struct additions in `pkg/config/config.go`: + +```go +type RunnerConfig struct { + // ... existing fields ... + MacOS MacOSRunnerConfig `toml:"macos"` +} + +type MacOSRunnerConfig struct { + Mode string `toml:"mode"` // "vm" (default) or "native" + MaxNative int `toml:"max_native"` // max concurrent native jobs (default 4) + Repos map[string]string `toml:"repos"` // repo -> "vm" or "native" +} +``` + +`MacOSRunnerConfig.ModeForRepo(repo)` returns `"native"` or `"vm"` by +checking the per-repo map first, then falling back to the top-level mode, +then defaulting to `"vm"`. + +### Why not extend `[runner.images]`? + +`[runner.images]` maps repos to OCI container images. Native macOS jobs +don't use container images at all -- they run directly on the host. Mixing +these two concepts in the same config block would be confusing. + +## Scheduler flow + +`handleQueued` already routes macOS jobs to `handleMacOSJob`. The change +adds a branch at the top of `handleMacOSJob`: + +``` +handleQueued + └─ isMacOSJob? + └─ handleMacOSJob + ├─ ModeForRepo == "native" → handleNativeMacOSJob (new) + │ └─ acquire nativeMacSem (max_native) + └─ ModeForRepo == "vm" → existing VM path + └─ acquire macSem (max 2) +``` + +A new semaphore `nativeMacSem` (capacity = `max_native`) is separate from +the existing `macSem` (VM concurrency, capped at 2 by Apple). This means +a host can run 2 VM jobs + 4 native jobs simultaneously if both modes are +in use. + +The `canHandleJob` check for `"macos"` labels also needs updating: +currently it requires `MacOSVMConfig != nil`. With native mode, macOS jobs +are handleable on darwin hosts even without a VM disk image, as long as +the runner config allows native mode for that repo. + +## Native runner lifecycle + +New package: `pkg/native/native_darwin.go` (build-tagged `darwin`). + +### 1. 
Create workspace + +``` +<dataDir>/native/<jobID>/ + ├── home/ → $HOME for the job + ├── tmp/ → $TMPDIR for the job + ├── work/ → runner _work directory + └── runner/ → per-job copy of the GHA runner binary +``` + +The runner is extracted from the embedded `pkg/runner` tarball into the +per-job directory. This is the same runner binary used by the VM path, +just extracted to a different location. + +### 2. Set up environment + +```go +env := []string{ + "HOME=" + jobHome, + "TMPDIR=" + jobTmp, + "RUNNER_WORK_FOLDER=" + jobWork, + "PATH=/opt/homebrew/bin:/usr/local/bin:/usr/bin:/bin:/usr/sbin:/sbin", + // Xcode: use host's installation + "DEVELOPER_DIR=/Applications/Xcode.app/Contents/Developer", +} +``` + +Host tooling (`/opt/homebrew`, `/Applications/Xcode.app`, `/usr/local`) +is shared read-only by virtue of the OS -- no bind mounts needed. Each +job just gets its own HOME/TMPDIR/work directory so outputs don't collide. + +### 3. Start runner + +```go +cmd := exec.CommandContext(ctx, "./run.sh", "--jitconfig", jitConfig) +cmd.Dir = runnerDir +cmd.Env = env +cmd.Stdout = logFile +cmd.Stderr = logFile +cmd.SysProcAttr = &syscall.SysProcAttr{Setpgid: true} // own process group +err := cmd.Start() +``` + +`Setpgid: true` puts the runner and all its children in a new process +group so we can `kill(-pgid, SIGTERM)` on cleanup. + +### 4. Wait for exit + +Block on `cmd.Wait()`. Return the exit code. + +### 5. Cleanup + +1. Kill the process group (`syscall.Kill(-pgid, SIGKILL)`) if still alive. +2. `pkill -9 -P <pid>` as a fallback for any orphaned children. +3. `os.RemoveAll(jobDir)` to delete the workspace. +4. Deregister the runner from the provider. 
+ +## Isolation model + +| Layer | Native | VM | +|-------|--------|----| +| Filesystem | Per-job HOME, TMPDIR, workdir | Full disk clone | +| Processes | Process group (`setpgid`), killed on cleanup | Separate kernel | +| Network | Shared host network, no isolation | NAT with firewall | +| Users | Shared `admin` user | Isolated `admin` user | +| Secrets | Environment vars only, cleared on exit | VM memory destroyed | + +### What native isolation provides + +- **Directory isolation**: each job gets its own HOME, TMPDIR, and work + directory. Jobs cannot see each other's files. +- **Process isolation**: `setpgid` + process group kill ensures no + orphaned processes survive between jobs. +- **Environment isolation**: each runner process gets a controlled set of + environment variables. No leakage from the daemon process. + +### What native isolation does NOT provide + +- **No network isolation.** macOS has no network namespaces. A malicious + job can reach the host network, other jobs' ports, and the metadata + service. The host-level firewall (`pfctl`) can block RFC1918 ranges but + cannot isolate jobs from each other. +- **No filesystem isolation beyond directories.** Jobs share the same + `/Applications`, `/opt/homebrew`, etc. A malicious job could modify + shared tools. Use `diskutil apfs addVolume` or a read-only system + volume for defense in depth. +- **No user isolation.** All jobs run as the same macOS user. A job can + `ps aux` and see other jobs' processes. + +### Mitigation: Apple sandbox profiles (future) + +macOS has `sandbox-exec` (deprecated but functional through macOS 15+) +and the App Sandbox entitlement system. A future enhancement could wrap +the runner process in a sandbox profile that: + +- Denies network access to localhost and RFC1918. +- Denies file writes outside the job directory. +- Denies process inspection (`proc_info`). + +This is explicitly deferred -- it requires testing across macOS versions +and may break runner functionality. 
+ +## Comparison table + +| Dimension | Native | VM | +|-----------|--------|----| +| Boot time | ~0s (fork+exec) | 10-15s | +| Memory per job | ~200 MB (runner process) | 4+ GB | +| Max concurrent (8 GB mini) | 4-6 | 2 | +| Isolation | Process group + directory | Full VM (separate kernel) | +| Network isolation | None | NAT + firewall | +| Security | Trusted repos only | Untrusted OK | +| Xcode/Homebrew | Shared from host | Pre-installed in base image | +| Setup complexity | Low (just extract runner) | High (IPSW install, clone) | +| Apple VM limit | Not applicable | 2 per host | + +## What changes + +### `pkg/config/config.go` + +Add `MacOSRunnerConfig` struct to `RunnerConfig`. Add `ModeForRepo(repo)` +method. + +### `pkg/scheduler/scheduler.go` + +- Add `nativeMacSem chan struct{}` field to `Scheduler`. +- Initialize from `cfg.Runner.MacOS.MaxNative` (default 4). +- Update `canHandleJob`: accept macOS jobs on darwin even without + `MacOSVMConfig` when native mode is configured for the repo. +- Split `handleMacOSJob`: check `ModeForRepo` and route to + `handleNativeMacOSJob` or the existing VM path. + +### New: `pkg/native/native_darwin.go` + +Native runner lifecycle: + +```go +type Runner struct { /* workspace paths, cmd, pgid */ } + +func New(dataDir string, jobID string, jitConfig string, log *slog.Logger) (*Runner, error) +func (r *Runner) Start(ctx context.Context) error +func (r *Runner) Wait(ctx context.Context) (int, error) +func (r *Runner) Stop() +``` + +A `native_other.go` stub returns errors on non-darwin platforms. + +### `cmd/ephemerd/runtime_darwin.go` + +Pass `cfg.Runner.MacOS` to the scheduler config so it can read per-repo +mode overrides. + +## Open questions + +1. **Shared Homebrew mutations.** If a job runs `brew install foo`, it + modifies the shared `/opt/homebrew`. Options: (a) accept it for trusted + repos, (b) overlay a per-job Homebrew prefix, (c) make `/opt/homebrew` + read-only and provide a per-job writable prefix. 
Start with (a), + revisit if it causes problems. + +2. **Keychain access.** macOS jobs may need the login keychain for code + signing. Native jobs share the host keychain. VM jobs each get their + own. For native mode, either create a per-job keychain or accept + shared access for trusted repos. + +3. **Concurrency limit tuning.** `max_native = 4` is a guess for 8 GB + Mac minis. Should we auto-detect based on available memory, or is a + static config sufficient? Start with static config. From 8c6536b8171bf0319430cc442f41ffd29f7a4cdd Mon Sep 17 00:00:00 2001 From: Luther Monson Date: Sun, 31 May 2026 18:26:08 -0700 Subject: [PATCH 2/7] =?UTF-8?q?docs(arch):=20resolve=20open=20questions=20?= =?UTF-8?q?=E2=80=94=20Homebrew=20overlay,=20per-job=20keychain,=20static?= =?UTF-8?q?=20concurrency?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- docs/arch/native-macos-runner.md | 99 ++++++++++++++++++++++++++------ 1 file changed, 83 insertions(+), 16 deletions(-) diff --git a/docs/arch/native-macos-runner.md b/docs/arch/native-macos-runner.md index be181a5a..965121cc 100644 --- a/docs/arch/native-macos-runner.md +++ b/docs/arch/native-macos-runner.md @@ -246,19 +246,86 @@ A `native_other.go` stub returns errors on non-darwin platforms. Pass `cfg.Runner.MacOS` to the scheduler config so it can read per-repo mode overrides. -## Open questions - -1. **Shared Homebrew mutations.** If a job runs `brew install foo`, it - modifies the shared `/opt/homebrew`. Options: (a) accept it for trusted - repos, (b) overlay a per-job Homebrew prefix, (c) make `/opt/homebrew` - read-only and provide a per-job writable prefix. Start with (a), - revisit if it causes problems. - -2. **Keychain access.** macOS jobs may need the login keychain for code - signing. Native jobs share the host keychain. VM jobs each get their - own. For native mode, either create a per-job keychain or accept - shared access for trusted repos. - -3. 
**Concurrency limit tuning.** `max_native = 4` is a guess for 8 GB - Mac minis. Should we auto-detect based on available memory, or is a - static config sufficient? Start with static config. +## Decisions + +### 1. Homebrew: per-job writable prefix over shared read-only base + +Jobs need `brew install` for build deps, but we can't let one job's +installs pollute another. The solution uses Homebrew's relocatable +architecture: + +**Host setup (one-time):** `/opt/homebrew` is pre-installed with common +tools (Go, mage, etc.) and marked read-only for the runner user. + +**Per-job overlay:** + +``` +/native// + └── homebrew/ → HOMEBREW_PREFIX, HOMEBREW_CELLAR, HOMEBREW_TEMP + ├── Cellar/ → per-job installs land here + ├── lib/ + ├── bin/ → symlinked from /opt/homebrew/bin at job start + └── Homebrew/ → lightweight Homebrew checkout (or symlink) +``` + +Environment for the runner process: + +```bash +HOMEBREW_PREFIX=/homebrew +HOMEBREW_CELLAR=/homebrew/Cellar +HOMEBREW_TEMP=/tmp +PATH=/homebrew/bin:/opt/homebrew/bin:/usr/local/bin:... +``` + +How it works: + +1. At job start, create `/homebrew/bin` and symlink all + executables from `/opt/homebrew/bin` into it. This gives the job + read access to pre-installed tools. +2. Set `HOMEBREW_PREFIX` and `HOMEBREW_CELLAR` to the per-job dir. + Any `brew install` writes to the job's Cellar, not the host's. +3. The job's `homebrew/bin` is first in PATH, so newly installed + formulas shadow the host versions if there's a conflict. +4. At job end, `rm -rf ` deletes everything — installs, + caches, temp files. + +**Why not a full Homebrew clone?** Cloning the Homebrew repo takes +~10 seconds and ~500 MB. Symlinking the host's existing install is +instant and zero-copy. The job only needs a writable prefix for new +installs. + +**Why not just share `/opt/homebrew` read-write?** Jobs would step on +each other. One job upgrading a formula mid-build could break another +job. Per-job prefix keeps them independent. + +### 2. 
Keychain: per-job temporary keychain + +Each native job gets its own temporary keychain: + +```bash +KEYCHAIN=/keychain/job.keychain-db +security create-keychain -p "" "$KEYCHAIN" +security default-keychain -s "$KEYCHAIN" +security unlock-keychain -p "" "$KEYCHAIN" +``` + +At cleanup: + +```bash +security delete-keychain "$KEYCHAIN" +``` + +This prevents jobs from accessing each other's signing identities and +avoids polluting the host login keychain. Jobs that need code signing +import their certs into the per-job keychain via `security import` +(standard GitHub Actions pattern — `apple-actions/import-codesign-certs` +does exactly this). + +### 3. Concurrency: static config, default 4 + +`max_native = 4` is the default. Operators set it based on their +hardware. No auto-detection — the right value depends on workload +(CPU-heavy Xcode builds want fewer, lightweight Go tests want more). + +The value only caps native macOS jobs. Linux jobs (in the VM) and +macOS VM jobs have their own separate limits. From 54bcaa2d35ffca4eeb3208c364b65bc67acebf79 Mon Sep 17 00:00:00 2001 From: Luther Monson Date: Sun, 31 May 2026 19:04:46 -0700 Subject: [PATCH 3/7] =?UTF-8?q?docs(arch):=20add=20sandbox-exec=20profile?= =?UTF-8?q?=20=E2=80=94=20network=20deny,=20port=20bind=20deny,=20filesyst?= =?UTF-8?q?em=20write=20isolation?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- docs/arch/native-macos-runner.md | 120 ++++++++++++++++++++++--------- 1 file changed, 85 insertions(+), 35 deletions(-) diff --git a/docs/arch/native-macos-runner.md b/docs/arch/native-macos-runner.md index 965121cc..590a90f8 100644 --- a/docs/arch/native-macos-runner.md +++ b/docs/arch/native-macos-runner.md @@ -155,46 +155,96 @@ Block on `cmd.Wait()`. Return the exit code. 
| Layer | Native | VM | |-------|--------|----| -| Filesystem | Per-job HOME, TMPDIR, workdir | Full disk clone | +| Filesystem | Per-job HOME/TMPDIR/workdir + sandbox deny on sensitive paths | Full disk clone | | Processes | Process group (`setpgid`), killed on cleanup | Separate kernel | -| Network | Shared host network, no isolation | NAT with firewall | -| Users | Shared `admin` user | Isolated `admin` user | -| Secrets | Environment vars only, cleared on exit | VM memory destroyed | +| Network | Sandbox: deny RFC1918/localhost outbound + deny port binding | NAT with firewall | +| Users | Shared macOS user | Isolated user per VM | +| Secrets | Sandbox denies read on key paths, env cleared on exit | VM memory destroyed | -### What native isolation provides +### Sandbox profile (required for native mode) -- **Directory isolation**: each job gets its own HOME, TMPDIR, and work - directory. Jobs cannot see each other's files. +Every native job runs under `sandbox-exec -f `. The sandbox +is **inherited by all child processes** and **enforced by the kernel**. +No process can escape it without root. + +The profile is generated per-job (to include the job-specific directory +paths) and written to the job workspace: + +```scheme +(version 1) +(allow default) + +;; === Network isolation === + +;; Block outbound to private networks +(deny network-outbound (remote ip "localhost:*")) +(deny network-outbound (remote ip "10.0.0.0/8:*")) +(deny network-outbound (remote ip "172.16.0.0/12:*")) +(deny network-outbound (remote ip "192.168.0.0/16:*")) +(deny network-outbound (remote ip "169.254.0.0/16:*")) + +;; Block binding to any port — prevents jobs from running servers +;; that other jobs could connect to. This closes the inter-job +;; localhost attack vector entirely. 
+(deny network-bind (local ip "*:*")) + +;; Allow DNS (required for public internet access) +(allow network-outbound (remote udp "*:53")) +(allow network-outbound (remote tcp "*:53")) + +;; === Filesystem isolation === + +;; Block sensitive host paths +(deny file-read* (subpath "/Users/luthermonson/.ssh")) +(deny file-read* (subpath "/config.toml")) +(deny file-read* (literal "/ephemerd.sock")) +(deny file-read* (subpath "/vm")) + +;; Block writes to shared tools (read-only access only) +(deny file-write* (subpath "/opt/homebrew")) +(deny file-write* (subpath "/Applications")) +(deny file-write* (subpath "/usr/local")) + +;; Allow writes to the job directory only +(allow file-write* (subpath "")) +(allow file-write* (subpath "/private/tmp")) +``` + +In Go, the runner is launched as: + +```go +cmd := exec.CommandContext(ctx, "sandbox-exec", "-f", profilePath, + "./run.sh", "--jitconfig", jitConfig) +cmd.SysProcAttr = &syscall.SysProcAttr{Setpgid: true} +``` + +### What this provides + +- **Network isolation**: jobs cannot reach the LAN, other machines, or + the ephemerd control socket. Jobs cannot bind ports, so they cannot + communicate with each other via localhost. +- **DNS allowed**: jobs can resolve public hostnames and connect to + public internet (GitHub, package registries, etc.). +- **Filesystem write isolation**: jobs can only write to their own + workspace. Shared tools (`/opt/homebrew`, `/Applications`) are + read-only. Sensitive host files (SSH keys, config, VM assets) are + blocked entirely. - **Process isolation**: `setpgid` + process group kill ensures no orphaned processes survive between jobs. -- **Environment isolation**: each runner process gets a controlled set of - environment variables. No leakage from the daemon process. - -### What native isolation does NOT provide - -- **No network isolation.** macOS has no network namespaces. A malicious - job can reach the host network, other jobs' ports, and the metadata - service. 
The host-level firewall (`pfctl`) can block RFC1918 ranges but - cannot isolate jobs from each other. -- **No filesystem isolation beyond directories.** Jobs share the same - `/Applications`, `/opt/homebrew`, etc. A malicious job could modify - shared tools. Use `diskutil apfs addVolume` or a read-only system - volume for defense in depth. -- **No user isolation.** All jobs run as the same macOS user. A job can - `ps aux` and see other jobs' processes. - -### Mitigation: Apple sandbox profiles (future) - -macOS has `sandbox-exec` (deprecated but functional through macOS 15+) -and the App Sandbox entitlement system. A future enhancement could wrap -the runner process in a sandbox profile that: - -- Denies network access to localhost and RFC1918. -- Denies file writes outside the job directory. -- Denies process inspection (`proc_info`). - -This is explicitly deferred -- it requires testing across macOS versions -and may break runner functionality. +- **Environment isolation**: each runner process gets a controlled set + of environment variables. No leakage from the daemon process. + +### Remaining limitations (accepted for trusted repos) + +- **No per-job user isolation.** All jobs run as the same macOS user. + A job can `ps aux` and see other jobs' PIDs (but not interact with + them — the sandbox blocks sensitive files and network). +- **No resource limits.** macOS has no cgroups. A runaway build can + starve other jobs of CPU/memory. Mitigated with `nice` (CPU priority) + and `ulimit` (memory soft limit) on the runner process. +- **Read access to non-denied paths.** Jobs can read world-readable + files outside the deny list. The sandbox profile should be kept + up-to-date with any new sensitive paths. 
## Comparison table From 1259b7af14900fa347eff083acc92ed01f30f9cf Mon Sep 17 00:00:00 2001 From: Luther Monson Date: Sun, 31 May 2026 08:45:35 -0700 Subject: [PATCH 4/7] docs(arch): propose 'ephemerd upgrade' in-place binary update (#86) --- docs/arch/upgrade-command.md | 325 +++++++++++++++++++++++++++++++++++ 1 file changed, 325 insertions(+) create mode 100644 docs/arch/upgrade-command.md diff --git a/docs/arch/upgrade-command.md b/docs/arch/upgrade-command.md new file mode 100644 index 00000000..c56cf1b8 --- /dev/null +++ b/docs/arch/upgrade-command.md @@ -0,0 +1,325 @@ +# `ephemerd upgrade`: In-Place Binary Update + +> **Status: proposal.** Not implemented. Scoping document — design, +> tradeoffs, and a work breakdown. Cost estimates are based on adjacent +> tooling (Tailscale's `tailscale update`, k0s `k0sctl apply`, Docker +> CE upgrade flows) and are best-guess until prototyped. + +## Context + +Today, updating ephemerd on a host is a manual five-step ritual: + +1. `git pull` on the host (or copy a fresh tree). +2. `mage build:windows` (or `:macos` / current-OS variant), about 5 + minutes including the embedded Linux cross-compile. +3. `ephemerd stop`, wait for the process to actually exit (Windows + service shutdown races; binary stays locked for a beat). +4. Copy the new binary to `C:\Program Files\ephemerd\ephemerd.exe` + (or `/usr/local/bin/ephemerd` on Linux/macOS). +5. `ephemerd start`, poll for the in-VM ephemerd to come back up, + `grep` the console log to confirm the version baked in is the one + we just built. + +That works for one host. For three weekly iterations on one host it +becomes annoying. For a fleet — multiple hosts per org, plus the +~half-dozen test rigs the team would want to keep current — it doesn't +scale. + +The dind work in PRs #82–#85 also surfaced that "is the new code +actually running" is a non-obvious question. 
The Windows daemon and the +in-VM ephemerd are *two* binaries (the Linux one is embedded into the +Windows one and extracted on every VM boot), and a stale build can run +silently if the deploy missed either layer. An upgrade command should +make that uncertainty go away by handling both halves and reporting the +resulting version end-to-end. + +## Goals + +1. **One command per host.** `ephemerd upgrade` does the entire update. +2. **No source tree required on the target.** Hosts that aren't dev + workstations shouldn't need Go, mage, the repo, or 5 minutes of CPU + to update. +3. **Per-channel pinning.** A prod host configured for `stable` can't + accidentally pull a `main` build. A dev host can opt in. +4. **Drain-safe.** No in-flight jobs get killed by an upgrade. +5. **Rollback-safe.** Failed startup of the new binary rolls back to + the previous one automatically. +6. **End-to-end version reporting.** Post-upgrade output names *both* + the Windows-daemon version and the in-VM ephemerd version, so the + "I deployed, the fix didn't take" story from the dind work can't + recur silently. + +Non-goals: rolling fleet upgrades (one host at a time is fine; multi-host +orchestration is a layer above this), zero-downtime (drain + restart is +acceptable for our SLA), self-updating from arbitrary URLs (channels +only). + +## Design + +### Artifact source + +Pre-built binaries published by CI on every push to main and on every +release tag. The simplest store is GitHub Releases: + +- **`stable` channel** → latest tag matching `v*.*.*`, downloaded from + that release's assets. +- **`main` channel** → a rolling release named `latest-main`, updated + by CI on every push to `main`. Same asset layout as a tagged release. +- **`pinned` channel** → `--tag vX.Y.Z` for one-shot updates to a + specific version; also settable in config. 
+ +Each release publishes: + +``` +ephemerd-windows-amd64.exe (~880 MB — embeds linux binary) +ephemerd-linux-amd64 (~240 MB) +ephemerd-linux-arm64 (~240 MB) +ephemerd-darwin-arm64 (~similar — embeds Vz linux assets) +SHA256SUMS (signed) +SHA256SUMS.asc (detached signature — optional v1) +``` + +The upgrade command picks the asset matching its host's GOOS/GOARCH. + +Tradeoff: GitHub Releases is free and integrates trivially with our +existing CI, but downloads are rate-limited and unauthenticated pulls +get throttled aggressively. Anonymous pulls from a busy fleet may hit +the limit; authenticated pulls (using the host's `GITHUB_TOKEN`) +sidestep it. For v1 we rely on the auth token ephemerd already holds. + +### Channel config + +```toml +# /etc/ephemerd/config.toml (or %ProgramData%\ephemerd\config.toml) +[upgrade] +channel = "stable" # "stable" | "main" | "pinned" +pinned_tag = "" # only used when channel = "pinned" +auto_check = true # poll for new versions periodically +check_interval = "24h" # how often to log "newer version available" +``` + +Default is `stable`. A fresh install can't accidentally float into +`main` without an explicit config change. + +### Command shape + +``` +ephemerd upgrade [flags] + --channel override config channel for this run + --tag shorthand for --channel pinned --pinned-tag + --check report available version, don't upgrade + --dry-run show what would happen, don't do it + --force skip version check (re-deploy current) + --no-drain skip drain (operator override) +``` + +Default flow (no flags): + +1. Resolve channel → download URL → expected version. +2. `--check` returns here. +3. Compare to running version; no-op if equal (unless `--force`). +4. Download artifact + SHA256 manifest to `/.upgrade/`. +5. Verify SHA256. (GPG/cosign optional — v2.) +6. Pre-flight: confirm we have permission to swap the binary, + service-manager access, etc. +7. 
**Drain** the daemon — refuse new jobs, wait for active jobs to + exit (configurable timeout; default 30 min, surface via flag). +8. `ephemerd stop`, wait for process to truly exit. +9. Move current binary to `/.upgrade/ephemerd.previous`, + move new binary into place. +10. `ephemerd start`. Poll `ephemerd status` for "ok" within 60s. +11. Wait for in-VM ephemerd to log its version (parse console.log on + Windows; equivalent on macOS). +12. Report: `upgraded host:vA.B.C -> vX.Y.Z, in-vm:vX.Y.Z`. +13. On any failure between step 9 and 12, swap `.previous` back, restart, + log the rollback, exit non-zero. + +### Drain mechanics + +`ephemerd drain` is broken on Windows today (per project memory: +SIGTERM not supported). The upgrade work needs to fix that anyway — +options: + +- Add a `Drain` RPC to the gRPC control API (`api/v1/`). The CLI calls + it; the scheduler flips a flag that rejects new jobs and waits for + active ones to exit. Cross-platform, doesn't depend on signal + handling. Probably the right answer. +- Or: replace SIGTERM with a Windows service-control event + (`SERVICE_CONTROL_PARAMCHANGE` or a custom code). Less invasive but + Windows-specific. + +Recommendation: RPC. Reusable for the existing `ephemerd drain` +command, which would also become a thin wrapper around the same call. + +### Atomic swap mechanics, per OS + +**Linux/macOS**: `rename(2)` can replace a running executable's file. +Open file handles keep pointing at the old inode until the process +exits, the new inode takes the path immediately. The systemd/launchd +restart picks up the new binary. + +**Windows**: can't replace a locked file. Sequence has to be: + +``` +ephemerd stop (via service-control) +wait for process exit +copy/move new binary +ephemerd start (via service-control) +``` + +The five-second window where the service is fully down is acceptable +because we drained first. The CLI orchestrates via the Windows Service +Manager API (already used by `ephemerd start/stop`). 
+ +### Version reporting end-to-end + +Post-upgrade, the command output should look like: + +``` +$ ephemerd upgrade +Channel: stable +Current host binary: v1.4.2 (built 2026-05-30) +Available: v1.4.3 (released 2026-06-02) +Draining... 0 active jobs. +Stopping service... done (1.2s). +Replacing binary at C:\Program Files\ephemerd\ephemerd.exe... done. +Starting service... ok (3.4s). +Waiting for in-VM ephemerd to register... ok. + +Upgraded: + host binary: v1.4.2 -> v1.4.3 + in-VM binary: v1.4.2 -> v1.4.3 +``` + +The in-VM version comes from parsing the first "starting ephemerd" +line in `/vm/linux/console.log` after the restart (or the +equivalent on macOS / Vz). + +### Self-replacement detail + +The upgrade command is itself part of the binary being replaced. On +Linux/macOS this is fine (open inode survives). On Windows the running +`ephemerd upgrade` process holds the lock on the daemon binary only +indirectly (the running service does), so the CLI can swap freely +after stopping the service. The CLI also needs to NOT delete itself +if it's running from the same install path — handle that by either: + +- Running from a temp copy (CLI's first act is to `exec` itself from + a tempdir, then proceed). +- Or scoping `upgrade` to be invoked from a separate path + (`ephemerd-upgrader.exe` or just `ephemerd upgrade --from `). + +The temp-copy approach is the standard pattern (Tailscale, Docker +Desktop, vscode auto-update all do it). Cleaner for the user. + +## CI work + +The biggest unknown — the upgrade CLI is straightforward, the artifact +publishing is where the time goes. + +Required: + +1. **Release workflow** (`.github/workflows/release.yml`): + - Triggers: `push: tags: ['v*']` and `workflow_dispatch`. + - Matrix: linux/amd64, linux/arm64, windows/amd64, darwin/arm64. + - Runs `mage build:` per cell, uploads as a release asset. + - Generates `SHA256SUMS` from all assets. + +2. 
**Rolling-main workflow** (`.github/workflows/main-release.yml`): + - Triggers: `push: branches: [main]`. + - Same matrix, same artifacts. + - Publishes to a single GitHub Release tagged `latest-main` (move-tag + pattern: delete the tag, retag HEAD, recreate the release with + fresh assets). + +3. **Signing** (deferred to v2 unless we already have a code-signing cert): + - Windows: Authenticode (cert + EV recommended; ~$300/year). + - macOS: notarization via `notarytool` (free with an Apple Developer + account; ad-hoc signing already in place per memory). + - Linux: optional GPG signature on SHA256SUMS. + +Pragmatic v1: SHA256 checksum only. Signing comes later. + +### Storage cost + +Each release ≈ 1.8 GB of binaries (4 platforms × ~0.5 GB average). GitHub +Releases storage is free but assets count against the 2 GB/file limit +(we're fine, biggest is ~880 MB). With a tagged release per week plus +a constantly-updated `latest-main`, expect ~10 GB of active artifact +storage; well within limits. + +## Risks + +- **GitHub rate limits on download.** Mitigated by authenticated pulls. + If we move off GitHub later (S3, an OCI registry), the upgrade CLI's + download layer is the only thing that changes. +- **Auto-check noise.** A daemon that logs "newer version available" + every 24h gets ignored. Make it opt-in or surface in `ephemerd status` + instead of the running log. +- **Drain that never completes.** A hung job blocks the upgrade + indefinitely. Default 30-minute drain timeout with a clear "still + running: <job IDs>" message before timeout; `--no-drain` skips drain + entirely for emergency upgrades. +- **In-VM version mismatch detection.** The current "grep console.log + for `starting ephemerd version=`" is fragile. A more durable + solution: the in-VM ephemerd exposes its version via the gRPC + control API; the upgrade command queries the in-VM dispatch RPC + directly. That's a separate small piece of work. 
+- **Channel drift.** A host configured for `stable` could be tricked + via `--channel main` flag. Acceptable — operator-explicit override is + fine. The lock is against passive drift, not against the operator. +- **Cross-version compatibility.** Schema changes (BoltDB, gRPC API) + during a rolling fleet upgrade could break older nodes pointing at + newer schedulers. ephemerd is single-host today so this isn't an + issue, but worth flagging if multi-host coordination ever happens. + +## Estimate + +Rough sizing, assuming one engineer focused: + +| Piece | Effort | Notes | +|---|---|---| +| CI release workflow (tags) | 1d | One matrix, four mage targets. | +| CI rolling-main workflow | 0.5d | Tag-move + asset-replace dance. | +| Drain RPC | 1d | gRPC method + scheduler hook + Windows fix. | +| Upgrade CLI: download + verify | 1d | Channel resolution, SHA256, retry. | +| Upgrade CLI: swap + restart | 1.5d | Service-manager glue per OS, rollback, self-exec from tempdir. | +| Upgrade CLI: version reporting | 0.5d | Parse console.log on Windows, equivalent elsewhere; better with in-VM RPC. | +| Tests | 1.5d | Unit + e2e: fake artifact server, version-mismatch rollback, drain-timeout. | +| Docs | 0.5d | CLI reference, configuration reference, ops guide. | + +**Total: ~7 engineer-days for a solid v1.** Could ship a "happy-path +only, manual rollback" version in ~3 days as a stopgap. + +Signing/notarization adds another 2-3 days each, deferred unless +distribution policy demands it. + +## Open questions + +1. **In-VM version source of truth.** Parse console.log (simple, works + today) vs add a gRPC call to the in-VM dispatch server (more + robust, requires the dispatch server to be reachable post-restart). + Recommend the gRPC call — it's small and we already have the + dispatch service. +2. **Auto-upgrade.** Should ephemerd ever upgrade itself without an + operator running the command? Pro: zero-touch fleet. 
Con: the dind + debugging we just did would have been *much* harder if the daemon + silently rolled forward overnight. Recommend: never auto-apply, + only auto-check + log. +3. **Multi-channel hosts.** Can the same host run two ephemerd + instances on different channels (e.g. for A/B testing)? Probably + no for v1; one binary per host. Revisit if needed. +4. **Downgrade.** `ephemerd upgrade --tag ` should work for + rollbacks. Worth explicit testing. +5. **Embedded asset version skew.** A v1.4.3 host binary embeds a + v1.4.3 Linux binary. If the embed somehow gets stale (cache bug + like the one we hit during the dind work), the post-upgrade version + report should *catch the mismatch* — log a WARN and fail the + upgrade. That alone would have saved several hours. + +## Recommendation + +Build the CI artifact pipeline first (it's the bottleneck and unblocks +everything else), then the CLI in a single PR, then the drain RPC fix +as a small follow-up. Ship `--check` + manual download as a stopgap +on day 3 so the team has *something* before the full automation lands. From 7af2667eeff5865c513df8f52cf6625f61e38356 Mon Sep 17 00:00:00 2001 From: Luther Monson Date: Wed, 10 Jun 2026 18:02:46 -0700 Subject: [PATCH 5/7] feat(native): native macOS runner mode for trusted repos MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Run GHA jobs directly on the macOS host instead of per-job VMs, enabling 4+ concurrent jobs (vs Apple's 2-VM cap) with zero boot overhead. Configured per-repo under [runner.macos] with "org/repo" keys, "org/*" wildcards, and a separate nativeMacSem concurrency gate. The VM path is untouched. Jobs never run as root: a hidden _ephemerd service user is created lazily (per-job ephemeral users were abandoned — macOS user deletion requires Full Disk Access and wedges opendirectoryd). 
Each job gets its own HOME/TMPDIR/work dir, keychain, Homebrew prefix, and a sandbox-exec profile denying localhost outbound and port binding. Also fixes uncovered along the way: - runner extraction is OS-suffixed (runners/-) so the macOS host and Linux VM no longer corrupt each other's runner on the shared data dir (Linux dispatch exit 127) - isOfficialRunnerImage prefixes had a trailing dash that never matched the runner-ci-linux tag, breaking custom-image dispatch - DEVELOPER_DIR resolved via xcode-select -p instead of hardcoded Xcode.app path (broke git on CLT-only hosts) - macOS VM runner monitor logs pgrep results at debug level Co-Authored-By: Claude Opus 4.6 --- cmd/ephemerd/main.go | 4 + docs/arch/native-macos-runner-summary.md | 101 +++++ docs/arch/native-macos-runner.md | 16 +- pkg/config/config.go | 80 ++++ pkg/config/config_test.go | 68 ++++ pkg/native/native_darwin.go | 496 +++++++++++++++++++++++ pkg/native/native_darwin_test.go | 134 ++++++ pkg/native/native_other.go | 33 ++ pkg/runner/runner.go | 4 +- pkg/runtime/runtime.go | 4 +- pkg/scheduler/handle_queued_test.go | 7 +- pkg/scheduler/scheduler.go | 194 +++++++-- pkg/scheduler/scheduler_test.go | 54 +++ pkg/vm/macosvm_darwin.go | 3 + 14 files changed, 1168 insertions(+), 30 deletions(-) create mode 100644 docs/arch/native-macos-runner-summary.md create mode 100644 pkg/native/native_darwin.go create mode 100644 pkg/native/native_darwin_test.go create mode 100644 pkg/native/native_other.go diff --git a/cmd/ephemerd/main.go b/cmd/ephemerd/main.go index de1350e3..eee4bb5f 100644 --- a/cmd/ephemerd/main.go +++ b/cmd/ephemerd/main.go @@ -547,6 +547,10 @@ func serve(ctx context.Context, configFile, imagesDirFlag string, containerdTCPP ShutdownTimeout: cfg.Runner.ParsedShutdownTimeout(), LogRetention: cfg.Log.LogRetentionDuration(), RunnerImageForRepo: cfg.Runner.ImageForRepoOS, + MaxNativeMac: cfg.Runner.MacOS.ResolvedMaxNative(), + MacOSModeForRepo: cfg.Runner.MacOS.ModeForRepo, + NativeMacUser: 
cfg.Runner.MacOS.User, + RunnerDir: rm.Dir(), Log: log, }) diff --git a/docs/arch/native-macos-runner-summary.md b/docs/arch/native-macos-runner-summary.md new file mode 100644 index 00000000..64d03d03 --- /dev/null +++ b/docs/arch/native-macos-runner-summary.md @@ -0,0 +1,101 @@ +# Native macOS Runner for ephemerd + +## Problem + +macOS jobs currently run in per-job Virtualization.framework VMs. This works but has hard limits: + +- Apple restricts macOS VMs to **2 concurrent instances** per host +- Each VM needs **4+ GB RAM** +- VM boot adds **10-15 seconds** of overhead per job +- An 8 GB Mac mini can run at most **2 concurrent macOS jobs** + +## Solution + +A new **native** execution mode that runs the GitHub Actions runner directly on the host. For trusted repos that don't need VM-level isolation (internal CI, Xcode builds, Go tests), this enables: + +- **4-6+ concurrent jobs** on the same hardware (configurable) +- **Zero boot overhead** — fork+exec, not VM boot +- **~200 MB per job** instead of 4+ GB + +The VM path is untouched — this is purely additive. Mode is configured per-repo. + +## Config + +```toml +[runner.macos] +mode = "vm" # default for repos not listed below +max_native = 4 # max concurrent native jobs + +[runner.macos.repos] +"ephpm/*" = "native" # whole org runs native +"ephpm/secret-repo" = "vm" # except this one (exact match wins over wildcard) +"someuser/ephemerd" = "vm" # fork stays on VM +``` + +Resolution order: exact `org/repo` match > `org/*` wildcard > top-level mode > default `"vm"`. + +## How it works + +Each native job gets its own isolated workspace: + +``` +/native// + ├── home/ → $HOME + ├── tmp/ → $TMPDIR + ├── work/ → runner _work directory + ├── runner/ → per-job copy of the GHA runner binary + ├── homebrew/ → per-job Homebrew prefix (symlinks to host /opt/homebrew) + └── keychain/ → per-job macOS keychain +``` + +### Isolation layers + +| Layer | How | +|-------|-----| +| Filesystem | Per-job HOME/TMPDIR/workdir. 
Sandbox blocks writes to `/opt/homebrew`, `/Applications`, `/usr/local`. Sensitive paths (SSH keys, ephemerd config, VM assets) blocked entirely. | +| Processes | `setpgid` puts runner + children in own process group. Killed via `kill(-pgid)` on cleanup. | +| Network | `sandbox-exec` blocks localhost outbound (prevents reaching ephemerd control socket or other jobs) and blocks port binding (prevents inter-job communication). DNS allowed. Public internet allowed. | +| Secrets | Per-job keychain created/destroyed. Environment cleared. | +| Homebrew | Host `/opt/homebrew` is read-only. Per-job prefix for `brew install` — installs are isolated and destroyed with the job. | + +The runner is launched via macOS `sandbox-exec`, which is kernel-enforced and inherited by all child processes. + +## Concurrency + +A separate semaphore (`nativeMacSem`) gates native jobs independently from VM jobs (`macSem`). A host can run **2 VM jobs + 4 native jobs simultaneously** if both modes are in use. + +## Scheduler flow + +``` +handleQueued + └─ isMacOSJob? + └─ ModeForRepo == "native" → handleNativeMacOSJob + │ └─ acquire nativeMacSem + │ └─ claimJob (register JIT runner with GitHub) + │ └─ native.New → copy runner, generate sandbox, setup env + │ └─ native.Start → sandbox-exec ./run.sh --jitconfig + │ └─ native.Wait → block until job completes + │ └─ native.Stop → kill process group, delete keychain, rm workspace + │ └─ ReleaseJob (deregister runner) + │ + └─ ModeForRepo == "vm" → handleMacOSJob (existing, unchanged) + └─ acquire macSem + └─ boot Virtualization.framework VM +``` + +## What's left + +- **Private network blocking** (10.x, 172.16.x, 192.168.x): `sandbox-exec` doesn't support CIDR notation. Needs `pf` firewall rules — separate follow-up. +- **Resource limits**: macOS has no cgroups. A runaway build can starve others. Mitigated with `nice`/`ulimit` in a future iteration. +- **No per-job user isolation**: all jobs run as the same macOS user. 
Jobs can see each other's PIDs via `ps` but can't interact (sandbox blocks sensitive files and network). + +## Comparison + +| | Native | VM | +|--|--------|-----| +| Boot time | ~0s | 10-15s | +| Memory per job | ~200 MB | 4+ GB | +| Max concurrent (8 GB mini) | 4-6 | 2 | +| Isolation | Sandbox + process group | Full VM | +| Security | Trusted repos only | Untrusted OK | +| Apple VM limit | N/A | 2 per host | diff --git a/docs/arch/native-macos-runner.md b/docs/arch/native-macos-runner.md index 590a90f8..e5bd3ad1 100644 --- a/docs/arch/native-macos-runner.md +++ b/docs/arch/native-macos-runner.md @@ -1,6 +1,20 @@ # Native macOS Runner -> **Status: proposed.** Not yet implemented. +> **Status: implemented.** See `pkg/native/`. Notable deviations from this +> proposal, discovered during implementation: +> +> - **Privilege dropping**: jobs run as a hidden `_ephemerd` service user +> (created lazily, like `_www`), not as the daemon's root user. Per-job +> ephemeral users were attempted but abandoned: macOS user *deletion* +> via dscl/sysadminctl requires Full Disk Access and wedges +> opendirectoryd without it, while creation works fine. +> - **Sandbox network rules**: `sandbox-exec` does not support CIDR +> notation (`10.0.0.0/8`). The profile blocks localhost outbound and all +> port binding; RFC1918 blocking needs pf firewall rules (follow-up). +> - **DEVELOPER_DIR** is resolved via `xcode-select -p` instead of +> hardcoding the Xcode.app path (hosts with only CLT broke otherwise). +> - **Runner extraction** is OS-suffixed (`runners/-`) so the +> macOS host and Linux VM don't collide on the shared data dir. 
## Problem diff --git a/pkg/config/config.go b/pkg/config/config.go index a3576f06..ebb62d7f 100644 --- a/pkg/config/config.go +++ b/pkg/config/config.go @@ -9,6 +9,7 @@ import ( "log/slog" "os" goruntime "runtime" + "strings" "time" "github.com/BurntSushi/toml" @@ -432,6 +433,85 @@ type RunnerConfig struct { JobTimeout string `toml:"job_timeout"` ShutdownTimeout string `toml:"shutdown_timeout"` Windows WindowsRunnerToml `toml:"windows"` + MacOS MacOSRunnerConfig `toml:"macos"` +} + +// MacOSRunnerConfig controls macOS job routing. It lives under [runner] +// (not [vm.macos]) because native jobs don't involve VMs. +// +// TOML shape: +// +// [runner.macos] +// mode = "vm" # default mode: "vm" or "native" +// max_native = 4 # max concurrent native jobs +// # user = "ciuser" # optional: existing user for native runners. +// # # Default (unset): the hidden _ephemerd service +// # # user, created lazily and shared by all jobs. +// +// [runner.macos.repos] +// "ephpm/*" = "native" # all repos in org +// "ephpm/secret-repo" = "vm" # except this one (exact wins over wildcard) +// "someuser/ephemerd" = "vm" # fork stays on VM +type MacOSRunnerConfig struct { + Mode string `toml:"mode"` // "vm" (default) or "native" + MaxNative int `toml:"max_native"` // max concurrent native jobs (default 4) + User string `toml:"user"` // existing user for native runners (empty = shared hidden _ephemerd service user) + Repos map[string]string `toml:"repos"` // "org/repo" -> "vm" or "native" +} + +// ModeForRepo returns "native" or "vm" for the given repo. Resolution order: +// +// 1. Exact match on "org/repo" +// 2. Wildcard match on "org/*" +// 3. Short-name fallback: if repo has no "/", match any non-wildcard key ending in "/"+repo +// 4. Top-level mode +// 5. Default: "vm" +// +// The short-name fallback exists because some providers (GitHub polling) +// currently emit event.Repo as just the repo name without the org prefix. +// Config keys should always use "org/repo" format for disambiguation. 
+func (m *MacOSRunnerConfig) ModeForRepo(repo string) string { + if m != nil && len(m.Repos) > 0 { + // 1. Exact match + if mode, ok := m.Repos[repo]; ok && isValidMode(mode) { + return mode + } + + // 2. Wildcard: "org/*" matches any repo under that org + if slash := strings.IndexByte(repo, '/'); slash > 0 { + wildcard := repo[:slash] + "/*" + if mode, ok := m.Repos[wildcard]; ok && isValidMode(mode) { + return mode + } + } + + // 3. Short-name fallback: repo="ephemerd" matches key "ephpm/ephemerd" + if !strings.Contains(repo, "/") { + suffix := "/" + repo + for key, mode := range m.Repos { + if strings.HasSuffix(key, suffix) && !strings.HasSuffix(key, "/*") && isValidMode(mode) { + return mode + } + } + } + } + if m != nil && isValidMode(m.Mode) { + return m.Mode + } + return "vm" +} + +func isValidMode(mode string) bool { + return mode == "native" || mode == "vm" +} + +// ResolvedMaxNative returns the max concurrent native macOS jobs, +// defaulting to 4 if unset or non-positive. +func (m *MacOSRunnerConfig) ResolvedMaxNative() int { + if m == nil || m.MaxNative <= 0 { + return 4 + } + return m.MaxNative } // WindowsRunnerToml configures resource limits for Hyper-V isolated Windows diff --git a/pkg/config/config_test.go b/pkg/config/config_test.go index 55bf474e..b012deb8 100644 --- a/pkg/config/config_test.go +++ b/pkg/config/config_test.go @@ -1941,3 +1941,71 @@ allow_privileged = false t.Error("ResolvedAllowPrivileged() should honor explicit false even on non-Linux hosts") } } + +func TestMacOSRunnerConfig_ModeForRepo(t *testing.T) { + tests := []struct { + name string + cfg *MacOSRunnerConfig + repo string + want string + }{ + {"nil config defaults to vm", nil, "myrepo", "vm"}, + {"zero value defaults to vm", &MacOSRunnerConfig{}, "myrepo", "vm"}, + {"top-level native", &MacOSRunnerConfig{Mode: "native"}, "myrepo", "native"}, + {"top-level vm", &MacOSRunnerConfig{Mode: "vm"}, "myrepo", "vm"}, + {"invalid top-level mode defaults to vm", 
&MacOSRunnerConfig{Mode: "bogus"}, "myrepo", "vm"}, + + // org/repo exact match + {"org/repo exact match native", &MacOSRunnerConfig{Repos: map[string]string{"ephpm/ephemerd": "native"}}, "ephpm/ephemerd", "native"}, + {"org/repo exact match vm", &MacOSRunnerConfig{Mode: "native", Repos: map[string]string{"ephpm/ephemerd": "vm"}}, "ephpm/ephemerd", "vm"}, + {"org/repo miss falls back to top-level", &MacOSRunnerConfig{Mode: "native", Repos: map[string]string{"ephpm/other": "vm"}}, "ephpm/ephemerd", "native"}, + + // short-name fallback (event.Repo = "ephemerd", config key = "ephpm/ephemerd") + {"short name matches org/repo key", &MacOSRunnerConfig{Repos: map[string]string{"ephpm/ephemerd": "native"}}, "ephemerd", "native"}, + {"short name no match falls to top-level", &MacOSRunnerConfig{Mode: "native", Repos: map[string]string{"ephpm/other": "vm"}}, "ephemerd", "native"}, + + // disambiguation: fork vs original + {"fork stays vm while original is native", &MacOSRunnerConfig{Repos: map[string]string{"ephpm/ephemerd": "native", "fork/ephemerd": "vm"}}, "ephpm/ephemerd", "native"}, + {"fork explicit vm", &MacOSRunnerConfig{Repos: map[string]string{"ephpm/ephemerd": "native", "fork/ephemerd": "vm"}}, "fork/ephemerd", "vm"}, + + // wildcard: "org/*" matches all repos in org + {"wildcard matches repo in org", &MacOSRunnerConfig{Repos: map[string]string{"ephpm/*": "native"}}, "ephpm/ephemerd", "native"}, + {"wildcard matches another repo in org", &MacOSRunnerConfig{Repos: map[string]string{"ephpm/*": "native"}}, "ephpm/php-sdk", "native"}, + {"wildcard does not match different org", &MacOSRunnerConfig{Repos: map[string]string{"ephpm/*": "native"}}, "other/ephemerd", "vm"}, + {"exact match wins over wildcard", &MacOSRunnerConfig{Repos: map[string]string{"ephpm/*": "native", "ephpm/secret": "vm"}}, "ephpm/secret", "vm"}, + {"wildcard still applies to non-overridden repo", &MacOSRunnerConfig{Repos: map[string]string{"ephpm/*": "native", "ephpm/secret": "vm"}}, 
"ephpm/ephemerd", "native"}, + + // invalid per-repo mode falls through + {"invalid per-repo mode falls back to top-level", &MacOSRunnerConfig{Mode: "native", Repos: map[string]string{"ephpm/myrepo": "bogus"}}, "ephpm/myrepo", "native"}, + } + for _, tt := range tests { + t.Run(tt.name, func(t *testing.T) { + got := tt.cfg.ModeForRepo(tt.repo) + if got != tt.want { + t.Errorf("ModeForRepo(%q) = %q, want %q", tt.repo, got, tt.want) + } + }) + } +} + +func TestMacOSRunnerConfig_ResolvedMaxNative(t *testing.T) { + tests := []struct { + name string + cfg *MacOSRunnerConfig + want int + }{ + {"nil config defaults to 4", nil, 4}, + {"zero value defaults to 4", &MacOSRunnerConfig{}, 4}, + {"negative defaults to 4", &MacOSRunnerConfig{MaxNative: -1}, 4}, + {"positive value used", &MacOSRunnerConfig{MaxNative: 8}, 8}, + {"one is valid", &MacOSRunnerConfig{MaxNative: 1}, 1}, + } + for _, tt := range tests { + t.Run(tt.name, func(t *testing.T) { + got := tt.cfg.ResolvedMaxNative() + if got != tt.want { + t.Errorf("ResolvedMaxNative() = %d, want %d", got, tt.want) + } + }) + } +} diff --git a/pkg/native/native_darwin.go b/pkg/native/native_darwin.go new file mode 100644 index 00000000..d76d4c1f --- /dev/null +++ b/pkg/native/native_darwin.go @@ -0,0 +1,496 @@ +//go:build darwin + +package native + +import ( + "context" + "errors" + "fmt" + "io" + "io/fs" + "log/slog" + "os" + "os/exec" + "os/user" + "path/filepath" + "strconv" + "strings" + "sync" + "syscall" +) + +// serviceUserMu serializes service user creation across concurrent job starts. +var serviceUserMu sync.Mutex + +// ServiceUserName is the hidden macOS service account that native runner +// jobs execute as when no [runner.macos] user is configured. It is created +// lazily on first use and persists like other service accounts (_www, ...). +// Per-job user deletion is deliberately avoided: dscl/sysadminctl user +// deletion wedges opendirectoryd on modern macOS. 
+const ServiceUserName = "_ephemerd" + +// serviceUIDRange is scanned for a free UID when creating the service user. +const ( + serviceUIDMin = 600 + serviceUIDMax = 999 +) + +// ensureServiceUser creates the _ephemerd service user if it doesn't exist +// and returns its credential. +func (r *Runner) ensureServiceUser() (*syscall.Credential, error) { + serviceUserMu.Lock() + defer serviceUserMu.Unlock() + + // Already exists? + if cred, err := lookupCredential(ServiceUserName); err == nil { + return cred, nil + } + + // Find a free UID + out, err := exec.Command("dscl", ".", "-list", "/Users", "UniqueID").Output() + if err != nil { + return nil, fmt.Errorf("listing users: %w", err) + } + used := make(map[int]bool) + for _, line := range strings.Split(string(out), "\n") { + fields := strings.Fields(line) + if len(fields) == 2 { + if id, err := strconv.Atoi(fields[1]); err == nil { + used[id] = true + } + } + } + uid := 0 + for id := serviceUIDMin; id <= serviceUIDMax; id++ { + if !used[id] { + uid = id + break + } + } + if uid == 0 { + return nil, fmt.Errorf("no free UID in range %d-%d", serviceUIDMin, serviceUIDMax) + } + + // NFSHomeDirectory is /var/empty (like _www and other service + // accounts). Registering a real directory as a user home puts it + // under macOS data protection — even root then can't delete it + // without Full Disk Access. The runner's HOME env var points at the + // per-job dir; the DS record never needs to. 
+ steps := [][]string{ + {"dscl", ".", "-create", "/Users/" + ServiceUserName}, + {"dscl", ".", "-create", "/Users/" + ServiceUserName, "UserShell", "/bin/bash"}, + {"dscl", ".", "-create", "/Users/" + ServiceUserName, "UniqueID", strconv.Itoa(uid)}, + {"dscl", ".", "-create", "/Users/" + ServiceUserName, "PrimaryGroupID", "20"}, // staff + {"dscl", ".", "-create", "/Users/" + ServiceUserName, "NFSHomeDirectory", "/var/empty"}, + {"dscl", ".", "-create", "/Users/" + ServiceUserName, "IsHidden", "1"}, + } + for _, args := range steps { + if out, err := exec.Command(args[0], args[1:]...).CombinedOutput(); err != nil { + return nil, fmt.Errorf("%v: %s: %w", args, strings.TrimSpace(string(out)), err) + } + } + r.log.Info("created ephemerd service user", "user", ServiceUserName, "uid", uid) + + return &syscall.Credential{Uid: uint32(uid), Gid: 20}, nil +} + +// lookupCredential resolves a username to a syscall.Credential for +// privilege dropping via SysProcAttr. +func lookupCredential(username string) (*syscall.Credential, error) { + u, err := user.Lookup(username) + if err != nil { + return nil, err + } + uid, err := strconv.ParseUint(u.Uid, 10, 32) + if err != nil { + return nil, fmt.Errorf("parsing uid %q: %w", u.Uid, err) + } + gid, err := strconv.ParseUint(u.Gid, 10, 32) + if err != nil { + return nil, fmt.Errorf("parsing gid %q: %w", u.Gid, err) + } + return &syscall.Credential{Uid: uint32(uid), Gid: uint32(gid)}, nil +} + +// Runner executes a GitHub Actions runner directly on the macOS host +// inside a per-job sandbox. Each job gets its own workspace, HOME, +// TMPDIR, keychain, and Homebrew prefix. 
+type Runner struct { + dataDir string + jobID string + jitConfig string + runnerSrc string // path to extracted GHA runner (runner.Manager.Dir()) + log *slog.Logger + + jobDir string // /native// + keychainPath string // per-job keychain + runAsUser string // existing user to run as (empty = _ephemerd service user) + jobUID uint32 // uid the runner executes as + cmd *exec.Cmd + pgid int +} + +// SetRunAsUser configures a non-root user to run the runner process as. +// The daemon (running as root) drops privileges via setuid/setgid when +// launching the runner. Strongly recommended when the daemon runs as root: +// without it, CI job steps execute as root on the host. +func (r *Runner) SetRunAsUser(username string) { + r.runAsUser = username +} + +// New creates a native macOS runner for a single job. It prepares the +// workspace directory structure but does not start the runner process. +func New(dataDir, jobID, jitConfig, runnerSrc string, log *slog.Logger) (*Runner, error) { + jobDir := filepath.Join(dataDir, "native", jobID) + + // Create workspace directories + dirs := []string{ + filepath.Join(jobDir, "home"), + filepath.Join(jobDir, "tmp"), + filepath.Join(jobDir, "work"), + filepath.Join(jobDir, "runner"), + filepath.Join(jobDir, "homebrew", "bin"), + filepath.Join(jobDir, "homebrew", "Cellar"), + filepath.Join(jobDir, "keychain"), + } + for _, d := range dirs { + if err := os.MkdirAll(d, 0o755); err != nil { + return nil, fmt.Errorf("creating directory %s: %w", d, err) + } + } + + return &Runner{ + dataDir: dataDir, + jobID: jobID, + jitConfig: jitConfig, + runnerSrc: runnerSrc, + log: log, + jobDir: jobDir, + }, nil +} + +// Start copies the runner binary, sets up the sandbox and environment, +// and launches the runner process. 
+func (r *Runner) Start(ctx context.Context) error { + runnerDir := filepath.Join(r.jobDir, "runner") + + // Copy runner files from the extracted source (full per-job copy; no hard links) + if err := copyRunnerFiles(r.runnerSrc, runnerDir); err != nil { + return fmt.Errorf("copying runner files: %w", err) + } + + // Generate and write sandbox profile + profilePath := filepath.Join(r.jobDir, "sandbox.sb") + profile := GenerateSandboxProfile(r.jobDir, r.dataDir) + if err := os.WriteFile(profilePath, []byte(profile), 0o644); err != nil { + return fmt.Errorf("writing sandbox profile: %w", err) + } + + // Set up per-job keychain + r.keychainPath = filepath.Join(r.jobDir, "keychain", "job.keychain-db") + if err := r.createKeychain(); err != nil { + r.log.Warn("failed to create per-job keychain", "error", err) + // Non-fatal: jobs that don't need signing will work fine + } + + // Symlink Homebrew binaries from host + if err := symlinkHomebrew(filepath.Join(r.jobDir, "homebrew", "bin")); err != nil { + r.log.Warn("failed to symlink homebrew binaries", "error", err) + // Non-fatal: host may not have Homebrew installed + } + + // Build environment + homeDir := filepath.Join(r.jobDir, "home") + tmpDir := filepath.Join(r.jobDir, "tmp") + workDir := filepath.Join(r.jobDir, "work") + brewDir := filepath.Join(r.jobDir, "homebrew") + + env := []string{ + "HOME=" + homeDir, + "TMPDIR=" + tmpDir, + "RUNNER_WORK_FOLDER=" + workDir, + "PATH=" + filepath.Join(brewDir, "bin") + ":/opt/homebrew/bin:/usr/local/bin:/usr/bin:/bin:/usr/sbin:/sbin", + "HOMEBREW_PREFIX=" + brewDir, + "HOMEBREW_CELLAR=" + filepath.Join(brewDir, "Cellar"), + "HOMEBREW_TEMP=" + tmpDir, + "LANG=en_US.UTF-8", + } + // Point DEVELOPER_DIR at the host's active developer directory + // (full Xcode or Command Line Tools). Hardcoding the Xcode.app path + // breaks xcrun shims (git, clang) on hosts with only CLT installed. 
+ if devDir, err := exec.Command("xcode-select", "-p").Output(); err == nil { + env = append(env, "DEVELOPER_DIR="+strings.TrimSpace(string(devDir))) + } + if r.keychainPath != "" { + env = append(env, "EPHEMERD_KEYCHAIN="+r.keychainPath) + } + + // Launch via sandbox-exec for filesystem/network isolation + r.cmd = exec.CommandContext(ctx, "sandbox-exec", "-f", profilePath, + "./run.sh", "--jitconfig", r.jitConfig) + r.cmd.Dir = runnerDir + r.cmd.Env = env + + // Own process group for clean kill + r.cmd.SysProcAttr = &syscall.SysProcAttr{Setpgid: true} + + // Drop privileges. Job steps must never run as root on the host: + // - user configured: run as that existing user + // - no user configured + daemon is root: run as the hidden _ephemerd + // service user (created lazily on first use) + // - daemon not root: run as the daemon's own user (no setuid possible) + var cred *syscall.Credential + username := r.runAsUser + switch { + case r.runAsUser != "": + c, err := lookupCredential(r.runAsUser) + if err != nil { + return fmt.Errorf("looking up run-as user %q: %w", r.runAsUser, err) + } + cred = c + case os.Geteuid() == 0: + c, err := r.ensureServiceUser() + if err != nil { + return fmt.Errorf("ensuring service user: %w", err) + } + username = ServiceUserName + cred = c + } + if cred != nil { + if out, err := exec.Command("chown", "-R", + fmt.Sprintf("%d:%d", cred.Uid, cred.Gid), r.jobDir).CombinedOutput(); err != nil { + return fmt.Errorf("chowning job dir to %s: %s: %w", username, strings.TrimSpace(string(out)), err) + } + r.cmd.SysProcAttr.Credential = cred + r.jobUID = cred.Uid + env = append(env, "USER="+username, "LOGNAME="+username) + r.cmd.Env = env + } + + // Log to files in the job directory (after chown so the runner user owns it) + logPath := filepath.Join(r.jobDir, "runner.log") + logFile, err := os.Create(logPath) + if err != nil { + return fmt.Errorf("creating log file: %w", err) + } + r.cmd.Stdout = logFile + r.cmd.Stderr = logFile + + if err := 
r.cmd.Start(); err != nil { + if closeErr := logFile.Close(); closeErr != nil { + r.log.Warn("failed to close log file", "error", closeErr) + } + return fmt.Errorf("starting runner: %w", err) + } + + r.pgid = r.cmd.Process.Pid + r.log.Info("native macOS runner started", + "job_id", r.jobID, + "pid", r.pgid, + "dir", runnerDir, + ) + + return nil +} + +// Wait blocks until the runner process exits and returns its exit code. +func (r *Runner) Wait() (int, error) { + if r.cmd == nil || r.cmd.Process == nil { + return -1, fmt.Errorf("runner not started") + } + + err := r.cmd.Wait() + if err != nil { + var exitErr *exec.ExitError + if errors.As(err, &exitErr) { + return exitErr.ExitCode(), nil + } + return -1, fmt.Errorf("waiting for runner: %w", err) + } + return 0, nil +} + +// Stop forcefully terminates the runner and all its children, cleans up +// the keychain, and removes the job workspace. +func (r *Runner) Stop() { + // Kill the process group + if r.pgid > 0 { + if err := syscall.Kill(-r.pgid, syscall.SIGKILL); err != nil { + // Process may have already exited — not an error + r.log.Debug("kill process group", "pgid", r.pgid, "error", err) + } + + // Fallback: kill any orphaned children + cmd := exec.Command("pkill", "-9", "-P", strconv.Itoa(r.pgid)) + if err := cmd.Run(); err != nil { + r.log.Debug("pkill fallback", "ppid", r.pgid, "error", err) + } + } + + // Delete per-job keychain + if r.keychainPath != "" { + r.deleteKeychain() + } + + // Note: no per-UID process kill here — the service user is shared + // across concurrent jobs, so pkill -U would kill other jobs' steps. + // The pgid kill above covers the job's process tree. + + // Strip ACLs before removal: macOS frameworks put "deny delete" ACLs + // on auto-created home subdirectories (~/Library etc.) which block + // os.RemoveAll even as root. 
+ if out, err := exec.Command("chmod", "-RN", r.jobDir).CombinedOutput(); err != nil { + r.log.Debug("stripping ACLs from job dir", "dir", r.jobDir, + "output", strings.TrimSpace(string(out)), "error", err) + } + + // Remove job workspace + if err := os.RemoveAll(r.jobDir); err != nil { + r.log.Warn("failed to remove job directory", "dir", r.jobDir, "error", err) + } + + r.log.Info("native macOS runner cleaned up", "job_id", r.jobID) +} + +// createKeychain creates a per-job temporary keychain. +func (r *Runner) createKeychain() error { + cmd := exec.Command("security", "create-keychain", "-p", "", r.keychainPath) + if out, err := cmd.CombinedOutput(); err != nil { + return fmt.Errorf("create-keychain: %s: %w", strings.TrimSpace(string(out)), err) + } + cmd = exec.Command("security", "unlock-keychain", "-p", "", r.keychainPath) + if out, err := cmd.CombinedOutput(); err != nil { + return fmt.Errorf("unlock-keychain: %s: %w", strings.TrimSpace(string(out)), err) + } + return nil +} + +// deleteKeychain removes the per-job keychain. +func (r *Runner) deleteKeychain() { + cmd := exec.Command("security", "delete-keychain", r.keychainPath) + if out, err := cmd.CombinedOutput(); err != nil { + r.log.Warn("failed to delete keychain", "path", r.keychainPath, "output", strings.TrimSpace(string(out)), "error", err) + } +} + +// GenerateSandboxProfile returns a macOS sandbox profile that restricts +// the runner process. Paths are templated with the job-specific directories. +func GenerateSandboxProfile(jobDir, dataDir string) string { + // Resolve to absolute paths for the sandbox profile + absJobDir, _ := filepath.Abs(jobDir) + absDataDir, _ := filepath.Abs(dataDir) + + return fmt.Sprintf(`(version 1) +(allow default) + +;; === Network isolation === +;; Note: sandbox-exec does not support CIDR notation for IP addresses. +;; Private network blocking (10.x, 172.16.x, 192.168.x) requires pf +;; firewall rules — handled separately. 
The sandbox blocks localhost +;; and port binding to prevent inter-job communication. + +;; Allow DNS before blocking localhost (macOS resolves via mDNSResponder on 127.0.0.1) +(allow network-outbound (remote udp "localhost:53")) +(allow network-outbound (remote tcp "localhost:53")) + +;; Block outbound to localhost (daemon control socket, other jobs) +(deny network-outbound (remote ip "localhost:*")) + +;; Block binding to any port — prevents jobs from running servers +(deny network-bind (local ip "*:*")) + +;; === Filesystem isolation === + +;; Block sensitive host paths +(deny file-read* (subpath "%[1]s/.ssh")) +(deny file-read* (literal "%[2]s/config.toml")) +(deny file-read* (literal "%[2]s/ephemerd.sock")) +(deny file-read* (subpath "%[2]s/vm")) + +;; Block writes to shared tools (read-only access only) +(deny file-write* (subpath "/opt/homebrew")) +(deny file-write* (subpath "/Applications")) +(deny file-write* (subpath "/usr/local")) + +;; Allow writes to the job directory +(allow file-write* (subpath "%[3]s")) +(allow file-write* (subpath "/private/tmp")) +`, os.Getenv("HOME"), absDataDir, absJobDir) +} + +// symlinkHomebrew creates symlinks from /opt/homebrew/bin/* into the +// per-job homebrew bin directory, giving jobs read access to pre-installed +// tools while keeping their own installs isolated. +func symlinkHomebrew(destBin string) error { + const hostBin = "/opt/homebrew/bin" + entries, err := os.ReadDir(hostBin) + if err != nil { + return fmt.Errorf("reading %s: %w", hostBin, err) + } + for _, e := range entries { + src := filepath.Join(hostBin, e.Name()) + dst := filepath.Join(destBin, e.Name()) + if err := os.Symlink(src, dst); err != nil { + // Skip if symlink already exists + if !os.IsExist(err) { + return fmt.Errorf("symlinking %s: %w", e.Name(), err) + } + } + } + return nil +} + +// copyRunnerFiles copies the runner directory to the per-job location. +// Every file is copied in full via copyFile; no hard links are created. 
+func copyRunnerFiles(src, dst string) error { + return filepath.WalkDir(src, func(path string, d fs.DirEntry, err error) error { + if err != nil { + return err + } + + rel, err := filepath.Rel(src, path) + if err != nil { + return fmt.Errorf("computing relative path: %w", err) + } + target := filepath.Join(dst, rel) + + if d.IsDir() { + return os.MkdirAll(target, 0o755) + } + + return copyFile(path, target) + }) +} + +func copyFile(src, dst string) error { + sf, err := os.Open(src) + if err != nil { + return fmt.Errorf("opening source %s: %w", src, err) + } + defer func() { + if closeErr := sf.Close(); closeErr != nil { + // Best-effort close; source is read-only + _ = closeErr + } + }() + + info, err := sf.Stat() + if err != nil { + return fmt.Errorf("stat source %s: %w", src, err) + } + + df, err := os.OpenFile(dst, os.O_CREATE|os.O_WRONLY|os.O_TRUNC, info.Mode()) + if err != nil { + return fmt.Errorf("creating dest %s: %w", dst, err) + } + + if _, err := io.Copy(df, sf); err != nil { + if closeErr := df.Close(); closeErr != nil { + // Log would be ideal but we don't have a logger here + _ = closeErr + } + return fmt.Errorf("copying %s → %s: %w", src, dst, err) + } + + return df.Close() +} diff --git a/pkg/native/native_darwin_test.go b/pkg/native/native_darwin_test.go new file mode 100644 index 00000000..99db5655 --- /dev/null +++ b/pkg/native/native_darwin_test.go @@ -0,0 +1,134 @@ +//go:build darwin + +package native + +import ( + "os" + "path/filepath" + "strings" + "testing" +) + +func TestGenerateSandboxProfile(t *testing.T) { + jobDir := "/tmp/test-native/job123" + dataDir := "/var/lib/ephemerd" + + profile := GenerateSandboxProfile(jobDir, dataDir) + + // Verify the profile contains key deny rules + checks := []struct { + desc string + substr string + }{ + {"allows DNS UDP", `(allow network-outbound (remote udp "localhost:53"))`}, + {"allows DNS TCP", `(allow network-outbound (remote tcp "localhost:53"))`}, + {"blocks localhost", `(deny network-outbound 
(remote ip "localhost:*"))`}, + {"blocks port binding", `(deny network-bind (local ip "*:*"))`}, + {"blocks SSH dir", `(deny file-read* (subpath`}, + {"blocks config.toml", `(deny file-read* (literal "/var/lib/ephemerd/config.toml"))`}, + {"blocks ephemerd socket", `(deny file-read* (literal "/var/lib/ephemerd/ephemerd.sock"))`}, + {"blocks VM dir", `(deny file-read* (subpath "/var/lib/ephemerd/vm"))`}, + {"blocks homebrew writes", `(deny file-write* (subpath "/opt/homebrew"))`}, + {"blocks Applications writes", `(deny file-write* (subpath "/Applications"))`}, + {"blocks /usr/local writes", `(deny file-write* (subpath "/usr/local"))`}, + {"allows job dir writes", `(allow file-write* (subpath "/tmp/test-native/job123"))`}, + {"allows /private/tmp writes", `(allow file-write* (subpath "/private/tmp"))`}, + } + + for _, c := range checks { + if !strings.Contains(profile, c.substr) { + t.Errorf("sandbox profile missing %s: expected substring %q", c.desc, c.substr) + } + } +} + +func TestNewCreatesWorkspace(t *testing.T) { + tmpDir := t.TempDir() + dataDir := filepath.Join(tmpDir, "data") + runnerSrc := filepath.Join(tmpDir, "runner-src") + + // Create a minimal runner source dir + if err := os.MkdirAll(runnerSrc, 0o755); err != nil { + t.Fatal(err) + } + + r, err := New(dataDir, "test-job-42", "fake-jit-config", runnerSrc, nil) + if err != nil { + t.Fatalf("New() error: %v", err) + } + + // Verify expected directories exist + expectedDirs := []string{ + "home", + "tmp", + "work", + "runner", + filepath.Join("homebrew", "bin"), + filepath.Join("homebrew", "Cellar"), + "keychain", + } + for _, d := range expectedDirs { + path := filepath.Join(r.jobDir, d) + info, err := os.Stat(path) + if err != nil { + t.Errorf("expected directory %s to exist: %v", d, err) + continue + } + if !info.IsDir() { + t.Errorf("expected %s to be a directory", d) + } + } +} + +func TestCopyRunnerFiles(t *testing.T) { + tmpDir := t.TempDir() + src := filepath.Join(tmpDir, "src") + dst := 
filepath.Join(tmpDir, "dst") + + // Create source tree + if err := os.MkdirAll(filepath.Join(src, "subdir"), 0o755); err != nil { + t.Fatal(err) + } + if err := os.WriteFile(filepath.Join(src, "run.sh"), []byte("#!/bin/bash\necho hello"), 0o755); err != nil { + t.Fatal(err) + } + if err := os.WriteFile(filepath.Join(src, "subdir", "config.json"), []byte(`{"key":"val"}`), 0o644); err != nil { + t.Fatal(err) + } + + if err := os.MkdirAll(dst, 0o755); err != nil { + t.Fatal(err) + } + + if err := copyRunnerFiles(src, dst); err != nil { + t.Fatalf("copyRunnerFiles() error: %v", err) + } + + // Verify files were copied + checks := []struct { + path string + content string + }{ + {filepath.Join(dst, "run.sh"), "#!/bin/bash\necho hello"}, + {filepath.Join(dst, "subdir", "config.json"), `{"key":"val"}`}, + } + for _, c := range checks { + data, err := os.ReadFile(c.path) + if err != nil { + t.Errorf("expected file %s: %v", c.path, err) + continue + } + if string(data) != c.content { + t.Errorf("file %s content = %q, want %q", c.path, string(data), c.content) + } + } + + // Verify run.sh is executable + info, err := os.Stat(filepath.Join(dst, "run.sh")) + if err != nil { + t.Fatal(err) + } + if info.Mode()&0o100 == 0 { + t.Error("run.sh should be executable") + } +} diff --git a/pkg/native/native_other.go b/pkg/native/native_other.go new file mode 100644 index 00000000..eb64d8c8 --- /dev/null +++ b/pkg/native/native_other.go @@ -0,0 +1,33 @@ +//go:build !darwin + +package native + +import ( + "context" + "fmt" + "log/slog" +) + +// Runner is a stub on non-darwin platforms. +type Runner struct{} + +// New returns an error on non-darwin platforms. +func New(_, _, _, _ string, _ *slog.Logger) (*Runner, error) { + return nil, fmt.Errorf("native macOS runner is only supported on darwin") +} + +// SetRunAsUser is a stub on non-darwin platforms. +func (r *Runner) SetRunAsUser(_ string) {} + +// Start is a stub on non-darwin platforms. 
+func (r *Runner) Start(_ context.Context) error { + return fmt.Errorf("native macOS runner is only supported on darwin") +} + +// Wait is a stub on non-darwin platforms. +func (r *Runner) Wait() (int, error) { + return -1, fmt.Errorf("native macOS runner is only supported on darwin") +} + +// Stop is a stub on non-darwin platforms. +func (r *Runner) Stop() {} diff --git a/pkg/runner/runner.go b/pkg/runner/runner.go index 8ff0bc2c..519e9d9d 100644 --- a/pkg/runner/runner.go +++ b/pkg/runner/runner.go @@ -35,9 +35,11 @@ func New(dataDir string, log *slog.Logger) *Manager { } // Dir returns the path to the extracted runner directory. +// The path is OS-specific (e.g. runners/2.333.1-linux) so that macOS +// and Linux extractions don't collide on shared filesystems (virtio-fs). // Call Extract() first to ensure it exists. func (m *Manager) Dir() string { - return filepath.Join(m.dataDir, "runners", Version) + return filepath.Join(m.dataDir, "runners", Version+"-"+goruntime.GOOS) } // Entrypoint returns the runner entrypoint command for the current OS. diff --git a/pkg/runtime/runtime.go b/pkg/runtime/runtime.go index ded66b9d..ee2334b4 100644 --- a/pkg/runtime/runtime.go +++ b/pkg/runtime/runtime.go @@ -1296,8 +1296,8 @@ func isOfficialRunnerImage(image string) bool { // as foreign images and bind-mounts /actions-runner over the rootfs, // then runs /actions-runner/run.sh — which the image doesn't have, // so the entrypoint exits 127 ("command not found"). 
- "ephpm/ephemerd:runner-ci-linux-", - "docker.io/ephpm/ephemerd:runner-ci-linux-", + "ephpm/ephemerd:runner-ci-linux", + "docker.io/ephpm/ephemerd:runner-ci-linux", } { if strings.HasPrefix(image, prefix) { return true diff --git a/pkg/scheduler/handle_queued_test.go b/pkg/scheduler/handle_queued_test.go index 0cfaba05..4decc917 100644 --- a/pkg/scheduler/handle_queued_test.go +++ b/pkg/scheduler/handle_queued_test.go @@ -70,11 +70,12 @@ func TestHandleQueued_DrainNoClaim(t *testing.T) { // when MacOSVMConfig is nil but the job has macOS labels, the scheduler must // remove the seen entry so the next poll retries. The provider is never asked // to claim. -func TestHandleQueued_SkipsMacOSWithoutVMConfig(t *testing.T) { +func TestHandleQueued_SkipsMacOSWithoutVMOrNativeConfig(t *testing.T) { mp := newMockProvider("github") s := New(Config{ Providers: []providers.Provider{mp}, Log: testLogger(), + // No MacOSVMConfig and no MacOSModeForRepo — macOS jobs should be deferred }) event := providers.JobEvent{ @@ -88,13 +89,13 @@ func TestHandleQueued_SkipsMacOSWithoutVMConfig(t *testing.T) { s.handleQueued(context.Background(), event) if got := len(mp.claims); got != 0 { - t.Errorf("macOS job without VM config should not claim, got %d claims", got) + t.Errorf("macOS job without VM or native config should not claim, got %d claims", got) } s.mu.Lock() _, seen := s.seen[keyFor(event)] s.mu.Unlock() if seen { - t.Error("macOS job without VM config should be unsed so it retries on next poll") + t.Error("macOS job without VM or native config should be unseen so it retries on next poll") } } diff --git a/pkg/scheduler/scheduler.go b/pkg/scheduler/scheduler.go index 028d410f..52d931e4 100644 --- a/pkg/scheduler/scheduler.go +++ b/pkg/scheduler/scheduler.go @@ -17,6 +17,7 @@ import ( "github.com/ephpm/ephemerd/pkg/artifacts" "github.com/ephpm/ephemerd/pkg/metrics" "github.com/ephpm/ephemerd/pkg/names" + "github.com/ephpm/ephemerd/pkg/native" 
"github.com/ephpm/ephemerd/pkg/providers" "github.com/ephpm/ephemerd/pkg/runtime" "github.com/ephpm/ephemerd/pkg/tunnel" @@ -52,6 +53,11 @@ type Config struct { // and finally the runtime's host-aware default. Nil-safe. RunnerImageForRepo func(repo, os string) string + MaxNativeMac int // max concurrent native macOS jobs (default 4) + MacOSModeForRepo func(repo string) string // returns "native" or "vm" per repo (nil = always VM) + NativeMacUser string // non-root user for native macOS runner processes + RunnerDir string // path to extracted GHA runner binary dir (runner.Manager.Dir()) + Log *slog.Logger } @@ -103,10 +109,11 @@ type Scheduler struct { seen map[jobKey]time.Time // recently handled jobs for dedup pending map[jobKey]struct{} // jobs dispatched to a handler but not yet holding sem mu sync.Mutex - sem chan struct{} // local/native job concurrency limiter - linuxSem chan struct{} // Linux dispatch (VM) concurrency limiter - macSem chan struct{} // macOS VM concurrency limiter (Vz has a hard cap) - draining bool // true when shutting down, rejects new jobs + sem chan struct{} // local/native job concurrency limiter + linuxSem chan struct{} // Linux dispatch (VM) concurrency limiter + macSem chan struct{} // macOS VM concurrency limiter (Vz has a hard cap) + nativeMacSem chan struct{} // native macOS job concurrency limiter (separate from VM limit) + draining bool // true when shutting down, rejects new jobs startTime time.Time } @@ -158,9 +165,10 @@ type runningJob struct { image string cancel context.CancelFunc artifactsDir string // non-empty if OCI artifacts were extracted for this job - dispatched string // non-empty if dispatched to Linux VM worker (stores container name) - macosVM vm.MacOSVM // non-nil if running as a macOS VM job - startedAt time.Time + dispatched string // non-empty if dispatched to Linux VM worker (stores container name) + macosVM vm.MacOSVM // non-nil if running as a macOS VM job + nativeRunner interface{ Stop() } // non-nil if 
running as a native macOS job + startedAt time.Time } @@ -189,15 +197,21 @@ func New(cfg Config) *Scheduler { } } + nativeMac := cfg.MaxNativeMac + if nativeMac <= 0 { + nativeMac = 4 + } + return &Scheduler{ - cfg: cfg, - running: make(map[jobKey]*runningJob), - seen: make(map[jobKey]time.Time), - pending: make(map[jobKey]struct{}), - sem: make(chan struct{}, cfg.MaxConcurrent), - linuxSem: make(chan struct{}, cfg.MaxConcurrent), - macSem: make(chan struct{}, macVMs), - startTime: time.Now(), + cfg: cfg, + running: make(map[jobKey]*runningJob), + seen: make(map[jobKey]time.Time), + pending: make(map[jobKey]struct{}), + sem: make(chan struct{}, cfg.MaxConcurrent), + linuxSem: make(chan struct{}, cfg.MaxConcurrent), + macSem: make(chan struct{}, macVMs), + nativeMacSem: make(chan struct{}, nativeMac), + startTime: time.Now(), } } @@ -447,11 +461,10 @@ func (s *Scheduler) canHandleJob(jobLabels []string) bool { case "windows": osOK = goruntime.GOOS == "windows" case "macos", "macosx": - // macOS jobs need a per-job VM for isolation. Without - // MacOSVMConfig we refuse the job rather than fall back to - // running on the host — sharing the runner process tree with - // other jobs (and the daemon) is a non-starter for CI. - osOK = goruntime.GOOS == "darwin" && s.cfg.MacOSVMConfig != nil + // macOS jobs run in a per-job VM (default) or natively on + // the host (when configured for trusted repos). Accept if + // either VM config or native mode is available. + osOK = goruntime.GOOS == "darwin" && (s.cfg.MacOSVMConfig != nil || s.cfg.MacOSModeForRepo != nil) } } if !osOK { @@ -543,8 +556,14 @@ func (s *Scheduler) handleQueued(ctx context.Context, event providers.JobEvent) return } - // Route macOS-native jobs to per-job macOS VMs. + // Route macOS jobs to native runner or per-job VM. 
if isMacOSJob(event.Labels) { + // Native mode takes priority when configured for this repo + if s.cfg.MacOSModeForRepo != nil && s.cfg.MacOSModeForRepo(event.Repo) == "native" { + s.handleNativeMacOSJob(ctx, event) + return + } + // VM path s.mu.Lock() macCfg := s.cfg.MacOSVMConfig s.mu.Unlock() @@ -552,13 +571,13 @@ func (s *Scheduler) handleQueued(ctx context.Context, event providers.JobEvent) s.handleMacOSJob(ctx, event) return } - // macOS VM disk is still being provisioned — remove from seen/pending + // Neither native nor VM available — remove from seen/pending // so the next poll retries this job once the install finishes. s.mu.Lock() delete(s.seen, key) delete(s.pending, key) s.mu.Unlock() - log.Info("macOS VM disk not ready yet, deferring job") + log.Info("macOS runner not ready, deferring job") return } @@ -869,6 +888,131 @@ func (s *Scheduler) handleMacOSJob(ctx context.Context, event providers.JobEvent }() } +// handleNativeMacOSJob runs the GitHub Actions runner directly on the macOS +// host inside a sandbox. Used for trusted repos that don't need VM isolation. 
+func (s *Scheduler) handleNativeMacOSJob(ctx context.Context, event providers.JobEvent) { + jobID := event.JobID + key := keyFor(event) + log := s.cfg.Log.With("job_id", jobID, "repo", event.Repo, "platform", "macos-native") + + unsee := func() { + s.mu.Lock() + delete(s.seen, key) + delete(s.pending, key) + s.mu.Unlock() + } + + // Acquire native macOS concurrency slot (separate from VM sem) + select { + case s.nativeMacSem <- struct{}{}: + case <-ctx.Done(): + unsee() + return + } + s.mu.Lock() + delete(s.pending, key) + s.mu.Unlock() + + log.Info("provisioning native macOS runner for job") + + // Claim job with macOS labels + labels := buildLabelsForOS("darwin", s.cfg.Labels) + const maxNameRetries = 3 + claim, err := s.claimJob(ctx, &event, labels, log, maxNameRetries) + if err != nil { + log.Error("failed to claim job", "error", err) + unsee() + time.Sleep(backoffDuration(event.Repo)) + <-s.nativeMacSem + return + } + + // Create the native runner + nr, err := native.New(s.cfg.DataDir, fmt.Sprintf("%d", jobID), claim.RunnerConfig, s.cfg.RunnerDir, log) + if err != nil { + log.Error("failed to create native runner", "error", err) + if rmErr := event.Provider.ReleaseJob(ctx, claim); rmErr != nil { + log.Warn("failed to remove ghost runner", "runner_id", claim.RunnerID, "error", rmErr) + } + unsee() + <-s.nativeMacSem + return + } + if s.cfg.NativeMacUser != "" { + nr.SetRunAsUser(s.cfg.NativeMacUser) + } + + var jobCtx context.Context + var cancel context.CancelFunc + if s.cfg.JobTimeout > 0 { + jobCtx, cancel = context.WithTimeout(ctx, s.cfg.JobTimeout) + } else { + jobCtx, cancel = context.WithCancel(ctx) + } + + // Start the runner + if err := nr.Start(jobCtx); err != nil { + log.Error("failed to start native runner", "error", err) + nr.Stop() + if rmErr := event.Provider.ReleaseJob(ctx, claim); rmErr != nil { + log.Warn("failed to remove ghost runner", "runner_id", claim.RunnerID, "error", rmErr) + } + unsee() + cancel() + <-s.nativeMacSem + return + } + + 
// Track the running job + s.mu.Lock() + s.running[key] = &runningJob{ + provider: event.Provider, + claim: claim, + repo: event.Repo, + cancel: cancel, + nativeRunner: nr, + startedAt: time.Now(), + } + s.mu.Unlock() + metrics.JobsActive.Inc() + + log.Info("native macOS runner started", "name", claim.RunnerName) + + // Wait for the job to finish in the background + go func() { + defer func() { <-s.nativeMacSem }() + + exitCode, err := nr.Wait() + if err != nil { + if jobCtx.Err() != nil { + log.Warn("native macOS runner killed (timeout or shutdown)", "error", err) + } else { + log.Error("native macOS runner crashed", "error", err) + } + } else if exitCode != 0 { + log.Warn("native macOS runner exited with failure", "exit_code", exitCode) + } else { + log.Info("native macOS runner exited", "exit_code", exitCode) + } + + // Clean up + s.mu.Lock() + rj, exists := s.running[key] + if exists { + delete(s.running, key) + s.mu.Unlock() + nr.Stop() + if rj.provider != nil && rj.claim != nil { + if err := rj.provider.ReleaseJob(context.Background(), rj.claim); err != nil { + log.Debug("deregister runner after native macOS cleanup", "error", err) + } + } + } else { + s.mu.Unlock() + } + }() +} + // handleLocalJob provisions a runner using the local containerd Runtime. 
func (s *Scheduler) handleLocalJob(ctx context.Context, event providers.JobEvent) { jobID := event.JobID @@ -1084,6 +1228,8 @@ func (s *Scheduler) handleCompleted(ctx context.Context, event providers.JobEven job.cancel() if job.macosVM != nil { job.macosVM.Stop() + } else if job.nativeRunner != nil { + job.nativeRunner.Stop() } else if job.dispatched != "" && s.cfg.LinuxDispatcher != nil { if err := s.cfg.LinuxDispatcher.Destroy(context.Background(), job.dispatched); err != nil { log.Warn("failed to destroy dispatched runner", "error", err) @@ -1154,6 +1300,8 @@ func (s *Scheduler) destroyAll() { job.cancel() if job.macosVM != nil { job.macosVM.Stop() + } else if job.nativeRunner != nil { + job.nativeRunner.Stop() } else if job.dispatched != "" && s.cfg.LinuxDispatcher != nil { if err := s.cfg.LinuxDispatcher.Destroy(context.Background(), job.dispatched); err != nil { s.cfg.Log.Warn("failed to destroy dispatched runner", "job_id", key.JobID, "error", err) diff --git a/pkg/scheduler/scheduler_test.go b/pkg/scheduler/scheduler_test.go index ec591504..d98bfcca 100644 --- a/pkg/scheduler/scheduler_test.go +++ b/pkg/scheduler/scheduler_test.go @@ -1014,3 +1014,57 @@ func TestServeTunnelWithReconnect_CancelExitsCleanly(t *testing.T) { t.Fatal("serveTunnelWithReconnect did not exit after context cancel") } } + +// --- nativeMacSem tests --- + +func TestNew_NativeMacSemDefault(t *testing.T) { + s := New(Config{Log: testLogger()}) + if cap(s.nativeMacSem) != 4 { + t.Errorf("nativeMacSem capacity = %d, want default 4", cap(s.nativeMacSem)) + } +} + +func TestNew_NativeMacSemCustom(t *testing.T) { + s := New(Config{MaxNativeMac: 6, Log: testLogger()}) + if cap(s.nativeMacSem) != 6 { + t.Errorf("nativeMacSem capacity = %d, want 6", cap(s.nativeMacSem)) + } +} + +func TestNew_NativeMacSemNegative(t *testing.T) { + s := New(Config{MaxNativeMac: -1, Log: testLogger()}) + if cap(s.nativeMacSem) != 4 { + t.Errorf("nativeMacSem capacity = %d, want default 4", cap(s.nativeMacSem)) + 
} +} + +// --- canHandleJob with native mode --- + +func TestCanHandleJob_MacOSNativeMode(t *testing.T) { + if runtime.GOOS != "darwin" { + t.Skip("macOS-specific test") + } + + // Without VM config but with native mode function, should accept macOS jobs + s := New(Config{ + MacOSModeForRepo: func(_ string) string { return "native" }, + Log: testLogger(), + }) + + if !s.canHandleJob([]string{"self-hosted", "macos"}) { + t.Error("canHandleJob should accept macOS when MacOSModeForRepo is set") + } +} + +func TestCanHandleJob_MacOSNoConfig(t *testing.T) { + if runtime.GOOS != "darwin" { + t.Skip("macOS-specific test") + } + + // Without VM config and without native mode, should reject macOS jobs + s := New(Config{Log: testLogger()}) + + if s.canHandleJob([]string{"self-hosted", "macos"}) { + t.Error("canHandleJob should reject macOS when neither VMConfig nor MacOSModeForRepo is set") + } +} diff --git a/pkg/vm/macosvm_darwin.go b/pkg/vm/macosvm_darwin.go index 8420e41e..61fb95f6 100644 --- a/pkg/vm/macosvm_darwin.go +++ b/pkg/vm/macosvm_darwin.go @@ -491,9 +491,12 @@ func (m *darwinMacOSVM) monitorRunner(ctx context.Context, ip string) { } if err != nil { + m.cfg.Log.Debug("monitor pgrep error", "id", m.id, "error", err, "output", strings.TrimSpace(string(out))) continue } + m.cfg.Log.Debug("monitor pgrep result", "id", m.id, "output", strings.TrimSpace(string(out))) + if strings.TrimSpace(string(out)) == "EXITED" { // Give the runner a grace period to report results to GitHub // before we tear down the VM and network. From fd06625a535c084684c401f59f7c0b502c9114ea Mon Sep 17 00:00:00 2001 From: Luther Monson Date: Wed, 10 Jun 2026 19:28:26 -0700 Subject: [PATCH 6/7] fix(native): harden macOS sandbox isolation and use a dedicated group MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Security follow-ups from review of the native runner. 
Native jobs run directly on the host with no VM boundary, so the sandbox profile and unix permissions are the entire isolation story — two concrete holes closed here, plus one documented as needing live-macOS work. 1. Sibling-job + daemon-state isolation. Every native job runs as the same _ephemerd uid and all workspaces live under <data-dir>/native/, so a job could read a concurrent job's checkout token or source. The profile now denies read AND write of the whole <data-dir>/native subtree and re-allows only the job's own dir (sandbox-exec applies the last matching rule). config.toml, ephemerd.sock, and the vm dir gain write denies to match their existing read denies. 2. .ssh write hole. .ssh was read-denied but writable, leaving an authorized_keys append vector on any host where the runner uid can reach the target home. Now denied for write too. 3. Dedicated primary group instead of staff (gid 20). staff is the default group for every normal macOS account, so the runner process inherited group access to the many staff-group-owned files on a typical Mac. The service user now gets a dedicated _ephemerd group. Provisioning is best-effort: any failure falls back to staff (the previously-tested behavior), so a group hiccup never blocks jobs. Not done here (documented in a code comment as a follow-up): flipping the profile from allow-by-default to deny-by-default. That is the stronger posture for native execution but requires enumerating every path the GHA runner + toolchains touch and live-testing on macOS so jobs don't break — can't be verified blind from a non-macOS host. The LAN-egress gap (sandbox-exec has no CIDR support; pf rules still a follow-up) is unchanged and remains the reason native mode should stay restricted to trusted first-party repos.
--- pkg/native/native_darwin.go | 107 +++++++++++++++++++++++++++++-- pkg/native/native_darwin_test.go | 15 +++-- 2 files changed, 112 insertions(+), 10 deletions(-) diff --git a/pkg/native/native_darwin.go b/pkg/native/native_darwin.go index d76d4c1f..ab54db5b 100644 --- a/pkg/native/native_darwin.go +++ b/pkg/native/native_darwin.go @@ -29,7 +29,20 @@ var serviceUserMu sync.Mutex // deletion wedges opendirectoryd on modern macOS. const ServiceUserName = "_ephemerd" -// serviceUIDRange is scanned for a free UID when creating the service user. +// ServiceGroupName is a dedicated primary group for the service user. +// Using a dedicated group instead of staff (gid 20 — the default group for +// every normal macOS account) keeps the runner process from inheriting +// group access to the many files on a typical Mac that are staff-group +// owned. Falls back to staff if the group can't be created. +const ServiceGroupName = "_ephemerd" + +// staffGID is the macOS staff group, used as the fallback primary group +// when a dedicated service group can't be provisioned. +const staffGID = 20 + +// service{UID,GID} ranges are scanned for a free id when creating the +// service user/group. macOS reserves <500 for system accounts; 600-999 +// is the conventional band for added service accounts. const ( serviceUIDMin = 600 serviceUIDMax = 999 @@ -71,6 +84,11 @@ func (r *Runner) ensureServiceUser() (*syscall.Credential, error) { return nil, fmt.Errorf("no free UID in range %d-%d", serviceUIDMin, serviceUIDMax) } + // Resolve a dedicated primary group, falling back to staff (gid 20) + // if provisioning fails for any reason — that's the previously-tested + // behavior, so a group hiccup never blocks native jobs. + gid := r.ensureServiceGroup() + // NFSHomeDirectory is /var/empty (like _www and other service // accounts). 
Registering a real directory as a user home puts it // under macOS data protection — even root then can't delete it @@ -80,7 +98,7 @@ func (r *Runner) ensureServiceUser() (*syscall.Credential, error) { {"dscl", ".", "-create", "/Users/" + ServiceUserName}, {"dscl", ".", "-create", "/Users/" + ServiceUserName, "UserShell", "/bin/bash"}, {"dscl", ".", "-create", "/Users/" + ServiceUserName, "UniqueID", strconv.Itoa(uid)}, - {"dscl", ".", "-create", "/Users/" + ServiceUserName, "PrimaryGroupID", "20"}, // staff + {"dscl", ".", "-create", "/Users/" + ServiceUserName, "PrimaryGroupID", strconv.Itoa(gid)}, {"dscl", ".", "-create", "/Users/" + ServiceUserName, "NFSHomeDirectory", "/var/empty"}, {"dscl", ".", "-create", "/Users/" + ServiceUserName, "IsHidden", "1"}, } @@ -89,9 +107,62 @@ func (r *Runner) ensureServiceUser() (*syscall.Credential, error) { return nil, fmt.Errorf("%v: %s: %w", args, strings.TrimSpace(string(out)), err) } } - r.log.Info("created ephemerd service user", "user", ServiceUserName, "uid", uid) + r.log.Info("created ephemerd service user", "user", ServiceUserName, "uid", uid, "gid", gid) + + return &syscall.Credential{Uid: uint32(uid), Gid: uint32(gid)}, nil +} + +// ensureServiceGroup returns the gid of a dedicated _ephemerd primary +// group, creating it if needed. On any failure it logs a warning and +// returns staffGID (20) so native jobs keep working with the previously +// tested behavior. Caller holds serviceUserMu. 
+func (r *Runner) ensureServiceGroup() int { + if g, err := user.LookupGroup(ServiceGroupName); err == nil { + if gid, perr := strconv.Atoi(g.Gid); perr == nil { + return gid + } + } + + out, err := exec.Command("dscl", ".", "-list", "/Groups", "PrimaryGroupID").Output() + if err != nil { + r.log.Warn("listing groups for service group; falling back to staff", "error", err) + return staffGID + } + used := make(map[int]bool) + for _, line := range strings.Split(string(out), "\n") { + fields := strings.Fields(line) + if len(fields) == 2 { + if id, perr := strconv.Atoi(fields[1]); perr == nil { + used[id] = true + } + } + } + gid := 0 + for id := serviceUIDMin; id <= serviceUIDMax; id++ { + if !used[id] { + gid = id + break + } + } + if gid == 0 { + r.log.Warn("no free GID for service group; falling back to staff", "range", fmt.Sprintf("%d-%d", serviceUIDMin, serviceUIDMax)) + return staffGID + } - return &syscall.Credential{Uid: uint32(uid), Gid: 20}, nil + steps := [][]string{ + {"dscl", ".", "-create", "/Groups/" + ServiceGroupName}, + {"dscl", ".", "-create", "/Groups/" + ServiceGroupName, "PrimaryGroupID", strconv.Itoa(gid)}, + {"dscl", ".", "-create", "/Groups/" + ServiceGroupName, "RealName", "ephemerd native runners"}, + } + for _, args := range steps { + if out, err := exec.Command(args[0], args[1:]...).CombinedOutput(); err != nil { + r.log.Warn("creating service group; falling back to staff", + "step", strings.Join(args, " "), "output", strings.TrimSpace(string(out)), "error", err) + return staffGID + } + } + r.log.Info("created ephemerd service group", "group", ServiceGroupName, "gid", gid) + return gid } // lookupCredential resolves a username to a syscall.Credential for @@ -380,6 +451,13 @@ func GenerateSandboxProfile(jobDir, dataDir string) string { absJobDir, _ := filepath.Abs(jobDir) absDataDir, _ := filepath.Abs(dataDir) + // NOTE: this profile is allow-by-default with an explicit deny list. 
+ // For native (no-VM) execution the stronger posture is deny-by-default + // with an allow list, but that requires enumerating every path the GHA + // runner + toolchains legitimately touch and live-testing on macOS so + // jobs don't break. Tracked as a follow-up (see PR discussion). The + // denies below close the concrete job-to-job and job-to-daemon read + // holes that matter most on a shared host. return fmt.Sprintf(`(version 1) (allow default) @@ -401,18 +479,35 @@ func GenerateSandboxProfile(jobDir, dataDir string) string { ;; === Filesystem isolation === -;; Block sensitive host paths +;; Isolate this job from sibling jobs and ephemerd internal state. +;; All native job workspaces live under /native/, and +;; every native job runs as the same _ephemerd uid, so without this a +;; job could read a concurrent job's checkout token or source. Deny the +;; whole native subtree (read AND write); the job's own dir is re-allowed +;; below, and sandbox-exec applies the last matching rule. +(deny file-read* (subpath "%[2]s/native")) +(deny file-write* (subpath "%[2]s/native")) + +;; Block sensitive host paths entirely — read and write. .ssh was +;; previously read-only-denied, leaving a writable authorized_keys hole +;; on any host where the runner uid can reach the operator's home. (deny file-read* (subpath "%[1]s/.ssh")) +(deny file-write* (subpath "%[1]s/.ssh")) (deny file-read* (literal "%[2]s/config.toml")) +(deny file-write* (literal "%[2]s/config.toml")) (deny file-read* (literal "%[2]s/ephemerd.sock")) +(deny file-write* (literal "%[2]s/ephemerd.sock")) (deny file-read* (subpath "%[2]s/vm")) +(deny file-write* (subpath "%[2]s/vm")) ;; Block writes to shared tools (read-only access only) (deny file-write* (subpath "/opt/homebrew")) (deny file-write* (subpath "/Applications")) (deny file-write* (subpath "/usr/local")) -;; Allow writes to the job directory +;; Re-allow this job's own workspace (read + write). 
Placed AFTER the +;; native-subtree deny above so it wins for the job's own directory. +(allow file-read* (subpath "%[3]s")) (allow file-write* (subpath "%[3]s")) (allow file-write* (subpath "/private/tmp")) `, os.Getenv("HOME"), absDataDir, absJobDir) diff --git a/pkg/native/native_darwin_test.go b/pkg/native/native_darwin_test.go index 99db5655..914ac332 100644 --- a/pkg/native/native_darwin_test.go +++ b/pkg/native/native_darwin_test.go @@ -24,13 +24,20 @@ func TestGenerateSandboxProfile(t *testing.T) { {"allows DNS TCP", `(allow network-outbound (remote tcp "localhost:53"))`}, {"blocks localhost", `(deny network-outbound (remote ip "localhost:*"))`}, {"blocks port binding", `(deny network-bind (local ip "*:*"))`}, - {"blocks SSH dir", `(deny file-read* (subpath`}, - {"blocks config.toml", `(deny file-read* (literal "/var/lib/ephemerd/config.toml"))`}, - {"blocks ephemerd socket", `(deny file-read* (literal "/var/lib/ephemerd/ephemerd.sock"))`}, - {"blocks VM dir", `(deny file-read* (subpath "/var/lib/ephemerd/vm"))`}, + {"blocks sibling job reads", `(deny file-read* (subpath "/var/lib/ephemerd/native"))`}, + {"blocks sibling job writes", `(deny file-write* (subpath "/var/lib/ephemerd/native"))`}, + {"blocks SSH dir reads", `(deny file-read* (subpath`}, + {"blocks SSH dir writes", `(deny file-write* (subpath`}, + {"blocks config.toml reads", `(deny file-read* (literal "/var/lib/ephemerd/config.toml"))`}, + {"blocks config.toml writes", `(deny file-write* (literal "/var/lib/ephemerd/config.toml"))`}, + {"blocks ephemerd socket reads", `(deny file-read* (literal "/var/lib/ephemerd/ephemerd.sock"))`}, + {"blocks ephemerd socket writes", `(deny file-write* (literal "/var/lib/ephemerd/ephemerd.sock"))`}, + {"blocks VM dir reads", `(deny file-read* (subpath "/var/lib/ephemerd/vm"))`}, + {"blocks VM dir writes", `(deny file-write* (subpath "/var/lib/ephemerd/vm"))`}, {"blocks homebrew writes", `(deny file-write* (subpath "/opt/homebrew"))`}, {"blocks Applications 
writes", `(deny file-write* (subpath "/Applications"))`}, {"blocks /usr/local writes", `(deny file-write* (subpath "/usr/local"))`}, + {"re-allows job dir reads", `(allow file-read* (subpath "/tmp/test-native/job123"))`}, {"allows job dir writes", `(allow file-write* (subpath "/tmp/test-native/job123"))`}, {"allows /private/tmp writes", `(allow file-write* (subpath "/private/tmp"))`}, } From dd8c657487b9915879c8161515044b92769bc5fe Mon Sep 17 00:00:00 2001 From: Luther Monson Date: Sun, 14 Jun 2026 12:30:02 -0700 Subject: [PATCH 7/7] fix(native): correct sandbox filesystem rules so the runner can start MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The hardened sandbox blocked the GHA runner from starting. Three distinct macOS sandbox-exec behaviors, each found via local repro: 1. deny file-read* on the native subtree blocked file-read-metadata, which realpath() needs to traverse through native/ to the job dir. The .NET host died with "Failed to resolve full path of the current executable" (exit 133). Fixed: deny only file-read-data. 2. getcwd() and bash walk UP from the job's runner dir and must readdir(native/) to learn the job-id component name; the read-data deny on the native subtree blocked that, giving "getcwd: cannot access parent directories" and "run.sh: Operation not permitted" (exit 126). Fixed: allow file-read-data on the native dir node (literal) — leaks only the non-secret list of concurrent job ids. 3. macOS sandbox resolves a specific-operation deny (file-read-data) over a later wildcard allow (file-read*), so the per-job re-allow must name file-read-data explicitly to win. Added an explicit file-read-data re-allow on the job subtree alongside file-read*. Job-to-job isolation is preserved: a sibling job's directory listing and file contents stay denied (verified). Smoke-test jobs now run end-to-end as _ephemerd with all steps green. 
Co-Authored-By: Claude Opus 4.6 --- pkg/native/native_darwin.go | 52 +++++++++++++++++---- pkg/native/native_darwin_test.go | 78 ++++++++++++++++++++++++++------ 2 files changed, 105 insertions(+), 25 deletions(-) diff --git a/pkg/native/native_darwin.go b/pkg/native/native_darwin.go index ab54db5b..8e30795b 100644 --- a/pkg/native/native_darwin.go +++ b/pkg/native/native_darwin.go @@ -447,9 +447,23 @@ func (r *Runner) deleteKeychain() { // GenerateSandboxProfile returns a macOS sandbox profile that restricts // the runner process. Paths are templated with the job-specific directories. func GenerateSandboxProfile(jobDir, dataDir string) string { - // Resolve to absolute paths for the sandbox profile - absJobDir, _ := filepath.Abs(jobDir) - absDataDir, _ := filepath.Abs(dataDir) + // Resolve to absolute, symlink-free paths. The sandbox matches against + // kernel (resolved) paths: /var and /tmp are symlinks to /private/var + // and /private/tmp on macOS, so rules written with the unresolved + // config paths (e.g. /var/lib/ephemerd/...) silently never match. + resolve := func(p string) string { + abs, err := filepath.Abs(p) + if err != nil { + abs = p + } + if real, err := filepath.EvalSymlinks(abs); err == nil { + return real + } + return abs + } + absJobDir := resolve(jobDir) + absDataDir := resolve(dataDir) + homeDir := resolve(os.Getenv("HOME")) // NOTE: this profile is allow-by-default with an explicit deny list. // For native (no-VM) execution the stronger posture is deny-by-default @@ -482,12 +496,25 @@ func GenerateSandboxProfile(jobDir, dataDir string) string { ;; Isolate this job from sibling jobs and ephemerd internal state. ;; All native job workspaces live under /native/, and ;; every native job runs as the same _ephemerd uid, so without this a -;; job could read a concurrent job's checkout token or source. Deny the -;; whole native subtree (read AND write); the job's own dir is re-allowed -;; below, and sandbox-exec applies the last matching rule. 
-(deny file-read* (subpath "%[2]s/native")) +;; job could read a concurrent job's checkout token or source. +;; +;; Deny file-read-DATA (not file-read*) on the native subtree: on a +;; directory that blocks readdir (can't list a sibling's contents), on a +;; file it blocks reading contents. file-read-metadata stays allowed so +;; lstat/realpath path resolution can traverse THROUGH native/ — denying +;; metadata breaks the .NET host with "Failed to resolve full path of the +;; current executable" (exit 133). +(deny file-read-data (subpath "%[2]s/native")) (deny file-write* (subpath "%[2]s/native")) +;; Re-allow reading the native directory NODE itself (not its children). +;; getcwd() and bash walk UP from the job's runner dir and must readdir +;; native/ to learn the job-id component name; without this they fail +;; with "getcwd: cannot access parent directories" and run.sh won't exec. +;; This leaks the list of concurrent job-id directory names (not their +;; contents) — job ids are not secret. +(allow file-read-data (literal "%[2]s/native")) + ;; Block sensitive host paths entirely — read and write. .ssh was ;; previously read-only-denied, leaving a writable authorized_keys hole ;; on any host where the runner uid can reach the operator's home. @@ -505,12 +532,17 @@ func GenerateSandboxProfile(jobDir, dataDir string) string { (deny file-write* (subpath "/Applications")) (deny file-write* (subpath "/usr/local")) -;; Re-allow this job's own workspace (read + write). Placed AFTER the -;; native-subtree deny above so it wins for the job's own directory. +;; Re-allow this job's own workspace (read + write). The explicit +;; file-read-data is required IN ADDITION to file-read*: macOS sandbox +;; resolves a specific-operation deny (the file-read-data deny on the +;; native subtree above) over a later wildcard allow (file-read*), so the +;; read-data re-allow must name the operation explicitly to win for this +;; job's own files. 
(allow file-read* (subpath "%[3]s")) +(allow file-read-data (subpath "%[3]s")) (allow file-write* (subpath "%[3]s")) (allow file-write* (subpath "/private/tmp")) -`, os.Getenv("HOME"), absDataDir, absJobDir) +`, homeDir, absDataDir, absJobDir) } // symlinkHomebrew creates symlinks from /opt/homebrew/bin/* into the diff --git a/pkg/native/native_darwin_test.go b/pkg/native/native_darwin_test.go index 914ac332..e22d0a9b 100644 --- a/pkg/native/native_darwin_test.go +++ b/pkg/native/native_darwin_test.go @@ -10,35 +10,52 @@ import ( ) func TestGenerateSandboxProfile(t *testing.T) { - jobDir := "/tmp/test-native/job123" - dataDir := "/var/lib/ephemerd" + // Use real directories: the profile resolves symlinks (e.g. /var → + // /private/var) so rules match the kernel's view of the paths. The + // expected strings must be the resolved forms. + base := t.TempDir() + dataDir := filepath.Join(base, "data") + jobDir := filepath.Join(dataDir, "native", "job123") + if err := os.MkdirAll(jobDir, 0o755); err != nil { + t.Fatal(err) + } + + resolvedData, err := filepath.EvalSymlinks(dataDir) + if err != nil { + t.Fatal(err) + } + resolvedJob, err := filepath.EvalSymlinks(jobDir) + if err != nil { + t.Fatal(err) + } profile := GenerateSandboxProfile(jobDir, dataDir) - // Verify the profile contains key deny rules checks := []struct { - desc string - substr string + desc string + substr string }{ {"allows DNS UDP", `(allow network-outbound (remote udp "localhost:53"))`}, {"allows DNS TCP", `(allow network-outbound (remote tcp "localhost:53"))`}, {"blocks localhost", `(deny network-outbound (remote ip "localhost:*"))`}, {"blocks port binding", `(deny network-bind (local ip "*:*"))`}, - {"blocks sibling job reads", `(deny file-read* (subpath "/var/lib/ephemerd/native"))`}, - {"blocks sibling job writes", `(deny file-write* (subpath "/var/lib/ephemerd/native"))`}, + {"blocks sibling job read-data", `(deny file-read-data (subpath "` + resolvedData + `/native"))`}, + {"blocks sibling job 
writes", `(deny file-write* (subpath "` + resolvedData + `/native"))`}, + {"allows native dir node read (getcwd)", `(allow file-read-data (literal "` + resolvedData + `/native"))`}, {"blocks SSH dir reads", `(deny file-read* (subpath`}, {"blocks SSH dir writes", `(deny file-write* (subpath`}, - {"blocks config.toml reads", `(deny file-read* (literal "/var/lib/ephemerd/config.toml"))`}, - {"blocks config.toml writes", `(deny file-write* (literal "/var/lib/ephemerd/config.toml"))`}, - {"blocks ephemerd socket reads", `(deny file-read* (literal "/var/lib/ephemerd/ephemerd.sock"))`}, - {"blocks ephemerd socket writes", `(deny file-write* (literal "/var/lib/ephemerd/ephemerd.sock"))`}, - {"blocks VM dir reads", `(deny file-read* (subpath "/var/lib/ephemerd/vm"))`}, - {"blocks VM dir writes", `(deny file-write* (subpath "/var/lib/ephemerd/vm"))`}, + {"blocks config.toml reads", `(deny file-read* (literal "` + resolvedData + `/config.toml"))`}, + {"blocks config.toml writes", `(deny file-write* (literal "` + resolvedData + `/config.toml"))`}, + {"blocks ephemerd socket reads", `(deny file-read* (literal "` + resolvedData + `/ephemerd.sock"))`}, + {"blocks ephemerd socket writes", `(deny file-write* (literal "` + resolvedData + `/ephemerd.sock"))`}, + {"blocks VM dir reads", `(deny file-read* (subpath "` + resolvedData + `/vm"))`}, + {"blocks VM dir writes", `(deny file-write* (subpath "` + resolvedData + `/vm"))`}, {"blocks homebrew writes", `(deny file-write* (subpath "/opt/homebrew"))`}, {"blocks Applications writes", `(deny file-write* (subpath "/Applications"))`}, {"blocks /usr/local writes", `(deny file-write* (subpath "/usr/local"))`}, - {"re-allows job dir reads", `(allow file-read* (subpath "/tmp/test-native/job123"))`}, - {"allows job dir writes", `(allow file-write* (subpath "/tmp/test-native/job123"))`}, + {"re-allows job dir reads", `(allow file-read* (subpath "` + resolvedJob + `"))`}, + {"re-allows job dir read-data", `(allow file-read-data (subpath "` + 
resolvedJob + `"))`}, + {"allows job dir writes", `(allow file-write* (subpath "` + resolvedJob + `"))`}, {"allows /private/tmp writes", `(allow file-write* (subpath "/private/tmp"))`}, } @@ -49,6 +66,37 @@ func TestGenerateSandboxProfile(t *testing.T) { } } +// TestGenerateSandboxProfile_ResolvesSymlinks pins the /var → /private/var +// gotcha: a profile written with unresolved paths silently never matches. +func TestGenerateSandboxProfile_ResolvesSymlinks(t *testing.T) { + base := t.TempDir() + realData := filepath.Join(base, "real-data") + jobDir := filepath.Join(realData, "native", "j1") + if err := os.MkdirAll(jobDir, 0o755); err != nil { + t.Fatal(err) + } + linkData := filepath.Join(base, "link-data") + if err := os.Symlink(realData, linkData); err != nil { + t.Fatal(err) + } + + resolvedData, err := filepath.EvalSymlinks(realData) + if err != nil { + t.Fatal(err) + } + + // Generate using the SYMLINK path — the profile must contain the + // resolved target, and not rules pointing at the symlink. + profile := GenerateSandboxProfile(filepath.Join(linkData, "native", "j1"), linkData) + + if !strings.Contains(profile, `(deny file-read-data (subpath "`+resolvedData+`/native"))`) { + t.Errorf("profile should deny the RESOLVED native path %q, got:\n%s", resolvedData, profile) + } + if strings.Contains(profile, `(subpath "`+linkData+`/native")`) { + t.Errorf("profile must not reference the unresolved symlink path %q", linkData) + } +} + func TestNewCreatesWorkspace(t *testing.T) { tmpDir := t.TempDir() dataDir := filepath.Join(tmpDir, "data")