diff --git a/cmd/ephemerd/main.go b/cmd/ephemerd/main.go index de1350e3..eee4bb5f 100644 --- a/cmd/ephemerd/main.go +++ b/cmd/ephemerd/main.go @@ -547,6 +547,10 @@ func serve(ctx context.Context, configFile, imagesDirFlag string, containerdTCPP ShutdownTimeout: cfg.Runner.ParsedShutdownTimeout(), LogRetention: cfg.Log.LogRetentionDuration(), RunnerImageForRepo: cfg.Runner.ImageForRepoOS, + MaxNativeMac: cfg.Runner.MacOS.ResolvedMaxNative(), + MacOSModeForRepo: cfg.Runner.MacOS.ModeForRepo, + NativeMacUser: cfg.Runner.MacOS.User, + RunnerDir: rm.Dir(), Log: log, }) diff --git a/docs/arch/native-macos-runner-summary.md b/docs/arch/native-macos-runner-summary.md new file mode 100644 index 00000000..64d03d03 --- /dev/null +++ b/docs/arch/native-macos-runner-summary.md @@ -0,0 +1,101 @@ +# Native macOS Runner for ephemerd + +## Problem + +macOS jobs currently run in per-job Virtualization.framework VMs. This works but has hard limits: + +- Apple restricts macOS VMs to **2 concurrent instances** per host +- Each VM needs **4+ GB RAM** +- VM boot adds **10-15 seconds** of overhead per job +- An 8 GB Mac mini can run at most **2 concurrent macOS jobs** + +## Solution + +A new **native** execution mode that runs the GitHub Actions runner directly on the host. For trusted repos that don't need VM-level isolation (internal CI, Xcode builds, Go tests), this enables: + +- **4-6+ concurrent jobs** on the same hardware (configurable) +- **Zero boot overhead** — fork+exec, not VM boot +- **~200 MB per job** instead of 4+ GB + +The VM path is untouched — this is purely additive. Mode is configured per-repo. 
+ +## Config + +```toml +[runner.macos] +mode = "vm" # default for repos not listed below +max_native = 4 # max concurrent native jobs + +[runner.macos.repos] +"ephpm/*" = "native" # whole org runs native +"ephpm/secret-repo" = "vm" # except this one (exact match wins over wildcard) +"someuser/ephemerd" = "vm" # fork stays on VM +``` + +Resolution order: exact `org/repo` match > `org/*` wildcard > top-level mode > default `"vm"`. + +## How it works + +Each native job gets its own isolated workspace: + +``` +/native// + ├── home/ → $HOME + ├── tmp/ → $TMPDIR + ├── work/ → runner _work directory + ├── runner/ → per-job copy of the GHA runner binary + ├── homebrew/ → per-job Homebrew prefix (symlinks to host /opt/homebrew) + └── keychain/ → per-job macOS keychain +``` + +### Isolation layers + +| Layer | How | +|-------|-----| +| Filesystem | Per-job HOME/TMPDIR/workdir. Sandbox blocks writes to `/opt/homebrew`, `/Applications`, `/usr/local`. Sensitive paths (SSH keys, ephemerd config, VM assets) blocked entirely. | +| Processes | `setpgid` puts runner + children in own process group. Killed via `kill(-pgid)` on cleanup. | +| Network | `sandbox-exec` blocks localhost outbound (prevents reaching ephemerd control socket or other jobs) and blocks port binding (prevents inter-job communication). DNS allowed. Public internet allowed. | +| Secrets | Per-job keychain created/destroyed. Environment cleared. | +| Homebrew | Host `/opt/homebrew` is read-only. Per-job prefix for `brew install` — installs are isolated and destroyed with the job. | + +The runner is launched via macOS `sandbox-exec`, which is kernel-enforced and inherited by all child processes. + +## Concurrency + +A separate semaphore (`nativeMacSem`) gates native jobs independently from VM jobs (`macSem`). A host can run **2 VM jobs + 4 native jobs simultaneously** if both modes are in use. + +## Scheduler flow + +``` +handleQueued + └─ isMacOSJob? 
+ └─ ModeForRepo == "native" → handleNativeMacOSJob + │ └─ acquire nativeMacSem + │ └─ claimJob (register JIT runner with GitHub) + │ └─ native.New → copy runner, generate sandbox, setup env + │ └─ native.Start → sandbox-exec ./run.sh --jitconfig + │ └─ native.Wait → block until job completes + │ └─ native.Stop → kill process group, delete keychain, rm workspace + │ └─ ReleaseJob (deregister runner) + │ + └─ ModeForRepo == "vm" → handleMacOSJob (existing, unchanged) + └─ acquire macSem + └─ boot Virtualization.framework VM +``` + +## What's left + +- **Private network blocking** (10.x, 172.16.x, 192.168.x): `sandbox-exec` doesn't support CIDR notation. Needs `pf` firewall rules — separate follow-up. +- **Resource limits**: macOS has no cgroups. A runaway build can starve others. Mitigated with `nice`/`ulimit` in a future iteration. +- **No per-job user isolation**: all jobs run as the same macOS user. Jobs can see each other's PIDs via `ps` but can't interact (sandbox blocks sensitive files and network). + +## Comparison + +| | Native | VM | +|--|--------|-----| +| Boot time | ~0s | 10-15s | +| Memory per job | ~200 MB | 4+ GB | +| Max concurrent (8 GB mini) | 4-6 | 2 | +| Isolation | Sandbox + process group | Full VM | +| Security | Trusted repos only | Untrusted OK | +| Apple VM limit | N/A | 2 per host | diff --git a/docs/arch/native-macos-runner.md b/docs/arch/native-macos-runner.md new file mode 100644 index 00000000..e5bd3ad1 --- /dev/null +++ b/docs/arch/native-macos-runner.md @@ -0,0 +1,395 @@ +# Native macOS Runner + +> **Status: implemented.** See `pkg/native/`. Notable deviations from this +> proposal, discovered during implementation: +> +> - **Privilege dropping**: jobs run as a hidden `_ephemerd` service user +> (created lazily, like `_www`), not as the daemon's root user. 
Per-job +> ephemeral users were attempted but abandoned: macOS user *deletion* +> via dscl/sysadminctl requires Full Disk Access and wedges +> opendirectoryd without it, while creation works fine. +> - **Sandbox network rules**: `sandbox-exec` does not support CIDR +> notation (`10.0.0.0/8`). The profile blocks localhost outbound and all +> port binding; RFC1918 blocking needs pf firewall rules (follow-up). +> - **DEVELOPER_DIR** is resolved via `xcode-select -p` instead of +> hardcoding the Xcode.app path (hosts with only CLT broke otherwise). +> - **Runner extraction** is OS-suffixed (`runners/-`) so the +> macOS host and Linux VM don't collide on the shared data dir. + +## Problem + +macOS jobs currently run in per-job Virtualization.framework VMs (APFS +clone-on-write from a base image). This works but has hard limits: + +- Apple restricts macOS VMs to **2 concurrent instances** per host. +- Each VM needs **4 GB+ RAM** (2 GB absolute minimum, unusable in practice). +- An 8 GB Mac mini can run at most **2 concurrent macOS jobs**. +- VM boot adds **10-15 seconds** of overhead per job. + +For repos that don't need VM-level isolation (trusted internal CI, Xcode +builds, Go tests), a native execution mode that runs the GitHub Actions +runner directly on the host would allow **4-6+ concurrent jobs** on the +same hardware with zero boot overhead. + +## Proposal + +Add a **native** macOS execution mode alongside the existing VM mode. +The mode is configured per-repo. The VM path is untouched -- this is +purely additive. + +## Config design + +A new `[runner.macos]` section controls macOS job routing. It lives under +`[runner]` (not `[vm.macos]`) because native jobs don't involve VMs. + +```toml +[runner.macos] +mode = "vm" # default mode: "vm" or "native" +max_native = 4 # max concurrent native jobs (no Apple limit applies) + +# Per-repo overrides. Repo name matches github.repos entries. 
+[runner.macos.repos] +php-sdk = "native" +ephemerd = "native" +# Repos not listed here inherit the top-level mode. +``` + +Config struct additions in `pkg/config/config.go`: + +```go +type RunnerConfig struct { + // ... existing fields ... + MacOS MacOSRunnerConfig `toml:"macos"` +} + +type MacOSRunnerConfig struct { + Mode string `toml:"mode"` // "vm" (default) or "native" + MaxNative int `toml:"max_native"` // max concurrent native jobs (default 4) + Repos map[string]string `toml:"repos"` // repo -> "vm" or "native" +} +``` + +`MacOSRunnerConfig.ModeForRepo(repo)` returns `"native"` or `"vm"` by +checking the per-repo map first, then falling back to the top-level mode, +then defaulting to `"vm"`. + +### Why not extend `[runner.images]`? + +`[runner.images]` maps repos to OCI container images. Native macOS jobs +don't use container images at all -- they run directly on the host. Mixing +these two concepts in the same config block would be confusing. + +## Scheduler flow + +`handleQueued` already routes macOS jobs to `handleMacOSJob`. The change +adds a branch at the top of `handleMacOSJob`: + +``` +handleQueued + └─ isMacOSJob? + └─ handleMacOSJob + ├─ ModeForRepo == "native" → handleNativeMacOSJob (new) + │ └─ acquire nativeMacSem (max_native) + └─ ModeForRepo == "vm" → existing VM path + └─ acquire macSem (max 2) +``` + +A new semaphore `nativeMacSem` (capacity = `max_native`) is separate from +the existing `macSem` (VM concurrency, capped at 2 by Apple). This means +a host can run 2 VM jobs + 4 native jobs simultaneously if both modes are +in use. + +The `canHandleJob` check for `"macos"` labels also needs updating: +currently it requires `MacOSVMConfig != nil`. With native mode, macOS jobs +are handleable on darwin hosts even without a VM disk image, as long as +the runner config allows native mode for that repo. + +## Native runner lifecycle + +New package: `pkg/native/native_darwin.go` (build-tagged `darwin`). + +### 1. 
Create workspace + +``` +/native// + ├── home/ → $HOME for the job + ├── tmp/ → $TMPDIR for the job + ├── work/ → runner _work directory + └── runner/ → per-job copy of the GHA runner binary +``` + +The runner is extracted from the embedded `pkg/runner` tarball into the +per-job directory. This is the same runner binary used by the VM path, +just extracted to a different location. + +### 2. Set up environment + +```go +env := []string{ + "HOME=" + jobHome, + "TMPDIR=" + jobTmp, + "RUNNER_WORK_FOLDER=" + jobWork, + "PATH=/opt/homebrew/bin:/usr/local/bin:/usr/bin:/bin:/usr/sbin:/sbin", + // Xcode: use host's installation + "DEVELOPER_DIR=/Applications/Xcode.app/Contents/Developer", +} +``` + +Host tooling (`/opt/homebrew`, `/Applications/Xcode.app`, `/usr/local`) +is shared read-only by virtue of the OS -- no bind mounts needed. Each +job just gets its own HOME/TMPDIR/work directory so outputs don't collide. + +### 3. Start runner + +```go +cmd := exec.CommandContext(ctx, "./run.sh", "--jitconfig", jitConfig) +cmd.Dir = runnerDir +cmd.Env = env +cmd.Stdout = logFile +cmd.Stderr = logFile +cmd.SysProcAttr = &syscall.SysProcAttr{Setpgid: true} // own process group +err := cmd.Start() +``` + +`Setpgid: true` puts the runner and all its children in a new process +group so we can `kill(-pgid, SIGTERM)` on cleanup. + +### 4. Wait for exit + +Block on `cmd.Wait()`. Return the exit code. + +### 5. Cleanup + +1. Kill the process group (`syscall.Kill(-pgid, SIGKILL)`) if still alive. +2. `pkill -9 -P ` as a fallback for any orphaned children. +3. `os.RemoveAll(jobDir)` to delete the workspace. +4. Deregister the runner from the provider. 
+ +## Isolation model + +| Layer | Native | VM | +|-------|--------|----| +| Filesystem | Per-job HOME/TMPDIR/workdir + sandbox deny on sensitive paths | Full disk clone | +| Processes | Process group (`setpgid`), killed on cleanup | Separate kernel | +| Network | Sandbox: deny RFC1918/localhost outbound + deny port binding | NAT with firewall | +| Users | Shared macOS user | Isolated user per VM | +| Secrets | Sandbox denies read on key paths, env cleared on exit | VM memory destroyed | + +### Sandbox profile (required for native mode) + +Every native job runs under `sandbox-exec -f `. The sandbox +is **inherited by all child processes** and **enforced by the kernel**. +No process can escape it without root. + +The profile is generated per-job (to include the job-specific directory +paths) and written to the job workspace: + +```scheme +(version 1) +(allow default) + +;; === Network isolation === + +;; Block outbound to private networks +(deny network-outbound (remote ip "localhost:*")) +(deny network-outbound (remote ip "10.0.0.0/8:*")) +(deny network-outbound (remote ip "172.16.0.0/12:*")) +(deny network-outbound (remote ip "192.168.0.0/16:*")) +(deny network-outbound (remote ip "169.254.0.0/16:*")) + +;; Block binding to any port — prevents jobs from running servers +;; that other jobs could connect to. This closes the inter-job +;; localhost attack vector entirely. 
+(deny network-bind (local ip "*:*")) + +;; Allow DNS (required for public internet access) +(allow network-outbound (remote udp "*:53")) +(allow network-outbound (remote tcp "*:53")) + +;; === Filesystem isolation === + +;; Block sensitive host paths +(deny file-read* (subpath "/Users/luthermonson/.ssh")) +(deny file-read* (subpath "/config.toml")) +(deny file-read* (literal "/ephemerd.sock")) +(deny file-read* (subpath "/vm")) + +;; Block writes to shared tools (read-only access only) +(deny file-write* (subpath "/opt/homebrew")) +(deny file-write* (subpath "/Applications")) +(deny file-write* (subpath "/usr/local")) + +;; Allow writes to the job directory only +(allow file-write* (subpath "")) +(allow file-write* (subpath "/private/tmp")) +``` + +In Go, the runner is launched as: + +```go +cmd := exec.CommandContext(ctx, "sandbox-exec", "-f", profilePath, + "./run.sh", "--jitconfig", jitConfig) +cmd.SysProcAttr = &syscall.SysProcAttr{Setpgid: true} +``` + +### What this provides + +- **Network isolation**: jobs cannot reach the LAN, other machines, or + the ephemerd control socket. Jobs cannot bind ports, so they cannot + communicate with each other via localhost. +- **DNS allowed**: jobs can resolve public hostnames and connect to + public internet (GitHub, package registries, etc.). +- **Filesystem write isolation**: jobs can only write to their own + workspace. Shared tools (`/opt/homebrew`, `/Applications`) are + read-only. Sensitive host files (SSH keys, config, VM assets) are + blocked entirely. +- **Process isolation**: `setpgid` + process group kill ensures no + orphaned processes survive between jobs. +- **Environment isolation**: each runner process gets a controlled set + of environment variables. No leakage from the daemon process. + +### Remaining limitations (accepted for trusted repos) + +- **No per-job user isolation.** All jobs run as the same macOS user. 
+ A job can `ps aux` and see other jobs' PIDs (but not interact with + them — the sandbox blocks sensitive files and network). +- **No resource limits.** macOS has no cgroups. A runaway build can + starve other jobs of CPU/memory. Mitigated with `nice` (CPU priority) + and `ulimit` (memory soft limit) on the runner process. +- **Read access to non-denied paths.** Jobs can read world-readable + files outside the deny list. The sandbox profile should be kept + up-to-date with any new sensitive paths. + +## Comparison table + +| Dimension | Native | VM | +|-----------|--------|----| +| Boot time | ~0s (fork+exec) | 10-15s | +| Memory per job | ~200 MB (runner process) | 4+ GB | +| Max concurrent (8 GB mini) | 4-6 | 2 | +| Isolation | Sandbox + process group + per-job directory | Full VM (separate kernel) | +| Network isolation | Sandbox: localhost outbound and port binding denied (no RFC1918 blocking yet) | NAT + firewall | +| Security | Trusted repos only | Untrusted OK | +| Xcode/Homebrew | Shared from host | Pre-installed in base image | +| Setup complexity | Low (just extract runner) | High (IPSW install, clone) | +| Apple VM limit | Not applicable | 2 per host | + +## What changes + +### `pkg/config/config.go` + +Add `MacOSRunnerConfig` struct to `RunnerConfig`. Add `ModeForRepo(repo)` +method. + +### `pkg/scheduler/scheduler.go` + +- Add `nativeMacSem chan struct{}` field to `Scheduler`. +- Initialize from `cfg.Runner.MacOS.MaxNative` (default 4). +- Update `canHandleJob`: accept macOS jobs on darwin even without + `MacOSVMConfig` when native mode is configured for the repo. +- Split `handleMacOSJob`: check `ModeForRepo` and route to + `handleNativeMacOSJob` or the existing VM path. 
+ +### New: `pkg/native/native_darwin.go` + +Native runner lifecycle: + +```go +type Runner struct { /* workspace paths, cmd, pgid */ } + +func New(dataDir string, jobID string, jitConfig string, log *slog.Logger) (*Runner, error) +func (r *Runner) Start(ctx context.Context) error +func (r *Runner) Wait(ctx context.Context) (int, error) +func (r *Runner) Stop() +``` + +A `native_other.go` stub returns errors on non-darwin platforms. + +### `cmd/ephemerd/runtime_darwin.go` + +Pass `cfg.Runner.MacOS` to the scheduler config so it can read per-repo +mode overrides. + +## Decisions + +### 1. Homebrew: per-job writable prefix over shared read-only base + +Jobs need `brew install` for build deps, but we can't let one job's +installs pollute another. The solution uses Homebrew's relocatable +architecture: + +**Host setup (one-time):** `/opt/homebrew` is pre-installed with common +tools (Go, mage, etc.) and marked read-only for the runner user. + +**Per-job overlay:** + +``` +/native// + └── homebrew/ → HOMEBREW_PREFIX, HOMEBREW_CELLAR, HOMEBREW_TEMP + ├── Cellar/ → per-job installs land here + ├── lib/ + ├── bin/ → symlinked from /opt/homebrew/bin at job start + └── Homebrew/ → lightweight Homebrew checkout (or symlink) +``` + +Environment for the runner process: + +```bash +HOMEBREW_PREFIX=/homebrew +HOMEBREW_CELLAR=/homebrew/Cellar +HOMEBREW_TEMP=/tmp +PATH=/homebrew/bin:/opt/homebrew/bin:/usr/local/bin:... +``` + +How it works: + +1. At job start, create `/homebrew/bin` and symlink all + executables from `/opt/homebrew/bin` into it. This gives the job + read access to pre-installed tools. +2. Set `HOMEBREW_PREFIX` and `HOMEBREW_CELLAR` to the per-job dir. + Any `brew install` writes to the job's Cellar, not the host's. +3. The job's `homebrew/bin` is first in PATH, so newly installed + formulas shadow the host versions if there's a conflict. +4. At job end, `rm -rf ` deletes everything — installs, + caches, temp files. 
+ +**Why not a full Homebrew clone?** Cloning the Homebrew repo takes +~10 seconds and ~500 MB. Symlinking the host's existing install is +instant and zero-copy. The job only needs a writable prefix for new +installs. + +**Why not just share `/opt/homebrew` read-write?** Jobs would step on +each other. One job upgrading a formula mid-build could break another +job. Per-job prefix keeps them independent. + +### 2. Keychain: per-job temporary keychain + +Each native job gets its own temporary keychain: + +```bash +KEYCHAIN=/keychain/job.keychain-db +security create-keychain -p "" "$KEYCHAIN" +security default-keychain -s "$KEYCHAIN" +security unlock-keychain -p "" "$KEYCHAIN" +``` + +At cleanup: + +```bash +security delete-keychain "$KEYCHAIN" +``` + +This prevents jobs from accessing each other's signing identities and +avoids polluting the host login keychain. Jobs that need code signing +import their certs into the per-job keychain via `security import` +(standard GitHub Actions pattern — `apple-actions/import-codesign-certs` +does exactly this). + +### 3. Concurrency: static config, default 4 + +`max_native = 4` is the default. Operators set it based on their +hardware. No auto-detection — the right value depends on workload +(CPU-heavy Xcode builds want fewer, lightweight Go tests want more). + +The value only caps native macOS jobs. Linux jobs (in the VM) and +macOS VM jobs have their own separate limits. diff --git a/docs/arch/upgrade-command.md b/docs/arch/upgrade-command.md new file mode 100644 index 00000000..c56cf1b8 --- /dev/null +++ b/docs/arch/upgrade-command.md @@ -0,0 +1,325 @@ +# `ephemerd upgrade`: In-Place Binary Update + +> **Status: proposal.** Not implemented. Scoping document — design, +> tradeoffs, and a work breakdown. Cost estimates are based on adjacent +> tooling (Tailscale's `tailscale update`, k0s `k0sctl apply`, Docker +> CE upgrade flows) and are best-guess until prototyped. 
+ +## Context + +Today, updating ephemerd on a host is a manual five-step ritual: + +1. `git pull` on the host (or copy a fresh tree). +2. `mage build:windows` (or `:macos` / current-OS variant), about 5 + minutes including the embedded Linux cross-compile. +3. `ephemerd stop`, wait for the process to actually exit (Windows + service shutdown races; binary stays locked for a beat). +4. Copy the new binary to `C:\Program Files\ephemerd\ephemerd.exe` + (or `/usr/local/bin/ephemerd` on Linux/macOS). +5. `ephemerd start`, poll for the in-VM ephemerd to come back up, + `grep` the console log to confirm the version baked in is the one + we just built. + +That works for one host. For three weekly iterations on one host it +becomes annoying. For a fleet — multiple hosts per org, plus the +~half-dozen test rigs the team would want to keep current — it doesn't +scale. + +The dind work in PRs #82–#85 also surfaced that "is the new code +actually running" is a non-obvious question. The Windows daemon and the +in-VM ephemerd are *two* binaries (the Linux one is embedded into the +Windows one and extracted on every VM boot), and a stale build can run +silently if the deploy missed either layer. An upgrade command should +make that uncertainty go away by handling both halves and reporting the +resulting version end-to-end. + +## Goals + +1. **One command per host.** `ephemerd upgrade` does the entire update. +2. **No source tree required on the target.** Hosts that aren't dev + workstations shouldn't need Go, mage, the repo, or 5 minutes of CPU + to update. +3. **Per-channel pinning.** A prod host configured for `stable` can't + accidentally pull a `main` build. A dev host can opt in. +4. **Drain-safe.** No in-flight jobs get killed by an upgrade. +5. **Rollback-safe.** Failed startup of the new binary rolls back to + the previous one automatically. +6. 
**End-to-end version reporting.** Post-upgrade output names *both* + the Windows-daemon version and the in-VM ephemerd version, so the + "I deployed, the fix didn't take" story from the dind work can't + recur silently. + +Non-goals: rolling fleet upgrades (one host at a time is fine; multi-host +orchestration is a layer above this), zero-downtime (drain + restart is +acceptable for our SLA), self-updating from arbitrary URLs (channels +only). + +## Design + +### Artifact source + +Pre-built binaries published by CI on every push to main and on every +release tag. The simplest store is GitHub Releases: + +- **`stable` channel** → latest tag matching `v*.*.*`, downloaded from + that release's assets. +- **`main` channel** → a rolling release named `latest-main`, updated + by CI on every push to `main`. Same asset layout as a tagged release. +- **`pinned` channel** → `--tag vX.Y.Z` for one-shot updates to a + specific version; also settable in config. + +Each release publishes: + +``` +ephemerd-windows-amd64.exe (~880 MB — embeds linux binary) +ephemerd-linux-amd64 (~240 MB) +ephemerd-linux-arm64 (~240 MB) +ephemerd-darwin-arm64 (~similar — embeds Vz linux assets) +SHA256SUMS (signed) +SHA256SUMS.asc (detached signature — optional v1) +``` + +The upgrade command picks the asset matching its host's GOOS/GOARCH. + +Tradeoff: GitHub Releases is free and integrates trivially with our +existing CI, but downloads are rate-limited and unauthenticated pulls +get throttled aggressively. Anonymous pulls from a busy fleet may hit +the limit; authenticated pulls (using the host's `GITHUB_TOKEN`) +sidestep it. For v1 we rely on the auth token ephemerd already holds. 
+ +### Channel config + +```toml +# /etc/ephemerd/config.toml (or %ProgramData%\ephemerd\config.toml) +[upgrade] +channel = "stable" # "stable" | "main" | "pinned" +pinned_tag = "" # only used when channel = "pinned" +auto_check = true # poll for new versions periodically +check_interval = "24h" # how often to log "newer version available" +``` + +Default is `stable`. A fresh install can't accidentally float into +`main` without an explicit config change. + +### Command shape + +``` +ephemerd upgrade [flags] + --channel override config channel for this run + --tag shorthand for --channel pinned --pinned-tag + --check report available version, don't upgrade + --dry-run show what would happen, don't do it + --force skip version check (re-deploy current) + --no-drain skip drain (operator override) +``` + +Default flow (no flags): + +1. Resolve channel → download URL → expected version. +2. `--check` returns here. +3. Compare to running version; no-op if equal (unless `--force`). +4. Download artifact + SHA256 manifest to `/.upgrade/`. +5. Verify SHA256. (GPG/cosign optional — v2.) +6. Pre-flight: confirm we have permission to swap the binary, + service-manager access, etc. +7. **Drain** the daemon — refuse new jobs, wait for active jobs to + exit (configurable timeout; default 30 min, surface via flag). +8. `ephemerd stop`, wait for process to truly exit. +9. Move current binary to `/.upgrade/ephemerd.previous`, + move new binary into place. +10. `ephemerd start`. Poll `ephemerd status` for "ok" within 60s. +11. Wait for in-VM ephemerd to log its version (parse console.log on + Windows; equivalent on macOS). +12. Report: `upgraded host:vA.B.C -> vX.Y.Z, in-vm:vX.Y.Z`. +13. On any failure between step 9 and 12, swap `.previous` back, restart, + log the rollback, exit non-zero. + +### Drain mechanics + +`ephemerd drain` is broken on Windows today (per project memory: +SIGTERM not supported). 
The upgrade work needs to fix that anyway — +options: + +- Add a `Drain` RPC to the gRPC control API (`api/v1/`). The CLI calls + it; the scheduler flips a flag that rejects new jobs and waits for + active ones to exit. Cross-platform, doesn't depend on signal + handling. Probably the right answer. +- Or: replace SIGTERM with a Windows service-control event + (`SERVICE_CONTROL_PARAMCHANGE` or a custom code). Less invasive but + Windows-specific. + +Recommendation: RPC. Reusable for the existing `ephemerd drain` +command, which would also become a thin wrapper around the same call. + +### Atomic swap mechanics, per OS + +**Linux/macOS**: `rename(2)` can replace a running executable's file. +Open file handles keep pointing at the old inode until the process +exits, the new inode takes the path immediately. The systemd/launchd +restart picks up the new binary. + +**Windows**: can't replace a locked file. Sequence has to be: + +``` +ephemerd stop (via service-control) +wait for process exit +copy/move new binary +ephemerd start (via service-control) +``` + +The five-second window where the service is fully down is acceptable +because we drained first. The CLI orchestrates via the Windows Service +Manager API (already used by `ephemerd start/stop`). + +### Version reporting end-to-end + +Post-upgrade, the command output should look like: + +``` +$ ephemerd upgrade +Channel: stable +Current host binary: v1.4.2 (built 2026-05-30) +Available: v1.4.3 (released 2026-06-02) +Draining... 0 active jobs. +Stopping service... done (1.2s). +Replacing binary at C:\Program Files\ephemerd\ephemerd.exe... done. +Starting service... ok (3.4s). +Waiting for in-VM ephemerd to register... ok. + +Upgraded: + host binary: v1.4.2 -> v1.4.3 + in-VM binary: v1.4.2 -> v1.4.3 +``` + +The in-VM version comes from parsing the first "starting ephemerd" +line in `/vm/linux/console.log` after the restart (or the +equivalent on macOS / Vz). 
+ +### Self-replacement detail + +The upgrade command is itself part of the binary being replaced. On +Linux/macOS this is fine (open inode survives). On Windows the running +`ephemerd upgrade` process holds the lock on the daemon binary only +indirectly (the running service does), so the CLI can swap freely +after stopping the service. The CLI also needs to NOT delete itself +if it's running from the same install path — handle that by either: + +- Running from a temp copy (CLI's first act is to `exec` itself from + a tempdir, then proceed). +- Or scoping `upgrade` to be invoked from a separate path + (`ephemerd-upgrader.exe` or just `ephemerd upgrade --from `). + +The temp-copy approach is the standard pattern (Tailscale, Docker +Desktop, vscode auto-update all do it). Cleaner for the user. + +## CI work + +The biggest unknown — the upgrade CLI is straightforward, the artifact +publishing is where the time goes. + +Required: + +1. **Release workflow** (`.github/workflows/release.yml`): + - Triggers: `push: tags: ['v*']` and `workflow_dispatch`. + - Matrix: linux/amd64, linux/arm64, windows/amd64, darwin/arm64. + - Runs `mage build:` per cell, uploads as a release asset. + - Generates `SHA256SUMS` from all assets. + +2. **Rolling-main workflow** (`.github/workflows/main-release.yml`): + - Triggers: `push: branches: [main]`. + - Same matrix, same artifacts. + - Publishes to a single GitHub Release tagged `latest-main` (move-tag + pattern: delete the tag, retag HEAD, recreate the release with + fresh assets). + +3. **Signing** (deferred to v2 unless we already have a code-signing cert): + - Windows: Authenticode (cert + EV recommended; ~$300/year). + - macOS: notarization via `notarytool` (free with an Apple Developer + account; ad-hoc signing already in place per memory). + - Linux: optional GPG signature on SHA256SUMS. + +Pragmatic v1: SHA256 checksum only. Signing comes later. 
+ +### Storage cost + +Each release ≈ 2.2 GB of binaries (4 platforms × ~0.55 GB average, per the asset sizes listed under "Artifact source"). GitHub +Releases storage is free but assets count against the 2 GB/file limit +(we're fine, biggest is ~880 MB). With a tagged release per week plus +a constantly-updated `latest-main`, expect ~10 GB of active artifact +storage; well within limits. + +## Risks + +- **GitHub rate limits on download.** Mitigated by authenticated pulls. + If we move off GitHub later (S3, an OCI registry), the upgrade CLI's + download layer is the only thing that changes. +- **Auto-check noise.** A daemon that logs "newer version available" + every 24h gets ignored. Make it opt-in or surface in `ephemerd status` + instead of the running log. +- **Drain that never completes.** A hung job blocks the upgrade + indefinitely. Default 30-minute drain timeout with a clear "still + running: " message before timeout; `--no-drain` skips drain + entirely for emergency upgrades. +- **In-VM version mismatch detection.** The current "grep console.log + for `starting ephemerd version=`" is fragile. A more durable + solution: the in-VM ephemerd exposes its version via the gRPC + control API; the upgrade command queries the in-VM dispatch RPC + directly. That's a separate small piece of work. +- **Channel drift.** A host configured for `stable` could be tricked + via `--channel main` flag. Acceptable — operator-explicit override is + fine. The lock is against passive drift, not against the operator. +- **Cross-version compatibility.** Schema changes (BoltDB, gRPC API) + during a rolling fleet upgrade could break older nodes pointing at + newer schedulers. ephemerd is single-host today so this isn't an + issue, but worth flagging if multi-host coordination ever happens. + +## Estimate + +Rough sizing, assuming one engineer focused: + +| Piece | Effort | Notes | +|---|---|---| +| CI release workflow (tags) | 1d | One matrix, four mage targets. | +| CI rolling-main workflow | 0.5d | Tag-move + asset-replace dance. 
| +| Drain RPC | 1d | gRPC method + scheduler hook + Windows fix. | +| Upgrade CLI: download + verify | 1d | Channel resolution, SHA256, retry. | +| Upgrade CLI: swap + restart | 1.5d | Service-manager glue per OS, rollback, self-exec from tempdir. | +| Upgrade CLI: version reporting | 0.5d | Parse console.log on Windows, equivalent elsewhere; better with in-VM RPC. | +| Tests | 1.5d | Unit + e2e: fake artifact server, version-mismatch rollback, drain-timeout. | +| Docs | 0.5d | CLI reference, configuration reference, ops guide. | + +**Total: ~7 engineer-days for a solid v1.** Could ship a "happy-path +only, manual rollback" version in ~3 days as a stopgap. + +Signing/notarization adds another 2-3 days each, deferred unless +distribution policy demands it. + +## Open questions + +1. **In-VM version source of truth.** Parse console.log (simple, works + today) vs add a gRPC call to the in-VM dispatch server (more + robust, requires the dispatch server to be reachable post-restart). + Recommend the gRPC call — it's small and we already have the + dispatch service. +2. **Auto-upgrade.** Should ephemerd ever upgrade itself without an + operator running the command? Pro: zero-touch fleet. Con: the dind + debugging we just did would have been *much* harder if the daemon + silently rolled forward overnight. Recommend: never auto-apply, + only auto-check + log. +3. **Multi-channel hosts.** Can the same host run two ephemerd + instances on different channels (e.g. for A/B testing)? Probably + no for v1; one binary per host. Revisit if needed. +4. **Downgrade.** `ephemerd upgrade --tag ` should work for + rollbacks. Worth explicit testing. +5. **Embedded asset version skew.** A v1.4.3 host binary embeds a + v1.4.3 Linux binary. If the embed somehow gets stale (cache bug + like the one we hit during the dind work), the post-upgrade version + report should *catch the mismatch* — log a WARN and fail the + upgrade. That alone would have saved several hours. 
+ +## Recommendation + +Build the CI artifact pipeline first (it's the bottleneck and unblocks +everything else), then the CLI in a single PR, then the drain RPC fix +as a small follow-up. Ship `--check` + manual download as a stopgap +on day 3 so the team has *something* before the full automation lands. diff --git a/pkg/config/config.go b/pkg/config/config.go index a3576f06..ebb62d7f 100644 --- a/pkg/config/config.go +++ b/pkg/config/config.go @@ -9,6 +9,7 @@ import ( "log/slog" "os" goruntime "runtime" + "strings" "time" "github.com/BurntSushi/toml" @@ -432,6 +433,85 @@ type RunnerConfig struct { JobTimeout string `toml:"job_timeout"` ShutdownTimeout string `toml:"shutdown_timeout"` Windows WindowsRunnerToml `toml:"windows"` + MacOS MacOSRunnerConfig `toml:"macos"` +} + +// MacOSRunnerConfig controls macOS job routing. It lives under [runner] +// (not [vm.macos]) because native jobs don't involve VMs. +// +// TOML shape: +// +// [runner.macos] +// mode = "vm" # default mode: "vm" or "native" +// max_native = 4 # max concurrent native jobs +// # user = "ciuser" # optional: existing user for native runners. +// # # Default (unset): an ephemeral hidden user is +// # # created per job and deleted on cleanup. +// +// [runner.macos.repos] +// "ephpm/*" = "native" # all repos in org +// "ephpm/secret-repo" = "vm" # except this one (exact wins over wildcard) +// "someuser/ephemerd" = "vm" # fork stays on VM +type MacOSRunnerConfig struct { + Mode string `toml:"mode"` // "vm" (default) or "native" + MaxNative int `toml:"max_native"` // max concurrent native jobs (default 4) + User string `toml:"user"` // existing user for native runners (empty = ephemeral per-job user, recommended) + Repos map[string]string `toml:"repos"` // "org/repo" -> "vm" or "native" +} + +// ModeForRepo returns "native" or "vm" for the given repo. Resolution order: +// +// 1. Exact match on "org/repo" +// 2. Wildcard match on "org/*" +// 3. 
Short-name fallback: if repo has no "/", match any "org/" key +// 4. Top-level mode +// 5. Default: "vm" +// +// The short-name fallback exists because some providers (GitHub polling) +// currently emit event.Repo as just the repo name without the org prefix. +// Config keys should always use "org/repo" format for disambiguation. +func (m *MacOSRunnerConfig) ModeForRepo(repo string) string { + if m != nil && len(m.Repos) > 0 { + // 1. Exact match + if mode, ok := m.Repos[repo]; ok && isValidMode(mode) { + return mode + } + + // 2. Wildcard: "org/*" matches any repo under that org + if slash := strings.IndexByte(repo, '/'); slash > 0 { + wildcard := repo[:slash] + "/*" + if mode, ok := m.Repos[wildcard]; ok && isValidMode(mode) { + return mode + } + } + + // 3. Short-name fallback: repo="ephemerd" matches key "ephpm/ephemerd" + if !strings.Contains(repo, "/") { + suffix := "/" + repo + for key, mode := range m.Repos { + if strings.HasSuffix(key, suffix) && !strings.HasSuffix(key, "/*") && isValidMode(mode) { + return mode + } + } + } + } + if m != nil && isValidMode(m.Mode) { + return m.Mode + } + return "vm" +} + +func isValidMode(mode string) bool { + return mode == "native" || mode == "vm" +} + +// ResolvedMaxNative returns the max concurrent native macOS jobs, +// defaulting to 4 if unset or non-positive. 
+func (m *MacOSRunnerConfig) ResolvedMaxNative() int { + if m == nil || m.MaxNative <= 0 { + return 4 + } + return m.MaxNative } // WindowsRunnerToml configures resource limits for Hyper-V isolated Windows diff --git a/pkg/config/config_test.go b/pkg/config/config_test.go index 55bf474e..b012deb8 100644 --- a/pkg/config/config_test.go +++ b/pkg/config/config_test.go @@ -1941,3 +1941,71 @@ allow_privileged = false t.Error("ResolvedAllowPrivileged() should honor explicit false even on non-Linux hosts") } } + +func TestMacOSRunnerConfig_ModeForRepo(t *testing.T) { + tests := []struct { + name string + cfg *MacOSRunnerConfig + repo string + want string + }{ + {"nil config defaults to vm", nil, "myrepo", "vm"}, + {"zero value defaults to vm", &MacOSRunnerConfig{}, "myrepo", "vm"}, + {"top-level native", &MacOSRunnerConfig{Mode: "native"}, "myrepo", "native"}, + {"top-level vm", &MacOSRunnerConfig{Mode: "vm"}, "myrepo", "vm"}, + {"invalid top-level mode defaults to vm", &MacOSRunnerConfig{Mode: "bogus"}, "myrepo", "vm"}, + + // org/repo exact match + {"org/repo exact match native", &MacOSRunnerConfig{Repos: map[string]string{"ephpm/ephemerd": "native"}}, "ephpm/ephemerd", "native"}, + {"org/repo exact match vm", &MacOSRunnerConfig{Mode: "native", Repos: map[string]string{"ephpm/ephemerd": "vm"}}, "ephpm/ephemerd", "vm"}, + {"org/repo miss falls back to top-level", &MacOSRunnerConfig{Mode: "native", Repos: map[string]string{"ephpm/other": "vm"}}, "ephpm/ephemerd", "native"}, + + // short-name fallback (event.Repo = "ephemerd", config key = "ephpm/ephemerd") + {"short name matches org/repo key", &MacOSRunnerConfig{Repos: map[string]string{"ephpm/ephemerd": "native"}}, "ephemerd", "native"}, + {"short name no match falls to top-level", &MacOSRunnerConfig{Mode: "native", Repos: map[string]string{"ephpm/other": "vm"}}, "ephemerd", "native"}, + + // disambiguation: fork vs original + {"fork stays vm while original is native", &MacOSRunnerConfig{Repos: 
map[string]string{"ephpm/ephemerd": "native", "fork/ephemerd": "vm"}}, "ephpm/ephemerd", "native"}, + {"fork explicit vm", &MacOSRunnerConfig{Repos: map[string]string{"ephpm/ephemerd": "native", "fork/ephemerd": "vm"}}, "fork/ephemerd", "vm"}, + + // wildcard: "org/*" matches all repos in org + {"wildcard matches repo in org", &MacOSRunnerConfig{Repos: map[string]string{"ephpm/*": "native"}}, "ephpm/ephemerd", "native"}, + {"wildcard matches another repo in org", &MacOSRunnerConfig{Repos: map[string]string{"ephpm/*": "native"}}, "ephpm/php-sdk", "native"}, + {"wildcard does not match different org", &MacOSRunnerConfig{Repos: map[string]string{"ephpm/*": "native"}}, "other/ephemerd", "vm"}, + {"exact match wins over wildcard", &MacOSRunnerConfig{Repos: map[string]string{"ephpm/*": "native", "ephpm/secret": "vm"}}, "ephpm/secret", "vm"}, + {"wildcard still applies to non-overridden repo", &MacOSRunnerConfig{Repos: map[string]string{"ephpm/*": "native", "ephpm/secret": "vm"}}, "ephpm/ephemerd", "native"}, + + // invalid per-repo mode falls through + {"invalid per-repo mode falls back to top-level", &MacOSRunnerConfig{Mode: "native", Repos: map[string]string{"ephpm/myrepo": "bogus"}}, "ephpm/myrepo", "native"}, + } + for _, tt := range tests { + t.Run(tt.name, func(t *testing.T) { + got := tt.cfg.ModeForRepo(tt.repo) + if got != tt.want { + t.Errorf("ModeForRepo(%q) = %q, want %q", tt.repo, got, tt.want) + } + }) + } +} + +func TestMacOSRunnerConfig_ResolvedMaxNative(t *testing.T) { + tests := []struct { + name string + cfg *MacOSRunnerConfig + want int + }{ + {"nil config defaults to 4", nil, 4}, + {"zero value defaults to 4", &MacOSRunnerConfig{}, 4}, + {"negative defaults to 4", &MacOSRunnerConfig{MaxNative: -1}, 4}, + {"positive value used", &MacOSRunnerConfig{MaxNative: 8}, 8}, + {"one is valid", &MacOSRunnerConfig{MaxNative: 1}, 1}, + } + for _, tt := range tests { + t.Run(tt.name, func(t *testing.T) { + got := tt.cfg.ResolvedMaxNative() + if got != tt.want { + 
t.Errorf("ResolvedMaxNative() = %d, want %d", got, tt.want) + } + }) + } +} diff --git a/pkg/native/native_darwin.go b/pkg/native/native_darwin.go new file mode 100644 index 00000000..8e30795b --- /dev/null +++ b/pkg/native/native_darwin.go @@ -0,0 +1,623 @@ +//go:build darwin + +package native + +import ( + "context" + "errors" + "fmt" + "io" + "io/fs" + "log/slog" + "os" + "os/exec" + "os/user" + "path/filepath" + "strconv" + "strings" + "sync" + "syscall" +) + +// serviceUserMu serializes service user creation across concurrent job starts. +var serviceUserMu sync.Mutex + +// ServiceUserName is the hidden macOS service account that native runner +// jobs execute as when no [runner.macos] user is configured. It is created +// lazily on first use and persists like other service accounts (_www, ...). +// Per-job user deletion is deliberately avoided: dscl/sysadminctl user +// deletion wedges opendirectoryd on modern macOS. +const ServiceUserName = "_ephemerd" + +// ServiceGroupName is a dedicated primary group for the service user. +// Using a dedicated group instead of staff (gid 20 — the default group for +// every normal macOS account) keeps the runner process from inheriting +// group access to the many files on a typical Mac that are staff-group +// owned. Falls back to staff if the group can't be created. +const ServiceGroupName = "_ephemerd" + +// staffGID is the macOS staff group, used as the fallback primary group +// when a dedicated service group can't be provisioned. +const staffGID = 20 + +// service{UID,GID} ranges are scanned for a free id when creating the +// service user/group. macOS reserves <500 for system accounts; 600-999 +// is the conventional band for added service accounts. +const ( + serviceUIDMin = 600 + serviceUIDMax = 999 +) + +// ensureServiceUser creates the _ephemerd service user if it doesn't exist +// and returns its credential. 
+func (r *Runner) ensureServiceUser() (*syscall.Credential, error) { + serviceUserMu.Lock() + defer serviceUserMu.Unlock() + + // Already exists? + if cred, err := lookupCredential(ServiceUserName); err == nil { + return cred, nil + } + + // Find a free UID + out, err := exec.Command("dscl", ".", "-list", "/Users", "UniqueID").Output() + if err != nil { + return nil, fmt.Errorf("listing users: %w", err) + } + used := make(map[int]bool) + for _, line := range strings.Split(string(out), "\n") { + fields := strings.Fields(line) + if len(fields) == 2 { + if id, err := strconv.Atoi(fields[1]); err == nil { + used[id] = true + } + } + } + uid := 0 + for id := serviceUIDMin; id <= serviceUIDMax; id++ { + if !used[id] { + uid = id + break + } + } + if uid == 0 { + return nil, fmt.Errorf("no free UID in range %d-%d", serviceUIDMin, serviceUIDMax) + } + + // Resolve a dedicated primary group, falling back to staff (gid 20) + // if provisioning fails for any reason — that's the previously-tested + // behavior, so a group hiccup never blocks native jobs. + gid := r.ensureServiceGroup() + + // NFSHomeDirectory is /var/empty (like _www and other service + // accounts). Registering a real directory as a user home puts it + // under macOS data protection — even root then can't delete it + // without Full Disk Access. The runner's HOME env var points at the + // per-job dir; the DS record never needs to. 
+ steps := [][]string{ + {"dscl", ".", "-create", "/Users/" + ServiceUserName}, + {"dscl", ".", "-create", "/Users/" + ServiceUserName, "UserShell", "/bin/bash"}, + {"dscl", ".", "-create", "/Users/" + ServiceUserName, "UniqueID", strconv.Itoa(uid)}, + {"dscl", ".", "-create", "/Users/" + ServiceUserName, "PrimaryGroupID", strconv.Itoa(gid)}, + {"dscl", ".", "-create", "/Users/" + ServiceUserName, "NFSHomeDirectory", "/var/empty"}, + {"dscl", ".", "-create", "/Users/" + ServiceUserName, "IsHidden", "1"}, + } + for _, args := range steps { + if out, err := exec.Command(args[0], args[1:]...).CombinedOutput(); err != nil { + return nil, fmt.Errorf("%v: %s: %w", args, strings.TrimSpace(string(out)), err) + } + } + r.log.Info("created ephemerd service user", "user", ServiceUserName, "uid", uid, "gid", gid) + + return &syscall.Credential{Uid: uint32(uid), Gid: uint32(gid)}, nil +} + +// ensureServiceGroup returns the gid of a dedicated _ephemerd primary +// group, creating it if needed. On any failure it logs a warning and +// returns staffGID (20) so native jobs keep working with the previously +// tested behavior. Caller holds serviceUserMu. 
+func (r *Runner) ensureServiceGroup() int { + if g, err := user.LookupGroup(ServiceGroupName); err == nil { + if gid, perr := strconv.Atoi(g.Gid); perr == nil { + return gid + } + } + + out, err := exec.Command("dscl", ".", "-list", "/Groups", "PrimaryGroupID").Output() + if err != nil { + r.log.Warn("listing groups for service group; falling back to staff", "error", err) + return staffGID + } + used := make(map[int]bool) + for _, line := range strings.Split(string(out), "\n") { + fields := strings.Fields(line) + if len(fields) == 2 { + if id, perr := strconv.Atoi(fields[1]); perr == nil { + used[id] = true + } + } + } + gid := 0 + for id := serviceUIDMin; id <= serviceUIDMax; id++ { + if !used[id] { + gid = id + break + } + } + if gid == 0 { + r.log.Warn("no free GID for service group; falling back to staff", "range", fmt.Sprintf("%d-%d", serviceUIDMin, serviceUIDMax)) + return staffGID + } + + steps := [][]string{ + {"dscl", ".", "-create", "/Groups/" + ServiceGroupName}, + {"dscl", ".", "-create", "/Groups/" + ServiceGroupName, "PrimaryGroupID", strconv.Itoa(gid)}, + {"dscl", ".", "-create", "/Groups/" + ServiceGroupName, "RealName", "ephemerd native runners"}, + } + for _, args := range steps { + if out, err := exec.Command(args[0], args[1:]...).CombinedOutput(); err != nil { + r.log.Warn("creating service group; falling back to staff", + "step", strings.Join(args, " "), "output", strings.TrimSpace(string(out)), "error", err) + return staffGID + } + } + r.log.Info("created ephemerd service group", "group", ServiceGroupName, "gid", gid) + return gid +} + +// lookupCredential resolves a username to a syscall.Credential for +// privilege dropping via SysProcAttr. 
+func lookupCredential(username string) (*syscall.Credential, error) { + u, err := user.Lookup(username) + if err != nil { + return nil, err + } + uid, err := strconv.ParseUint(u.Uid, 10, 32) + if err != nil { + return nil, fmt.Errorf("parsing uid %q: %w", u.Uid, err) + } + gid, err := strconv.ParseUint(u.Gid, 10, 32) + if err != nil { + return nil, fmt.Errorf("parsing gid %q: %w", u.Gid, err) + } + return &syscall.Credential{Uid: uint32(uid), Gid: uint32(gid)}, nil +} + +// Runner executes a GitHub Actions runner directly on the macOS host +// inside a per-job sandbox. Each job gets its own workspace, HOME, +// TMPDIR, keychain, and Homebrew prefix. +type Runner struct { + dataDir string + jobID string + jitConfig string + runnerSrc string // path to extracted GHA runner (runner.Manager.Dir()) + log *slog.Logger + + jobDir string // /native// + keychainPath string // per-job keychain + runAsUser string // existing user to run as (empty = _ephemerd service user) + jobUID uint32 // uid the runner executes as + cmd *exec.Cmd + pgid int +} + +// SetRunAsUser configures a non-root user to run the runner process as. +// The daemon (running as root) drops privileges via setuid/setgid when +// launching the runner. Strongly recommended when the daemon runs as root: +// without it, CI job steps execute as root on the host. +func (r *Runner) SetRunAsUser(username string) { + r.runAsUser = username +} + +// New creates a native macOS runner for a single job. It prepares the +// workspace directory structure but does not start the runner process. 
+func New(dataDir, jobID, jitConfig, runnerSrc string, log *slog.Logger) (*Runner, error) { + jobDir := filepath.Join(dataDir, "native", jobID) + + // Create workspace directories + dirs := []string{ + filepath.Join(jobDir, "home"), + filepath.Join(jobDir, "tmp"), + filepath.Join(jobDir, "work"), + filepath.Join(jobDir, "runner"), + filepath.Join(jobDir, "homebrew", "bin"), + filepath.Join(jobDir, "homebrew", "Cellar"), + filepath.Join(jobDir, "keychain"), + } + for _, d := range dirs { + if err := os.MkdirAll(d, 0o755); err != nil { + return nil, fmt.Errorf("creating directory %s: %w", d, err) + } + } + + return &Runner{ + dataDir: dataDir, + jobID: jobID, + jitConfig: jitConfig, + runnerSrc: runnerSrc, + log: log, + jobDir: jobDir, + }, nil +} + +// Start copies the runner binary, sets up the sandbox and environment, +// and launches the runner process. +func (r *Runner) Start(ctx context.Context) error { + runnerDir := filepath.Join(r.jobDir, "runner") + + // Copy runner files from the extracted source (hard link, fall back to copy) + if err := copyRunnerFiles(r.runnerSrc, runnerDir); err != nil { + return fmt.Errorf("copying runner files: %w", err) + } + + // Generate and write sandbox profile + profilePath := filepath.Join(r.jobDir, "sandbox.sb") + profile := GenerateSandboxProfile(r.jobDir, r.dataDir) + if err := os.WriteFile(profilePath, []byte(profile), 0o644); err != nil { + return fmt.Errorf("writing sandbox profile: %w", err) + } + + // Set up per-job keychain + r.keychainPath = filepath.Join(r.jobDir, "keychain", "job.keychain-db") + if err := r.createKeychain(); err != nil { + r.log.Warn("failed to create per-job keychain", "error", err) + // Non-fatal: jobs that don't need signing will work fine + } + + // Symlink Homebrew binaries from host + if err := symlinkHomebrew(filepath.Join(r.jobDir, "homebrew", "bin")); err != nil { + r.log.Warn("failed to symlink homebrew binaries", "error", err) + // Non-fatal: host may not have Homebrew installed + } + + 
// Build environment + homeDir := filepath.Join(r.jobDir, "home") + tmpDir := filepath.Join(r.jobDir, "tmp") + workDir := filepath.Join(r.jobDir, "work") + brewDir := filepath.Join(r.jobDir, "homebrew") + + env := []string{ + "HOME=" + homeDir, + "TMPDIR=" + tmpDir, + "RUNNER_WORK_FOLDER=" + workDir, + "PATH=" + filepath.Join(brewDir, "bin") + ":/opt/homebrew/bin:/usr/local/bin:/usr/bin:/bin:/usr/sbin:/sbin", + "HOMEBREW_PREFIX=" + brewDir, + "HOMEBREW_CELLAR=" + filepath.Join(brewDir, "Cellar"), + "HOMEBREW_TEMP=" + tmpDir, + "LANG=en_US.UTF-8", + } + // Point DEVELOPER_DIR at the host's active developer directory + // (full Xcode or Command Line Tools). Hardcoding the Xcode.app path + // breaks xcrun shims (git, clang) on hosts with only CLT installed. + if devDir, err := exec.Command("xcode-select", "-p").Output(); err == nil { + env = append(env, "DEVELOPER_DIR="+strings.TrimSpace(string(devDir))) + } + if r.keychainPath != "" { + env = append(env, "EPHEMERD_KEYCHAIN="+r.keychainPath) + } + + // Launch via sandbox-exec for filesystem/network isolation + r.cmd = exec.CommandContext(ctx, "sandbox-exec", "-f", profilePath, + "./run.sh", "--jitconfig", r.jitConfig) + r.cmd.Dir = runnerDir + r.cmd.Env = env + + // Own process group for clean kill + r.cmd.SysProcAttr = &syscall.SysProcAttr{Setpgid: true} + + // Drop privileges. 
Job steps must never run as root on the host: + // - user configured: run as that existing user + // - no user configured + daemon is root: run as the hidden _ephemerd + // service user (created lazily on first use) + // - daemon not root: run as the daemon's own user (no setuid possible) + var cred *syscall.Credential + username := r.runAsUser + switch { + case r.runAsUser != "": + c, err := lookupCredential(r.runAsUser) + if err != nil { + return fmt.Errorf("looking up run-as user %q: %w", r.runAsUser, err) + } + cred = c + case os.Geteuid() == 0: + c, err := r.ensureServiceUser() + if err != nil { + return fmt.Errorf("ensuring service user: %w", err) + } + username = ServiceUserName + cred = c + } + if cred != nil { + if out, err := exec.Command("chown", "-R", + fmt.Sprintf("%d:%d", cred.Uid, cred.Gid), r.jobDir).CombinedOutput(); err != nil { + return fmt.Errorf("chowning job dir to %s: %s: %w", username, strings.TrimSpace(string(out)), err) + } + r.cmd.SysProcAttr.Credential = cred + r.jobUID = cred.Uid + env = append(env, "USER="+username, "LOGNAME="+username) + r.cmd.Env = env + } + + // Log to files in the job directory (after chown so the runner user owns it) + logPath := filepath.Join(r.jobDir, "runner.log") + logFile, err := os.Create(logPath) + if err != nil { + return fmt.Errorf("creating log file: %w", err) + } + r.cmd.Stdout = logFile + r.cmd.Stderr = logFile + + if err := r.cmd.Start(); err != nil { + if closeErr := logFile.Close(); closeErr != nil { + r.log.Warn("failed to close log file", "error", closeErr) + } + return fmt.Errorf("starting runner: %w", err) + } + + r.pgid = r.cmd.Process.Pid + r.log.Info("native macOS runner started", + "job_id", r.jobID, + "pid", r.pgid, + "dir", runnerDir, + ) + + return nil +} + +// Wait blocks until the runner process exits and returns its exit code. 
+func (r *Runner) Wait() (int, error) { + if r.cmd == nil || r.cmd.Process == nil { + return -1, fmt.Errorf("runner not started") + } + + err := r.cmd.Wait() + if err != nil { + var exitErr *exec.ExitError + if errors.As(err, &exitErr) { + return exitErr.ExitCode(), nil + } + return -1, fmt.Errorf("waiting for runner: %w", err) + } + return 0, nil +} + +// Stop forcefully terminates the runner and all its children, cleans up +// the keychain, and removes the job workspace. +func (r *Runner) Stop() { + // Kill the process group + if r.pgid > 0 { + if err := syscall.Kill(-r.pgid, syscall.SIGKILL); err != nil { + // Process may have already exited — not an error + r.log.Debug("kill process group", "pgid", r.pgid, "error", err) + } + + // Fallback: kill any orphaned children + cmd := exec.Command("pkill", "-9", "-P", strconv.Itoa(r.pgid)) + if err := cmd.Run(); err != nil { + r.log.Debug("pkill fallback", "ppid", r.pgid, "error", err) + } + } + + // Delete per-job keychain + if r.keychainPath != "" { + r.deleteKeychain() + } + + // Note: no per-UID process kill here — the service user is shared + // across concurrent jobs, so pkill -U would kill other jobs' steps. + // The pgid kill above covers the job's process tree. + + // Strip ACLs before removal: macOS frameworks put "deny delete" ACLs + // on auto-created home subdirectories (~/Library etc.) which block + // os.RemoveAll even as root. + if out, err := exec.Command("chmod", "-RN", r.jobDir).CombinedOutput(); err != nil { + r.log.Debug("stripping ACLs from job dir", "dir", r.jobDir, + "output", strings.TrimSpace(string(out)), "error", err) + } + + // Remove job workspace + if err := os.RemoveAll(r.jobDir); err != nil { + r.log.Warn("failed to remove job directory", "dir", r.jobDir, "error", err) + } + + r.log.Info("native macOS runner cleaned up", "job_id", r.jobID) +} + +// createKeychain creates a per-job temporary keychain. 
+func (r *Runner) createKeychain() error { + cmd := exec.Command("security", "create-keychain", "-p", "", r.keychainPath) + if out, err := cmd.CombinedOutput(); err != nil { + return fmt.Errorf("create-keychain: %s: %w", strings.TrimSpace(string(out)), err) + } + cmd = exec.Command("security", "unlock-keychain", "-p", "", r.keychainPath) + if out, err := cmd.CombinedOutput(); err != nil { + return fmt.Errorf("unlock-keychain: %s: %w", strings.TrimSpace(string(out)), err) + } + return nil +} + +// deleteKeychain removes the per-job keychain. +func (r *Runner) deleteKeychain() { + cmd := exec.Command("security", "delete-keychain", r.keychainPath) + if out, err := cmd.CombinedOutput(); err != nil { + r.log.Warn("failed to delete keychain", "path", r.keychainPath, "output", strings.TrimSpace(string(out)), "error", err) + } +} + +// GenerateSandboxProfile returns a macOS sandbox profile that restricts +// the runner process. Paths are templated with the job-specific directories. +func GenerateSandboxProfile(jobDir, dataDir string) string { + // Resolve to absolute, symlink-free paths. The sandbox matches against + // kernel (resolved) paths: /var and /tmp are symlinks to /private/var + // and /private/tmp on macOS, so rules written with the unresolved + // config paths (e.g. /var/lib/ephemerd/...) silently never match. + resolve := func(p string) string { + abs, err := filepath.Abs(p) + if err != nil { + abs = p + } + if real, err := filepath.EvalSymlinks(abs); err == nil { + return real + } + return abs + } + absJobDir := resolve(jobDir) + absDataDir := resolve(dataDir) + homeDir := resolve(os.Getenv("HOME")) + + // NOTE: this profile is allow-by-default with an explicit deny list. + // For native (no-VM) execution the stronger posture is deny-by-default + // with an allow list, but that requires enumerating every path the GHA + // runner + toolchains legitimately touch and live-testing on macOS so + // jobs don't break. Tracked as a follow-up (see PR discussion). 
The + // denies below close the concrete job-to-job and job-to-daemon read + // holes that matter most on a shared host. + return fmt.Sprintf(`(version 1) +(allow default) + +;; === Network isolation === +;; Note: sandbox-exec does not support CIDR notation for IP addresses. +;; Private network blocking (10.x, 172.16.x, 192.168.x) requires pf +;; firewall rules — handled separately. The sandbox blocks localhost +;; and port binding to prevent inter-job communication. + +;; Allow DNS before blocking localhost (macOS resolves via mDNSResponder on 127.0.0.1) +(allow network-outbound (remote udp "localhost:53")) +(allow network-outbound (remote tcp "localhost:53")) + +;; Block outbound to localhost (daemon control socket, other jobs) +(deny network-outbound (remote ip "localhost:*")) + +;; Block binding to any port — prevents jobs from running servers +(deny network-bind (local ip "*:*")) + +;; === Filesystem isolation === + +;; Isolate this job from sibling jobs and ephemerd internal state. +;; All native job workspaces live under /native/, and +;; every native job runs as the same _ephemerd uid, so without this a +;; job could read a concurrent job's checkout token or source. +;; +;; Deny file-read-DATA (not file-read*) on the native subtree: on a +;; directory that blocks readdir (can't list a sibling's contents), on a +;; file it blocks reading contents. file-read-metadata stays allowed so +;; lstat/realpath path resolution can traverse THROUGH native/ — denying +;; metadata breaks the .NET host with "Failed to resolve full path of the +;; current executable" (exit 133). +(deny file-read-data (subpath "%[2]s/native")) +(deny file-write* (subpath "%[2]s/native")) + +;; Re-allow reading the native directory NODE itself (not its children). +;; getcwd() and bash walk UP from the job's runner dir and must readdir +;; native/ to learn the job-id component name; without this they fail +;; with "getcwd: cannot access parent directories" and run.sh won't exec. 
+;; This leaks the list of concurrent job-id directory names (not their +;; contents) — job ids are not secret. +(allow file-read-data (literal "%[2]s/native")) + +;; Block sensitive host paths entirely — read and write. .ssh was +;; previously read-only-denied, leaving a writable authorized_keys hole +;; on any host where the runner uid can reach the operator's home. +(deny file-read* (subpath "%[1]s/.ssh")) +(deny file-write* (subpath "%[1]s/.ssh")) +(deny file-read* (literal "%[2]s/config.toml")) +(deny file-write* (literal "%[2]s/config.toml")) +(deny file-read* (literal "%[2]s/ephemerd.sock")) +(deny file-write* (literal "%[2]s/ephemerd.sock")) +(deny file-read* (subpath "%[2]s/vm")) +(deny file-write* (subpath "%[2]s/vm")) + +;; Block writes to shared tools (read-only access only) +(deny file-write* (subpath "/opt/homebrew")) +(deny file-write* (subpath "/Applications")) +(deny file-write* (subpath "/usr/local")) + +;; Re-allow this job's own workspace (read + write). The explicit +;; file-read-data is required IN ADDITION to file-read*: macOS sandbox +;; resolves a specific-operation deny (the file-read-data deny on the +;; native subtree above) over a later wildcard allow (file-read*), so the +;; read-data re-allow must name the operation explicitly to win for this +;; job's own files. +(allow file-read* (subpath "%[3]s")) +(allow file-read-data (subpath "%[3]s")) +(allow file-write* (subpath "%[3]s")) +(allow file-write* (subpath "/private/tmp")) +`, homeDir, absDataDir, absJobDir) +} + +// symlinkHomebrew creates symlinks from /opt/homebrew/bin/* into the +// per-job homebrew bin directory, giving jobs read access to pre-installed +// tools while keeping their own installs isolated. 
+func symlinkHomebrew(destBin string) error { + const hostBin = "/opt/homebrew/bin" + entries, err := os.ReadDir(hostBin) + if err != nil { + return fmt.Errorf("reading %s: %w", hostBin, err) + } + for _, e := range entries { + src := filepath.Join(hostBin, e.Name()) + dst := filepath.Join(destBin, e.Name()) + if err := os.Symlink(src, dst); err != nil { + // Skip if symlink already exists + if !os.IsExist(err) { + return fmt.Errorf("symlinking %s: %w", e.Name(), err) + } + } + } + return nil +} + +// copyRunnerFiles copies the runner directory to the per-job location. +// Uses hard links for efficiency, falling back to full copy on error. +func copyRunnerFiles(src, dst string) error { + return filepath.WalkDir(src, func(path string, d fs.DirEntry, err error) error { + if err != nil { + return err + } + + rel, err := filepath.Rel(src, path) + if err != nil { + return fmt.Errorf("computing relative path: %w", err) + } + target := filepath.Join(dst, rel) + + if d.IsDir() { + return os.MkdirAll(target, 0o755) + } + + return copyFile(path, target) + }) +} + +func copyFile(src, dst string) error { + sf, err := os.Open(src) + if err != nil { + return fmt.Errorf("opening source %s: %w", src, err) + } + defer func() { + if closeErr := sf.Close(); closeErr != nil { + // Best-effort close; source is read-only + _ = closeErr + } + }() + + info, err := sf.Stat() + if err != nil { + return fmt.Errorf("stat source %s: %w", src, err) + } + + df, err := os.OpenFile(dst, os.O_CREATE|os.O_WRONLY|os.O_TRUNC, info.Mode()) + if err != nil { + return fmt.Errorf("creating dest %s: %w", dst, err) + } + + if _, err := io.Copy(df, sf); err != nil { + if closeErr := df.Close(); closeErr != nil { + // Log would be ideal but we don't have a logger here + _ = closeErr + } + return fmt.Errorf("copying %s → %s: %w", src, dst, err) + } + + return df.Close() +} diff --git a/pkg/native/native_darwin_test.go b/pkg/native/native_darwin_test.go new file mode 100644 index 00000000..e22d0a9b --- 
/dev/null +++ b/pkg/native/native_darwin_test.go @@ -0,0 +1,189 @@ +//go:build darwin + +package native + +import ( + "os" + "path/filepath" + "strings" + "testing" +) + +func TestGenerateSandboxProfile(t *testing.T) { + // Use real directories: the profile resolves symlinks (e.g. /var → + // /private/var) so rules match the kernel's view of the paths. The + // expected strings must be the resolved forms. + base := t.TempDir() + dataDir := filepath.Join(base, "data") + jobDir := filepath.Join(dataDir, "native", "job123") + if err := os.MkdirAll(jobDir, 0o755); err != nil { + t.Fatal(err) + } + + resolvedData, err := filepath.EvalSymlinks(dataDir) + if err != nil { + t.Fatal(err) + } + resolvedJob, err := filepath.EvalSymlinks(jobDir) + if err != nil { + t.Fatal(err) + } + + profile := GenerateSandboxProfile(jobDir, dataDir) + + checks := []struct { + desc string + substr string + }{ + {"allows DNS UDP", `(allow network-outbound (remote udp "localhost:53"))`}, + {"allows DNS TCP", `(allow network-outbound (remote tcp "localhost:53"))`}, + {"blocks localhost", `(deny network-outbound (remote ip "localhost:*"))`}, + {"blocks port binding", `(deny network-bind (local ip "*:*"))`}, + {"blocks sibling job read-data", `(deny file-read-data (subpath "` + resolvedData + `/native"))`}, + {"blocks sibling job writes", `(deny file-write* (subpath "` + resolvedData + `/native"))`}, + {"allows native dir node read (getcwd)", `(allow file-read-data (literal "` + resolvedData + `/native"))`}, + {"blocks SSH dir reads", `(deny file-read* (subpath`}, + {"blocks SSH dir writes", `(deny file-write* (subpath`}, + {"blocks config.toml reads", `(deny file-read* (literal "` + resolvedData + `/config.toml"))`}, + {"blocks config.toml writes", `(deny file-write* (literal "` + resolvedData + `/config.toml"))`}, + {"blocks ephemerd socket reads", `(deny file-read* (literal "` + resolvedData + `/ephemerd.sock"))`}, + {"blocks ephemerd socket writes", `(deny file-write* (literal "` + 
resolvedData + `/ephemerd.sock"))`}, + {"blocks VM dir reads", `(deny file-read* (subpath "` + resolvedData + `/vm"))`}, + {"blocks VM dir writes", `(deny file-write* (subpath "` + resolvedData + `/vm"))`}, + {"blocks homebrew writes", `(deny file-write* (subpath "/opt/homebrew"))`}, + {"blocks Applications writes", `(deny file-write* (subpath "/Applications"))`}, + {"blocks /usr/local writes", `(deny file-write* (subpath "/usr/local"))`}, + {"re-allows job dir reads", `(allow file-read* (subpath "` + resolvedJob + `"))`}, + {"re-allows job dir read-data", `(allow file-read-data (subpath "` + resolvedJob + `"))`}, + {"allows job dir writes", `(allow file-write* (subpath "` + resolvedJob + `"))`}, + {"allows /private/tmp writes", `(allow file-write* (subpath "/private/tmp"))`}, + } + + for _, c := range checks { + if !strings.Contains(profile, c.substr) { + t.Errorf("sandbox profile missing %s: expected substring %q", c.desc, c.substr) + } + } +} + +// TestGenerateSandboxProfile_ResolvesSymlinks pins the /var → /private/var +// gotcha: a profile written with unresolved paths silently never matches. +func TestGenerateSandboxProfile_ResolvesSymlinks(t *testing.T) { + base := t.TempDir() + realData := filepath.Join(base, "real-data") + jobDir := filepath.Join(realData, "native", "j1") + if err := os.MkdirAll(jobDir, 0o755); err != nil { + t.Fatal(err) + } + linkData := filepath.Join(base, "link-data") + if err := os.Symlink(realData, linkData); err != nil { + t.Fatal(err) + } + + resolvedData, err := filepath.EvalSymlinks(realData) + if err != nil { + t.Fatal(err) + } + + // Generate using the SYMLINK path — the profile must contain the + // resolved target, and not rules pointing at the symlink. 
+ profile := GenerateSandboxProfile(filepath.Join(linkData, "native", "j1"), linkData) + + if !strings.Contains(profile, `(deny file-read-data (subpath "`+resolvedData+`/native"))`) { + t.Errorf("profile should deny the RESOLVED native path %q, got:\n%s", resolvedData, profile) + } + if strings.Contains(profile, `(subpath "`+linkData+`/native")`) { + t.Errorf("profile must not reference the unresolved symlink path %q", linkData) + } +} + +func TestNewCreatesWorkspace(t *testing.T) { + tmpDir := t.TempDir() + dataDir := filepath.Join(tmpDir, "data") + runnerSrc := filepath.Join(tmpDir, "runner-src") + + // Create a minimal runner source dir + if err := os.MkdirAll(runnerSrc, 0o755); err != nil { + t.Fatal(err) + } + + r, err := New(dataDir, "test-job-42", "fake-jit-config", runnerSrc, nil) + if err != nil { + t.Fatalf("New() error: %v", err) + } + + // Verify expected directories exist + expectedDirs := []string{ + "home", + "tmp", + "work", + "runner", + filepath.Join("homebrew", "bin"), + filepath.Join("homebrew", "Cellar"), + "keychain", + } + for _, d := range expectedDirs { + path := filepath.Join(r.jobDir, d) + info, err := os.Stat(path) + if err != nil { + t.Errorf("expected directory %s to exist: %v", d, err) + continue + } + if !info.IsDir() { + t.Errorf("expected %s to be a directory", d) + } + } +} + +func TestCopyRunnerFiles(t *testing.T) { + tmpDir := t.TempDir() + src := filepath.Join(tmpDir, "src") + dst := filepath.Join(tmpDir, "dst") + + // Create source tree + if err := os.MkdirAll(filepath.Join(src, "subdir"), 0o755); err != nil { + t.Fatal(err) + } + if err := os.WriteFile(filepath.Join(src, "run.sh"), []byte("#!/bin/bash\necho hello"), 0o755); err != nil { + t.Fatal(err) + } + if err := os.WriteFile(filepath.Join(src, "subdir", "config.json"), []byte(`{"key":"val"}`), 0o644); err != nil { + t.Fatal(err) + } + + if err := os.MkdirAll(dst, 0o755); err != nil { + t.Fatal(err) + } + + if err := copyRunnerFiles(src, dst); err != nil { + 
// unsupportedMsg is the error text returned by every stub entry point that
// can fail on a non-darwin build.
const unsupportedMsg = "native macOS runner is only supported on darwin"

// errNotDarwin returns a new error carrying unsupportedMsg.
func errNotDarwin() error {
	return fmt.Errorf(unsupportedMsg)
}

// Runner is the non-darwin placeholder for the native macOS runner. It has
// no state; its methods either do nothing or report that the platform is
// unsupported.
type Runner struct{}

// New always fails on non-darwin platforms: it returns a nil Runner and an
// error. All arguments are ignored.
func New(_, _, _, _ string, _ *slog.Logger) (*Runner, error) {
	return nil, errNotDarwin()
}

// SetRunAsUser is a no-op on non-darwin platforms.
func (*Runner) SetRunAsUser(string) {}

// Start always returns an error on non-darwin platforms; the context is
// ignored.
func (*Runner) Start(context.Context) error {
	return errNotDarwin()
}

// Wait always returns exit code -1 and an error on non-darwin platforms.
func (*Runner) Wait() (int, error) {
	return -1, errNotDarwin()
}

// Stop is a no-op on non-darwin platforms.
func (*Runner) Stop() {}
-func TestHandleQueued_SkipsMacOSWithoutVMConfig(t *testing.T) { +func TestHandleQueued_SkipsMacOSWithoutVMOrNativeConfig(t *testing.T) { mp := newMockProvider("github") s := New(Config{ Providers: []providers.Provider{mp}, Log: testLogger(), + // No MacOSVMConfig and no MacOSModeForRepo — macOS jobs should be deferred }) event := providers.JobEvent{ @@ -88,13 +89,13 @@ func TestHandleQueued_SkipsMacOSWithoutVMConfig(t *testing.T) { s.handleQueued(context.Background(), event) if got := len(mp.claims); got != 0 { - t.Errorf("macOS job without VM config should not claim, got %d claims", got) + t.Errorf("macOS job without VM or native config should not claim, got %d claims", got) } s.mu.Lock() _, seen := s.seen[keyFor(event)] s.mu.Unlock() if seen { - t.Error("macOS job without VM config should be unsed so it retries on next poll") + t.Error("macOS job without VM or native config should be unseen so it retries on next poll") } } diff --git a/pkg/scheduler/scheduler.go b/pkg/scheduler/scheduler.go index 028d410f..52d931e4 100644 --- a/pkg/scheduler/scheduler.go +++ b/pkg/scheduler/scheduler.go @@ -17,6 +17,7 @@ import ( "github.com/ephpm/ephemerd/pkg/artifacts" "github.com/ephpm/ephemerd/pkg/metrics" "github.com/ephpm/ephemerd/pkg/names" + "github.com/ephpm/ephemerd/pkg/native" "github.com/ephpm/ephemerd/pkg/providers" "github.com/ephpm/ephemerd/pkg/runtime" "github.com/ephpm/ephemerd/pkg/tunnel" @@ -52,6 +53,11 @@ type Config struct { // and finally the runtime's host-aware default. Nil-safe. 
RunnerImageForRepo func(repo, os string) string + MaxNativeMac int // max concurrent native macOS jobs (default 4) + MacOSModeForRepo func(repo string) string // returns "native" or "vm" per repo (nil = always VM) + NativeMacUser string // non-root user for native macOS runner processes + RunnerDir string // path to extracted GHA runner binary dir (runner.Manager.Dir()) + Log *slog.Logger } @@ -103,10 +109,11 @@ type Scheduler struct { seen map[jobKey]time.Time // recently handled jobs for dedup pending map[jobKey]struct{} // jobs dispatched to a handler but not yet holding sem mu sync.Mutex - sem chan struct{} // local/native job concurrency limiter - linuxSem chan struct{} // Linux dispatch (VM) concurrency limiter - macSem chan struct{} // macOS VM concurrency limiter (Vz has a hard cap) - draining bool // true when shutting down, rejects new jobs + sem chan struct{} // local/native job concurrency limiter + linuxSem chan struct{} // Linux dispatch (VM) concurrency limiter + macSem chan struct{} // macOS VM concurrency limiter (Vz has a hard cap) + nativeMacSem chan struct{} // native macOS job concurrency limiter (separate from VM limit) + draining bool // true when shutting down, rejects new jobs startTime time.Time } @@ -158,9 +165,10 @@ type runningJob struct { image string cancel context.CancelFunc artifactsDir string // non-empty if OCI artifacts were extracted for this job - dispatched string // non-empty if dispatched to Linux VM worker (stores container name) - macosVM vm.MacOSVM // non-nil if running as a macOS VM job - startedAt time.Time + dispatched string // non-empty if dispatched to Linux VM worker (stores container name) + macosVM vm.MacOSVM // non-nil if running as a macOS VM job + nativeRunner interface{ Stop() } // non-nil if running as a native macOS job + startedAt time.Time } @@ -189,15 +197,21 @@ func New(cfg Config) *Scheduler { } } + nativeMac := cfg.MaxNativeMac + if nativeMac <= 0 { + nativeMac = 4 + } + return &Scheduler{ - cfg: cfg, 
- running: make(map[jobKey]*runningJob), - seen: make(map[jobKey]time.Time), - pending: make(map[jobKey]struct{}), - sem: make(chan struct{}, cfg.MaxConcurrent), - linuxSem: make(chan struct{}, cfg.MaxConcurrent), - macSem: make(chan struct{}, macVMs), - startTime: time.Now(), + cfg: cfg, + running: make(map[jobKey]*runningJob), + seen: make(map[jobKey]time.Time), + pending: make(map[jobKey]struct{}), + sem: make(chan struct{}, cfg.MaxConcurrent), + linuxSem: make(chan struct{}, cfg.MaxConcurrent), + macSem: make(chan struct{}, macVMs), + nativeMacSem: make(chan struct{}, nativeMac), + startTime: time.Now(), } } @@ -447,11 +461,10 @@ func (s *Scheduler) canHandleJob(jobLabels []string) bool { case "windows": osOK = goruntime.GOOS == "windows" case "macos", "macosx": - // macOS jobs need a per-job VM for isolation. Without - // MacOSVMConfig we refuse the job rather than fall back to - // running on the host — sharing the runner process tree with - // other jobs (and the daemon) is a non-starter for CI. - osOK = goruntime.GOOS == "darwin" && s.cfg.MacOSVMConfig != nil + // macOS jobs run in a per-job VM (default) or natively on + // the host (when configured for trusted repos). Accept if + // either VM config or native mode is available. + osOK = goruntime.GOOS == "darwin" && (s.cfg.MacOSVMConfig != nil || s.cfg.MacOSModeForRepo != nil) } } if !osOK { @@ -543,8 +556,14 @@ func (s *Scheduler) handleQueued(ctx context.Context, event providers.JobEvent) return } - // Route macOS-native jobs to per-job macOS VMs. + // Route macOS jobs to native runner or per-job VM. 
if isMacOSJob(event.Labels) { + // Native mode takes priority when configured for this repo + if s.cfg.MacOSModeForRepo != nil && s.cfg.MacOSModeForRepo(event.Repo) == "native" { + s.handleNativeMacOSJob(ctx, event) + return + } + // VM path s.mu.Lock() macCfg := s.cfg.MacOSVMConfig s.mu.Unlock() @@ -552,13 +571,13 @@ func (s *Scheduler) handleQueued(ctx context.Context, event providers.JobEvent) s.handleMacOSJob(ctx, event) return } - // macOS VM disk is still being provisioned — remove from seen/pending + // Neither native nor VM available — remove from seen/pending // so the next poll retries this job once the install finishes. s.mu.Lock() delete(s.seen, key) delete(s.pending, key) s.mu.Unlock() - log.Info("macOS VM disk not ready yet, deferring job") + log.Info("macOS runner not ready, deferring job") return } @@ -869,6 +888,131 @@ func (s *Scheduler) handleMacOSJob(ctx context.Context, event providers.JobEvent }() } +// handleNativeMacOSJob runs the GitHub Actions runner directly on the macOS +// host inside a sandbox. Used for trusted repos that don't need VM isolation. 
+func (s *Scheduler) handleNativeMacOSJob(ctx context.Context, event providers.JobEvent) { + jobID := event.JobID + key := keyFor(event) + log := s.cfg.Log.With("job_id", jobID, "repo", event.Repo, "platform", "macos-native") + + unsee := func() { + s.mu.Lock() + delete(s.seen, key) + delete(s.pending, key) + s.mu.Unlock() + } + + // Acquire native macOS concurrency slot (separate from VM sem) + select { + case s.nativeMacSem <- struct{}{}: + case <-ctx.Done(): + unsee() + return + } + s.mu.Lock() + delete(s.pending, key) + s.mu.Unlock() + + log.Info("provisioning native macOS runner for job") + + // Claim job with macOS labels + labels := buildLabelsForOS("darwin", s.cfg.Labels) + const maxNameRetries = 3 + claim, err := s.claimJob(ctx, &event, labels, log, maxNameRetries) + if err != nil { + log.Error("failed to claim job", "error", err) + unsee() + time.Sleep(backoffDuration(event.Repo)) + <-s.nativeMacSem + return + } + + // Create the native runner + nr, err := native.New(s.cfg.DataDir, fmt.Sprintf("%d", jobID), claim.RunnerConfig, s.cfg.RunnerDir, log) + if err != nil { + log.Error("failed to create native runner", "error", err) + if rmErr := event.Provider.ReleaseJob(ctx, claim); rmErr != nil { + log.Warn("failed to remove ghost runner", "runner_id", claim.RunnerID, "error", rmErr) + } + unsee() + <-s.nativeMacSem + return + } + if s.cfg.NativeMacUser != "" { + nr.SetRunAsUser(s.cfg.NativeMacUser) + } + + var jobCtx context.Context + var cancel context.CancelFunc + if s.cfg.JobTimeout > 0 { + jobCtx, cancel = context.WithTimeout(ctx, s.cfg.JobTimeout) + } else { + jobCtx, cancel = context.WithCancel(ctx) + } + + // Start the runner + if err := nr.Start(jobCtx); err != nil { + log.Error("failed to start native runner", "error", err) + nr.Stop() + if rmErr := event.Provider.ReleaseJob(ctx, claim); rmErr != nil { + log.Warn("failed to remove ghost runner", "runner_id", claim.RunnerID, "error", rmErr) + } + unsee() + cancel() + <-s.nativeMacSem + return + } + + 
// Track the running job + s.mu.Lock() + s.running[key] = &runningJob{ + provider: event.Provider, + claim: claim, + repo: event.Repo, + cancel: cancel, + nativeRunner: nr, + startedAt: time.Now(), + } + s.mu.Unlock() + metrics.JobsActive.Inc() + + log.Info("native macOS runner started", "name", claim.RunnerName) + + // Wait for the job to finish in the background + go func() { + defer func() { <-s.nativeMacSem }() + + exitCode, err := nr.Wait() + if err != nil { + if jobCtx.Err() != nil { + log.Warn("native macOS runner killed (timeout or shutdown)", "error", err) + } else { + log.Error("native macOS runner crashed", "error", err) + } + } else if exitCode != 0 { + log.Warn("native macOS runner exited with failure", "exit_code", exitCode) + } else { + log.Info("native macOS runner exited", "exit_code", exitCode) + } + + // Clean up + s.mu.Lock() + rj, exists := s.running[key] + if exists { + delete(s.running, key) + s.mu.Unlock() + nr.Stop() + if rj.provider != nil && rj.claim != nil { + if err := rj.provider.ReleaseJob(context.Background(), rj.claim); err != nil { + log.Debug("deregister runner after native macOS cleanup", "error", err) + } + } + } else { + s.mu.Unlock() + } + }() +} + // handleLocalJob provisions a runner using the local containerd Runtime. 
func (s *Scheduler) handleLocalJob(ctx context.Context, event providers.JobEvent) { jobID := event.JobID @@ -1084,6 +1228,8 @@ func (s *Scheduler) handleCompleted(ctx context.Context, event providers.JobEven job.cancel() if job.macosVM != nil { job.macosVM.Stop() + } else if job.nativeRunner != nil { + job.nativeRunner.Stop() } else if job.dispatched != "" && s.cfg.LinuxDispatcher != nil { if err := s.cfg.LinuxDispatcher.Destroy(context.Background(), job.dispatched); err != nil { log.Warn("failed to destroy dispatched runner", "error", err) @@ -1154,6 +1300,8 @@ func (s *Scheduler) destroyAll() { job.cancel() if job.macosVM != nil { job.macosVM.Stop() + } else if job.nativeRunner != nil { + job.nativeRunner.Stop() } else if job.dispatched != "" && s.cfg.LinuxDispatcher != nil { if err := s.cfg.LinuxDispatcher.Destroy(context.Background(), job.dispatched); err != nil { s.cfg.Log.Warn("failed to destroy dispatched runner", "job_id", key.JobID, "error", err) diff --git a/pkg/scheduler/scheduler_test.go b/pkg/scheduler/scheduler_test.go index ec591504..d98bfcca 100644 --- a/pkg/scheduler/scheduler_test.go +++ b/pkg/scheduler/scheduler_test.go @@ -1014,3 +1014,57 @@ func TestServeTunnelWithReconnect_CancelExitsCleanly(t *testing.T) { t.Fatal("serveTunnelWithReconnect did not exit after context cancel") } } + +// --- nativeMacSem tests --- + +func TestNew_NativeMacSemDefault(t *testing.T) { + s := New(Config{Log: testLogger()}) + if cap(s.nativeMacSem) != 4 { + t.Errorf("nativeMacSem capacity = %d, want default 4", cap(s.nativeMacSem)) + } +} + +func TestNew_NativeMacSemCustom(t *testing.T) { + s := New(Config{MaxNativeMac: 6, Log: testLogger()}) + if cap(s.nativeMacSem) != 6 { + t.Errorf("nativeMacSem capacity = %d, want 6", cap(s.nativeMacSem)) + } +} + +func TestNew_NativeMacSemNegative(t *testing.T) { + s := New(Config{MaxNativeMac: -1, Log: testLogger()}) + if cap(s.nativeMacSem) != 4 { + t.Errorf("nativeMacSem capacity = %d, want default 4", cap(s.nativeMacSem)) + 
} +} + +// --- canHandleJob with native mode --- + +func TestCanHandleJob_MacOSNativeMode(t *testing.T) { + if runtime.GOOS != "darwin" { + t.Skip("macOS-specific test") + } + + // Without VM config but with native mode function, should accept macOS jobs + s := New(Config{ + MacOSModeForRepo: func(_ string) string { return "native" }, + Log: testLogger(), + }) + + if !s.canHandleJob([]string{"self-hosted", "macos"}) { + t.Error("canHandleJob should accept macOS when MacOSModeForRepo is set") + } +} + +func TestCanHandleJob_MacOSNoConfig(t *testing.T) { + if runtime.GOOS != "darwin" { + t.Skip("macOS-specific test") + } + + // Without VM config and without native mode, should reject macOS jobs + s := New(Config{Log: testLogger()}) + + if s.canHandleJob([]string{"self-hosted", "macos"}) { + t.Error("canHandleJob should reject macOS when neither VMConfig nor MacOSModeForRepo is set") + } +} diff --git a/pkg/vm/macosvm_darwin.go b/pkg/vm/macosvm_darwin.go index 8420e41e..61fb95f6 100644 --- a/pkg/vm/macosvm_darwin.go +++ b/pkg/vm/macosvm_darwin.go @@ -491,9 +491,12 @@ func (m *darwinMacOSVM) monitorRunner(ctx context.Context, ip string) { } if err != nil { + m.cfg.Log.Debug("monitor pgrep error", "id", m.id, "error", err, "output", strings.TrimSpace(string(out))) continue } + m.cfg.Log.Debug("monitor pgrep result", "id", m.id, "output", strings.TrimSpace(string(out))) + if strings.TrimSpace(string(out)) == "EXITED" { // Give the runner a grace period to report results to GitHub // before we tear down the VM and network.