From 03cd145e96e505f44a5afae6a42b9d43f8a71a78 Mon Sep 17 00:00:00 2001 From: Sebastian Scherer Date: Thu, 14 May 2026 16:23:02 -0400 Subject: [PATCH 01/13] feat(osmo): VS Code/Cursor dev workflow on NVIDIA OSMO Adds a privileged Docker-in-Docker workspace task that lets a developer run the full AirStack docker-compose stack on OSMO and attach an IDE over SSH, with Isaac Sim WebRTC livestream + Foxglove websocket exposed via osmo port-forward. Components: - osmo/workspace/{Dockerfile,entrypoint.sh,sshd_config}: airstack-osmo-workspace image. Ubuntu 24.04 + sshd (pubkey-only) + Docker CE + Docker Compose + nvidia-container-toolkit + fuse-overlayfs (DinD-on-overlayfs needs it, otherwise dockerd falls back to vfs which bloats AirStack images ~10x). - osmo/workflows/airstack-dev.yaml: single privileged GPU task. Materializes Nucleus + airlab-docker secrets from OSMO credentials, clones AirStack, starts inner dockerd, runs `airstack up` with desktop + isaac-sim-livestream Compose profiles. - simulation/isaac-sim: isaac-sim-livestream Compose service that runs Pegasus standalone with --/app/livestream/enabled=true and exposes WebRTC port ranges 47995-48012 / 49000-49007 / 49100; launch script gates headless+livestream extension on ISAAC_SIM_LIVESTREAM env var. - .airstack/modules/osmo.sh: airstack osmo:{up,ide,foxglove,webrtc,logs,down} CLI wrappers around `osmo workflow submit` / `port-forward` / `cancel`. Persists the active workflow id and validates it's still running before each command (prevents the stale-state 410 error). - airstack.sh: bash 4+ re-exec bootstrap (macOS ships 3.2; the CLI uses `declare -A`). - osmo/README.md + docs/tutorials/airstack_on_osmo.md: admin pool setup (privileged_allowed) + per-user credentials (airlab-docker-login, airlab-nucleus) + student-facing IDE attach + WebRTC/Foxglove flow. 
Pool requirements: privileged_allowed: true, GPU pool with nvidia-container-toolkit on the host, ample node ephemeral storage (AirStack images extracted are ~50-100Gi via fuse-overlayfs; vfs needs ~500Gi+). Co-authored-by: Cursor --- .airstack/modules/osmo.sh | 426 +++++++++++++++ airstack.sh | 41 ++ docs/getting_started/index.md | 7 + docs/tutorials/airstack_on_osmo.md | 488 ++++++++++++++++++ docs/tutorials/index.md | 1 + mkdocs.yml | 1 + osmo/README.md | 248 +++++++++ osmo/workflows/airstack-dev.yaml | 100 ++++ osmo/workspace/Dockerfile | 104 ++++ osmo/workspace/entrypoint.sh | 201 ++++++++ osmo/workspace/sshd_config | 41 ++ .../isaac-sim/docker/docker-compose.yaml | 58 +++ .../example_one_px4_pegasus_launch_script.py | 25 +- 13 files changed, 1735 insertions(+), 6 deletions(-) create mode 100755 .airstack/modules/osmo.sh create mode 100644 docs/tutorials/airstack_on_osmo.md create mode 100644 osmo/README.md create mode 100644 osmo/workflows/airstack-dev.yaml create mode 100644 osmo/workspace/Dockerfile create mode 100755 osmo/workspace/entrypoint.sh create mode 100644 osmo/workspace/sshd_config diff --git a/.airstack/modules/osmo.sh b/.airstack/modules/osmo.sh new file mode 100755 index 000000000..e3ac7f107 --- /dev/null +++ b/.airstack/modules/osmo.sh @@ -0,0 +1,426 @@ +#!/usr/bin/env bash + +# osmo.sh — AirStack-on-OSMO convenience commands. +# +# Wraps `osmo workflow submit/port-forward/logs/cancel` for the +# osmo/workflows/airstack-dev.yaml workflow so a Mac/Windows student doesn't +# have to memorize the WebRTC port range or the entry-script path. +# +# This module is pure bash + the cross-platform `osmo` CLI — no Docker +# dependency. Safe to run on a laptop with no AirStack runtime. +# +# Most commands need a workflow id. `osmo:up` saves the id to +# $OSMO_STATE_FILE; the other commands read it from there. You can also +# override it for a single invocation by exporting AIRSTACK_OSMO_WF. 
+ +# State directory and file: ~/.airstack/osmo-state stores the most recent +# workflow id submitted with `airstack osmo:up`. +OSMO_STATE_DIR="${HOME}/.airstack" +OSMO_STATE_FILE="${OSMO_STATE_DIR}/osmo-state" + +# WebRTC livestream ports — must match the ports published by the +# isaac-sim-livestream service in +# simulation/isaac-sim/docker/docker-compose.yaml. +OSMO_WEBRTC_TCP="47995-48012,49000-49007,49100" +OSMO_WEBRTC_UDP="47995-48012,49000-49007" + +# GCS Foxglove websocket: container 8765 → host 8766 (per +# gcs/docker/docker-compose.yaml). +OSMO_FOXGLOVE_PORT="8766:8766" + +# SSH port-forward: local 2200 → pod 22. +OSMO_SSH_PORT="2200:22" + +# Default `osmo workflow port-forward` connect-timeout (24h). +OSMO_PF_TIMEOUT="${OSMO_PF_TIMEOUT:-86400}" + +# Helper: ensure the osmo CLI is on PATH. +function _osmo_check_cli { + if ! command -v osmo >/dev/null 2>&1; then + log_error "osmo CLI not found on PATH. Install from https://github.com/NVIDIA/OSMO and run 'osmo login'." + return 1 + fi +} + +# Helper: read a value with prompt; supports -s for silent (passwords). +# +# Visible prompts switch the TTY out of canonical mode for the duration of +# the read. Without this, macOS caps each input line at MAX_CANON = 1024 +# bytes (per ) and rings the terminal bell on Enter when +# the buffer overflows. Nucleus API tokens are JWTs ~950 bytes long, so +# `Nucleus API token: ` lands right at the cap. `stty -icanon` makes +# the kernel deliver bytes to bash as they're typed, with no line-buffer +# limit; bash's `read` still terminates on newline normally. +# +# We use a trap to guarantee the saved stty is restored if the user Ctrl-Cs +# mid-paste — otherwise the shell would be left in raw mode. +function _osmo_prompt { + local var_name="$1" + local prompt_text="$2" + local silent="${3:-false}" + local saved_stty="" + + if [ "$silent" = "true" ]; then + # Passwords are short — canonical-mode cap is fine here. 
+ read -r -s -p "${prompt_text}: " "$var_name" + printf "\n" >&2 + else + if [ -t 0 ]; then + saved_stty="$(stty -g 2>/dev/null || true)" + if [ -n "$saved_stty" ]; then + trap 'stty "$saved_stty" 2>/dev/null; trap - INT' INT + stty -icanon 2>/dev/null + fi + fi + read -r -p "${prompt_text}: " "$var_name" + if [ -n "$saved_stty" ]; then + stty "$saved_stty" 2>/dev/null + trap - INT + fi + fi + + if [ -z "${!var_name}" ]; then + log_error "Empty input for ${var_name}; aborting." + return 1 + fi +} + +# osmo:setup — interactively register the three OSMO credentials AirStack +# needs (airlab-docker-registry, airlab-docker-login, airlab-nucleus). +# Idempotent — re-running rotates the credentials. +function cmd_osmo_setup { + _osmo_check_cli || return 1 + + cat >&2 <<'EOF' + +This sets up the three per-user OSMO credentials AirStack-on-OSMO needs: + + 1. airlab-docker-registry (REGISTRY) — for OSMO to pull the workspace image + 2. airlab-docker-login (GENERIC) — for the inner dockerd to pull AirStack images + 3. airlab-nucleus (GENERIC) — for Isaac Sim Nucleus access + +You'll be asked for: + + - your Andrew ID (no @andrew.cmu.edu suffix) + - your AirLab Docker password (same as your Andrew password) + - your Nucleus API token (https://airlab-nucleus.andrew.cmu.edu/omni/web3/ + → right-click cloud → API Tokens). NOT your Andrew password. + +Values go directly to OSMO; nothing is written to disk locally. + +EOF + + local andrew_id andrew_password nucleus_token + _osmo_prompt andrew_id "Andrew ID" false || return 1 + _osmo_prompt andrew_password "AirLab Docker password (hidden)" true || return 1 + _osmo_prompt nucleus_token "Nucleus API token" false || return 1 + + local omni_server="${OMNI_SERVER:-omniverse://airlab-nucleus.andrew.cmu.edu/NVIDIA/Assets/Isaac/5.1}" + local airlab_registry="${AIRLAB_REGISTRY:-airlab-docker.andrew.cmu.edu}" + + log_info "Registering airlab-docker-registry (REGISTRY)..." 
+ osmo credential set airlab-docker-registry \ + --type REGISTRY \ + --payload "registry=${airlab_registry}" \ + "username=${andrew_id}" \ + "auth=${andrew_password}" \ + || { log_error "osmo credential set airlab-docker-registry failed"; return 1; } + + log_info "Registering airlab-docker-login (GENERIC)..." + osmo credential set airlab-docker-login \ + --type GENERIC \ + --payload "username=${andrew_id}" \ + "password=${andrew_password}" \ + || { log_error "osmo credential set airlab-docker-login failed"; return 1; } + + log_info "Registering airlab-nucleus (GENERIC)..." + osmo credential set airlab-nucleus \ + --type GENERIC \ + --payload "omni_user=${andrew_id}" \ + "omni_pass=${nucleus_token}" \ + "omni_server=${omni_server}" \ + || { log_error "osmo credential set airlab-nucleus failed"; return 1; } + + log_info "All three credentials registered. List them with: osmo credential list" + log_info "Next: airstack osmo:up [--pool POOL]" +} + +# Helper: pick the first existing SSH public key on the host. +function _osmo_pick_pubkey { + local candidates=( + "${HOME}/.ssh/id_ed25519.pub" + "${HOME}/.ssh/id_ecdsa.pub" + "${HOME}/.ssh/id_rsa.pub" + ) + for k in "${candidates[@]}"; do + if [ -f "$k" ]; then + echo "$k" + return 0 + fi + done + return 1 +} + +# Helper: get the active workflow id (env override first, then state file). +# +# The state file persists across shell sessions, so it can easily go stale +# (e.g. a previous airstack-dev-N is now FAILED/CANCELED). To avoid the +# confusing "Workflow airstack-dev-10 is not running!" 410 error from the +# downstream osmo command, this helper verifies the saved id is still in a +# live state (PENDING / RUNNING) before returning it. +function _osmo_wf_id { + local wf + if [ -n "${AIRSTACK_OSMO_WF:-}" ]; then + wf="${AIRSTACK_OSMO_WF}" + elif [ -f "${OSMO_STATE_FILE}" ]; then + wf="$(cat "${OSMO_STATE_FILE}")" + else + log_error "No workflow id found. Run 'airstack osmo:up' first, or export AIRSTACK_OSMO_WF=." 
+ return 1 + fi + + # Validate the workflow is still alive (only when osmo CLI is available). + if command -v osmo >/dev/null 2>&1; then + local status + status="$(osmo workflow query "${wf}" 2>/dev/null | awk -F': +' '/^Status/ {print $2; exit}' | tr -d ' \r\n')" + case "${status}" in + PENDING|RUNNING|"") + # "" means we couldn't reach osmo; let the downstream + # command surface the real error rather than failing here. + ;; + *) + log_error "Saved workflow '${wf}' is ${status}, not running." + log_warn "Run 'airstack osmo:up' to launch a fresh one, or:" + log_warn " rm ${OSMO_STATE_FILE}" + log_warn " export AIRSTACK_OSMO_WF=" + return 1 + ;; + esac + fi + + echo "${wf}" + return 0 +} + +# Helper: persist the workflow id. +function _osmo_save_wf_id { + mkdir -p "${OSMO_STATE_DIR}" + echo "$1" > "${OSMO_STATE_FILE}" + log_info "Saved workflow id '$1' to ${OSMO_STATE_FILE}" +} + +# osmo:up — submit airstack-dev.yaml with the local pubkey injected. +# +# Usage: airstack osmo:up [--pool POOL] [--key PATH] [--branch BRANCH] +function cmd_osmo_up { + _osmo_check_cli || return 1 + + local pool="${OSMO_POOL:-}" + local pubkey_file="" + local branch="" + local extra_args=() + + while [ $# -gt 0 ]; do + case "$1" in + --pool) pool="$2"; shift 2 ;; + --key) pubkey_file="$2"; shift 2 ;; + --branch) branch="$2"; shift 2 ;; + *) extra_args+=("$1"); shift ;; + esac + done + + if [ -z "$pubkey_file" ]; then + if ! pubkey_file="$(_osmo_pick_pubkey)"; then + log_error "No SSH public key found in ~/.ssh. Generate one with: ssh-keygen -t ed25519" + return 1 + fi + fi + log_info "Using SSH public key: ${pubkey_file}" + + local workflow_yaml="${PROJECT_ROOT}/osmo/workflows/airstack-dev.yaml" + if [ ! 
-f "$workflow_yaml" ]; then + log_error "Workflow file not found: ${workflow_yaml}" + return 1 + fi + + local cmd=(osmo workflow submit "$workflow_yaml") + if [ -n "$pool" ]; then + cmd+=(--pool "$pool") + else + log_warn "No --pool provided and OSMO_POOL is unset; using your osmo profile's default pool." + fi + # IMPORTANT: `osmo workflow submit --set-env` is variadic. Passing two + # separate `--set-env A=1 --set-env B=2` silently drops the first one + # (only the last `--set-env` flag's values are kept). We collect all + # K=V pairs and pass them under a single `--set-env`. + local env_kvs=("SSH_PUB_KEY=$(cat "$pubkey_file")") + if [ -n "$branch" ]; then + env_kvs+=("AIRSTACK_BRANCH=${branch}") + fi + cmd+=(--set-env "${env_kvs[@]}") + if [ ${#extra_args[@]} -gt 0 ]; then + cmd+=("${extra_args[@]}") + fi + + log_info "Submitting: ${cmd[*]}" + local output + if ! output="$("${cmd[@]}" 2>&1)"; then + echo "$output" >&2 + log_error "osmo workflow submit failed." + if echo "$output" | grep -q "privileged flag enabled"; then + log_error "The selected pool does not allow privileged tasks. AirStack-on-OSMO needs" + log_error "privileged: true on the workspace task (DinD)." + log_error "" + log_error "Audit available pools with:" + log_error " osmo pool list -t json | python3 -c \"import json,sys" + log_error " for ns in json.load(sys.stdin)['node_sets']:" + log_error " for p in ns['pools']:" + log_error " for n,plat in p['platforms'].items():" + log_error " print(f\\\"{p['name']:25} priv={plat['privileged_allowed']}\\\")\"" + log_error "" + log_error "If none allow privileged, ask your OSMO pool admin to flip" + log_error "platforms.default.privileged_allowed: true on the airstack pool." + log_error "Full template: docs/tutorials/airstack_on_osmo.md → 'One-time pool setup (admin)'." + fi + return 1 + fi + echo "$output" + + # Parse the workflow id out of the submit output. The cookbook examples + # show "Workflow ID - " formatted output (see OSMO + # submission.rst). 
Match that line. + local wf_id + wf_id="$(echo "$output" | awk -F'- ' '/^Workflow ID/ {print $2; exit}' | tr -d ' \r\n')" + if [ -z "$wf_id" ]; then + log_warn "Could not parse workflow id from submit output. Set it manually:" + log_warn " echo > ${OSMO_STATE_FILE}" + return 0 + fi + _osmo_save_wf_id "$wf_id" + + log_info "Next steps:" + log_info " airstack osmo:logs # follow startup until 'sshd listening'" + log_info " airstack osmo:ide # port-forward sshd + open VS Code" + log_info " airstack osmo:webrtc # forward Isaac Sim WebRTC ports" + log_info " airstack osmo:foxglove # forward GCS Foxglove websocket" + log_info " airstack osmo:down # cancel the workflow" +} + +# osmo:logs — follow the workspace task logs. +function cmd_osmo_logs { + _osmo_check_cli || return 1 + local wf; wf="$(_osmo_wf_id)" || return 1 + log_info "osmo workflow logs ${wf} workspace --follow" + osmo workflow logs "$wf" workspace --follow +} + +# osmo:ide — port-forward sshd + (optionally) launch VS Code/Cursor on the +# `airstack-osmo` host. Runs the port-forward in the foreground so closing +# the terminal closes the tunnel. +# +# Usage: airstack osmo:ide [--no-open] [code|cursor] +function cmd_osmo_ide { + _osmo_check_cli || return 1 + local wf; wf="$(_osmo_wf_id)" || return 1 + + local open_ide=true + local ide_cmd="" + while [ $# -gt 0 ]; do + case "$1" in + --no-open) open_ide=false; shift ;; + code|cursor) ide_cmd="$1"; shift ;; + *) log_warn "Ignoring unknown osmo:ide arg: $1"; shift ;; + esac + done + + if [ -z "$ide_cmd" ]; then + if command -v cursor >/dev/null 2>&1; then + ide_cmd="cursor" + elif command -v code >/dev/null 2>&1; then + ide_cmd="code" + else + log_warn "Neither 'cursor' nor 'code' found on PATH; will only port-forward (open the IDE manually and Connect to Host airstack-osmo)." + open_ide=false + fi + fi + + log_info "Make sure ~/.ssh/config has a 'Host airstack-osmo' entry pointing at localhost:2200, User root." 
+ + if [ "$open_ide" = true ]; then + # vscode-remote URI launches the IDE pre-attached to the remote host. + local uri="vscode-remote://ssh-remote+airstack-osmo/root/AirStack" + log_info "Launching ${ide_cmd} → ${uri}" + ( "$ide_cmd" --folder-uri "$uri" >/dev/null 2>&1 || \ + "$ide_cmd" "$uri" >/dev/null 2>&1 || \ + log_warn "Could not launch ${ide_cmd} automatically; open it and pick airstack-osmo from Remote-SSH manually." ) & + fi + + log_info "osmo workflow port-forward ${wf} workspace --port ${OSMO_SSH_PORT} --connect-timeout ${OSMO_PF_TIMEOUT}" + log_info "Leave this terminal running for the length of your session." + osmo workflow port-forward "$wf" workspace --port "$OSMO_SSH_PORT" --connect-timeout "$OSMO_PF_TIMEOUT" +} + +# osmo:webrtc — forward both Isaac Sim WebRTC port ranges (TCP in this +# terminal, spawn UDP in the background). +function cmd_osmo_webrtc { + _osmo_check_cli || return 1 + local wf; wf="$(_osmo_wf_id)" || return 1 + + log_info "Spawning UDP port-forward in background: ${OSMO_WEBRTC_UDP}" + nohup osmo workflow port-forward "$wf" workspace \ + --port "$OSMO_WEBRTC_UDP" --udp \ + --connect-timeout "$OSMO_PF_TIMEOUT" \ + > "${OSMO_STATE_DIR}/webrtc-udp.log" 2>&1 & + log_info " UDP log: ${OSMO_STATE_DIR}/webrtc-udp.log (pid $!)" + + log_info "Foreground TCP port-forward: ${OSMO_WEBRTC_TCP}" + log_info "Open the Omniverse Streaming Client / WebRTC client at http://localhost" + osmo workflow port-forward "$wf" workspace \ + --port "$OSMO_WEBRTC_TCP" \ + --connect-timeout "$OSMO_PF_TIMEOUT" +} + +# osmo:foxglove — forward the GCS Foxglove websocket. 
+function cmd_osmo_foxglove { + _osmo_check_cli || return 1 + local wf; wf="$(_osmo_wf_id)" || return 1 + + log_info "osmo workflow port-forward ${wf} workspace --port ${OSMO_FOXGLOVE_PORT} --connect-timeout ${OSMO_PF_TIMEOUT}" + log_info "Then open https://app.foxglove.dev → Open connection → ws://localhost:8766" + log_info "Then Layouts → Import from file → gcs/foxglove_extensions/airstack_default.json" + osmo workflow port-forward "$wf" workspace \ + --port "$OSMO_FOXGLOVE_PORT" \ + --connect-timeout "$OSMO_PF_TIMEOUT" +} + +# osmo:down — cancel the active workflow. Reminds you to push first. +function cmd_osmo_down { + _osmo_check_cli || return 1 + local wf; wf="$(_osmo_wf_id)" || return 1 + + log_warn "About to cancel workflow '${wf}'." + log_warn "Anything not pushed to git in /root/AirStack inside the pod will be LOST." + log_warn "Hit Ctrl-C in the next 5 seconds to abort." + sleep 5 + osmo workflow cancel "$wf" + rm -f "${OSMO_STATE_FILE}" +} + +# Register commands from this module. 
+function register_osmo_commands { + COMMANDS["osmo:setup"]="cmd_osmo_setup" + COMMANDS["osmo:up"]="cmd_osmo_up" + COMMANDS["osmo:logs"]="cmd_osmo_logs" + COMMANDS["osmo:ide"]="cmd_osmo_ide" + COMMANDS["osmo:webrtc"]="cmd_osmo_webrtc" + COMMANDS["osmo:foxglove"]="cmd_osmo_foxglove" + COMMANDS["osmo:down"]="cmd_osmo_down" + + COMMAND_HELP["osmo:setup"]="One-time per-user OSMO credential setup (airlab-docker-registry, airlab-docker-login, airlab-nucleus)" + COMMAND_HELP["osmo:up"]="Submit osmo/workflows/airstack-dev.yaml with your SSH pubkey injected (--pool POOL, --key PATH, --branch BRANCH)" + COMMAND_HELP["osmo:logs"]="Follow the workspace task logs (osmo workflow logs workspace --follow)" + COMMAND_HELP["osmo:ide"]="Port-forward sshd (2200:22) and open VS Code/Cursor on Host airstack-osmo" + COMMAND_HELP["osmo:webrtc"]="Port-forward Isaac Sim WebRTC ranges (TCP foreground + UDP background)" + COMMAND_HELP["osmo:foxglove"]="Port-forward GCS Foxglove websocket (8766:8766)" + COMMAND_HELP["osmo:down"]="Cancel the active workflow (push to git before running this)" +} diff --git a/airstack.sh b/airstack.sh index 3d1e955e3..78b475c07 100755 --- a/airstack.sh +++ b/airstack.sh @@ -5,6 +5,47 @@ # This script provides a unified interface for common development tasks # in the AirStack project, including setup, installation, and container management. +# Re-exec under bash 4+ if necessary. macOS ships bash 3.2 which can't handle +# `declare -A` (associative arrays) used throughout this script. Searches for +# a newer bash via $AIRSTACK_BASH, then common Homebrew install paths, then +# any `bash` on PATH that reports version >= 4. Sets AIRSTACK_REEXEC_BASH=1 +# to guard against infinite re-exec loops. 
+if [ -z "${AIRSTACK_REEXEC_BASH:-}" ] && [ "${BASH_VERSINFO[0]:-0}" -lt 4 ]; then + _airstack_candidates=( + "${AIRSTACK_BASH:-}" + /opt/homebrew/bin/bash # Apple Silicon Homebrew + /usr/local/bin/bash # Intel Homebrew + /opt/local/bin/bash # MacPorts + ) + if command -v bash5 >/dev/null 2>&1; then + _airstack_candidates+=("$(command -v bash5)") + fi + # Add any `bash` on PATH whose version is >= 4 (other than the one we just + # got here from, which is < 4 by the if-check above). + for _alt in $(command -v -a bash 2>/dev/null); do + _airstack_candidates+=("$_alt") + done + + for _airstack_alt_bash in "${_airstack_candidates[@]}"; do + [ -z "$_airstack_alt_bash" ] && continue + [ -x "$_airstack_alt_bash" ] || continue + # Probe BASH_VERSINFO[0] without sourcing the script. + if "$_airstack_alt_bash" -c '[ "${BASH_VERSINFO[0]:-0}" -ge 4 ]' 2>/dev/null; then + export AIRSTACK_REEXEC_BASH=1 + exec "$_airstack_alt_bash" "$0" "$@" + fi + done + + cat >&2 <<'EOF' +[ERROR] airstack.sh requires bash 4 or newer (your bash is 3.x). + macOS ships bash 3.2 by default; install a modern bash with: + brew install bash + Or set AIRSTACK_BASH=/path/to/bash >= 4 before invoking this script. +EOF + exit 1 +fi +unset AIRSTACK_REEXEC_BASH + set -e # Script directory diff --git a/docs/getting_started/index.md b/docs/getting_started/index.md index e7bb64235..3b319ec00 100644 --- a/docs/getting_started/index.md +++ b/docs/getting_started/index.md @@ -1,5 +1,12 @@ # Getting Started +!!! tip "On Mac, Windows, or no GPU?" + + This page assumes a Linux desktop with an NVIDIA GPU. If that's not you, + use [AirStack on OSMO](../tutorials/airstack_on_osmo.md) instead — you + only need an SSH key, the `osmo` CLI, and VS Code or Cursor. No local + Docker, no NVIDIA drivers, no `airstack install`. + !!! warning "" AirStack is currently in ALPHA and only meant for internal usage. 
diff --git a/docs/tutorials/airstack_on_osmo.md b/docs/tutorials/airstack_on_osmo.md new file mode 100644 index 000000000..90d139b69 --- /dev/null +++ b/docs/tutorials/airstack_on_osmo.md @@ -0,0 +1,488 @@ +# AirStack on OSMO — Remote Development on Mac, Windows, or any Linux + +This tutorial walks through developing on AirStack from a laptop that has +**no Docker, no NVIDIA GPU, and no AirStack source tree of its own**. You'll +attach VS Code or Cursor to a remote OSMO pod via Remote-SSH, edit code as +if it were local, and stream Isaac Sim and the GCS Foxglove dashboard back +to your browser through `osmo workflow port-forward`. + +> **Prefer local development on a Linux+GPU desktop?** Use the +> [Getting Started](../getting_started/index.md) flow instead — `airstack +> install` + `airstack up` is faster and doesn't depend on a remote +> scheduler. This tutorial is for everyone *else*. + +## Who is this for? + +You want to develop AirStack and one of these is true: + +- You're on **macOS or Windows**. +- You have a Linux laptop but **no NVIDIA GPU**. +- Your lab shares a single GPU pool through OSMO and you'd like a + zero-installation onboarding path for new students. + +You're comfortable using `git` from a terminal, you have an SSH key +(`~/.ssh/id_ed25519` or similar), and you have either VS Code or Cursor +installed. That's the entire local-machine bar. + +## Architecture in a sentence + +`osmo workflow submit` spins up a privileged GPU pod that runs sshd plus a +Docker-in-Docker daemon. Inside that pod, `airstack up` brings up the +familiar three AirStack containers (Isaac Sim, robot-desktop, GCS). Your IDE +attaches over Remote-SSH; Isaac Sim and Foxglove are reached via separate +port-forwards. 
+ +```mermaid +flowchart LR + subgraph laptop [Your laptop - Mac / Windows / Linux] + ide[VS Code or Cursor + Remote-SSH] + osmo[osmo CLI] + fox[app.foxglove.dev] + webrtc[Isaac Sim WebRTC client] + end + subgraph pod [OSMO workspace pod - GPU, privileged] + sshd[sshd] + inner[Inner dockerd] + isaac[isaac-sim container] + robot[robot-desktop container] + gcs[gcs container] + end + osmo -- submit and port-forward --> pod + ide -- ssh on 2200 --> sshd + fox -- ws on 8766 --> gcs + webrtc -- WebRTC on 47995... --> isaac + inner --> isaac + inner --> robot + inner --> gcs +``` + +## Prerequisites + +| You need | Why | +|---|---| +| The [`osmo` CLI](https://github.com/NVIDIA/OSMO) on your `PATH` | Submitting workflows and port-forwarding | +| `osmo login` done once | Stores your auth token in `~/.config/osmo` | +| An SSH keypair (e.g. `~/.ssh/id_ed25519`) | The pod authorises your pubkey at submit time. Generate one with `ssh-keygen -t ed25519` if you don't already have one. | +| **VS Code with the Remote-SSH extension** *or* **Cursor with its Remote-SSH equivalent** | Where you'll actually edit AirStack code | +| Optional: Foxglove desktop app, or just `app.foxglove.dev` | View ROS topics | +| Optional: an Omniverse Streaming Client / WebRTC browser client | View the streamed Isaac Sim render | + +You **do not** need: Docker, NVIDIA drivers, `airstack install`, `airstack +setup`, a local clone of AirStack, sudo, or Linux. + +> **Lab admin prerequisites (someone else's job, once).** A lab admin +> confirms the OSMO pool allows `privileged: true` on a GPU node and pushes +> the `airstack-osmo-workspace` image to `airlab-docker.andrew.cmu.edu`. +> Details in +> [`osmo/README.md`](https://github.com/castacks/AirStack/blob/main/osmo/README.md) +> and the [one-time pool setup section below](#one-time-pool-setup-admin). +> +> **Your job, once:** the next step. 
+ +## One-time pool setup (admin) + +If `osmo workflow submit` returns: + +``` +Server responded with status code 400 +Error message: Workflow submit failed: +Task with platform: does not have privileged flag enabled. Task workspace +``` + +…then the OSMO pool you're targeting has `privileged_allowed: false` and the +DinD workspace can't run. You need a pool admin to flip it. **As of the +AirLab `airlab-share-01` deployment audit (May 2026), every pool ships with +`privileged_allowed: false` by default.** + +Audit pools yourself first: + +```bash +osmo pool list -t json | python3 -c " +import json, sys +for ns in json.load(sys.stdin)['node_sets']: + for p in ns['pools']: + for n, plat in p['platforms'].items(): + print(f\"{p['name']:25} {n:10} priv={plat['privileged_allowed']}\")" +``` + +If none show `priv=True`, send your pool admin a note like this: + +> Subject: Please enable privileged on the `airstack` pool +> +> Hi — I'm using the AirStack OSMO remote-dev workflow which runs the +> existing AirStack docker-compose stack inside a single pod via +> Docker-in-Docker (so students keep the `airstack up` UX). DinD requires +> `privileged: true` on the workspace task — without it the inner dockerd +> can't manage cgroups, overlayfs, the airstack_network bridge, or GPU +> device passthrough. +> +> Could you flip the `airstack` pool's platform to allow privileged tasks? +> Equivalent of: +> +> ```yaml +> platforms: +> default: +> privileged_allowed: true +> ``` +> +> No `host_network_allowed` change is needed — `osmo workflow port-forward` +> reaches the pod NS, which is enough for our Isaac Sim WebRTC and Foxglove +> streams. Workflow YAML for reference: +> `osmo/workflows/airstack-dev.yaml` in the AirStack repo. Setup details: +> `osmo/README.md`. + +Once enabled, target that pool with `--pool airstack` in Step 2. 
+ +## Step 0 — Register your OSMO credentials (one time) + +OSMO credentials are **per-user** (each Andrew ID has its own Nucleus token, +its own AirLab Docker password, its own OSMO profile). You register them +once with the `osmo` CLI on your laptop and OSMO injects them into every +workflow you submit afterwards. They never leave your OSMO profile and your +laptop never sees the values again. + +You need three credentials. The exact names matter — the workflow YAML +references them by these exact names. + +### Option A — interactive helper (recommended) + +If you have a local AirStack clone: + +```bash +airstack osmo:setup +``` + +This prompts for your Andrew ID, AirLab Docker password, and Nucleus API +token, then runs the three `osmo credential set` commands below for you. + +> **macOS prereq: bash 4+.** macOS ships bash 3.2 by default and the +> `airstack` CLI needs bash 4+. If you see +> `airstack.sh requires bash 4 or newer`, install a modern bash with: +> +> ```bash +> brew install bash +> ``` +> +> No further config needed — `airstack.sh` auto-detects the Homebrew bash +> at `/opt/homebrew/bin/bash` (Apple Silicon) or `/usr/local/bin/bash` +> (Intel) and re-execs under it. You don't need to change your login shell. + +### Option B — three commands, copy-paste + +If you'd rather run the commands yourself (or you're on a laptop without an +AirStack clone), here they are: + +#### 1. AirLab Docker registry (REGISTRY type) + +Used by OSMO to pull the workspace image into the pod. OSMO auto-attaches +this to any image whose hostname matches `registry=` — you don't need to +reference it in YAML. + +```bash +osmo credential set airlab-docker-registry \ + --type REGISTRY \ + --payload registry=airlab-docker.andrew.cmu.edu \ + username=<your-andrew-id> \ + auth='<your-airlab-docker-password>' +``` + +#### 2. AirLab Docker login (GENERIC type) + +The same Andrew ID + password, but as a **GENERIC** credential so the +**inner** dockerd inside the pod can `docker login` it and pull the +AirStack images. 
This duplication is unfortunately necessary — REGISTRY +credentials are for OSMO's outer image-pull only and aren't exposed to the +container as env vars. + +```bash +osmo credential set airlab-docker-login \ + --type GENERIC \ + --payload username=<your-andrew-id> \ + password='<your-airlab-docker-password>' +``` + +#### 3. AirLab Nucleus (GENERIC type) + +Nucleus authenticates with an **API token**, not your password. To get one: +go to <https://airlab-nucleus.andrew.cmu.edu/omni/web3/>, log in, +right-click the cloud icon in the top-right → **API Tokens** → create a new +token. Save it — Nucleus shows it once. + +```bash +osmo credential set airlab-nucleus \ + --type GENERIC \ + --payload omni_user=<your-andrew-id> \ + omni_pass='<your-nucleus-api-token>' \ + omni_server=omniverse://airlab-nucleus.andrew.cmu.edu/NVIDIA/Assets/Isaac/5.1 +``` + +### Verify + +List your credentials: + +```bash +osmo credential list +``` + +You should see all three. To rotate any of them later, just re-run the +matching `osmo credential set` command. + +> **Why three credentials?** It's tempting to consolidate. The reason for +> the split: OSMO REGISTRY credentials drive Kubernetes `imagePullSecrets` +> (auto-attached, never exposed as env vars), while GENERIC credentials are +> what get injected as env vars inside the running container. The pod +> needs **both** kinds of access — outer pull of the workspace image, plus +> inner login from the inner dockerd to pull AirStack images. + +## Step 1 — Add an SSH config entry (one time) + +VS Code and Cursor's Remote-SSH "Connect to Host…" picker reads +`~/.ssh/config`. Add this block once and the host shows up by name forever: + +```bash +cat >> ~/.ssh/config <<'EOF' + +Host airstack-osmo + HostName localhost + Port 2200 + User root + StrictHostKeyChecking accept-new +EOF +``` + +The `localhost:2200` is what we'll port-forward to in step 4. + +## Step 2 — Submit the workflow + +The repo ships the workflow at +[`osmo/workflows/airstack-dev.yaml`](https://github.com/castacks/AirStack/blob/main/osmo/workflows/airstack-dev.yaml). 
+You don't need a local AirStack clone to submit it — `osmo workflow submit` +takes a path and uploads the YAML. + +```bash +# If you don't have AirStack cloned locally: +curl -fsSL -o airstack-dev.yaml \ + https://raw.githubusercontent.com/castacks/AirStack/main/osmo/workflows/airstack-dev.yaml + +# Submit (replace airstack with whatever pool your admin enabled privileged on): +osmo workflow submit airstack-dev.yaml \ + --pool airstack \ + --set-env "SSH_PUB_KEY=$(cat ~/.ssh/id_ed25519.pub)" +``` + +> **Got `Task with platform ... does not have privileged flag enabled`?** +> The pool you picked doesn't allow privileged tasks. See the +> [one-time pool setup section](#one-time-pool-setup-admin) above — +> AirLab's default pools all ship with privileged off and need an admin +> to flip it on. + +The `--set-env "SSH_PUB_KEY=..."` line is what authorises **your** key on +**this** workflow. Each student passes their own pubkey at submit time — +the lab admin doesn't manage a global authorized_keys file. + +The command prints a workflow ID like `airstack-dev-1`. Save it; you'll +reuse it for every other command in this tutorial. The shell snippets below +assume you've stored it as `WF`: + +```bash +export WF=airstack-dev-1 +``` + +## Step 3 — Wait for the stack to come up + +Tail the lead task's logs and watch for milestones: + +```bash +osmo workflow logs $WF workspace --follow +``` + +Expected milestones, in order (each is one line in the log): + +1. `[entrypoint] sshd listening on :22` — VS Code/Cursor can attach. +2. `[entrypoint] dockerd ready` — the inner Docker daemon is up. +3. `Successfully built airstack_isaac-sim` *(or `Pulled` if pre-built)* — + the image set is in place. +4. `airstack-isaac-sim-livestream-1 ... started` +5. `airstack-robot-desktop-1 ... started` +6. `airstack-gcs-1 ... started` + +If step (1) appears, you can attach the IDE while the rest is still +spinning up — the bring-up will continue in the background. 
+ +## Step 4 — Forward sshd and attach the IDE + +In one terminal, start the port-forward and **leave it running** for the +length of your session. The 24h connect-timeout is deliberately longer than +the workflow's 8h `exec_timeout`, so the forward never expires before the workflow does: + +```bash +osmo workflow port-forward $WF workspace --port 2200:22 --connect-timeout 86400 +``` + +In your editor: + +- **VS Code:** Command Palette → **Remote-SSH: Connect to Host…** → pick + `airstack-osmo`. +- **Cursor:** the same flow under its remote-development menu. + +The IDE installs its remote server in the pod on first connect (~50 MB, +slower on a fresh pod, cached on subsequent connects). Then: + +1. **Open Folder…** → `/root/AirStack`. +2. Open the integrated terminal — you're root in `/root/AirStack`. +3. Edit code in the IDE; the changes land directly on the pod's disk. + +Verify everything is wired up by running: + +```bash +docker ps +``` + +You should see four containers: `airstack-isaac-sim-livestream-1`, +`airstack-robot-desktop-1`, `airstack-gcs-1`, plus the AirStack CLI helper. + +## Step 5 — Pick a feature branch and start working + +The pod cloned `main` into `/root/AirStack` on startup. Treat it like any +git working tree: + +```bash +git checkout -b my-feature +# edit code in the IDE... +bws --packages-select <your_package> # build inside the robot-desktop container per AGENTS.md +``` + +Standard ROS 2 commands work from the integrated terminal: + +```bash +docker exec airstack-robot-desktop-1 bash -c "ros2 node list" +docker exec airstack-robot-desktop-1 bash -c "ros2 topic hz /robot_1/odometry" +``` + +This is the same `docker exec` pattern documented in +[AGENTS.md](https://github.com/castacks/AirStack/blob/main/AGENTS.md) — the +fact that you're on a remote pod is invisible from inside the IDE. + +## Step 6 — View Isaac Sim (WebRTC livestream) + +Isaac Sim runs headless inside the pod with the Kit +`omni.kit.livestream.webrtc` extension enabled (configured by the +`isaac-sim-livestream` Compose profile).
To view it locally, forward the +livestream port range — **two** terminals because livestream uses both TCP +and UDP: + +```bash +# Terminal A (TCP): +osmo workflow port-forward $WF workspace \ + --port 47995-48012,49000-49007,49100 --connect-timeout 86400 +``` + +```bash +# Terminal B (UDP): +osmo workflow port-forward $WF workspace \ + --port 47995-48012,49000-49007 --udp --connect-timeout 86400 +``` + +Then point the **Omniverse Streaming Client** (or a WebRTC-capable browser +client) at `http://localhost`. The simulation viewport shows up the same +way it would on a local Linux desktop. + +## Step 7 — View ROS topics in Foxglove + +The GCS container runs `foxglove_bridge` on container-port `8765`, +published as host-port `8766` on the workspace pod. Forward it once: + +```bash +osmo workflow port-forward $WF workspace --port 8766:8766 --connect-timeout 86400 +``` + +Then in [https://app.foxglove.dev](https://app.foxglove.dev): + +1. **Open connection** → `ws://localhost:8766`. +2. **Layouts** → **Import from file** → + [`gcs/foxglove_extensions/airstack_default.json`](https://github.com/castacks/AirStack/blob/main/gcs/foxglove_extensions/airstack_default.json) + from your local AirStack clone (or download it via the GitHub raw URL). +3. Pick the imported layout from the layout dropdown in the top-right. + +The full Foxglove flow — layout import, panel customisation, DDS bridge +naming — is documented at +[Foxglove Visualization](../gcs/foxglove.md). The only OSMO-specific +difference is the `port-forward` line in front of it. + +## Step 8 — Commit and push from inside the IDE + +The pod's filesystem is **ephemeral**. The persistence boundary is git, not +disk. 
Commit and push every meaningful chunk of work — a Source Control +panel commit + push, or in the integrated terminal: + +```bash +git add -A +git commit -m "WIP: feature X" +git push -u origin my-feature +``` + +Once your branch is on the remote, you can pull it from anywhere — your +laptop, a fresh pod tomorrow, a colleague's machine. + +> **Configuring git auth in the pod.** The pod is yours for the session. +> Inside the IDE's integrated terminal, set `git config user.name`, +> `user.email`, and configure your push auth (HTTPS + a GitHub PAT, or a +> per-pod SSH key the IDE forwards via `AllowAgentForwarding yes`). The +> `airstack-osmo-workspace` image deliberately does not bake any one +> student's git creds. + +## Step 9 — Tearing down + +When you're done: + +```bash +osmo workflow cancel $WF +``` + +> **Push first.** Anything that's still in your working tree, in `.git/` +> but not pushed, in `build/`, in `bags/`, or in `/root/` outside the repo +> **will be lost** on cancel. The pod is cattle. If you forget and need +> something pulled out, see "I forgot to push before tearing down" below +> *before* hitting cancel. + +## Troubleshooting + +| Symptom | Likely cause | Fix | +|---|---|---| +| `Remote-SSH: Connection refused` after a working session | Port-forward died (laptop slept, network blip) | Re-run `osmo workflow port-forward $WF workspace --port 2200:22 --connect-timeout 86400` | +| `Permission denied (publickey)` on Remote-SSH | The pod authorised a different pubkey than the one your local SSH client is offering | Confirm `cat ~/.ssh/id_ed25519.pub` matches what was passed to `--set-env "SSH_PUB_KEY=..."`. Re-submit if the wrong key was used. 
| +| `osmo workflow logs` shows `ERROR: SSH_PUB_KEY not set` | You forgot `--set-env` on submit | Cancel the workflow and resubmit with `--set-env "SSH_PUB_KEY=$(cat ~/.ssh/id_ed25519.pub)"` | +| `docker pull` fails inside the pod with `unauthorized` | Your `airlab-docker-login` credential is missing or has the wrong Andrew ID/password | Re-run `airstack osmo:setup` (or the `osmo credential set airlab-docker-login ...` command in Step 0). | +| Logs show `WARN: airlab-nucleus OSMO credential not set` and Isaac Sim asset loads fail | Your `airlab-nucleus` credential is missing or its API token expired | Re-run `airstack osmo:setup` after generating a fresh Nucleus API token. | +| Isaac Sim container restarts repeatedly | GPU not visible to the inner Docker daemon (toolkit not configured on the node) | Lab admin task. From inside the pod: `docker info \| grep -i runtime` should list `nvidia`. | +| Isaac Sim is up but the WebRTC stream is blank | The Pegasus script isn't getting `--/app/livestream/enabled=true`, or the wrong Compose profile is active | In the integrated terminal: `docker logs airstack-isaac-sim-livestream-1`. Confirm `ISAAC_SIM_LIVESTREAM=true` and that the `isaac-sim-livestream` profile is the one running (`docker ps`). | +| Foxglove "no connection" | Port-forward died, GCS container hasn't started yet, or browser is caching an old connection | Restart the `--port 8766:8766` forward; check `docker ps` shows `airstack-gcs-1` Up; try `ws://127.0.0.1:8766` instead of `ws://localhost:8766`. | +| First Remote-SSH connect takes forever | VS Code / Cursor downloading its remote server (~50 MB) into the fresh pod | Wait it out the first time. Subsequent connects to the same pod hit the cache. | +| **I forgot to push before tearing down** | The pod is still up; cancel hasn't fired yet | Don't cancel. SSH in via the existing port-forward, push from the IDE terminal, *then* cancel. 
If the workflow has already terminated and the pod is gone, the work is gone — git is the only persistence layer. | + +## What survives a `osmo workflow cancel`? + +| Artifact | Lives in | Survives? | +|---|---|---| +| Code committed and pushed to a feature branch | GitHub | **Yes** | +| Code committed but not pushed | Pod-local `.git` | **No** | +| Uncommitted edits in the IDE | Pod-local working tree | **No** | +| `colcon build` outputs (`build/`, `install/`, `log/`) | `/root/AirStack/**/ros_ws/...` | **No** (gitignored Linux x86_64 binaries; rebuild trivially) | +| Inner-dockerd image cache | Pod-local Docker layer cache | **No** | +| Bag files, sim recordings, debug screenshots | `/root/AirStack/bags/`, etc. | **No** — pull selectively via `osmo workflow rsync download $WF :` *before* cancel | + +The rule of thumb: **commit + push every time you'd save a file in a +git-tracked sense.** The Source Control panel is the persistence boundary. + +## See also + +- [`osmo/README.md`](https://github.com/castacks/AirStack/blob/main/osmo/README.md) + — lab-admin reference (pool prerequisites, OSMO credential registration, + workspace image build, validation stages). +- [Foxglove Visualization](../gcs/foxglove.md) — full layout import + + panel-customisation flow once your `port-forward 8766:8766` is up. +- [AGENTS.md](https://github.com/castacks/AirStack/blob/main/AGENTS.md) — + inside-the-pod workflow once you're attached: `bws`, `sws`, `docker exec`, + ROS 2 commands. +- [Getting Started](../getting_started/index.md) — the local-Linux-GPU + alternative. diff --git a/docs/tutorials/index.md b/docs/tutorials/index.md index d1606f4e3..1a9a4836f 100644 --- a/docs/tutorials/index.md +++ b/docs/tutorials/index.md @@ -5,6 +5,7 @@ Step-by-step guides for common AirStack workflows. 
If you are new, start with ** | Tutorial | Description | |---|---| | [Getting Started](../getting_started.md) | Install AirStack, pull Docker images, launch a simulated robot, and fly it for the first time. | +| [AirStack on OSMO (Mac/Windows OK)](airstack_on_osmo.md) | Develop on AirStack from a Mac, Windows, or no-GPU Linux laptop using NVIDIA OSMO + VS Code/Cursor Remote-SSH. No local Docker, no AirStack clone, no `airstack install`. | | [Multi-Robot Simulation](multi_robot_simulation.md) | Spin up multiple simulated robots in Isaac Sim and verify independent ROS 2 namespaces. | | [Autonomy Modes](autonomy_modes.md) | Understand `onboard_all`, `onboard_local`, and `offboard_global` modes and the commands to run each. | | [Deploying to Hardware](deploying_to_hardware.md) | Flash a Jetson or VOXL device, configure the robot hostname, and run the autonomy stack on a real drone. | diff --git a/mkdocs.yml b/mkdocs.yml index 00a1aee14..c4d92fede 100644 --- a/mkdocs.yml +++ b/mkdocs.yml @@ -52,6 +52,7 @@ nav: - Home: docs/index.md - Getting Started: - docs/getting_started/index.md + - docs/tutorials/airstack_on_osmo.md - docs/getting_started/tutorials_reference.md - Development: - docs/development/index.md diff --git a/osmo/README.md b/osmo/README.md new file mode 100644 index 000000000..b29b38070 --- /dev/null +++ b/osmo/README.md @@ -0,0 +1,248 @@ +# AirStack on OSMO + +This directory holds the bits that let students develop on AirStack remotely +through [NVIDIA OSMO](https://github.com/NVIDIA/OSMO): + +``` +osmo/ +├── README.md # This file (admin / operator reference) +├── workflows/ +│ └── airstack-dev.yaml # The OSMO workflow students submit +└── workspace/ + ├── Dockerfile # The airstack-osmo-workspace image + ├── sshd_config # Pubkey-only sshd config baked into the image + └── entrypoint.sh # Pod startup: sshd, dockerd, clone, airstack up +``` + +The student-facing walkthrough lives in 
+[`docs/tutorials/airstack_on_osmo.md`](../docs/tutorials/airstack_on_osmo.md) +— including the per-user **Step 0** for registering OSMO credentials. This +README is the **lab admin / operator** reference: pool requirements, +workspace image build & push, validation stages, plus a credential summary +for context. + +> **Scope:** developer workflow only. CI/CD on OSMO is **not** part of this +> integration — the existing `system-tests.yml` + OpenStack orchestrator path +> is unchanged. + +## Architecture in one minute + +A student submits one OSMO task that runs a privileged Docker-in-Docker (DinD) +pod with sshd. Inside that pod, `airstack.sh up` brings up the regular +three-container AirStack stack (Isaac Sim, robot-desktop, GCS) on the inner +Docker daemon. The student attaches VS Code or Cursor over Remote-SSH and +streams Isaac Sim (WebRTC) and the GCS Foxglove bridge (websocket) back to +their laptop via `osmo workflow port-forward`. + +``` +Student laptop OSMO workspace pod (privileged, GPU) +───────────────── ───────────────────────────────────── +VS Code / Cursor ── ssh ──► port-forward 2200:22 ──► sshd +Isaac Sim WebRTC ── webrtc ► port-forward 47995… ──► inner isaac-sim ctnr +app.foxglove.dev ── ws ────► port-forward 8766 ────► inner gcs ctnr (8765) + ▲ + │ inner dockerd + │ (NVIDIA runtime) + │ + airstack.sh up brings these 3 up +``` + +## Pool requirements + +> **⚠️ Most common blocker:** if `osmo workflow submit` returns `Task with +> platform: does not have privileged flag enabled`, the pool you +> selected has `privileged_allowed: false`. As of the AirLab `airlab-share-01` +> deployment audit (May 2026), **every pool defaults to `privileged_allowed: +> false`**. Ask the pool admin to flip it to `true` on the `airstack` pool (or +> create a dedicated pool) — see the message template in +> [`docs/tutorials/airstack_on_osmo.md`](../docs/tutorials/airstack_on_osmo.md#one-time-pool-setup-admin). 
+> You can audit which pools allow privileged with: +> ```bash +> osmo pool list -t json | python3 -c " +> import json, sys +> for ns in json.load(sys.stdin)['node_sets']: +> for p in ns['pools']: +> for n, plat in p['platforms'].items(): +> print(f\"{p['name']:25} {n:10} priv={plat['privileged_allowed']}\")" +> ``` + +The OSMO pool the workflow runs on must satisfy: + +| Requirement | Why | +|---|---| +| `privileged_allowed: true` | Required for DinD inside the workspace task. The inner `dockerd` needs cgroup manipulation, overlayfs, bridge/veth/iptables for `airstack_network`, and GPU device passthrough. There is no non-privileged path. Mounting the host's `/var/run/docker.sock` is deliberately avoided — it would let the pod escape to the cluster node. | +| GPU pool with NVIDIA driver + `nvidia-container-toolkit` on each node | Isaac Sim needs the GPU. The toolkit must be on the node so the inner `dockerd` (configured with `--add-runtime nvidia=...`, `default-runtime: nvidia`) can hand the device to the inner Isaac Sim container. | +| No NetworkPolicy blocking pod-namespace ports `47995–48012/tcp+udp`, `49000–49007/tcp+udp`, `49100/tcp`, `8766/tcp`, `22/tcp` | These are the ports `osmo workflow port-forward` reaches inside the pod NS for Isaac Sim WebRTC, GCS Foxglove websocket, and sshd. | +| Resource limits ≥ `cpu: 16`, `memory: 64Gi`, `storage: 500Gi`, `gpu: 1` | These match the `resources:` request in `workflows/airstack-dev.yaml`: Isaac Sim + AirStack images + `colcon build` working tree. Adjust upward if running multiple robots or heavy bag recording. | + +`hostNetwork: true` is **not** required. `osmo workflow port-forward` reaches +the pod's network namespace, which is where the inner `dockerd` publishes +ports via standard NAT (or `network_mode: host` on individual inner +containers, both of which terminate at the pod NS, not the cluster node). + +## OSMO credentials (per user, one time) + +OSMO credentials live in **each user's** OSMO profile, not in a lab-wide +store.
Every student registers their own three credentials with `osmo +credential set` once on their laptop. The full walkthrough — including the +exact `osmo credential set ...` commands and how to obtain a Nucleus API +token — lives in +[`docs/tutorials/airstack_on_osmo.md` Step 0](../docs/tutorials/airstack_on_osmo.md#step-0--register-your-osmo-credentials-one-time). + +The three credentials, summarized for quick reference: + +| Name | Type | Used for | Referenced in workflow YAML? | +|---|---|---|---| +| `airlab-docker-registry` | `REGISTRY` | OSMO's automatic pull of the workspace image (`airlab-docker.andrew.cmu.edu/airstack/airstack-osmo-workspace:...`) | No — OSMO auto-attaches it to any image whose hostname matches the credential's `registry=` field. | +| `airlab-docker-login` | `GENERIC` | `entrypoint.sh` calls `docker login airlab-docker.andrew.cmu.edu` on the **inner** dockerd before `airstack up`, so the inner Compose stack can pull AirStack images | Yes — exposed as env vars `AIRLAB_REGISTRY_USER`/`AIRLAB_REGISTRY_PASS`. | +| `airlab-nucleus` | `GENERIC` | `entrypoint.sh` materializes `simulation/isaac-sim/docker/omni_pass.env` from it so Compose can env-file it into the Isaac Sim container | Yes — exposed as env vars `OMNI_USER`/`OMNI_PASS`/`OMNI_SERVER`. | + +The convenience helper `airstack osmo:setup` in +[`.airstack/modules/osmo.sh`](../.airstack/modules/osmo.sh) prompts for the +underlying values (Andrew ID, AirLab password, Nucleus API token) and runs +all three `osmo credential set` commands. + +> **Why a `REGISTRY` and a `GENERIC` credential for the same registry?** +> OSMO `REGISTRY` credentials drive Kubernetes `imagePullSecrets` — +> auto-attached but not exposed to the container as env vars. The +> **inner** dockerd (DinD) that `entrypoint.sh` starts is a separate +> Docker daemon and needs its own `docker login`. Hence the two-credential +> split. 
+ +## Build & push the workspace image + +The workspace image is built once and pushed to the AirLab registry; students +never build it themselves. + +```bash +cd osmo/workspace +docker build -t airlab-docker.andrew.cmu.edu/airstack/airstack-osmo-workspace:latest . +docker push airlab-docker.andrew.cmu.edu/airstack/airstack-osmo-workspace:latest +``` + +Tag a versioned release alongside `latest` if you change anything in +`Dockerfile`, `sshd_config`, or `entrypoint.sh`: + +```bash +docker tag airlab-docker.andrew.cmu.edu/airstack/airstack-osmo-workspace:latest \ + airlab-docker.andrew.cmu.edu/airstack/airstack-osmo-workspace:v0.1.0 +docker push airlab-docker.andrew.cmu.edu/airstack/airstack-osmo-workspace:v0.1.0 +``` + +Then update the `image:` field in +[`workflows/airstack-dev.yaml`](workflows/airstack-dev.yaml) to match. + +The image bakes: + +- Ubuntu 24.04 base with `docker-ce`, `docker-compose-plugin`, `nvidia-container-toolkit` +- `git`, `python3`, `curl` +- `openssh-server` with **password auth permanently disabled** (pubkey only) via the baked `sshd_config` +- The AirStack `airstack.sh` CLI script on `PATH` + +The image does **not** bake the AirStack source tree. `entrypoint.sh` clones +it on first start (and skips re-cloning across pod restarts). + +## Validation stages + +Run these in order against a fresh submission. Each unlocks the next; if (a) +fails don't bother trying (b). + +### (a) sshd reachable, key auth works + +```bash +osmo workflow submit osmo/workflows/airstack-dev.yaml \ + --pool \ + --set-env "SSH_PUB_KEY=$(cat ~/.ssh/id_ed25519.pub)" +# → record + +osmo workflow port-forward workspace --port 2200:22 --connect-timeout 86400 & +ssh -p 2200 -o StrictHostKeyChecking=accept-new root@localhost 'echo ok && whoami' +# → "ok\nroot" +``` + +If SSH fails: check `osmo workflow logs workspace` for the +`SSH_PUB_KEY not set` error or for `sshd` failing to start. Make sure the +pool actually allowed the privileged task to run. 
+ +### (b) VS Code / Cursor Remote-SSH attaches and opens `/root/AirStack` + +Add to `~/.ssh/config`: + +``` +Host airstack-osmo + HostName localhost + Port 2200 + User root + StrictHostKeyChecking accept-new +``` + +Then in VS Code: Command Palette → **Remote-SSH: Connect to Host…** → +`airstack-osmo` → open folder `/root/AirStack`. The IDE will install its +remote server in the pod on first connect (~50 MB download, slow on a fresh +pod; cached afterwards). + +### (c) `airstack up` brings the three containers Up + +In the IDE's integrated terminal (or `osmo workflow exec`): + +```bash +docker ps +# → expect: airstack-isaac-sim-livestream-1, airstack-robot-desktop-1, airstack-gcs-1 +``` + +If any container is missing or restarting, the most common causes (in order): + +1. The user's `airlab-docker-login` GENERIC credential is wrong / unset → + inner `docker pull` from `airlab-docker.andrew.cmu.edu` failed. + Re-run `airstack osmo:setup` (or the explicit `osmo credential set + airlab-docker-login ...` command in the tutorial Step 0). +2. `nvidia-container-toolkit` is not configured on the node → inner Isaac Sim + can't see the GPU. Check `docker info | grep -i runtime` inside the + workspace pod; you should see `nvidia` in the runtime list. +3. The pod ran out of `storage:` quota during the image pull. Bump it. + +### (d) Isaac Sim WebRTC client renders + +Two port-forwards (TCP + UDP): + +```bash +osmo workflow port-forward <wf-id> workspace \ + --port 47995-48012,49000-49007,49100 --connect-timeout 86400 & +osmo workflow port-forward <wf-id> workspace \ + --port 47995-48012,49000-49007 --udp --connect-timeout 86400 & +``` + +Open the Omniverse Streaming Client (or a browser WebRTC client) at +`http://localhost`. + +If the stream is blank: check that the Pegasus standalone script was launched +with `--/app/livestream/enabled=true`.
The +[`isaac-sim-livestream`](../simulation/isaac-sim/docker/docker-compose.yaml) +Compose profile is what wires that argument; verify the workflow YAML has +`ISAAC_SIM_LIVESTREAM=true` in `environment:`. + +### (e) Foxglove websocket loads the AirStack layout + +```bash +osmo workflow port-forward workspace --port 8766:8766 --connect-timeout 86400 & +``` + +Open [https://app.foxglove.dev](https://app.foxglove.dev) → **Open +connection** → `ws://localhost:8766` → **Layouts** → **Import from file** → +[`gcs/foxglove_extensions/airstack_default.json`](../gcs/foxglove_extensions/airstack_default.json). + +The wider Foxglove layout / panel-import flow is documented in +[`docs/gcs/foxglove.md`](../docs/gcs/foxglove.md); the only OSMO-specific +piece is the `port-forward` line in front of it. + +## Out of scope (followups) + +- **OSMO-native split** — three separate OSMO tasks for `isaac-sim` / + `robot-desktop` / `gcs` instead of one DinD pod. Larger refactor of + Compose, DDS networking, and `tests/conftest.py`. The `osmo/workflows/` + layout leaves room for additional workflow files when this is done. +- **Persistent workspace** — mount `/root/AirStack` to a PVC so uncommitted + edits survive `osmo workflow cancel`. Pool-policy dependent. +- **CI/CD on OSMO** — the existing `.github/workflows/system-tests.yml` + + OpenStack ephemeral runner path is unchanged. Migrating CI to OSMO is a + separate effort. diff --git a/osmo/workflows/airstack-dev.yaml b/osmo/workflows/airstack-dev.yaml new file mode 100644 index 000000000..854a47ded --- /dev/null +++ b/osmo/workflows/airstack-dev.yaml @@ -0,0 +1,100 @@ +# AirStack remote developer workflow on OSMO. +# +# Submits a single privileged GPU task ("workspace") that runs Docker-in-Docker +# (DinD) and brings up the regular AirStack three-container stack (Isaac Sim +# with WebRTC livestream, robot-desktop, GCS) on the inner Docker daemon. 
The +# task also runs sshd so a student can attach VS Code or Cursor over Remote-SSH +# from their laptop (Mac, Windows, or Linux — no local Docker / NVIDIA driver +# required). +# +# To submit (replace and substitute your actual pubkey): +# +# osmo workflow submit osmo/workflows/airstack-dev.yaml \ +# --pool \ +# --set-env "SSH_PUB_KEY=$(cat ~/.ssh/id_ed25519.pub)" +# +# To stream the IDE, Isaac Sim, and Foxglove back to the laptop: +# +# # SSH so VS Code / Cursor Remote-SSH can attach (1 terminal): +# osmo workflow port-forward workspace --port 2200:22 --connect-timeout 86400 +# +# # Isaac Sim WebRTC (2 terminals — TCP + UDP): +# osmo workflow port-forward workspace \ +# --port 47995-48012,49000-49007,49100 --connect-timeout 86400 +# osmo workflow port-forward workspace \ +# --port 47995-48012,49000-49007 --udp --connect-timeout 86400 +# +# # GCS Foxglove websocket (1 terminal): +# osmo workflow port-forward workspace --port 8766:8766 --connect-timeout 86400 +# +# See docs/tutorials/airstack_on_osmo.md for the full walkthrough and +# osmo/README.md for the lab-admin setup (pool prerequisites, OSMO credential +# registration, workspace image build). + +workflow: + name: airstack-dev + groups: # `groups:` keeps room to add a separate + # dind sidecar or split out isaac-sim / + # robot-desktop / gcs as their own + # tasks later. For now, one lead task. + - name: airstack + tasks: + - name: workspace + lead: true + image: airlab-docker.andrew.cmu.edu/airstack/airstack-osmo-workspace:latest + # privileged is non-negotiable for DinD: cgroups, overlayfs, bridge, + # iptables, GPU device passthrough. hostNetwork is intentionally NOT + # set — osmo workflow port-forward reaches the pod NS, where the inner + # dockerd publishes ports via standard NAT. 
+ privileged: true + command: ["bash"] + args: ["/usr/local/bin/entrypoint.sh"] + environment: + # Behaviour switches consumed by entrypoint.sh and airstack.sh: + AUTOLAUNCH: "true" # boot AirStack on startup + ISAAC_SIM_LIVESTREAM: "true" # use the isaac-sim-livestream profile + NUM_ROBOTS: "1" + AIRSTACK_BRANCH: "main" # branch entrypoint.sh clones + AIRSTACK_REPO_URL: "https://github.com/castacks/AirStack.git" + # SSH_PUB_KEY is supplied at submit time: + # --set-env "SSH_PUB_KEY=$(cat ~/.ssh/id_ed25519.pub)" + credentials: + # Each student registers these in their own OSMO profile once. + # See docs/tutorials/airstack_on_osmo.md "Step 0". + # + # airlab-nucleus (GENERIC) — materialized into omni_pass.env by + # entrypoint.sh so Compose env_files it into the Isaac Sim ctnr. + airlab-nucleus: + OMNI_USER: omni_user + OMNI_PASS: omni_pass + OMNI_SERVER: omni_server + # airlab-docker-login (GENERIC) — exposed as env vars so the + # inner dockerd can authenticate to airlab-docker.andrew.cmu.edu + # when `airstack up` triggers an AirStack image-pull. (The + # *outer* pod's image-pull of airstack-osmo-workspace is handled + # automatically by a sibling REGISTRY-type credential which does + # not need a reference here — OSMO auto-attaches it on submit.) + airlab-docker-login: + AIRLAB_REGISTRY_USER: username + AIRLAB_REGISTRY_PASS: password + + resources: + default: + cpu: 16 + gpu: 1 + memory: 64Gi + # AirStack image set is large: airstack-dev-9 (2026-05-14) hit the + # 100Gi container ephemeral cap during inner Compose's first image + # extract, before the second image even started downloading. So the + # full set of inner images alone exceeds 100Gi extracted. Going to + # 500Gi to comfortably hold isaac-sim + robot-desktop + gcs images, + # plus the AirStack source clone, colcon build output, and bag + # recordings. 
The airstack pool's workers have 4.2Ti of ephemeral + # capacity each (root disks resized 2026-05-14), so this leaves + # plenty of room for other tenants on the shared workers. + storage: 500Gi + + timeout: + # 8h covers a normal dev session. Bump for longer runs; cancel manually + # before the timeout if you want to free the GPU early. + exec_timeout: 8h diff --git a/osmo/workspace/Dockerfile b/osmo/workspace/Dockerfile new file mode 100644 index 000000000..1aff0b4af --- /dev/null +++ b/osmo/workspace/Dockerfile @@ -0,0 +1,104 @@ +# airstack-osmo-workspace +# +# Image used by the OSMO airstack-dev workflow. Boots into a Docker-in-Docker +# (DinD) pod with sshd on :22 so VS Code / Cursor Remote-SSH can attach. The +# inner dockerd then runs the regular AirStack docker-compose stack (Isaac +# Sim, robot-desktop, GCS) on the GPU forwarded into the pod. +# +# Built and pushed by the lab admin (see osmo/README.md): +# docker build -t airlab-docker.andrew.cmu.edu/airstack/airstack-osmo-workspace:latest . +# docker push airlab-docker.andrew.cmu.edu/airstack/airstack-osmo-workspace:latest +# +# Students never build this image. + +FROM ubuntu:24.04 + +ENV DEBIAN_FRONTEND=noninteractive +ENV LANG=C.UTF-8 +ENV LC_ALL=C.UTF-8 + +# Base utilities + sshd + dev ergonomics. python3 is here so the airstack.sh +# CLI's helper scripts can shell out to python3 without extra installs. +RUN apt-get update && apt-get install -y --no-install-recommends \ + ca-certificates \ + curl \ + git \ + git-lfs \ + gnupg \ + iproute2 \ + iptables \ + jq \ + less \ + locales \ + lsb-release \ + openssh-server \ + procps \ + python3 \ + python3-pip \ + rsync \ + sudo \ + tmux \ + tzdata \ + vim-tiny \ + && locale-gen C.UTF-8 \ + && rm -rf /var/lib/apt/lists/* + +# Docker CE + Compose plugin (required for `airstack up` to work inside the +# pod). The same install procedure as get.docker.com but pinned to apt repos +# so we can upgrade explicitly. 
+RUN install -m 0755 -d /etc/apt/keyrings \ + && curl -fsSL https://download.docker.com/linux/ubuntu/gpg \ + | gpg --dearmor -o /etc/apt/keyrings/docker.gpg \ + && chmod a+r /etc/apt/keyrings/docker.gpg \ + && echo \ + "deb [arch=$(dpkg --print-architecture) signed-by=/etc/apt/keyrings/docker.gpg] \ + https://download.docker.com/linux/ubuntu $(lsb_release -cs) stable" \ + > /etc/apt/sources.list.d/docker.list \ + && apt-get update && apt-get install -y --no-install-recommends \ + docker-ce \ + docker-ce-cli \ + containerd.io \ + docker-buildx-plugin \ + docker-compose-plugin \ + fuse-overlayfs \ + && rm -rf /var/lib/apt/lists/* + +# Note: dockerd inside an OSMO/k8s pod is running on top of an overlayfs +# rootfs (the pod's own /). Docker refuses overlay2-on-overlayfs, so without +# fuse-overlayfs the entrypoint's storage-driver fallback chain lands on +# vfs, which has no copy-on-write and bloats the AirStack image set ~3x +# (airstack-dev-10, 2026-05-14 burned 270Gi for 2 of 3 inner images). +# fuse-overlayfs gives proper CoW for DinD over overlayfs. + +# NVIDIA Container Toolkit, configured to register the `nvidia` runtime with +# dockerd. This is what lets the inner Isaac Sim container see the GPU +# forwarded into the workspace pod. The pool's nodes still need the host-side +# NVIDIA driver + nvidia-container-toolkit; this just configures the inner +# dockerd. +RUN curl -fsSL https://nvidia.github.io/libnvidia-container/gpgkey \ + | gpg --dearmor -o /usr/share/keyrings/nvidia-container-toolkit-keyring.gpg \ + && curl -fsSL https://nvidia.github.io/libnvidia-container/stable/deb/nvidia-container-toolkit.list \ + | sed 's#deb https://#deb [signed-by=/usr/share/keyrings/nvidia-container-toolkit-keyring.gpg] https://#g' \ + > /etc/apt/sources.list.d/nvidia-container-toolkit.list \ + && apt-get update && apt-get install -y --no-install-recommends \ + nvidia-container-toolkit \ + && rm -rf /var/lib/apt/lists/* + +# sshd: pubkey-only, no password auth ever. 
Host keys generated at runtime in +# entrypoint.sh so each pod has unique keys (good practice; harmless cost). +COPY sshd_config /etc/ssh/sshd_config +RUN chmod 644 /etc/ssh/sshd_config && mkdir -p /var/run/sshd + +# entrypoint: starts sshd, dockerd, clones AirStack, materializes secrets, +# runs `airstack up`, then sleeps so port-forwards keep working. +COPY entrypoint.sh /usr/local/bin/entrypoint.sh +RUN chmod 0755 /usr/local/bin/entrypoint.sh + +# airstack.sh is not baked into this image: entrypoint.sh clones the repo at +# startup and makes its CLI available on PATH for interactive shells. +WORKDIR /root + +# Default to a long-running entrypoint. The OSMO workflow overrides command +# and args to run `bash /usr/local/bin/entrypoint.sh`, i.e. this same +# script. Either path works. +CMD ["/usr/local/bin/entrypoint.sh"] diff --git a/osmo/workspace/entrypoint.sh b/osmo/workspace/entrypoint.sh new file mode 100755 index 000000000..9a4fa8f19 --- /dev/null +++ b/osmo/workspace/entrypoint.sh @@ -0,0 +1,201 @@ +#!/usr/bin/env bash +# entrypoint.sh — airstack-osmo-workspace pod startup. +# +# Order of operations: +# 1. Install SSH_PUB_KEY into authorized_keys, generate sshd host keys, +# start sshd. Done first so the student can SSH in even if a later step +# fails (huge debugging accelerator). +# 2. Start the inner Docker daemon (DinD) with the NVIDIA runtime so Isaac +# Sim sees the GPU forwarded into the pod. +# 3. Clone AirStack into /root/AirStack (skipped if already cloned by a +# previous pod incarnation). +# 4. Materialize simulation/isaac-sim/docker/omni_pass.env from the +# `airlab-nucleus` OSMO GENERIC credential. +# 5. docker login airlab-docker.andrew.cmu.edu using the +# `airlab-docker-login` OSMO GENERIC credential. +# 6. cd /root/AirStack && ./airstack.sh up +# 7. sleep infinity so port-forwards keep working. +# +# All steps are idempotent across pod restarts: re-running this script +# inside the same pod is safe.
+
+set -uo pipefail
+
+log() { echo "[entrypoint] $*"; }
+fail() { echo "[entrypoint] ERROR: $*" >&2; exit 1; }
+
+# ─── 1. SSHD ───────────────────────────────────────────────────────────────
+
+log "configuring sshd"
+
+mkdir -p /root/.ssh && chmod 700 /root/.ssh
+
+if [ -z "${SSH_PUB_KEY:-}" ]; then
+    fail "SSH_PUB_KEY not set. Re-submit with --set-env \"SSH_PUB_KEY=\$(cat ~/.ssh/id_ed25519.pub)\""
+fi
+
+# Always overwrite — last submit wins. Single-user dev pod.
+# printf (not echo) so the key text is written verbatim.
+printf '%s\n' "${SSH_PUB_KEY}" > /root/.ssh/authorized_keys
+chmod 600 /root/.ssh/authorized_keys
+
+# Generate fresh host keys if missing (first boot of this pod).
+ssh-keygen -A
+
+mkdir -p /var/run/sshd
+# This script runs without `set -e`, so sshd's exit status has to be checked
+# explicitly; otherwise a failed start would still be logged as "listening".
+# A failure is only a warning: the remaining steps still run so their output
+# shows up in the task logs.
+if /usr/sbin/sshd; then
+    log "sshd listening on :22"
+else
+    log "WARN: sshd failed to start (exit=$?) — SSH / IDE attach will not work"
+fi
+
+# ─── 2. Inner dockerd (DinD) ───────────────────────────────────────────────
+
+log "starting inner dockerd (DinD with NVIDIA runtime)"
+
+# nvidia-container-toolkit ships a CLI that registers the nvidia runtime in
+# the dockerd config and (optionally) sets it as the default. We want it as
+# the default so `airstack up` doesn't have to specify --runtime.
+nvidia-ctk runtime configure --runtime=docker --set-as-default || \
+    log "WARN: nvidia-ctk runtime configure failed — Isaac Sim probably won't see the GPU"
+
+# Pre-flight diagnostics so failures surface in OSMO logs (the pod is gone
+# by the time anyone reads /var/log/dockerd.log otherwise).
+log "diagnostics: kernel=$(uname -r) cgroups=$(stat -fc %T /sys/fs/cgroup 2>/dev/null) rootfs=$(stat -fc %T / 2>/dev/null)"
+log "diagnostics: /var/lib/docker fs=$(stat -fc %T /var/lib/docker 2>/dev/null || echo absent)"
+
+# Inner dockerd setup. We try storage drivers in order: overlay2 (fastest,
+# works on most modern hosts) → fuse-overlayfs (rootless-friendly, may not be
+# present) → vfs (always works, slowest). Falling back avoids the
+# overlay-on-overlay failure that bites DinD on some kernel/storage
+# combinations.
+# Launch the inner dockerd with one storage driver and wait for it to answer.
+#   $1       storage driver passed to `dockerd --storage-driver`
+#   Globals  DOCKERD_PID (written): pid of the daemon started by this call
+#   Returns  0 once `docker info` succeeds; 1 if the daemon exits early or is
+#            still unresponsive after 30s (in which case it is stopped first)
+_start_dockerd() {
+    local driver="$1"
+    local attempt waited
+    : > /var/log/dockerd.log
+    # We rely on /var/lib/docker being a bind-mount of /mnt/airstack-data
+    # (200Gi Cinder volume) so during-pull disk peaks aren't constrained by
+    # node ephemeral-storage. Default --max-concurrent-downloads=3 is fine.
+    nohup dockerd \
+        --host=unix:///var/run/docker.sock \
+        --storage-driver="$driver" \
+        > /var/log/dockerd.log 2>&1 &
+    DOCKERD_PID=$!
+    log "dockerd started (pid=$DOCKERD_PID, storage-driver=$driver); waiting for socket"
+    for ((attempt = 1; attempt <= 30; attempt++)); do
+        if docker info >/dev/null 2>&1; then
+            log "dockerd ready (storage-driver=$driver)"
+            return 0
+        fi
+        if ! kill -0 "$DOCKERD_PID" 2>/dev/null; then
+            log "dockerd exited; tailing /var/log/dockerd.log:"
+            tail -40 /var/log/dockerd.log | sed 's/^/[dockerd] /'
+            return 1
+        fi
+        sleep 1
+    done
+    log "dockerd unresponsive after 30s; tailing /var/log/dockerd.log:"
+    tail -40 /var/log/dockerd.log | sed 's/^/[dockerd] /'
+    # Stop the hung daemon and wait until it is really gone before returning.
+    # The caller retries immediately with the next storage driver; a daemon
+    # that is still shutting down may still own the socket and could answer
+    # the next attempt's `docker info` probe on behalf of the wrong driver.
+    kill "$DOCKERD_PID" 2>/dev/null || true
+    waited=0
+    while kill -0 "$DOCKERD_PID" 2>/dev/null; do
+        if [ "$waited" -ge 10 ]; then
+            log "dockerd ignored SIGTERM for ${waited}s; sending SIGKILL"
+            kill -9 "$DOCKERD_PID" 2>/dev/null || true
+            break
+        fi
+        sleep 1
+        waited=$((waited + 1))
+    done
+    # Reap the child so its pid cannot be mistaken for a live daemon later.
+    wait "$DOCKERD_PID" 2>/dev/null || true
+    return 1
+}
+
+DOCKERD_OK=false
+for drv in overlay2 fuse-overlayfs vfs; do
+    if _start_dockerd "$drv"; then
+        DOCKERD_OK=true
+        break
+    fi
+    log "WARN: dockerd failed with storage-driver=$drv; trying next"
+done
+if [ "$DOCKERD_OK" != "true" ]; then
+    fail "dockerd refused to start with any of overlay2 / fuse-overlayfs / vfs"
+fi
+
+# ─── 3. Clone AirStack ─────────────────────────────────────────────────────
+
+AIRSTACK_REPO_URL="${AIRSTACK_REPO_URL:-https://github.com/castacks/AirStack.git}"
+AIRSTACK_BRANCH="${AIRSTACK_BRANCH:-main}"
+AIRSTACK_ROOT=/root/AirStack
+
+if [ ! -d "$AIRSTACK_ROOT/.git" ]; then
+    log "cloning $AIRSTACK_REPO_URL ($AIRSTACK_BRANCH) -> $AIRSTACK_ROOT"
+    git clone --recursive --branch "$AIRSTACK_BRANCH" "$AIRSTACK_REPO_URL" "$AIRSTACK_ROOT" \
+        || fail "git clone failed"
+else
+    log "$AIRSTACK_ROOT already cloned (skipping)"
+fi
+
+# Make sure the airstack CLI is on PATH for interactive shells.
+ln -sf "$AIRSTACK_ROOT/airstack.sh" /usr/local/bin/airstack +ln -sf "$AIRSTACK_ROOT/airstack.sh" /usr/local/bin/airstack.sh + +# ─── 4. omni_pass.env from airlab-nucleus credential ─────────────────────── + +OMNI_PASS_FILE="$AIRSTACK_ROOT/simulation/isaac-sim/docker/omni_pass.env" + +if [ -z "${OMNI_USER:-}" ] || [ -z "${OMNI_PASS:-}" ]; then + log "WARN: airlab-nucleus OSMO credential not set." + log "WARN: Run on your laptop:" + log "WARN: osmo credential set airlab-nucleus --type GENERIC \\" + log "WARN: --payload omni_user= omni_pass= \\" + log "WARN: omni_server=omniverse://airlab-nucleus.andrew.cmu.edu/NVIDIA/Assets/Isaac/5.1" + log "WARN: Falling back to guest/guest (read-only Nucleus) — Isaac Sim assets may fail to load." +fi + +# Default to read-only Nucleus access so a missing credential degrades +# instead of crashing the pod. +: "${OMNI_USER:=guest}" +: "${OMNI_PASS:=guest}" +: "${OMNI_SERVER:=omniverse://airlab-nucleus.andrew.cmu.edu/NVIDIA/Assets/Isaac/5.1}" + +log "writing $OMNI_PASS_FILE (omni_user=${OMNI_USER}, omni_server=${OMNI_SERVER})" +cat > "$OMNI_PASS_FILE" < password=" +fi + +# ─── 6. airstack up ──────────────────────────────────────────────────────── + +# Honor optional overrides passed in via OSMO env. Defaults match a "single +# robot, Isaac Sim with WebRTC livestream" dev session. +export AUTOLAUNCH="${AUTOLAUNCH:-true}" +export NUM_ROBOTS="${NUM_ROBOTS:-1}" +export ISAAC_SIM_LIVESTREAM="${ISAAC_SIM_LIVESTREAM:-true}" + +# COMPOSE_PROFILES selection: the default `desktop,isaac-sim` from .env runs +# the standard isaac-sim service. If the student wants livestream, they (or +# we) swap to the isaac-sim-livestream profile, which is the OSMO-friendly +# variant defined in simulation/isaac-sim/docker/docker-compose.yaml. 
+if [ "$ISAAC_SIM_LIVESTREAM" = "true" ]; then + export COMPOSE_PROFILES="${COMPOSE_PROFILES:-desktop,isaac-sim-livestream}" +else + export COMPOSE_PROFILES="${COMPOSE_PROFILES:-desktop,isaac-sim}" +fi + +log "airstack up (COMPOSE_PROFILES=$COMPOSE_PROFILES, NUM_ROBOTS=$NUM_ROBOTS, livestream=$ISAAC_SIM_LIVESTREAM)" +cd "$AIRSTACK_ROOT" +./airstack.sh up || log "WARN: airstack up exited non-zero — pod stays alive for debugging via SSH" + +# ─── 7. Sleep ────────────────────────────────────────────────────────────── + +log "entrypoint complete; sleeping forever so port-forwards keep working" +log "pod-side log paths:" +log " - dockerd: /var/log/dockerd.log" +log " - airstack: docker logs airstack-isaac-sim-1 / airstack-robot-desktop-1 / airstack-gcs-1" +exec sleep infinity diff --git a/osmo/workspace/sshd_config b/osmo/workspace/sshd_config new file mode 100644 index 000000000..097efdb21 --- /dev/null +++ b/osmo/workspace/sshd_config @@ -0,0 +1,41 @@ +# sshd_config baked into airstack-osmo-workspace. +# +# Permanently disables password auth — the only way in is via a pubkey +# installed by entrypoint.sh from the SSH_PUB_KEY env var (passed at submit +# time with `osmo workflow submit ... --set-env "SSH_PUB_KEY=$(cat ~/.ssh/id_ed25519.pub)"`). + +Port 22 +AddressFamily any +ListenAddress 0.0.0.0 +ListenAddress :: + +# Pubkey only. +PasswordAuthentication no +PubkeyAuthentication yes +ChallengeResponseAuthentication no +KbdInteractiveAuthentication no +PermitEmptyPasswords no + +# Single-user dev pod; the IDE attaches as root because /root/AirStack and +# the existing AirStack ergonomics expect it. PermitRootLogin +# prohibit-password forbids password root login but allows pubkey root login. +PermitRootLogin prohibit-password + +# Standard auth path. +AuthorizedKeysFile .ssh/authorized_keys + +UsePAM yes + +# Performance / VS Code-Remote-SSH friendliness: +# - AcceptEnv lets the IDE forward LANG, ENV, etc. +# - X11 forwarding off (no display in the pod). 
+# - Allow forwarding so port-forwards through SSH are permitted if students +# want to layer their own. +AcceptEnv LANG LC_* +X11Forwarding no +AllowAgentForwarding yes +AllowTcpForwarding yes +PrintMotd no + +# Subsystems VS Code Remote needs for sftp/file ops. +Subsystem sftp /usr/lib/openssh/sftp-server diff --git a/simulation/isaac-sim/docker/docker-compose.yaml b/simulation/isaac-sim/docker/docker-compose.yaml index a1b40ae56..f51de0f9d 100644 --- a/simulation/isaac-sim/docker/docker-compose.yaml +++ b/simulation/isaac-sim/docker/docker-compose.yaml @@ -94,3 +94,61 @@ services: tmux send-keys -t isaac '/isaac-sim/runapp.sh' ENTER; sleep infinity" networks: !reset null + + # =================================================================================================================== + # WebRTC livestream variant for OSMO / remote dev. Headless: no X server, + # no display, no GUI window. Kit's `omni.kit.livestream.webrtc` extension + # serves the rendered viewport on the standard NVIDIA livestream ports + # (47995-48012, 49000-49007 TCP+UDP and 49100 TCP). Those ports are + # published into the host (the OSMO workspace pod's network namespace) so + # `osmo workflow port-forward` can reach them. + # + # Selected via COMPOSE_PROFILES (e.g. `desktop,isaac-sim-livestream`) in + # osmo/workspace/entrypoint.sh, or by setting ISAAC_SIM_LIVESTREAM=true. + isaac-sim-livestream: + extends: + service: isaac-sim + container_name: isaac-sim-livestream + profiles: !override + - isaac-sim-livestream + # Always run the Pegasus standalone path; the livestream branch in the + # script is gated on ISAAC_SIM_LIVESTREAM=true (env-driven, additive + # to the existing script behavior). 
+ command: > + bash -c " + tmux new -d -s isaac; + tmux send-keys -t isaac 'PYTHONPATH=\"$$ISAAC_SIM_PYTHONPATH\" /isaac-sim/python.sh /isaac-sim/AirStack/simulation/isaac-sim/launch_scripts/${ISAAC_SIM_SCRIPT_NAME} --ext-folder ~/.local/share/ov/data/documents/Kit/shared/exts --/app/livestream/enabled=true' ENTER; + sleep infinity" + environment: + # Inherit everything from isaac-sim and append: + - ISAAC_SIM_LIVESTREAM=true + - ISAAC_SIM_USE_STANDALONE=true + - ISAAC_SIM_HEADLESS=true + # Publish the WebRTC livestream ports to the pod NS. Bridge-mode + + # publish (the conservative choice) keeps the rest of the stack on + # airstack_network for DDS multicast. + ports: + - "47995-48012:47995-48012/tcp" + - "47995-48012:47995-48012/udp" + - "49000-49007:49000-49007/tcp" + - "49000-49007:49000-49007/udp" + - "49100:49100/tcp" + # Drop X11-specific volume mounts inherited from the isaac-sim service — + # there is no X server in an OSMO pod. + volumes: !override + - $HOME/docker/isaac-sim/cache/main:/isaac-sim/.cache:rw + - $HOME/docker/isaac-sim/cache/computecache:/isaac-sim/.nv/ComputeCache:rw + - $HOME/docker/isaac-sim/logs:/isaac-sim/.nvidia-omniverse/logs:rw + - $HOME/docker/isaac-sim/config:/isaac-sim/.nvidia-omniverse/config:rw + - $HOME/docker/isaac-sim/data:/isaac-sim/.local/share/ov/data:rw + - $HOME/docker/isaac-sim/pkg:/isaac-sim/.local/share/ov/pkg:rw + - ../extensions/PegasusSimulator/extensions/pegasus.simulator:/isaac-sim/.local/share/ov/data/documents/Kit/shared/exts/pegasus.simulator/:rw + - ./omniverse.toml:/isaac-sim/.nvidia-omniverse/config/omniverse.toml:rw + - ./user.config.json:/isaac-sim/.local/share/ov/data/Kit/Isaac-Sim Full/5.1/user.config.json:rw + - .dev:/isaac-sim/.dev:rw + - .bashrc:/isaac-sim/.bashrc:rw + - ../../../common/inputrc:/etc/inputrc:rw + - ../../../common/.tmux.conf:/isaac-sim/.tmux.conf:rw + - ../../..:/isaac-sim/AirStack:rw + - ../../../.devcontainer/isaac-sim/launch.json:/isaac-sim/AirStack/.vscode/launch.json:rw + - 
../../../.devcontainer/isaac-sim/tasks.json:/isaac-sim/AirStack/.vscode/tasks.json:rw diff --git a/simulation/isaac-sim/launch_scripts/example_one_px4_pegasus_launch_script.py b/simulation/isaac-sim/launch_scripts/example_one_px4_pegasus_launch_script.py index f53ea0993..72be29824 100755 --- a/simulation/isaac-sim/launch_scripts/example_one_px4_pegasus_launch_script.py +++ b/simulation/isaac-sim/launch_scripts/example_one_px4_pegasus_launch_script.py @@ -10,17 +10,30 @@ - Optionally saving the prepared scene as a self-contained USD """ -import carb -from isaacsim import SimulationApp - -# Must be created before any omni imports -simulation_app = SimulationApp({"headless": False}) - import os import sys import time import asyncio +import carb +from isaacsim import SimulationApp + +_LIVESTREAM = os.environ.get("ISAAC_SIM_LIVESTREAM", "").lower() == "true" + +# Must be created before any omni imports +simulation_app = SimulationApp({"headless": True if _LIVESTREAM else False}) + +if _LIVESTREAM: + # Headless + WebRTC livestream when ISAAC_SIM_LIVESTREAM=true (set by the + # OSMO airstack-osmo-workspace entrypoint and the isaac-sim-livestream + # Compose profile). Local desktop dev keeps the original windowed behavior. 
+ # Mirrors AirStack's standalone livestream reference at + # simulation/isaac-sim/standalone_examples/api/isaacsim.simulation_app/livestream.py + from isaacsim.core.utils.extensions import enable_extension + simulation_app.set_setting("/app/window/drawMouse", True) + simulation_app.set_setting("/app/livestream/enabled", True) + enable_extension("omni.kit.livestream.webrtc") + import omni.kit.app import omni.timeline import omni.usd From 64a2cebf2fb89c84cd948edd8bdafb7598553bb5 Mon Sep 17 00:00:00 2001 From: Sebastian Scherer Date: Thu, 14 May 2026 16:50:48 -0400 Subject: [PATCH 02/13] fix(osmo): harden CLI + workspace image against stale-state, port-forward race, and cursor-server install hangs MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Four bugs that bit the first end-to-end runs (airstack-dev-10 → -13): - _osmo_wf_id: validate saved workflow id against `osmo workflow query` before returning. Without this, the state file at ~/.airstack/osmo-state outlives the workflow it points at and every subsequent osmo:webrtc / osmo:foxglove / osmo:ide call surfaces the same confusing "Workflow airstack-dev-N is not running! (status 410)" instead of the obvious "run airstack osmo:up to launch a fresh workflow". - cmd_osmo_up: `osmo workflow submit --set-env` is variadic. Passing two separate `--set-env A=1 --set-env B=2` silently drops the first one — this is what made airstack-dev-11 fail with "ERROR: SSH_PUB_KEY not set" when --branch was passed alongside the pubkey. Collapse the K=V pairs into a single --set-env. - cmd_osmo_ide: previously launched the IDE before starting the port-forward, so Cursor/VS Code would try to SSH localhost:2200 a few hundred ms before the tunnel listener existed and fail with "connect to host localhost port 2200: Connection refused". 
Now: detect an existing forward and reuse it (also avoids the "Address already in use" if osmo:foxglove was started in parallel), otherwise spawn the forward in the background, wait up to 30s for it to bind, then launch the IDE. Ctrl+C tears down the spawned forward cleanly via a trap. - workspace image / entrypoint: Cursor Remote-SSH hung indefinitely on airstack-dev-13 because (a) cursor-server's installer fell back to wget when curl timed out and wget was not in the image, and (b) a /tmp/cursor-remote-lock.* file left behind by the first crashed install blocked every silent retry. Add wget to the apt install list and rm -f the stale Cursor / VS Code remote lock files at the very top of entrypoint.sh so each fresh pod starts from a clean slate. Co-authored-by: Cursor --- .airstack/modules/osmo.sh | 49 +++++++++++++++++++++++++++++++++--- osmo/workspace/Dockerfile | 1 + osmo/workspace/entrypoint.sh | 15 +++++++++++ 3 files changed, 62 insertions(+), 3 deletions(-) diff --git a/.airstack/modules/osmo.sh b/.airstack/modules/osmo.sh index e3ac7f107..42ec8eedd 100755 --- a/.airstack/modules/osmo.sh +++ b/.airstack/modules/osmo.sh @@ -346,6 +346,43 @@ function cmd_osmo_ide { log_info "Make sure ~/.ssh/config has a 'Host airstack-osmo' entry pointing at localhost:2200, User root." + # Local TCP port the user's IDE will connect to (the local side of the + # `--port LOCAL:REMOTE` mapping). + local local_port="${OSMO_SSH_PORT%%:*}" + + # Reuse an existing forward if one is already listening (the user might + # have run this from a second terminal, or osmo:foxglove already opened + # a multi-port forward). Otherwise spawn one in the background and wait + # for it to bind before launching the IDE — this avoids the race where + # Cursor/VS Code tries to SSH before the tunnel exists and dies with + # "connect to host localhost port 2200: Connection refused". 
+ local pf_pid="" + if nc -z localhost "$local_port" 2>/dev/null; then + log_info "Port ${local_port} is already listening; reusing existing port-forward." + else + log_info "osmo workflow port-forward ${wf} workspace --port ${OSMO_SSH_PORT} --connect-timeout ${OSMO_PF_TIMEOUT}" + osmo workflow port-forward "$wf" workspace --port "$OSMO_SSH_PORT" --connect-timeout "$OSMO_PF_TIMEOUT" \ + > "${OSMO_STATE_DIR}/ssh-pf.log" 2>&1 & + pf_pid=$! + # Wait up to 30s for the tunnel to start accepting connections. + local waited=0 + until nc -z localhost "$local_port" 2>/dev/null; do + sleep 1; waited=$((waited+1)) + if [ "$waited" -ge 30 ]; then + log_error "Timed out waiting for port-forward on :${local_port} after ${waited}s." + log_error " port-forward log: ${OSMO_STATE_DIR}/ssh-pf.log" + kill "$pf_pid" 2>/dev/null + return 1 + fi + if ! kill -0 "$pf_pid" 2>/dev/null; then + log_error "port-forward exited early. Tail:" + tail -10 "${OSMO_STATE_DIR}/ssh-pf.log" >&2 + return 1 + fi + done + log_info "Port-forward established on localhost:${local_port} (pid ${pf_pid})." + fi + if [ "$open_ide" = true ]; then # vscode-remote URI launches the IDE pre-attached to the remote host. local uri="vscode-remote://ssh-remote+airstack-osmo/root/AirStack" @@ -355,9 +392,15 @@ function cmd_osmo_ide { log_warn "Could not launch ${ide_cmd} automatically; open it and pick airstack-osmo from Remote-SSH manually." ) & fi - log_info "osmo workflow port-forward ${wf} workspace --port ${OSMO_SSH_PORT} --connect-timeout ${OSMO_PF_TIMEOUT}" - log_info "Leave this terminal running for the length of your session." - osmo workflow port-forward "$wf" workspace --port "$OSMO_SSH_PORT" --connect-timeout "$OSMO_PF_TIMEOUT" + if [ -n "$pf_pid" ]; then + log_info "Leave this terminal running for the length of your session (Ctrl+C to disconnect)." + # Forward Ctrl+C to the port-forward and clean up. 
+ trap 'kill "$pf_pid" 2>/dev/null; exit 0' INT TERM + wait "$pf_pid" + else + log_info "Existing port-forward owns the tunnel; this command will exit immediately." + log_info "Stop the tunnel with: pkill -f 'osmo workflow port-forward' or airstack osmo:down" + fi } # osmo:webrtc — forward both Isaac Sim WebRTC port ranges (TCP in this diff --git a/osmo/workspace/Dockerfile b/osmo/workspace/Dockerfile index 1aff0b4af..e64d8662c 100644 --- a/osmo/workspace/Dockerfile +++ b/osmo/workspace/Dockerfile @@ -40,6 +40,7 @@ RUN apt-get update && apt-get install -y --no-install-recommends \ tmux \ tzdata \ vim-tiny \ + wget \ && locale-gen C.UTF-8 \ && rm -rf /var/lib/apt/lists/* diff --git a/osmo/workspace/entrypoint.sh b/osmo/workspace/entrypoint.sh index 9a4fa8f19..04941c819 100755 --- a/osmo/workspace/entrypoint.sh +++ b/osmo/workspace/entrypoint.sh @@ -24,6 +24,21 @@ set -uo pipefail log() { echo "[entrypoint] $*"; } fail() { echo "[entrypoint] ERROR: $*" >&2; exit 1; } +# ─── 0. Stale-state cleanup ──────────────────────────────────────────────── +# +# Cursor / VS Code Remote-SSH guards its server install with a file lock +# at /tmp/cursor-remote-lock.* (and a sibling .target file naming the PIDs +# that hold it). If a previous connect attempt crashed mid-install +# (e.g. the port-forward died while the install was in flight, as +# happened on airstack-dev-13 / 2026-05-14), the lock file outlives the +# dead PIDs and every subsequent IDE retry bails out *silently* at the +# lock check — leaving an empty bin// dir and the user staring at +# a "Connecting to remote host (attempt 1)..." spinner forever. +# +# A fresh pod has nothing to preserve here, so clearing these on startup +# is always safe. +rm -f /tmp/cursor-remote-lock.* /tmp/vscode-remote-lock.* 2>/dev/null || true + # ─── 1. 
SSHD ─────────────────────────────────────────────────────────────── log "configuring sshd" From 769c4a2e829314b2744fae9c2ffc4d42828b683c Mon Sep 17 00:00:00 2001 From: Sebastian Scherer Date: Thu, 14 May 2026 17:08:55 -0400 Subject: [PATCH 03/13] fix(osmo): correct osmo:logs CLI invocation; install Foxglove extensions locally on osmo:foxglove MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit osmo:logs was invoking `osmo workflow logs workspace --follow`, but the real CLI takes the task via `-t TASK` (not positionally) and has no `--follow` flag at all — so the command failed immediately with "unrecognized arguments: workspace --follow". Replace with a polling loop that uses `-t workspace -n ` on a short interval, prints only the suffix that appeared since the previous fetch (find-the-last-seen-line trick; degrades to "reprint tail" with a warning if the cursor outruns -n), and exits cleanly once the workflow reaches a terminal state. Tunables: OSMO_LOGS_TASK / OSMO_LOGS_TAIL / OSMO_LOGS_INTERVAL. osmo:foxglove now installs the AirStack Foxglove extensions (robot-commands / waypoint-editor / polygon-editor) into the laptop's local Foxglove user-extensions directory before opening the port-forward. Without this, custom panels show up as "Unknown panel type: robot-commands.Robot Tasks" in the laptop's Foxglove Desktop because it has no way to discover the extension folders that live inside the GCS container. To avoid duplicating the install logic, the existing gcs/foxglove_extensions/install.py is refactored to read FOXGLOVE_EXT_SRC / FOXGLOVE_EXT_DST env vars (the in-container call already in gcs/docker/gcs-base-docker-compose.yaml keeps working unchanged via defaults). The wrapper sets those vars to ${PROJECT_ROOT}/gcs/foxglove_extensions and ~/.foxglove-studio/extensions respectively, overridable with OSMO_FOXGLOVE_EXT_DIR / skippable with OSMO_FOXGLOVE_SKIP_EXTENSIONS=1. 
Co-authored-by: Cursor
---
 .airstack/modules/osmo.sh          | 94 +++++++++++++++++++++++++++---
 gcs/foxglove_extensions/install.py | 36 ++++++++++--
 2 files changed, 119 insertions(+), 11 deletions(-)

diff --git a/.airstack/modules/osmo.sh b/.airstack/modules/osmo.sh
index 42ec8eedd..db1c9ff12 100755
--- a/.airstack/modules/osmo.sh
+++ b/.airstack/modules/osmo.sh
@@ -307,11 +307,61 @@ function cmd_osmo_up {
 }
 
 # osmo:logs — follow the workspace task logs.
+#
+# The osmo CLI's `workflow logs` command has no --follow flag (and the task
+# is selected with `-t TASK`, not positionally). To get a tail -f experience
+# we re-fetch the last N lines on a short interval and print only the lines
+# that appeared since the previous poll. The "find last seen line, print
+# what follows" trick degrades gracefully: if the cursor outruns -n during
+# a particularly loud burst, we simply re-print the whole tail with a
+# warning rather than dropping output silently.
 function cmd_osmo_logs {
     _osmo_check_cli || return 1
     local wf; wf="$(_osmo_wf_id)" || return 1
-    log_info "osmo workflow logs ${wf} workspace --follow"
-    osmo workflow logs "$wf" workspace --follow
+
+    local task="${OSMO_LOGS_TASK:-workspace}"
+    local lines="${OSMO_LOGS_TAIL:-500}"
+    local interval="${OSMO_LOGS_INTERVAL:-3}"
+
+    log_info "Following ${task} logs for ${wf} (polling every ${interval}s, last ${lines} lines per fetch; Ctrl+C to stop)"
+
+    local prev=""
+    trap 'echo; log_info "stopped following ${wf}"; trap - INT TERM; return 0' INT TERM
+    while true; do
+        local out
+        out="$(osmo workflow logs "${wf}" -t "${task}" -n "${lines}" 2>/dev/null)"
+        if [ -n "${out}" ] && [ "${out}" != "${prev}" ]; then
+            if [ -z "${prev}" ]; then
+                printf '%s\n' "${out}"
+            else
+                local last_line; last_line="$(printf '%s' "${prev}" | tail -1)"
+                local suffix
+                suffix="$(printf '%s\n' "${out}" | LAST_SEEN="${last_line}" awk '
+                    matched { print; next }
+                    $0 == ENVIRON["LAST_SEEN"] { matched=1 }  # ENVIRON, not -v: -v expands backslash escapes in the value
+                ')"
+                if [ -n "${suffix}" ]; then
+                    printf '%s\n' "${suffix}"
+                else
+                    log_warn "log cursor outran -n ${lines}; reprinting tail"
+                    printf '%s\n' "${out}"
+                fi
+            fi
+            prev="${out}"
+        fi
+
+        # Exit cleanly once the workflow reaches a terminal state.
+        local status
+        status="$(osmo workflow query "${wf}" 2>/dev/null | awk -F': +' '/^Status/ {print $2; exit}' | tr -d ' \r\n')"
+        case "${status}" in
+            SUCCEEDED|FAILED|FAILED_*|CANCELED)
+                log_info "workflow ${wf} is ${status}; exiting follow"
+                break
+                ;;
+        esac
+        sleep "${interval}"
+    done
+    trap - INT TERM
 }
 
 # osmo:ide — port-forward sshd + (optionally) launch VS Code/Cursor on the
@@ -423,14 +473,44 @@ function cmd_osmo_webrtc {
         --connect-timeout "$OSMO_PF_TIMEOUT"
 }
 
-# osmo:foxglove — forward the GCS Foxglove websocket.
+# osmo:foxglove — install the AirStack Foxglove extensions into the local
+# Foxglove Desktop user-extensions dir, then forward the GCS Foxglove
+# websocket.
+#
+# The extension install is the same script the GCS container runs on
+# startup — gcs/foxglove_extensions/install.py — invoked with env-var
+# overrides that point at the local laptop dirs. Default destination on
+# Linux/macOS is ~/.foxglove-studio/extensions (Foxglove's canonical user
+# extensions path; the macOS rebrand still reads from here). Override
+# with OSMO_FOXGLOVE_EXT_DIR, or skip the install entirely with
+# OSMO_FOXGLOVE_SKIP_EXTENSIONS=1 (e.g. when using app.foxglove.dev
+# which doesn't load local extensions anyway).
function cmd_osmo_foxglove { _osmo_check_cli || return 1 local wf; wf="$(_osmo_wf_id)" || return 1 + local ext_src="${PROJECT_ROOT}/gcs/foxglove_extensions" + local ext_dst="${OSMO_FOXGLOVE_EXT_DIR:-${HOME}/.foxglove-studio/extensions}" + + if [ "${OSMO_FOXGLOVE_SKIP_EXTENSIONS:-0}" != "1" ] && [ -d "${ext_src}" ]; then + if command -v python3 >/dev/null 2>&1; then + log_info "Installing Foxglove extensions to ${ext_dst}" + FOXGLOVE_EXT_SRC="${ext_src}" FOXGLOVE_EXT_DST="${ext_dst}" \ + python3 "${ext_src}/install.py" \ + || log_warn "Foxglove extension install failed; panels like 'Robot Tasks' may show as 'Unknown panel type' in Foxglove" + else + log_warn "python3 not found on PATH — skipping Foxglove extension install." + log_warn " Custom panels (Robot Tasks, Waypoint Editor, Polygon Editor) will show as 'Unknown panel type'." + log_warn " Install python3 (e.g. 'brew install python') or copy ${ext_src}/* manually to ${ext_dst}." + fi + elif [ "${OSMO_FOXGLOVE_SKIP_EXTENSIONS:-0}" = "1" ]; then + log_info "Skipping Foxglove extension install (OSMO_FOXGLOVE_SKIP_EXTENSIONS=1)." 
+ fi + log_info "osmo workflow port-forward ${wf} workspace --port ${OSMO_FOXGLOVE_PORT} --connect-timeout ${OSMO_PF_TIMEOUT}" - log_info "Then open https://app.foxglove.dev → Open connection → ws://localhost:8766" - log_info "Then Layouts → Import from file → gcs/foxglove_extensions/airstack_default.json" + log_info "Then in Foxglove Desktop: Open connection → ws://localhost:8766" + log_info " Layouts → Import from file → ${ext_src}/airstack_default.json" + log_info " (Restart Foxglove Desktop once if newly-installed panels still show as 'Unknown panel type'.)" osmo workflow port-forward "$wf" workspace \ --port "$OSMO_FOXGLOVE_PORT" \ --connect-timeout "$OSMO_PF_TIMEOUT" @@ -461,9 +541,9 @@ function register_osmo_commands { COMMAND_HELP["osmo:setup"]="One-time per-user OSMO credential setup (airlab-docker-registry, airlab-docker-login, airlab-nucleus)" COMMAND_HELP["osmo:up"]="Submit osmo/workflows/airstack-dev.yaml with your SSH pubkey injected (--pool POOL, --key PATH, --branch BRANCH)" - COMMAND_HELP["osmo:logs"]="Follow the workspace task logs (osmo workflow logs workspace --follow)" + COMMAND_HELP["osmo:logs"]="Follow the workspace task logs (polls osmo workflow logs -t workspace -n 500; OSMO_LOGS_TASK / OSMO_LOGS_TAIL / OSMO_LOGS_INTERVAL override)" COMMAND_HELP["osmo:ide"]="Port-forward sshd (2200:22) and open VS Code/Cursor on Host airstack-osmo" COMMAND_HELP["osmo:webrtc"]="Port-forward Isaac Sim WebRTC ranges (TCP foreground + UDP background)" - COMMAND_HELP["osmo:foxglove"]="Port-forward GCS Foxglove websocket (8766:8766)" + COMMAND_HELP["osmo:foxglove"]="Install AirStack Foxglove extensions locally, then port-forward GCS Foxglove websocket (8766:8766). Override target dir with OSMO_FOXGLOVE_EXT_DIR; skip install with OSMO_FOXGLOVE_SKIP_EXTENSIONS=1." 
COMMAND_HELP["osmo:down"]="Cancel the active workflow (push to git before running this)" } diff --git a/gcs/foxglove_extensions/install.py b/gcs/foxglove_extensions/install.py index f948cac54..fc28de102 100644 --- a/gcs/foxglove_extensions/install.py +++ b/gcs/foxglove_extensions/install.py @@ -1,11 +1,34 @@ #!/usr/bin/env python3 +""" +Install AirStack Foxglove extensions into a Foxglove user-extensions dir. + +By default this targets the GCS container's bundled Foxglove app +(/root/.foxglove-studio/extensions), which is the entrypoint that +gcs/docker/gcs-base-docker-compose.yaml runs on container start. + +The src/dst paths can be overridden via env vars, which is how the +`airstack osmo:foxglove` wrapper reuses this same script to install the +extensions into the laptop's local Foxglove Desktop app before +port-forwarding the GCS bridge — that way the laptop's Foxglove sees +"Robot Tasks" / "Waypoint Editor" / "Polygon Editor" instead of the +"Unknown panel type: ..." placeholders. + +Env vars: + FOXGLOVE_EXT_SRC directory containing the extension subdirectories + (each with a package.json + dist/extension.js) + FOXGLOVE_EXT_DST target user-extensions directory, e.g. + ~/.foxglove-studio/extensions on Linux/macOS. 
+""" + import json import os import re import shutil -src = '/root/AirStack/gcs/foxglove_extensions' -dst = '/root/.foxglove-studio/extensions' +src = os.environ.get( + 'FOXGLOVE_EXT_SRC', '/root/AirStack/gcs/foxglove_extensions') +dst = os.path.expanduser(os.environ.get( + 'FOXGLOVE_EXT_DST', '/root/.foxglove-studio/extensions')) os.makedirs(dst, exist_ok=True) @@ -13,11 +36,16 @@ def _slug(s: str) -> str: return re.sub(r'[^a-z0-9-]+', '-', s.lower()).strip('-') -for ext in os.listdir(src): +installed = 0 +for ext in sorted(os.listdir(src)): pkg_path = os.path.join(src, ext, 'package.json') if not os.path.exists(pkg_path): continue pkg = json.load(open(pkg_path)) name = '{}.{}-{}'.format(_slug(pkg['publisher']), pkg['name'], pkg['version']) shutil.copytree(os.path.join(src, ext), os.path.join(dst, name), dirs_exist_ok=True) - print('Installed Foxglove extension:', name) + print('Installed Foxglove extension:', name, '->', os.path.join(dst, name)) + installed += 1 + +if installed == 0: + print('No Foxglove extensions found under', src) From 2d9b1611b27e5cd76963ecf5a5ddfeec2a007af7 Mon Sep 17 00:00:00 2001 From: Sebastian Scherer Date: Thu, 14 May 2026 17:25:28 -0400 Subject: [PATCH 04/13] fix(osmo): pin Kit livestream UDP media port to 49099 so osmo:webrtc actually shows pixels MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Kit 107's WebRTC livestream picks a UDP media port dynamically. The documented `omni.services.livestream.nvcf` defaults (minHostPort=47998 maxHostPort=48020 fixedHostPort=0) are ignored by the stock standalone Kit binary — on airstack-dev-13 it bound to UDP 49042, outside both the Compose-published range AND the default `osmo:webrtc --udp` forward of `47995-48012,49000-49007`. 
Result: TCP signaling on 49100 worked, the WebRTC Streaming Client window opened, but every SRTP media packet was dropped → black viewport plus the recurring `NVST_CCE_DISCONNECTED when m_connectionCount 0 != 1` underflow in Kit's log. Pin the media port via three `app.livestream.*` settings set on `SimulationApp` before `omni.kit.livestream.webrtc` is enabled, so whichever code path the carb.livestream-rtc.plugin consults lands on the same port: app.livestream.fixedHostPort = 49099 app.livestream.minHostPort = 49099 app.livestream.maxHostPort = 49099 49099 is a deliberate one-off from the 49100 TCP signaling port — same neighborhood, easy to remember. Verified live on airstack-dev-13 after `docker compose up -d --force-recreate isaac-sim-livestream`: Kit binds UDP 49099 (`/proc/net/udp` hex BFCB on 0.0.0.0) and docker-proxy publishes it from the pod host network. Knock-on cleanups: - `simulation/isaac-sim/docker/docker-compose.yaml` shrinks the isaac-sim-livestream `ports:` from 27 forwarded ports (`47995-48012, 49000-49007 TCP+UDP, 49100 TCP`) to just two: `49100/tcp` + `49099/udp`. - `.airstack/modules/osmo.sh` shrinks `OSMO_WEBRTC_TCP` to `49100` and `OSMO_WEBRTC_UDP` to `49099`, so `airstack osmo:webrtc` spawns two port-forwards instead of thirty. - `.gitignore` ignores `.DS_Store` so working from a Mac doesn't leak Finder metadata. After pulling this commit into a running pod: `docker compose up -d --force-recreate isaac-sim-livestream` to apply the new port mapping; then re-run `airstack osmo:webrtc` on the laptop to pick up the new forward ranges. The standalone WebRTC Streaming Client connects to `localhost` (same address as before) and now actually receives frames. 
Co-authored-by: Cursor --- .airstack/modules/osmo.sh | 13 +++++++--- .gitignore | 2 ++ .../isaac-sim/docker/docker-compose.yaml | 18 +++++++------- .../example_one_px4_pegasus_launch_script.py | 24 +++++++++++++++++++ 4 files changed, 45 insertions(+), 12 deletions(-) diff --git a/.airstack/modules/osmo.sh b/.airstack/modules/osmo.sh index db1c9ff12..62f792d20 100755 --- a/.airstack/modules/osmo.sh +++ b/.airstack/modules/osmo.sh @@ -20,9 +20,16 @@ OSMO_STATE_FILE="${OSMO_STATE_DIR}/osmo-state" # WebRTC livestream ports — must match the ports published by the # isaac-sim-livestream service in -# simulation/isaac-sim/docker/docker-compose.yaml. -OSMO_WEBRTC_TCP="47995-48012,49000-49007,49100" -OSMO_WEBRTC_UDP="47995-48012,49000-49007" +# simulation/isaac-sim/docker/docker-compose.yaml AND the +# app.livestream.fixedHostPort setting pinned in the Pegasus launch script +# (simulation/isaac-sim/launch_scripts/example_one_px4_pegasus_launch_script.py). +# +# Two ports total: +# TCP 49100 — omni.kit.livestream.webrtc WebSocket signaling +# UDP 49099 — SRTP media (pinned; Kit 107 otherwise picks dynamically and +# escapes both the compose-published and CLI-forwarded ranges) +OSMO_WEBRTC_TCP="49100" +OSMO_WEBRTC_UDP="49099" # GCS Foxglove websocket: container 8765 → host 8766 (per # gcs/docker/docker-compose.yaml). 
diff --git a/.gitignore b/.gitignore index a5776557c..4868b5c74 100644 --- a/.gitignore +++ b/.gitignore @@ -99,3 +99,5 @@ common/rayfronts/ # Docker build cache (root-owned subdirs cause permission warnings on `git add`) robot/docker/cache/ +.DS_Store +gcs/.DS_Store diff --git a/simulation/isaac-sim/docker/docker-compose.yaml b/simulation/isaac-sim/docker/docker-compose.yaml index f51de0f9d..dfd699aa4 100644 --- a/simulation/isaac-sim/docker/docker-compose.yaml +++ b/simulation/isaac-sim/docker/docker-compose.yaml @@ -98,10 +98,13 @@ services: # =================================================================================================================== # WebRTC livestream variant for OSMO / remote dev. Headless: no X server, # no display, no GUI window. Kit's `omni.kit.livestream.webrtc` extension - # serves the rendered viewport on the standard NVIDIA livestream ports - # (47995-48012, 49000-49007 TCP+UDP and 49100 TCP). Those ports are - # published into the host (the OSMO workspace pod's network namespace) so - # `osmo workflow port-forward` can reach them. + # serves WebSocket signaling on TCP 49100 and SRTP media on UDP 49099 (the + # latter pinned by `app.livestream.fixedHostPort/minHostPort/maxHostPort=49099` + # in the Pegasus launch script — see example_one_px4_pegasus_launch_script.py). + # Those two ports are published into the host (the OSMO workspace pod's + # network namespace) so `osmo workflow port-forward` can reach them. Kit + # 107's default media-port range is wider and dynamic, so pinning to a + # known value is the only way to keep the forward surface to two ports. # # Selected via COMPOSE_PROFILES (e.g. `desktop,isaac-sim-livestream`) in # osmo/workspace/entrypoint.sh, or by setting ISAAC_SIM_LIVESTREAM=true. @@ -128,11 +131,8 @@ services: # publish (the conservative choice) keeps the rest of the stack on # airstack_network for DDS multicast. 
ports: - - "47995-48012:47995-48012/tcp" - - "47995-48012:47995-48012/udp" - - "49000-49007:49000-49007/tcp" - - "49000-49007:49000-49007/udp" - - "49100:49100/tcp" + - "49100:49100/tcp" # WebSocket signaling (omni.kit.livestream.webrtc, app.livestream.port=49100) + - "49099:49099/udp" # SRTP media (pinned via app.livestream.fixedHostPort=49099 in the launch script) # Drop X11-specific volume mounts inherited from the isaac-sim service — # there is no X server in an OSMO pod. volumes: !override diff --git a/simulation/isaac-sim/launch_scripts/example_one_px4_pegasus_launch_script.py b/simulation/isaac-sim/launch_scripts/example_one_px4_pegasus_launch_script.py index 72be29824..fec97917f 100755 --- a/simulation/isaac-sim/launch_scripts/example_one_px4_pegasus_launch_script.py +++ b/simulation/isaac-sim/launch_scripts/example_one_px4_pegasus_launch_script.py @@ -32,6 +32,30 @@ from isaacsim.core.utils.extensions import enable_extension simulation_app.set_setting("/app/window/drawMouse", True) simulation_app.set_setting("/app/livestream/enabled", True) + + # Pin the UDP media port so it stays inside the narrow set of ports we + # publish from this container and that `airstack osmo:webrtc` forwards. + # + # Kit 107's WebRTC livestream picks a UDP media port dynamically. The + # documented `omni.services.livestream.nvcf` defaults were + # minHostPort=47998 / maxHostPort=48020 / fixedHostPort=0, but the + # actual Kit binary ignored that range on airstack-dev-13 and bound to + # UDP 49042 — outside both the Compose-published port range AND the + # default osmo `--udp` forward (47995-48012,49000-49007). Result: + # signaling worked (TCP 49100), the WebRTC Streaming Client window + # opened, but every media packet was dropped → black viewport + + # the `NVST_CCE_DISCONNECTED when m_connectionCount 0 != 1` underflow + # storm in the Kit log. + # + # Set all three settings so whichever code path the plugin reads, it + # lands on UDP 49099. 
The value 49099 is picked to sit one below the + # 49100 signaling port — same range, easy to remember, and TCP/UDP can + # coexist on the same number if anyone later wants a single port. + LIVESTREAM_UDP_PORT = int(os.environ.get("ISAAC_SIM_LIVESTREAM_UDP_PORT", "49099")) + simulation_app.set_setting("/app/livestream/fixedHostPort", LIVESTREAM_UDP_PORT) + simulation_app.set_setting("/app/livestream/minHostPort", LIVESTREAM_UDP_PORT) + simulation_app.set_setting("/app/livestream/maxHostPort", LIVESTREAM_UDP_PORT) + enable_extension("omni.kit.livestream.webrtc") import omni.kit.app From 7c95b4e1be2591d0ac7aaca4e6cd71cb8e182960 Mon Sep 17 00:00:00 2001 From: Sebastian Scherer Date: Thu, 14 May 2026 17:34:20 -0400 Subject: [PATCH 05/13] fix(osmo): render Kit GUI in WebRTC stream; document SSH agent forward for in-pod git push MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Two paper-cuts that bit airstack-dev-13 after the WebRTC media port pin landed (commit 2d9b1611): (1) The WebRTC stream showed only the bare 3D viewport — no menu bar, no toolbar, no panels, no console. Cause: SimulationApp's default when `headless=True` is to also hide the UI (`hide_ui=True`). The NVIDIA reference at `simulation/isaac-sim/standalone_examples/api/isaacsim.simulation_app/livestream.py` explicitly opts back into UI rendering plus picks explicit window sizing and `display_options=3286` to keep the default grid/axes visible. Mirror that config in `example_one_px4_pegasus_launch_script.py` when `ISAAC_SIM_LIVESTREAM=true` (local desktop dev keeps the minimal `headless=False` path unchanged). (2) The pod has no SSH private key, only an `authorized_keys` for inbound connections from the user's laptop. As a result, `git push` from inside the Cursor / VS Code Remote-SSH session inside the pod fails with "Permission denied (publickey)". 
sshd inside the workspace image already has `AllowAgentForwarding yes` baked in via `osmo/workspace/sshd_config`; the missing piece is purely on the Mac side. Update the `~/.ssh/config` block in the tutorial to include `ForwardAgent yes` (so the local agent's keys are exposed in the pod), `AddKeysToAgent yes` (auto-load on first push), and `UseKeychain yes` (macOS-only Keychain unlock without passphrase prompts; stock OpenSSH on Linux rejects it as a bad configuration option unless `IgnoreUnknown UseKeychain` precedes it). Adds an `ssh-add -l` smoke-test note. Co-authored-by: Cursor --- docs/tutorials/airstack_on_osmo.md | 19 ++++++++++++- .../example_one_px4_pegasus_launch_script.py | 27 +++++++++++++++++-- 2 files changed, 43 insertions(+), 3 deletions(-) diff --git a/docs/tutorials/airstack_on_osmo.md b/docs/tutorials/airstack_on_osmo.md index 90d139b69..90c421580 100644 --- a/docs/tutorials/airstack_on_osmo.md +++ b/docs/tutorials/airstack_on_osmo.md @@ -247,11 +247,28 @@ Host airstack-osmo Port 2200 User root StrictHostKeyChecking accept-new + # SSH agent forwarding so `git push` from inside the pod uses your + # local laptop's SSH key (the pod's sshd has AllowAgentForwarding yes + # baked in by osmo/workspace/sshd_config). Without this, the pod has + # no key to push to github.com with — its ~/.ssh/ only holds the + # authorized_keys file for inbound connections. + ForwardAgent yes + # macOS Keychain integration — first push from the pod auto-loads + # your key into the local ssh-agent and unlocks it via the system + # keychain (no passphrase prompts). Stock OpenSSH on Linux rejects it: drop + # UseKeychain there or add `IgnoreUnknown UseKeychain` above it. AddKeysToAgent works on both OSes. + AddKeysToAgent yes + UseKeychain yes EOF ``` The `localhost:2200` is what we'll port-forward to in step 4. +> **Smoke-test the agent forward** once the pod is up: SSH in and run +> `ssh-add -l` — you should see your local key listed. If you see "The +> agent has no identities", run `ssh-add ~/.ssh/id_ed25519` on your +> laptop and reconnect. 
+ ## Step 2 — Submit the workflow The repo ships the workflow at @@ -264,7 +281,7 @@ takes a path and uploads the YAML. curl -fsSL -o airstack-dev.yaml \ https://raw.githubusercontent.com/castacks/AirStack/main/osmo/workflows/airstack-dev.yaml -# Submit (replace airstack with whatever pool your admin enabled privileged on): +# Submit: osmo workflow submit airstack-dev.yaml \ --pool airstack \ --set-env "SSH_PUB_KEY=$(cat ~/.ssh/id_ed25519.pub)" diff --git a/simulation/isaac-sim/launch_scripts/example_one_px4_pegasus_launch_script.py b/simulation/isaac-sim/launch_scripts/example_one_px4_pegasus_launch_script.py index fec97917f..11819fc2f 100755 --- a/simulation/isaac-sim/launch_scripts/example_one_px4_pegasus_launch_script.py +++ b/simulation/isaac-sim/launch_scripts/example_one_px4_pegasus_launch_script.py @@ -20,8 +20,31 @@ _LIVESTREAM = os.environ.get("ISAAC_SIM_LIVESTREAM", "").lower() == "true" -# Must be created before any omni imports -simulation_app = SimulationApp({"headless": True if _LIVESTREAM else False}) +# Must be created before any omni imports. +# +# When livestreaming, mirror the NVIDIA reference config from +# simulation/isaac-sim/standalone_examples/api/isaacsim.simulation_app/livestream.py +# so the Kit GUI (menu bar, toolbar, viewport, status bar) actually gets +# rendered into the WebRTC stream instead of just the bare 3D viewport. +# Key field: `hide_ui: False` — SimulationApp's default when `headless=True` +# is to also hide the UI; the livestream reference opts back into showing +# it. `display_options=3286` is the same bitmask the reference uses to keep +# the default grid + axes visible at scene start. 
+if _LIVESTREAM: + _SIM_APP_CONFIG = { + "width": 1280, + "height": 720, + "window_width": 1920, + "window_height": 1080, + "headless": True, + "hide_ui": False, + "renderer": "RaytracedLighting", + "display_options": 3286, + } +else: + _SIM_APP_CONFIG = {"headless": False} + +simulation_app = SimulationApp(launch_config=_SIM_APP_CONFIG) if _LIVESTREAM: # Headless + WebRTC livestream when ISAAC_SIM_LIVESTREAM=true (set by the From 02fcc1969ced062af3700f867981f1c61da63e3f Mon Sep 17 00:00:00 2001 From: Sebastian Scherer Date: Thu, 14 May 2026 18:10:16 -0400 Subject: [PATCH 06/13] fix(osmo): make osmo:setup idempotent + paste-safe; document Nucleus auth-debug path MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit osmo:setup hit two failure modes that wasted a debug session each: - `osmo credential set` is not an upsert for GENERIC creds — re-running setup (e.g. to rotate a Nucleus API token) failed with `400 duplicate key value violates unique constraint "credential_pkey"` and bailed before reaching the airlab-nucleus credential. Delete-then-set each credential so re-running is idempotent. - Bracket-paste mode and cross-OS clipboards routinely smuggle invisible bytes around long pastes. Nucleus's auth endpoint silently DENIES a token with one extra trailing byte, with no actionable error from the client side. _osmo_prompt now strips leading/trailing whitespace and CR/NUL bytes via a new _osmo_trim helper, and warns when bytes were stripped. cmd_osmo_setup additionally JWT-shape-checks the Nucleus token (must be eyJ...) before submitting it, so a wrong paste fails at setup time instead of silently DENIED at pod boot. Also documents how to debug the "Login Required: Unable to connect server omniverse://airlab-nucleus..." popup: SSH the Nucleus host and tail base_stack-nucleus-auth-1 for InternalCredentials.auth status: DENIED. 
Adds a "Nucleus connectivity from OSMO" section to the admin README clarifying that Nucleus over HTTPS uses a single 443 (no need to open the native 3009-3180 range from the OSMO cluster), per NVIDIA's TLS docs. Co-authored-by: Cursor --- .airstack/modules/osmo.sh | 60 ++++++++++++++++++++++++++++-- docs/tutorials/airstack_on_osmo.md | 2 +- osmo/README.md | 32 ++++++++++++++++ 3 files changed, 90 insertions(+), 4 deletions(-) diff --git a/.airstack/modules/osmo.sh b/.airstack/modules/osmo.sh index 62f792d20..a33a0185d 100755 --- a/.airstack/modules/osmo.sh +++ b/.airstack/modules/osmo.sh @@ -49,6 +49,30 @@ function _osmo_check_cli { fi } +# Helper: strip leading/trailing whitespace + CR/NUL bytes from the +# variable named in $1. +# +# Why this exists: bracket-paste mode and cross-OS clipboards (RDP, VNC, +# Windows-side note apps) routinely smuggle invisible bytes around long +# pastes — Nucleus API tokens (JWT, ~1 KB) and SSH keys are the usual +# victims. Nucleus's auth endpoint silently `DENIES` a token that has +# one extra trailing byte, with no actionable error from the client side. +# Stripping defensively at prompt time saves an entire round-trip of +# "regenerate token → still denied → check auth-service log" debugging. +function _osmo_trim { + local var_name="$1" + local val="${!var_name}" + local original_len="${#val}" + val="${val//$'\r'/}" + val="${val//$'\0'/}" + val="${val#"${val%%[![:space:]]*}"}" + val="${val%"${val##*[![:space:]]}"}" + if [ "${#val}" -ne "$original_len" ]; then + log_warn "Stripped $((original_len - ${#val})) whitespace/control byte(s) from ${var_name}." + fi + printf -v "$var_name" '%s' "$val" +} + # Helper: read a value with prompt; supports -s for silent (passwords). # # Visible prompts switch the TTY out of canonical mode for the duration of @@ -61,6 +85,8 @@ function _osmo_check_cli { # # We use a trap to guarantee the saved stty is restored if the user Ctrl-Cs # mid-paste — otherwise the shell would be left in raw mode. 
+# +# After reading we always run _osmo_trim — see comment there. function _osmo_prompt { local var_name="$1" local prompt_text="$2" @@ -86,6 +112,8 @@ function _osmo_prompt { fi fi + _osmo_trim "$var_name" + if [ -z "${!var_name}" ]; then log_error "Empty input for ${var_name}; aborting." return 1 @@ -122,10 +150,34 @@ EOF _osmo_prompt andrew_password "AirLab Docker password (hidden)" true || return 1 _osmo_prompt nucleus_token "Nucleus API token" false || return 1 + # Sanity-check the Nucleus token shape. Nucleus issues RS256 JWTs: + # base64url(header).base64url(payload).base64url(signature), with the + # header always starting `eyJ` (base64url of `{"`). Catching a wrong + # paste here (e.g. Andrew password, or token without the trailing + # signature segment) saves the user from a silent `InternalCredentials + # .auth: DENIED` round-trip later on. We do not validate the signature. + case "$nucleus_token" in + eyJ*.*.*) ;; # looks like a 3-segment JWT + *) + log_error "That doesn't look like a Nucleus API token." + log_error " - Expected: a JWT of the form eyJ…… (~1 KB long)" + log_error " - Got: ${#nucleus_token} chars, prefix '$(printf '%s' "$nucleus_token" | head -c 8)…'" + log_error " Generate one at https://airlab-nucleus.andrew.cmu.edu/omni/web3/" + log_error " → right-click cloud icon → API Tokens → Create." + return 1 + ;; + esac + local omni_server="${OMNI_SERVER:-omniverse://airlab-nucleus.andrew.cmu.edu/NVIDIA/Assets/Isaac/5.1}" local airlab_registry="${AIRLAB_REGISTRY:-airlab-docker.andrew.cmu.edu}" - log_info "Registering airlab-docker-registry (REGISTRY)..." + # `osmo credential set` is NOT an upsert for GENERIC credentials — re-setting + # one that already exists fails with `400 duplicate key value violates unique + # constraint "credential_pkey"`. Delete first so re-running osmo:setup + # (e.g. to rotate a Nucleus token) is idempotent. The `|| true` swallows the + # "credential not found" case on a first-time run. 
+ log_info "Refreshing airlab-docker-registry (REGISTRY)..." + osmo credential delete airlab-docker-registry >/dev/null 2>&1 || true osmo credential set airlab-docker-registry \ --type REGISTRY \ --payload "registry=${airlab_registry}" \ @@ -133,14 +185,16 @@ EOF "auth=${andrew_password}" \ || { log_error "osmo credential set airlab-docker-registry failed"; return 1; } - log_info "Registering airlab-docker-login (GENERIC)..." + log_info "Refreshing airlab-docker-login (GENERIC)..." + osmo credential delete airlab-docker-login >/dev/null 2>&1 || true osmo credential set airlab-docker-login \ --type GENERIC \ --payload "username=${andrew_id}" \ "password=${andrew_password}" \ || { log_error "osmo credential set airlab-docker-login failed"; return 1; } - log_info "Registering airlab-nucleus (GENERIC)..." + log_info "Refreshing airlab-nucleus (GENERIC)..." + osmo credential delete airlab-nucleus >/dev/null 2>&1 || true osmo credential set airlab-nucleus \ --type GENERIC \ --payload "omni_user=${andrew_id}" \ diff --git a/docs/tutorials/airstack_on_osmo.md b/docs/tutorials/airstack_on_osmo.md index 90c421580..9001dee7e 100644 --- a/docs/tutorials/airstack_on_osmo.md +++ b/docs/tutorials/airstack_on_osmo.md @@ -470,7 +470,7 @@ osmo workflow cancel $WF | `Permission denied (publickey)` on Remote-SSH | The pod authorised a different pubkey than the one your local SSH client is offering | Confirm `cat ~/.ssh/id_ed25519.pub` matches what was passed to `--set-env "SSH_PUB_KEY=..."`. Re-submit if the wrong key was used. | | `osmo workflow logs` shows `ERROR: SSH_PUB_KEY not set` | You forgot `--set-env` on submit | Cancel the workflow and resubmit with `--set-env "SSH_PUB_KEY=$(cat ~/.ssh/id_ed25519.pub)"` | | `docker pull` fails inside the pod with `unauthorized` | Your `airlab-docker-login` credential is missing or has the wrong Andrew ID/password | Re-run `airstack osmo:setup` (or the `osmo credential set airlab-docker-login ...` command in Step 0). 
| -| Logs show `WARN: airlab-nucleus OSMO credential not set` and Isaac Sim asset loads fail | Your `airlab-nucleus` credential is missing or its API token expired | Re-run `airstack osmo:setup` after generating a fresh Nucleus API token. | +| Logs show `WARN: airlab-nucleus OSMO credential not set` and Isaac Sim asset loads fail, **or** Isaac Sim shows "Login Required: Unable to connect server omniverse://airlab-nucleus..." | Your `airlab-nucleus` API token is missing, expired, or revoked. Confirm by SSH'ing the Nucleus host and running `sudo docker logs --tail 200 base_stack-nucleus-auth-1`; look for `InternalCredentials.auth: {... 'username': '<andrew-id>'} → status: 'DENIED'`. Regenerate the token at <https://airlab-nucleus.andrew.cmu.edu/omni/web3/>, then re-run `airstack osmo:setup` and resubmit the workflow (or live-edit `simulation/isaac-sim/docker/omni_pass.env` in the pod and restart the `isaac-sim-livestream` container). | | Isaac Sim container restarts repeatedly | GPU not visible to the inner Docker daemon (toolkit not configured on the node) | Lab admin task. From inside the pod: `docker info \| grep -i runtime` should list `nvidia`. | | Isaac Sim is up but the WebRTC stream is blank | The Pegasus script isn't getting `--/app/livestream/enabled=true`, or the wrong Compose profile is active | In the integrated terminal: `docker logs airstack-isaac-sim-livestream-1`. Confirm `ISAAC_SIM_LIVESTREAM=true` and that the `isaac-sim-livestream` profile is the one running (`docker ps`). | | Foxglove "no connection" | Port-forward died, GCS container hasn't started yet, or browser is caching an old connection | Restart the `--port 8766:8766` forward; check `docker ps` shows `airstack-gcs-1` Up; try `ws://127.0.0.1:8766` instead of `ws://localhost:8766`. 
| diff --git a/osmo/README.md b/osmo/README.md index b29b38070..adbc1dd88 100644 --- a/osmo/README.md +++ b/osmo/README.md @@ -235,6 +235,38 @@ The wider Foxglove layout / panel-import flow is documented in [`docs/gcs/foxglove.md`](../docs/gcs/foxglove.md); the only OSMO-specific piece is the `port-forward` line in front of it. +## Nucleus connectivity from OSMO + +`airlab-nucleus.andrew.cmu.edu` runs the standard Omniverse Enterprise +Nucleus stack with TLS termination at its Ingress Router (NGINX) on **port +443**. Per [NVIDIA's TLS doc](https://docs.omniverse.nvidia.com/nucleus/latest/enterprise/installation/tls.html), +clients only need outbound TCP **443** — the Ingress Router path-based- +routes requests (`/omni/api`, `/omni/auth`, `/omni/lft`, `/omni/conn`, +`/omni/web3/...`) to the internal service ports (3009, 3100, 3030, 3019, +3400). Omniclient detects SSL/TLS and prefers it, so the OSMO pod (whose +egress allows 80/443/22) reaches Nucleus over the same single 443 the +Web3 navigator uses. **The native protocol ports 3009–3180 do NOT need to +be open from OSMO** as long as TLS is configured on the Nucleus side. + +If you see Isaac Sim's "Login Required" popup at startup: + +1. **Check the auth-service log on the Nucleus host** (`ssh + ubuntu@<nucleus-host>; sudo docker logs --tail 200 + base_stack-nucleus-auth-1`). Look for `InternalCredentials.auth: + {... 'username': '<andrew-id>'} → status: 'DENIED'` lines. That + means the API token in your `airlab-nucleus` OSMO credential is + revoked, expired, or has whitespace/quoting damage. +2. **Regenerate the token** at + <https://airlab-nucleus.andrew.cmu.edu/omni/web3/> → right-click the + cloud icon → **API Tokens** → create a new one. +3. **Update the OSMO credential** with `airstack osmo:setup` (or the + raw `osmo credential set airlab-nucleus ...` command from the + tutorial Step 0) and **resubmit the workflow** so the new token + lands in `omni_pass.env` on pod boot. 
To live-patch a running pod + instead, edit `simulation/isaac-sim/docker/omni_pass.env` inside + the workspace and `docker compose --profile isaac-sim-livestream + restart isaac-sim-livestream`. + ## Out of scope (followups) - **OSMO-native split** — three separate OSMO tasks for `isaac-sim` / From c7f89a867e242fb15728cb985b865db52aec18cb Mon Sep 17 00:00:00 2001 From: Sebastian Scherer Date: Thu, 14 May 2026 18:29:03 -0400 Subject: [PATCH 07/13] fix(osmo): use Nucleus API-token auth, with double-dollar to survive compose parser MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The OSMO entrypoint was writing OMNI_USER= alongside an API token JWT in OMNI_PASS, which routes the JWT through the password- verification path. Nucleus silently DENIES — visible only in base_stack-nucleus-auth-1 as `InternalCredentials.auth … 'username': '' … status: DENIED` (no Tokens.auth_with_api_token call). Kit then pops "Login Required: Unable to connect server omniverse://...". omniclient expects the literal sentinel username `$omni-api-token` paired with the JWT as the password. The entrypoint now detects a JWT-shaped OMNI_PASS (header starts with `eyJ`) and emits OMNI_USER=$$omni-api-token into omni_pass.env. The `$$` is intentional: docker-compose v2 interpolates env_file values, and a single `$` would be eaten by the parser (`OMNI_USER=$omni-api-token` becomes `OMNI_USER=-api-token` after ${omni}- expansion to empty). The container ultimately sees OMNI_USER=$omni-api-token, which is the correct sentinel. Also note for the next debugger: `docker compose restart` does NOT re-read env_file. Use `docker compose up -d ` to recreate the container after editing omni_pass.env. 
Updates omni_pass_TEMPLATE.env header to document the API-token pattern explicitly (with the $$ caveat), and adds a troubleshooting row that distinguishes "wrong auth path" (DENIED with no Tokens.auth_with_api_token call) from "bad/expired token" (Tokens.auth_with_api_token: DENIED). Co-authored-by: Cursor --- docs/tutorials/airstack_on_osmo.md | 3 +- osmo/workspace/entrypoint.sh | 25 +++++++++++++-- .../isaac-sim/docker/omni_pass_TEMPLATE.env | 31 +++++++++++++------ 3 files changed, 47 insertions(+), 12 deletions(-) diff --git a/docs/tutorials/airstack_on_osmo.md b/docs/tutorials/airstack_on_osmo.md index 9001dee7e..8bea4d3e7 100644 --- a/docs/tutorials/airstack_on_osmo.md +++ b/docs/tutorials/airstack_on_osmo.md @@ -470,7 +470,8 @@ osmo workflow cancel $WF | `Permission denied (publickey)` on Remote-SSH | The pod authorised a different pubkey than the one your local SSH client is offering | Confirm `cat ~/.ssh/id_ed25519.pub` matches what was passed to `--set-env "SSH_PUB_KEY=..."`. Re-submit if the wrong key was used. | | `osmo workflow logs` shows `ERROR: SSH_PUB_KEY not set` | You forgot `--set-env` on submit | Cancel the workflow and resubmit with `--set-env "SSH_PUB_KEY=$(cat ~/.ssh/id_ed25519.pub)"` | | `docker pull` fails inside the pod with `unauthorized` | Your `airlab-docker-login` credential is missing or has the wrong Andrew ID/password | Re-run `airstack osmo:setup` (or the `osmo credential set airlab-docker-login ...` command in Step 0). | -| Logs show `WARN: airlab-nucleus OSMO credential not set` and Isaac Sim asset loads fail, **or** Isaac Sim shows "Login Required: Unable to connect server omniverse://airlab-nucleus..." | Your `airlab-nucleus` API token is missing, expired, or revoked. Confirm by SSH'ing the Nucleus host and running `sudo docker logs --tail 200 base_stack-nucleus-auth-1`; look for `InternalCredentials.auth: {... 'username': ''} → status: 'DENIED'`. 
Regenerate the token at <https://airlab-nucleus.andrew.cmu.edu/omni/web3/>, then re-run `airstack osmo:setup` and resubmit the workflow (or live-edit `simulation/isaac-sim/docker/omni_pass.env` in the pod and restart the `isaac-sim-livestream` container). | +| Logs show `WARN: airlab-nucleus OSMO credential not set` and Isaac Sim asset loads fail, **or** Isaac Sim shows "Login Required: Unable to connect server omniverse://airlab-nucleus..." with the auth-service log showing `InternalCredentials.auth … 'username': '<andrew-id>' … status: 'DENIED'` (no `Tokens.auth_with_api_token` call) | The pod is doing **password auth** instead of **API-token auth**. Inside the pod, `simulation/isaac-sim/docker/omni_pass.env` must have `OMNI_USER=$$omni-api-token` (literal `$$`, the sentinel for API-token auth — docker-compose v2 collapses `$$` to `$` on its way to the container). The OSMO entrypoint sets this automatically when `OMNI_PASS` looks like a JWT; if you see `OMNI_USER=<andrew-id>` in the file, correct it and then recreate the container with `docker compose --profile desktop --profile isaac-sim-livestream up -d isaac-sim-livestream` (`restart` does NOT re-read `env_file`). | +| Logs show `WARN: airlab-nucleus OSMO credential not set` and Isaac Sim asset loads fail, **or** Isaac Sim shows "Login Required: Unable to connect server omniverse://airlab-nucleus..." with the auth-service log showing `Tokens.auth_with_api_token … status: 'DENIED'` | Your `airlab-nucleus` API token is missing, expired, or revoked (rotation invalidates the predecessor). Confirm by SSH'ing the Nucleus host and running `sudo docker logs --tail 200 base_stack-nucleus-auth-1`. Regenerate the token at <https://airlab-nucleus.andrew.cmu.edu/omni/web3/>, then re-run `airstack osmo:setup` and resubmit the workflow (or live-edit `simulation/isaac-sim/docker/omni_pass.env` in the pod and recreate the `isaac-sim-livestream` container — see row above). | | Isaac Sim container restarts repeatedly | GPU not visible to the inner Docker daemon (toolkit not configured on the node) | Lab admin task. 
From inside the pod: `docker info \| grep -i runtime` should list `nvidia`. | | Isaac Sim is up but the WebRTC stream is blank | The Pegasus script isn't getting `--/app/livestream/enabled=true`, or the wrong Compose profile is active | In the integrated terminal: `docker logs airstack-isaac-sim-livestream-1`. Confirm `ISAAC_SIM_LIVESTREAM=true` and that the `isaac-sim-livestream` profile is the one running (`docker ps`). | | Foxglove "no connection" | Port-forward died, GCS container hasn't started yet, or browser is caching an old connection | Restart the `--port 8766:8766` forward; check `docker ps` shows `airstack-gcs-1` Up; try `ws://127.0.0.1:8766` instead of `ws://localhost:8766`. | diff --git a/osmo/workspace/entrypoint.sh b/osmo/workspace/entrypoint.sh index 04941c819..67068fde9 100755 --- a/osmo/workspace/entrypoint.sh +++ b/osmo/workspace/entrypoint.sh @@ -159,9 +159,30 @@ fi : "${OMNI_PASS:=guest}" : "${OMNI_SERVER:=omniverse://airlab-nucleus.andrew.cmu.edu/NVIDIA/Assets/Isaac/5.1}" -log "writing $OMNI_PASS_FILE (omni_user=${OMNI_USER}, omni_server=${OMNI_SERVER})" +# If OMNI_PASS looks like a Nucleus API JWT (header starts with `eyJ`), +# switch to API-token auth: omniclient expects the literal sentinel +# username `$omni-api-token` paired with the JWT as the password. +# Setting OMNI_USER to the actual Andrew ID would route the JWT through +# the password-verification path instead and Nucleus would silently +# DENY (visible only in the auth-service log as +# `InternalCredentials.auth … 'username': '' … status: DENIED`). +# +# docker-compose v2 interpolates env_file values, so the literal `$` +# must be doubled to `$$` to survive Compose's parser. The container +# ultimately sees `OMNI_USER=$omni-api-token`. 
+case "$OMNI_PASS" in + eyJ*.*.*) + log "OMNI_PASS looks like a JWT — using API-token auth (OMNI_USER=\$omni-api-token)" + OMNI_USER_LINE='OMNI_USER=$$omni-api-token' + ;; + *) + OMNI_USER_LINE="OMNI_USER=${OMNI_USER}" + ;; +esac + +log "writing $OMNI_PASS_FILE (${OMNI_USER_LINE}, omni_server=${OMNI_SERVER})" cat > "$OMNI_PASS_FILE" < ← the JWT (~1 KB, starts with eyJ) +# +# IMPORTANT: the `$$` is intentional. docker-compose v2 interpolates +# env_file values, and the literal `$` must be doubled to survive +# Compose's parser. The container ultimately sees `OMNI_USER=$omni-api-token`, +# which is what omniclient expects for API-token auth (anything else, e.g. +# your Andrew ID, routes the JWT through the password-verification path +# and Nucleus silently DENIES the request). +# +# Fallback: username/password auth. Set OMNI_USER to your Nucleus username +# and OMNI_PASS to your Nucleus password (NOT your Andrew password unless +# Nucleus is SSO-linked to it). +# +# If you skip Nucleus auth entirely, leave both at the guest defaults +# (read-only access; Isaac Sim asset loads from Nucleus may fail). +# ######################################################################### OMNI_USER=guest From 6f3a8e56bd6208dc3c7154ba475b3921c06f78bd Mon Sep 17 00:00:00 2001 From: Sebastian Scherer Date: Thu, 14 May 2026 20:06:25 -0400 Subject: [PATCH 08/13] docs(osmo): make OSMO the recommended dev path, single clone-the-repo flow MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Reposition the OSMO tutorial as AirStack's recommended day-to-day development path (not just a fallback for laptops without GPUs) and collapse it onto a single recipe: clone the repo, then drive everything through the airstack osmo:* wrappers in .airstack/modules/osmo.sh. 
- docs/tutorials/airstack_on_osmo.md - Retitle + rewrite the intro to lead with five concrete advantages (pooled GPUs, no local CUDA/Docker/driver maintenance, same image as CI + field robots, one-command onboarding, hardware bigger than your laptop). Demote the Linux+GPU-desktop path to an escape hatch. - Drop the Mac/Windows/no-GPU framing in 'Who is this for?' and the mermaid laptop subgraph label. - Add 'a local clone of AirStack' to Prerequisites; remove it from the 'do not need' list. - Replace Option A/B credential split with a single ./airstack.sh osmo:setup recipe; move the three raw osmo credential set calls into a collapsible 'Under the hood' footnote. - Replace each step's raw osmo workflow ... command with the corresponding airstack osmo:up/logs/ide/webrtc/foxglove/down wrapper; preserve the raw form in 'Under the hood' footnotes that cross-link cmd_osmo_* in .airstack/modules/osmo.sh. - Drop the export WF=... paragraph — the wrappers read the id from ~/.airstack/osmo-state automatically; AIRSTACK_OSMO_WF overrides per-invocation. \$WF now only appears inside the raw-form footnotes. - Sweep Troubleshooting + What-survives tables: redirect raw port-forward fixes to the airstack osmo:* equivalents and rename the section to 'What survives airstack osmo:down?'. - Fix WebRTC edge label (49100/tcp + 49099/udp) to match the pinned ports the workflow actually uses today. Companion cleanups now that the privileged_allowed flip is automatic on the OSMO autosync side (synchronize_osmo_team_pools.py forces privileged_allowed: true on every platform of every pool, so students never see the 'platform does not have privileged flag enabled' error): - osmo/README.md: drop the 'Most common blocker' privileged warning, the privileged_allowed row from the pool-requirements table, and the 'privileged GPU pod' / '(privileged, GPU)' descriptors in the architecture summary. Simplify the validation-stage SSH-failure hint. 
- osmo/workflows/airstack-dev.yaml: trim the long DinD-requires-privileged comment to a one-liner (the privileged: true directive itself stays). - .airstack/modules/osmo.sh: remove the special-case 'privileged flag enabled' error branch in cmd_osmo_up — it should never fire now. Co-authored-by: Cursor --- .airstack/modules/osmo.sh | 15 - docs/tutorials/airstack_on_osmo.md | 426 ++++++++++++++++------------- osmo/README.md | 27 +- osmo/workflows/airstack-dev.yaml | 9 +- 4 files changed, 245 insertions(+), 232 deletions(-) diff --git a/.airstack/modules/osmo.sh b/.airstack/modules/osmo.sh index a33a0185d..9e2e9bd5a 100755 --- a/.airstack/modules/osmo.sh +++ b/.airstack/modules/osmo.sh @@ -328,21 +328,6 @@ function cmd_osmo_up { if ! output="$("${cmd[@]}" 2>&1)"; then echo "$output" >&2 log_error "osmo workflow submit failed." - if echo "$output" | grep -q "privileged flag enabled"; then - log_error "The selected pool does not allow privileged tasks. AirStack-on-OSMO needs" - log_error "privileged: true on the workspace task (DinD)." - log_error "" - log_error "Audit available pools with:" - log_error " osmo pool list -t json | python3 -c \"import json,sys" - log_error " for ns in json.load(sys.stdin)['node_sets']:" - log_error " for p in ns['pools']:" - log_error " for n,plat in p['platforms'].items():" - log_error " print(f\\\"{p['name']:25} priv={plat['privileged_allowed']}\\\")\"" - log_error "" - log_error "If none allow privileged, ask your OSMO pool admin to flip" - log_error "platforms.default.privileged_allowed: true on the airstack pool." - log_error "Full template: docs/tutorials/airstack_on_osmo.md → 'One-time pool setup (admin)'." 
- fi return 1 fi echo "$output" diff --git a/docs/tutorials/airstack_on_osmo.md b/docs/tutorials/airstack_on_osmo.md index 8bea4d3e7..8013e0715 100644 --- a/docs/tutorials/airstack_on_osmo.md +++ b/docs/tutorials/airstack_on_osmo.md @@ -1,24 +1,37 @@ -# AirStack on OSMO — Remote Development on Mac, Windows, or any Linux - -This tutorial walks through developing on AirStack from a laptop that has -**no Docker, no NVIDIA GPU, and no AirStack source tree of its own**. You'll -attach VS Code or Cursor to a remote OSMO pod via Remote-SSH, edit code as -if it were local, and stream Isaac Sim and the GCS Foxglove dashboard back -to your browser through `osmo workflow port-forward`. - -> **Prefer local development on a Linux+GPU desktop?** Use the -> [Getting Started](../getting_started/index.md) flow instead — `airstack -> install` + `airstack up` is faster and doesn't depend on a remote -> scheduler. This tutorial is for everyone *else*. +# AirStack on OSMO — Recommended Remote Development Workflow + +This is AirStack's recommended day-to-day development path going forward. +You submit one OSMO workflow that spins up a GPU pod running the full +three-container AirStack stack (Isaac Sim, robot-desktop, GCS), attach VS +Code or Cursor to it over Remote-SSH, and stream Isaac Sim and the GCS +Foxglove dashboard back to your browser. + +Why this is the recommended path: + +- **Pooled GPUs.** A lab's GPUs are shared on-demand across the whole team + instead of pinned one-per-desktop. Onboarding doesn't require buying + hardware. +- **No local CUDA / Docker / driver maintenance.** Your laptop just needs + `git`, an SSH key, and an IDE. macOS, Windows, and Linux all work + identically. +- **Same image as CI and field robots.** The OSMO pod runs the exact + Docker images that the system tests and deployed robots run, so your + dev environment can't drift away from production. 
+- **One-command onboarding.** A new student goes from zero to "Isaac Sim + streaming into my browser" with `airstack osmo:setup` followed by + `airstack osmo:up` — no install marathon. +- **Hardware bigger than your laptop.** The pod has more CPU/RAM/GPU than + most dev laptops, even if you have a GPU laptop. + +> **Still want local development on a Linux+GPU desktop?** It works and +> can be faster for tight inner loops — see +> [Getting Started](../getting_started/index.md). It just isn't the +> recommended default anymore. ## Who is this for? -You want to develop AirStack and one of these is true: - -- You're on **macOS or Windows**. -- You have a Linux laptop but **no NVIDIA GPU**. -- Your lab shares a single GPU pool through OSMO and you'd like a - zero-installation onboarding path for new students. +Anyone developing AirStack — Mac, Windows, or Linux, with or without a +local GPU. You're comfortable using `git` from a terminal, you have an SSH key (`~/.ssh/id_ed25519` or similar), and you have either VS Code or Cursor @@ -26,21 +39,21 @@ installed. That's the entire local-machine bar. ## Architecture in a sentence -`osmo workflow submit` spins up a privileged GPU pod that runs sshd plus a -Docker-in-Docker daemon. Inside that pod, `airstack up` brings up the -familiar three AirStack containers (Isaac Sim, robot-desktop, GCS). Your IDE -attaches over Remote-SSH; Isaac Sim and Foxglove are reached via separate -port-forwards. +`airstack osmo:up` (which wraps `osmo workflow submit`) spins up a GPU pod +that runs sshd plus a Docker-in-Docker daemon. Inside that pod, `airstack +up` brings up the familiar three AirStack containers (Isaac Sim, +robot-desktop, GCS). Your IDE attaches over Remote-SSH; Isaac Sim and +Foxglove are reached via separate port-forwards. 
```mermaid flowchart LR - subgraph laptop [Your laptop - Mac / Windows / Linux] + subgraph laptop [Your laptop] ide[VS Code or Cursor + Remote-SSH] osmo[osmo CLI] fox[app.foxglove.dev] webrtc[Isaac Sim WebRTC client] end - subgraph pod [OSMO workspace pod - GPU, privileged] + subgraph pod [OSMO workspace pod - GPU] sshd[sshd] inner[Inner dockerd] isaac[isaac-sim container] @@ -50,7 +63,7 @@ flowchart LR osmo -- submit and port-forward --> pod ide -- ssh on 2200 --> sshd fox -- ws on 8766 --> gcs - webrtc -- WebRTC on 47995... --> isaac + webrtc -- "WebRTC on 49100/tcp, 49099/udp" --> isaac inner --> isaac inner --> robot inner --> gcs @@ -60,6 +73,7 @@ flowchart LR | You need | Why | |---|---| +| A local clone of AirStack (`git clone https://github.com/castacks/AirStack.git`) | The `airstack osmo:*` wrappers, the workflow YAML, and the Foxglove extensions all live in the repo | | The [`osmo` CLI](https://github.com/NVIDIA/OSMO) on your `PATH` | Submitting workflows and port-forwarding | | `osmo login` done once | Stores your auth token in `~/.config/osmo` | | An SSH keypair (e.g. `~/.ssh/id_ed25519`) | The pod authorises your pubkey at submit time. Generate one with `ssh-keygen -t ed25519` if you don't already have one. | @@ -68,71 +82,15 @@ flowchart LR | Optional: an Omniverse Streaming Client / WebRTC browser client | View the streamed Isaac Sim render | You **do not** need: Docker, NVIDIA drivers, `airstack install`, `airstack -setup`, a local clone of AirStack, sudo, or Linux. +setup`, sudo, or Linux. > **Lab admin prerequisites (someone else's job, once).** A lab admin -> confirms the OSMO pool allows `privileged: true` on a GPU node and pushes -> the `airstack-osmo-workspace` image to `airlab-docker.andrew.cmu.edu`. -> Details in -> [`osmo/README.md`](https://github.com/castacks/AirStack/blob/main/osmo/README.md) -> and the [one-time pool setup section below](#one-time-pool-setup-admin). 
+> pushes the `airstack-osmo-workspace` image to +> `airlab-docker.andrew.cmu.edu`. Details in +> [`osmo/README.md`](https://github.com/castacks/AirStack/blob/main/osmo/README.md). > > **Your job, once:** the next step. -## One-time pool setup (admin) - -If `osmo workflow submit` returns: - -``` -Server responded with status code 400 -Error message: Workflow submit failed: -Task with platform: does not have privileged flag enabled. Task workspace -``` - -…then the OSMO pool you're targeting has `privileged_allowed: false` and the -DinD workspace can't run. You need a pool admin to flip it. **As of the -AirLab `airlab-share-01` deployment audit (May 2026), every pool ships with -`privileged_allowed: false` by default.** - -Audit pools yourself first: - -```bash -osmo pool list -t json | python3 -c " -import json, sys -for ns in json.load(sys.stdin)['node_sets']: - for p in ns['pools']: - for n, plat in p['platforms'].items(): - print(f\"{p['name']:25} {n:10} priv={plat['privileged_allowed']}\")" -``` - -If none show `priv=True`, send your pool admin a note like this: - -> Subject: Please enable privileged on the `airstack` pool -> -> Hi — I'm using the AirStack OSMO remote-dev workflow which runs the -> existing AirStack docker-compose stack inside a single pod via -> Docker-in-Docker (so students keep the `airstack up` UX). DinD requires -> `privileged: true` on the workspace task — without it the inner dockerd -> can't manage cgroups, overlayfs, the airstack_network bridge, or GPU -> device passthrough. -> -> Could you flip the `airstack` pool's platform to allow privileged tasks? -> Equivalent of: -> -> ```yaml -> platforms: -> default: -> privileged_allowed: true -> ``` -> -> No `host_network_allowed` change is needed — `osmo workflow port-forward` -> reaches the pod NS, which is enough for our Isaac Sim WebRTC and Foxglove -> streams. Workflow YAML for reference: -> `osmo/workflows/airstack-dev.yaml` in the AirStack repo. Setup details: -> `osmo/README.md`. 
- -Once enabled, target that pool with `--pool airstack` in Step 2. - ## Step 0 — Register your OSMO credentials (one time) OSMO credentials are **per-user** (each Andrew ID has its own Nucleus token, @@ -144,16 +102,19 @@ laptop never sees the values again. You need three credentials. The exact names matter — the workflow YAML references them by these exact names. -### Option A — interactive helper (recommended) - -If you have a local AirStack clone: +From your AirStack clone, run: ```bash -airstack osmo:setup +git clone https://github.com/castacks/AirStack.git +cd AirStack +./airstack.sh osmo:setup ``` This prompts for your Andrew ID, AirLab Docker password, and Nucleus API -token, then runs the three `osmo credential set` commands below for you. +token (get one at → +right-click cloud icon → **API Tokens** → Create), then registers the +three credentials with OSMO. The values go directly to your OSMO profile +— nothing is written to local disk. > **macOS prereq: bash 4+.** macOS ships bash 3.2 by default and the > `airstack` CLI needs bash 4+. If you see @@ -167,48 +128,44 @@ token, then runs the three `osmo credential set` commands below for you. > at `/opt/homebrew/bin/bash` (Apple Silicon) or `/usr/local/bin/bash` > (Intel) and re-execs under it. You don't need to change your login shell. -### Option B — three commands, copy-paste +### Verify -If you'd rather run the commands yourself (or you're on a laptop without an -AirStack clone), here they are: +List your credentials: -#### 1. AirLab Docker registry (REGISTRY type) +```bash +osmo credential list +``` -Used by OSMO to pull the workspace image into the pod. OSMO auto-attaches -this to any image whose hostname matches `registry=` — you don't need to -reference it in YAML. +You should see all three (`airlab-docker-registry`, `airlab-docker-login`, +`airlab-nucleus`). To rotate any of them later, just re-run +`./airstack.sh osmo:setup`. + +
+Under the hood — the three raw `osmo credential set` calls + +`airstack osmo:setup` (defined in +[`.airstack/modules/osmo.sh`](https://github.com/castacks/AirStack/blob/main/.airstack/modules/osmo.sh) +as `cmd_osmo_setup`) is equivalent to running these three commands by hand +— useful for debugging or rotating one credential at a time: ```bash +# 1. AirLab Docker registry (REGISTRY) — for OSMO's outer image-pull of +# airlab-docker.andrew.cmu.edu/airstack/airstack-osmo-workspace osmo credential set airlab-docker-registry \ --type REGISTRY \ --payload registry=airlab-docker.andrew.cmu.edu \ username= \ auth='' -``` - -#### 2. AirLab Docker login (GENERIC type) -The same Andrew ID + password, but as a **GENERIC** credential so the -**inner** dockerd inside the pod can `docker login` it and pull the -AirStack images. This duplication is unfortunately necessary — REGISTRY -credentials are for OSMO's outer image-pull only and aren't exposed to the -container as env vars. - -```bash +# 2. AirLab Docker login (GENERIC) — for the *inner* dockerd inside the +# pod to `docker login` and pull the AirStack image set osmo credential set airlab-docker-login \ --type GENERIC \ --payload username= \ password='' -``` -#### 3. AirLab Nucleus (GENERIC type) - -Nucleus authenticates with an **API token**, not your password. To get one: -go to , log in, -right-click the cloud icon in the top-right → **API Tokens** → create a new -token. Save it — Nucleus shows it once. - -```bash +# 3. AirLab Nucleus (GENERIC) — for Isaac Sim to authenticate against +# omniverse://airlab-nucleus.andrew.cmu.edu (API token, NOT password) osmo credential set airlab-nucleus \ --type GENERIC \ --payload omni_user= \ @@ -216,16 +173,7 @@ osmo credential set airlab-nucleus \ omni_server=omniverse://airlab-nucleus.andrew.cmu.edu/NVIDIA/Assets/Isaac/5.1 ``` -### Verify - -List your credentials: - -```bash -osmo credential list -``` - -You should see all three. 
To rotate any of them later, just re-run the -matching `osmo credential set` command. +
> **Why three credentials?** It's tempting to consolidate. The reason for > the split: OSMO REGISTRY credentials drive Kubernetes `imagePullSecrets` @@ -271,46 +219,49 @@ The `localhost:2200` is what we'll port-forward to in step 4. ## Step 2 — Submit the workflow -The repo ships the workflow at -[`osmo/workflows/airstack-dev.yaml`](https://github.com/castacks/AirStack/blob/main/osmo/workflows/airstack-dev.yaml). -You don't need a local AirStack clone to submit it — `osmo workflow submit` -takes a path and uploads the YAML. +From the AirStack clone: ```bash -# If you don't have AirStack cloned locally: -curl -fsSL -o airstack-dev.yaml \ - https://raw.githubusercontent.com/castacks/AirStack/main/osmo/workflows/airstack-dev.yaml - -# Submit: -osmo workflow submit airstack-dev.yaml \ - --pool airstack \ - --set-env "SSH_PUB_KEY=$(cat ~/.ssh/id_ed25519.pub)" +./airstack.sh osmo:up --pool airstack ``` -> **Got `Task with platform ... does not have privileged flag enabled`?** -> The pool you picked doesn't allow privileged tasks. See the -> [one-time pool setup section](#one-time-pool-setup-admin) above — -> AirLab's default pools all ship with privileged off and need an admin -> to flip it on. +This submits +[`osmo/workflows/airstack-dev.yaml`](https://github.com/castacks/AirStack/blob/main/osmo/workflows/airstack-dev.yaml) +with your local SSH pubkey injected as `SSH_PUB_KEY` — that's what +authorises **your** key on **this** workflow (each student passes their +own at submit time; the lab admin doesn't manage a global +`authorized_keys` file). + +`airstack osmo:up` prints a workflow id like `airstack-dev-1` and stores +it in `~/.airstack/osmo-state`, so the rest of the `airstack osmo:*` +commands in this tutorial pick it up automatically — no `export WF=...` +needed. To target a specific workflow for a single invocation, export +`AIRSTACK_OSMO_WF=`. -The `--set-env "SSH_PUB_KEY=..."` line is what authorises **your** key on -**this** workflow. 
Each student passes their own pubkey at submit time — -the lab admin doesn't manage a global authorized_keys file. +
+Under the hood — raw `osmo workflow submit` -The command prints a workflow ID like `airstack-dev-1`. Save it; you'll -reuse it for every other command in this tutorial. The shell snippets below -assume you've stored it as `WF`: +`airstack osmo:up` (defined in +[`.airstack/modules/osmo.sh`](https://github.com/castacks/AirStack/blob/main/.airstack/modules/osmo.sh) +as `cmd_osmo_up`) is equivalent to: ```bash -export WF=airstack-dev-1 +osmo workflow submit osmo/workflows/airstack-dev.yaml \ + --pool airstack \ + --set-env "SSH_PUB_KEY=$(cat ~/.ssh/id_ed25519.pub)" ``` +If you're using the raw form, save the printed workflow id as `$WF` and use the +raw `osmo workflow ...` commands in place of `airstack osmo:*` in the rest of the tutorial. + +
+ ## Step 3 — Wait for the stack to come up Tail the lead task's logs and watch for milestones: ```bash -osmo workflow logs $WF workspace --follow +./airstack.sh osmo:logs ``` Expected milestones, in order (each is one line in the log): @@ -326,26 +277,44 @@ Expected milestones, in order (each is one line in the log): If step (1) appears, you can attach the IDE while the rest is still spinning up — the bring-up will continue in the background. -## Step 4 — Forward sshd and attach the IDE +
+Under the hood — raw `osmo workflow logs` -In one terminal, start the port-forward and **leave it running** for the -length of your session. The 24h connect-timeout matches the workflow's -`exec_timeout`: +`airstack osmo:logs` (defined in +[`.airstack/modules/osmo.sh`](https://github.com/castacks/AirStack/blob/main/.airstack/modules/osmo.sh) +as `cmd_osmo_logs`) polls the equivalent of: ```bash -osmo workflow port-forward $WF workspace --port 2200:22 --connect-timeout 86400 +osmo workflow logs $WF -t workspace -n 500 ``` -In your editor: +every few seconds and prints only the new lines (the `osmo` CLI has no +native `--follow`). Override the task / tail length / poll interval with +`OSMO_LOGS_TASK`, `OSMO_LOGS_TAIL`, `OSMO_LOGS_INTERVAL` env vars. -- **VS Code:** Command Palette → **Remote-SSH: Connect to Host…** → pick - `airstack-osmo`. -- **Cursor:** the same flow under its remote-development menu. +
+ +## Step 4 — Forward sshd and attach the IDE + +In one terminal, run: + +```bash +./airstack.sh osmo:ide +``` + +This (a) starts the `localhost:2200 → pod:22` port-forward with a 24h +connect-timeout (matching the workflow's `exec_timeout`), waits for the +tunnel to come up, then (b) launches Cursor or VS Code (whichever it +finds on `PATH`) pre-attached to +`vscode-remote://ssh-remote+airstack-osmo/root/AirStack`. **Leave the +terminal running** for the length of your session — closing it tears the +tunnel down. The IDE installs its remote server in the pod on first connect (~50 MB, slower on a fresh pod, cached on subsequent connects). Then: -1. **Open Folder…** → `/root/AirStack`. +1. The IDE should open `/root/AirStack` automatically. (If not: + **Open Folder…** → `/root/AirStack`.) 2. Open the integrated terminal — you're root in `/root/AirStack`. 3. Edit code in the IDE; the changes land directly on the pod's disk. @@ -358,6 +327,28 @@ docker ps You should see four containers: `airstack-isaac-sim-livestream-1`, `airstack-robot-desktop-1`, `airstack-gcs-1`, plus the AirStack CLI helper. +
+Under the hood — raw port-forward + manual IDE attach + +`airstack osmo:ide` (defined in +[`.airstack/modules/osmo.sh`](https://github.com/castacks/AirStack/blob/main/.airstack/modules/osmo.sh) +as `cmd_osmo_ide`) is equivalent to running the port-forward by hand: + +```bash +osmo workflow port-forward $WF workspace --port 2200:22 --connect-timeout 86400 +``` + +…then in the editor: + +- **VS Code:** Command Palette → **Remote-SSH: Connect to Host…** → pick + `airstack-osmo`. +- **Cursor:** the same flow under its remote-development menu. + +Pass `--no-open` to `airstack osmo:ide` to run only the port-forward, then +attach the IDE manually as described here. + +
+ ## Step 5 — Pick a feature branch and start working The pod cloned `main` into `/root/AirStack` on startup. Treat it like any @@ -384,47 +375,87 @@ fact that you're on a remote pod is invisible from inside the IDE. Isaac Sim runs headless inside the pod with the Kit `omni.kit.livestream.webrtc` extension enabled (configured by the -`isaac-sim-livestream` Compose profile). To view it locally, forward the -livestream port range — **two** terminals because livestream uses both TCP -and UDP: +`isaac-sim-livestream` Compose profile). To view it locally: ```bash -# Terminal A (TCP): -osmo workflow port-forward $WF workspace \ - --port 47995-48012,49000-49007,49100 --connect-timeout 86400 +./airstack.sh osmo:webrtc ``` -```bash -# Terminal B (UDP): -osmo workflow port-forward $WF workspace \ - --port 47995-48012,49000-49007 --udp --connect-timeout 86400 -``` +This spawns the UDP port-forward (media, `49099`) in the background and +runs the TCP port-forward (signaling, `49100`) in the foreground — leave +that terminal running. Then point the **Omniverse Streaming Client** (or a WebRTC-capable browser client) at `http://localhost`. The simulation viewport shows up the same way it would on a local Linux desktop. +
+Under the hood — raw TCP + UDP port-forwards + +`airstack osmo:webrtc` (defined in +[`.airstack/modules/osmo.sh`](https://github.com/castacks/AirStack/blob/main/.airstack/modules/osmo.sh) +as `cmd_osmo_webrtc`) is equivalent to running the two raw port-forwards +in separate terminals — Kit's WebRTC needs both TCP signaling and UDP +SRTP media, and the AirStack workflow pins both to single ports rather +than scanning the Kit default range: + +```bash +# Terminal A — TCP signaling (49100): +osmo workflow port-forward $WF workspace --port 49100 --connect-timeout 86400 + +# Terminal B — UDP media (49099, pinned by the Pegasus launch script): +osmo workflow port-forward $WF workspace --port 49099 --udp --connect-timeout 86400 +``` + +
+ ## Step 7 — View ROS topics in Foxglove The GCS container runs `foxglove_bridge` on container-port `8765`, -published as host-port `8766` on the workspace pod. Forward it once: +published as host-port `8766` on the workspace pod. To install the +AirStack Foxglove extensions locally and forward the websocket in one +step: ```bash -osmo workflow port-forward $WF workspace --port 8766:8766 --connect-timeout 86400 +./airstack.sh osmo:foxglove ``` -Then in [https://app.foxglove.dev](https://app.foxglove.dev): +This copies the AirStack Foxglove extensions (Robot Tasks, Waypoint +Editor, Polygon Editor) into your local Foxglove Desktop user-extensions +dir (default `~/.foxglove-studio/extensions`; override with +`OSMO_FOXGLOVE_EXT_DIR`, skip with `OSMO_FOXGLOVE_SKIP_EXTENSIONS=1` for +`app.foxglove.dev` which doesn't load local extensions), then runs the +`localhost:8766 → pod:8766` port-forward in the foreground — leave the +terminal running. + +Then in [https://app.foxglove.dev](https://app.foxglove.dev) (or Foxglove +Desktop): 1. **Open connection** → `ws://localhost:8766`. 2. **Layouts** → **Import from file** → [`gcs/foxglove_extensions/airstack_default.json`](https://github.com/castacks/AirStack/blob/main/gcs/foxglove_extensions/airstack_default.json) - from your local AirStack clone (or download it via the GitHub raw URL). + from your AirStack clone. 3. Pick the imported layout from the layout dropdown in the top-right. The full Foxglove flow — layout import, panel customisation, DDS bridge naming — is documented at [Foxglove Visualization](../gcs/foxglove.md). The only OSMO-specific -difference is the `port-forward` line in front of it. +difference is the `osmo:foxglove` line in front of it. + +
+Under the hood — raw `osmo workflow port-forward` + +`airstack osmo:foxglove` (defined in +[`.airstack/modules/osmo.sh`](https://github.com/castacks/AirStack/blob/main/.airstack/modules/osmo.sh) +as `cmd_osmo_foxglove`) wraps the extension install plus: + +```bash +osmo workflow port-forward $WF workspace --port 8766:8766 --connect-timeout 86400 +``` + +Set `OSMO_FOXGLOVE_SKIP_EXTENSIONS=1` to only run the port-forward. + +
## Step 8 — Commit and push from inside the IDE @@ -453,32 +484,49 @@ laptop, a fresh pod tomorrow, a colleague's machine. When you're done: ```bash -osmo workflow cancel $WF +./airstack.sh osmo:down ``` +This prints a 5-second warning then cancels the workflow stored in +`~/.airstack/osmo-state`. Hit Ctrl-C in the grace window if you submitted +by accident. + > **Push first.** Anything that's still in your working tree, in `.git/` > but not pushed, in `build/`, in `bags/`, or in `/root/` outside the repo > **will be lost** on cancel. The pod is cattle. If you forget and need > something pulled out, see "I forgot to push before tearing down" below > *before* hitting cancel. +
+Under the hood — raw `osmo workflow cancel` + +`airstack osmo:down` (defined in +[`.airstack/modules/osmo.sh`](https://github.com/castacks/AirStack/blob/main/.airstack/modules/osmo.sh) +as `cmd_osmo_down`) is equivalent to: + +```bash +osmo workflow cancel $WF +``` + +
+ ## Troubleshooting | Symptom | Likely cause | Fix | |---|---|---| -| `Remote-SSH: Connection refused` after a working session | Port-forward died (laptop slept, network blip) | Re-run `osmo workflow port-forward $WF workspace --port 2200:22 --connect-timeout 86400` | -| `Permission denied (publickey)` on Remote-SSH | The pod authorised a different pubkey than the one your local SSH client is offering | Confirm `cat ~/.ssh/id_ed25519.pub` matches what was passed to `--set-env "SSH_PUB_KEY=..."`. Re-submit if the wrong key was used. | -| `osmo workflow logs` shows `ERROR: SSH_PUB_KEY not set` | You forgot `--set-env` on submit | Cancel the workflow and resubmit with `--set-env "SSH_PUB_KEY=$(cat ~/.ssh/id_ed25519.pub)"` | -| `docker pull` fails inside the pod with `unauthorized` | Your `airlab-docker-login` credential is missing or has the wrong Andrew ID/password | Re-run `airstack osmo:setup` (or the `osmo credential set airlab-docker-login ...` command in Step 0). | +| `Remote-SSH: Connection refused` after a working session | Port-forward died (laptop slept, network blip) | Re-run `./airstack.sh osmo:ide` | +| `Permission denied (publickey)` on Remote-SSH | The pod authorised a different pubkey than the one your local SSH client is offering | Confirm `cat ~/.ssh/id_ed25519.pub` matches the key that was injected at submit time. Re-submit with `./airstack.sh osmo:down && ./airstack.sh osmo:up --pool airstack`. | +| `airstack osmo:logs` shows `ERROR: SSH_PUB_KEY not set` | The submit didn't inject a pubkey (e.g. you ran raw `osmo workflow submit` without `--set-env`) | `./airstack.sh osmo:down`, then resubmit with `./airstack.sh osmo:up --pool airstack` (it injects `SSH_PUB_KEY` automatically). | +| `docker pull` fails inside the pod with `unauthorized` | Your `airlab-docker-login` credential is missing or has the wrong Andrew ID/password | Re-run `./airstack.sh osmo:setup`. 
| | Logs show `WARN: airlab-nucleus OSMO credential not set` and Isaac Sim asset loads fail, **or** Isaac Sim shows "Login Required: Unable to connect server omniverse://airlab-nucleus..." with the auth-service log showing `InternalCredentials.auth … 'username': '' … status: 'DENIED'` (no `Tokens.auth_with_api_token` call) | The pod is doing **password auth** instead of **API-token auth**. Inside the pod, `simulation/isaac-sim/docker/omni_pass.env` must have `OMNI_USER=$$omni-api-token` (literal `$$`, the sentinel for API-token auth — docker-compose v2 collapses `$$` to `$` on its way to the container). The OSMO entrypoint sets this automatically when `OMNI_PASS` looks like a JWT; if you see `OMNI_USER=` in the file, recreate the container with `docker compose --profile desktop --profile isaac-sim-livestream up -d isaac-sim-livestream` (`restart` does NOT re-read `env_file`). | -| Logs show `WARN: airlab-nucleus OSMO credential not set` and Isaac Sim asset loads fail, **or** Isaac Sim shows "Login Required: Unable to connect server omniverse://airlab-nucleus..." with the auth-service log showing `Tokens.auth_with_api_token … status: 'DENIED'` | Your `airlab-nucleus` API token is missing, expired, or revoked (rotation invalidates the predecessor). Confirm by SSH'ing the Nucleus host and running `sudo docker logs --tail 200 base_stack-nucleus-auth-1`. Regenerate the token at , then re-run `airstack osmo:setup` and resubmit the workflow (or live-edit `simulation/isaac-sim/docker/omni_pass.env` in the pod and recreate the `isaac-sim-livestream` container — see row above). | +| Logs show `WARN: airlab-nucleus OSMO credential not set` and Isaac Sim asset loads fail, **or** Isaac Sim shows "Login Required: Unable to connect server omniverse://airlab-nucleus..." with the auth-service log showing `Tokens.auth_with_api_token … status: 'DENIED'` | Your `airlab-nucleus` API token is missing, expired, or revoked (rotation invalidates the predecessor). 
Confirm by SSH'ing the Nucleus host and running `sudo docker logs --tail 200 base_stack-nucleus-auth-1`. Regenerate the token at , then `./airstack.sh osmo:setup` and `./airstack.sh osmo:down && ./airstack.sh osmo:up --pool airstack` to resubmit (or live-edit `simulation/isaac-sim/docker/omni_pass.env` in the pod and recreate the `isaac-sim-livestream` container — see row above). | | Isaac Sim container restarts repeatedly | GPU not visible to the inner Docker daemon (toolkit not configured on the node) | Lab admin task. From inside the pod: `docker info \| grep -i runtime` should list `nvidia`. | | Isaac Sim is up but the WebRTC stream is blank | The Pegasus script isn't getting `--/app/livestream/enabled=true`, or the wrong Compose profile is active | In the integrated terminal: `docker logs airstack-isaac-sim-livestream-1`. Confirm `ISAAC_SIM_LIVESTREAM=true` and that the `isaac-sim-livestream` profile is the one running (`docker ps`). | -| Foxglove "no connection" | Port-forward died, GCS container hasn't started yet, or browser is caching an old connection | Restart the `--port 8766:8766` forward; check `docker ps` shows `airstack-gcs-1` Up; try `ws://127.0.0.1:8766` instead of `ws://localhost:8766`. | +| Foxglove "no connection" | Port-forward died, GCS container hasn't started yet, or browser is caching an old connection | Re-run `./airstack.sh osmo:foxglove`; check `docker ps` shows `airstack-gcs-1` Up; try `ws://127.0.0.1:8766` instead of `ws://localhost:8766`. | | First Remote-SSH connect takes forever | VS Code / Cursor downloading its remote server (~50 MB) into the fresh pod | Wait it out the first time. Subsequent connects to the same pod hit the cache. | -| **I forgot to push before tearing down** | The pod is still up; cancel hasn't fired yet | Don't cancel. SSH in via the existing port-forward, push from the IDE terminal, *then* cancel. If the workflow has already terminated and the pod is gone, the work is gone — git is the only persistence layer. 
| +| **I forgot to push before tearing down** | The pod is still up; cancel hasn't fired yet | Don't run `./airstack.sh osmo:down`. SSH in via the existing port-forward (`./airstack.sh osmo:ide --no-open` if the tunnel is gone), push from the IDE terminal, *then* tear down. If the workflow has already terminated and the pod is gone, the work is gone — git is the only persistence layer. | -## What survives a `osmo workflow cancel`? +## What survives `airstack osmo:down`? | Artifact | Lives in | Survives? | |---|---|---| @@ -487,7 +535,7 @@ osmo workflow cancel $WF | Uncommitted edits in the IDE | Pod-local working tree | **No** | | `colcon build` outputs (`build/`, `install/`, `log/`) | `/root/AirStack/**/ros_ws/...` | **No** (gitignored Linux x86_64 binaries; rebuild trivially) | | Inner-dockerd image cache | Pod-local Docker layer cache | **No** | -| Bag files, sim recordings, debug screenshots | `/root/AirStack/bags/`, etc. | **No** — pull selectively via `osmo workflow rsync download $WF :` *before* cancel | +| Bag files, sim recordings, debug screenshots | `/root/AirStack/bags/`, etc. | **No** — pull selectively via `osmo workflow rsync download "$(cat ~/.airstack/osmo-state)" :` *before* tearing down | The rule of thumb: **commit + push every time you'd save a file in a git-tracked sense.** The Source Control panel is the persistence boundary. @@ -498,7 +546,7 @@ git-tracked sense.** The Source Control panel is the persistence boundary. — lab-admin reference (pool prerequisites, OSMO credential registration, workspace image build, validation stages). - [Foxglove Visualization](../gcs/foxglove.md) — full layout import + - panel-customisation flow once your `port-forward 8766:8766` is up. + panel-customisation flow once your `airstack osmo:foxglove` is up. - [AGENTS.md](https://github.com/castacks/AirStack/blob/main/AGENTS.md) — inside-the-pod workflow once you're attached: `bws`, `sws`, `docker exec`, ROS 2 commands. 
diff --git a/osmo/README.md b/osmo/README.md index adbc1dd88..09ac3b569 100644 --- a/osmo/README.md +++ b/osmo/README.md @@ -27,15 +27,15 @@ for context. ## Architecture in one minute -A student submits one OSMO task that runs a privileged Docker-in-Docker (DinD) -pod with sshd. Inside that pod, `airstack.sh up` brings up the regular +A student submits one OSMO task that runs a Docker-in-Docker (DinD) pod with +sshd. Inside that pod, `airstack.sh up` brings up the regular three-container AirStack stack (Isaac Sim, robot-desktop, GCS) on the inner Docker daemon. The student attaches VS Code or Cursor over Remote-SSH and streams Isaac Sim (WebRTC) and the GCS Foxglove bridge (websocket) back to their laptop via `osmo workflow port-forward`. ``` -Student laptop OSMO workspace pod (privileged, GPU) +Student laptop OSMO workspace pod (GPU) ───────────────── ───────────────────────────────────── VS Code / Cursor ── ssh ──► port-forward 2200:22 ──► sshd Isaac Sim WebRTC ── webrtc ► port-forward 47995… ──► inner isaac-sim ctnr @@ -49,28 +49,10 @@ app.foxglove.dev ── ws ────► port-forward 8766 ────► ## Pool requirements -> **⚠️ Most common blocker:** if `osmo workflow submit` returns `Task with -> platform: does not have privileged flag enabled`, the pool you -> selected has `privileged_allowed: false`. As of the AirLab `airlab-share-01` -> deployment audit (May 2026), **every pool defaults to `privileged_allowed: -> false`**. Ask the pool admin to flip it to `true` on the `airstack` pool (or -> create a dedicated pool) — see the message template in -> [`docs/tutorials/airstack_on_osmo.md`](../docs/tutorials/airstack_on_osmo.md#one-time-pool-setup-admin). 
-> You can audit which pools allow privileged with: -> ```bash -> osmo pool list -t json | python3 -c " -> import json, sys -> for ns in json.load(sys.stdin)['node_sets']: -> for p in ns['pools']: -> for n, plat in p['platforms'].items(): -> print(f\"{p['name']:25} {n:10} priv={plat['privileged_allowed']}\")" -> ``` - The OSMO pool the workflow runs on must satisfy: | Requirement | Why | |---|---| -| `privileged_allowed: true` | Required for DinD inside the workspace task. The inner `dockerd` needs cgroup manipulation, overlayfs, bridge/veth/iptables for `airstack_network`, and GPU device passthrough. There is no non-privileged path. Mounting the host's `/var/run/docker.sock` is not used — it would let the pod escape to the cluster node. | | GPU pool with NVIDIA driver + `nvidia-container-toolkit` on each node | Isaac Sim needs the GPU. The toolkit must be on the node so the inner `dockerd` (configured with `--add-runtime nvidia=...`, `default-runtime: nvidia`) can hand the device to the inner Isaac Sim container. | | No NetworkPolicy blocking pod-namespace ports `47995–48012/tcp+udp`, `49000–49007/tcp+udp`, `49100/tcp`, `8766/tcp`, `22/tcp` | These are the ports `osmo workflow port-forward` reaches inside the pod NS for Isaac Sim WebRTC, GCS Foxglove websocket, and sshd. | | Resource limits ≥ `cpu: 16`, `memory: 64Gi`, `storage: 200Gi`, `gpu: 1` | Isaac Sim + AirStack images + `colcon build` working tree. Adjust upward if running multiple robots or heavy bag recording. | @@ -161,8 +143,7 @@ ssh -p 2200 -o StrictHostKeyChecking=accept-new root@localhost 'echo ok && whoam ``` If SSH fails: check `osmo workflow logs workspace` for the -`SSH_PUB_KEY not set` error or for `sshd` failing to start. Make sure the -pool actually allowed the privileged task to run. +`SSH_PUB_KEY not set` error or for `sshd` failing to start. 
### (b) VS Code / Cursor Remote-SSH attaches and opens `/root/AirStack` diff --git a/osmo/workflows/airstack-dev.yaml b/osmo/workflows/airstack-dev.yaml index 854a47ded..7c7adc943 100644 --- a/osmo/workflows/airstack-dev.yaml +++ b/osmo/workflows/airstack-dev.yaml @@ -1,6 +1,6 @@ # AirStack remote developer workflow on OSMO. # -# Submits a single privileged GPU task ("workspace") that runs Docker-in-Docker +# Submits a single GPU task ("workspace") that runs Docker-in-Docker # (DinD) and brings up the regular AirStack three-container stack (Isaac Sim # with WebRTC livestream, robot-desktop, GCS) on the inner Docker daemon. The # task also runs sshd so a student can attach VS Code or Cursor over Remote-SSH @@ -42,10 +42,9 @@ workflow: - name: workspace lead: true image: airlab-docker.andrew.cmu.edu/airstack/airstack-osmo-workspace:latest - # privileged is non-negotiable for DinD: cgroups, overlayfs, bridge, - # iptables, GPU device passthrough. hostNetwork is intentionally NOT - # set — osmo workflow port-forward reaches the pod NS, where the inner - # dockerd publishes ports via standard NAT. + # Required so the inner dockerd can run. hostNetwork is intentionally + # NOT set — osmo workflow port-forward reaches the pod NS, where the + # inner dockerd publishes ports via standard NAT. privileged: true command: ["bash"] args: ["/usr/local/bin/entrypoint.sh"] From b61b18b4e41e61cd8942d86cf327ade74f5015d6 Mon Sep 17 00:00:00 2001 From: Sebastian Scherer Date: Thu, 14 May 2026 20:26:14 -0400 Subject: [PATCH 09/13] fix(osmo): make osmo:logs actually stream + survive pod host-key churn osmo:logs was silent because cmd_osmo_logs wrapped osmo workflow logs in $( ... ) on the assumption that -n LAST_N_LINES exits after dumping the tail. Empirically the CLI keeps the stream open as new lines arrive (it already behaves like tail -f, despite --help advertising only -n), so command substitution waited forever and printed nothing. 
Drop the polling loop and just exec the command directly. Each fresh OSMO pod also ships a new sshd host key, so every osmo:up trips StrictHostKeyChecking against the previous workflow's fingerprint and SSH/Cursor abort with "Host key for [localhost]:2200 has changed". Switch the recommended ~/.ssh/config block (and osmo/README.md) to the ephemeral-host pattern (StrictHostKeyChecking no + UserKnownHostsFile /dev/null + LogLevel ERROR), and have cmd_osmo_ide ssh-keygen -R the stale loopback entry on every run so users on the old config get unblocked automatically. Co-authored-by: Cursor --- .airstack/modules/osmo.sh | 74 +++++++++++------------------- docs/tutorials/airstack_on_osmo.md | 33 +++++++++++-- osmo/README.md | 15 +++++- 3 files changed, 67 insertions(+), 55 deletions(-) diff --git a/.airstack/modules/osmo.sh b/.airstack/modules/osmo.sh index 9e2e9bd5a..4933ac5f4 100755 --- a/.airstack/modules/osmo.sh +++ b/.airstack/modules/osmo.sh @@ -354,60 +354,23 @@ function cmd_osmo_up { # osmo:logs — follow the workspace task logs. # -# The osmo CLI's `workflow logs` command has no --follow flag (and the task -# is selected with `-t TASK`, not positionally). To get a tail -f experience -# we re-fetch the last N lines on a short interval and print only the lines -# that appeared since the previous poll. The "find last seen line, print -# what follows" trick degrades gracefully: if the cursor outruns -n during -# a particularly loud burst, we simply re-print the whole tail with a -# warning rather than dropping output silently. +# Despite the `osmo workflow logs --help` output advertising only `-n +# LAST_N_LINES` (no `--follow`), the CLI in fact streams the tail and keeps +# the connection open as new lines arrive — i.e. it already behaves like +# `tail -f`. We just exec it in the foreground so the user sees output +# immediately and can Ctrl+C to stop. 
(An earlier implementation wrapped +# this in `out=$(osmo workflow logs ...)`; command substitution waits for +# the process to exit, which never happened, so nothing was ever printed.) function cmd_osmo_logs { _osmo_check_cli || return 1 local wf; wf="$(_osmo_wf_id)" || return 1 local task="${OSMO_LOGS_TASK:-workspace}" local lines="${OSMO_LOGS_TAIL:-500}" - local interval="${OSMO_LOGS_INTERVAL:-3}" - - log_info "Following ${task} logs for ${wf} (polling every ${interval}s, last ${lines} lines per fetch; Ctrl+C to stop)" - - local prev="" - trap 'echo; log_info "stopped following ${wf}"; trap - INT TERM; return 0' INT TERM - while true; do - local out - out="$(osmo workflow logs "${wf}" -t "${task}" -n "${lines}" 2>/dev/null)" - if [ -n "${out}" ] && [ "${out}" != "${prev}" ]; then - if [ -z "${prev}" ]; then - printf '%s\n' "${out}" - else - local last_line; last_line="$(printf '%s' "${prev}" | tail -1)" - local suffix - suffix="$(printf '%s\n' "${out}" | awk -v L="${last_line}" ' - matched { print; next } - $0 == L { matched=1 } - ')" - if [ -n "${suffix}" ]; then - printf '%s\n' "${suffix}" - else - log_warn "log cursor outran -n ${lines}; reprinting tail" - printf '%s\n' "${out}" - fi - fi - prev="${out}" - fi - # Exit cleanly once the workflow reaches a terminal state. - local status - status="$(osmo workflow query "${wf}" 2>/dev/null | awk -F': +' '/^Status/ {print $2; exit}' | tr -d ' \r\n')" - case "${status}" in - SUCCEEDED|FAILED|FAILED_*|CANCELED) - log_info "workflow ${wf} is ${status}; exiting follow" - break - ;; - esac - sleep "${interval}" - done - trap - INT TERM + log_info "Following ${task} logs for ${wf} (last ${lines} lines, then live; Ctrl+C to stop)" + + osmo workflow logs "${wf}" -t "${task}" -n "${lines}" } # osmo:ide — port-forward sshd + (optionally) launch VS Code/Cursor on the @@ -446,6 +409,21 @@ function cmd_osmo_ide { # `--port LOCAL:REMOTE` mapping). 
local local_port="${OSMO_SSH_PORT%%:*}" + # Every fresh OSMO pod ships a new sshd host key. If the user's + # ~/.ssh/known_hosts still has an entry for [localhost]:${local_port} + # from a previous workflow, ssh aborts with "Host key for [localhost] + # :${local_port} has changed and you have requested strict checking", + # which the IDE surfaces as a generic "could not connect" error. + # + # The recommended ~/.ssh/config block for `airstack-osmo` uses + # `UserKnownHostsFile /dev/null`, which sidesteps this entirely — but + # users who set up before that change still have a stale entry on + # disk. Scrub it defensively on every osmo:ide invocation. ssh-keygen + # -R is idempotent: a no-op if the entry doesn't exist. + if command -v ssh-keygen >/dev/null 2>&1; then + ssh-keygen -R "[localhost]:${local_port}" >/dev/null 2>&1 || true + fi + # Reuse an existing forward if one is already listening (the user might # have run this from a second terminal, or osmo:foxglove already opened # a multi-port forward). 
Otherwise spawn one in the background and wait @@ -587,7 +565,7 @@ function register_osmo_commands { COMMAND_HELP["osmo:setup"]="One-time per-user OSMO credential setup (airlab-docker-registry, airlab-docker-login, airlab-nucleus)" COMMAND_HELP["osmo:up"]="Submit osmo/workflows/airstack-dev.yaml with your SSH pubkey injected (--pool POOL, --key PATH, --branch BRANCH)" - COMMAND_HELP["osmo:logs"]="Follow the workspace task logs (polls osmo workflow logs -t workspace -n 500; OSMO_LOGS_TASK / OSMO_LOGS_TAIL / OSMO_LOGS_INTERVAL override)" + COMMAND_HELP["osmo:logs"]="Follow the workspace task logs (osmo workflow logs -t workspace -n 500; OSMO_LOGS_TASK / OSMO_LOGS_TAIL override)" COMMAND_HELP["osmo:ide"]="Port-forward sshd (2200:22) and open VS Code/Cursor on Host airstack-osmo" COMMAND_HELP["osmo:webrtc"]="Port-forward Isaac Sim WebRTC ranges (TCP foreground + UDP background)" COMMAND_HELP["osmo:foxglove"]="Install AirStack Foxglove extensions locally, then port-forward GCS Foxglove websocket (8766:8766). Override target dir with OSMO_FOXGLOVE_EXT_DIR; skip install with OSMO_FOXGLOVE_SKIP_EXTENSIONS=1." diff --git a/docs/tutorials/airstack_on_osmo.md b/docs/tutorials/airstack_on_osmo.md index 8013e0715..cb2916b75 100644 --- a/docs/tutorials/airstack_on_osmo.md +++ b/docs/tutorials/airstack_on_osmo.md @@ -194,7 +194,16 @@ Host airstack-osmo HostName localhost Port 2200 User root - StrictHostKeyChecking accept-new + # Every OSMO workflow boots a fresh pod with a fresh sshd host key, so + # any saved fingerprint for [localhost]:2200 will be wrong on the next + # `airstack osmo:up`. Skip the host-key check here: this alias only + # connects via the local port-forward, so the security boundary is + # OSMO's authenticated control-plane tunnel — not the SSH fingerprint. + # /dev/null keeps known_hosts clean (no stale entries pile up); LogLevel + # ERROR silences the "Permanently added [localhost]:2200" banner. 
+ StrictHostKeyChecking no + UserKnownHostsFile /dev/null + LogLevel ERROR # SSH agent forwarding so `git push` from inside the pod uses your # local laptop's SSH key (the pod's sshd has AllowAgentForwarding yes # baked in by osmo/workspace/sshd_config). Without this, the pod has @@ -212,6 +221,18 @@ EOF The `localhost:2200` is what we'll port-forward to in step 4. +> **Already added the old block?** If your `~/.ssh/config` still has +> `StrictHostKeyChecking accept-new` for `airstack-osmo` from an earlier +> setup, replace it with the three lines above. As a one-time cleanup of +> the stale fingerprint left behind by previous pods, also run: +> +> ```bash +> ssh-keygen -R "[localhost]:2200" +> ``` +> +> `airstack osmo:ide` does this scrub for you on every run, so you only +> need it once when migrating. + > **Smoke-test the agent forward** once the pod is up: SSH in and run > `ssh-add -l` — you should see your local key listed. If you see "The > agent has no identities", run `ssh-add ~/.ssh/id_ed25519` on your @@ -282,15 +303,17 @@ spinning up — the bring-up will continue in the background. `airstack osmo:logs` (defined in [`.airstack/modules/osmo.sh`](https://github.com/castacks/AirStack/blob/main/.airstack/modules/osmo.sh) -as `cmd_osmo_logs`) polls the equivalent of: +as `cmd_osmo_logs`) just exec's: ```bash osmo workflow logs $WF -t workspace -n 500 ``` -every few seconds and prints only the new lines (the `osmo` CLI has no -native `--follow`). Override the task / tail length / poll interval with -`OSMO_LOGS_TASK`, `OSMO_LOGS_TAIL`, `OSMO_LOGS_INTERVAL` env vars. +The `osmo` CLI's `workflow logs` command prints the last N lines and then +keeps the stream open as new lines arrive (it already behaves like `tail +-f`, even though `--help` only documents `-n LAST_N_LINES`). Ctrl+C to +stop. Override the task / tail length with `OSMO_LOGS_TASK` / +`OSMO_LOGS_TAIL` env vars. 
diff --git a/osmo/README.md b/osmo/README.md index 09ac3b569..3a1b02a7f 100644 --- a/osmo/README.md +++ b/osmo/README.md @@ -138,7 +138,12 @@ osmo workflow submit osmo/workflows/airstack-dev.yaml \ # → record osmo workflow port-forward workspace --port 2200:22 --connect-timeout 86400 & -ssh -p 2200 -o StrictHostKeyChecking=accept-new root@localhost 'echo ok && whoami' +# StrictHostKeyChecking=no + UserKnownHostsFile=/dev/null because every +# fresh pod has a different sshd host key — the previous workflow's +# fingerprint will always look like a "host key changed" attack +# otherwise. +ssh -p 2200 -o StrictHostKeyChecking=no -o UserKnownHostsFile=/dev/null \ + root@localhost 'echo ok && whoami' # → "ok\nroot" ``` @@ -154,7 +159,13 @@ Host airstack-osmo HostName localhost Port 2200 User root - StrictHostKeyChecking accept-new + # Each fresh pod has a new sshd host key, so accept-new doesn't help + # — the second workflow always trips the "host key changed" check. + # Bypass host-key checks for this loopback alias only; the security + # boundary is OSMO's authenticated port-forward, not the local key. + StrictHostKeyChecking no + UserKnownHostsFile /dev/null + LogLevel ERROR ``` Then in VS Code: Command Palette → **Remote-SSH: Connect to Host…** → From 98b00ad991637b172ccf221ea41a18ddbd0920c2 Mon Sep 17 00:00:00 2001 From: Sebastian Scherer Date: Thu, 14 May 2026 20:38:59 -0400 Subject: [PATCH 10/13] fix(osmo): auto-pin --branch to local checkout + clean error UX when workflow dies MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The pod's entrypoint clones AirStack fresh from GitHub on every workflow start (the pod fs is ephemeral). 
It defaulted to `main`, so any developer testing branch-only OSMO changes silently ran their pod against stale `main` code — most visibly: COMPOSE_PROFILES=desktop,isaac-sim-livestream resolved to "desktop" alone on `main` because the isaac-sim-livestream service only exists on the feature branch, so isaac-sim never came up and `airstack osmo:webrtc` showed a blank stream. - cmd_osmo_up now defaults --branch to the local repo's current branch (git rev-parse --abbrev-ref HEAD). Detached HEAD or non-git checkouts fall back to `main` cleanly. Pass --branch explicitly to override. - New _osmo_check_branch_pushed warns up-front when the about-to- submit branch has no upstream, is ahead of origin, or has an uncommitted working tree. The pod doesn't see your laptop's edits. Separately, when an OSMO workflow gets canceled mid-flight (osmo:down in another shell, or OSMO timing it out), the in-flight port-forward and logs streams raise OSMOUserError("Workflow X is not running!") from inside an asyncio Task. The CLI prints "Task exception was never retrieved" + a multi-line Traceback that buries the actual one-line cause. New _osmo_pf_filter awk script collapses that into a single [ERROR] line pointing at `airstack osmo:up`. Wired into webrtc, foxglove, and logs. webrtc also gains a cleanup trap that kills the backgrounded UDP port-forward on EXIT/INT/TERM so we don't leak it against a dead workflow. Tutorial Step 2 documents the new --branch default and the "pod-clones-from-GitHub-not-your-laptop" gotcha. 
Co-authored-by: Cursor --- .airstack/modules/osmo.sh | 158 +++++++++++++++++++++++++++-- docs/tutorials/airstack_on_osmo.md | 22 +++- 2 files changed, 170 insertions(+), 10 deletions(-) diff --git a/.airstack/modules/osmo.sh b/.airstack/modules/osmo.sh index 4933ac5f4..053decbee 100755 --- a/.airstack/modules/osmo.sh +++ b/.airstack/modules/osmo.sh @@ -270,22 +270,95 @@ function _osmo_save_wf_id { log_info "Saved workflow id '$1' to ${OSMO_STATE_FILE}" } +# Helper: best-effort detection of the user's current AirStack branch so +# `airstack osmo:up` can default --branch to whatever the user is editing +# locally. Returns the branch name on stdout, or empty if we shouldn't +# auto-pin (detached HEAD, not a git repo, etc.). +# +# Why default to the local branch: the pod's entrypoint clones AirStack +# fresh from GitHub on every workflow start (the pod fs is ephemeral, so +# nothing else makes sense). If we don't tell it which branch, it +# defaults to `main` — and any developer testing branch-only OSMO +# changes (compose services, entrypoint tweaks, workflow yaml edits) +# silently runs against stale `main` code instead of their work. +# Defaulting to the local branch makes "edit on laptop, push, osmo:up" +# the natural workflow. +function _osmo_local_branch { + if ! command -v git >/dev/null 2>&1; then + return 0 + fi + local b + b="$(git -C "${PROJECT_ROOT}" rev-parse --abbrev-ref HEAD 2>/dev/null)" || return 0 + case "$b" in + ""|HEAD) return 0 ;; # detached HEAD or empty + esac + echo "$b" +} + +# Helper: warn if the about-to-submit branch isn't safely pushed. The +# pod clones from GitHub, so unpushed commits / dirty working tree don't +# make it into the pod even if the user thinks they did. Catching this +# before submit avoids a 60-90s "wait for pod, then realize" round trip. 
+function _osmo_check_branch_pushed { + local branch="$1" + command -v git >/dev/null 2>&1 || return 0 + local repo="${PROJECT_ROOT}" + [ -d "${repo}/.git" ] || return 0 + + local local_sha upstream_sha + local_sha="$(git -C "$repo" rev-parse "${branch}" 2>/dev/null)" || return 0 + + # Look for a remote-tracking branch first (the explicit upstream + # set by `git push -u`); fall back to origin/. + upstream_sha="$(git -C "$repo" rev-parse "${branch}@{upstream}" 2>/dev/null)" + if [ -z "$upstream_sha" ]; then + upstream_sha="$(git -C "$repo" rev-parse "origin/${branch}" 2>/dev/null)" + fi + + if [ -z "$upstream_sha" ]; then + log_warn "Branch '${branch}' has no upstream on origin — the pod's clone will fail. Run: git push -u origin ${branch}" + return 0 + fi + + if [ "$local_sha" != "$upstream_sha" ]; then + local ahead behind + ahead="$(git -C "$repo" rev-list --count "${upstream_sha}..${local_sha}" 2>/dev/null)" + behind="$(git -C "$repo" rev-list --count "${local_sha}..${upstream_sha}" 2>/dev/null)" + if [ "${ahead:-0}" -gt 0 ]; then + log_warn "Local '${branch}' is ${ahead} commit(s) ahead of origin/${branch} — the pod will clone the older origin tip. Run: git push" + fi + if [ "${behind:-0}" -gt 0 ]; then + log_info "Local '${branch}' is ${behind} commit(s) behind origin/${branch} (pod will clone the newer origin tip)." + fi + fi + + if [ -n "$(git -C "$repo" status --porcelain 2>/dev/null)" ]; then + log_warn "Working tree has uncommitted changes — the pod will not see them. Commit + push first if you want the pod to pick them up." + fi +} + # osmo:up — submit airstack-dev.yaml with the local pubkey injected. # # Usage: airstack osmo:up [--pool POOL] [--key PATH] [--branch BRANCH] +# +# --branch defaults to the local repo's current branch (or `main` if we +# can't detect one), and is passed through as AIRSTACK_BRANCH so the +# pod's entrypoint clones the matching code. Pass `--branch main` +# explicitly to override. 
function cmd_osmo_up { _osmo_check_cli || return 1 local pool="${OSMO_POOL:-}" local pubkey_file="" local branch="" + local branch_explicit=false local extra_args=() while [ $# -gt 0 ]; do case "$1" in --pool) pool="$2"; shift 2 ;; --key) pubkey_file="$2"; shift 2 ;; - --branch) branch="$2"; shift 2 ;; + --branch) branch="$2"; branch_explicit=true; shift 2 ;; *) extra_args+=("$1"); shift ;; esac done @@ -304,6 +377,19 @@ function cmd_osmo_up { return 1 fi + # Auto-pin --branch to the local checkout if the user didn't pass one. + if [ "$branch_explicit" = false ] && [ -z "$branch" ]; then + branch="$(_osmo_local_branch)" + if [ -n "$branch" ]; then + log_info "Auto-detected local branch '${branch}'; pod will clone from origin/${branch} (override with --branch main)." + else + log_info "Could not detect local branch (detached HEAD?); pod will clone from origin/main." + fi + fi + if [ -n "$branch" ]; then + _osmo_check_branch_pushed "$branch" + fi + local cmd=(osmo workflow submit "$workflow_yaml") if [ -n "$pool" ]; then cmd+=(--pool "$pool") @@ -370,7 +456,12 @@ function cmd_osmo_logs { log_info "Following ${task} logs for ${wf} (last ${lines} lines, then live; Ctrl+C to stop)" - osmo workflow logs "${wf}" -t "${task}" -n "${lines}" + # Filter stderr for the same OSMOUserError-when-workflow-dies case + # the port-forward path hits — same noisy asyncio Traceback + + # "Task exception was never retrieved" header. _osmo_pf_filter + # collapses it into one clean log line. + osmo workflow logs "${wf}" -t "${task}" -n "${lines}" \ + 2> >(_osmo_pf_filter "${wf}") } # osmo:ide — port-forward sshd + (optionally) launch VS Code/Cursor on the @@ -477,8 +568,49 @@ function cmd_osmo_ide { fi } +# Helper: filter `osmo workflow port-forward` stderr through awk to +# suppress the asyncio traceback that erupts whenever the workflow gets +# canceled mid-flight (e.g. via osmo:down in another shell, or because +# OSMO timed it out). 
The CLI raises OSMOUserError("Workflow X is not +# running!") from inside an asyncio Task, which then prints "Task +# exception was never retrieved" + a multi-line Traceback that obscures +# the actual one-line cause. We translate that into a single clean log +# line and drop everything else. +function _osmo_pf_filter { + local wf="$1" + awk -v WF="$wf" ' + /^Task exception was never retrieved/ { skipping=1; next } + /^future:/ { skipping=1; next } + /^Traceback \(most recent call last\):/ { skipping=1; next } + /^ File "/ { next } + /^src\.lib\.utils\.osmo_errors\.OSMOUserError/ { + sub(/^src\.lib\.utils\.osmo_errors\.OSMOUserError: */, "") + printf "\033[0;31m[ERROR]\033[0m %s (run `airstack osmo:up` to start a new workflow)\n", $0 + next + } + /OSMOUserError: Workflow .* is not running!/ { + printf "\033[0;31m[ERROR]\033[0m Workflow %s is no longer running (run `airstack osmo:up` to start a new one).\n", WF + next + } + skipping && /^$/ { skipping=0; next } + skipping { next } + { print } + ' >&2 +} + +# Helper: run `osmo workflow port-forward` with the noise filter +# attached. Returns the underlying exit code so callers can decide +# whether to retry / fail. Args after the helper name are passed to +# `osmo workflow port-forward` verbatim. +function _osmo_run_port_forward { + osmo workflow port-forward "$@" 2> >(_osmo_pf_filter "$1") +} + # osmo:webrtc — forward both Isaac Sim WebRTC port ranges (TCP in this -# terminal, spawn UDP in the background). +# terminal, spawn UDP in the background). Cleans up the UDP child on +# exit (Ctrl+C, foreground TCP failure, or the workflow disappearing +# mid-stream) so we don't leak a port-forward into the user's process +# table. 
function cmd_osmo_webrtc { _osmo_check_cli || return 1 local wf; wf="$(_osmo_wf_id)" || return 1 @@ -488,11 +620,25 @@ function cmd_osmo_webrtc { --port "$OSMO_WEBRTC_UDP" --udp \ --connect-timeout "$OSMO_PF_TIMEOUT" \ > "${OSMO_STATE_DIR}/webrtc-udp.log" 2>&1 & - log_info " UDP log: ${OSMO_STATE_DIR}/webrtc-udp.log (pid $!)" + local udp_pid=$! + log_info " UDP log: ${OSMO_STATE_DIR}/webrtc-udp.log (pid ${udp_pid})" + + # Tear the UDP fork down when this function exits, by any path. + # Without this, hitting Ctrl+C on the TCP foreground (or the + # workflow being canceled, which surfaces as the foreground exiting + # non-zero) leaves the UDP `osmo workflow port-forward` running + # against a dead workflow until the user notices and pkill's it. + trap ' + if kill -0 "'"${udp_pid}"'" 2>/dev/null; then + kill "'"${udp_pid}"'" 2>/dev/null + wait "'"${udp_pid}"'" 2>/dev/null + fi + trap - EXIT INT TERM + ' EXIT INT TERM log_info "Foreground TCP port-forward: ${OSMO_WEBRTC_TCP}" log_info "Open the Omniverse Streaming Client / WebRTC client at http://localhost" - osmo workflow port-forward "$wf" workspace \ + _osmo_run_port_forward "$wf" workspace \ --port "$OSMO_WEBRTC_TCP" \ --connect-timeout "$OSMO_PF_TIMEOUT" } @@ -535,7 +681,7 @@ function cmd_osmo_foxglove { log_info "Then in Foxglove Desktop: Open connection → ws://localhost:8766" log_info " Layouts → Import from file → ${ext_src}/airstack_default.json" log_info " (Restart Foxglove Desktop once if newly-installed panels still show as 'Unknown panel type'.)" - osmo workflow port-forward "$wf" workspace \ + _osmo_run_port_forward "$wf" workspace \ --port "$OSMO_FOXGLOVE_PORT" \ --connect-timeout "$OSMO_PF_TIMEOUT" } diff --git a/docs/tutorials/airstack_on_osmo.md b/docs/tutorials/airstack_on_osmo.md index cb2916b75..d4da917a1 100644 --- a/docs/tutorials/airstack_on_osmo.md +++ b/docs/tutorials/airstack_on_osmo.md @@ -248,10 +248,24 @@ From the AirStack clone: This submits 
[`osmo/workflows/airstack-dev.yaml`](https://github.com/castacks/AirStack/blob/main/osmo/workflows/airstack-dev.yaml) -with your local SSH pubkey injected as `SSH_PUB_KEY` — that's what -authorises **your** key on **this** workflow (each student passes their -own at submit time; the lab admin doesn't manage a global -`authorized_keys` file). +with two things injected: + +- your local SSH pubkey as `SSH_PUB_KEY` — that's what authorises + **your** key on **this** workflow (each student passes their own at + submit time; the lab admin doesn't manage a global `authorized_keys` + file). +- `AIRSTACK_BRANCH` set to your local repo's current branch — the pod + ignores your laptop's working tree (the pod is ephemeral and runs in a + different machine room) and clones AirStack fresh from GitHub on + every workflow start, so this is how it knows which branch to use. + Override with `--branch main` if you want the pod to track main even + while you're on a feature branch. + +> **The pod clones from GitHub, not your laptop.** Local edits (and +> commits you haven't pushed) won't make it into the pod. `airstack +> osmo:up` warns you up-front if your branch is ahead of origin or has +> uncommitted changes — `git push` first if you want the pod to pick +> them up.
`airstack osmo:up` prints a workflow id like `airstack-dev-1` and stores it in `~/.airstack/osmo-state`, so the rest of the `airstack osmo:*` From 17ca30d1b7be5d495fad6662092a0a43ef081b84 Mon Sep 17 00:00:00 2001 From: Sebastian Scherer Date: Thu, 14 May 2026 20:46:26 -0400 Subject: [PATCH 11/13] perf(osmo): bump inner dockerd concurrency to saturate 10 GbE pulls MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit dockerd's defaults of --max-concurrent-downloads=3 / --max-concurrent-uploads=5 cap a fresh airstack-dev pod's image-pull at ~300 MiB/s against the airlab-backup-10g registry — single-stream TLS tops out around 300-500 MiB/s per core, and three parallel streams of unevenly sized blobs serialize down to that ceiling. Ceph (1014 TiB, 92 OSDs, SSD pools) and 10 GbE both have far more headroom than that. Bump to 10/10 to overlap enough blob downloads to saturate the pipe. Threaded through the DOCKERD_MAX_DOWNLOADS / DOCKERD_MAX_UPLOADS env vars so a pool can be tuned at submit time without rebuilding the workspace image. Workspace image needs a rebuild + push for this to take effect: cd osmo/workspace docker build -t airlab-docker.andrew.cmu.edu/airstack/airstack-osmo-workspace:latest . docker push airlab-docker.andrew.cmu.edu/airstack/airstack-osmo-workspace:latest Co-authored-by: Cursor --- osmo/workspace/entrypoint.sh | 17 ++++++++++++++++- 1 file changed, 16 insertions(+), 1 deletion(-) diff --git a/osmo/workspace/entrypoint.sh b/osmo/workspace/entrypoint.sh index 67068fde9..1d67aecd1 100755 --- a/osmo/workspace/entrypoint.sh +++ b/osmo/workspace/entrypoint.sh @@ -80,15 +80,30 @@ log "diagnostics: /var/lib/docker fs=$(stat -fc %T /var/lib/docker 2>/dev/null | # present) → vfs (always works, slowest). Falling back avoids the # overlay-on-overlay failure that bites DinD on some kernel/storage # combinations. +# +# Concurrency: dockerd's defaults are --max-concurrent-downloads=3 and +# --max-concurrent-uploads=5.
With 2 GB+ AirStack image blobs on a 10 GbE +# pool, a single TLS pull stream tops out around 300-500 MiB/s (CPU-bound +# on the registry-side TLS encryption), so 3 parallel streams cap the +# whole bring-up around the 300 MiB/s mark seen empirically against the +# airlab-backup-10g registry — even though Ceph + 10 GbE can do far more. +# Bumping to 10 streams overlaps blob downloads enough to saturate the +# pipe without overwhelming the registry. Override with DOCKERD_MAX_* +# env vars at submit time if a particular pool needs different tuning. +DOCKERD_MAX_DOWNLOADS="${DOCKERD_MAX_DOWNLOADS:-10}" +DOCKERD_MAX_UPLOADS="${DOCKERD_MAX_UPLOADS:-10}" + _start_dockerd() { local driver="$1" : > /var/log/dockerd.log # We rely on /var/lib/docker being a bind-mount of /mnt/airstack-data # (200Gi Cinder volume) so during-pull disk peaks aren't constrained by - # node ephemeral-storage. Default --max-concurrent-downloads=3 is fine. + # node ephemeral-storage. nohup dockerd \ --host=unix:///var/run/docker.sock \ --storage-driver="$driver" \ + --max-concurrent-downloads="$DOCKERD_MAX_DOWNLOADS" \ + --max-concurrent-uploads="$DOCKERD_MAX_UPLOADS" \ > /var/log/dockerd.log 2>&1 & DOCKERD_PID=$! log "dockerd started (pid=$DOCKERD_PID, storage-driver=$driver); waiting for socket" From 838ec7dba40f030c4b3ecf45a233f680b1409abe Mon Sep 17 00:00:00 2001 From: Sebastian Scherer Date: Thu, 14 May 2026 20:54:28 -0400 Subject: [PATCH 12/13] docs(osmo): require buildx --platform linux/amd64 for workspace image MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit A plain `docker build && docker push` on an Apple Silicon Mac silently produces a linux/arm64-only `latest` manifest. OSMO workers are amd64, so every subsequent workflow fails at the outer pod-image pull with "no match for platform in manifest" before the entrypoint even runs — a confusing failure mode whose root cause lives entirely in the push, not in the workflow yaml or the entrypoint. 
Switch the README and the Dockerfile docstring to the buildx form, explain the why, and document the post-push manifest check. Co-authored-by: Cursor --- osmo/README.md | 39 ++++++++++++++++++++++++++++++++++----- osmo/workspace/Dockerfile | 13 ++++++++++--- 2 files changed, 44 insertions(+), 8 deletions(-) diff --git a/osmo/README.md b/osmo/README.md index 3a1b02a7f..a9e33b41c 100644 --- a/osmo/README.md +++ b/osmo/README.md @@ -96,19 +96,48 @@ all three `osmo credential set` commands. The workspace image is built once and pushed to the AirLab registry; students never build it themselves. +> **Always use `docker buildx build --platform linux/amd64 --push`.** +> OSMO pool workers are linux/amd64. Building with plain `docker build` on +> an Apple Silicon Mac silently produces a `linux/arm64` image and the +> resulting `latest` tag will fail every workflow with +> `no match for platform in manifest ...: not found` (the outer pod's +> image-pull bails before the entrypoint even runs). Forcing `--platform +> linux/amd64` cross-compiles for amd64 even on an arm64 host. `--push` +> uploads the result straight to the registry; without `--push` (or +> `--load`) a `docker buildx create` builder leaves the image only in +> its build cache. Linux/amd64 admins can use plain `docker build && docker push`. + ```bash cd osmo/workspace -docker build -t airlab-docker.andrew.cmu.edu/airstack/airstack-osmo-workspace:latest . -docker push airlab-docker.andrew.cmu.edu/airstack/airstack-osmo-workspace:latest + +# One-time builder setup (skip if `docker buildx ls` already shows a builder): +docker buildx create --use --name airstack-builder + +docker buildx build \ + --platform linux/amd64 \ + -t airlab-docker.andrew.cmu.edu/airstack/airstack-osmo-workspace:latest \ + --push \ + .
+``` + +Verify the manifest has `linux/amd64` after pushing: + +```bash +docker manifest inspect airlab-docker.andrew.cmu.edu/airstack/airstack-osmo-workspace:latest \ + | grep -A2 architecture +# → "architecture": "amd64" ``` Tag a versioned release alongside `latest` if you change anything in `Dockerfile`, `sshd_config`, or `entrypoint.sh`: ```bash -docker tag airlab-docker.andrew.cmu.edu/airstack/airstack-osmo-workspace:latest \ - airlab-docker.andrew.cmu.edu/airstack/airstack-osmo-workspace:v0.1.0 -docker push airlab-docker.andrew.cmu.edu/airstack/airstack-osmo-workspace:v0.1.0 +docker buildx build \ + --platform linux/amd64 \ + -t airlab-docker.andrew.cmu.edu/airstack/airstack-osmo-workspace:latest \ + -t airlab-docker.andrew.cmu.edu/airstack/airstack-osmo-workspace:v0.1.0 \ + --push \ + . ``` Then update the `image:` field in diff --git a/osmo/workspace/Dockerfile b/osmo/workspace/Dockerfile index e64d8662c..e80f3be59 100644 --- a/osmo/workspace/Dockerfile +++ b/osmo/workspace/Dockerfile @@ -6,10 +6,17 @@ # Sim, robot-desktop, GCS) on the GPU forwarded into the pod. # # Built and pushed by the lab admin (see osmo/README.md): -# docker build -t airlab-docker.andrew.cmu.edu/airstack/airstack-osmo-workspace:latest . -# docker push airlab-docker.andrew.cmu.edu/airstack/airstack-osmo-workspace:latest # -# Students never build this image. +# cd osmo/workspace +# docker buildx build --platform linux/amd64 \ +# -t airlab-docker.andrew.cmu.edu/airstack/airstack-osmo-workspace:latest \ +# --push . +# +# Use `docker buildx build --platform linux/amd64 --push` (not plain +# `docker build && docker push`) so an Apple Silicon Mac doesn't silently +# push an arm64 image; OSMO workers are amd64 and would fail every +# workflow with "no match for platform in manifest". Students never +# build this image. 
FROM ubuntu:24.04 From 2dbfdf486ea9dff18df4bdc4b0e26e019e52692f Mon Sep 17 00:00:00 2001 From: Sebastian Scherer Date: Thu, 14 May 2026 21:33:14 -0400 Subject: [PATCH 13/13] perf(osmo): move dockerd data-root to /osmo/run for native overlay2 MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The OSMO pod's `/` is itself a containerd overlay snapshot, and Linux refuses to stack a second overlayfs on top of an overlay rootfs — which is why the inner dockerd was falling through to fuse-overlayfs. That costs a kernel↔userspace FUSE round-trip on every `creat()` during layer extraction, which murders throughput on apt/pip/ROS layers (measured: 32-50 MB/s for small-file-heavy layers vs 480 MB/s for big-file layers in the same pull). Pointing dockerd at /osmo/run/docker (the kubelet emptyDir backed by ext4 on /dev/vda3) lets the existing overlay2-first fallback chain actually succeed on its first try, restoring kernel-overlay extraction performance. emptyDir lifetime matches the workflow lifetime, so the docker layer cache gets the right scope automatically. Falls back to /var/lib/docker if /osmo/run isn't present so the image still works in non-OSMO test contexts. Co-authored-by: Cursor --- osmo/workspace/entrypoint.sh | 28 ++++++++++++++++++++++++---- 1 file changed, 24 insertions(+), 4 deletions(-) diff --git a/osmo/workspace/entrypoint.sh b/osmo/workspace/entrypoint.sh index 1d67aecd1..d119016bf 100755 --- a/osmo/workspace/entrypoint.sh +++ b/osmo/workspace/entrypoint.sh @@ -81,6 +81,28 @@ log "diagnostics: /var/lib/docker fs=$(stat -fc %T /var/lib/docker 2>/dev/null | # overlay-on-overlay failure that bites DinD on some kernel/storage # combinations. # +# data-root: the OSMO pod's `/` is itself an overlay (containerd's +# snapshot), and Linux refuses to stack a second overlayfs on top of an +# overlay rootfs — that's exactly why dockerd here used to fall through +# to fuse-overlayfs. 
fuse-overlayfs is a userspace FUSE driver, and every +# `creat()` during layer extraction pays a kernel↔userspace round-trip, +# which crushes throughput on the apt/pip/ROS layers (observed: ~30-50 +# MB/s vs. ~480 MB/s on layers with few large files). Pointing data-root +# at /osmo/run/docker (the kubelet emptyDir bind-mount, backed by ext4 on +# /dev/vda3) lets us use kernel overlay2 instead, restoring the 10× +# extraction speed-up. emptyDir lives for the workflow's lifetime, which +# is exactly the docker-cache lifetime we want anyway. +DOCKERD_DATA_ROOT="${DOCKERD_DATA_ROOT:-}" +if [ -z "$DOCKERD_DATA_ROOT" ]; then + if [ -d /osmo/run ] && [ -w /osmo/run ]; then + DOCKERD_DATA_ROOT=/osmo/run/docker + else + DOCKERD_DATA_ROOT=/var/lib/docker + fi +fi +mkdir -p "$DOCKERD_DATA_ROOT" +log "dockerd data-root: $DOCKERD_DATA_ROOT (fs=$(stat -fc %T "$DOCKERD_DATA_ROOT" 2>/dev/null))" + # Concurrency: dockerd's defaults are --max-concurrent-downloads=3 and # --max-concurrent-uploads=5. With 2 GB+ AirStack image blobs on a 10 GbE # pool, a single TLS pull stream tops out around 300-500 MiB/s (CPU-bound @@ -96,17 +118,15 @@ DOCKERD_MAX_UPLOADS="${DOCKERD_MAX_UPLOADS:-10}" _start_dockerd() { local driver="$1" : > /var/log/dockerd.log - # We rely on /var/lib/docker being a bind-mount of /mnt/airstack-data - # (200Gi Cinder volume) so during-pull disk peaks aren't constrained by - # node ephemeral-storage. nohup dockerd \ --host=unix:///var/run/docker.sock \ + --data-root="$DOCKERD_DATA_ROOT" \ --storage-driver="$driver" \ --max-concurrent-downloads="$DOCKERD_MAX_DOWNLOADS" \ --max-concurrent-uploads="$DOCKERD_MAX_UPLOADS" \ > /var/log/dockerd.log 2>&1 & DOCKERD_PID=$! - log "dockerd started (pid=$DOCKERD_PID, storage-driver=$driver); waiting for socket" + log "dockerd started (pid=$DOCKERD_PID, data-root=$DOCKERD_DATA_ROOT, storage-driver=$driver); waiting for socket" for i in $(seq 1 30); do if docker info >/dev/null 2>&1; then log "dockerd ready (storage-driver=$driver)"