diff --git a/.github/scripts/heap_snapshots.sh b/.github/scripts/heap_snapshots.sh
new file mode 100755
index 00000000..8f216ca1
--- /dev/null
+++ b/.github/scripts/heap_snapshots.sh
@@ -0,0 +1,83 @@
+#!/usr/bin/env bash
+# Periodically capture live-heap snapshots of a running process so a reachable
+# (non-freed but still-referenced) memory growth can be attributed to a type and
+# allocation stack. `leaks` only reports *unreachable* blocks and runs at exit;
+# this instead samples the live heap mid-run, which is where the PlatformAudio
+# retention shows up.
+#
+# Two complementary tools are used (both require inspecting another task, so they
+# run under `sudo -n`; GitHub macOS runners allow passwordless sudo):
+#   - `heap`           : summary of live allocations grouped by type/class/binary.
+#                        Diffing successive summaries shows which category grows.
+#   - `malloc_history` : per-allocation backtraces (needs MallocStackLogging in
+#                        the target); captured on every tick, but truncated to
+#                        the largest entries because the full output is large.
+#
+# Self-terminates when the target process exits or after max_snaps snapshots.
+set -uo pipefail
+
+pattern=${1:?usage: heap_snapshots.sh <pattern> <outdir> [interval_sec] [max_snaps]}
+outdir=${2:?usage: heap_snapshots.sh <pattern> <outdir> [interval_sec] [max_snaps]}
+interval=${3:-25}   # seconds to sleep between snapshots
+max_snaps=${4:-10}  # upper bound on the number of snapshots taken
+
+mkdir -p "${outdir}"
+self=$$
+
+if [[ "$(uname -s)" != "Darwin" ]]; then
+  echo "heap_snapshots: only supported on macOS" >&2
+  exit 0
+fi
+
+# Probe once for passwordless sudo; without it the capture commands are skipped.
+sudo_ok=0
+if sudo -n true >/dev/null 2>&1; then sudo_ok=1; fi
+if (( ! sudo_ok )); then
+  echo "heap_snapshots: passwordless sudo unavailable; heap/malloc_history need it" >&2
+fi
+
+# Pick the matching PID with the largest RSS (the instrumented test binary), so
+# we never attach to this script or the run_tests wrapper shell.
+pick_target() {
+  # RSS floor in KB (same knob and default as sample_process_resources.sh).
+  local best="" best_rss=0 p rss min_rss=${SAMPLER_RSS_THRESHOLD_KB:-50000}
+  for p in $(pgrep -f "${pattern}" 2>/dev/null); do
+    [[ "${p}" == "${self}" ]] && continue
+    rss=$(ps -o rss= -p "${p}" 2>/dev/null | tr -d ' ')
+    [[ -z "${rss}" ]] && continue  # process vanished between pgrep and ps
+    if (( rss >= min_rss && rss > best_rss )); then best_rss=${rss}; best=${p}; fi
+  done
+  echo "${best}"
+}
+# Wait up to ~120s for a match above the RSS floor. `pgrep -f` also matches
+# wrapper shells whose argv contains the pattern; the floor filters those out.
+pid=""
+for _ in $(seq 1 120); do
+  pid=$(pick_target)
+  [[ -n "${pid}" ]] && break
+  sleep 1
+done
+if [[ -z "${pid}" ]]; then
+  echo "heap_snapshots: process matching '${pattern}' never appeared" >&2
+  exit 0
+fi
+echo "heap_snapshots: tracking pid ${pid} (pattern '${pattern}')" >&2
+
+snap=0
+while kill -0 "${pid}" 2>/dev/null && (( snap < max_snaps )); do
+  sleep "${interval}"
+  kill -0 "${pid}" 2>/dev/null || break  # target exited while we slept
+  snap=$((snap + 1))
+  rss=$(ps -o rss= -p "${pid}" 2>/dev/null | tr -d ' ')
+  ts=$(date -u +%H%M%S)
+  label=$(printf '%02d_t%s_rss%sk' "${snap}" "${ts}" "${rss:-0}")
+  echo "heap_snapshots: snapshot ${label}" >&2
+
+  if (( sudo_ok )); then
+    sudo -n heap "${pid}" > "${outdir}/heap-${label}.txt" 2>&1 || true
+    # malloc_history -allBySize sorts largest-first, so head keeps the biggest
+    # offenders while bounding artifact size. Capture every tick so we always
+    # have stacks even if the process exits/hangs before max_snaps.
+    sudo -n malloc_history "${pid}" -allBySize 2>/dev/null \
+      | head -400 > "${outdir}/mhist-${label}.txt" || true
+  fi
+done
+
+echo "heap_snapshots: done (${snap} snapshots) for pid ${pid}" >&2
diff --git a/.github/scripts/run_tests_with_backtrace.sh b/.github/scripts/run_tests_with_backtrace.sh
new file mode 100755
index 00000000..8765c59c
--- /dev/null
+++ b/.github/scripts/run_tests_with_backtrace.sh
@@ -0,0 +1,220 @@
+#!/usr/bin/env bash
+# Run a test binary under debug CI. On fatal signals, print post-mortem
+# backtraces from core dumps when available. Linux also runs under catchsegv
+# so a partial backtrace appears in the log even without a core file.
+#
+# When LIVEKIT_TEST_STALL_SECONDS is set to a positive integer, a watchdog
+# monitors test output and dumps live thread backtraces if the log goes silent
+# for that many seconds (integration-test hang diagnostics on linux-x64).
+set -uo pipefail
+
+usage() {
+  echo "Usage: $0 <test-binary> [gtest-args...]" >&2
+  exit 2
+}
+
+[[ $# -ge 1 ]] || usage
+
+binary=$1
+shift
+
+if [[ ! -x "$binary" ]]; then
+  echo "Error: not executable: $binary" >&2
+  exit 2
+fi
+
+binary_abs=$(cd "$(dirname "$binary")" && pwd)/$(basename "$binary")
+core_dir="${RUNNER_TEMP:-/tmp}/livekit-test-cores"
+mkdir -p "$core_dir"
+
+ulimit -c unlimited || true
+
+if [[ "$(uname -s)" == "Linux" ]]; then
+  echo "${core_dir}/core.%e.%p" | sudo tee /proc/sys/kernel/core_pattern >/dev/null || true
+fi
+
+if [[ "$(uname -s)" == "Darwin" ]]; then
+  # Core size limit was already raised above; set kern.coredump and prepare /cores.
+  sudo sysctl -w kern.coredump=1 >/dev/null 2>&1 || true
+  sudo mkdir -p /cores 2>/dev/null || true
+  sudo chmod 1777 /cores 2>/dev/null || true
+fi
+# Print the first 200 lines of up to 3 matching .ips reports per report directory.
+dump_macos_crash_reports() {
+  local binary_name
+  binary_name=$(basename "${binary_abs}")
+  echo "=== macOS DiagnosticReports for ${binary_name} ==="
+  local found=0
+  for report_dir in "${HOME}/Library/Logs/DiagnosticReports" "/Library/Logs/DiagnosticReports"; do
+    if [[ ! -d "${report_dir}" ]]; then
+      continue
+    fi
+    while IFS= read -r report; do
+      found=1
+      echo "Crash report: ${report}"
+      # .ips files are JSON-ish; print the first 200 lines for the CI log.
+      head -n 200 "${report}" || true
+    done < <(find "${report_dir}" -maxdepth 1 -name "${binary_name}*.ips" -type f -print 2>/dev/null | sort -r | head -3)
+  done
+  if ((found == 0)); then
+    echo "No DiagnosticReports .ips found for ${binary_name}"
+  fi
+}
+# Dump all-thread backtraces of a live process: gdb on Linux, sample + lldb on macOS.
+dump_live_backtraces() {
+  local test_pid=$1
+  local reason=$2
+
+  echo "=== live backtrace diagnostics (${reason}, pid ${test_pid}) ==="
+
+  if [[ "$(uname -s)" == "Linux" ]]; then
+    if command -v gdb >/dev/null 2>&1; then
+      gdb -batch \
+        -ex 'set pagination off' \
+        -ex 'thread apply all bt full' \
+        -p "${test_pid}" || true
+    else
+      echo "gdb not available; install gdb for live backtraces"
+    fi
+    return 0
+  fi
+
+  if [[ "$(uname -s)" == "Darwin" ]]; then
+    if command -v sample >/dev/null 2>&1; then
+      sample "${test_pid}" 5 -mayDie 2>&1 || true
+    fi
+    if command -v lldb >/dev/null 2>&1; then
+      lldb -p "${test_pid}" --batch -o 'thread backtrace all' -o 'detach' -o 'quit' 2>&1 || true
+    else
+      echo "lldb not available"
+    fi
+  fi
+}
+# Post-mortem: find a core file, print all-thread backtraces, copy it into core_dir.
+dump_backtraces() {
+  local test_pid=$1
+  local status=$2
+
+  echo "=== crash diagnostics (exit status ${status}, pid ${test_pid}) ==="
+
+  if [[ "$(uname -s)" == "Linux" ]]; then
+    local core=""
+    core=$(find "$core_dir" -maxdepth 1 -name 'core.*' -type f 2>/dev/null | sort -r | head -1)
+    if [[ -z "$core" ]]; then
+      core=$(find /tmp -maxdepth 1 -name 'core.*' -type f 2>/dev/null | sort -r | head -1)
+    fi
+    if [[ -n "$core" && -f "$core" ]]; then
+      echo "Core file: ${core}"
+      if command -v gdb >/dev/null 2>&1; then
+        gdb -batch \
+          -ex 'set pagination off' \
+          -ex 'thread apply all bt full' \
+          "${binary_abs}" "${core}" || true
+      else
+        echo "gdb not available; install gdb for post-mortem backtraces"
+      fi
+      cp -a "${core}" "${core_dir}/" 2>/dev/null || true
+      basename "${core}" >"${core_dir}/last-core.name"
+    else
+      echo "No core file found under ${core_dir} or /tmp"
+    fi
+    return 0
+  fi
+
+  if [[ "$(uname -s)" == "Darwin" ]]; then
+    local core=""
+    for candidate in "/cores/core.${test_pid}" "/cores/core.${test_pid}.dump"; do
+      if [[ -f "${candidate}" ]]; then
+        core=${candidate}
+        break
+      fi
+    done
+    if [[ -z "$core" ]]; then
+      core=$(find /cores -maxdepth 1 -name "core.*" -type f 2>/dev/null | sort -r | head -1)
+    fi
+    if [[ -n "$core" && -f "$core" ]]; then
+      echo "Core file: ${core}"
+      if command -v lldb >/dev/null 2>&1; then
+        lldb -b -c "${core}" -o 'thread backtrace all' -o 'quit' -- "${binary_abs}" || true
+      else
+        echo "lldb not available"
+      fi
+      cp -a "${core}" "${core_dir}/" 2>/dev/null || true
+      basename "${core}" >"${core_dir}/last-core.name"
+    else
+      echo "No core file found under /cores for pid ${test_pid}"
+    fi
+    dump_macos_crash_reports
+  fi
+}
+# Only call with a trailing `&`: exec replaces the background subshell so $! is the test (or catchsegv) PID.
+run_test() {
+  if [[ "$(uname -s)" == "Linux" ]] && command -v catchsegv >/dev/null 2>&1; then
+    exec catchsegv "${binary_abs}" "$@"
+  else
+    exec "${binary_abs}" "$@"
+  fi
+}
+# Start a background poller (5s period) that aborts the test once the log stops growing; prints its PID.
+start_stall_watchdog() {
+  local test_pid=$1
+  local log_file=$2
+  local stall_limit=$3
+  # Diagnostics go to stderr: stdout is the caller's $(...) pipe and must carry only the PID.
+  (
+    local last_size=-1
+    local stall=0
+    while kill -0 "${test_pid}" 2>/dev/null; do
+      local size
+      size=$(wc -c <"${log_file}" 2>/dev/null || echo 0)
+      if [[ "${size}" == "${last_size}" ]]; then
+        stall=$((stall + 5))
+      else
+        stall=0
+        last_size=${size}
+      fi
+      if ((stall >= stall_limit)); then
+        echo "=== TEST HANG DETECTED: no output for ${stall}s (pid ${test_pid}) ==="
+        echo "--- last log lines ---"
+        tail -n 40 "${log_file}" || true
+        dump_live_backtraces "${test_pid}" "stall ${stall}s"
+        kill -ABRT "${test_pid}" 2>/dev/null || kill -TERM "${test_pid}" 2>/dev/null || true
+        break
+      fi
+      sleep 5
+    done
+  ) >&2 &
+  echo $!
+}
+
+stall_limit=${LIVEKIT_TEST_STALL_SECONDS:-0}
+log_file="${RUNNER_TEMP:-/tmp}/livekit-test-output.log"
+
+set +e
+if ((stall_limit > 0)); then
+  : >"${log_file}"
+  run_test "$@" >"${log_file}" 2>&1 &
+  test_pid=$!
+  watchdog_pid=$(start_stall_watchdog "${test_pid}" "${log_file}" "${stall_limit}")
+  wait "${test_pid}"
+  status=$?
+  kill "${watchdog_pid}" 2>/dev/null || true
+  wait "${watchdog_pid}" 2>/dev/null || true
+  cat "${log_file}"
+else
+  run_test "$@" &
+  test_pid=$!
+  wait "${test_pid}"
+  status=$?
+fi
+set -e
+
+if ((status > 128)); then
+  signal=$((status - 128))
+  echo "Test process ${test_pid} terminated by signal ${signal}"
+  dump_backtraces "${test_pid}" "${status}"
+elif ((status != 0)); then
+  echo "Test process exited with status ${status}"
+fi
+
+exit "${status}"
diff --git a/.github/scripts/sample_process_resources.sh b/.github/scripts/sample_process_resources.sh
new file mode 100755
index 00000000..68be3732
--- /dev/null
+++ b/.github/scripts/sample_process_resources.sh
@@ -0,0 +1,99 @@
+#!/usr/bin/env bash
+# Periodically sample resource usage of a process matched by name and emit CSV.
+#
+# Tracks the metrics that reveal a native teardown/recreate leak in the
+# PlatformAudio triage: resident memory, OS thread count, open file
+# descriptors, and (macOS) mach-port count. Mach ports are the tell for a
+# CoreAudio HAL client leak -- each ADM Init talks to coreaudiod over a mach
+# port, so a port count that climbs across dispose/recreate cycles points at an
+# ADM Terminate() that is not fully releasing HAL resources.
+#
+# Self-terminates when the target process exits, so it can be safely launched
+# in the background ahead of a test run.
+set -uo pipefail
+
+pattern=${1:?usage: sample_process_resources.sh <pattern> <out_csv> [interval_sec]}
+out=${2:?usage: sample_process_resources.sh <pattern> <out_csv> [interval_sec]}
+interval=${3:-3}
+
+echo "iso_time,elapsed_s,pid,rss_kb,threads,fds,mach_ports" > "${out}"
+
+# Resolve the target PID. `pgrep -f` also matches this script (its own argv
+# contains the pattern) and the run_tests wrapper shell, so picking head -1 grabs
+# the wrong, idle process. Instead choose the matching PID with the largest RSS:
+# the instrumented test binary uses orders of magnitude more memory than any
+# shell, which disambiguates it reliably. Exclude this script's own PID.
+self=$$
+RSS_THRESHOLD_KB=${SAMPLER_RSS_THRESHOLD_KB:-50000}
+
+pick_target() {
+  local best="" best_rss=0 p rss
+  for p in $(pgrep -f "${pattern}" 2>/dev/null); do
+    [[ "${p}" == "${self}" ]] && continue
+    rss=$(ps -o rss= -p "${p}" 2>/dev/null | tr -d ' ')
+    [[ -z "${rss}" ]] && continue  # process vanished between pgrep and ps
+    if (( rss > best_rss )); then best_rss=${rss}; best=${p}; fi
+  done
+  echo "${best} ${best_rss}"  # prints " 0" when nothing matched
+}
+
+# Wait up to 120s for the real binary (RSS over threshold) to come up. Fall back
+# to the largest match seen if nothing crosses the threshold before timeout.
+pid=""
+for _ in $(seq 1 120); do
+  read -r cand cand_rss <<< "$(pick_target)"
+  # A no-match " 0" reads as cand=0 with an empty RSS; require both so PID 0 is never tracked.
+  if [[ -n "${cand}" && -n "${cand_rss}" ]]; then
+    pid=${cand}
+    (( cand_rss >= RSS_THRESHOLD_KB )) && break
+  fi
+  sleep 1
+done
+if [[ -z "${pid}" ]]; then
+  echo "sampler: process matching '${pattern}' never appeared" >&2
+  exit 0
+fi
+echo "sampler: tracking pid ${pid} (pattern '${pattern}')" >&2
+is_macos=0
+[[ "$(uname -s)" == "Darwin" ]] && is_macos=1
+
+# mach-port counting needs lsmp + root to inspect another task. GitHub macOS
+# runners allow passwordless sudo; `sudo -n` fails fast (no prompt) elsewhere,
+# in which case the mach_ports column is left blank rather than a misleading 0.
+mach_ports_cmd=""
+if (( is_macos )) && command -v lsmp >/dev/null 2>&1; then
+  if lsmp -p "$$" >/dev/null 2>&1; then
+    mach_ports_cmd="lsmp -p"
+  elif sudo -n true >/dev/null 2>&1; then
+    mach_ports_cmd="sudo -n lsmp -p"
+  fi
+fi
+
+start=$(date +%s)
+while kill -0 "${pid}" 2>/dev/null; do
+  now=$(date +%s)
+  elapsed=$((now - start))
+  ts=$(date -u +%Y-%m-%dT%H:%M:%SZ)
+
+  rss=$(ps -o rss= -p "${pid}" 2>/dev/null | tr -d ' ')
+
+  if (( is_macos )); then
+    # macOS: ps -M lists one line per thread (plus a header line).
+    threads=$(ps -M -p "${pid}" 2>/dev/null | tail -n +2 | grep -c . || true)
+    if [[ -n "${mach_ports_cmd}" ]]; then
+      mach_ports=$(${mach_ports_cmd} "${pid}" 2>/dev/null | grep -c -E 'send|recv|port set|dead' || true)
+    else
+      mach_ports=""
+    fi
+  else
+    threads=$(ps -o nlwp= -p "${pid}" 2>/dev/null | tr -d ' ')
+    mach_ports=""
+  fi
+
+  fds=$(lsof -p "${pid}" 2>/dev/null | tail -n +2 | grep -c . || true)
+
+  echo "${ts},${elapsed},${pid},${rss:-},${threads:-},${fds:-},${mach_ports:-}" >> "${out}"
+  sleep "${interval}"
+done
+
+echo "sampler: process ${pid} exited; samples written to ${out}" >&2
diff --git a/.github/scripts/stage_crash_diagnostics.sh b/.github/scripts/stage_crash_diagnostics.sh
new file mode 100755
index 00000000..47cb1835
--- /dev/null
+++ b/.github/scripts/stage_crash_diagnostics.sh
@@ -0,0 +1,47 @@
+#!/usr/bin/env bash
+# Collect unstripped test binaries, shared libraries, and core dumps for upload.
+set -euo pipefail
+
+build_dir=${1:?usage: stage_crash_diagnostics.sh <build_dir>}
+staging="${RUNNER_TEMP:-/tmp}/crash-diagnostics"  # same /tmp fallback as run_tests_with_backtrace.sh
+
+rm -rf "${staging}"
+mkdir -p "${staging}/bin" "${staging}/lib" "${staging}/cores"
+
+shopt -s nullglob
+for bin in "${build_dir}"/bin/livekit_*; do
+  if [[ -f "${bin}" && -x "${bin}" ]]; then
+    cp -a "${bin}" "${staging}/bin/"
+  fi
+done
+
+for lib in "${build_dir}"/lib/liblivekit.*; do
+  if [[ -f "${lib}" ]]; then
+    cp -a "${lib}" "${staging}/lib/"
+  fi
+done
+
+while IFS= read -r -d '' ffi_lib; do
+  cp -a "${ffi_lib}" "${staging}/lib/"
+done < <(find client-sdk-rust/target/debug -name 'liblivekit_ffi.*' -print0 2>/dev/null)
+
+core_dir="${RUNNER_TEMP:-/tmp}/livekit-test-cores"
+if [[ -d "${core_dir}" ]]; then
+  find "${core_dir}" -maxdepth 1 -name 'core.*' -type f -exec cp -a {} "${staging}/cores/" \; 2>/dev/null || true
+fi
+
+if [[ "$(uname -s)" == "Darwin" && -d /cores ]]; then
+  find /cores -maxdepth 1 -name 'core.*' -type f -exec cp -a {} "${staging}/cores/" \; 2>/dev/null || true
+fi
+
+if [[ "$(uname -s)" == "Darwin" ]]; then
+  mkdir -p "${staging}/crash-reports"
+  for report_dir in "${HOME}/Library/Logs/DiagnosticReports" "/Library/Logs/DiagnosticReports"; do
+    if [[ -d "${report_dir}" ]]; then
+      find "${report_dir}" -maxdepth 1 -name '*.ips' -type f -exec cp -a {} "${staging}/crash-reports/" \; 2>/dev/null || true
+    fi
+  done
+fi
+
+echo "Staged crash diagnostics under ${staging}:"
+find "${staging}" -type f -print
diff --git a/.github/workflows/platform-audio-triage.yml b/.github/workflows/platform-audio-triage.yml
new file mode 100644
index 00000000..3d0a9fe3
--- /dev/null
+++ b/.github/workflows/platform-audio-triage.yml
@@ -0,0 +1,456 @@
+name: PlatformAudio Triage
+
+# Focused, mac-only crash-hunting harness for the PlatformAudio instability.
+# It builds ONLY the integration test binary and runs the PlatformAudio cases
+# in a tight, high-repeat loop with backtrace + crash-report capture.
+#
+# The instability reproduces on BOTH macOS architectures (Intel x64 hits an
+# assertion timeout / no-frames; Apple Silicon arm64 has been seen to SIGSEGV),
+# so by default this fans out across one Intel and one arm64 runner.
+#
+# Two arms run so a failure is diagnostic, not just a red X:
+#   - repeat arm: the standard cases under --gtest_repeat. Each iteration calls
+#     livekit::shutdown() -> FFI dispose -> Arc drop ->
+#     AdmProxy::~AdmProxy() -> CoreAudio ADM Terminate(). This is the suspected
+#     crash path (full ADM teardown/recreate every iteration).
+#   - pinned arm: PinnedRuntimeRepeatedPublishStress holds one PlatformAudio for
+#     the whole test, so the ADM is created once and never terminated between
+#     cycles. If the repeat arm crashes but this stays green, the bug is in ADM
+#     teardown, not the steady media path.
+#
+# Trigger from the Actions tab (workflow_dispatch) and tune the inputs.
+ +on: + workflow_dispatch: + inputs: + runners: + description: "Which macOS runners to triage (the instability reproduces on both Intel x64 and Apple Silicon arm64)" + type: choice + default: all + options: + - all # both arches: macos-15-large (x64) + macos-15 (arm64) + - macos-15-large # Intel x64 + - macos-15 # Apple Silicon arm64 + - macos-13 # Intel x64 (older) + - macos-14 # Apple Silicon arm64 (older) + - macos-26-xlarge # Apple Silicon arm64 (newer, larger) + repeat: + description: "gtest_repeat count for the dispose-each-iteration arm" + type: string + default: "200" + pin_iterations: + description: "PLATFORM_AUDIO_PIN_ITERATIONS for the pinned control arm" + type: string + default: "200" + leak_iterations: + description: "Pinned-cycle iterations for the leaks arm (kept small; MallocStackLogging is heavy)" + type: string + default: "40" + ubsan: + description: "Build with UndefinedBehaviorSanitizer (ASan is incompatible with macOS CoreAudio)" + type: boolean + default: true + +permissions: + contents: read + actions: read + +jobs: + # Expand the runner selection into a matrix. "all" fans out across both + # architectures so a single dispatch confirms whether the instability + # reproduces on Intel x64 and Apple Silicon arm64. 
+ prepare: + name: Resolve runner matrix + runs-on: ubuntu-latest + outputs: + matrix: ${{ steps.set.outputs.matrix }} + steps: + - id: set + shell: bash + run: | + sel="${{ github.event.inputs.runners || 'all' }}" + if [[ "${sel}" == "all" ]]; then + echo 'matrix={"runner":["macos-15-large","macos-15"]}' >> "$GITHUB_OUTPUT" + else + printf 'matrix={"runner":["%s"]}\n' "${sel}" >> "$GITHUB_OUTPUT" + fi + + triage: + needs: prepare + strategy: + fail-fast: false + matrix: ${{ fromJSON(needs.prepare.outputs.matrix) }} + name: PlatformAudio Triage (${{ matrix.runner }}) + runs-on: ${{ matrix.runner }} + timeout-minutes: 90 + env: + CARGO_TERM_COLOR: always + CARGO_INCREMENTAL: "0" + RUST_BACKTRACE: full + UBSAN_OPTIONS: halt_on_error=1:print_stacktrace=1 + REPEAT: ${{ github.event.inputs.repeat || '200' }} + PIN_ITERATIONS: ${{ github.event.inputs.pin_iterations || '200' }} + USE_UBSAN: ${{ github.event.inputs.ubsan || 'true' }} + + steps: + - name: Checkout (with submodules) + uses: actions/checkout@df4cb1c069e1874edd31b4311f1884172cec0e10 # v6.0.3 + with: + submodules: recursive + fetch-depth: 1 + + - name: Pull LFS files + run: git lfs pull + + - name: Prepare CI test scripts + run: | + chmod +x .github/scripts/run_tests_with_backtrace.sh + chmod +x .github/scripts/stage_crash_diagnostics.sh + chmod +x .github/scripts/sample_process_resources.sh + chmod +x .github/scripts/heap_snapshots.sh + + - name: Install deps + run: | + set -eux + brew update + brew install cmake ninja protobuf abseil + + - name: Install Rust (stable) + uses: dtolnay/rust-toolchain@3c5f7ea28cd621ae0bf5283f0e981fb97b8a7af9 + with: + toolchain: stable + + # Cache the Rust submodule build so re-runs skip the ~20-minute cold build. 
+ - name: Cache Rust build + uses: Swatinem/rust-cache@e18b497796c12c097a38f9edb9d0641fb99eee32 # v2 + with: + workspaces: client-sdk-rust + + - name: Set build environment + run: | + echo "CXXFLAGS=-fno-omit-frame-pointer" >> "$GITHUB_ENV" + echo "CFLAGS=-fno-omit-frame-pointer" >> "$GITHUB_ENV" + + - name: Configure build + run: | + if [[ "${USE_UBSAN}" == "true" ]]; then + cmake --preset macos-debug-tests \ + -DCMAKE_C_FLAGS="-fsanitize=undefined -fno-omit-frame-pointer" \ + -DCMAKE_CXX_FLAGS="-fsanitize=undefined -fno-omit-frame-pointer" \ + -DCMAKE_EXE_LINKER_FLAGS="-fsanitize=undefined" \ + -DCMAKE_SHARED_LINKER_FLAGS="-fsanitize=undefined" + else + cmake --preset macos-debug-tests + fi + + - name: Build integration tests only + run: cmake --build build-debug --target livekit_integration_tests --parallel 2 + + - name: Start livekit-server + id: livekit_server + uses: livekit/dev-server-action@61e2b4dcb170dd3591e0c9b0db3c3fe5db93b500 + continue-on-error: true + with: + github-token: ${{ github.token }} + + - name: Start livekit-server fallback + if: steps.livekit_server.outcome == 'failure' + id: livekit_server_fallback + shell: bash + run: | + set -euxo pipefail + brew install livekit + cat > "$RUNNER_TEMP/livekit.yaml" <<'EOF' + logging: { json: true } + EOF + livekit-server --config "$RUNNER_TEMP/livekit.yaml" --dev > "$RUNNER_TEMP/livekit.jsonl" 2>&1 & + echo "log-path=$RUNNER_TEMP/livekit.jsonl" >> "$GITHUB_OUTPUT" + for i in $(seq 1 30); do + if [[ "$(curl -fsS http://localhost:7880/ || true)" == "OK" ]]; then + exit 0 + fi + sleep 1 + done + exit 1 + + - name: Install livekit-cli + shell: bash + run: brew install livekit-cli + + # Arm A: full ADM teardown/recreate every iteration (suspected crash path). 
+ - name: "Arm A — repeat (dispose+ADM Terminate each iteration)" + id: repeat_arm + continue-on-error: true + timeout-minutes: 40 + shell: bash + env: + RUST_LOG: info,livekit_ffi::server=debug,livekit_ffi::server::platform_audio=debug,livekit::platform_audio=debug + RUST_BACKTRACE: full + run: | + set -euo pipefail + source .token_helpers/set_data_track_test_tokens.bash + # Sample process resources in the background; it self-exits with the test. + .github/scripts/sample_process_resources.sh \ + livekit_integration_tests "$RUNNER_TEMP/resources-arm-a.csv" 3 & + .github/scripts/run_tests_with_backtrace.sh \ + build-debug/bin/livekit_integration_tests \ + --gtest_filter='PlatformAudioIntegrationTest.*-PlatformAudioIntegrationTest.PinnedRuntimeRepeatedPublishStress' \ + --gtest_repeat="${REPEAT}" \ + --gtest_recreate_environments_when_repeating=1 \ + --gtest_output=xml:build-debug/triage-repeat-arm.xml + + # Arm B: control. Runtime/ADM pinned alive for the whole test. + - name: "Arm B — pinned runtime (ADM created once, never terminated)" + id: pinned_arm + continue-on-error: true + timeout-minutes: 40 + shell: bash + env: + RUST_LOG: info,livekit_ffi::server=debug,livekit_ffi::server::platform_audio=debug,livekit::platform_audio=debug + RUST_BACKTRACE: full + PLATFORM_AUDIO_PIN_ITERATIONS: ${{ env.PIN_ITERATIONS }} + run: | + set -euo pipefail + source .token_helpers/set_data_track_test_tokens.bash + # Sample process resources in the background; it self-exits with the test. + .github/scripts/sample_process_resources.sh \ + livekit_integration_tests "$RUNNER_TEMP/resources-arm-b.csv" 3 & + .github/scripts/run_tests_with_backtrace.sh \ + build-debug/bin/livekit_integration_tests \ + --gtest_filter='PlatformAudioIntegrationTest.PinnedRuntimeRepeatedPublishStress' \ + --gtest_output=xml:build-debug/triage-pinned-arm.xml + + # Arm C: isolates the frame-flow check. 
Running only this test with a small + # repeat (fresh ADM recreated each iteration, no sibling tests churning it + # first) distinguishes "frame flow is dead even on a fresh ADM" from "frame + # flow only dies after prior teardown/recreate cycles". + - name: "Arm C — frame-flow only (fresh ADM each iteration, isolated)" + id: frames_arm + continue-on-error: true + timeout-minutes: 30 + shell: bash + env: + RUST_LOG: info,livekit_ffi::server=debug,livekit_ffi::server::platform_audio=debug,livekit::platform_audio=debug + RUST_BACKTRACE: full + run: | + set -euo pipefail + source .token_helpers/set_data_track_test_tokens.bash + # Sample process resources in the background; it self-exits with the test. + .github/scripts/sample_process_resources.sh \ + livekit_integration_tests "$RUNNER_TEMP/resources-arm-c.csv" 3 & + .github/scripts/run_tests_with_backtrace.sh \ + build-debug/bin/livekit_integration_tests \ + --gtest_filter='PlatformAudioIntegrationTest.PlatformAudioFramesReachRemote' \ + --gtest_repeat=5 \ + --gtest_recreate_environments_when_repeating=1 \ + --gtest_output=xml:build-debug/triage-frames-arm.xml + + # Arm D: name the leak. The pinned-cycle reproducer (ADM held alive, rooms + # recycled each cycle) is run under macOS `leaks` with MallocStackLogging so + # every still-allocated block is reported with the backtrace that allocated + # it. This points directly at the leaking call site (C++ SDK vs Rust FFI) + # instead of us guessing. A small iteration count keeps the stack-logging + # overhead bounded while leaking enough to aggregate clear stacks. + - name: "Arm D — leak backtraces (pinned cycle under leaks)" + id: leaks_arm + continue-on-error: true + timeout-minutes: 40 + shell: bash + env: + # Quieter than the other arms: the leak report is the signal here. 
+          RUST_LOG: warn
+          RUST_BACKTRACE: "1"
+          MallocStackLogging: "1"
+          PLATFORM_AUDIO_PIN_ITERATIONS: ${{ github.event.inputs.leak_iterations || '40' }}
+        run: |
+          set -uo pipefail
+          source .token_helpers/set_data_track_test_tokens.bash
+          report="$RUNNER_TEMP/leaks-report.txt"
+          # `leaks --atExit -- <cmd>` launches the binary, lets it run to normal
+          # exit, then dumps leaks (grouped by identical backtrace with counts).
+          leaks --atExit -- \
+            build-debug/bin/livekit_integration_tests \
+            --gtest_filter='PlatformAudioIntegrationTest.PinnedRuntimeRepeatedPublishStress' \
+            > "$report" 2>&1
+          status=$?
+          echo "leaks exit status: ${status}"
+          # Surface the leak summary lines, then the end of the report in the step log.
+          grep -E "Process [0-9]+: .* leaks for|leaks for [0-9,]+ total|^[[:space:]]*[0-9]+ \(" "$report" | head -40 || true
+          echo "--- tail of leaks report (last 200 lines) ---"
+          tail -n 200 "$report" || true
+          # leaks exits non-zero when leaks are found; treat that as success for
+          # this diagnostic arm (the report is the deliverable).
+          exit 0
+
+      # Arm E: attribute the reachable retention. `leaks` reports 0 (the growth is
+      # still-referenced, not lost) so we instead sample the LIVE heap mid-run on
+      # the dispose+recreate path (the worst leaker) with MallocStackLogging, then
+      # diff successive `heap` summaries + read `malloc_history` stacks to name the
+      # growing allocation type and its call site.
+      - name: "Arm E — live-heap attribution (dispose path under heap/malloc_history)"
+        id: heap_arm
+        continue-on-error: true
+        timeout-minutes: 40
+        shell: bash
+        env:
+          RUST_LOG: warn
+          RUST_BACKTRACE: "1"
+          MallocStackLogging: "1"
+        run: |
+          set -uo pipefail
+          source .token_helpers/set_data_track_test_tokens.bash
+          mkdir -p "$RUNNER_TEMP/heap-snapshots"
+          # Snapshot the live heap every 10s while the dispose-path test repeats.
+          # Frequent early ticks catch the growth curve (not just a plateau), and
+          # malloc_history is captured on every tick so we always get stacks.
+ # A larger repeat keeps the process churning across the whole window. + .github/scripts/heap_snapshots.sh \ + livekit_integration_tests "$RUNNER_TEMP/heap-snapshots" 10 30 & + .github/scripts/run_tests_with_backtrace.sh \ + build-debug/bin/livekit_integration_tests \ + --gtest_filter='PlatformAudioIntegrationTest.*-PlatformAudioIntegrationTest.PinnedRuntimeRepeatedPublishStress' \ + --gtest_repeat=60 \ + --gtest_recreate_environments_when_repeating=1 \ + --gtest_output=xml:build-debug/triage-heap-arm.xml || true + echo "--- heap snapshots captured ---" + ls -la "$RUNNER_TEMP/heap-snapshots" || true + # Surface the 'all zones' growth across the first vs last heap summary. + first=$(ls "$RUNNER_TEMP"/heap-snapshots/heap-*.txt 2>/dev/null | head -1) + last=$(ls "$RUNNER_TEMP"/heap-snapshots/heap-*.txt 2>/dev/null | tail -1) + if [[ -n "${first}" && -n "${last}" && "${first}" != "${last}" ]]; then + echo "=== FIRST heap summary (${first##*/}) ==="; grep -E "Process [0-9]+:|total|COUNT" "${first}" | head -25 || true + echo "=== LAST heap summary (${last##*/}) ==="; grep -E "Process [0-9]+:|total|COUNT" "${last}" | head -25 || true + fi + exit 0 + + - name: Dump livekit-server log on failure + if: failure() || steps.repeat_arm.outcome == 'failure' || steps.pinned_arm.outcome == 'failure' || steps.frames_arm.outcome == 'failure' + shell: bash + run: | + log_path="${{ steps.livekit_server.outputs.log-path }}" + if [[ -z "$log_path" ]]; then + log_path="${{ steps.livekit_server_fallback.outputs.log-path }}" + fi + tail -n 500 "$log_path" || true + + - name: Stage crash diagnostics + if: steps.repeat_arm.outcome == 'failure' || steps.pinned_arm.outcome == 'failure' || steps.frames_arm.outcome == 'failure' + run: .github/scripts/stage_crash_diagnostics.sh build-debug + + - name: Upload test results + if: always() + uses: actions/upload-artifact@043fb46d1a93c77aae656e7c1c64a875d1fc6a0a # v7.0.1 + with: + name: platform-audio-triage-results-${{ matrix.runner }} + path: | + 
build-debug/triage-repeat-arm.xml
+            build-debug/triage-pinned-arm.xml
+            build-debug/triage-frames-arm.xml
+          if-no-files-found: ignore
+          retention-days: 14
+
+      - name: Upload crash diagnostics
+        if: steps.repeat_arm.outcome == 'failure' || steps.pinned_arm.outcome == 'failure' || steps.frames_arm.outcome == 'failure'
+        uses: actions/upload-artifact@043fb46d1a93c77aae656e7c1c64a875d1fc6a0a # v7.0.1
+        with:
+          name: platform-audio-triage-crash-diagnostics-${{ matrix.runner }}
+          path: ${{ runner.temp }}/crash-diagnostics/
+          if-no-files-found: ignore
+          retention-days: 14
+
+      - name: Upload resource samples
+        if: always()
+        uses: actions/upload-artifact@043fb46d1a93c77aae656e7c1c64a875d1fc6a0a # v7.0.1
+        with:
+          name: platform-audio-triage-resource-samples-${{ matrix.runner }}
+          path: |
+            ${{ runner.temp }}/resources-arm-a.csv
+            ${{ runner.temp }}/resources-arm-b.csv
+            ${{ runner.temp }}/resources-arm-c.csv
+          if-no-files-found: ignore
+          retention-days: 14
+
+      - name: Upload leak report
+        if: always()
+        uses: actions/upload-artifact@043fb46d1a93c77aae656e7c1c64a875d1fc6a0a # v7.0.1
+        with:
+          name: platform-audio-triage-leak-report-${{ matrix.runner }}
+          path: ${{ runner.temp }}/leaks-report.txt
+          if-no-files-found: ignore
+          retention-days: 14
+
+      - name: Upload heap snapshots
+        if: always()
+        uses: actions/upload-artifact@043fb46d1a93c77aae656e7c1c64a875d1fc6a0a # v7.0.1
+        with:
+          name: platform-audio-triage-heap-snapshots-${{ matrix.runner }}
+          path: ${{ runner.temp }}/heap-snapshots/
+          if-no-files-found: ignore
+          retention-days: 14
+
+      # Surface the diagnostic contrast and fail the job unless arms A, B and C
+      # all reported success.
+      - name: Triage summary
+        if: always()
+        shell: bash
+        run: |
+          repeat="${{ steps.repeat_arm.outcome }}"
+          pinned="${{ steps.pinned_arm.outcome }}"
+          frames="${{ steps.frames_arm.outcome }}"
+          leaks="${{ steps.leaks_arm.outcome }}"
+          heap="${{ steps.heap_arm.outcome }}"
+
+          # Print first vs last resource sample so the leak curve is visible inline.
+          # $1 is a CSV path. The first line is skipped (presumably a header row).
+          # Fields 4-7 of a row are read as rss_kb, threads, fds and mach_ports;
+          # field 2 of the last row is printed as the elapsed seconds. Every row
+          # emitted has four cells to match the "metric | first | last | note"
+          # header written by the caller.
+          resource_delta() {
+            local csv=$1
+            [[ -f "${csv}" ]] || { echo "| (no samples) | | | |"; return; }
+            local first last
+            first=$(tail -n +2 "${csv}" | head -1)
+            last=$(tail -n 1 "${csv}")
+            # A growth figure needs two distinct data rows.
+            [[ -n "${first}" && "${first}" != "${last}" ]] || { echo "| (insufficient samples) | | | |"; return; }
+            IFS=, read -r _ _ _ frss fthreads ffds fports <<< "${first}"
+            IFS=, read -r _ le _ lrss lthreads lfds lports <<< "${last}"
+            echo "| rss_kb | ${frss} | ${lrss} | over ${le}s |"
+            echo "| threads | ${fthreads} | ${lthreads} | |"
+            echo "| fds | ${ffds} | ${lfds} | |"
+            echo "| mach_ports | ${fports} | ${lports} | |"
+          }
+
+          {
+            echo "## PlatformAudio triage"
+            echo ""
+            echo "| Arm | Outcome |"
+            echo "| --- | --- |"
+            echo "| A — Repeat (dispose+ADM Terminate each iter) | ${repeat} |"
+            echo "| B — Pinned runtime (ADM never terminated) | ${pinned} |"
+            echo "| C — Frame-flow only (fresh ADM each iter) | ${frames} |"
+            echo "| D — Leak backtraces (see leak-report artifact) | ${leaks} |"
+            echo "| E — Live-heap attribution (see heap-snapshots artifact) | ${heap} |"
+            echo ""
+            if [[ "${repeat}" == "failure" && "${pinned}" == "success" ]]; then
+              echo "➡️ Repeat failed while pinned passed: consistent with an ADM teardown/recreate bug."
+            elif [[ "${repeat}" == "failure" && "${pinned}" == "failure" ]]; then
+              echo "➡️ Both repeat and pinned failed: instability is not exclusive to ADM teardown."
+            elif [[ "${repeat}" == "success" && "${pinned}" == "success" ]]; then
+              echo "➡️ Repeat and pinned both passed this run (did not reproduce). Try a higher repeat count."
+            else
+              # Covers repeat=success/pinned=failure and skipped/cancelled arms,
+              # which would otherwise leave the summary without any verdict.
+              echo "➡️ No repeat/pinned contrast to draw (repeat=${repeat}, pinned=${pinned}); check the per-arm logs."
+            fi
+            echo ""
+            echo "_Arm C (frame-flow isolated): if iteration 1 passes but later iterations fail,"
+            echo "frame delivery degrades with each fresh ADM recreate; if it fails on iteration 1,"
+            echo "frame flow is broken even on a first/fresh ADM on this runner._"
+            echo ""
+            echo "### Resource growth — Arm A (dispose each iteration)"
+            echo "| metric | first | last | note |"
+            echo "| --- | --- | --- | --- |"
+            resource_delta "$RUNNER_TEMP/resources-arm-a.csv"
+            echo ""
+            echo "### Resource growth — Arm B (pinned)"
+            echo "| metric | first | last | note |"
+            echo "| --- | --- | --- | --- |"
+            resource_delta "$RUNNER_TEMP/resources-arm-b.csv"
+            echo ""
+            echo "### Resource growth — Arm C (frame-flow isolated)"
+            echo "| metric | first | last | note |"
+            echo "| --- | --- | --- | --- |"
+            resource_delta "$RUNNER_TEMP/resources-arm-c.csv"
+          } >> "$GITHUB_STEP_SUMMARY"
+          # Only arms A-C gate the job; the leak and heap arms are reported above
+          # but do not affect the exit status.
+          [[ "${repeat}" == "success" && "${pinned}" == "success" && "${frames}" == "success" ]]
diff --git a/client-sdk-rust b/client-sdk-rust
index 8e551062..6881168d 160000
--- a/client-sdk-rust
+++ b/client-sdk-rust
@@ -1 +1 @@
-Subproject commit 8e551062c59f912159b8cebac44b2cdcce0024ef
+Subproject commit 6881168dfefad6605fcf6697085dcad2421bde68
diff --git a/include/livekit/local_participant.h b/include/livekit/local_participant.h
index 9369a914..2c1d5cf5 100644
--- a/include/livekit/local_participant.h
+++ b/include/livekit/local_participant.h
@@ -246,6 +246,14 @@ class LIVEKIT_API LocalParticipant : public Participant {
   /// prune expired @c weak_ptr entries.
   mutable TrackMap published_tracks_by_sid_;
 
+  /// Guards @ref published_tracks_by_sid_. The map is written from the
+  /// application thread (@ref publishTrack / @ref unpublishTrack) and both read
+  /// and pruned from the FFI callback thread (@ref trackPublications /
+  /// @ref findTrackPublication, reached via Room::onEvent). Without this lock
+  /// those concurrent accesses race and free map nodes out from under each
+  /// other (heap-use-after-free). Leaf lock: no other lock is taken while held.
+  mutable std::mutex published_tracks_mutex_;
+
   std::unordered_map rpc_handlers_;
 
   // Shared state for RPC invocation tracking. Using shared_ptr so the state
diff --git a/src/data_track_stream.cpp b/src/data_track_stream.cpp
index 155f4788..6a777377 100644
--- a/src/data_track_stream.cpp
+++ b/src/data_track_stream.cpp
@@ -52,22 +52,28 @@ void DataTrackStream::init(FfiHandle subscription_handle) {
 bool DataTrackStream::read(DataTrackFrame& out) {
   proto::DataTrackStreamReadResponse read_response;
   bool missing_read_response = false;
+  std::uint64_t subscription_handle = 0;
   {
     const std::scoped_lock lock(mutex_);
     if (closed_ || eof_) {
       return false;
     }
+    subscription_handle = static_cast<std::uint64_t>(subscription_handle_.get());
+  }
 
-    const auto subscription_handle = static_cast<std::uint64_t>(subscription_handle_.get());
+  // Do not hold mutex_ across sendRequest: readFrameWithTimeout may call close()
+  // from another thread on timeout, and close() also needs mutex_.
+  proto::FfiRequest req;
+  auto* msg = req.mutable_data_track_stream_read();
+  msg->set_stream_handle(subscription_handle);
+  const proto::FfiResponse resp = FfiClient::instance().sendRequest(req);
 
-    // Signal the Rust side that we're ready to receive the next frame.
-    // The Rust SubscriptionTask uses a demand-driven protocol: it won't pull
-    // from the underlying stream until notified via this request.
-    proto::FfiRequest req;
-    auto* msg = req.mutable_data_track_stream_read();
-    msg->set_stream_handle(subscription_handle);
-    const proto::FfiResponse resp = FfiClient::instance().sendRequest(req);
+  {
+    const std::scoped_lock lock(mutex_);
+    if (closed_ || eof_) {
+      return false;
+    }
 
     if (!resp.has_data_track_stream_read()) {
       missing_read_response = true;
     } else {
diff --git a/src/local_participant.cpp b/src/local_participant.cpp
index 7fda68ac..1e8d9a0f 100644
--- a/src/local_participant.cpp
+++ b/src/local_participant.cpp
@@ -197,7 +197,10 @@ void LocalParticipant::publishTrack(const std::shared_ptr& track, const T
   auto publication = std::make_shared(owned_pub);
 
   const std::string sid = publication->sid();
-  published_tracks_by_sid_[sid] = std::weak_ptr(track);
+  {
+    const std::scoped_lock lock(published_tracks_mutex_);
+    published_tracks_by_sid_[sid] = std::weak_ptr(track);
+  }
   track->setPublication(publication);
 }
 
@@ -237,6 +240,7 @@ void LocalParticipant::unpublishTrack(const std::string& track_sid) {
 
   fut.get();
 
+  const std::scoped_lock lock(published_tracks_mutex_);
   if (auto it = published_tracks_by_sid_.find(track_sid); it != published_tracks_by_sid_.end()) {
     if (auto t = it->second.lock()) {
       t->setPublication(nullptr);
@@ -247,6 +251,7 @@ void LocalParticipant::unpublishTrack(const std::string& track_sid) {
 
 LocalParticipant::PublicationMap LocalParticipant::trackPublications() const {
   PublicationMap out;
+  const std::scoped_lock lock(published_tracks_mutex_);
   for (auto it = published_tracks_by_sid_.begin(); it != published_tracks_by_sid_.end();) {
     auto t = it->second.lock();
     if (!t) {
@@ -443,6 +448,7 @@ void LocalParticipant::handleRpcMethodInvocation(uint64_t invocation_id, const s
 }
 
 std::shared_ptr LocalParticipant::findTrackPublication(const std::string& sid) const {
+  const std::scoped_lock lock(published_tracks_mutex_);
   auto it = published_tracks_by_sid_.find(sid);
   if (it == published_tracks_by_sid_.end()) {
     return nullptr;
diff --git a/src/room.cpp b/src/room.cpp
index 3ad58938..71680389 100644
--- a/src/room.cpp
+++ b/src/room.cpp
@@ -436,7 +436,7 @@ void Room::onEvent(const FfiEvent& event) {
   if (event.message_case() == FfiEvent::kRpcMethodInvocation) {
     const auto& rpc = event.rpc_method_invocation();
 
-    LocalParticipant* lp = nullptr;
+    std::shared_ptr<LocalParticipant> lp;
     {
       const std::scoped_lock guard(lock_);
       if (!local_participant_) {
@@ -448,7 +448,7 @@ void Room::onEvent(const FfiEvent& event) {
         // RPC is not targeted at this room's local participant; ignore.
         return;
       }
-      lp = local_participant_.get();
+      lp = local_participant_;
     }
 
     // Call outside the lock to avoid deadlocks / re-entrancy issues.
diff --git a/src/tests/integration/test_platform_audio.cpp b/src/tests/integration/test_platform_audio.cpp
index c6e86596..f24de719 100644
--- a/src/tests/integration/test_platform_audio.cpp
+++ b/src/tests/integration/test_platform_audio.cpp
@@ -319,4 +319,88 @@ TEST_F(PlatformAudioIntegrationTest, PlatformAudioFramesReachRemote) {
   receiver_room->clearOnAudioFrameCallback(sender_identity, track_name);
 }
 
+namespace {
+
+/// Run one connect/publish/subscribe cycle against a fresh pair of rooms,
+/// reusing a caller-owned PlatformAudio so the underlying Rust LkRuntime (and
+/// therefore the platform Audio Device Module) is never torn down between
+/// cycles. Returns true if the receiver observed the published track.
+bool runPlatformAudioCycle(PlatformAudio& platform_audio, const TestConfig& config, const std::string& track_name) {
+  RoomOptions options;
+  options.auto_subscribe = true;
+
+  // The state and delegate are declared before the room that is handed a
+  // pointer to the delegate, so they are destroyed after it on every return.
+  PlatformTrackState receiver_state;
+  PlatformTrackCollectorDelegate receiver_delegate(receiver_state);
+
+  auto receiver_room = std::make_unique<Room>();
+  receiver_room->setDelegate(&receiver_delegate);
+  if (!receiver_room->connect(config.url, config.token_b, options)) {
+    return false;
+  }
+
+  auto sender_room = std::make_unique<Room>();
+  if (!sender_room->connect(config.url, config.token_a, options)) {
+    return false;
+  }
+
+  const auto source = platform_audio.createAudioSource();
+  if (source == nullptr) {
+    return false;
+  }
+
+  const auto track = LocalAudioTrack::createLocalAudioTrack(track_name, source);
+  if (track == nullptr) {
+    return false;
+  }
+
+  TrackPublishOptions publish_options;
+  publish_options.source = TrackSource::SOURCE_MICROPHONE;
+  lockLocalParticipant(*sender_room)->publishTrack(track, publish_options);
+
+  // Wait until receiver_state lists track_name or kSubscriptionTimeout elapses;
+  // wait_for returns the predicate's final value, i.e. false on timeout.
+  std::unique_lock lock(receiver_state.mutex);
+  return receiver_state.cv.wait_for(lock, kSubscriptionTimeout,
+                                    [&]() { return receiver_state.subscribed_audio_names.count(track_name) > 0; });
+}
+
+} // namespace
+
+// Control arm for the macOS PlatformAudio instability investigation.
+//
+// The standard PlatformAudioIntegrationTest cases each call livekit::shutdown()
+// in TearDown(), which disposes the FFI server, drops the last Arc,
+// and runs AdmProxy::~AdmProxy() -> platform_adm_->Terminate(). Under
+// --gtest_repeat that means the native CoreAudio ADM is fully terminated and
+// recreated on *every* iteration -- the suspected crash path.
+//
+// This test instead holds a single PlatformAudio alive for the whole test, so
+// the runtime and ADM are created once and never terminated between cycles. It
+// loops the same connect/publish/subscribe cycle PLATFORM_AUDIO_PIN_ITERATIONS
+// times (default 20). If the repeat arm crashes on macOS but this pinned arm
+// stays green, the instability is in ADM teardown/recreation, not the steady
+// media path.
+TEST_F(PlatformAudioIntegrationTest, PinnedRuntimeRepeatedPublishStress) {
+  // Fatal on purpose: without configuration every cycle would fail at connect()
+  // and be reported as a misleading "never subscribed" on iteration 0.
+  ASSERT_TRUE(config_.available) << "Missing integration configuration";
+
+  std::unique_ptr<PlatformAudio> platform_audio;
+  try {
+    platform_audio = std::make_unique<PlatformAudio>();
+  } catch (const PlatformAudioError& error) {
+    GTEST_SKIP() << "PlatformAudio unavailable: " << error.what();
+  }
+
+  // Optional override of the cycle count. strtol is used instead of atoi
+  // because atoi is undefined for values that do not fit in an int; strtol
+  // saturates instead. Unparsable or non-positive values keep the default.
+  long iterations = 20;
+  if (const char* env = std::getenv("PLATFORM_AUDIO_PIN_ITERATIONS")) {
+    const long parsed = std::strtol(env, nullptr, 10);
+    if (parsed > 0) {
+      iterations = parsed;
+    }
+  }
+
+  for (long i = 0; i < iterations; ++i) {
+    const std::string track_name = "platform-mic-pinned-" + std::to_string(i);
+    const bool subscribed = runPlatformAudioCycle(*platform_audio, config_, track_name);
+    ASSERT_TRUE(subscribed) << "Receiver never subscribed on pinned iteration " << i;
+  }
+}
+
 } // namespace livekit::test