From bbed1887d59f8657327fbd2b0ed36cb20d478969 Mon Sep 17 00:00:00 2001 From: Alan George Date: Mon, 22 Jun 2026 21:25:30 -0600 Subject: [PATCH 01/10] New changes to catch audio bug --- .github/scripts/run_tests_with_backtrace.sh | 220 ++++++++++++++++ .github/scripts/stage_crash_diagnostics.sh | 47 ++++ .github/workflows/nightly.yml | 180 +++++++++++++ .github/workflows/platform-audio-triage.yml | 240 ++++++++++++++++++ client-sdk-rust | 2 +- src/data_track_stream.cpp | 22 +- src/room.cpp | 4 +- src/tests/integration/test_platform_audio.cpp | 84 ++++++ 8 files changed, 788 insertions(+), 11 deletions(-) create mode 100755 .github/scripts/run_tests_with_backtrace.sh create mode 100755 .github/scripts/stage_crash_diagnostics.sh create mode 100644 .github/workflows/nightly.yml create mode 100644 .github/workflows/platform-audio-triage.yml diff --git a/.github/scripts/run_tests_with_backtrace.sh b/.github/scripts/run_tests_with_backtrace.sh new file mode 100755 index 00000000..8765c59c --- /dev/null +++ b/.github/scripts/run_tests_with_backtrace.sh @@ -0,0 +1,220 @@ +#!/usr/bin/env bash +# Run a test binary under debug CI. On fatal signals, print post-mortem +# backtraces from core dumps when available. Linux also runs under catchsegv +# so a partial backtrace appears in the log even without a core file. +# +# When LIVEKIT_TEST_STALL_SECONDS is set to a positive integer, a watchdog +# monitors test output and dumps live thread backtraces if the log goes silent +# for that many seconds (integration-test hang diagnostics on linux-x64). +set -uo pipefail + +usage() { + echo "Usage: $0 [gtest-args...]" >&2 + exit 2 +} + +[[ $# -ge 1 ]] || usage + +binary=$1 +shift + +if [[ ! 
-x "$binary" ]]; then + echo "Error: not executable: $binary" >&2 + exit 2 +fi + +binary_abs=$(cd "$(dirname "$binary")" && pwd)/$(basename "$binary") +core_dir="${RUNNER_TEMP:-/tmp}/livekit-test-cores" +mkdir -p "$core_dir" + +ulimit -c unlimited || true + +if [[ "$(uname -s)" == "Linux" ]]; then + echo "${core_dir}/core.%e.%p" | sudo tee /proc/sys/kernel/core_pattern >/dev/null || true +fi + +if [[ "$(uname -s)" == "Darwin" ]]; then + ulimit -c unlimited || true + sudo sysctl -w kern.coredump=1 >/dev/null 2>&1 || true + sudo mkdir -p /cores 2>/dev/null || true + sudo chmod 1777 /cores 2>/dev/null || true +fi + +dump_macos_crash_reports() { + local binary_name + binary_name=$(basename "${binary_abs}") + echo "=== macOS DiagnosticReports for ${binary_name} ===" + local found=0 + for report_dir in "${HOME}/Library/Logs/DiagnosticReports" "/Library/Logs/DiagnosticReports"; do + if [[ ! -d "${report_dir}" ]]; then + continue + fi + while IFS= read -r report; do + found=1 + echo "Crash report: ${report}" + # .ips files are JSON-ish; print the first 200 lines for the CI log. 
+ head -n 200 "${report}" || true + done < <(find "${report_dir}" -maxdepth 1 -name "${binary_name}*.ips" -type f -print 2>/dev/null | sort -r | head -3) + done + if ((found == 0)); then + echo "No DiagnosticReports .ips found for ${binary_name}" + fi +} + +dump_live_backtraces() { + local test_pid=$1 + local reason=$2 + + echo "=== live backtrace diagnostics (${reason}, pid ${test_pid}) ===" + + if [[ "$(uname -s)" == "Linux" ]]; then + if command -v gdb >/dev/null 2>&1; then + gdb -batch \ + -ex 'set pagination off' \ + -ex 'thread apply all bt full' \ + -p "${test_pid}" || true + else + echo "gdb not available; install gdb for live backtraces" + fi + return 0 + fi + + if [[ "$(uname -s)" == "Darwin" ]]; then + if command -v sample >/dev/null 2>&1; then + sample "${test_pid}" 5 -mayDie 2>&1 || true + fi + if command -v lldb >/dev/null 2>&1; then + lldb -p "${test_pid}" --batch -o 'thread backtrace all' -o 'detach' -o 'quit' 2>&1 || true + else + echo "lldb not available" + fi + fi +} + +dump_backtraces() { + local test_pid=$1 + local status=$2 + + echo "=== crash diagnostics (exit status ${status}, pid ${test_pid}) ===" + + if [[ "$(uname -s)" == "Linux" ]]; then + local core="" + core=$(find "$core_dir" -maxdepth 1 -name 'core.*' -type f 2>/dev/null | sort -r | head -1) + if [[ -z "$core" ]]; then + core=$(find /tmp -maxdepth 1 -name 'core.*' -type f 2>/dev/null | sort -r | head -1) + fi + if [[ -n "$core" && -f "$core" ]]; then + echo "Core file: ${core}" + if command -v gdb >/dev/null 2>&1; then + gdb -batch \ + -ex 'set pagination off' \ + -ex 'thread apply all bt full' \ + "${binary_abs}" "${core}" || true + else + echo "gdb not available; install gdb for post-mortem backtraces" + fi + cp -a "${core}" "${core_dir}/" 2>/dev/null || true + basename "${core}" >"${core_dir}/last-core.name" + else + echo "No core file found under ${core_dir} or /tmp" + fi + return 0 + fi + + if [[ "$(uname -s)" == "Darwin" ]]; then + local core="" + for candidate in 
"/cores/core.${test_pid}" "/cores/core.${test_pid}.dump"; do + if [[ -f "${candidate}" ]]; then + core=${candidate} + break + fi + done + if [[ -z "$core" ]]; then + core=$(find /cores -maxdepth 1 -name "core.*" -type f 2>/dev/null | sort -r | head -1) + fi + if [[ -n "$core" && -f "$core" ]]; then + echo "Core file: ${core}" + if command -v lldb >/dev/null 2>&1; then + lldb -b -c "${core}" -o 'thread backtrace all' -o 'quit' -- "${binary_abs}" || true + else + echo "lldb not available" + fi + cp -a "${core}" "${core_dir}/" 2>/dev/null || true + basename "${core}" >"${core_dir}/last-core.name" + else + echo "No core file found under /cores for pid ${test_pid}" + fi + dump_macos_crash_reports + fi +} + +run_test() { + if [[ "$(uname -s)" == "Linux" ]] && command -v catchsegv >/dev/null 2>&1; then + catchsegv "${binary_abs}" "$@" + else + "${binary_abs}" "$@" + fi +} + +start_stall_watchdog() { + local test_pid=$1 + local log_file=$2 + local stall_limit=$3 + + ( + local last_size=-1 + local stall=0 + while kill -0 "${test_pid}" 2>/dev/null; do + local size + size=$(wc -c <"${log_file}" 2>/dev/null || echo 0) + if [[ "${size}" == "${last_size}" ]]; then + stall=$((stall + 5)) + else + stall=0 + last_size=${size} + fi + if ((stall >= stall_limit)); then + echo "=== TEST HANG DETECTED: no output for ${stall}s (pid ${test_pid}) ===" + echo "--- last log lines ---" + tail -n 40 "${log_file}" || true + dump_live_backtraces "${test_pid}" "stall ${stall}s" + kill -ABRT "${test_pid}" 2>/dev/null || kill -TERM "${test_pid}" 2>/dev/null || true + break + fi + sleep 5 + done + ) & + echo $! +} + +stall_limit=${LIVEKIT_TEST_STALL_SECONDS:-0} +log_file="${RUNNER_TEMP:-/tmp}/livekit-test-output.log" + +set +e +if ((stall_limit > 0)); then + : >"${log_file}" + run_test "$@" >"${log_file}" 2>&1 & + test_pid=$! + watchdog_pid=$(start_stall_watchdog "${test_pid}" "${log_file}" "${stall_limit}") + wait "${test_pid}" + status=$? 
+ kill "${watchdog_pid}" 2>/dev/null || true + wait "${watchdog_pid}" 2>/dev/null || true + cat "${log_file}" +else + run_test "$@" & + test_pid=$! + wait "${test_pid}" + status=$? +fi +set -e + +if ((status > 128)); then + signal=$((status - 128)) + echo "Test process ${test_pid} terminated by signal ${signal}" + dump_backtraces "${test_pid}" "${status}" +elif ((status != 0)); then + echo "Test process exited with status ${status}" +fi + +exit "${status}" diff --git a/.github/scripts/stage_crash_diagnostics.sh b/.github/scripts/stage_crash_diagnostics.sh new file mode 100755 index 00000000..47cb1835 --- /dev/null +++ b/.github/scripts/stage_crash_diagnostics.sh @@ -0,0 +1,47 @@ +#!/usr/bin/env bash +# Collect unstripped test binaries, shared libraries, and core dumps for upload. +set -euo pipefail + +build_dir=${1:?usage: stage_crash_diagnostics.sh } +staging="${RUNNER_TEMP}/crash-diagnostics" + +rm -rf "${staging}" +mkdir -p "${staging}/bin" "${staging}/lib" "${staging}/cores" + +shopt -s nullglob +for bin in "${build_dir}"/bin/livekit_*; do + if [[ -f "${bin}" && -x "${bin}" ]]; then + cp -a "${bin}" "${staging}/bin/" + fi +done + +for lib in "${build_dir}"/lib/liblivekit.*; do + if [[ -f "${lib}" ]]; then + cp -a "${lib}" "${staging}/lib/" + fi +done + +while IFS= read -r -d '' ffi_lib; do + cp -a "${ffi_lib}" "${staging}/lib/" +done < <(find client-sdk-rust/target/debug -name 'liblivekit_ffi.*' -print0 2>/dev/null) + +core_dir="${RUNNER_TEMP}/livekit-test-cores" +if [[ -d "${core_dir}" ]]; then + find "${core_dir}" -maxdepth 1 -name 'core.*' -type f -exec cp -a {} "${staging}/cores/" \; 2>/dev/null || true +fi + +if [[ "$(uname -s)" == "Darwin" && -d /cores ]]; then + find /cores -maxdepth 1 -name 'core.*' -type f -exec cp -a {} "${staging}/cores/" \; 2>/dev/null || true +fi + +if [[ "$(uname -s)" == "Darwin" ]]; then + mkdir -p "${staging}/crash-reports" + for report_dir in "${HOME}/Library/Logs/DiagnosticReports" "/Library/Logs/DiagnosticReports"; do + if [[ 
-d "${report_dir}" ]]; then + find "${report_dir}" -maxdepth 1 -name '*.ips' -type f -exec cp -a {} "${staging}/crash-reports/" \; 2>/dev/null || true + fi + done +fi + +echo "Staged crash diagnostics under ${staging}:" +find "${staging}" -type f -print diff --git a/.github/workflows/nightly.yml b/.github/workflows/nightly.yml new file mode 100644 index 00000000..2d701a8a --- /dev/null +++ b/.github/workflows/nightly.yml @@ -0,0 +1,180 @@ +name: Nightly + +# Scheduled Linux ASan/UBSan coverage for the integration + unit suites. +# The self-contained job below does not call the reusable build/test workflows +# so it can iterate independently of the main CI matrix. Focused, mac-only +# PlatformAudio crash hunting lives in platform-audio-triage.yml. + +on: + schedule: + - cron: "23 7 * * *" + workflow_dispatch: + +permissions: + contents: read + actions: read + +jobs: + sanitizer: + name: Sanitizer Checks + runs-on: ubuntu-latest + timeout-minutes: 90 + env: + CARGO_TERM_COLOR: always + CARGO_INCREMENTAL: "0" + RUST_BACKTRACE: full + ASAN_OPTIONS: detect_leaks=0:halt_on_error=1:symbolize=1:print_stacktrace=1 + UBSAN_OPTIONS: halt_on_error=1:print_stacktrace=1 + + steps: + - name: Checkout (with submodules) + uses: actions/checkout@df4cb1c069e1874edd31b4311f1884172cec0e10 # v6.0.3 + with: + submodules: recursive + fetch-depth: 1 + + - name: Pull LFS files + run: git lfs pull + + - name: Prepare CI test scripts + run: | + chmod +x .github/scripts/run_tests_with_backtrace.sh + chmod +x .github/scripts/stage_crash_diagnostics.sh + + - name: Install deps + run: | + set -eux + sudo apt-get update + sudo apt-get install -y \ + build-essential cmake ninja-build pkg-config \ + llvm-dev libclang-dev clang \ + libva-dev libdrm-dev libgbm-dev libx11-dev libgl1-mesa-dev \ + libxext-dev libxcomposite-dev libxdamage-dev libxfixes-dev \ + libxrandr-dev libxi-dev libxkbcommon-dev \ + libasound2-dev libpulse-dev \ + libssl-dev \ + libprotobuf-dev protobuf-compiler \ + libabsl-dev \ + 
libwayland-dev libdecor-0-dev \ + jq + + - name: Install Rust (stable) + uses: dtolnay/rust-toolchain@3c5f7ea28cd621ae0bf5283f0e981fb97b8a7af9 + with: + toolchain: stable + + - name: Set build environment + run: | + LLVM_VERSION=$(llvm-config --version | cut -d. -f1) + echo "LIBCLANG_PATH=/usr/lib/llvm-${LLVM_VERSION}/lib" >> "$GITHUB_ENV" + echo "CXXFLAGS=-Wno-deprecated-declarations -fno-omit-frame-pointer" >> "$GITHUB_ENV" + echo "CFLAGS=-Wno-deprecated-declarations -fno-omit-frame-pointer" >> "$GITHUB_ENV" + + - name: Configure sanitizer build + run: | + cmake --preset linux-debug-tests \ + -DCMAKE_C_FLAGS="-Wno-deprecated-declarations -fsanitize=address,undefined -fno-omit-frame-pointer" \ + -DCMAKE_CXX_FLAGS="-Wno-deprecated-declarations -fsanitize=address,undefined -fno-omit-frame-pointer" \ + -DCMAKE_EXE_LINKER_FLAGS="-fsanitize=address,undefined" \ + -DCMAKE_SHARED_LINKER_FLAGS="-fsanitize=address,undefined" + + - name: Build sanitizer tests + run: cmake --build build-debug --target livekit_unit_tests livekit_integration_tests --parallel 2 + + - name: Run sanitizer unit tests + timeout-minutes: 20 + run: | + .github/scripts/run_tests_with_backtrace.sh \ + build-debug/bin/livekit_unit_tests \ + --gtest_brief=1 \ + --gtest_output=xml:build-debug/sanitizer-unit-test-results.xml + + - name: Start livekit-server + id: livekit_server + uses: livekit/dev-server-action@61e2b4dcb170dd3591e0c9b0db3c3fe5db93b500 + continue-on-error: true + with: + github-token: ${{ github.token }} + + - name: Start livekit-server fallback + if: steps.livekit_server.outcome == 'failure' + id: livekit_server_fallback + shell: bash + env: + GH_TOKEN: ${{ github.token }} + run: | + set -euxo pipefail + tag="$( + gh api repos/livekit/livekit/releases \ + --jq 'limit(1; .[] | select([.assets[].name] | any(endswith("_linux_amd64.tar.gz"))) | .tag_name)' + )" + gh release download "${tag}" \ + --repo livekit/livekit \ + --pattern "*_linux_amd64.tar.gz" \ + --output 
"$RUNNER_TEMP/livekit-server-archive" + tar -xzf "$RUNNER_TEMP/livekit-server-archive" -C "$RUNNER_TEMP" + chmod +x "$RUNNER_TEMP/livekit-server" + cat > "$RUNNER_TEMP/livekit.yaml" <<'EOF' + logging: { json: true } + EOF + "$RUNNER_TEMP/livekit-server" --config "$RUNNER_TEMP/livekit.yaml" --dev > "$RUNNER_TEMP/livekit.jsonl" 2>&1 & + echo "log-path=$RUNNER_TEMP/livekit.jsonl" >> "$GITHUB_OUTPUT" + for i in $(seq 1 30); do + if [[ "$(curl -fsS http://localhost:7880/ || true)" == "OK" ]]; then + exit 0 + fi + sleep 1 + done + exit 1 + + - name: Install livekit-cli + shell: bash + run: curl -sSL https://get.livekit.io/cli | bash + + - name: Run sanitizer integration tests (lifecycle subset) + timeout-minutes: 30 + shell: bash + run: | + set -euo pipefail + source .token_helpers/set_data_track_test_tokens.bash + .github/scripts/run_tests_with_backtrace.sh \ + build-debug/bin/livekit_integration_tests \ + --gtest_filter='PlatformAudioIntegrationTest.*:DataTrackE2ETest.UnpublishUpdatesPublishedStateEndToEnd:DataTrackPayloads/DataTrackTransportTest.PublishesAndReceivesFramesEndToEnd/MultiPacket' \ + --gtest_repeat=10 \ + --gtest_recreate_environments_when_repeating=1 \ + --gtest_brief=1 \ + --gtest_output=xml:build-debug/sanitizer-integration-test-results.xml + + - name: Dump livekit-server log on failure + if: failure() + shell: bash + run: | + log_path="${{ steps.livekit_server.outputs.log-path }}" + if [[ -z "$log_path" ]]; then + log_path="${{ steps.livekit_server_fallback.outputs.log-path }}" + fi + tail -n 500 "$log_path" || true + + - name: Stage crash diagnostics + if: failure() + run: .github/scripts/stage_crash_diagnostics.sh build-debug + + - name: Upload sanitizer test results + if: always() + uses: actions/upload-artifact@043fb46d1a93c77aae656e7c1c64a875d1fc6a0a # v7.0.1 + with: + name: sanitizer-test-results + path: | + build-debug/sanitizer-unit-test-results.xml + build-debug/sanitizer-integration-test-results.xml + if-no-files-found: ignore + 
retention-days: 14 + + - name: Upload sanitizer crash diagnostics + if: failure() + uses: actions/upload-artifact@043fb46d1a93c77aae656e7c1c64a875d1fc6a0a # v7.0.1 + with: + name: sanitizer-crash-diagnostics + path: ${{ runner.temp }}/crash-diagnostics/ + if-no-files-found: ignore + retention-days: 14 diff --git a/.github/workflows/platform-audio-triage.yml b/.github/workflows/platform-audio-triage.yml new file mode 100644 index 00000000..06749396 --- /dev/null +++ b/.github/workflows/platform-audio-triage.yml @@ -0,0 +1,240 @@ +name: PlatformAudio Triage + +# Focused, mac-only crash-hunting harness for the PlatformAudio instability. +# It builds ONLY the integration test binary and runs the PlatformAudio cases +# in a tight, high-repeat loop with backtrace + crash-report capture. +# +# Two arms run so a failure is diagnostic, not just a red X: +# - repeat arm: the standard cases under --gtest_repeat. Each iteration calls +# livekit::shutdown() -> FFI dispose -> Arc drop -> +# AdmProxy::~AdmProxy() -> CoreAudio ADM Terminate(). This is the suspected +# crash path (full ADM teardown/recreate every iteration). +# - pinned arm: PinnedRuntimeRepeatedPublishStress holds one PlatformAudio for +# the whole test, so the ADM is created once and never terminated between +# cycles. If the repeat arm crashes but this stays green, the bug is in ADM +# teardown, not the steady media path. +# +# Trigger from the Actions tab (workflow_dispatch) and tune the inputs. The +# pull_request trigger is TEMPORARY for validating the workflow itself; remove +# it before merging. 
+ +on: + workflow_dispatch: + inputs: + runner: + description: "macOS runner (Intel = x64, where the instability reproduces)" + type: choice + default: macos-15-large + options: + - macos-15-large + - macos-13 + - macos-26-xlarge + repeat: + description: "gtest_repeat count for the dispose-each-iteration arm" + type: string + default: "100" + pin_iterations: + description: "PLATFORM_AUDIO_PIN_ITERATIONS for the pinned control arm" + type: string + default: "50" + ubsan: + description: "Build with UndefinedBehaviorSanitizer (ASan is incompatible with macOS CoreAudio)" + type: boolean + default: true + pull_request: + types: [opened, reopened, synchronize, ready_for_review] + branches: ["main"] + +permissions: + contents: read + actions: read + +jobs: + triage: + name: PlatformAudio Triage (${{ github.event.inputs.runner || 'macos-15-large' }}) + runs-on: ${{ github.event.inputs.runner || 'macos-15-large' }} + timeout-minutes: 90 + env: + CARGO_TERM_COLOR: always + CARGO_INCREMENTAL: "0" + RUST_BACKTRACE: full + UBSAN_OPTIONS: halt_on_error=1:print_stacktrace=1 + # On pull_request validation runs keep counts small; dispatch uses inputs. 
+ REPEAT: ${{ github.event.inputs.repeat || '3' }} + PIN_ITERATIONS: ${{ github.event.inputs.pin_iterations || '3' }} + USE_UBSAN: ${{ github.event.inputs.ubsan || 'true' }} + + steps: + - name: Checkout (with submodules) + uses: actions/checkout@df4cb1c069e1874edd31b4311f1884172cec0e10 # v6.0.3 + with: + submodules: recursive + fetch-depth: 1 + + - name: Pull LFS files + run: git lfs pull + + - name: Prepare CI test scripts + run: | + chmod +x .github/scripts/run_tests_with_backtrace.sh + chmod +x .github/scripts/stage_crash_diagnostics.sh + + - name: Install deps + run: | + set -eux + brew update + brew install cmake ninja protobuf abseil + + - name: Install Rust (stable) + uses: dtolnay/rust-toolchain@3c5f7ea28cd621ae0bf5283f0e981fb97b8a7af9 + with: + toolchain: stable + + - name: Set build environment + run: | + echo "CXXFLAGS=-fno-omit-frame-pointer" >> "$GITHUB_ENV" + echo "CFLAGS=-fno-omit-frame-pointer" >> "$GITHUB_ENV" + + - name: Configure build + run: | + if [[ "${USE_UBSAN}" == "true" ]]; then + cmake --preset macos-debug-tests \ + -DCMAKE_C_FLAGS="-fsanitize=undefined -fno-omit-frame-pointer" \ + -DCMAKE_CXX_FLAGS="-fsanitize=undefined -fno-omit-frame-pointer" \ + -DCMAKE_EXE_LINKER_FLAGS="-fsanitize=undefined" \ + -DCMAKE_SHARED_LINKER_FLAGS="-fsanitize=undefined" + else + cmake --preset macos-debug-tests + fi + + - name: Build integration tests only + run: cmake --build build-debug --target livekit_integration_tests --parallel 2 + + - name: Start livekit-server + id: livekit_server + uses: livekit/dev-server-action@61e2b4dcb170dd3591e0c9b0db3c3fe5db93b500 + continue-on-error: true + with: + github-token: ${{ github.token }} + + - name: Start livekit-server fallback + if: steps.livekit_server.outcome == 'failure' + id: livekit_server_fallback + shell: bash + run: | + set -euxo pipefail + brew install livekit + cat > "$RUNNER_TEMP/livekit.yaml" <<'EOF' + logging: { json: true } + EOF + livekit-server --config "$RUNNER_TEMP/livekit.yaml" --dev > 
"$RUNNER_TEMP/livekit.jsonl" 2>&1 & + echo "log-path=$RUNNER_TEMP/livekit.jsonl" >> "$GITHUB_OUTPUT" + for i in $(seq 1 30); do + if [[ "$(curl -fsS http://localhost:7880/ || true)" == "OK" ]]; then + exit 0 + fi + sleep 1 + done + exit 1 + + - name: Install livekit-cli + shell: bash + run: brew install livekit-cli + + # Arm A: full ADM teardown/recreate every iteration (suspected crash path). + - name: "Arm A — repeat (dispose+ADM Terminate each iteration)" + id: repeat_arm + continue-on-error: true + timeout-minutes: 40 + shell: bash + env: + RUST_LOG: info,livekit_ffi::server=debug,livekit_ffi::server::platform_audio=debug,livekit::platform_audio=debug + RUST_BACKTRACE: full + run: | + set -euo pipefail + source .token_helpers/set_data_track_test_tokens.bash + .github/scripts/run_tests_with_backtrace.sh \ + build-debug/bin/livekit_integration_tests \ + --gtest_filter='PlatformAudioIntegrationTest.*-PlatformAudioIntegrationTest.PinnedRuntimeRepeatedPublishStress' \ + --gtest_repeat="${REPEAT}" \ + --gtest_recreate_environments_when_repeating=1 \ + --gtest_break_on_failure \ + --gtest_output=xml:build-debug/triage-repeat-arm.xml + + # Arm B: control. Runtime/ADM pinned alive for the whole test. 
+ - name: "Arm B — pinned runtime (ADM created once, never terminated)" + id: pinned_arm + continue-on-error: true + timeout-minutes: 40 + shell: bash + env: + RUST_LOG: info,livekit_ffi::server=debug,livekit_ffi::server::platform_audio=debug,livekit::platform_audio=debug + RUST_BACKTRACE: full + PLATFORM_AUDIO_PIN_ITERATIONS: ${{ env.PIN_ITERATIONS }} + run: | + set -euo pipefail + source .token_helpers/set_data_track_test_tokens.bash + .github/scripts/run_tests_with_backtrace.sh \ + build-debug/bin/livekit_integration_tests \ + --gtest_filter='PlatformAudioIntegrationTest.PinnedRuntimeRepeatedPublishStress' \ + --gtest_break_on_failure \ + --gtest_output=xml:build-debug/triage-pinned-arm.xml + + - name: Dump livekit-server log on failure + if: failure() || steps.repeat_arm.outcome == 'failure' || steps.pinned_arm.outcome == 'failure' + shell: bash + run: | + log_path="${{ steps.livekit_server.outputs.log-path }}" + if [[ -z "$log_path" ]]; then + log_path="${{ steps.livekit_server_fallback.outputs.log-path }}" + fi + tail -n 500 "$log_path" || true + + - name: Stage crash diagnostics + if: steps.repeat_arm.outcome == 'failure' || steps.pinned_arm.outcome == 'failure' + run: .github/scripts/stage_crash_diagnostics.sh build-debug + + - name: Upload test results + if: always() + uses: actions/upload-artifact@043fb46d1a93c77aae656e7c1c64a875d1fc6a0a # v7.0.1 + with: + name: platform-audio-triage-results + path: | + build-debug/triage-repeat-arm.xml + build-debug/triage-pinned-arm.xml + if-no-files-found: ignore + retention-days: 14 + + - name: Upload crash diagnostics + if: steps.repeat_arm.outcome == 'failure' || steps.pinned_arm.outcome == 'failure' + uses: actions/upload-artifact@043fb46d1a93c77aae656e7c1c64a875d1fc6a0a # v7.0.1 + with: + name: platform-audio-triage-crash-diagnostics + path: ${{ runner.temp }}/crash-diagnostics/ + if-no-files-found: ignore + retention-days: 14 + + # Surface the diagnostic contrast and fail the job if either arm failed. 
+ - name: Triage summary + if: always() + shell: bash + run: | + repeat="${{ steps.repeat_arm.outcome }}" + pinned="${{ steps.pinned_arm.outcome }}" + { + echo "## PlatformAudio triage" + echo "" + echo "| Arm | Outcome |" + echo "| --- | --- |" + echo "| Repeat (dispose+ADM Terminate each iter) | ${repeat} |" + echo "| Pinned runtime (ADM never terminated) | ${pinned} |" + echo "" + if [[ "${repeat}" == "failure" && "${pinned}" == "success" ]]; then + echo "➡️ Repeat failed while pinned passed: consistent with an ADM teardown/recreate bug." + elif [[ "${repeat}" == "failure" && "${pinned}" == "failure" ]]; then + echo "➡️ Both arms failed: instability is not exclusive to ADM teardown." + elif [[ "${repeat}" == "success" && "${pinned}" == "success" ]]; then + echo "➡️ Both arms passed this run (did not reproduce). Try a higher repeat count." + fi + } >> "$GITHUB_STEP_SUMMARY" + [[ "${repeat}" == "success" && "${pinned}" == "success" ]] diff --git a/client-sdk-rust b/client-sdk-rust index 175cf276..ee0a86ac 160000 --- a/client-sdk-rust +++ b/client-sdk-rust @@ -1 +1 @@ -Subproject commit 175cf276a8aa6770dbc795404fa91dc55dd27f10 +Subproject commit ee0a86aca3b6fc91badf085b4a40255b77bdefe1 diff --git a/src/data_track_stream.cpp b/src/data_track_stream.cpp index 155f4788..6a777377 100644 --- a/src/data_track_stream.cpp +++ b/src/data_track_stream.cpp @@ -52,22 +52,28 @@ void DataTrackStream::init(FfiHandle subscription_handle) { bool DataTrackStream::read(DataTrackFrame& out) { proto::DataTrackStreamReadResponse read_response; bool missing_read_response = false; + std::uint64_t subscription_handle = 0; { const std::scoped_lock lock(mutex_); if (closed_ || eof_) { return false; } + subscription_handle = static_cast<std::uint64_t>(subscription_handle_.get()); + } - const auto subscription_handle = static_cast<std::uint64_t>(subscription_handle_.get()); + // Do not hold mutex_ across sendRequest: readFrameWithTimeout may call close() + // from another thread on timeout, and close() also needs mutex_. 
+ proto::FfiRequest req; + auto* msg = req.mutable_data_track_stream_read(); + msg->set_stream_handle(subscription_handle); + const proto::FfiResponse resp = FfiClient::instance().sendRequest(req); - // Signal the Rust side that we're ready to receive the next frame. - // The Rust SubscriptionTask uses a demand-driven protocol: it won't pull - // from the underlying stream until notified via this request. - proto::FfiRequest req; - auto* msg = req.mutable_data_track_stream_read(); - msg->set_stream_handle(subscription_handle); - const proto::FfiResponse resp = FfiClient::instance().sendRequest(req); + { + const std::scoped_lock lock(mutex_); + if (closed_ || eof_) { + return false; + } if (!resp.has_data_track_stream_read()) { missing_read_response = true; } else { diff --git a/src/room.cpp b/src/room.cpp index 3ad58938..71680389 100644 --- a/src/room.cpp +++ b/src/room.cpp @@ -436,7 +436,7 @@ void Room::onEvent(const FfiEvent& event) { if (event.message_case() == FfiEvent::kRpcMethodInvocation) { const auto& rpc = event.rpc_method_invocation(); - LocalParticipant* lp = nullptr; + std::shared_ptr<LocalParticipant> lp; { const std::scoped_lock guard(lock_); if (!local_participant_) { @@ -448,7 +448,7 @@ void Room::onEvent(const FfiEvent& event) { // RPC is not targeted at this room's local participant; ignore. return; } - lp = local_participant_.get(); + lp = local_participant_; } // Call outside the lock to avoid deadlocks / re-entrancy issues. 
diff --git a/src/tests/integration/test_platform_audio.cpp b/src/tests/integration/test_platform_audio.cpp index fe35c7b7..93942052 100644 --- a/src/tests/integration/test_platform_audio.cpp +++ b/src/tests/integration/test_platform_audio.cpp @@ -313,4 +313,88 @@ TEST_F(PlatformAudioIntegrationTest, PlatformAudioFramesReachRemote) { receiver_room->clearOnAudioFrameCallback(sender_identity, track_name); } +namespace { + +/// Run one publish/subscribe/unpublish cycle against a fresh pair of rooms, +/// reusing a caller-owned PlatformAudio so the underlying Rust LkRuntime (and +/// therefore the platform Audio Device Module) is never torn down between +/// cycles. Returns true if the receiver observed the published track. +bool runPlatformAudioCycle(PlatformAudio& platform_audio, const TestConfig& config, const std::string& track_name) { + RoomOptions options; + options.auto_subscribe = true; + + PlatformTrackState receiver_state; + PlatformTrackCollectorDelegate receiver_delegate(receiver_state); + + auto receiver_room = std::make_unique<Room>(); + receiver_room->setDelegate(&receiver_delegate); + if (!receiver_room->connect(config.url, config.token_b, options)) { + return false; + } + + auto sender_room = std::make_unique<Room>(); + if (!sender_room->connect(config.url, config.token_a, options)) { + return false; + } + + const auto source = platform_audio.createAudioSource(); + if (source == nullptr) { + return false; + } + + const auto track = LocalAudioTrack::createLocalAudioTrack(track_name, source); + if (track == nullptr) { + return false; + } + + TrackPublishOptions publish_options; + publish_options.source = TrackSource::SOURCE_MICROPHONE; + lockLocalParticipant(*sender_room)->publishTrack(track, publish_options); + + std::unique_lock lock(receiver_state.mutex); + return receiver_state.cv.wait_for(lock, kSubscriptionTimeout, + [&]() { return receiver_state.subscribed_audio_names.count(track_name) > 0; }); +} + +} // namespace + +// Control arm for the macOS PlatformAudio 
instability investigation. +// +// The standard PlatformAudioIntegrationTest cases each call livekit::shutdown() +// in TearDown(), which disposes the FFI server, drops the last Arc, +// and runs AdmProxy::~AdmProxy() -> platform_adm_->Terminate(). Under +// --gtest_repeat that means the native CoreAudio ADM is fully terminated and +// recreated on *every* iteration -- the suspected crash path. +// +// This test instead holds a single PlatformAudio alive for the whole test, so +// the runtime and ADM are created once and never terminated between cycles. It +// loops the same connect/publish/subscribe cycle PLATFORM_AUDIO_PIN_ITERATIONS +// times (default 20). If the repeat arm crashes on macOS but this pinned arm +// stays green, the instability is in ADM teardown/recreation, not the steady +// media path. +TEST_F(PlatformAudioIntegrationTest, PinnedRuntimeRepeatedPublishStress) { + EXPECT_TRUE(config_.available) << "Missing integration configuration"; + + std::unique_ptr<PlatformAudio> platform_audio; + try { + platform_audio = std::make_unique<PlatformAudio>(); + } catch (const PlatformAudioError& error) { + GTEST_SKIP() << "PlatformAudio unavailable: " << error.what(); + } + + int iterations = 20; + if (const char* env = std::getenv("PLATFORM_AUDIO_PIN_ITERATIONS")) { + const int parsed = std::atoi(env); + if (parsed > 0) { + iterations = parsed; + } + } + + for (int i = 0; i < iterations; ++i) { + const std::string track_name = "platform-mic-pinned-" + std::to_string(i); + const bool subscribed = runPlatformAudioCycle(*platform_audio, config_, track_name); + ASSERT_TRUE(subscribed) << "Receiver never subscribed on pinned iteration " << i; + } +} + } // namespace livekit::test From 528d7f20a8ff5fbe5e54b35892619942e26306a0 Mon Sep 17 00:00:00 2001 From: Alan George Date: Mon, 22 Jun 2026 21:54:45 -0600 Subject: [PATCH 02/10] Focus PlatformAudio triage workflow; remove nightly harness Drop nightly.yml (superseded by platform-audio-triage.yml). 
Make the triage workflow the single focused, mac-only crash-hunting tool: - Remove the temporary pull_request trigger (workflow_dispatch only) so it stops doing a ~20-minute build on every PR push. - Cache the Rust submodule build (Swatinem/rust-cache) to skip the cold build on re-runs. - Raise dispatch defaults to repeat=500 / pin_iterations=200 now that the test loop is confirmed cheap relative to the build. Co-authored-by: Cursor --- .github/workflows/nightly.yml | 180 -------------------- .github/workflows/platform-audio-triage.yml | 22 +-- 2 files changed, 11 insertions(+), 191 deletions(-) delete mode 100644 .github/workflows/nightly.yml diff --git a/.github/workflows/nightly.yml b/.github/workflows/nightly.yml deleted file mode 100644 index 2d701a8a..00000000 --- a/.github/workflows/nightly.yml +++ /dev/null @@ -1,180 +0,0 @@ -name: Nightly - -# Scheduled Linux ASan/UBSan coverage for the integration + unit suites. -# The self-contained job below does not call the reusable build/test workflows -# so it can iterate independently of the main CI matrix. Focused, mac-only -# PlatformAudio crash hunting lives in platform-audio-triage.yml. 
- -on: - schedule: - - cron: "23 7 * * *" - workflow_dispatch: - -permissions: - contents: read - actions: read - -jobs: - sanitizer: - name: Sanitizer Checks - runs-on: ubuntu-latest - timeout-minutes: 90 - env: - CARGO_TERM_COLOR: always - CARGO_INCREMENTAL: "0" - RUST_BACKTRACE: full - ASAN_OPTIONS: detect_leaks=0:halt_on_error=1:symbolize=1:print_stacktrace=1 - UBSAN_OPTIONS: halt_on_error=1:print_stacktrace=1 - - steps: - - name: Checkout (with submodules) - uses: actions/checkout@df4cb1c069e1874edd31b4311f1884172cec0e10 # v6.0.3 - with: - submodules: recursive - fetch-depth: 1 - - - name: Pull LFS files - run: git lfs pull - - - name: Prepare CI test scripts - run: | - chmod +x .github/scripts/run_tests_with_backtrace.sh - chmod +x .github/scripts/stage_crash_diagnostics.sh - - - name: Install deps - run: | - set -eux - sudo apt-get update - sudo apt-get install -y \ - build-essential cmake ninja-build pkg-config \ - llvm-dev libclang-dev clang \ - libva-dev libdrm-dev libgbm-dev libx11-dev libgl1-mesa-dev \ - libxext-dev libxcomposite-dev libxdamage-dev libxfixes-dev \ - libxrandr-dev libxi-dev libxkbcommon-dev \ - libasound2-dev libpulse-dev \ - libssl-dev \ - libprotobuf-dev protobuf-compiler \ - libabsl-dev \ - libwayland-dev libdecor-0-dev \ - jq - - - name: Install Rust (stable) - uses: dtolnay/rust-toolchain@3c5f7ea28cd621ae0bf5283f0e981fb97b8a7af9 - with: - toolchain: stable - - - name: Set build environment - run: | - LLVM_VERSION=$(llvm-config --version | cut -d. 
-f1) - echo "LIBCLANG_PATH=/usr/lib/llvm-${LLVM_VERSION}/lib" >> "$GITHUB_ENV" - echo "CXXFLAGS=-Wno-deprecated-declarations -fno-omit-frame-pointer" >> "$GITHUB_ENV" - echo "CFLAGS=-Wno-deprecated-declarations -fno-omit-frame-pointer" >> "$GITHUB_ENV" - - - name: Configure sanitizer build - run: | - cmake --preset linux-debug-tests \ - -DCMAKE_C_FLAGS="-Wno-deprecated-declarations -fsanitize=address,undefined -fno-omit-frame-pointer" \ - -DCMAKE_CXX_FLAGS="-Wno-deprecated-declarations -fsanitize=address,undefined -fno-omit-frame-pointer" \ - -DCMAKE_EXE_LINKER_FLAGS="-fsanitize=address,undefined" \ - -DCMAKE_SHARED_LINKER_FLAGS="-fsanitize=address,undefined" - - - name: Build sanitizer tests - run: cmake --build build-debug --target livekit_unit_tests livekit_integration_tests --parallel 2 - - - name: Run sanitizer unit tests - timeout-minutes: 20 - run: | - .github/scripts/run_tests_with_backtrace.sh \ - build-debug/bin/livekit_unit_tests \ - --gtest_brief=1 \ - --gtest_output=xml:build-debug/sanitizer-unit-test-results.xml - - - name: Start livekit-server - id: livekit_server - uses: livekit/dev-server-action@61e2b4dcb170dd3591e0c9b0db3c3fe5db93b500 - continue-on-error: true - with: - github-token: ${{ github.token }} - - - name: Start livekit-server fallback - if: steps.livekit_server.outcome == 'failure' - id: livekit_server_fallback - shell: bash - env: - GH_TOKEN: ${{ github.token }} - run: | - set -euxo pipefail - tag="$( - gh api repos/livekit/livekit/releases \ - --jq 'limit(1; .[] | select([.assets[].name] | any(endswith("_linux_amd64.tar.gz"))) | .tag_name)' - )" - gh release download "${tag}" \ - --repo livekit/livekit \ - --pattern "*_linux_amd64.tar.gz" \ - --output "$RUNNER_TEMP/livekit-server-archive" - tar -xzf "$RUNNER_TEMP/livekit-server-archive" -C "$RUNNER_TEMP" - chmod +x "$RUNNER_TEMP/livekit-server" - cat > "$RUNNER_TEMP/livekit.yaml" <<'EOF' - logging: { json: true } - EOF - "$RUNNER_TEMP/livekit-server" --config 
"$RUNNER_TEMP/livekit.yaml" --dev > "$RUNNER_TEMP/livekit.jsonl" 2>&1 & - echo "log-path=$RUNNER_TEMP/livekit.jsonl" >> "$GITHUB_OUTPUT" - for i in $(seq 1 30); do - if [[ "$(curl -fsS http://localhost:7880/ || true)" == "OK" ]]; then - exit 0 - fi - sleep 1 - done - exit 1 - - - name: Install livekit-cli - shell: bash - run: curl -sSL https://get.livekit.io/cli | bash - - - name: Run sanitizer integration tests (lifecycle subset) - timeout-minutes: 30 - shell: bash - run: | - set -euo pipefail - source .token_helpers/set_data_track_test_tokens.bash - .github/scripts/run_tests_with_backtrace.sh \ - build-debug/bin/livekit_integration_tests \ - --gtest_filter='PlatformAudioIntegrationTest.*:DataTrackE2ETest.UnpublishUpdatesPublishedStateEndToEnd:DataTrackPayloads/DataTrackTransportTest.PublishesAndReceivesFramesEndToEnd/MultiPacket' \ - --gtest_repeat=10 \ - --gtest_recreate_environments_when_repeating=1 \ - --gtest_brief=1 \ - --gtest_output=xml:build-debug/sanitizer-integration-test-results.xml - - - name: Dump livekit-server log on failure - if: failure() - shell: bash - run: | - log_path="${{ steps.livekit_server.outputs.log-path }}" - if [[ -z "$log_path" ]]; then - log_path="${{ steps.livekit_server_fallback.outputs.log-path }}" - fi - tail -n 500 "$log_path" || true - - - name: Stage crash diagnostics - if: failure() - run: .github/scripts/stage_crash_diagnostics.sh build-debug - - - name: Upload sanitizer test results - if: always() - uses: actions/upload-artifact@043fb46d1a93c77aae656e7c1c64a875d1fc6a0a # v7.0.1 - with: - name: sanitizer-test-results - path: | - build-debug/sanitizer-unit-test-results.xml - build-debug/sanitizer-integration-test-results.xml - if-no-files-found: ignore - retention-days: 14 - - - name: Upload sanitizer crash diagnostics - if: failure() - uses: actions/upload-artifact@043fb46d1a93c77aae656e7c1c64a875d1fc6a0a # v7.0.1 - with: - name: sanitizer-crash-diagnostics - path: ${{ runner.temp }}/crash-diagnostics/ - if-no-files-found: 
ignore - retention-days: 14 diff --git a/.github/workflows/platform-audio-triage.yml b/.github/workflows/platform-audio-triage.yml index 06749396..721953b2 100644 --- a/.github/workflows/platform-audio-triage.yml +++ b/.github/workflows/platform-audio-triage.yml @@ -14,9 +14,7 @@ name: PlatformAudio Triage # cycles. If the repeat arm crashes but this stays green, the bug is in ADM # teardown, not the steady media path. # -# Trigger from the Actions tab (workflow_dispatch) and tune the inputs. The -# pull_request trigger is TEMPORARY for validating the workflow itself; remove -# it before merging. +# Trigger from the Actions tab (workflow_dispatch) and tune the inputs. on: workflow_dispatch: @@ -32,18 +30,15 @@ on: repeat: description: "gtest_repeat count for the dispose-each-iteration arm" type: string - default: "100" + default: "500" pin_iterations: description: "PLATFORM_AUDIO_PIN_ITERATIONS for the pinned control arm" type: string - default: "50" + default: "200" ubsan: description: "Build with UndefinedBehaviorSanitizer (ASan is incompatible with macOS CoreAudio)" type: boolean default: true - pull_request: - types: [opened, reopened, synchronize, ready_for_review] - branches: ["main"] permissions: contents: read @@ -59,9 +54,8 @@ jobs: CARGO_INCREMENTAL: "0" RUST_BACKTRACE: full UBSAN_OPTIONS: halt_on_error=1:print_stacktrace=1 - # On pull_request validation runs keep counts small; dispatch uses inputs. - REPEAT: ${{ github.event.inputs.repeat || '3' }} - PIN_ITERATIONS: ${{ github.event.inputs.pin_iterations || '3' }} + REPEAT: ${{ github.event.inputs.repeat || '500' }} + PIN_ITERATIONS: ${{ github.event.inputs.pin_iterations || '200' }} USE_UBSAN: ${{ github.event.inputs.ubsan || 'true' }} steps: @@ -90,6 +84,12 @@ jobs: with: toolchain: stable + # Cache the Rust submodule build so re-runs skip the ~20-minute cold build. 
+ - name: Cache Rust build + uses: Swatinem/rust-cache@e18b497796c12c097a38f9edb9d0641fb99eee32 # v2 + with: + workspaces: client-sdk-rust + - name: Set build environment run: | echo "CXXFLAGS=-fno-omit-frame-pointer" >> "$GITHUB_ENV" From 493fd226e9b907cf38c5434188a3eea7ec6d7c61 Mon Sep 17 00:00:00 2001 From: Alan George Date: Tue, 23 Jun 2026 09:19:58 -0600 Subject: [PATCH 03/10] Add resource sampler to PlatformAudio triage harness Instrument both triage arms with a background sampler that records RSS, thread count, fd count, and mach-port count of the integration-test process over time. Mach-port growth is the tell for a CoreAudio HAL client leak across ADM dispose/recreate cycles. CSVs are uploaded as artifacts and first/last deltas are surfaced in the job summary. Also lower the default repeat from 500 to 200 so Arm A finishes and yields a full leak curve instead of timing out. Co-authored-by: Cursor --- .github/scripts/sample_process_resources.sh | 75 +++++++++++++++++++++ .github/workflows/platform-audio-triage.yml | 49 +++++++++++++- 2 files changed, 122 insertions(+), 2 deletions(-) create mode 100755 .github/scripts/sample_process_resources.sh diff --git a/.github/scripts/sample_process_resources.sh b/.github/scripts/sample_process_resources.sh new file mode 100755 index 00000000..da94296d --- /dev/null +++ b/.github/scripts/sample_process_resources.sh @@ -0,0 +1,75 @@ +#!/usr/bin/env bash +# Periodically sample resource usage of a process matched by name and emit CSV. +# +# Tracks the metrics that reveal a native teardown/recreate leak in the +# PlatformAudio triage: resident memory, OS thread count, open file +# descriptors, and (macOS) mach-port count. Mach ports are the tell for a +# CoreAudio HAL client leak -- each ADM Init talks to coreaudiod over a mach +# port, so a port count that climbs across dispose/recreate cycles points at an +# ADM Terminate() that is not fully releasing HAL resources. 
+# +# Self-terminates when the target process exits, so it can be safely launched +# in the background ahead of a test run. +set -uo pipefail + +pattern=${1:?usage: sample_process_resources.sh <pattern> <out.csv> [interval_sec]} +out=${2:?usage: sample_process_resources.sh <pattern> <out.csv> [interval_sec]} +interval=${3:-3} + +echo "iso_time,elapsed_s,pid,rss_kb,threads,fds,mach_ports" > "${out}" + +# Wait up to 60s for the process to appear. +pid="" +for _ in $(seq 1 60); do + pid=$(pgrep -f "${pattern}" | head -1 || true) + [[ -n "${pid}" ]] && break + sleep 1 +done +if [[ -z "${pid}" ]]; then + echo "sampler: process matching '${pattern}' never appeared" >&2 + exit 0 +fi + +is_macos=0 +[[ "$(uname -s)" == "Darwin" ]] && is_macos=1 + +# mach-port counting needs lsmp + root to inspect another task. GitHub macOS +# runners allow passwordless sudo; `sudo -n` fails fast (no prompt) elsewhere, +# in which case the mach_ports column is left blank rather than a misleading 0. +mach_ports_cmd="" +if (( is_macos )) && command -v lsmp >/dev/null 2>&1; then + if lsmp -p "$$" >/dev/null 2>&1; then + mach_ports_cmd="lsmp -p" + elif sudo -n true >/dev/null 2>&1; then + mach_ports_cmd="sudo -n lsmp -p" + fi +fi + +start=$(date +%s) +while kill -0 "${pid}" 2>/dev/null; do + now=$(date +%s) + elapsed=$((now - start)) + ts=$(date -u +%Y-%m-%dT%H:%M:%SZ) + + rss=$(ps -o rss= -p "${pid}" 2>/dev/null | tr -d ' ') + + if (( is_macos )); then + # macOS: ps -M lists one line per thread (plus a header line). + threads=$(ps -M -p "${pid}" 2>/dev/null | tail -n +2 | grep -c . || true) + if [[ -n "${mach_ports_cmd}" ]]; then + mach_ports=$(${mach_ports_cmd} "${pid}" 2>/dev/null | grep -c -E 'send|recv|port set|dead' || true) + else + mach_ports="" + fi + else + threads=$(ps -o nlwp= -p "${pid}" 2>/dev/null | tr -d ' ') + mach_ports="" + fi + + fds=$(lsof -p "${pid}" 2>/dev/null | tail -n +2 | grep -c . 
|| true) + + echo "${ts},${elapsed},${pid},${rss:-},${threads:-},${fds:-},${mach_ports:-}" >> "${out}" + sleep "${interval}" +done + +echo "sampler: process ${pid} exited; samples written to ${out}" >&2 diff --git a/.github/workflows/platform-audio-triage.yml b/.github/workflows/platform-audio-triage.yml index 721953b2..8598eccd 100644 --- a/.github/workflows/platform-audio-triage.yml +++ b/.github/workflows/platform-audio-triage.yml @@ -30,7 +30,7 @@ on: repeat: description: "gtest_repeat count for the dispose-each-iteration arm" type: string - default: "500" + default: "200" pin_iterations: description: "PLATFORM_AUDIO_PIN_ITERATIONS for the pinned control arm" type: string @@ -54,7 +54,7 @@ jobs: CARGO_INCREMENTAL: "0" RUST_BACKTRACE: full UBSAN_OPTIONS: halt_on_error=1:print_stacktrace=1 - REPEAT: ${{ github.event.inputs.repeat || '500' }} + REPEAT: ${{ github.event.inputs.repeat || '200' }} PIN_ITERATIONS: ${{ github.event.inputs.pin_iterations || '200' }} USE_UBSAN: ${{ github.event.inputs.ubsan || 'true' }} @@ -72,6 +72,7 @@ jobs: run: | chmod +x .github/scripts/run_tests_with_backtrace.sh chmod +x .github/scripts/stage_crash_diagnostics.sh + chmod +x .github/scripts/sample_process_resources.sh - name: Install deps run: | @@ -153,6 +154,9 @@ jobs: run: | set -euo pipefail source .token_helpers/set_data_track_test_tokens.bash + # Sample process resources in the background; it self-exits with the test. + .github/scripts/sample_process_resources.sh \ + livekit_integration_tests "$RUNNER_TEMP/resources-arm-a.csv" 3 & .github/scripts/run_tests_with_backtrace.sh \ build-debug/bin/livekit_integration_tests \ --gtest_filter='PlatformAudioIntegrationTest.*-PlatformAudioIntegrationTest.PinnedRuntimeRepeatedPublishStress' \ @@ -174,6 +178,9 @@ jobs: run: | set -euo pipefail source .token_helpers/set_data_track_test_tokens.bash + # Sample process resources in the background; it self-exits with the test. 
+ .github/scripts/sample_process_resources.sh \ + livekit_integration_tests "$RUNNER_TEMP/resources-arm-b.csv" 3 & .github/scripts/run_tests_with_backtrace.sh \ build-debug/bin/livekit_integration_tests \ --gtest_filter='PlatformAudioIntegrationTest.PinnedRuntimeRepeatedPublishStress' \ @@ -214,6 +221,17 @@ jobs: if-no-files-found: ignore retention-days: 14 + - name: Upload resource samples + if: always() + uses: actions/upload-artifact@043fb46d1a93c77aae656e7c1c64a875d1fc6a0a # v7.0.1 + with: + name: platform-audio-triage-resource-samples + path: | + ${{ runner.temp }}/resources-arm-a.csv + ${{ runner.temp }}/resources-arm-b.csv + if-no-files-found: ignore + retention-days: 14 + # Surface the diagnostic contrast and fail the job if either arm failed. - name: Triage summary if: always() @@ -221,6 +239,23 @@ jobs: run: | repeat="${{ steps.repeat_arm.outcome }}" pinned="${{ steps.pinned_arm.outcome }}" + + # Print first vs last resource sample so the leak curve is visible inline. + resource_delta() { + local csv=$1 + [[ -f "${csv}" ]] || { echo "| (no samples) | | | | |"; return; } + local first last + first=$(tail -n +2 "${csv}" | head -1) + last=$(tail -n 1 "${csv}") + [[ -n "${first}" && "${first}" != "${last}" ]] || { echo "| (insufficient samples) | | | | |"; return; } + IFS=, read -r _ _ _ frss fthreads ffds fports <<< "${first}" + IFS=, read -r _ le _ lrss lthreads lfds lports <<< "${last}" + echo "| rss_kb | ${frss} | ${lrss} | over ${le}s |" + echo "| threads | ${fthreads} | ${lthreads} | |" + echo "| fds | ${ffds} | ${lfds} | |" + echo "| mach_ports | ${fports} | ${lports} | |" + } + { echo "## PlatformAudio triage" echo "" @@ -236,5 +271,15 @@ jobs: elif [[ "${repeat}" == "success" && "${pinned}" == "success" ]]; then echo "➡️ Both arms passed this run (did not reproduce). Try a higher repeat count." 
fi + echo "" + echo "### Resource growth — Arm A (dispose each iteration)" + echo "| metric | first | last | note |" + echo "| --- | --- | --- | --- |" + resource_delta "$RUNNER_TEMP/resources-arm-a.csv" + echo "" + echo "### Resource growth — Arm B (pinned)" + echo "| metric | first | last | note |" + echo "| --- | --- | --- | --- |" + resource_delta "$RUNNER_TEMP/resources-arm-b.csv" } >> "$GITHUB_STEP_SUMMARY" [[ "${repeat}" == "success" && "${pinned}" == "success" ]] From f2280c35ad230e65ac7efb2909a0344dd1cd3f84 Mon Sep 17 00:00:00 2001 From: Alan George Date: Tue, 23 Jun 2026 10:42:31 -0600 Subject: [PATCH 04/10] Fix triage sampler PID selection; add isolated frame-flow arm The resource sampler matched its own command line (argv contains the process pattern) and picked it via head -1, so it measured an idle 1-thread shell instead of the test binary. Select the matching PID with the largest RSS instead, excluding the sampler's own PID, so it tracks the real instrumented binary. Drop --gtest_break_on_failure from the triage arms: it converted ordinary EXPECT failures (e.g. "no platform audio frames received") into SIGTRAP core dumps (a misleading ~2GB artifact) and halted the repeat loop before the sampler could capture the full curve. Add Arm C, which runs only PlatformAudioFramesReachRemote with a small repeat, to distinguish "frame flow dead on a fresh ADM" from "frame flow only dies after prior teardown/recreate cycles churn the ADM". 
Co-authored-by: Cursor --- .github/scripts/sample_process_resources.sh | 32 ++++++++++-- .github/workflows/platform-audio-triage.yml | 56 +++++++++++++++++---- 2 files changed, 74 insertions(+), 14 deletions(-) diff --git a/.github/scripts/sample_process_resources.sh b/.github/scripts/sample_process_resources.sh index da94296d..68be3732 100755 --- a/.github/scripts/sample_process_resources.sh +++ b/.github/scripts/sample_process_resources.sh @@ -18,17 +18,41 @@ interval=${3:-3} echo "iso_time,elapsed_s,pid,rss_kb,threads,fds,mach_ports" > "${out}" -# Wait up to 60s for the process to appear. +# Resolve the target PID. `pgrep -f` also matches this script (its own argv +# contains the pattern) and the run_tests wrapper shell, so picking head -1 grabs +# the wrong, idle process. Instead choose the matching PID with the largest RSS: +# the instrumented test binary uses orders of magnitude more memory than any +# shell, which disambiguates it reliably. Exclude this script's own PID. +self=$$ +RSS_THRESHOLD_KB=${SAMPLER_RSS_THRESHOLD_KB:-50000} + +pick_target() { + local best="" best_rss=0 p rss + for p in $(pgrep -f "${pattern}" 2>/dev/null); do + [[ "${p}" == "${self}" ]] && continue + rss=$(ps -o rss= -p "${p}" 2>/dev/null | tr -d ' ') + [[ -z "${rss}" ]] && continue + if (( rss > best_rss )); then best_rss=${rss}; best=${p}; fi + done + echo "${best} ${best_rss}" +} + +# Wait up to 120s for the real binary (RSS over threshold) to come up. Fall back +# to the largest match seen if nothing crosses the threshold before timeout. 
pid="" -for _ in $(seq 1 60); do - pid=$(pgrep -f "${pattern}" | head -1 || true) - [[ -n "${pid}" ]] && break +for _ in $(seq 1 120); do + read -r cand cand_rss <<< "$(pick_target)" + if [[ -n "${cand}" ]]; then + pid=${cand} + (( cand_rss >= RSS_THRESHOLD_KB )) && break + fi sleep 1 done if [[ -z "${pid}" ]]; then echo "sampler: process matching '${pattern}' never appeared" >&2 exit 0 fi +echo "sampler: tracking pid ${pid} (pattern '${pattern}')" >&2 is_macos=0 [[ "$(uname -s)" == "Darwin" ]] && is_macos=1 diff --git a/.github/workflows/platform-audio-triage.yml b/.github/workflows/platform-audio-triage.yml index 8598eccd..c0fcee83 100644 --- a/.github/workflows/platform-audio-triage.yml +++ b/.github/workflows/platform-audio-triage.yml @@ -162,7 +162,6 @@ jobs: --gtest_filter='PlatformAudioIntegrationTest.*-PlatformAudioIntegrationTest.PinnedRuntimeRepeatedPublishStress' \ --gtest_repeat="${REPEAT}" \ --gtest_recreate_environments_when_repeating=1 \ - --gtest_break_on_failure \ --gtest_output=xml:build-debug/triage-repeat-arm.xml # Arm B: control. Runtime/ADM pinned alive for the whole test. @@ -184,11 +183,35 @@ jobs: .github/scripts/run_tests_with_backtrace.sh \ build-debug/bin/livekit_integration_tests \ --gtest_filter='PlatformAudioIntegrationTest.PinnedRuntimeRepeatedPublishStress' \ - --gtest_break_on_failure \ --gtest_output=xml:build-debug/triage-pinned-arm.xml + # Arm C: isolates the frame-flow check. Running only this test with a small + # repeat (fresh ADM recreated each iteration, no sibling tests churning it + # first) distinguishes "frame flow is dead even on a fresh ADM" from "frame + # flow only dies after prior teardown/recreate cycles". 
+ - name: "Arm C — frame-flow only (fresh ADM each iteration, isolated)" + id: frames_arm + continue-on-error: true + timeout-minutes: 30 + shell: bash + env: + RUST_LOG: info,livekit_ffi::server=debug,livekit_ffi::server::platform_audio=debug,livekit::platform_audio=debug + RUST_BACKTRACE: full + run: | + set -euo pipefail + source .token_helpers/set_data_track_test_tokens.bash + # Sample process resources in the background; it self-exits with the test. + .github/scripts/sample_process_resources.sh \ + livekit_integration_tests "$RUNNER_TEMP/resources-arm-c.csv" 3 & + .github/scripts/run_tests_with_backtrace.sh \ + build-debug/bin/livekit_integration_tests \ + --gtest_filter='PlatformAudioIntegrationTest.PlatformAudioFramesReachRemote' \ + --gtest_repeat=5 \ + --gtest_recreate_environments_when_repeating=1 \ + --gtest_output=xml:build-debug/triage-frames-arm.xml + - name: Dump livekit-server log on failure - if: failure() || steps.repeat_arm.outcome == 'failure' || steps.pinned_arm.outcome == 'failure' + if: failure() || steps.repeat_arm.outcome == 'failure' || steps.pinned_arm.outcome == 'failure' || steps.frames_arm.outcome == 'failure' shell: bash run: | log_path="${{ steps.livekit_server.outputs.log-path }}" @@ -198,7 +221,7 @@ jobs: tail -n 500 "$log_path" || true - name: Stage crash diagnostics - if: steps.repeat_arm.outcome == 'failure' || steps.pinned_arm.outcome == 'failure' + if: steps.repeat_arm.outcome == 'failure' || steps.pinned_arm.outcome == 'failure' || steps.frames_arm.outcome == 'failure' run: .github/scripts/stage_crash_diagnostics.sh build-debug - name: Upload test results @@ -209,11 +232,12 @@ jobs: path: | build-debug/triage-repeat-arm.xml build-debug/triage-pinned-arm.xml + build-debug/triage-frames-arm.xml if-no-files-found: ignore retention-days: 14 - name: Upload crash diagnostics - if: steps.repeat_arm.outcome == 'failure' || steps.pinned_arm.outcome == 'failure' + if: steps.repeat_arm.outcome == 'failure' || steps.pinned_arm.outcome == 
'failure' || steps.frames_arm.outcome == 'failure' uses: actions/upload-artifact@043fb46d1a93c77aae656e7c1c64a875d1fc6a0a # v7.0.1 with: name: platform-audio-triage-crash-diagnostics @@ -229,6 +253,7 @@ jobs: path: | ${{ runner.temp }}/resources-arm-a.csv ${{ runner.temp }}/resources-arm-b.csv + ${{ runner.temp }}/resources-arm-c.csv if-no-files-found: ignore retention-days: 14 @@ -239,6 +264,7 @@ jobs: run: | repeat="${{ steps.repeat_arm.outcome }}" pinned="${{ steps.pinned_arm.outcome }}" + frames="${{ steps.frames_arm.outcome }}" # Print first vs last resource sample so the leak curve is visible inline. resource_delta() { @@ -261,17 +287,22 @@ jobs: echo "" echo "| Arm | Outcome |" echo "| --- | --- |" - echo "| Repeat (dispose+ADM Terminate each iter) | ${repeat} |" - echo "| Pinned runtime (ADM never terminated) | ${pinned} |" + echo "| A — Repeat (dispose+ADM Terminate each iter) | ${repeat} |" + echo "| B — Pinned runtime (ADM never terminated) | ${pinned} |" + echo "| C — Frame-flow only (fresh ADM each iter) | ${frames} |" echo "" if [[ "${repeat}" == "failure" && "${pinned}" == "success" ]]; then echo "➡️ Repeat failed while pinned passed: consistent with an ADM teardown/recreate bug." elif [[ "${repeat}" == "failure" && "${pinned}" == "failure" ]]; then - echo "➡️ Both arms failed: instability is not exclusive to ADM teardown." + echo "➡️ Both repeat and pinned failed: instability is not exclusive to ADM teardown." elif [[ "${repeat}" == "success" && "${pinned}" == "success" ]]; then - echo "➡️ Both arms passed this run (did not reproduce). Try a higher repeat count." + echo "➡️ Repeat and pinned both passed this run (did not reproduce). Try a higher repeat count." 
fi echo "" + echo "_Arm C (frame-flow isolated): if iteration 1 passes but later iterations fail," + echo "frame delivery degrades with each fresh ADM recreate; if it fails on iteration 1," + echo "frame flow is broken even on a first/fresh ADM on this runner._" + echo "" echo "### Resource growth — Arm A (dispose each iteration)" echo "| metric | first | last | note |" echo "| --- | --- | --- | --- |" @@ -281,5 +312,10 @@ jobs: echo "| metric | first | last | note |" echo "| --- | --- | --- | --- |" resource_delta "$RUNNER_TEMP/resources-arm-b.csv" + echo "" + echo "### Resource growth — Arm C (frame-flow isolated)" + echo "| metric | first | last | note |" + echo "| --- | --- | --- | --- |" + resource_delta "$RUNNER_TEMP/resources-arm-c.csv" } >> "$GITHUB_STEP_SUMMARY" - [[ "${repeat}" == "success" && "${pinned}" == "success" ]] + [[ "${repeat}" == "success" && "${pinned}" == "success" && "${frames}" == "success" ]] From e6237932666c3811ff2799f100230dbf890f04f7 Mon Sep 17 00:00:00 2001 From: Alan George Date: Tue, 23 Jun 2026 10:46:16 -0600 Subject: [PATCH 05/10] Run PlatformAudio triage across both macOS architectures The instability reproduces on Apple Silicon too (arm64 integration tests have been seen to SIGSEGV, exit 139), not just Intel x64. Replace the single-runner input with a matrix that fans "all" out across one Intel (macos-15-large) and one arm64 (macos-15) runner by default, while still allowing a single runner to be targeted. Artifact names are suffixed with the runner so the parallel arms don't collide on upload. 
Co-authored-by: Cursor --- .github/workflows/platform-audio-triage.yml | 52 ++++++++++++++++----- 1 file changed, 41 insertions(+), 11 deletions(-) diff --git a/.github/workflows/platform-audio-triage.yml b/.github/workflows/platform-audio-triage.yml index c0fcee83..a773fb38 100644 --- a/.github/workflows/platform-audio-triage.yml +++ b/.github/workflows/platform-audio-triage.yml @@ -4,6 +4,10 @@ name: PlatformAudio Triage # It builds ONLY the integration test binary and runs the PlatformAudio cases # in a tight, high-repeat loop with backtrace + crash-report capture. # +# The instability reproduces on BOTH macOS architectures (Intel x64 hits an +# assertion timeout / no-frames; Apple Silicon arm64 has been seen to SIGSEGV), +# so by default this fans out across one Intel and one arm64 runner. +# # Two arms run so a failure is diagnostic, not just a red X: # - repeat arm: the standard cases under --gtest_repeat. Each iteration calls # livekit::shutdown() -> FFI dispose -> Arc drop -> @@ -19,14 +23,17 @@ name: PlatformAudio Triage on: workflow_dispatch: inputs: - runner: - description: "macOS runner (Intel = x64, where the instability reproduces)" + runners: + description: "Which macOS runners to triage (the instability reproduces on both Intel x64 and Apple Silicon arm64)" type: choice - default: macos-15-large + default: all options: - - macos-15-large - - macos-13 - - macos-26-xlarge + - all # both arches: macos-15-large (x64) + macos-15 (arm64) + - macos-15-large # Intel x64 + - macos-15 # Apple Silicon arm64 + - macos-13 # Intel x64 (older) + - macos-14 # Apple Silicon arm64 (older) + - macos-26-xlarge # Apple Silicon arm64 (newer, larger) repeat: description: "gtest_repeat count for the dispose-each-iteration arm" type: string @@ -45,9 +52,32 @@ permissions: actions: read jobs: + # Expand the runner selection into a matrix. 
"all" fans out across both + # architectures so a single dispatch confirms whether the instability + # reproduces on Intel x64 and Apple Silicon arm64. + prepare: + name: Resolve runner matrix + runs-on: ubuntu-latest + outputs: + matrix: ${{ steps.set.outputs.matrix }} + steps: + - id: set + shell: bash + run: | + sel="${{ github.event.inputs.runners || 'all' }}" + if [[ "${sel}" == "all" ]]; then + echo 'matrix={"runner":["macos-15-large","macos-15"]}' >> "$GITHUB_OUTPUT" + else + printf 'matrix={"runner":["%s"]}\n' "${sel}" >> "$GITHUB_OUTPUT" + fi + triage: - name: PlatformAudio Triage (${{ github.event.inputs.runner || 'macos-15-large' }}) - runs-on: ${{ github.event.inputs.runner || 'macos-15-large' }} + needs: prepare + strategy: + fail-fast: false + matrix: ${{ fromJSON(needs.prepare.outputs.matrix) }} + name: PlatformAudio Triage (${{ matrix.runner }}) + runs-on: ${{ matrix.runner }} timeout-minutes: 90 env: CARGO_TERM_COLOR: always @@ -228,7 +258,7 @@ jobs: if: always() uses: actions/upload-artifact@043fb46d1a93c77aae656e7c1c64a875d1fc6a0a # v7.0.1 with: - name: platform-audio-triage-results + name: platform-audio-triage-results-${{ matrix.runner }} path: | build-debug/triage-repeat-arm.xml build-debug/triage-pinned-arm.xml @@ -240,7 +270,7 @@ jobs: if: steps.repeat_arm.outcome == 'failure' || steps.pinned_arm.outcome == 'failure' || steps.frames_arm.outcome == 'failure' uses: actions/upload-artifact@043fb46d1a93c77aae656e7c1c64a875d1fc6a0a # v7.0.1 with: - name: platform-audio-triage-crash-diagnostics + name: platform-audio-triage-crash-diagnostics-${{ matrix.runner }} path: ${{ runner.temp }}/crash-diagnostics/ if-no-files-found: ignore retention-days: 14 @@ -249,7 +279,7 @@ jobs: if: always() uses: actions/upload-artifact@043fb46d1a93c77aae656e7c1c64a875d1fc6a0a # v7.0.1 with: - name: platform-audio-triage-resource-samples + name: platform-audio-triage-resource-samples-${{ matrix.runner }} path: | ${{ runner.temp }}/resources-arm-a.csv ${{ runner.temp 
}}/resources-arm-b.csv From 45b282480832c8ac9a53eab1e4398701d2a48c08 Mon Sep 17 00:00:00 2001 From: Alan George Date: Tue, 23 Jun 2026 13:21:36 -0600 Subject: [PATCH 06/10] Add leak-backtrace arm to PlatformAudio triage The corrected resource sampler showed RSS growing unbounded (to ~4.9 GB on Intel before it crashed) while threads, fds, and mach ports stay flat, and the growth reproduces even with the ADM pinned -- i.e. a heap leak in the per-room publish/subscribe cycle, not an ADM-teardown or handle leak. Add Arm D, which runs the pinned-cycle reproducer under macOS `leaks --atExit` with MallocStackLogging so each still-allocated block is reported with its allocating backtrace (symbol + file:line). This names the leaking call site (C++ SDK vs Rust FFI) directly. The report is uploaded as an artifact and a small leak_iterations input keeps the stack-logging overhead bounded. Co-authored-by: Cursor --- .github/workflows/platform-audio-triage.yml | 52 +++++++++++++++++++++ 1 file changed, 52 insertions(+) diff --git a/.github/workflows/platform-audio-triage.yml b/.github/workflows/platform-audio-triage.yml index a773fb38..7578ef40 100644 --- a/.github/workflows/platform-audio-triage.yml +++ b/.github/workflows/platform-audio-triage.yml @@ -42,6 +42,10 @@ on: description: "PLATFORM_AUDIO_PIN_ITERATIONS for the pinned control arm" type: string default: "200" + leak_iterations: + description: "Pinned-cycle iterations for the leaks arm (kept small; MallocStackLogging is heavy)" + type: string + default: "40" ubsan: description: "Build with UndefinedBehaviorSanitizer (ASan is incompatible with macOS CoreAudio)" type: boolean @@ -240,6 +244,43 @@ jobs: --gtest_recreate_environments_when_repeating=1 \ --gtest_output=xml:build-debug/triage-frames-arm.xml + # Arm D: name the leak. 
The pinned-cycle reproducer (ADM held alive, rooms + # recycled each cycle) is run under macOS `leaks` with MallocStackLogging so + # every still-allocated block is reported with the backtrace that allocated + # it. This points directly at the leaking call site (C++ SDK vs Rust FFI) + # instead of us guessing. A small iteration count keeps the stack-logging + # overhead bounded while leaking enough to aggregate clear stacks. + - name: "Arm D — leak backtraces (pinned cycle under leaks)" + id: leaks_arm + continue-on-error: true + timeout-minutes: 40 + shell: bash + env: + # Quieter than the other arms: the leak report is the signal here. + RUST_LOG: warn + RUST_BACKTRACE: "1" + MallocStackLogging: "1" + PLATFORM_AUDIO_PIN_ITERATIONS: ${{ github.event.inputs.leak_iterations || '40' }} + run: | + set -uo pipefail + source .token_helpers/set_data_track_test_tokens.bash + report="$RUNNER_TEMP/leaks-report.txt" + # `leaks --atExit -- <command>` launches the binary, lets it run to normal + # exit, then dumps leaks (grouped by identical backtrace with counts). + leaks --atExit -- \ + build-debug/bin/livekit_integration_tests \ + --gtest_filter='PlatformAudioIntegrationTest.PinnedRuntimeRepeatedPublishStress' \ + > "$report" 2>&1 + status=$? + echo "leaks exit status: ${status}" + # Surface the leak summary + the largest stacks in the step log. + grep -E "Process [0-9]+: .* leaks for|leaks for [0-9,]+ total|^[[:space:]]*[0-9]+ \(" "$report" | head -40 || true + echo "--- top of leaks report ---" + tail -n 200 "$report" || true + # leaks exits non-zero when leaks are found; treat that as success for + # this diagnostic arm (the report is the deliverable). 
+ exit 0 + - name: Dump livekit-server log on failure if: failure() || steps.repeat_arm.outcome == 'failure' || steps.pinned_arm.outcome == 'failure' || steps.frames_arm.outcome == 'failure' shell: bash @@ -287,6 +328,15 @@ jobs: if-no-files-found: ignore retention-days: 14 + - name: Upload leak report + if: always() + uses: actions/upload-artifact@043fb46d1a93c77aae656e7c1c64a875d1fc6a0a # v7.0.1 + with: + name: platform-audio-triage-leak-report-${{ matrix.runner }} + path: ${{ runner.temp }}/leaks-report.txt + if-no-files-found: ignore + retention-days: 14 + # Surface the diagnostic contrast and fail the job if either arm failed. - name: Triage summary if: always() @@ -295,6 +345,7 @@ jobs: repeat="${{ steps.repeat_arm.outcome }}" pinned="${{ steps.pinned_arm.outcome }}" frames="${{ steps.frames_arm.outcome }}" + leaks="${{ steps.leaks_arm.outcome }}" # Print first vs last resource sample so the leak curve is visible inline. resource_delta() { @@ -320,6 +371,7 @@ jobs: echo "| A — Repeat (dispose+ADM Terminate each iter) | ${repeat} |" echo "| B — Pinned runtime (ADM never terminated) | ${pinned} |" echo "| C — Frame-flow only (fresh ADM each iter) | ${frames} |" + echo "| D — Leak backtraces (see leak-report artifact) | ${leaks} |" echo "" if [[ "${repeat}" == "failure" && "${pinned}" == "success" ]]; then echo "➡️ Repeat failed while pinned passed: consistent with an ADM teardown/recreate bug." From 1ca2011c2cdc8ccd55b39bb7fe28d6997b3a1b61 Mon Sep 17 00:00:00 2001 From: Alan George Date: Tue, 23 Jun 2026 16:58:33 -0600 Subject: [PATCH 07/10] Add live-heap attribution arm to PlatformAudio triage The leak is reachable retention, not lost-pointer leaks: `leaks` reports 0 on both arches because the growing memory is still referenced. `leaks --atExit` also can't see it (cleanup reclaims at shutdown). 
Code review ruled out the obvious C++ suspects -- the FFI response buffer handle is already dropped via an FfiHandle guard in sendRequest, and Room deregisters its FfiClient listener on disconnect/destruction. Add Arm E, which runs the dispose+recreate path (the worst leaker) under MallocStackLogging and samples the LIVE heap mid-run via `heap` and `malloc_history` (new heap_snapshots.sh). Diffing successive heap summaries plus the malloc_history stacks names the growing allocation type and its call site so we can localize the retention to C++ SDK vs Rust FFI vs WebRTC. Co-authored-by: Cursor --- .github/scripts/heap_snapshots.sh | 82 +++++++++++++++++++++ .github/workflows/platform-audio-triage.yml | 50 +++++++++++++ 2 files changed, 132 insertions(+) create mode 100755 .github/scripts/heap_snapshots.sh diff --git a/.github/scripts/heap_snapshots.sh b/.github/scripts/heap_snapshots.sh new file mode 100755 index 00000000..05320b80 --- /dev/null +++ b/.github/scripts/heap_snapshots.sh @@ -0,0 +1,82 @@ +#!/usr/bin/env bash +# Periodically capture live-heap snapshots of a running process so a reachable +# (non-freed but still-referenced) memory growth can be attributed to a type and +# allocation stack. `leaks` only reports *unreachable* blocks and runs at exit; +# this instead samples the live heap mid-run, which is where the PlatformAudio +# retention shows up. +# +# Two complementary tools are used (both require inspecting another task, so they +# run under `sudo -n`; GitHub macOS runners allow passwordless sudo): +# - `heap` : summary of live allocations grouped by type/class/binary. +# Diffing successive summaries shows which category grows. +# - `malloc_history` : per-allocation backtraces (needs MallocStackLogging in +# the target); captured only on the last few ticks because +# the output is large. +# +# Self-terminates when the target process exits. 
+set -uo pipefail + +pattern=${1:?usage: heap_snapshots.sh <pattern> <outdir> [interval_sec] [max_snaps]} +outdir=${2:?usage: heap_snapshots.sh <pattern> <outdir> [interval_sec] [max_snaps]} +interval=${3:-25} +max_snaps=${4:-10} + +mkdir -p "${outdir}" +self=$$ + +if [[ "$(uname -s)" != "Darwin" ]]; then + echo "heap_snapshots: only supported on macOS" >&2 + exit 0 +fi + +sudo_ok=0 +if sudo -n true >/dev/null 2>&1; then sudo_ok=1; fi +if (( ! sudo_ok )); then + echo "heap_snapshots: passwordless sudo unavailable; heap/malloc_history need it" >&2 +fi + +# Pick the matching PID with the largest RSS (the instrumented test binary), so +# we never attach to this script or the run_tests wrapper shell. +pick_target() { + local best="" best_rss=0 p rss + for p in $(pgrep -f "${pattern}" 2>/dev/null); do + [[ "${p}" == "${self}" ]] && continue + rss=$(ps -o rss= -p "${p}" 2>/dev/null | tr -d ' ') + [[ -z "${rss}" ]] && continue + if (( rss > best_rss )); then best_rss=${rss}; best=${p}; fi + done + echo "${best}" +} + +pid="" +for _ in $(seq 1 120); do + pid=$(pick_target) + [[ -n "${pid}" ]] && break + sleep 1 +done +if [[ -z "${pid}" ]]; then + echo "heap_snapshots: process matching '${pattern}' never appeared" >&2 + exit 0 +fi +echo "heap_snapshots: tracking pid ${pid} (pattern '${pattern}')" >&2 + +snap=0 +while kill -0 "${pid}" 2>/dev/null && (( snap < max_snaps )); do + sleep "${interval}" + kill -0 "${pid}" 2>/dev/null || break + snap=$((snap + 1)) + rss=$(ps -o rss= -p "${pid}" 2>/dev/null | tr -d ' ') + ts=$(date -u +%H%M%S) + label=$(printf '%02d_t%s_rss%sk' "${snap}" "${ts}" "${rss:-0}") + echo "heap_snapshots: snapshot ${label}" >&2 + + if (( sudo_ok )); then + sudo -n heap "${pid}" > "${outdir}/heap-${label}.txt" 2>&1 || true + # malloc_history is large; only capture it on the last few ticks. 
+ if (( snap >= max_snaps - 1 )); then + sudo -n malloc_history "${pid}" -allBySize > "${outdir}/mhist-${label}.txt" 2>&1 || true + fi + fi +done + +echo "heap_snapshots: done (${snap} snapshots) for pid ${pid}" >&2 diff --git a/.github/workflows/platform-audio-triage.yml b/.github/workflows/platform-audio-triage.yml index 7578ef40..d34fdeca 100644 --- a/.github/workflows/platform-audio-triage.yml +++ b/.github/workflows/platform-audio-triage.yml @@ -107,6 +107,7 @@ jobs: chmod +x .github/scripts/run_tests_with_backtrace.sh chmod +x .github/scripts/stage_crash_diagnostics.sh chmod +x .github/scripts/sample_process_resources.sh + chmod +x .github/scripts/heap_snapshots.sh - name: Install deps run: | @@ -281,6 +282,44 @@ jobs: # this diagnostic arm (the report is the deliverable). exit 0 + # Arm E: attribute the reachable retention. `leaks` reports 0 (the growth is + # still-referenced, not lost) so we instead sample the LIVE heap mid-run on + # the dispose+recreate path (the worst leaker) with MallocStackLogging, then + # diff successive `heap` summaries + read `malloc_history` stacks to name the + # growing allocation type and its call site. + - name: "Arm E — live-heap attribution (dispose path under heap/malloc_history)" + id: heap_arm + continue-on-error: true + timeout-minutes: 40 + shell: bash + env: + RUST_LOG: warn + RUST_BACKTRACE: "1" + MallocStackLogging: "1" + run: | + set -uo pipefail + source .token_helpers/set_data_track_test_tokens.bash + mkdir -p "$RUNNER_TEMP/heap-snapshots" + # Snapshot the live heap every 25s while the dispose-path test repeats. 
+ .github/scripts/heap_snapshots.sh \ + livekit_integration_tests "$RUNNER_TEMP/heap-snapshots" 25 12 & + .github/scripts/run_tests_with_backtrace.sh \ + build-debug/bin/livekit_integration_tests \ + --gtest_filter='PlatformAudioIntegrationTest.*-PlatformAudioIntegrationTest.PinnedRuntimeRepeatedPublishStress' \ + --gtest_repeat=30 \ + --gtest_recreate_environments_when_repeating=1 \ + --gtest_output=xml:build-debug/triage-heap-arm.xml || true + echo "--- heap snapshots captured ---" + ls -la "$RUNNER_TEMP/heap-snapshots" || true + # Surface the 'all zones' growth across the first vs last heap summary. + first=$(ls "$RUNNER_TEMP"/heap-snapshots/heap-*.txt 2>/dev/null | head -1) + last=$(ls "$RUNNER_TEMP"/heap-snapshots/heap-*.txt 2>/dev/null | tail -1) + if [[ -n "${first}" && -n "${last}" && "${first}" != "${last}" ]]; then + echo "=== FIRST heap summary (${first##*/}) ==="; grep -E "Process [0-9]+:|total|COUNT" "${first}" | head -25 || true + echo "=== LAST heap summary (${last##*/}) ==="; grep -E "Process [0-9]+:|total|COUNT" "${last}" | head -25 || true + fi + exit 0 + - name: Dump livekit-server log on failure if: failure() || steps.repeat_arm.outcome == 'failure' || steps.pinned_arm.outcome == 'failure' || steps.frames_arm.outcome == 'failure' shell: bash @@ -337,6 +376,15 @@ jobs: if-no-files-found: ignore retention-days: 14 + - name: Upload heap snapshots + if: always() + uses: actions/upload-artifact@043fb46d1a93c77aae656e7c1c64a875d1fc6a0a # v7.0.1 + with: + name: platform-audio-triage-heap-snapshots-${{ matrix.runner }} + path: ${{ runner.temp }}/heap-snapshots/ + if-no-files-found: ignore + retention-days: 14 + # Surface the diagnostic contrast and fail the job if either arm failed. 
- name: Triage summary if: always() @@ -346,6 +394,7 @@ jobs: pinned="${{ steps.pinned_arm.outcome }}" frames="${{ steps.frames_arm.outcome }}" leaks="${{ steps.leaks_arm.outcome }}" + heap="${{ steps.heap_arm.outcome }}" # Print first vs last resource sample so the leak curve is visible inline. resource_delta() { @@ -372,6 +421,7 @@ jobs: echo "| B — Pinned runtime (ADM never terminated) | ${pinned} |" echo "| C — Frame-flow only (fresh ADM each iter) | ${frames} |" echo "| D — Leak backtraces (see leak-report artifact) | ${leaks} |" + echo "| E — Live-heap attribution (see heap-snapshots artifact) | ${heap} |" echo "" if [[ "${repeat}" == "failure" && "${pinned}" == "success" ]]; then echo "➡️ Repeat failed while pinned passed: consistent with an ADM teardown/recreate bug." From fdc15157900341a841b0c2d74b48e5736653a0ad Mon Sep 17 00:00:00 2001 From: Alan George Date: Tue, 23 Jun 2026 17:43:23 -0600 Subject: [PATCH 08/10] Improve heap attribution arm: catch growth curve + always capture stacks First run plateaued before the snapshot window opened, so all heap samples were identical and malloc_history (gated to the last ticks) never fired. The steady-state heap was already telling: dominated by webrtc::Codec copies and StatsReport entries in liblivekit_ffi.dylib. Snapshot every 10s (not 25s), allow more ticks, raise the dispose-path repeat to 60 so the process keeps churning across the window, and capture malloc_history (-allBySize | head) on every tick so we always get the allocating backtraces even if the process exits or hangs early. 
Co-authored-by: Cursor --- .github/scripts/heap_snapshots.sh | 9 +++++---- .github/workflows/platform-audio-triage.yml | 9 ++++++--- 2 files changed, 11 insertions(+), 7 deletions(-) diff --git a/.github/scripts/heap_snapshots.sh b/.github/scripts/heap_snapshots.sh index 05320b80..8f216ca1 100755 --- a/.github/scripts/heap_snapshots.sh +++ b/.github/scripts/heap_snapshots.sh @@ -72,10 +72,11 @@ while kill -0 "${pid}" 2>/dev/null && (( snap < max_snaps )); do if (( sudo_ok )); then sudo -n heap "${pid}" > "${outdir}/heap-${label}.txt" 2>&1 || true - # malloc_history is large; only capture it on the last few ticks. - if (( snap >= max_snaps - 1 )); then - sudo -n malloc_history "${pid}" -allBySize > "${outdir}/mhist-${label}.txt" 2>&1 || true - fi + # malloc_history -allBySize sorts largest-first, so head keeps the biggest + # offenders while bounding artifact size. Capture every tick so we always + # have stacks even if the process exits/hangs before max_snaps. + sudo -n malloc_history "${pid}" -allBySize 2>/dev/null \ + | head -400 > "${outdir}/mhist-${label}.txt" || true fi done diff --git a/.github/workflows/platform-audio-triage.yml b/.github/workflows/platform-audio-triage.yml index d34fdeca..3d0a9fe3 100644 --- a/.github/workflows/platform-audio-triage.yml +++ b/.github/workflows/platform-audio-triage.yml @@ -300,13 +300,16 @@ jobs: set -uo pipefail source .token_helpers/set_data_track_test_tokens.bash mkdir -p "$RUNNER_TEMP/heap-snapshots" - # Snapshot the live heap every 25s while the dispose-path test repeats. + # Snapshot the live heap every 10s while the dispose-path test repeats. + # Frequent early ticks catch the growth curve (not just a plateau), and + # malloc_history is captured on every tick so we always get stacks. + # A larger repeat keeps the process churning across the whole window. 
.github/scripts/heap_snapshots.sh \ - livekit_integration_tests "$RUNNER_TEMP/heap-snapshots" 25 12 & + livekit_integration_tests "$RUNNER_TEMP/heap-snapshots" 10 30 & .github/scripts/run_tests_with_backtrace.sh \ build-debug/bin/livekit_integration_tests \ --gtest_filter='PlatformAudioIntegrationTest.*-PlatformAudioIntegrationTest.PinnedRuntimeRepeatedPublishStress' \ - --gtest_repeat=30 \ + --gtest_repeat=60 \ --gtest_recreate_environments_when_repeating=1 \ --gtest_output=xml:build-debug/triage-heap-arm.xml || true echo "--- heap snapshots captured ---" From 13f8b11f7d974a86e4b0d9b8e1456dc7d32b2336 Mon Sep 17 00:00:00 2001 From: Alan George Date: Tue, 23 Jun 2026 22:40:28 -0600 Subject: [PATCH 09/10] Possible race in local_participant --- include/livekit/local_participant.h | 8 ++++++++ src/local_participant.cpp | 8 +++++++- 2 files changed, 15 insertions(+), 1 deletion(-) diff --git a/include/livekit/local_participant.h b/include/livekit/local_participant.h index 9369a914..2c1d5cf5 100644 --- a/include/livekit/local_participant.h +++ b/include/livekit/local_participant.h @@ -246,6 +246,14 @@ class LIVEKIT_API LocalParticipant : public Participant { /// prune expired @c weak_ptr entries. mutable TrackMap published_tracks_by_sid_; + /// Guards @ref published_tracks_by_sid_. The map is written from the + /// application thread (@ref publishTrack / @ref unpublishTrack) and both read + /// and pruned from the FFI callback thread (@ref trackPublications / + /// @ref findTrackPublication, reached via Room::onEvent). Without this lock + /// those concurrent accesses race and free map nodes out from under each + /// other (heap-use-after-free). Leaf lock: no other lock is taken while held. + mutable std::mutex published_tracks_mutex_; + std::unordered_map rpc_handlers_; // Shared state for RPC invocation tracking. 
Using shared_ptr so the state diff --git a/src/local_participant.cpp b/src/local_participant.cpp index 7fda68ac..1e8d9a0f 100644 --- a/src/local_participant.cpp +++ b/src/local_participant.cpp @@ -197,7 +197,10 @@ void LocalParticipant::publishTrack(const std::shared_ptr& track, const T auto publication = std::make_shared(owned_pub); const std::string sid = publication->sid(); - published_tracks_by_sid_[sid] = std::weak_ptr(track); + { + const std::scoped_lock lock(published_tracks_mutex_); + published_tracks_by_sid_[sid] = std::weak_ptr(track); + } track->setPublication(publication); } @@ -237,6 +240,7 @@ void LocalParticipant::unpublishTrack(const std::string& track_sid) { fut.get(); + const std::scoped_lock lock(published_tracks_mutex_); if (auto it = published_tracks_by_sid_.find(track_sid); it != published_tracks_by_sid_.end()) { if (auto t = it->second.lock()) { t->setPublication(nullptr); @@ -247,6 +251,7 @@ void LocalParticipant::unpublishTrack(const std::string& track_sid) { LocalParticipant::PublicationMap LocalParticipant::trackPublications() const { PublicationMap out; + const std::scoped_lock lock(published_tracks_mutex_); for (auto it = published_tracks_by_sid_.begin(); it != published_tracks_by_sid_.end();) { auto t = it->second.lock(); if (!t) { @@ -443,6 +448,7 @@ void LocalParticipant::handleRpcMethodInvocation(uint64_t invocation_id, const s } std::shared_ptr LocalParticipant::findTrackPublication(const std::string& sid) const { + const std::scoped_lock lock(published_tracks_mutex_); auto it = published_tracks_by_sid_.find(sid); if (it == published_tracks_by_sid_.end()) { return nullptr; From 3980fb48874d4bf4b84f942fb7c50f3afc6d5310 Mon Sep 17 00:00:00 2001 From: Alan George Date: Tue, 23 Jun 2026 22:58:49 -0600 Subject: [PATCH 10/10] Maybe fix shutdown --- client-sdk-rust | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/client-sdk-rust b/client-sdk-rust index ee0a86ac..6881168d 160000 --- a/client-sdk-rust +++ b/client-sdk-rust 
@@ -1 +1 @@ -Subproject commit ee0a86aca3b6fc91badf085b4a40255b77bdefe1 +Subproject commit 6881168dfefad6605fcf6697085dcad2421bde68