diff --git a/examples/pico2_dualcore/CMakeLists.txt b/examples/pico2_dualcore/CMakeLists.txt
new file mode 100644
index 0000000..cdb9c7a
--- /dev/null
+++ b/examples/pico2_dualcore/CMakeLists.txt
@@ -0,0 +1,64 @@
+# Standalone Raspberry Pi Pico 2 (RP2350) firmware: dual-core deployment of
+# the ASRC — producer on core0, consumer on core1, one core per clock domain
+# (docs/HARDWARE_TESTING.md, Setup 2, "Dual-core deployment"). Deliberately
+# NOT wired into the root build — configure this directory on its own:
+#
+#   cmake -B build -DPICO_BOARD=pico2
+#   cmake --build build -j
+#
+# The Pico SDK is fetched by git tag rather than release-tarball URL+SHA256:
+# GitHub source tarballs exclude submodules, and USB CDC stdio needs
+# lib/tinyusb. GIT_SUBMODULES limits the clone to that one submodule
+# (lwip/btstack/mbedtls/cyw43 are not used here).
+cmake_minimum_required(VERSION 3.24)
+
+set(PICO_BOARD pico2 CACHE STRING "Pico SDK board")
+set(PICO_PLATFORM rp2350-arm-s CACHE STRING "Pico SDK platform (secure Arm)")
+
+# The library's constructors throw (allocation, filter design); the
+# 12-channel phase relies on catching bad_alloc instead of crashing. The SDK
+# default is -fno-exceptions.
+set(PICO_CXX_ENABLE_EXCEPTIONS 1 CACHE BOOL "")
+
+include(FetchContent)
+FetchContent_Declare(
+    pico_sdk
+    GIT_REPOSITORY https://github.com/raspberrypi/pico-sdk.git
+    GIT_TAG 2.1.1 # first-class RP2350 support
+    GIT_SHALLOW TRUE
+    GIT_SUBMODULES "lib/tinyusb")
+FetchContent_GetProperties(pico_sdk)
+if(NOT pico_sdk_POPULATED)
+    FetchContent_Populate(pico_sdk)
+endif()
+# Must be included before project() so the SDK can inject its toolchain file.
+include("${pico_sdk_SOURCE_DIR}/pico_sdk_init.cmake")
+
+project(pico2_dualcore C CXX ASM)
+set(CMAKE_C_STANDARD 11)
+set(CMAKE_CXX_STANDARD 20)
+set(CMAKE_CXX_STANDARD_REQUIRED ON)
+
+pico_sdk_init()
+
+add_executable(pico2_dualcore main.cpp)
+# Self-contained except for the header-only library itself.
+target_include_directories(pico2_dualcore PRIVATE "${CMAKE_CURRENT_SOURCE_DIR}/../../include")
+target_link_libraries(pico2_dualcore PRIVATE pico_stdlib pico_multicore cmsis_core)
+
+# Telemetry prints from the producer core; when the USB host stops draining
+# the CDC buffer, stdio_usb blocks the writer for up to this long before
+# dropping output (SDK default: 500 ms — far more than the FIFO can absorb).
+# Capped at 2 ms so the worst push stall stays inside the FIFO setpoint
+# budget; see the kTargetLatencyFrames comment in main.cpp.
+target_compile_definitions(pico2_dualcore PRIVATE PICO_STDIO_USB_STDOUT_TIMEOUT_US=2000)
+
+# Enabling exceptions is not enough to make allocation failure catchable: the
+# SDK's pico_malloc wrapper panics on a failed malloc() by default
+# (PICO_MALLOC_PANIC=1), halting the firmware before operator new can throw
+# bad_alloc. Disable the panic so main.cpp's catch can print SKIP instead.
+target_compile_definitions(pico2_dualcore PRIVATE PICO_MALLOC_PANIC=0)
+
+pico_enable_stdio_usb(pico2_dualcore 1)
+pico_enable_stdio_uart(pico2_dualcore 0)
+pico_add_extra_outputs(pico2_dualcore) # .uf2 etc.
diff --git a/examples/pico2_dualcore/README.md b/examples/pico2_dualcore/README.md
new file mode 100644
index 0000000..ea8f218
--- /dev/null
+++ b/examples/pico2_dualcore/README.md
@@ -0,0 +1,136 @@
+# pico2_dualcore — RP2350 dual-core deployment firmware
+
+The dual-core deliverable of [docs/HARDWARE_TESTING.md](../../docs/HARDWARE_TESTING.md)
+Setup 2: the converter's two ends run on the RP2350's two Cortex-M33 cores,
+one core per clock domain, and the firmware judges its own run — servo lock,
+ppm convergence against a synthesized truth value, clean counters, and
+measured `pull()` cycles — against PASS/FAIL gates.
+
+## The deployment shape it validates
+
+The root README's platform guidance says that on Pico-class parts the Q15/Q31
+paths are the right datapaths, 48 kHz mono fits a 150 MHz core with room to
+spare, and stereo `balanced()` wants the `fast()` preset *or the RP2350's
+second core*. This firmware is that second-core deployment, built the way a
+real one is:
+
+- **core0 = producer (input clock domain).** Pushes 32-frame blocks of a
+  997 Hz sine, busy-paced on the microsecond timer at
+  `rate × (1 + 200e-6)` — a +200 ppm clock offset synthesized from the
+  shared timebase, so the servo's converged estimate has an exact truth
+  value to be judged against (the one thing two real crystals cannot give
+  you). core0 also owns USB telemetry, printing once per second.
+- **core1 = consumer (output clock domain).** Pulls 32-frame blocks paced
+  at exactly the nominal rate and times every `pull()` with its own
+  DWT.CYCCNT (each RP2350 core has a private DWT — the 0xE000_0000 PPB
+  region is per-core, so the counter must be enabled *on* core1).
+- The library's `push()`/`pull()` contract is one producer agent and one
+  consumer agent around a lock-free SPSC ring with acquire/release atomics —
+  the contract names agents, not threads, so two cores sharing the RP2350's
+  coherent SRAM satisfy it exactly as two threads do. Everything else that
+  crosses cores is an explicit block of 32-bit atomics (32-bit because
+  64-bit `std::atomic` is not lock-free on the M33 — the same constraint
+  that shapes the library's own telemetry).
+
+Two phases, ~30 s each:
+
+| Phase | Config | Rates | Why |
+|---|---|---|---|
+| A | Q15 stereo `balanced()` | 48 kHz out, +200 ppm in | the config the README calls tight on one core |
+| B | Q15 12-channel, `balanced()` band edges and servo scaled ×16/48 | 16 kHz out, +200 ppm in | the reference-microphone/AVB 12-channel shape at its deployment rate |
+
+Phase B is 16 kHz **by arithmetic, not caution**: the M33 QEMU baseline puts
+`pipeline12_q15` at 10,027 insns/frame against a 150 MHz / 48 kHz budget of
+3,125 cycles/frame — more than 3× over, and `pull()` of a single instance is
+one consumer by contract, so no core assignment can split it across cores.
+Dual-core buys one clock domain per core, not more datapath than one core
+has. At 16 kHz the budget is 9,375 cycles/frame. The measured cycles/block
+is rate-independent, so phase B still produces the real-silicon counterpart
+of the 12-channel baseline.
+
+## Build
+
+Standalone project — *not* part of the root CMake build. Requires
+`cmake` ≥ 3.24, `arm-none-eabi-gcc` (tested with 13.2), a native compiler
+(for the SDK's picotool build), and network access on first configure,
+which fetches Pico SDK 2.1.1 plus its TinyUSB submodule and takes several minutes.
+
+```sh
+cd examples/pico2_dualcore
+cmake -B build -DPICO_BOARD=pico2
+cmake --build build -j
+```
+
+Produces `build/pico2_dualcore.uf2`.
+
+## Flash and run
+
+Hold BOOTSEL while plugging in and copy the UF2 onto the `RP2350` drive, or
+use picotool:
+
+```sh
+cp build/pico2_dualcore.uf2 /media/$USER/RP2350/   # or:
+picotool load -f build/pico2_dualcore.uf2 && picotool reboot
+```
+
+Open the USB serial port (`picocom /dev/ttyACM0`); the firmware waits for a
+terminal before starting, so nothing is lost.
+
+## Expected output
+
+A header, then one telemetry line per second per phase:
+
+```
+[A t= 9s] Locked    ppm=+200.05 fill= 144.2 und=0 ovr=0 rsy=0 | pull/blk mean=... p99=... max=... (..% core) late<=..us
+```
+
+and per phase a verdict line:
+
+```
+SUMMARY A q15 2ch balanced @48000: PASS lock_ms=... ppm_final=+200.0 post_lock_und=0 ovr=0 rsy=0 pull_cyc_blk mean=... p99=... max=... cyc_frame=... core_pct=... late_max_us=...
+```
+
+PASS requires: Locked within 2 s (phase A; 6 s for B, whose servo is scaled
+3× slower), every 1 Hz ppm sample after 10 s (A) / 15 s (B) within ±5 of
++200, and zero underruns/overruns/resyncs after first lock. The run ends
+with an `OVERALL` line and:
+
+```
+SRT_PICO2_DUALCORE_DONE
+```
+
+## Reading the numbers
+
+- **core_pct** is the headroom figure: one stream's share of core1,
+  `cyc_frame × rate / sys clock` (150 MHz nominal). It prices `pull()` only
+  — by design, since `push()` is a ring write and the producer core's real
+  budget goes to whatever feeds it (here: telemetry).
+- **Relation to the QEMU baselines** (`bench/baselines.json`, 2 s = 96,000
+  frames per workload): `pipeline_q15` 484,146,844 insns = **5,043
+  insns/frame**, `pipeline12_q15` 962,613,655 = **10,027 insns/frame**.
+  Those figures amortize one-time setup (soft-double Kaiser design, input
+  synthesis) over the workload, so they are upper bounds for the
+  steady-state loop this firmware times; `cyc_frame ÷ insns/frame` from the
+  sibling `examples/pico2_cyccnt` run gives the cycles-per-instruction
+  calibration that converts every M33 baseline into a cycle budget.
+- Against the budgets: 3,125 cycles/frame buys one 48 kHz frame at
+  150 MHz, 9,375 one 16 kHz frame. Phase A's `core_pct` is the measured
+  version of "stereo balanced() is tight on one core": whatever it reads,
+  that is the share of core1 a deployment must reserve — and on a
+  *single*-core deployment the same cycles would contend with the producer
+  side and the rest of the application, which is exactly why the input
+  domain lives on core0 here.
+- **late_max_us** is the consumer's worst schedule slip. If `pull()` ever
+  exceeded the block period, lateness shows here long before the FIFO
+  counters do.
+- The FIFO setpoint is 144 frames (3 ms at 48 kHz) rather than the default
+  48: the producer core shares its time with USB logging, whose worst-case
+  writer stall is capped at 2 ms in `CMakeLists.txt`. This is the root
+  README's latency rule — the setpoint must exceed the peak occupancy
+  excursion of push/pull jitter — applied to a producer that also logs.
+
+This firmware cannot prove the inter-crystal lock that
+HARDWARE_TESTING.md Setup 1/2 ultimately want (both domains here are paced
+from the RP2350's one timer, which is what makes ppm = +200.0 an exact,
+assertable truth); it proves the *deployment shape*: two cores, two clock
+domains, lock-free handoff, and real cycle headroom numbers.
diff --git a/examples/pico2_dualcore/main.cpp b/examples/pico2_dualcore/main.cpp
new file mode 100644
index 0000000..eec0a8b
--- /dev/null
+++ b/examples/pico2_dualcore/main.cpp
@@ -0,0 +1,548 @@
+// Dual-core deployment of the ASRC on the RP2350 (docs/HARDWARE_TESTING.md,
+// Setup 2, "Dual-core deployment"): the converter's two ends on the two
+// Cortex-M33 cores, one core per clock domain — the shape the README
+// prescribes for configurations that are tight on a single 150 MHz core
+// (Q15 stereo balanced(); 12-channel).
+//
+//   core0  producer: push(32) paced at rate * (1 + 200e-6), plus USB telemetry
+//   core1  consumer: pull(32) paced at exactly the nominal rate, every call
+//          timed with the core-local DWT.CYCCNT
+//
+// Cross-core safety, stated explicitly: the library's runtime contract is
+// one producer agent and one consumer agent around a lock-free SPSC ring
+// with acquire/release atomics (srt/spsc_ring.hpp; "one producer thread and
+// one consumer thread" in the README's Limitations). The contract is about
+// agents and memory ordering, not about std::thread: the RP2350's cores
+// share coherent SRAM (no data caches in front of it), so two CORES satisfy
+// it exactly as two threads do. push() stays core0-only, pull() stays
+// core1-only, status() is documented any-thread. Everything else that
+// crosses cores is the explicit Shared block of 32-bit atomics below — kept
+// 32-bit for the same reason the library keeps its telemetry 32-bit: on the
+// M33, 64-bit std::atomic is not lock-free and would route through a
+// library lock (see the footnote in asrc.hpp).
+//
+// Both pacing schedules derive from the same 64-bit microsecond timebase
+// (the RP2350 timer is one shared block read by both cores), so the
+// +200 ppm offset is exact by construction and the servo's converged
+// estimate has a known truth value to be judged against — the one thing
+// genuinely independent oscillators cannot provide. Due times are absolute,
+// t0 + (b * num) / den in integer microseconds, so a stall (a USB telemetry
+// write on core0) is followed by catch-up pushes, not permanent schedule
+// slip.
+#include <algorithm>
+#include <atomic>
+#include <cmath>
+#include <cstdint>
+#include <cstdio>
+#include <exception>
+#include <memory>
+#include <numbers>
+#include <vector>
+
+#include "RP2350.h"
+#include "hardware/clocks.h"
+#include "pico/multicore.h"
+#include "pico/stdlib.h"
+
+#include "srt/asrc.hpp"
+
+namespace {
+
+using Asrc = srt::AsyncSampleRateConverterQ15;
+
+constexpr std::size_t kBlockFrames = 32;
+constexpr std::size_t kMaxChannels = 12;
+constexpr std::size_t kInputFrames = 4800; // cycled producer buffer (0.1 s at 48 kHz)
+constexpr double kOffsetPpm = 200.0;
+constexpr double kPpmTolerance = 5.0;
+
+// FIFO setpoint budget: the producer core also writes telemetry, and
+// stdio_usb may stall the writer for up to PICO_STDIO_USB_STDOUT_TIMEOUT_US
+// (capped to 2 ms in CMakeLists.txt) when the host stops draining the CDC
+// buffer. During such a stall the consumer keeps pulling — 2 ms is 96 frames
+// at 48 kHz — so the setpoint must exceed it with margin: 144 frames (3 ms
+// at 48 kHz, 9 ms at 16 kHz). This is the README's latency-section rule
+// (the setpoint must exceed the peak occupancy excursion of push/pull
+// jitter) applied to a producer that shares its core with logging.
+constexpr std::size_t kTargetLatencyFrames = 144;
+
+// ---------------------------------------------------------------------------
+// Cross-core shared state. The converter object itself is shared only
+// through the pointer handoff below; this block carries phase control and
+// the consumer's cycle statistics. All payload fields are lock-free 32-bit
+// atomics; wider accumulators (the cycle sum) stay core1-private.
+struct Shared {
+    // Phase handoff, core0 -> core1. The release store of the converter
+    // pointer publishes every plain write the constructor performed (filter
+    // table, ring, servo state) plus the relaxed parameter stores preceding
+    // it; core1's acquire load synchronizes-with that store. The SDK
+    // multicore FIFO is left to the launch protocol — an atomic pointer
+    // makes the C++ happens-before explicit instead of relying on hardware
+    // FIFO side effects.
+    std::atomic<Asrc*> asrc{nullptr}; // nullptr = no phase active; non-null starts core1's pull loop
+    std::atomic<std::uint32_t> consNumUs{0}; // consumer due(b) = t0 + (b*num)/den us
+    std::atomic<std::uint32_t> consDen{1}; // denominator of the consumer schedule above
+    std::atomic<std::uint32_t> statsSkipBlocks{0}; // exclude fill/acquire from stats
+    std::atomic<bool> stop{false};                 // core0 -> core1: end of phase
+    std::atomic<bool> consumerDone{false};         // core1 -> core0: final stats published
+    std::atomic<std::uint32_t> cyccnt{0};          // core1 -> core0: 0 unknown, 1 ok, 2 absent
+
+    // Consumer stats snapshot, seqlock-style: seq is odd while the writer is
+    // mid-update; the reader retries until the same even value brackets the
+    // payload. The payload fields are themselves relaxed atomics (no torn
+    // reads, no UB); the seqlock only adds mutual coherence, so one printed
+    // line describes one instant.
+    std::atomic<std::uint32_t> seq{0}; // even = snapshot stable, odd = writer mid-update
+    std::atomic<std::uint32_t> blocks{0};  // measured pull() calls
+    std::atomic<std::uint32_t> meanCyc{0}; // cycles per pull(32)
+    std::atomic<std::uint32_t> p99Cyc{0}; // upper edge of the 99th-percentile bucket, capped at maxCyc
+    std::atomic<std::uint32_t> maxCyc{0}; // slowest measured pull(32), in cycles
+    std::atomic<std::uint32_t> lateMaxUs{0}; // worst consumer schedule slip
+};
+
+static_assert(std::atomic<std::uint32_t>::is_always_lock_free &&
+                  std::atomic<Asrc*>::is_always_lock_free && std::atomic<bool>::is_always_lock_free,
+              "cross-core state must be lock-free on the M33");
+
+Shared g; // the single cross-core state block, accessed by both core0 and core1
+
+struct Snapshot { // plain (non-atomic) copy of the stats payload carried in Shared
+    std::uint32_t blocks = 0;    // pull() calls included in the statistics
+    std::uint32_t meanCyc = 0;   // mean cycles per pull(); derived just before publishing
+    std::uint32_t p99Cyc = 0;    // 99th-percentile cycles per pull(); derived just before publishing
+    std::uint32_t maxCyc = 0;    // slowest pull() seen, in cycles
+    std::uint32_t lateMaxUs = 0; // worst consumer schedule slip, in microseconds
+};
+
+// Seqlock writer (core1 only). The release fence orders the odd mark before
+// the payload stores; the final release store orders the payload before the
+// even mark.
+void publishSnapshot(const Snapshot& s) {
+    const std::uint32_t q = g.seq.load(std::memory_order_relaxed); // single writer: reads back its own last mark
+    g.seq.store(q + 1, std::memory_order_relaxed); // odd: update in progress
+    std::atomic_thread_fence(std::memory_order_release);
+    g.blocks.store(s.blocks, std::memory_order_relaxed);
+    g.meanCyc.store(s.meanCyc, std::memory_order_relaxed);
+    g.p99Cyc.store(s.p99Cyc, std::memory_order_relaxed);
+    g.maxCyc.store(s.maxCyc, std::memory_order_relaxed);
+    g.lateMaxUs.store(s.lateMaxUs, std::memory_order_relaxed);
+    g.seq.store(q + 2, std::memory_order_release); // even again: snapshot complete
+}
+
+// Seqlock reader (core0). The acquire load pairs with the writer's final release
+// store; the acquire fence pairs with its release fence. Retries are free at 1 Hz.
+Snapshot readSnapshot() {
+    for (;;) {
+        const std::uint32_t q0 = g.seq.load(std::memory_order_acquire);
+        if (q0 & 1u)
+            continue; // writer is mid-update: try again
+        Snapshot s;
+        s.blocks = g.blocks.load(std::memory_order_relaxed);
+        s.meanCyc = g.meanCyc.load(std::memory_order_relaxed);
+        s.p99Cyc = g.p99Cyc.load(std::memory_order_relaxed);
+        s.maxCyc = g.maxCyc.load(std::memory_order_relaxed);
+        s.lateMaxUs = g.lateMaxUs.load(std::memory_order_relaxed);
+        std::atomic_thread_fence(std::memory_order_acquire);
+        if (g.seq.load(std::memory_order_relaxed) == q0) // unchanged mark: no update overlapped the reads
+            return s;
+    }
+}
+
+// ---------------------------------------------------------------------------
+// core1: the consumer / output clock domain.
+
+// TRCENA gates the whole DWT block; CYCCNTENA starts the free-running 32-bit
+// cycle counter. CMSIS names from the SDK's core_cm33.h; the firmware runs in
+// the secure state (rp2350-arm-s) so the registers are directly writable.
+// 32-bit wrap is ~28.6 s at 150 MHz — per-block unsigned deltas are safe.
+//
+// Per-core, verified in the SDK headers: DWT_BASE 0xE0001000 (core_cm33.h)
+// sits inside the PPB (PPB_BASE 0xe0000000, hardware/regs/addressmap.h),
+// and hardware/structs/m33.h maps the whole PPB — dwt_ctrl/dwt_cyccnt
+// included — as `m33_hw` at that one fixed address: whichever core
+// dereferences it reaches its OWN block (the device header marks PPB
+// registers such as NMI_MASK0 "core-local"). So this must run ON core1;
+// enabling CYCCNT from core0 would only start core0's counter. One header
+// caveat: the SVD-derived regs/m33.h gives a DWT_CTRL reset value with
+// NOCYCCNT=1 — but that value (NUMCOMP=7; an M33 has at most 4 DWT
+// comparators) is Arm's generic ARMv8-M template, contradicted by the RW
+// DWT_CYCCNT register the same SVD defines; the runtime check below is the
+// authoritative gate.
+bool enableCycleCounter() { // returns true when the cycle counter exists and has been started
+    CoreDebug->DEMCR |= CoreDebug_DEMCR_TRCENA_Msk; // ungate the DWT block before touching it
+    if (DWT->CTRL & DWT_CTRL_NOCYCCNT_Msk)
+        return false; // implementation without a cycle counter
+    DWT->CYCCNT = 0; // count from zero
+    DWT->CTRL |= DWT_CTRL_CYCCNTENA_Msk; // start the free-running counter
+    return true;
+}
+
+// Histogram of cycles per pull(32) in 512-cycle buckets (covers 1M cycles
+// per block — several times the heaviest expected case) so a running p99 is
+// available without storing per-block samples.
+constexpr unsigned kHistShift = 9; // log2 of the bucket width: 1 << 9 = 512 cycles
+constexpr std::size_t kHistBuckets = 2048; // 2048 x 512 = 1,048,576 cycles; the last bucket also absorbs overflow
+std::uint32_t gHist[kHistBuckets]; // filled by the consumer loop; cleared at the start of every phase
+
+std::int16_t gOut[kBlockFrames * kMaxChannels]; // consumer output block
+
+// Complete a core1-private stats record and publish it: mean from the cycle sum,
+// p99 as the upper edge of the histogram bucket holding the 99th percentile.
+void finalizeAndPublish(Snapshot s, std::uint64_t cycSum) {
+    if (s.blocks == 0) { // nothing measured yet: publish the record as given
+        publishSnapshot(s);
+        return;
+    }
+    s.meanCyc = static_cast<std::uint32_t>(cycSum / s.blocks);
+    const std::uint64_t rank = std::uint64_t{s.blocks} * 99 / 100;
+    std::uint64_t seen = 0;
+    std::size_t bucket = 0;
+    while (bucket < kHistBuckets && (seen += gHist[bucket]) <= rank)
+        ++bucket;
+    if (bucket < kHistBuckets) // running count passed the rank inside this bucket
+        s.p99Cyc = static_cast<std::uint32_t>((bucket + 1) << kHistShift);
+    s.p99Cyc = std::min(s.p99Cyc, s.maxCyc);
+    publishSnapshot(s);
+}
+
+// Consumer loop: wait for a phase (converter pointer), pull at the exact
+// nominal output rate until told to stop, publish stats once per second.
+// Never prints — stdio stays a core0 concern, both for the FIFO budget and
+// because contending on the stdio mutex from the paced core would put USB
+// stalls on the output clock domain.
+[[noreturn]] void core1Main() {
+    g.cyccnt.store(enableCycleCounter() ? 1 : 2, std::memory_order_release); // 1 ok, 2 absent
+
+    for (;;) {
+        Asrc* asrc;
+        while ((asrc = g.asrc.load(std::memory_order_acquire)) == nullptr)
+            tight_loop_contents(); // idle until core0 publishes a converter
+
+        const std::uint32_t num = g.consNumUs.load(std::memory_order_relaxed); // ordered by the acquire above
+        const std::uint32_t den = g.consDen.load(std::memory_order_relaxed);
+        const std::uint32_t skip = g.statsSkipBlocks.load(std::memory_order_relaxed);
+        const bool timed = g.cyccnt.load(std::memory_order_relaxed) == 1; // cycle stats only with a working counter
+
+        std::fill(std::begin(gHist), std::end(gHist), 0u);
+        std::uint64_t cycSum = 0; // core1-private; only 32-bit digests cross cores
+        Snapshot s;
+        publishSnapshot(s); // zero the previous phase's numbers
+
+        const std::uint64_t t0 = time_us_64() + 1000; // schedule origin: 1 ms from now
+        std::uint64_t nextPubUs = t0 + 1000000; // first stats publication one second in
+        for (std::uint64_t b = 0; !g.stop.load(std::memory_order_acquire); ++b) {
+            const std::uint64_t due = t0 + (b * num) / den; // absolute due time of block b
+            std::uint64_t now = time_us_64();
+            while (now < due) {
+                tight_loop_contents();
+                now = time_us_64();
+            }
+
+            // Timed region is pull() alone: the per-block datapath + servo
+            // cost the deployment budget cares about.
+            const std::uint32_t c0 = DWT->CYCCNT;
+            asrc->pull(gOut, kBlockFrames);
+            const std::uint32_t cyc = DWT->CYCCNT - c0; // unsigned delta, so a counter wrap is harmless
+
+            if (b >= skip) {
+                if (timed) {
+                    cycSum += cyc;
+                    ++s.blocks;
+                    s.maxCyc = std::max(s.maxCyc, cyc);
+                    ++gHist[std::min<std::uint32_t>(cyc >> kHistShift, kHistBuckets - 1)];
+                }
+                // Schedule slip: if pull() ever exceeded the block period,
+                // lateness accumulates here long before the FIFO notices.
+                const std::uint64_t late = now - due; // now >= due after the wait loop
+                if (late > s.lateMaxUs)
+                    s.lateMaxUs = static_cast<std::uint32_t>(std::min<std::uint64_t>(late, ~0u));
+            }
+            if (now >= nextPubUs) {
+                nextPubUs += 1000000;
+                finalizeAndPublish(s, cycSum);
+            }
+        }
+
+        finalizeAndPublish(s, cycSum); // final numbers for the summary line
+        g.consumerDone.store(true, std::memory_order_release);
+        // Wait out the teardown so a stale pointer cannot restart the phase.
+        while (g.asrc.load(std::memory_order_acquire) != nullptr)
+            tight_loop_contents();
+    }
+}
+
+// Explicit core1 stack (the SDK default is 2 KB scratch RAM; pull() plus the
+// servo's soft-double helpers fit, but the margin is not worth proving).
+std::uint32_t gCore1Stack[1024]; // 1024 words = 4 KiB, handed to multicore_launch_core1_with_stack()
+
+// ---------------------------------------------------------------------------
+// core0: the producer / input clock domain, plus telemetry and verdicts.
+
+struct PhaseSpec {
+    char tag; // one-letter phase id printed in the PHASE, telemetry and SUMMARY lines
+    const char* desc; // human-readable configuration, printed in the PHASE and SUMMARY lines
+    std::size_t channels; // converter channel count (becomes cfg.channels)
+    double rateHz; // nominal sample rate (becomes cfg.sampleRateHz)
+    bool scaledTo16k;        // scale balanced() band edges + servo by 16/48
+    std::uint32_t prodNumUs; // producer due(b) = t0 + (b*num)/den us, +200 ppm baked in
+    std::uint32_t prodDen;
+    std::uint32_t consNumUs; // consumer schedule, exact nominal rate
+    std::uint32_t consDen;
+    std::uint32_t statsSkip;   // consumer blocks excluded from cycle stats (~5 s)
+    std::uint32_t lockLimitMs; // PASS: Locked within this
+    std::uint32_t ppmSettleMs; // PASS: |ppm - 200| < 5 at every 1 Hz sample after this
+    std::uint32_t runMs; // total phase duration; the producer loop ends at tStart + runMs
+};
+
+struct PhaseResult {
+    bool ran = false; // stays false when setup threw and the phase printed SKIP
+    bool pass = false; // true only when the lock, ppm and clean-counter gates all held
+};
+
+// 16 kHz variant of balanced(): the pass/stop edges sit at the same
+// normalized frequencies, and L/T — hence table size and per-frame cycle
+// cost — are unchanged (README "Measured performance"; tests/test_asrc_quality_16k.cpp).
+srt::FilterSpec balanced16k() {
+    auto spec = srt::FilterSpec::balanced(); // start from the stock preset
+    spec.stopbandHz = 28000.0 * 16.0 / 48.0; // 28 kHz edge scaled by 16/48
+    spec.passbandHz = 20000.0 * 16.0 / 48.0; // 20 kHz edge scaled by 16/48
+    return spec;
+}
+
+// Telemetry label for a servo state. Only a genuine Locked prints "Locked";
+// a value this firmware does not know is labelled "Unknown", never mislabelled.
+const char* stateName(srt::State s) {
+    switch (s) {
+    case srt::State::Filling: return "Filling";
+    case srt::State::Acquiring: return "Acquiring";
+    case srt::State::Locked: return "Locked";
+    default: return "Unknown";
+    }
+}
+
+// Producer test signal: kInputFrames frames of a 997 Hz sine at half full
+// scale, the same sample on every channel. Content does not affect lock
+// dynamics or cycle cost, and the multichannel design shares one fractional
+// position per frame anyway. The buffer is cycled, so its wrap seam is not
+// phase-continuous — harmless for the same reason (same note as pico2_cyccnt).
+std::vector<std::int16_t> sineInput(std::size_t channels, double rateHz) {
+    const double radPerFrame = 2.0 * std::numbers::pi * 997.0 / rateHz;
+    std::vector<std::int16_t> samples;
+    samples.reserve(kInputFrames * channels);
+    for (std::size_t frame = 0; frame < kInputFrames; ++frame) {
+        const auto level = srt::detail::roundSat<std::int16_t>(
+            0.5 * std::sin(radPerFrame * static_cast<double>(frame)) * 32767.0);
+        samples.insert(samples.end(), channels, level); // one copy per channel
+    }
+    return samples;
+}
+
+PhaseResult runPhase(const PhaseSpec& ph) { // core0: run one phase end to end and judge it
+    PhaseResult r; // ran = pass = false until set below
+
+    srt::Config cfg;
+    cfg.sampleRateHz = ph.rateHz;
+    cfg.channels = ph.channels;
+    cfg.targetLatencyFrames = kTargetLatencyFrames;
+    if (ph.scaledTo16k) {
+        // FilterSpec band edges and ServoConfig bandwidths are absolute Hz
+        // designed for ~48 kHz; both scale with the rate (README).
+        cfg.filter = balanced16k();
+        const double sc = ph.rateHz / 48000.0;
+        cfg.servo.acquireBandwidthHz *= sc;
+        cfg.servo.trackBandwidthHz *= sc;
+        cfg.servo.quietBandwidthHz *= sc;
+        cfg.servo.acquireSmootherHz *= sc;
+        cfg.servo.trackSmootherHz *= sc;
+        cfg.servo.quietSmootherHz *= sc;
+    }
+
+    // Heap-constructed so allocation failure (12-channel phase, tighter build) prints SKIP,
+    // not a hard fault. NOTE(review): needs the SDK built with PICO_MALLOC_PANIC=0 — confirm.
+    std::unique_ptr<Asrc> asrc;
+    std::vector<std::int16_t> input;
+    try {
+        asrc = std::make_unique<Asrc>(cfg);
+        input = sineInput(ph.channels, ph.rateHz);
+    } catch (const std::exception& e) {
+        std::printf("PHASE %c %s: SKIP (%s)\n", ph.tag, ph.desc, e.what());
+        return r; // r.ran stays false: reported as SKIP by main()
+    }
+    r.ran = true;
+
+    std::printf("PHASE %c %s: %lu s run, lock limit %.1f s, ppm gate +/-%.0f after %.0f s\n",
+                ph.tag, ph.desc, static_cast<unsigned long>(ph.runMs / 1000),
+                static_cast<double>(ph.lockLimitMs) / 1000.0, kPpmTolerance,
+                static_cast<double>(ph.ppmSettleMs) / 1000.0);
+
+    // Hand the phase to core1: parameters first (relaxed), then the pointer
+    // with release — the store core1's acquire load synchronizes with.
+    g.consumerDone.store(false, std::memory_order_relaxed);
+    g.stop.store(false, std::memory_order_relaxed);
+    g.consNumUs.store(ph.consNumUs, std::memory_order_relaxed);
+    g.consDen.store(ph.consDen, std::memory_order_relaxed);
+    g.statsSkipBlocks.store(ph.statsSkip, std::memory_order_relaxed);
+    g.asrc.store(asrc.get(), std::memory_order_release);
+
+    const std::uint64_t tStart = time_us_64();
+    const std::uint64_t tEnd = tStart + static_cast<std::uint64_t>(ph.runMs) * 1000;
+    const std::uint64_t t0 = tStart + 1000; // producer schedule origin, 1 ms after the handoff
+    std::uint64_t nextTelemetryUs = tStart + 1000000;
+
+    bool locked = false;
+    std::uint64_t lockUs = 0;
+    std::uint64_t undAtLock = 0, ovrAtLock = 0, rsyAtLock = 0; // counter baselines taken at first lock
+    bool ppmOk = true;
+    bool ppmSampled = false;
+    double ppmFinal = 0.0;
+
+    std::size_t off = 0; // read offset into the cycled input buffer, in samples
+    for (std::uint64_t b = 0;; ++b) {
+        const std::uint64_t due = t0 + (b * ph.prodNumUs) / ph.prodDen;
+        if (due >= tEnd)
+            break; // the next block would fall past the end of the phase
+        std::uint64_t now = time_us_64();
+        while (now < due) {
+            tight_loop_contents();
+            now = time_us_64();
+        }
+
+        asrc->push(input.data() + off, kBlockFrames);
+        off += kBlockFrames * ph.channels;
+        if (off + kBlockFrames * ph.channels > input.size())
+            off = 0; // rewind before a block would run past the buffer
+
+        const srt::Status st = asrc->status();
+        if (!locked && st.state == srt::State::Locked) { // latch the first lock only
+            locked = true;
+            lockUs = time_us_64() - tStart;
+            undAtLock = st.underruns;
+            ovrAtLock = st.overruns;
+            rsyAtLock = st.resyncs;
+        }
+
+        // 1 Hz telemetry. The printf may stall up to the 2 ms stdio cap;
+        // the absolute push schedule catches up immediately afterwards and
+        // the FIFO setpoint absorbs the dip (see kTargetLatencyFrames).
+        if (now >= nextTelemetryUs) {
+            nextTelemetryUs += 1000000;
+            const std::uint64_t tMs = (now - tStart) / 1000;
+            const Snapshot sn = readSnapshot();
+            const double cycFrame =
+                static_cast<double>(sn.meanCyc) / static_cast<double>(kBlockFrames);
+            const double pctCore =
+                cycFrame * ph.rateHz / static_cast<double>(clock_get_hz(clk_sys)) * 100.0;
+            std::printf(
+                "[%c t=%2lus] %-9s ppm=%+7.2f fill=%6.1f und=%lu ovr=%lu rsy=%lu | "
+                "pull/blk mean=%lu p99=%lu max=%lu (%4.1f%% core) late<=%luus\n",
+                ph.tag, static_cast<unsigned long>(tMs / 1000), stateName(st.state), st.ppm,
+                st.fifoFillFrames, static_cast<unsigned long>(st.underruns),
+                static_cast<unsigned long>(st.overruns), static_cast<unsigned long>(st.resyncs),
+                static_cast<unsigned long>(sn.meanCyc), static_cast<unsigned long>(sn.p99Cyc),
+                static_cast<unsigned long>(sn.maxCyc), pctCore,
+                static_cast<unsigned long>(sn.lateMaxUs));
+            if (tMs >= ph.ppmSettleMs) { // past the settle time: this sample is gated
+                ppmSampled = true;
+                if (std::fabs(st.ppm - kOffsetPpm) >= kPpmTolerance)
+                    ppmOk = false;
+            }
+        }
+    }
+
+    // Teardown: stop core1, wait for its final stats. consumerDone's
+    // release/acquire pair orders core1's last pull() before this point, so
+    // destroying the converter afterwards is safe.
+    g.stop.store(true, std::memory_order_release);
+    while (!g.consumerDone.load(std::memory_order_acquire))
+        tight_loop_contents();
+    const Snapshot fin = readSnapshot();
+    const srt::Status st = asrc->status();
+    ppmFinal = st.ppm;
+    g.asrc.store(nullptr, std::memory_order_release); // releases core1 from its end-of-phase wait
+
+    // PASS = the deployment-shape claims, made falsifiable:
+    //   1. servo Locked within lockLimitMs of a cold start;
+    //   2. every 1 Hz ppm sample after ppmSettleMs within +/-5 of the
+    //      synthesized +200 ppm truth (and at least one such sample);
+    //   3. zero underruns/overruns/resyncs after first lock — the
+    //      both-cores-keeping-real-time criterion (overruns/resyncs are the
+    //      signature of a consumer that cannot keep up, so they gate too).
+    const std::uint64_t und = st.underruns - undAtLock;
+    const std::uint64_t ovr = st.overruns - ovrAtLock;
+    const std::uint64_t rsy = st.resyncs - rsyAtLock;
+    const bool lockOk = locked && lockUs <= static_cast<std::uint64_t>(ph.lockLimitMs) * 1000;
+    const bool cleanOk = locked && und == 0 && ovr == 0 && rsy == 0;
+    r.pass = lockOk && ppmOk && ppmSampled && cleanOk;
+
+    const double cycFrame = static_cast<double>(fin.meanCyc) / static_cast<double>(kBlockFrames);
+    const double pctCore =
+        cycFrame * ph.rateHz / static_cast<double>(clock_get_hz(clk_sys)) * 100.0;
+    std::printf("SUMMARY %c %s: %s lock_ms=%lu ppm_final=%+.2f post_lock_und=%lu ovr=%lu "
+                "rsy=%lu pull_cyc_blk mean=%lu p99=%lu max=%lu cyc_frame=%.1f core_pct=%.1f "
+                "late_max_us=%lu\n",
+                ph.tag, ph.desc, r.pass ? "PASS" : "FAIL",
+                static_cast<unsigned long>(lockUs / 1000), ppmFinal,
+                static_cast<unsigned long>(und), static_cast<unsigned long>(ovr),
+                static_cast<unsigned long>(rsy), static_cast<unsigned long>(fin.meanCyc),
+                static_cast<unsigned long>(fin.p99Cyc), static_cast<unsigned long>(fin.maxCyc),
+                cycFrame, pctCore, static_cast<unsigned long>(fin.lateMaxUs));
+    return r;
+}
+
+} // namespace
+
+int main() { // core0 entry: bring up stdio and core1, run both phases, print the verdict
+    stdio_init_all();
+    // USB CDC drops everything printed before a host terminal attaches.
+    while (!stdio_usb_connected())
+        sleep_ms(100);
+    sleep_ms(250); // NOTE(review): presumably lets the host terminal settle — confirm
+
+    std::printf("SampleRateTap RP2350 dual-core deployment\n");
+    std::printf("sys clock %lu Hz; core0 push @ nominal*(1+%.0fe-6), core1 pull @ nominal; "
+                "block %u frames\n",
+                static_cast<unsigned long>(clock_get_hz(clk_sys)), kOffsetPpm,
+                static_cast<unsigned>(kBlockFrames));
+
+    multicore_launch_core1_with_stack(core1Main, gCore1Stack, sizeof(gCore1Stack));
+    while (g.cyccnt.load(std::memory_order_acquire) == 0)
+        tight_loop_contents(); // doubles as the launch handshake
+    if (g.cyccnt.load(std::memory_order_relaxed) == 2)
+        std::printf("WARN: core1 DWT has no cycle counter; pull timings will read 0\n");
+
+    // Producer schedules bake in the +200 ppm offset (exact integer rationals):
+    //   A: 48 kHz * 1.0002 = 48009.6 Hz; 32 frames = 32e6/48009.6 us = 1e7/15003 us
+    //   B: 16 kHz * 1.0002 = 16003.2 Hz; 32 frames = 1e7/5001 us
+    // Consumer schedules are the exact nominal rates:
+    //   A: 32/48000 s = 2000/3 us;  B: 32/16000 s = 2000/1 us
+    //
+    // Phase B pins the 12-channel shape at 16 kHz — the README's
+    // reference-microphone/AVB deployment rate — not 48 kHz: the M33 QEMU
+    // baseline puts pipeline12_q15 at 10,027 insns/frame against a
+    // 150 MHz / 48 kHz budget of 3,125 cycles/frame, more than 3x over, and
+    // pull() of one instance is a single consumer by contract — no core
+    // assignment can split it. At 16 kHz the budget is 9,375 cycles/frame.
+    // The measured cycles/block is rate-independent either way, so phase B
+    // still yields the real-silicon counterpart of the 12-channel baseline.
+    // Its lock/settle gates scale with the servo (bandwidths * 16/48).
+    const PhaseSpec phases[] = { // fields in PhaseSpec declaration order
+        {'A', "q15 2ch balanced @48000", 2, 48000.0, false, 10000000, 15003, 2000, 3, 7500, 2000,
+         10000, 30000}, // prod 1e7/15003 us, cons 2000/3 us, skip 7500 blocks, lock 2 s, settle 10 s, run 30 s
+        {'B', "q15 12ch balanced16k @16000", 12, 16000.0, true, 10000000, 5001, 2000, 1, 2500, 6000,
+         15000, 30000}, // prod 1e7/5001 us, cons 2000/1 us, skip 2500 blocks, lock 6 s, settle 15 s, run 30 s
+    };
+
+    PhaseResult res[2];
+    for (std::size_t i = 0; i < 2; ++i)
+        res[i] = runPhase(phases[i]);
+
+    // A skipped phase B (allocation) is reported but does not fail the
+    // deployment verdict — the configuration is optional by RAM budget.
+    const bool overall = res[0].ran && res[0].pass && (!res[1].ran || res[1].pass);
+    std::printf("OVERALL: %s (A %s, B %s)\n", overall ? "PASS" : "FAIL",
+                res[0].ran ? (res[0].pass ? "PASS" : "FAIL") : "SKIP",
+                res[1].ran ? (res[1].pass ? "PASS" : "FAIL") : "SKIP");
+    std::printf("SRT_PICO2_DUALCORE_DONE\n"); // end-of-run marker, always the last line printed
+    while (true)
+        sleep_ms(1000); // park core0; nothing further to do
+}