diff --git a/examples/pico2_dualcore/CMakeLists.txt b/examples/pico2_dualcore/CMakeLists.txt new file mode 100644 index 0000000..cdb9c7a --- /dev/null +++ b/examples/pico2_dualcore/CMakeLists.txt @@ -0,0 +1,58 @@ +# Standalone Raspberry Pi Pico 2 (RP2350) firmware: dual-core deployment of +# the ASRC — producer on core0, consumer on core1, one core per clock domain +# (docs/HARDWARE_TESTING.md, Setup 2, "Dual-core deployment"). Deliberately +# NOT wired into the root build — configure this directory on its own: +# +# cmake -B build -DPICO_BOARD=pico2 +# cmake --build build -j +# +# The Pico SDK is fetched by git tag rather than release-tarball URL+SHA256: +# GitHub source tarballs exclude submodules, and USB CDC stdio needs +# lib/tinyusb. GIT_SUBMODULES limits the clone to that one submodule +# (lwip/btstack/mbedtls/cyw43 are not used here). +cmake_minimum_required(VERSION 3.24) + +set(PICO_BOARD pico2 CACHE STRING "Pico SDK board") +set(PICO_PLATFORM rp2350-arm-s CACHE STRING "Pico SDK platform (secure Arm)") + +# The library's constructors throw (allocation, filter design); the +# 12-channel phase relies on catching bad_alloc instead of crashing. The SDK +# default is -fno-exceptions. +set(PICO_CXX_ENABLE_EXCEPTIONS 1 CACHE BOOL "") + +include(FetchContent) +FetchContent_Declare( + pico_sdk + GIT_REPOSITORY https://github.com/raspberrypi/pico-sdk.git + GIT_TAG 2.1.1 # first-class RP2350 support + GIT_SHALLOW TRUE + GIT_SUBMODULES "lib/tinyusb") +FetchContent_GetProperties(pico_sdk) +if(NOT pico_sdk_POPULATED) + FetchContent_Populate(pico_sdk) +endif() +# Must be included before project() so the SDK can inject its toolchain file. +include(${pico_sdk_SOURCE_DIR}/pico_sdk_init.cmake) + +project(pico2_dualcore C CXX ASM) +set(CMAKE_C_STANDARD 11) +set(CMAKE_CXX_STANDARD 20) +set(CMAKE_CXX_STANDARD_REQUIRED ON) + +pico_sdk_init() + +add_executable(pico2_dualcore main.cpp) +# Self-contained except for the header-only library itself. +target_include_directories(pico2_dualcore PRIVATE ${CMAKE_CURRENT_SOURCE_DIR}/../../include) +target_link_libraries(pico2_dualcore PRIVATE pico_stdlib pico_multicore cmsis_core) + +# Telemetry prints from the producer core; when the USB host stops draining +# the CDC buffer, stdio_usb blocks the writer for up to this long before +# dropping output (SDK default: 500 ms — far more than the FIFO can absorb). +# Capped at 2 ms so the worst push stall stays inside the FIFO setpoint +# budget; see the kTargetLatencyFrames comment in main.cpp. +target_compile_definitions(pico2_dualcore PRIVATE PICO_STDIO_USB_STDOUT_TIMEOUT_US=2000) + +pico_enable_stdio_usb(pico2_dualcore 1) +pico_enable_stdio_uart(pico2_dualcore 0) +pico_add_extra_outputs(pico2_dualcore) # .uf2 etc. diff --git a/examples/pico2_dualcore/README.md b/examples/pico2_dualcore/README.md new file mode 100644 index 0000000..ea8f218 --- /dev/null +++ b/examples/pico2_dualcore/README.md @@ -0,0 +1,136 @@ +# pico2_dualcore — RP2350 dual-core deployment firmware + +The dual-core deliverable of [docs/HARDWARE_TESTING.md](../../docs/HARDWARE_TESTING.md) +Setup 2: the converter's two ends run on the RP2350's two Cortex-M33 cores, +one core per clock domain, and the firmware judges its own run — servo lock, +ppm convergence against a synthesized truth value, clean counters, and +measured `pull()` cycles — against PASS/FAIL gates. + +## The deployment shape it validates + +The README's platform guidance says that on Pico-class parts the Q15/Q31 +paths are the right datapaths, 48 kHz mono fits a 150 MHz core with room to +spare, and stereo `balanced()` wants the `fast()` preset *or the RP2350's +second core*. This firmware is that second-core deployment, built the way a +real one is: + +- **core0 = producer (input clock domain).** Pushes 32-frame blocks of a + 997 Hz sine, busy-paced on the microsecond timer at + `rate × (1 + 200e-6)` — a +200 ppm clock offset synthesized from the + shared timebase, so the servo's converged estimate has an exact truth + value to be judged against (the one thing two real crystals cannot give + you). core0 also owns USB telemetry, printing once per second. +- **core1 = consumer (output clock domain).** Pulls 32-frame blocks paced + at exactly the nominal rate and times every `pull()` with its own + DWT.CYCCNT (each RP2350 core has a private DWT — the 0xE000_0000 PPB + region is per-core, so the counter must be enabled *on* core1). +- The library's `push()`/`pull()` contract is one producer agent and one + consumer agent around a lock-free SPSC ring with acquire/release atomics — + the contract names agents, not threads, so two cores sharing the RP2350's + coherent SRAM satisfy it exactly as two threads do. Everything else that + crosses cores is an explicit block of 32-bit atomics (32-bit because + 64-bit `std::atomic` is not lock-free on the M33 — the same constraint + that shapes the library's own telemetry). + +Two phases, ~30 s each: + +| Phase | Config | Rates | Why | +|---|---|---|---| +| A | Q15 stereo `balanced()` | 48 kHz out, +200 ppm in | the config the README calls tight on one core | +| B | Q15 12-channel, `balanced()` band edges and servo scaled ×16/48 | 16 kHz out, +200 ppm in | the reference-microphone/AVB 12-channel shape at its deployment rate | + +Phase B is 16 kHz **by arithmetic, not caution**: the M33 QEMU baseline puts +`pipeline12_q15` at 10,027 insns/frame against a 150 MHz / 48 kHz budget of +3,125 cycles/frame — more than 3× over, and `pull()` of a single instance is +one consumer by contract, so no core assignment can split it across cores. +Dual-core buys one clock domain per core, not more datapath than one core +has. At 16 kHz the budget is 9,375 cycles/frame. The measured cycles/block +is rate-independent, so phase B still produces the real-silicon counterpart +of the 12-channel baseline. + +## Build + +Standalone project — *not* part of the root CMake build. Requires +`cmake` ≥ 3.24, `arm-none-eabi-gcc` (tested with 13.2), and network access +on first configure (fetches Pico SDK 2.1.1 plus its TinyUSB submodule; +several minutes, and a native compiler for the SDK's picotool build). + +```sh +cd examples/pico2_dualcore +cmake -B build -DPICO_BOARD=pico2 +cmake --build build -j +``` + +Produces `build/pico2_dualcore.uf2`. + +## Flash and run + +Hold BOOTSEL while plugging in and copy the UF2 onto the `RP2350` drive, or +use picotool: + +```sh +cp build/pico2_dualcore.uf2 /media/$USER/RP2350/ # or: +picotool load -f build/pico2_dualcore.uf2 && picotool reboot +``` + +Open the USB serial port (`picocom /dev/ttyACM0`); the firmware waits for a +terminal before starting, so nothing is lost. + +## Expected output + +A header, then one telemetry line per second per phase: + +``` +[A t= 9s] Locked ppm=+200.05 fill= 144.2 und=0 ovr=0 rsy=0 | pull/blk mean=... p99=... max=... (..% core) late<=..us +``` + +and per phase a verdict line: + +``` +SUMMARY A q15 2ch balanced @48000: PASS lock_ms=... ppm_final=+200.0 post_lock_und=0 ovr=0 rsy=0 pull_cyc_blk mean=... p99=... max=... cyc_frame=... core_pct=... late_max_us=... +``` + +PASS requires: Locked within 2 s (phase A; 6 s for B, whose servo is scaled +3× slower), every 1 Hz ppm sample after 10 s (A) / 15 s (B) within ±5 of ++200, and zero underruns/overruns/resyncs after first lock. The run ends +with an `OVERALL` line and: + +``` +SRT_PICO2_DUALCORE_DONE +``` + +## Reading the numbers + +- **core_pct** is the headroom figure: one stream's share of core1 at the + reported sys clock, `cyc_frame × rate / 150 MHz`. It prices `pull()` only + — by design, since `push()` is a ring write and the producer core's real + budget goes to whatever feeds it (here: telemetry). +- **Relation to the QEMU baselines** (`bench/baselines.json`, 2 s = 96,000 + frames per workload): `pipeline_q15` 484,146,844 insns = **5,043 + insns/frame**, `pipeline12_q15` 962,613,655 = **10,027 insns/frame**. + Those figures amortize one-time setup (soft-double Kaiser design, input + synthesis) over the workload, so they are upper bounds for the + steady-state loop this firmware times; `cyc_frame ÷ insns/frame` from the + sibling `examples/pico2_cyccnt` run gives the cycles-per-instruction + calibration that converts every M33 baseline into a cycle budget. +- Against the budgets: 3,125 cycles/frame buys one 48 kHz frame at + 150 MHz, 9,375 one 16 kHz frame. Phase A's `core_pct` is the measured + version of "stereo balanced() is tight on one core": whatever it reads, + that is the share of core1 a deployment must reserve — and on a + *single*-core deployment the same cycles would contend with the producer + side and the rest of the application, which is exactly why the input + domain lives on core0 here. +- **late_max_us** is the consumer's worst schedule slip. If `pull()` ever + exceeded the block period, lateness shows here long before the FIFO + counters do. +- The FIFO setpoint is 144 frames (3 ms at 48 kHz) rather than the default + 48: the producer core shares its time with USB logging, whose worst-case + writer stall is capped at 2 ms in the CMakeLists. The README latency rule + — the setpoint must exceed the peak occupancy excursion of push/pull + jitter — applied to a producer that also logs. + +This firmware cannot prove the inter-crystal lock that +HARDWARE_TESTING.md Setup 1/2 ultimately want (both domains here are paced +from the RP2350's one timer, which is what makes ppm = +200.0 an exact, +assertable truth); it proves the *deployment shape*: two cores, two clock +domains, lock-free handoff, and real cycle headroom numbers. diff --git a/examples/pico2_dualcore/main.cpp b/examples/pico2_dualcore/main.cpp new file mode 100644 index 0000000..eec0a8b --- /dev/null +++ b/examples/pico2_dualcore/main.cpp @@ -0,0 +1,548 @@ +// Dual-core deployment of the ASRC on the RP2350 (docs/HARDWARE_TESTING.md, +// Setup 2, "Dual-core deployment"): the converter's two ends on the two +// Cortex-M33 cores, one core per clock domain — the shape the README +// prescribes for configurations that are tight on a single 150 MHz core +// (Q15 stereo balanced(); 12-channel). +// +// core0 producer: push(32) paced at rate * (1 + 200e-6), plus USB telemetry +// core1 consumer: pull(32) paced at exactly the nominal rate, every call +// timed with the core-local DWT.CYCCNT +// +// Cross-core safety, stated explicitly: the library's runtime contract is +// one producer agent and one consumer agent around a lock-free SPSC ring +// with acquire/release atomics (srt/spsc_ring.hpp; "one producer thread and +// one consumer thread" in the README's Limitations). The contract is about +// agents and memory ordering, not about std::thread: the RP2350's cores +// share coherent SRAM (no data caches in front of it), so two CORES satisfy +// it exactly as two threads do. push() stays core0-only, pull() stays +// core1-only, status() is documented any-thread. Everything else that +// crosses cores is the explicit Shared block of 32-bit atomics below — kept +// 32-bit for the same reason the library keeps its telemetry 32-bit: on the +// M33, 64-bit std::atomic is not lock-free and would route through a +// library lock (see the footnote in asrc.hpp). +// +// Both pacing schedules derive from the same 64-bit microsecond timebase +// (the RP2350 timer is one shared block read by both cores), so the +// +200 ppm offset is exact by construction and the servo's converged +// estimate has a known truth value to be judged against — the one thing +// genuinely independent oscillators cannot provide. Due times are absolute, +// t0 + (b * num) / den in integer microseconds, so a stall (a USB telemetry +// write on core0) is followed by catch-up pushes, not permanent schedule +// slip. +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include "RP2350.h" +#include "hardware/clocks.h" +#include "pico/multicore.h" +#include "pico/stdlib.h" + +#include "srt/asrc.hpp" + +namespace { + +using Asrc = srt::AsyncSampleRateConverterQ15; + +constexpr std::size_t kBlockFrames = 32; +constexpr std::size_t kMaxChannels = 12; +constexpr std::size_t kInputFrames = 4800; // cycled producer buffer (0.1 s at 48 kHz) +constexpr double kOffsetPpm = 200.0; +constexpr double kPpmTolerance = 5.0; + +// FIFO setpoint budget: the producer core also writes telemetry, and +// stdio_usb may stall the writer for up to PICO_STDIO_USB_STDOUT_TIMEOUT_US +// (capped to 2 ms in CMakeLists.txt) when the host stops draining the CDC +// buffer. During such a stall the consumer keeps pulling — 2 ms is 96 frames +// at 48 kHz — so the setpoint must exceed it with margin: 144 frames (3 ms +// at 48 kHz, 9 ms at 16 kHz). This is the README's latency-section rule +// (the setpoint must exceed the peak occupancy excursion of push/pull +// jitter) applied to a producer that shares its core with logging. +constexpr std::size_t kTargetLatencyFrames = 144; + +// --------------------------------------------------------------------------- +// Cross-core shared state. The converter object itself is shared only +// through the pointer handoff below; this block carries phase control and +// the consumer's cycle statistics. All payload fields are lock-free 32-bit +// atomics; wider accumulators (the cycle sum) stay core1-private. +struct Shared { + // Phase handoff, core0 -> core1. The release store of the converter + // pointer publishes every plain write the constructor performed (filter + // table, ring, servo state) plus the relaxed parameter stores preceding + // it; core1's acquire load synchronizes-with that store. The SDK + // multicore FIFO is left to the launch protocol — an atomic pointer + // makes the C++ happens-before explicit instead of relying on hardware + // FIFO side effects. + std::atomic asrc{nullptr}; + std::atomic consNumUs{0}; // consumer due(b) = t0 + (b*num)/den us + std::atomic consDen{1}; + std::atomic statsSkipBlocks{0}; // exclude fill/acquire from stats + std::atomic stop{false}; // core0 -> core1: end of phase + std::atomic consumerDone{false}; // core1 -> core0: final stats published + std::atomic cyccnt{0}; // core1 -> core0: 0 unknown, 1 ok, 2 absent + + // Consumer stats snapshot, seqlock-style: seq is odd while the writer is + // mid-update; the reader retries until the same even value brackets the + // payload. The payload fields are themselves relaxed atomics (no torn + // reads, no UB); the seqlock only adds mutual coherence, so one printed + // line describes one instant. + std::atomic seq{0}; + std::atomic blocks{0}; // measured pull() calls + std::atomic meanCyc{0}; // cycles per pull(32) + std::atomic p99Cyc{0}; + std::atomic maxCyc{0}; + std::atomic lateMaxUs{0}; // worst consumer schedule slip +}; + +static_assert(std::atomic::is_always_lock_free && + std::atomic::is_always_lock_free && std::atomic::is_always_lock_free, + "cross-core state must be lock-free on the M33"); + +Shared g; + +struct Snapshot { + std::uint32_t blocks = 0; + std::uint32_t meanCyc = 0; + std::uint32_t p99Cyc = 0; + std::uint32_t maxCyc = 0; + std::uint32_t lateMaxUs = 0; +}; + +// Seqlock writer (core1 only). The release fence orders the odd mark before +// the payload stores; the final release store orders the payload before the +// even mark. +void publishSnapshot(const Snapshot& s) { + const std::uint32_t q = g.seq.load(std::memory_order_relaxed); + g.seq.store(q + 1, std::memory_order_relaxed); + std::atomic_thread_fence(std::memory_order_release); + g.blocks.store(s.blocks, std::memory_order_relaxed); + g.meanCyc.store(s.meanCyc, std::memory_order_relaxed); + g.p99Cyc.store(s.p99Cyc, std::memory_order_relaxed); + g.maxCyc.store(s.maxCyc, std::memory_order_relaxed); + g.lateMaxUs.store(s.lateMaxUs, std::memory_order_relaxed); + g.seq.store(q + 2, std::memory_order_release); +} + +// Seqlock reader (core0). The acquire fence pairs with the writer's final +// release store; a retry costs nothing at the 1 Hz read rate. +Snapshot readSnapshot() { + for (;;) { + const std::uint32_t q0 = g.seq.load(std::memory_order_acquire); + if (q0 & 1u) + continue; + Snapshot s; + s.blocks = g.blocks.load(std::memory_order_relaxed); + s.meanCyc = g.meanCyc.load(std::memory_order_relaxed); + s.p99Cyc = g.p99Cyc.load(std::memory_order_relaxed); + s.maxCyc = g.maxCyc.load(std::memory_order_relaxed); + s.lateMaxUs = g.lateMaxUs.load(std::memory_order_relaxed); + std::atomic_thread_fence(std::memory_order_acquire); + if (g.seq.load(std::memory_order_relaxed) == q0) + return s; + } +} + +// --------------------------------------------------------------------------- +// core1: the consumer / output clock domain. + +// TRCENA gates the whole DWT block; CYCCNTENA starts the free-running 32-bit +// cycle counter. CMSIS names from the SDK's core_cm33.h; the firmware runs in +// the secure state (rp2350-arm-s) so the registers are directly writable. +// 32-bit wrap is ~28.6 s at 150 MHz — per-block unsigned deltas are safe. +// +// Per-core, verified in the SDK headers: DWT_BASE 0xE0001000 (core_cm33.h) +// sits inside the PPB (PPB_BASE 0xe0000000, hardware/regs/addressmap.h), +// and hardware/structs/m33.h maps the whole PPB — dwt_ctrl/dwt_cyccnt +// included — as `m33_hw` at that one fixed address: whichever core +// dereferences it reaches its OWN block (the device header marks PPB +// registers such as NMI_MASK0 "core-local"). So this must run ON core1; +// enabling CYCCNT from core0 would only start core0's counter. One header +// caveat: the SVD-derived regs/m33.h gives a DWT_CTRL reset value with +// NOCYCCNT=1 — but that value (NUMCOMP=7; an M33 has at most 4 DWT +// comparators) is Arm's generic ARMv8-M template, contradicted by the RW +// DWT_CYCCNT register the same SVD defines; the runtime check below is the +// authoritative gate. +bool enableCycleCounter() { + CoreDebug->DEMCR |= CoreDebug_DEMCR_TRCENA_Msk; + if (DWT->CTRL & DWT_CTRL_NOCYCCNT_Msk) + return false; // implementation without a cycle counter + DWT->CYCCNT = 0; + DWT->CTRL |= DWT_CTRL_CYCCNTENA_Msk; + return true; +} + +// Histogram of cycles per pull(32) in 512-cycle buckets (covers 1M cycles +// per block — several times the heaviest expected case) so a running p99 is +// available without storing per-block samples. +constexpr unsigned kHistShift = 9; +constexpr std::size_t kHistBuckets = 2048; +std::uint32_t gHist[kHistBuckets]; + +std::int16_t gOut[kBlockFrames * kMaxChannels]; // consumer output block + +// Derive mean and p99 from core1-private accumulators and publish. p99 is +// the upper edge of the histogram bucket containing the 99th percentile. +void finalizeAndPublish(Snapshot s, std::uint64_t cycSum) { + if (s.blocks != 0) { + s.meanCyc = static_cast(cycSum / s.blocks); + const std::uint64_t target = static_cast(s.blocks) * 99 / 100; + std::uint64_t cum = 0; + for (std::size_t i = 0; i < kHistBuckets; ++i) { + cum += gHist[i]; + if (cum > target) { + s.p99Cyc = static_cast((i + 1) << kHistShift); + break; + } + } + s.p99Cyc = std::min(s.p99Cyc, s.maxCyc); + } + publishSnapshot(s); +} + +// Consumer loop: wait for a phase (converter pointer), pull at the exact +// nominal output rate until told to stop, publish stats once per second. +// Never prints — stdio stays a core0 concern, both for the FIFO budget and +// because contending on the stdio mutex from the paced core would put USB +// stalls on the output clock domain. +[[noreturn]] void core1Main() { + g.cyccnt.store(enableCycleCounter() ? 1 : 2, std::memory_order_release); + + for (;;) { + Asrc* asrc; + while ((asrc = g.asrc.load(std::memory_order_acquire)) == nullptr) + tight_loop_contents(); + + const std::uint32_t num = g.consNumUs.load(std::memory_order_relaxed); + const std::uint32_t den = g.consDen.load(std::memory_order_relaxed); + const std::uint32_t skip = g.statsSkipBlocks.load(std::memory_order_relaxed); + const bool timed = g.cyccnt.load(std::memory_order_relaxed) == 1; + + std::fill(std::begin(gHist), std::end(gHist), 0u); + std::uint64_t cycSum = 0; // core1-private; only 32-bit digests cross cores + Snapshot s; + publishSnapshot(s); // zero the previous phase's numbers + + const std::uint64_t t0 = time_us_64() + 1000; + std::uint64_t nextPubUs = t0 + 1000000; + for (std::uint64_t b = 0; !g.stop.load(std::memory_order_acquire); ++b) { + const std::uint64_t due = t0 + (b * num) / den; + std::uint64_t now = time_us_64(); + while (now < due) { + tight_loop_contents(); + now = time_us_64(); + } + + // Timed region is pull() alone: the per-block datapath + servo + // cost the deployment budget cares about. + const std::uint32_t c0 = DWT->CYCCNT; + asrc->pull(gOut, kBlockFrames); + const std::uint32_t cyc = DWT->CYCCNT - c0; + + if (b >= skip) { + if (timed) { + cycSum += cyc; + ++s.blocks; + s.maxCyc = std::max(s.maxCyc, cyc); + ++gHist[std::min(cyc >> kHistShift, kHistBuckets - 1)]; + } + // Schedule slip: if pull() ever exceeded the block period, + // lateness accumulates here long before the FIFO notices. + const std::uint64_t late = now - due; + if (late > s.lateMaxUs) + s.lateMaxUs = static_cast(std::min(late, ~0u)); + } + if (now >= nextPubUs) { + nextPubUs += 1000000; + finalizeAndPublish(s, cycSum); + } + } + + finalizeAndPublish(s, cycSum); // final numbers for the summary line + g.consumerDone.store(true, std::memory_order_release); + // Wait out the teardown so a stale pointer cannot restart the phase. + while (g.asrc.load(std::memory_order_acquire) != nullptr) + tight_loop_contents(); + } +} + +// Explicit core1 stack (the SDK default is 2 KB scratch RAM; pull() plus the +// servo's soft-double helpers fit, but the margin is not worth proving). +std::uint32_t gCore1Stack[1024]; + +// --------------------------------------------------------------------------- +// core0: the producer / input clock domain, plus telemetry and verdicts. + +struct PhaseSpec { + char tag; + const char* desc; + std::size_t channels; + double rateHz; + bool scaledTo16k; // scale balanced() band edges + servo by 16/48 + std::uint32_t prodNumUs; // producer due(b) = t0 + (b*num)/den us, +200 ppm baked in + std::uint32_t prodDen; + std::uint32_t consNumUs; // consumer schedule, exact nominal rate + std::uint32_t consDen; + std::uint32_t statsSkip; // consumer blocks excluded from cycle stats (~5 s) + std::uint32_t lockLimitMs; // PASS: Locked within this + std::uint32_t ppmSettleMs; // PASS: |ppm - 200| < 5 at every 1 Hz sample after this + std::uint32_t runMs; +}; + +struct PhaseResult { + bool ran = false; + bool pass = false; +}; + +// balanced() with band edges scaled to 16 kHz: identical L/T — same table +// size and same per-frame cycle cost — with pass/stop at the same normalized +// frequencies (README "Measured performance"; tests/test_asrc_quality_16k.cpp). +srt::FilterSpec balanced16k() { + srt::FilterSpec f = srt::FilterSpec::balanced(); + f.passbandHz = 20000.0 * 16.0 / 48.0; + f.stopbandHz = 28000.0 * 16.0 / 48.0; + return f; +} + +const char* stateName(srt::State s) { + switch (s) { + case srt::State::Filling: + return "Filling"; + case srt::State::Acquiring: + return "Acquiring"; + default: + return "Locked"; + } +} + +// 997 Hz at 0.5 FS, replicated to every channel: lock dynamics and cycle +// cost are content-independent, and one shared fractional position per +// frame is the multichannel design anyway. The cycled buffer's wrap seam is +// not phase-continuous; irrelevant for the same reason (same note as +// pico2_cyccnt). +std::vector sineInput(std::size_t channels, double rateHz) { + std::vector out(kInputFrames * channels); + const double w = 2.0 * std::numbers::pi * 997.0 / rateHz; + for (std::size_t f = 0; f < kInputFrames; ++f) { + const auto v = srt::detail::roundSat( + 0.5 * std::sin(w * static_cast(f)) * 32767.0); + for (std::size_t c = 0; c < channels; ++c) + out[f * channels + c] = v; + } + return out; +} + +PhaseResult runPhase(const PhaseSpec& ph) { + PhaseResult r; + + srt::Config cfg; + cfg.sampleRateHz = ph.rateHz; + cfg.channels = ph.channels; + cfg.targetLatencyFrames = kTargetLatencyFrames; + if (ph.scaledTo16k) { + // FilterSpec band edges and ServoConfig bandwidths are absolute Hz + // designed for ~48 kHz; both scale with the rate (README). + cfg.filter = balanced16k(); + const double sc = ph.rateHz / 48000.0; + cfg.servo.acquireBandwidthHz *= sc; + cfg.servo.trackBandwidthHz *= sc; + cfg.servo.quietBandwidthHz *= sc; + cfg.servo.acquireSmootherHz *= sc; + cfg.servo.trackSmootherHz *= sc; + cfg.servo.quietSmootherHz *= sc; + } + + // Heap-constructed so allocation failure (the 12-channel phase on a + // tighter build) degrades to a printed SKIP instead of a hard fault. + std::unique_ptr asrc; + std::vector input; + try { + asrc = std::make_unique(cfg); + input = sineInput(ph.channels, ph.rateHz); + } catch (const std::exception& e) { + std::printf("PHASE %c %s: SKIP (%s)\n", ph.tag, ph.desc, e.what()); + return r; + } + r.ran = true; + + std::printf("PHASE %c %s: %lu s run, lock limit %.1f s, ppm gate +/-%.0f after %.0f s\n", + ph.tag, ph.desc, static_cast(ph.runMs / 1000), + static_cast(ph.lockLimitMs) / 1000.0, kPpmTolerance, + static_cast(ph.ppmSettleMs) / 1000.0); + + // Hand the phase to core1: parameters first (relaxed), then the pointer + // with release — the store core1's acquire load synchronizes with. + g.consumerDone.store(false, std::memory_order_relaxed); + g.stop.store(false, std::memory_order_relaxed); + g.consNumUs.store(ph.consNumUs, std::memory_order_relaxed); + g.consDen.store(ph.consDen, std::memory_order_relaxed); + g.statsSkipBlocks.store(ph.statsSkip, std::memory_order_relaxed); + g.asrc.store(asrc.get(), std::memory_order_release); + + const std::uint64_t tStart = time_us_64(); + const std::uint64_t tEnd = tStart + static_cast(ph.runMs) * 1000; + const std::uint64_t t0 = tStart + 1000; + std::uint64_t nextTelemetryUs = tStart + 1000000; + + bool locked = false; + std::uint64_t lockUs = 0; + std::uint64_t undAtLock = 0, ovrAtLock = 0, rsyAtLock = 0; + bool ppmOk = true; + bool ppmSampled = false; + double ppmFinal = 0.0; + + std::size_t off = 0; + for (std::uint64_t b = 0;; ++b) { + const std::uint64_t due = t0 + (b * ph.prodNumUs) / ph.prodDen; + if (due >= tEnd) + break; + std::uint64_t now = time_us_64(); + while (now < due) { + tight_loop_contents(); + now = time_us_64(); + } + + asrc->push(input.data() + off, kBlockFrames); + off += kBlockFrames * ph.channels; + if (off + kBlockFrames * ph.channels > input.size()) + off = 0; + + const srt::Status st = asrc->status(); + if (!locked && st.state == srt::State::Locked) { + locked = true; + lockUs = time_us_64() - tStart; + undAtLock = st.underruns; + ovrAtLock = st.overruns; + rsyAtLock = st.resyncs; + } + + // 1 Hz telemetry. The printf may stall up to the 2 ms stdio cap; + // the absolute push schedule catches up immediately afterwards and + // the FIFO setpoint absorbs the dip (see kTargetLatencyFrames). + if (now >= nextTelemetryUs) { + nextTelemetryUs += 1000000; + const std::uint64_t tMs = (now - tStart) / 1000; + const Snapshot sn = readSnapshot(); + const double cycFrame = + static_cast(sn.meanCyc) / static_cast(kBlockFrames); + const double pctCore = + cycFrame * ph.rateHz / static_cast(clock_get_hz(clk_sys)) * 100.0; + std::printf( + "[%c t=%2lus] %-9s ppm=%+7.2f fill=%6.1f und=%lu ovr=%lu rsy=%lu | " + "pull/blk mean=%lu p99=%lu max=%lu (%4.1f%% core) late<=%luus\n", + ph.tag, static_cast(tMs / 1000), stateName(st.state), st.ppm, + st.fifoFillFrames, static_cast(st.underruns), + static_cast(st.overruns), static_cast(st.resyncs), + static_cast(sn.meanCyc), static_cast(sn.p99Cyc), + static_cast(sn.maxCyc), pctCore, + static_cast(sn.lateMaxUs)); + if (tMs >= ph.ppmSettleMs) { + ppmSampled = true; + if (std::fabs(st.ppm - kOffsetPpm) >= kPpmTolerance) + ppmOk = false; + } + } + } + + // Teardown: stop core1, wait for its final stats. consumerDone's + // release/acquire pair orders core1's last pull() before this point, so + // destroying the converter afterwards is safe. + g.stop.store(true, std::memory_order_release); + while (!g.consumerDone.load(std::memory_order_acquire)) + tight_loop_contents(); + const Snapshot fin = readSnapshot(); + const srt::Status st = asrc->status(); + ppmFinal = st.ppm; + g.asrc.store(nullptr, std::memory_order_release); + + // PASS = the deployment-shape claims, made falsifiable: + // 1. servo Locked within lockLimitMs of a cold start; + // 2. every 1 Hz ppm sample after ppmSettleMs within +/-5 of the + // synthesized +200 ppm truth (and at least one such sample); + // 3. zero underruns/overruns/resyncs after first lock — the + // both-cores-keeping-real-time criterion (overruns/resyncs are the + // signature of a consumer that cannot keep up, so they gate too). + const std::uint64_t und = st.underruns - undAtLock; + const std::uint64_t ovr = st.overruns - ovrAtLock; + const std::uint64_t rsy = st.resyncs - rsyAtLock; + const bool lockOk = locked && lockUs <= static_cast(ph.lockLimitMs) * 1000; + const bool cleanOk = locked && und == 0 && ovr == 0 && rsy == 0; + r.pass = lockOk && ppmOk && ppmSampled && cleanOk; + + const double cycFrame = static_cast(fin.meanCyc) / static_cast(kBlockFrames); + const double pctCore = + cycFrame * ph.rateHz / static_cast(clock_get_hz(clk_sys)) * 100.0; + std::printf("SUMMARY %c %s: %s lock_ms=%lu ppm_final=%+.2f post_lock_und=%lu ovr=%lu " + "rsy=%lu pull_cyc_blk mean=%lu p99=%lu max=%lu cyc_frame=%.1f core_pct=%.1f " + "late_max_us=%lu\n", + ph.tag, ph.desc, r.pass ? "PASS" : "FAIL", + static_cast(lockUs / 1000), ppmFinal, + static_cast(und), static_cast(ovr), + static_cast(rsy), static_cast(fin.meanCyc), + static_cast(fin.p99Cyc), static_cast(fin.maxCyc), + cycFrame, pctCore, static_cast(fin.lateMaxUs)); + return r; +} + +} // namespace + +int main() { + stdio_init_all(); + // USB CDC drops everything printed before a host terminal attaches. + while (!stdio_usb_connected()) + sleep_ms(100); + sleep_ms(250); + + std::printf("SampleRateTap RP2350 dual-core deployment\n"); + std::printf("sys clock %lu Hz; core0 push @ nominal*(1+%.0fe-6), core1 pull @ nominal; " + "block %u frames\n", + static_cast(clock_get_hz(clk_sys)), kOffsetPpm, + static_cast(kBlockFrames)); + + multicore_launch_core1_with_stack(core1Main, gCore1Stack, sizeof(gCore1Stack)); + while (g.cyccnt.load(std::memory_order_acquire) == 0) + tight_loop_contents(); // doubles as the launch handshake + if (g.cyccnt.load(std::memory_order_relaxed) == 2) + std::printf("WARN: core1 DWT has no cycle counter; pull timings will read 0\n"); + + // Producer schedules bake in the +200 ppm offset (exact integer rationals): + // A: 48 kHz * 1.0002 = 48009.6 Hz; 32 frames = 32e6/48009.6 us = 1e7/15003 us + // B: 16 kHz * 1.0002 = 16003.2 Hz; 32 frames = 1e7/5001 us + // Consumer schedules are the exact nominal rates: + // A: 32/48000 s = 2000/3 us; B: 32/16000 s = 2000/1 us + // + // Phase B pins the 12-channel shape at 16 kHz — the README's + // reference-microphone/AVB deployment rate — not 48 kHz: the M33 QEMU + // baseline puts pipeline12_q15 at 10,027 insns/frame against a + // 150 MHz / 48 kHz budget of 3,125 cycles/frame, more than 3x over, and + // pull() of one instance is a single consumer by contract — no core + // assignment can split it. At 16 kHz the budget is 9,375 cycles/frame. + // The measured cycles/block is rate-independent either way, so phase B + // still yields the real-silicon counterpart of the 12-channel baseline. + // Its lock/settle gates scale with the servo (bandwidths * 16/48). + const PhaseSpec phases[] = { + {'A', "q15 2ch balanced @48000", 2, 48000.0, false, 10000000, 15003, 2000, 3, 7500, 2000, + 10000, 30000}, + {'B', "q15 12ch balanced16k @16000", 12, 16000.0, true, 10000000, 5001, 2000, 1, 2500, 6000, + 15000, 30000}, + }; + + PhaseResult res[2]; + for (std::size_t i = 0; i < 2; ++i) + res[i] = runPhase(phases[i]); + + // A skipped phase B (allocation) is reported but does not fail the + // deployment verdict — the configuration is optional by RAM budget. + const bool overall = res[0].ran && res[0].pass && (!res[1].ran || res[1].pass); + std::printf("OVERALL: %s (A %s, B %s)\n", overall ? "PASS" : "FAIL", + res[0].ran ? (res[0].pass ? "PASS" : "FAIL") : "SKIP", + res[1].ran ? (res[1].pass ? "PASS" : "FAIL") : "SKIP"); + std::printf("SRT_PICO2_DUALCORE_DONE\n"); + while (true) + sleep_ms(1000); +}