diff --git a/examples/pico2_cyccnt/CMakeLists.txt b/examples/pico2_cyccnt/CMakeLists.txt new file mode 100644 index 0000000..1ade2a3 --- /dev/null +++ b/examples/pico2_cyccnt/CMakeLists.txt @@ -0,0 +1,58 @@ +# Standalone Raspberry Pi Pico 2 (RP2350) firmware: DWT.CYCCNT cycle +# measurement of the ASRC hot path on real Cortex-M33 silicon +# (docs/HARDWARE_TESTING.md, Setup 2). Deliberately NOT wired into the root +# build — configure this directory on its own: +# +# cmake -B build -DPICO_BOARD=pico2 +# cmake --build build -j +# +# The Pico SDK is fetched by git tag rather than release-tarball URL+SHA256: +# GitHub source tarballs exclude submodules, and USB CDC stdio needs +# lib/tinyusb. GIT_SUBMODULES limits the clone to that one submodule +# (lwip/btstack/mbedtls/cyw43 are not used here). +cmake_minimum_required(VERSION 3.24) + +set(PICO_BOARD pico2 CACHE STRING "Pico SDK board") +set(PICO_PLATFORM rp2350-arm-s CACHE STRING "Pico SDK platform (secure Arm)") + +# The library's constructors throw (allocation, filter design); the 12-channel +# case relies on catching bad_alloc instead of crashing. The SDK default is +# -fno-exceptions. +set(PICO_CXX_ENABLE_EXCEPTIONS 1 CACHE BOOL "") + +include(FetchContent) +FetchContent_Declare( + pico_sdk + GIT_REPOSITORY https://github.com/raspberrypi/pico-sdk.git + GIT_TAG 2.1.1 # first-class RP2350 support + GIT_SHALLOW TRUE + GIT_SUBMODULES "lib/tinyusb") +# Populate only downloads the SDK; it is wired into the build by the +# include()/pico_sdk_init() calls below. NOTE(review): the single-argument +# FetchContent_Populate() form is deprecated as of CMake 3.30 (policy +# CMP0169) — confirm behaviour on newer CMake. +FetchContent_GetProperties(pico_sdk) +if(NOT pico_sdk_POPULATED) + FetchContent_Populate(pico_sdk) +endif() +# Must be included before project() so the SDK can inject its toolchain file. +include(${pico_sdk_SOURCE_DIR}/pico_sdk_init.cmake) + +project(pico2_cyccnt C CXX ASM) +set(CMAKE_C_STANDARD 11) +set(CMAKE_CXX_STANDARD 20) +set(CMAKE_CXX_STANDARD_REQUIRED ON) + +pico_sdk_init() + +# Float datapath measurement: soft-double accumulation on the M33, expected +# ~19x the Q15 instruction count — slow but a real number is still valuable. +# NOTE(review): main.cpp's comment quotes ~3.8x for pipeline_float vs +# pipeline_q15 — confirm which ratio is meant here.
+option(PICO2_MEASURE_FLOAT "Measure the float (soft FP64) datapath too" ON) + +add_executable(pico2_cyccnt main.cpp) +# Self-contained except for the header-only library itself. +target_include_directories(pico2_cyccnt PRIVATE ${CMAKE_CURRENT_SOURCE_DIR}/../../include) +target_link_libraries(pico2_cyccnt PRIVATE pico_stdlib cmsis_core) +if(PICO2_MEASURE_FLOAT) + target_compile_definitions(pico2_cyccnt PRIVATE PICO2_MEASURE_FLOAT=1) +endif() + +pico_enable_stdio_usb(pico2_cyccnt 1) +pico_enable_stdio_uart(pico2_cyccnt 0) +pico_add_extra_outputs(pico2_cyccnt) # .uf2 etc. diff --git a/examples/pico2_cyccnt/README.md b/examples/pico2_cyccnt/README.md new file mode 100644 index 0000000..5dc54c5 --- /dev/null +++ b/examples/pico2_cyccnt/README.md @@ -0,0 +1,90 @@ +# pico2_cyccnt — RP2350 cycle measurement firmware + +The DWT.CYCCNT deliverable of [docs/HARDWARE_TESTING.md](../../docs/HARDWARE_TESTING.md) +Setup 2: runs the library's fixed pipeline workload (the same steady-state +`push(32)`/`pull(32)` loop as `bench/icount/icount_main.cpp`) on a real +Raspberry Pi Pico 2 and times each 32-frame block with the Cortex-M33's +hardware cycle counter. + +## Why + +[docs/PERFORMANCE.md](../../docs/PERFORMANCE.md) gates regressions on QEMU +*instruction* counts because they are deterministic and noise-free — but +silicon budgets are spent in *cycles*, and QEMU cannot provide those. This +firmware closes that loop: dividing its measured cycles/frame by the +committed M33 instruction baselines (`pipeline_q15` 484,146,844 insns per +96,000 frames = 5,043/frame; `pipeline12_q15` 962,613,655 = 10,027/frame) +calibrates the "1 QEMU instruction ≈ N RP2350 cycles" ratio, turning every +current and future M33 baseline into a real cycle budget. It also tests the +README's claim directly: Q15 mono fits a 150 MHz core with room to spare, +stereo is tighter. + +## Build + +This is a standalone project — it is *not* part of the root CMake build.
+Requires `cmake` ≥ 3.24, `arm-none-eabi-gcc` (tested with 13.2), and network +access on first configure, which fetches the Pico SDK 2.1.1 plus its TinyUSB +submodule. The first build takes several minutes and also needs a native +(host) compiler for the SDK's `picotool` build. + +```sh +cd examples/pico2_cyccnt +cmake -B build -DPICO_BOARD=pico2 +cmake --build build -j +``` + +Produces `build/pico2_cyccnt.uf2`. Options: + +- `-DPICO2_MEASURE_FLOAT=OFF` — skip the float (soft FP64) cases. + +## Flash + +Either hold BOOTSEL while plugging the Pico 2 in and copy the UF2 onto the +`RP2350` mass-storage drive: + +```sh +cp build/pico2_cyccnt.uf2 /media/$USER/RP2350/ +``` + +or use picotool (no BOOTSEL dance needed if the firmware is already running): + +```sh +picotool load -f build/pico2_cyccnt.uf2 +picotool reboot +``` + +## Run + +Open the USB CDC serial port; the firmware waits for a terminal before +printing anything, so nothing is lost: + +```sh +picocom /dev/ttyACM0 # or: minicom -D /dev/ttyACM0 +``` + +Expected output: a header with the sys clock (150 MHz default), then one row +per case — Q15 × {fast, balanced} × {1, 2, 12} channels, plus float 1ch — +with mean/p99/max cycles per 32-frame block, derived cycles/frame, and the +percentage of the core, at the reported sys clock, that one 48 kHz stream +costs. A 12-channel (or float) case that cannot allocate prints a `SKIP` row +instead. The run ends with: + +``` +SRT_PICO2_DONE +``` + +## Reading the numbers + +- **cyc/frame ÷ 5,043** (Q15 balanced 2ch) and **÷ 10,027** (12ch) give the + silicon cycles-per-QEMU-instruction ratio — the calibration constant for + all M33 instruction baselines in the README table. +- **%core@48k** is the headline budget figure. The float rows exist to put a + measured number on "soft-double accumulation is the wrong datapath here"; + Q15/Q31 are the intended paths on FP64-less cores. +- p99/max vs.
mean shows the (small) jitter from FIFO compaction, servo + block-rate work and whole-sample slips. The workload's buffers live in SRAM and + interrupts stay live, so USB housekeeping shows up in the tail; the code itself + executes from flash through the XIP cache (this project does not select a + `copy_to_ram`/`no_flash` binary type), so cache misses can land there too. + +Deviation from the icount workload: the cycled input buffer is 4,800 frames +(0.1 s) instead of 12,000 so the 12-channel case fits comfortably in the +RP2350's 520 KB SRAM; per-block work is unchanged. diff --git a/examples/pico2_cyccnt/main.cpp b/examples/pico2_cyccnt/main.cpp new file mode 100644 index 0000000..d8fd170 --- /dev/null +++ b/examples/pico2_cyccnt/main.cpp @@ -0,0 +1,181 @@ +// Real-silicon cycle measurement of the ASRC hot path on the RP2350's +// Cortex-M33 (docs/HARDWARE_TESTING.md, Setup 2). The steady-state workload +// is the same duplex push(32)/pull(32) loop as runPipeline() in +// bench/icount/icount_main.cpp, timed per block with DWT.CYCCNT. +// +// Calibration purpose: docs/PERFORMANCE.md gates regressions on QEMU +// *instruction* counts because they are deterministic; real cost is in +// *cycles*, which only hardware counters give. Dividing the mean cycles/frame +// printed here by the committed M33 QEMU baselines (bench/baselines.json, +// 2 s of 48 kHz audio = 96,000 frames per workload): +// +// pipeline_q15 (2ch, balanced) 484,146,844 insns = 5,043 insns/frame +// pipeline12_q15 (12ch, balanced) 962,613,655 insns = 10,027 insns/frame +// +// yields the "1 QEMU instruction ~= N RP2350 cycles" ratio that converts +// every M33 instruction baseline into a real cycle budget.
+#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include "RP2350.h" +#include "hardware/clocks.h" +#include "pico/stdlib.h" + +#include "srt/asrc.hpp" + +namespace { + +constexpr std::size_t kBlockFrames = 32; +constexpr std::size_t kWarmupIters = 1000; // past Filling/priming + servo settled +// Number of timed blocks per case; also the length of gCycles. +constexpr std::size_t kMeasureIters = 2000; + +// 997 Hz at 0.5 FS, cycled, as in icount_main.cpp — but 4800 frames (0.1 s) +// instead of 12000 so the 12-channel Q15 input block fits RP2350 SRAM next +// to the converter. The wrap seam is not periodic in the sine; irrelevant +// here, the cycle cost per block does not depend on sample values. +constexpr std::size_t kInputFrames = 4800; + +// A whole number of blocks, so cycling through the input uses every frame. +static_assert(kInputFrames % kBlockFrames == 0); + +// Per-block cycle deltas of the case being measured; runCase() overwrites all +// kMeasureIters entries and then sorts them in place. +std::uint32_t gCycles[kMeasureIters]; + +// TRCENA gates the whole DWT block; CYCCNTENA starts the free-running 32-bit +// cycle counter. CMSIS names from the SDK's core_cm33.h; the firmware runs in +// the secure state (rp2350-arm-s) so the registers are directly writable. +// 32-bit wrap is ~28.6 s at 150 MHz — per-block unsigned deltas are safe. +// Returns false if DWT.CTRL reports NOCYCCNT (no cycle counter implemented); +// otherwise zeroes CYCCNT, sets CYCCNTENA and returns true.
+bool enableCycleCounter() { + CoreDebug->DEMCR |= CoreDebug_DEMCR_TRCENA_Msk; + if (DWT->CTRL & DWT_CTRL_NOCYCCNT_Msk) + return false; // implementation without a cycle counter + DWT->CYCCNT = 0; + DWT->CTRL |= DWT_CTRL_CYCCNTENA_Msk; + return true; +} + +// Converts a double to the sample type: floating-point types are a plain cast, +// any other type is scaled by its numeric_limits max and passed through +// srt::detail::roundSat (presumably round-and-saturate; defined by the library). +template +S makeSample(double v) { + if constexpr (std::is_floating_point_v) + return static_cast(v); + else + return srt::detail::roundSat(v * static_cast(std::numeric_limits::max())); +} + +// Fills a vector with `samples` values of amp * sin(w * i), where w is the +// phase step of freqHz at a 48 kHz rate and i is the flat sample index. +template +std::vector sineBlock(std::size_t samples, double freqHz, double amp) { + std::vector out(samples); + const double w = 2.0 * std::numbers::pi * freqHz / 48000.0; + for (std::size_t i = 0; i < samples; ++i) + out[i] = makeSample(amp * std::sin(w * static_cast(i))); + return out; +} + +// Measures one case: builds a converter for `channels` channels with `spec`, +// runs kWarmupIters untimed blocks, times kMeasureIters blocks with +// DWT->CYCCNT and prints one result row. If setup throws std::exception it +// prints a SKIP row and returns. typeName/presetName are only used as labels. +template +void runCase(const char* typeName, const char* presetName, const srt::FilterSpec& spec, + std::size_t channels) { + srt::Config cfg; + cfg.channels = channels; + cfg.filter = spec; + + // Heap-constructed so allocation failure (e.g. 12ch + float on a tighter + // build) degrades to a printed SKIP row instead of a hard fault. + std::unique_ptr> asrc; + std::vector input; + std::vector out; + try { + asrc = std::make_unique>(cfg); + input = sineBlock(kInputFrames * channels, 997.0, 0.5); + out.resize(kBlockFrames * channels); + } catch (const std::exception& e) { + std::printf("%-6s %-9s %3u SKIP (%s)\n", typeName, presetName, + static_cast(channels), e.what()); + return; + } + + // The sink defeats dead-code elimination, exactly as in the icount + // workload; its soft-double add is inside the timed region there too. + // step() pushes one block, pulls one block and advances the input offset, + // resetting it to 0 before the next block would run past the end of `input`.
+ double sink = 0.0; + std::size_t off = 0; + const auto step = [&]() { + asrc->push(input.data() + off, kBlockFrames); + asrc->pull(out.data(), kBlockFrames); + off += kBlockFrames * channels; + if (off + kBlockFrames * channels > input.size()) + off = 0; + sink += static_cast(out[0]); + }; + + for (std::size_t i = 0; i < kWarmupIters; ++i) + step(); + for (std::size_t i = 0; i < kMeasureIters; ++i) { + const std::uint32_t t0 = DWT->CYCCNT; + step(); + gCycles[i] = DWT->CYCCNT - t0; + } + + std::uint64_t sum = 0; + for (const std::uint32_t c : gCycles) + sum += c; + std::sort(gCycles, gCycles + kMeasureIters); + const double mean = static_cast(sum) / static_cast(kMeasureIters); + const std::uint32_t p99 = gCycles[kMeasureIters * 99 / 100 - 1]; + const std::uint32_t mx = gCycles[kMeasureIters - 1]; + const double cyclesPerFrame = mean / static_cast(kBlockFrames); + // One 48 kHz stream's share of this core at the configured sys clock. + const double pctCore = + cyclesPerFrame * 48000.0 / static_cast(clock_get_hz(clk_sys)) * 100.0; + + // The WARN suffix is appended if the converter reports any underrun or + // overrun, or if sink is NaN (sink != sink). + const auto st = asrc->status(); + std::printf("%-6s %-9s %3u %10.0f %10lu %10lu %10.1f %8.2f%%%s\n", typeName, presetName, + static_cast(channels), mean, static_cast(p99), + static_cast(mx), cyclesPerFrame, pctCore, + (st.underruns != 0 || st.overruns != 0 || sink != sink) ? " WARN: not steady-state" + : ""); +} + +} // namespace + +// Entry point: waits for a USB CDC host, prints the header, enables the cycle +// counter (printing an error and idling if unavailable), runs every case, +// prints SRT_PICO2_DONE and then idles forever. +int main() { + stdio_init_all(); + // USB CDC drops everything printed before a host terminal attaches.
+ while (!stdio_usb_connected()) + sleep_ms(100); + sleep_ms(250); + + std::printf("SampleRateTap RP2350 DWT.CYCCNT measurement\n"); + std::printf("sys clock: %lu Hz, block: %u frames, warmup: %u, measured: %u iters\n", + static_cast(clock_get_hz(clk_sys)), + static_cast(kBlockFrames), static_cast(kWarmupIters), + static_cast(kMeasureIters)); + + if (!enableCycleCounter()) { + std::printf("ERROR: DWT cycle counter not implemented\nSRT_PICO2_DONE\n"); + while (true) + sleep_ms(1000); + } + + std::printf("%-6s %-9s %3s %10s %10s %10s %10s %9s\n", "type", "preset", "ch", "mean/blk", + "p99/blk", "max/blk", "cyc/frame", "%core@48k"); + + for (const std::size_t ch : {std::size_t{1}, std::size_t{2}, std::size_t{12}}) { + runCase("q15", "fast", srt::FilterSpec::fast(), ch); + runCase("q15", "balanced", srt::FilterSpec::balanced(), ch); + } +#if PICO2_MEASURE_FLOAT + // Soft FP64 accumulation: expected brutally slow on the M33 (the QEMU + // baselines put pipeline_float at ~3.8x pipeline_q15 instructions). + // NOTE(review): CMakeLists.txt quotes ~19x for the float datapath — confirm + // which ratio is meant. + runCase("float", "fast", srt::FilterSpec::fast(), 1); + runCase("float", "balanced", srt::FilterSpec::balanced(), 1); +#endif + + // End-of-run marker (also printed on the error path above), then idle. + std::printf("SRT_PICO2_DONE\n"); + while (true) + sleep_ms(1000); +}