Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
58 changes: 58 additions & 0 deletions examples/pico2_cyccnt/CMakeLists.txt
Original file line number Diff line number Diff line change
@@ -0,0 +1,58 @@
# Standalone Raspberry Pi Pico 2 (RP2350) firmware: DWT.CYCCNT cycle
# measurement of the ASRC hot path on real Cortex-M33 silicon
# (docs/HARDWARE_TESTING.md, Setup 2). Deliberately NOT wired into the root
# build — configure this directory on its own:
#
# cmake -B build -DPICO_BOARD=pico2
# cmake --build build -j
#
# The Pico SDK is fetched by git tag rather than release-tarball URL+SHA256:
# GitHub source tarballs exclude submodules, and USB CDC stdio needs
# lib/tinyusb. GIT_SUBMODULES limits the clone to that one submodule
# (lwip/btstack/mbedtls/cyw43 are not used here).
cmake_minimum_required(VERSION 3.24)

# Board/platform defaults for the Pico SDK. These are cache entries, so a -D
# on the command line still takes precedence.
set(PICO_BOARD pico2 CACHE STRING "Pico SDK board")
set(PICO_PLATFORM rp2350-arm-s CACHE STRING "Pico SDK platform (secure Arm)")

# The library's constructors throw (allocation, filter design); the 12-channel
# case relies on catching bad_alloc instead of crashing. The SDK default is
# -fno-exceptions.
set(PICO_CXX_ENABLE_EXCEPTIONS 1 CACHE BOOL "")

# Populate only (no add_subdirectory): the SDK is brought in through
# pico_sdk_init.cmake below.
# NOTE(review): single-argument FetchContent_Populate() is deprecated from
# CMake 3.30 (policy CMP0169); it keeps working at the 3.24 policy level
# requested above.
include(FetchContent)
FetchContent_Declare(
  pico_sdk
  GIT_REPOSITORY https://github.com/raspberrypi/pico-sdk.git
  GIT_TAG 2.1.1 # first-class RP2350 support
  GIT_SHALLOW TRUE
  GIT_SUBMODULES "lib/tinyusb")
FetchContent_GetProperties(pico_sdk)
if(NOT pico_sdk_POPULATED)
  FetchContent_Populate(pico_sdk)
endif()
# Must be included before project() so the SDK can inject its toolchain file.
include(${pico_sdk_SOURCE_DIR}/pico_sdk_init.cmake)

project(pico2_cyccnt C CXX ASM)
set(CMAKE_C_STANDARD 11)
set(CMAKE_CXX_STANDARD 20)
set(CMAKE_CXX_STANDARD_REQUIRED ON)

pico_sdk_init()

# Float datapath measurement: soft-double accumulation on the M33, expected
# ~19x the Q15 instruction count — slow but a real number is still valuable.
# NOTE(review): main.cpp quotes ~3.8x for pipeline_float vs. pipeline_q15;
# reconcile the two figures against bench/baselines.json.
option(PICO2_MEASURE_FLOAT "Measure the float (soft FP64) datapath too" ON)

add_executable(pico2_cyccnt main.cpp)
# Self-contained except for the header-only library itself.
target_include_directories(pico2_cyccnt PRIVATE ${CMAKE_CURRENT_SOURCE_DIR}/../../include)
target_link_libraries(pico2_cyccnt PRIVATE pico_stdlib cmsis_core)

# The SDK's malloc wrapper panics on a failed allocation by default
# (PICO_MALLOC_PANIC=1), so operator new would never see a null return and
# std::bad_alloc would never be thrown. Disable the panic so main.cpp's
# try/catch can print its SKIP row instead of halting the firmware.
target_compile_definitions(pico2_cyccnt PRIVATE PICO_MALLOC_PANIC=0)

if(PICO2_MEASURE_FLOAT)
  target_compile_definitions(pico2_cyccnt PRIVATE PICO2_MEASURE_FLOAT=1)
endif()

# Route stdio over USB CDC only; the UART backend is switched off.
pico_enable_stdio_usb(pico2_cyccnt 1)
pico_enable_stdio_uart(pico2_cyccnt 0)
pico_add_extra_outputs(pico2_cyccnt) # .uf2 etc.
90 changes: 90 additions & 0 deletions examples/pico2_cyccnt/README.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,90 @@
# pico2_cyccnt — RP2350 cycle measurement firmware

The DWT.CYCCNT deliverable of [docs/HARDWARE_TESTING.md](../../docs/HARDWARE_TESTING.md)
Setup 2: runs the library's fixed pipeline workload (the same steady-state
`push(32)`/`pull(32)` loop as `bench/icount/icount_main.cpp`) on a real
Raspberry Pi Pico 2 and times each 32-frame block with the Cortex-M33's
hardware cycle counter.

## Why

[docs/PERFORMANCE.md](../../docs/PERFORMANCE.md) gates regressions on QEMU
*instruction* counts because they are deterministic and noise-free — but
silicon budgets are spent in *cycles*, and QEMU cannot provide those. This
firmware closes that loop: dividing its measured cycles/frame by the
committed M33 instruction baselines (`pipeline_q15` 484,146,844 insns per
96,000 frames = 5,043/frame; `pipeline12_q15` 962,613,655 = 10,027/frame)
calibrates the "1 QEMU instruction ≈ N RP2350 cycles" ratio, turning every
current and future M33 baseline into a real cycle budget. It also tests the
README's claim directly: Q15 mono fits a 150 MHz core with room to spare,
stereo is tighter.

## Build

This is a standalone project — it is *not* part of the root CMake build.
Requires `cmake` ≥ 3.24, `arm-none-eabi-gcc` (tested with 13.2), and network
access on first configure, which fetches the Pico SDK 2.1.1 plus its TinyUSB
submodule. Expect the first build to take several minutes and to need a native
compiler for the SDK's `picotool` build.

```sh
cd examples/pico2_cyccnt
cmake -B build -DPICO_BOARD=pico2
cmake --build build -j
```

Produces `build/pico2_cyccnt.uf2`. Options:

- `-DPICO2_MEASURE_FLOAT=OFF` — skip the float (soft FP64) cases.

## Flash

Either hold BOOTSEL while plugging the Pico 2 in and copy the UF2 onto the
`RP2350` mass-storage drive:

```sh
cp build/pico2_cyccnt.uf2 /media/$USER/RP2350/
```

or use picotool (no BOOTSEL dance needed if the firmware is already running):

```sh
picotool load -f build/pico2_cyccnt.uf2
picotool reboot
```

## Run

Open the USB CDC serial port; the firmware waits for a terminal before
printing anything, so nothing is lost:

```sh
picocom /dev/ttyACM0 # or: minicom -D /dev/ttyACM0
```

Expected output: a header with the sys clock (150 MHz default), then one row
per case — Q15 × {fast, balanced} × {1, 2, 12} channels, plus float 1ch —
with mean/p99/max cycles per 32-frame block, derived cycles/frame, and the
percentage of a 150 MHz core that one 48 kHz stream costs. A 12-channel (or
float) case that cannot allocate prints a `SKIP` row instead. The run ends
with:

```
SRT_PICO2_DONE
```

## Reading the numbers

- **cyc/frame ÷ 5,043** (Q15 balanced 2ch) and **÷ 10,027** (12ch) give the
silicon cycles-per-QEMU-instruction ratio — the calibration constant for
all M33 instruction baselines in the README table.
- **%core@48k** is the headline budget figure. The float rows exist to put a
measured number on "soft-double accumulation is the wrong datapath here";
Q15/Q31 are the intended paths on FP64-less cores.
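- p99/max vs. mean shows the (small) jitter from FIFO compaction, servo
block-rate work and whole-sample slips. The workload's buffers live in SRAM
and interrupts stay live, so USB housekeeping shows up in the tail; the code
itself executes from flash through the XIP cache (the build does not select a
`copy_to_ram` binary), so cache misses can land there too.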

Deviation from the icount workload: the cycled input buffer is 4,800 frames
(0.1 s) instead of 12,000 so the 12-channel case fits comfortably in the
RP2350's 520 KB SRAM; per-block work is unchanged.
181 changes: 181 additions & 0 deletions examples/pico2_cyccnt/main.cpp
Original file line number Diff line number Diff line change
@@ -0,0 +1,181 @@
// Real-silicon cycle measurement of the ASRC hot path on the RP2350's
// Cortex-M33 (docs/HARDWARE_TESTING.md, Setup 2). The steady-state workload
// is the same duplex push(32)/pull(32) loop as runPipeline() in
// bench/icount/icount_main.cpp, timed per block with DWT.CYCCNT.
//
// Calibration purpose: docs/PERFORMANCE.md gates regressions on QEMU
// *instruction* counts because they are deterministic; real cost is in
// *cycles*, which only hardware counters give. Dividing the mean cycles/frame
// printed here by the committed M33 QEMU baselines (bench/baselines.json,
// 2 s of 48 kHz audio = 96,000 frames per workload):
//
// pipeline_q15 (2ch, balanced) 484,146,844 insns = 5,043 insns/frame
// pipeline12_q15 (12ch, balanced) 962,613,655 insns = 10,027 insns/frame
//
// yields the "1 QEMU instruction ~= N RP2350 cycles" ratio that converts
// every M33 instruction baseline into a real cycle budget.
#include <algorithm>
#include <cmath>
#include <cstdint>
#include <cstdio>
#include <exception>
#include <limits>
#include <memory>
#include <numbers>
#include <type_traits>
#include <vector>

#include "RP2350.h"
#include "hardware/clocks.h"
#include "pico/stdlib.h"

#include "srt/asrc.hpp"

namespace {

// Frames handed to push()/pull() on every iteration.
constexpr std::size_t kBlockFrames{32};
// Untimed iterations run first: past Filling/priming + servo settled.
constexpr std::size_t kWarmupIters{1000};
// Timed iterations; each one fills one slot of gCycles.
constexpr std::size_t kMeasureIters{2000};

// Length of the cycled input: 997 Hz at 0.5 FS, as in icount_main.cpp — but
// 4800 frames (0.1 s) rather than 12000, so the 12-channel Q15 input block
// fits RP2350 SRAM next to the converter. The wrap seam is not periodic in
// the sine; that is irrelevant here because the cycle cost per block does not
// depend on sample values.
constexpr std::size_t kInputFrames{4800};

// The read offset advances in whole blocks, so the buffer must hold an exact
// number of them.
static_assert(kInputFrames % kBlockFrames == 0);

// Per-block cycle deltas of the case currently being measured; static storage
// keeps the array off the stack, and every case overwrites it.
std::uint32_t gCycles[kMeasureIters];

// Turns on the DWT free-running 32-bit cycle counter.
//
// TRCENA gates the whole DWT block, so it is set first; CYCCNTENA then starts
// the counter, which is zeroed just before. CMSIS names from the SDK's
// core_cm33.h; the firmware runs in the secure state (rp2350-arm-s) so the
// registers are directly writable. 32-bit wrap is ~28.6 s at 150 MHz —
// per-block unsigned deltas are safe.
//
// Returns false, without touching CYCCNT, when DWT.CTRL.NOCYCCNT reports an
// implementation without a cycle counter; returns true once it is running.
//
// The read-modify-writes are spelled `reg = reg | mask` instead of
// `reg |= mask`: compound assignment to a volatile lvalue is deprecated in
// C++20 and draws -Wvolatile on older GCC releases. The register access
// sequence (one read, one write) is the same either way.
bool enableCycleCounter() {
    CoreDebug->DEMCR = CoreDebug->DEMCR | CoreDebug_DEMCR_TRCENA_Msk;
    if ((DWT->CTRL & DWT_CTRL_NOCYCCNT_Msk) != 0)
        return false; // implementation without a cycle counter
    DWT->CYCCNT = 0;
    DWT->CTRL = DWT->CTRL | DWT_CTRL_CYCCNTENA_Msk;
    return true;
}

// Converts a normalized value `v` to sample type S.
//
// Floating-point S: a plain cast. Any other S: `v` is scaled by S's largest
// representable value and handed to srt::detail::roundSat<S> for conversion.
template <typename S>
S makeSample(double v) {
    if constexpr (!std::is_floating_point_v<S>) {
        constexpr double kFullScale = static_cast<double>(std::numeric_limits<S>::max());
        return srt::detail::roundSat<S>(v * kFullScale);
    } else {
        return static_cast<S>(v);
    }
}

template <typename S>
std::vector<S> sineBlock(std::size_t samples, double freqHz, double amp) {
std::vector<S> out(samples);
const double w = 2.0 * std::numbers::pi * freqHz / 48000.0;
for (std::size_t i = 0; i < samples; ++i)
out[i] = makeSample<S>(amp * std::sin(w * static_cast<double>(i)));
return out;
}

// Measures one (sample type, preset, channel count) combination and prints a
// single result row.
//
//   typeName, presetName  labels printed in the first two columns
//   spec                  filter specification copied into the converter config
//   channels              channel count for the converter and both buffers
//
// The converter is driven with kWarmupIters untimed push/pull blocks, then
// kMeasureIters blocks timed individually with DWT.CYCCNT. The row reports
// mean, p99 and max cycles per block, cycles per frame, and the share of the
// core one 48 kHz stream costs at the current sys clock. A std::exception
// raised during setup yields a SKIP row instead.
template <typename S>
void runCase(const char* typeName, const char* presetName, const srt::FilterSpec& spec,
             std::size_t channels) {
    using Converter = srt::BasicAsyncSampleRateConverter<S>;

    srt::Config config;
    config.channels = channels;
    config.filter = spec;

    // Heap-constructed so allocation failure (e.g. 12ch + float on a tighter
    // build) degrades to a printed SKIP row instead of a hard fault.
    std::unique_ptr<Converter> converter;
    std::vector<S> source;
    std::vector<S> pulled;
    try {
        converter = std::make_unique<Converter>(config);
        source = sineBlock<S>(kInputFrames * channels, 997.0, 0.5);
        pulled.resize(kBlockFrames * channels);
    } catch (const std::exception& err) {
        std::printf("%-6s %-9s %3u SKIP (%s)\n", typeName, presetName,
                    static_cast<unsigned>(channels), err.what());
        return;
    }

    // The sink defeats dead-code elimination, exactly as in the icount
    // workload; its soft-double add is inside the timed region there too.
    double sink = 0.0;
    std::size_t cursor = 0;
    const auto runBlock = [&]() {
        converter->push(source.data() + cursor, kBlockFrames);
        converter->pull(pulled.data(), kBlockFrames);
        cursor += kBlockFrames * channels;
        // Rewind as soon as another full block would run past the end.
        if (cursor + kBlockFrames * channels > source.size())
            cursor = 0;
        sink += static_cast<double>(pulled[0]);
    };

    for (std::size_t remaining = kWarmupIters; remaining != 0; --remaining)
        runBlock();

    // gCycles has exactly kMeasureIters slots: one timed block per slot.
    for (std::uint32_t& slot : gCycles) {
        const std::uint32_t t0 = DWT->CYCCNT;
        runBlock();
        slot = DWT->CYCCNT - t0; // unsigned subtraction tolerates a counter wrap
    }

    // Total first, then sort in place so p99 and max can be read by index.
    std::uint64_t total = 0;
    for (std::size_t i = 0; i < kMeasureIters; ++i)
        total += gCycles[i];
    std::sort(std::begin(gCycles), std::end(gCycles));

    const double meanPerBlock = static_cast<double>(total) / static_cast<double>(kMeasureIters);
    const std::uint32_t p99 = gCycles[kMeasureIters * 99 / 100 - 1];
    const std::uint32_t worst = gCycles[kMeasureIters - 1];
    const double cyclesPerFrame = meanPerBlock / static_cast<double>(kBlockFrames);
    // One 48 kHz stream's share of this core at the configured sys clock.
    const double sysClockHz = static_cast<double>(clock_get_hz(clk_sys));
    const double pctCore = cyclesPerFrame * 48000.0 / sysClockHz * 100.0;

    // Flag the row when the converter reported an under/overrun or the sink
    // became NaN (the only value that compares unequal to itself).
    const auto st = converter->status();
    const bool unsteady = st.underruns != 0 || st.overruns != 0 || sink != sink;
    const char* const note = unsteady ? " WARN: not steady-state" : "";

    std::printf("%-6s %-9s %3u %10.0f %10lu %10lu %10.1f %8.2f%%%s\n", typeName, presetName,
                static_cast<unsigned>(channels), meanPerBlock, static_cast<unsigned long>(p99),
                static_cast<unsigned long>(worst), cyclesPerFrame, pctCore, note);
}

} // namespace

// Parks the core in one-second sleeps; never returns.
[[noreturn]] static void idleForever() {
    for (;;)
        sleep_ms(1000);
}

// Firmware entry point: waits for a USB CDC terminal, prints the run
// parameters, enables the cycle counter, runs every measurement case, prints
// the SRT_PICO2_DONE marker and then idles forever.
int main() {
    stdio_init_all();
    // USB CDC drops everything printed before a host terminal attaches.
    while (!stdio_usb_connected()) {
        sleep_ms(100);
    }
    sleep_ms(250); // short extra pause after the host connects

    std::printf("SampleRateTap RP2350 DWT.CYCCNT measurement\n");
    std::printf("sys clock: %lu Hz, block: %u frames, warmup: %u, measured: %u iters\n",
                static_cast<unsigned long>(clock_get_hz(clk_sys)),
                static_cast<unsigned>(kBlockFrames), static_cast<unsigned>(kWarmupIters),
                static_cast<unsigned>(kMeasureIters));

    // Without a cycle counter there is nothing to measure: report the error,
    // still emit the end marker, and park.
    const bool haveCounter = enableCycleCounter();
    if (!haveCounter) {
        std::printf("ERROR: DWT cycle counter not implemented\nSRT_PICO2_DONE\n");
        idleForever();
    }

    // Column headings; widths match the result rows printed by runCase().
    std::printf("%-6s %-9s %3s %10s %10s %10s %10s %9s\n", "type", "preset", "ch", "mean/blk",
                "p99/blk", "max/blk", "cyc/frame", "%core@48k");

    // Q15: both presets at each channel count, fast before balanced.
    constexpr std::size_t kChannelCounts[] = {1, 2, 12};
    for (const std::size_t channelCount : kChannelCounts) {
        runCase<std::int16_t>("q15", "fast", srt::FilterSpec::fast(), channelCount);
        runCase<std::int16_t>("q15", "balanced", srt::FilterSpec::balanced(), channelCount);
    }
#if PICO2_MEASURE_FLOAT
    // Soft FP64 accumulation: expected brutally slow on the M33 (the QEMU
    // baselines put pipeline_float at ~3.8x pipeline_q15 instructions).
    runCase<float>("float", "fast", srt::FilterSpec::fast(), 1);
    runCase<float>("float", "balanced", srt::FilterSpec::balanced(), 1);
#endif

    std::printf("SRT_PICO2_DONE\n");
    idleForever();
}
Loading