perf(playback): optimize audio conversion with 16-bit dithering and bit shifts

Since Spotify audio is always 16-bit depth, optimize the conversion pipeline:

- Always dither at 16-bit level regardless of output format
- Preserve fractional precision until final rounding for better requantization
- Replace floating-point multiplication with compile-time bit shifts
- Add comprehensive inlining to eliminate function call overhead
- Specialize 24-bit clamping to remove runtime branching

This maintains proper dithering of the original 16-bit quantization artifacts while maximizing performance through bit-shift operations and eliminating unnecessary runtime calculations.
2025-10-03 01:39:28 +02:00 · 2025-08-14 00:31:59 +02:00 · 2025-08-14 00:31:59 +02:00 · f59766af7e
commit f59766af7e
parent 218eced556
3 changed files with 61 additions and 38 deletions
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@ -20,6 +20,7 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0
 - [metadata] Replaced `AudioFileFormat` with own enum. (breaking)
 - [playback] Changed trait `Mixer::open` to return `Result<Self, Error>` instead of `Self` (breaking)
 - [playback] Changed type alias `MixerFn` to return `Result<Arc<dyn Mixer>, Error>` instead of `Arc<dyn Mixer>` (breaking)
 - [playback] Optimize audio conversion to always dither at 16-bit level and use bit shifts for scaling
 ### Added
--- a/playback/src/convert.rs
+++ b/playback/src/convert.rs
@ -35,81 +35,100 @@ impl Converter {
        }
    }
-    /// To convert PCM samples from floating point normalized as `-1.0..=1.0`
+    /// Base bit positions for PCM format scaling. These represent the position
-    /// to 32-bit signed integer, multiply by 2147483648 (0x80000000) and
+    /// of the most significant bit in each format's full-scale representation.
-    /// saturate at the bounds of `i32`.
+    /// For signed integers in two's complement, full scale is 2^(bits-1).
-    const SCALE_S32: f64 = 2147483648.;
+    const SHIFT_S16: u8 = 15; // 16-bit: 2^15 = 32768
    const SHIFT_S24: u8 = 23; // 24-bit: 2^23 = 8388608  
    const SHIFT_S32: u8 = 31; // 32-bit: 2^31 = 2147483648
    /// To convert PCM samples from floating point normalized as `-1.0..=1.0`
    /// to 24-bit signed integer, multiply by 8388608 (0x800000) and saturate
    /// at the bounds of `i24`.
    const SCALE_S24: f64 = 8388608.;
-    /// To convert PCM samples from floating point normalized as `-1.0..=1.0`
+    /// Additional bit shifts needed to scale from 16-bit to higher bit depths.
-    /// to 16-bit signed integer, multiply by 32768 (0x8000) and saturate at
+    /// These are the differences between the base shift amounts above.
-    /// the bounds of `i16`. When the samples were encoded using the same
+    const SHIFT_16_TO_24: u8 = Self::SHIFT_S24 - Self::SHIFT_S16; // 23 - 15 = 8
-    /// scaling factor, like the reference Vorbis encoder does, this makes
+    const SHIFT_16_TO_32: u8 = Self::SHIFT_S32 - Self::SHIFT_S16; // 31 - 15 = 16
    /// conversions transparent.
    const SCALE_S16: f64 = 32768.;
-    pub fn scale(&mut self, sample: f64, factor: f64) -> f64 {
+    /// Pre-calculated scale factor for 24-bit clamping bounds
-        // From the many float to int conversion methods available, match what
+    const SCALE_S24: f64 = (1_u64 << Self::SHIFT_S24) as f64;
        // the reference Vorbis implementation uses: sample * 32768 (for 16 bit)
-        // Casting float to integer rounds towards zero by default, i.e. it
+    /// Scale audio samples with optimal dithering strategy for Spotify's 16-bit source material.
-        // truncates, and that generates larger error than rounding to nearest.
+    /// 
    /// Since Spotify audio is always 16-bit depth, this function:
    /// 1. When dithering: applies noise at 16-bit level, preserves fractional precision,
    ///    then scales to target format and rounds once at the end
    /// 2. When not dithering: scales directly from normalized float to target format
    /// 
    /// The `shift` parameter specifies how many extra bits to shift beyond
    /// the base 16-bit scaling (0 for 16-bit, 8 for 24-bit, 16 for 32-bit).
    #[inline]
    pub fn scale(&mut self, sample: f64, shift: u8) -> f64 {
        // NOTE(review): the shifts below only build the power-of-two scale
        // factor; the sample itself is still multiplied in floating point.
        // `1_u64 << n` overflows for n >= 64 (a panic in debug builds); the
        // callers visible in this diff only pass a `shift` of 0, 8 or 16.
        match self.ditherer.as_mut() {
-            Some(d) => (sample * factor + d.noise()).round(),
+            Some(d) => {
-            None => (sample * factor).round(),
+                // With dithering: Apply noise at 16-bit level to address original quantization,
                // then scale up to target format while preserving sub-LSB information
                let dithered_16bit = sample * (1_u64 << Self::SHIFT_S16) as f64 + d.noise();
                // The dithered value is deliberately left unrounded here, so the
                // only rounding step happens after scaling to the target depth.
                let scaled = dithered_16bit * (1_u64 << shift) as f64;
                scaled.round()
            }
            None => {
                // No dithering: Scale directly from normalized float to target format
                // using a single bit shift operation (base 16-bit shift + additional shift)
                // Both operands are `u8`, so the sum is 15, 23 or 31 for the
                // shift values used by the converters in this diff.
                let total_shift = Self::SHIFT_S16 + shift;
                (sample * (1_u64 << total_shift) as f64).round()
            }
        }
    }
-    // Special case for samples packed in a word of greater bit depth (e.g.
+    /// Clamping scale specifically for 24-bit output to prevent MSB overflow.
-    // S24): clamp between min and max to ensure that the most significant
+    /// Only used for S24 formats where samples are packed in 32-bit words.
-    // byte is zero. Otherwise, dithering may cause an overflow. This is not
+    /// Ensures the most significant byte is zero to prevent overflow during dithering.
-    // necessary for other formats, because casting to integer will saturate
+    #[inline]
-    // to the bounds of the primitive.
+    pub fn clamping_scale_s24(&mut self, sample: f64) -> f64 {
-    pub fn clamping_scale(&mut self, sample: f64, factor: f64) -> f64 {
+        let int_value = self.scale(sample, Self::SHIFT_16_TO_24);
-        let int_value = self.scale(sample, factor);
+        
        // In two's complement, there are more negative than positive values.
        // With `SCALE_S24` equal to 2^23, the bounds below are therefore the
        // asymmetric range -8388608..=8388607.
-        let min = -factor;
+        let min = -Self::SCALE_S24;
-        let max = factor - 1.0;
+        let max = Self::SCALE_S24 - 1.0;
-
+        
        // `scale` already rounded the value and both bounds are whole numbers,
        // so the clamped result is still integral.
        int_value.clamp(min, max)
    }
    /// Narrows every `f64` sample to `f32` with a plain `as` cast.
    ///
    /// No scaling or dithering is applied; the output has the same length
    /// and order as `samples`.
    #[inline]
    pub fn f64_to_f32(&mut self, samples: &[f64]) -> Vec<f32> {
        let mut narrowed = Vec::with_capacity(samples.len());
        narrowed.extend(samples.iter().map(|&value| value as f32));
        narrowed
    }
    #[inline]
    pub fn f64_to_s32(&mut self, samples: &[f64]) -> Vec<i32> {
        // Float-to-integer `as` casts saturate, so a scaled value outside the
        // `i32` range (e.g. +1.0 scaled by 2^31) becomes `i32::MAX` or
        // `i32::MIN` instead of wrapping. No explicit clamp is needed here.
        samples
            .iter()
-            .map(|sample| self.scale(*sample, Self::SCALE_S32) as i32)
+            .map(|sample| self.scale(*sample, Self::SHIFT_16_TO_32) as i32)
            .collect()
    }
-    // S24 is 24-bit PCM packed in an upper 32-bit word
+    /// S24 is 24-bit PCM packed in an upper 32-bit word
    #[inline]
    pub fn f64_to_s24(&mut self, samples: &[f64]) -> Vec<i32> {
        // The value is clamped to the signed 24-bit range before the cast,
        // because the `i32` cast alone would only saturate at 32-bit bounds.
        samples
            .iter()
-            .map(|sample| self.clamping_scale(*sample, Self::SCALE_S24) as i32)
+            .map(|sample| self.clamping_scale_s24(*sample) as i32)
            .collect()
    }
-    // S24_3 is 24-bit PCM in a 3-byte array
+    /// S24_3 is 24-bit PCM in a 3-byte array
    #[inline]
    pub fn f64_to_s24_3(&mut self, samples: &[f64]) -> Vec<i24> {
        // Each sample is clamped to the signed 24-bit range, cast to `i32`,
        // and then handed to `i24::from_s24` for the 3-byte representation.
        samples
            .iter()
-            .map(|sample| i24::from_s24(self.clamping_scale(*sample, Self::SCALE_S24) as i32))
+            .map(|sample| i24::from_s24(self.clamping_scale_s24(*sample) as i32))
            .collect()
    }
    #[inline]
    pub fn f64_to_s16(&mut self, samples: &[f64]) -> Vec<i16> {
        // A shift of 0 means no scaling beyond the base 16-bit factor.
        // The `as i16` cast saturates, so out-of-range values (including
        // full-scale +1.0, which scales to 32768) become `i16::MAX`/`i16::MIN`.
        samples
            .iter()
-            .map(|sample| self.scale(*sample, Self::SCALE_S16) as i16)
+            .map(|sample| self.scale(*sample, 0) as i16)
            .collect()
    }
 }
--- a/playback/src/dither.rs
+++ b/playback/src/dither.rs
@ -64,6 +64,7 @@ impl Ditherer for TriangularDitherer {
        Self::NAME
    }
    /// Draws one value from `self.distribution`, using `self.cached_rng`
    /// as the random source.
    #[inline]
    fn noise(&mut self) -> f64 {
        let rng = &mut self.cached_rng;
        self.distribution.sample(rng)
    }
@ -98,6 +99,7 @@ impl Ditherer for GaussianDitherer {
        Self::NAME
    }
    /// Draws one value from `self.distribution`, using `self.cached_rng`
    /// as the random source.
    #[inline]
    fn noise(&mut self) -> f64 {
        let rng = &mut self.cached_rng;
        self.distribution.sample(rng)
    }
@ -130,6 +132,7 @@ impl Ditherer for HighPassDitherer {
        Self::NAME
    }
    #[inline]
    fn noise(&mut self) -> f64 {
        let new_noise = self.distribution.sample(&mut self.cached_rng);
        let high_passed_noise = new_noise - self.previous_noises[self.active_channel];