fix: use rounding for float-to-integer conversions

roderickvd · roderickvd · commit 78bf2c3fa5da · 2025-09-09T08:13:19.000+02:00
Replace truncating casts with proper rounding in float-to-integer sample
conversions to eliminate bias and preserve small signals.

Changes:
- Apply f32::round() and f64::round() before the `as` cast instead of casting
  directly, which truncates
- Eliminates bias towards zero from truncation behavior
- Preserves small audio signals that would otherwise be truncated to zero
- Removes nonlinear distortion caused by all scaled sample values in (-1.0, 1.0)
  truncating to zero, which made the zero interval twice as wide as any other
  quantization step

Marks the sqrt and round helper functions `#[inline]` for performance.

Additional tests verify proper rounding behavior for cases that would
fail with truncation.
diff --git a/CHANGELOG.md b/CHANGELOG.md
@@ -5,6 +5,7 @@
   yielding samples when the underlying signal gets exhausted. This is a breaking
   change. The return type of the `IntoInterleavedSamples#next_sample` method was
   modified.
+- Improved float-to-integer conversions to use proper rounding instead of truncation.
 
 ---
 
diff --git a/dasp_sample/src/conv.rs b/dasp_sample/src/conv.rs
@@ -126,7 +126,9 @@ macro_rules! conversion_fns {
 macro_rules! conversions {
     ($T:ident, $mod_name:ident { $($rest:tt)* }) => {
         pub mod $mod_name {
-            use $crate::types::{I24, U24, I48, U48};
+            #[allow(unused_imports)] // not every generated module calls into `ops`
+            use $crate::ops;
+            use $crate::{types::{I24, U24, I48, U48}};
             conversion_fns!($T, $($rest)*);
         }
     };
@@ -531,12 +533,12 @@ conversions!(u64, u64 {
 // The following conversions assume `-1.0 <= s < 1.0` (note that +1.0 is excluded) and will
 // overflow otherwise.
 conversions!(f32, f32 {
-    s to_i8 { (s * 128.0) as i8 }
-    s to_i16 { (s * 32_768.0) as i16 }
-    s to_i24 { I24::new_unchecked((s * 8_388_608.0) as i32) }
-    s to_i32 { (s * 2_147_483_648.0) as i32 }
-    s to_i48 { I48::new_unchecked((s * 140_737_488_355_328.0) as i64) }
-    s to_i64 { (s * 9_223_372_036_854_775_808.0) as i64 }
+    s to_i8 { ops::f32::round(s * 128.0) as i8 }
+    s to_i16 { ops::f32::round(s * 32_768.0) as i16 }
+    s to_i24 { I24::new_unchecked(ops::f32::round(s * 8_388_608.0).min(8_388_607.0) as i32) } // rounding can reach 2^23 for s just below 1.0; clamp to the I24 maximum
+    s to_i32 { ops::f32::round(s * 2_147_483_648.0) as i32 }
+    s to_i48 { I48::new_unchecked(ops::f32::round(s * 140_737_488_355_328.0) as i64) }
+    s to_i64 { ops::f32::round(s * 9_223_372_036_854_775_808.0) as i64 }
     s to_u8 { super::i8::to_u8(to_i8(s)) }
     s to_u16 { super::i16::to_u16(to_i16(s)) }
     s to_u24 { super::i24::to_u24(to_i24(s)) }
@@ -549,12 +551,12 @@ conversions!(f32, f32 {
 // The following conversions assume `-1.0 <= s < 1.0` (note that +1.0 is excluded) and will
 // overflow otherwise.
 conversions!(f64, f64 {
-    s to_i8 { (s * 128.0) as i8 }
-    s to_i16 { (s * 32_768.0) as i16 }
-    s to_i24 { I24::new_unchecked((s * 8_388_608.0) as i32) }
-    s to_i32 { (s * 2_147_483_648.0) as i32 }
-    s to_i48 { I48::new_unchecked((s * 140_737_488_355_328.0) as i64) }
-    s to_i64 { (s * 9_223_372_036_854_775_808.0) as i64 }
+    s to_i8 { ops::f64::round(s * 128.0) as i8 }
+    s to_i16 { ops::f64::round(s * 32_768.0) as i16 }
+    s to_i24 { I24::new_unchecked(ops::f64::round(s * 8_388_608.0).min(8_388_607.0) as i32) } // rounding can reach 2^23 for s just below 1.0; clamp to the I24 maximum
+    s to_i32 { ops::f64::round(s * 2_147_483_648.0) as i32 }
+    s to_i48 { I48::new_unchecked(ops::f64::round(s * 140_737_488_355_328.0).min(140_737_488_355_327.0) as i64) } // rounding can reach 2^47 for s just below 1.0; clamp to the I48 maximum
+    s to_i64 { ops::f64::round(s * 9_223_372_036_854_775_808.0) as i64 }
     s to_u8 { super::i8::to_u8(to_i8(s)) }
     s to_u16 { super::i16::to_u16(to_i16(s)) }
     s to_u24 { super::i24::to_u24(to_i24(s)) }
diff --git a/dasp_sample/src/ops.rs b/dasp_sample/src/ops.rs
@@ -3,6 +3,7 @@ pub mod f32 {
     /// Uses bit manipulation for initial guess, then 3 iterations for ~6-7 decimal places.
     /// Accuracy: ~6-7 decimal places
     #[cfg(not(feature = "std"))]
+    #[inline]
     pub fn sqrt(x: f32) -> f32 {
         if x < 0.0 {
             return f32::NAN;
@@ -31,13 +32,27 @@ pub mod f32 {
     pub fn sqrt(x: f32) -> f32 {
         x.sqrt()
     }
+
+    #[cfg(not(feature = "std"))]
+    #[inline]
+    pub fn round(x: f32) -> f32 {
+        // Truncate, then step away from zero if the remainder is >= 0.5 (`x + 0.5` can misround).
+        let t = x as i64 as f32;
+        t + ((x - t >= 0.5) as i8 - (x - t <= -0.5) as i8) as f32
+    }
+    #[cfg(feature = "std")]
+    #[inline]
+    pub fn round(x: f32) -> f32 {
+        x.round() // std also rounds half-way cases away from zero
+    }
 }
 
 pub mod f64 {
     /// Newton-Raphson square root implementation for f64.
     /// Uses bit manipulation for initial guess, then 4 iterations for ~14-15 decimal places.
     /// Accuracy: ~14-15 decimal places
     #[cfg(not(feature = "std"))]
+    #[inline]
     pub fn sqrt(x: f64) -> f64 {
         if x < 0.0 {
             return f64::NAN;
@@ -66,4 +81,17 @@ pub mod f64 {
     pub fn sqrt(x: f64) -> f64 {
         x.sqrt()
     }
+
+    #[cfg(not(feature = "std"))]
+    #[inline]
+    pub fn round(x: f64) -> f64 {
+        // Truncate, then step away from zero if the remainder is >= 0.5 (`x + 0.5` can misround).
+        let t = x as i64 as f64;
+        t + ((x - t >= 0.5) as i8 - (x - t <= -0.5) as i8) as f64
+    }
+    #[cfg(feature = "std")]
+    #[inline]
+    pub fn round(x: f64) -> f64 {
+        x.round() // std also rounds half-way cases away from zero
+    }
 }
diff --git a/dasp_sample/tests/conv.rs b/dasp_sample/tests/conv.rs
@@ -479,11 +479,11 @@ tests!(u64 {
 });
 
 tests!(f32 {
-    to_i8  { -1.0, -128; 0.0, 0; }
-    to_i16 { -1.0, -32_768; 0.0, 0; }
-    to_i24 { -1.0, -8_388_608; 0.0, 0; }
-    to_i32 { -1.0, -2_147_483_648; 0.0, 0; }
-    to_i48 { -1.0, -140_737_488_355_328; 0.0, 0; }
+    to_i8  { -1.0, -128; 0.0, 0; 0.1, 13; 0.004, 1; -0.004, -1; 0.003, 0; }
+    to_i16 { -1.0, -32_768; 0.0, 0; 0.1, 3277; 0.00002, 1; 0.00001, 0; }
+    to_i24 { -1.0, -8_388_608; 0.0, 0; 0.1, 838861; 0.0000001, 1; -0.0000001, -1; 0.00000005, 0; }
+    to_i32 { -1.0, -2_147_483_648; 0.0, 0; 0.0000000004, 1; -0.0000000004, -1; 0.0000000002, 0; }
+    to_i48 { -1.0, -140_737_488_355_328; 0.0, 0; 0.000000000000006, 1; -0.000000000000006, -1; 0.000000000000003, 0; }
     to_i64 { -1.0, -9_223_372_036_854_775_808; 0.0, 0; }
     to_u8  { -1.0, 0; 0.0, 128; }
     to_u16 { -1.0, 0; 0.0, 32_768; }
@@ -495,12 +495,12 @@ tests!(f32 {
 });
 
 tests!(f64 {
-    to_i8  { -1.0, -128; 0.0, 0; }
-    to_i16 { -1.0, -32_768; 0.0, 0; }
-    to_i24 { -1.0, -8_388_608; 0.0, 0; }
-    to_i32 { -1.0, -2_147_483_648; 0.0, 0; }
-    to_i48 { -1.0, -140_737_488_355_328; 0.0, 0; }
-    to_i64 { -1.0, -9_223_372_036_854_775_808; 0.0, 0; }
+    to_i8  { -1.0, -128; 0.0, 0; 0.1, 13; 0.007, 1; -0.004, -1; 0.003, 0; }
+    to_i16 { -1.0, -32_768; 0.0, 0; 0.1, 3277; 0.00002, 1; -0.00002, -1; 0.00001, 0; }
+    to_i24 { -1.0, -8_388_608; 0.0, 0; 0.1, 838861; 0.0000001, 1; -0.0000001, -1; 0.00000005, 0; }
+    to_i32 { -1.0, -2_147_483_648; 0.0, 0; 0.1, 214748365; 0.0000000004, 1; -0.0000000004, -1; 0.0000000002, 0; }
+    to_i48 { -1.0, -140_737_488_355_328; 0.0, 0; 0.1, 14073748835533; 0.000000000000006, 1; -0.000000000000006, -1; 0.000000000000003, 0; }
+    to_i64 { -1.0, -9_223_372_036_854_775_808; 0.0, 0; 0.1, 922337203685477632; }
     to_u8  { -1.0, 0; 0.0, 128; }
     to_u16 { -1.0, 0; 0.0, 32_768; }
     to_u24 { -1.0, 0; 0.0, 8_388_608; }