Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
6 changes: 3 additions & 3 deletions algorithms/linfa-tsne/Cargo.toml
Original file line number Diff line number Diff line change
Expand Up @@ -16,14 +16,14 @@ categories = ["algorithms", "mathematics", "science"]
[dependencies]
thiserror = "2.0"
ndarray = { version = "0.16" }
ndarray-rand = "0.15"
bhtsne = "0.4.0"
pdqselect = "=0.1.1"
bhtsne = { version = "0.5.4", default-features = false }

linfa = { version = "0.8.1", path = "../.." }
linfa-nn = { version = "0.8.1", path = "../linfa-nn" }

[dev-dependencies]
rand = "0.8"
ndarray-rand = "0.15"
approx = "0.5"

linfa-datasets = { version = "0.8.1", path = "../../datasets", features = [
Expand Down
34 changes: 17 additions & 17 deletions algorithms/linfa-tsne/src/hyperparams.rs
Original file line number Diff line number Diff line change
@@ -1,5 +1,4 @@
use linfa::{Float, ParamGuard};
use ndarray_rand::rand::{rngs::SmallRng, Rng, SeedableRng};

use crate::TSneError;

Expand Down Expand Up @@ -32,16 +31,16 @@ use crate::TSneError;
///
/// A verified hyper-parameter set ready for prediction
#[derive(Debug, Clone, PartialEq)]
// NOTE(review): this text interleaves both sides of a diff. The `<F, R>` header and the
// `rng: R` field look like the removed side, `<F, D>` and `metric: D` like the added
// side — confirm against the actual file.
pub struct TSneValidParams<F, R> {
pub struct TSneValidParams<F, D> {
/// Number of dimensions of the produced embedding.
embedding_size: usize,
/// Approximation threshold of the Barnes Hut algorithm; `transform` takes its exact
/// branch when this is `<= 0`.
approx_threshold: F,
/// Perplexity handed to the t-SNE solver.
perplexity: F,
/// Number of epochs the solver is asked to run.
max_iter: usize,
/// Epoch used for both `stop_lying_epoch` and `momentum_switch_epoch`. When `None`,
/// `transform` appears to fall back to `min(max_iter / 2, 250)`.
preliminary_iter: Option<usize>,
rng: R,
/// Distance metric evaluated between pairs of samples.
metric: D,
}

impl<F: Float, R> TSneValidParams<F, R> {
impl<F: Float, D> TSneValidParams<F, D> {
pub fn embedding_size(&self) -> usize {
self.embedding_size
}
Expand All @@ -62,45 +61,46 @@ impl<F: Float, R> TSneValidParams<F, R> {
&self.preliminary_iter
}

pub fn rng(&self) -> &R {
&self.rng
pub fn metric(&self) -> &D {
&self.metric
}
}

/// Unvalidated t-SNE hyper-parameters: a newtype wrapping [`TSneValidParams`].
///
/// The `ParamGuard` implementation for this type names `TSneValidParams` as its
/// `Checked` form.
#[derive(Debug, Clone, PartialEq)]
pub struct TSneParams<F, R>(TSneValidParams<F, R>);
pub struct TSneParams<F, D>(TSneValidParams<F, D>);

impl<F: Float> TSneParams<F, SmallRng> {
impl<F: Float> TSneParams<F, linfa_nn::distance::L2Dist> {
/// Create a t-SNE param set with given embedding size
///
/// Convenience constructor: forwards to `embedding_size_with_metric` with the
/// `L2Dist` metric.
///
/// # Defaults to:
/// * `approx_threshold`: 0.5
/// * `perplexity`: 5.0
/// * `max_iter`: 2000
/// * `rng`: SmallRng with seed 42
/// * `preliminary_iter`: `None`
/// * `metric`: `linfa_nn::distance::L2Dist`
pub fn embedding_size(embedding_size: usize) -> TSneParams<F, SmallRng> {
Self::embedding_size_with_rng(embedding_size, SmallRng::seed_from_u64(42))
pub fn embedding_size(embedding_size: usize) -> TSneParams<F, linfa_nn::distance::L2Dist> {
Self::embedding_size_with_metric(embedding_size, linfa_nn::distance::L2Dist)
}
}

impl<F: Float, R: Rng + Clone> TSneParams<F, R> {
/// Create a t-SNE param set with given embedding size and random number generator
impl<F: Float, D: linfa_nn::distance::Distance<F>> TSneParams<F, D> {
/// Create a t-SNE param set with given embedding size and distance metric
///
/// `metric` may be any type implementing `linfa_nn::distance::Distance<F>`. The
/// returned value is the unvalidated wrapper; no parameter checks run here.
///
/// # Defaults to:
/// * `approx_threshold`: 0.5
/// * `perplexity`: 5.0
/// * `max_iter`: 2000
/// * `preliminary_iter`: `None`
pub fn embedding_size_with_rng(embedding_size: usize, rng: R) -> TSneParams<F, R> {
pub fn embedding_size_with_metric(embedding_size: usize, metric: D) -> Self {
Self(TSneValidParams {
embedding_size,
rng,
// `F::cast` converts the literal defaults into the caller's float type.
approx_threshold: F::cast(0.5),
perplexity: F::cast(5.0),
max_iter: 2000,
preliminary_iter: None,
metric,
})
}
}

impl<F: Float, D> TSneParams<F, D> {
/// Set the approximation threshold of the Barnes Hut algorithm
///
/// The threshold decides whether a cluster centroid can be used as a summary for the whole
Expand Down Expand Up @@ -139,8 +139,8 @@ impl<F: Float, R: Rng + Clone> TSneParams<F, R> {
}
}

impl<F: Float, R> ParamGuard for TSneParams<F, R> {
type Checked = TSneValidParams<F, R>;
impl<F: Float, D> ParamGuard for TSneParams<F, D> {
type Checked = TSneValidParams<F, D>;
type Error = TSneError;

/// Validates parameters
Expand Down
96 changes: 58 additions & 38 deletions algorithms/linfa-tsne/src/lib.rs
Original file line number Diff line number Diff line change
@@ -1,8 +1,8 @@
#![doc = include_str!("../README.md")]
use std::convert::TryFrom;

use ndarray::Array2;
use ndarray_rand::rand::Rng;
use ndarray_rand::rand_distr::Normal;
use linfa_nn::distance::Distance;
use ndarray::{Array2, ArrayView1};

use linfa::{dataset::DatasetBase, traits::Transformer, Float, ParamGuard};

Expand All @@ -12,15 +12,19 @@ mod hyperparams;
pub use error::{Result, TSneError};
pub use hyperparams::{TSneParams, TSneValidParams};

impl<F: Float, R: Rng + Clone> Transformer<Array2<F>, Result<Array2<F>>> for TSneValidParams<F, R> {
fn transform(&self, mut data: Array2<F>) -> Result<Array2<F>> {
impl<F: Float, D: Distance<F>> Transformer<Array2<F>, Result<Array2<F>>> for TSneValidParams<F, D> {
fn transform(&self, data: Array2<F>) -> Result<Array2<F>> {
let (nfeatures, nsamples) = (data.ncols(), data.nrows());

// validate parameter-data constraints
if self.embedding_size() > nfeatures {
return Err(TSneError::EmbeddingSizeTooLarge);
}

let Ok(embedding_size) = u8::try_from(self.embedding_size()) else {
return Err(TSneError::EmbeddingSizeTooLarge);
};

if F::cast(nsamples - 1) < F::cast(3) * self.perplexity() {
return Err(TSneError::PerplexityTooLarge);
}
Expand All @@ -31,43 +35,47 @@ impl<F: Float, R: Rng + Clone> Transformer<Array2<F>, Result<Array2<F>>> for TSn
None => usize::min(self.max_iter() / 2, 250),
};

let data = data.as_slice_mut().unwrap();

let mut rng = self.rng().clone();
let normal = Normal::new(0.0, 1e-4 * 10e-4).unwrap();

let mut embedding: Vec<F> = (0..nsamples * self.embedding_size())
.map(|_| rng.sample(normal))
.map(F::cast)
.collect();

bhtsne::run(
data,
nsamples,
nfeatures,
&mut embedding,
self.embedding_size(),
self.perplexity(),
self.approx_threshold(),
true,
self.max_iter() as u64,
preliminary_iter as u64,
preliminary_iter as u64,
);
let data: Vec<_> = data.as_slice().unwrap().chunks(nfeatures).collect();

let mut tsne = bhtsne::tSNE::new(&data);
let tsne = tsne
.embedding_dim(embedding_size)
.perplexity(self.perplexity())
.epochs(self.max_iter())
.stop_lying_epoch(preliminary_iter)
.momentum_switch_epoch(preliminary_iter);

let tsne = if self.approx_threshold() <= F::zero() {
// compute exact t-SNE
tsne.exact(|a, b| {
let a = ArrayView1::from(a);
let b = ArrayView1::from(b);
self.metric().distance(a, b)
})
} else {
// compute barnes-hut t-SNE
tsne.barnes_hut(self.approx_threshold(), |a, b| {
let a = ArrayView1::from(a);
let b = ArrayView1::from(b);
self.metric().distance(a, b)
})
};

let embedding = tsne.embedding();

Array2::from_shape_vec((nsamples, self.embedding_size()), embedding).map_err(|e| e.into())
}
}

impl<F: Float, R: Rng + Clone> Transformer<Array2<F>, Result<Array2<F>>> for TSneParams<F, R> {
impl<F: Float, D: Distance<F>> Transformer<Array2<F>, Result<Array2<F>>> for TSneParams<F, D> {
/// Check the parameters, then embed `x` with the checked parameter set.
///
/// Returns the embedding produced by the checked parameters' `transform`, or the
/// error reported by `check_ref` if the parameters are rejected.
fn transform(&self, x: Array2<F>) -> Result<Array2<F>> {
// `?` returns early with the `check_ref` error, so `transform` only ever runs on
// parameters that passed the check.
self.check_ref()?.transform(x)
}
}

impl<T, F: Float, R: Rng + Clone>
impl<T, F: Float, D: Distance<F>>
Transformer<DatasetBase<Array2<F>, T>, Result<DatasetBase<Array2<F>, T>>>
for TSneValidParams<F, R>
for TSneValidParams<F, D>
{
fn transform(&self, ds: DatasetBase<Array2<F>, T>) -> Result<DatasetBase<Array2<F>, T>> {
let DatasetBase {
Expand All @@ -82,8 +90,8 @@ impl<T, F: Float, R: Rng + Clone>
}
}

impl<T, F: Float, R: Rng + Clone>
Transformer<DatasetBase<Array2<F>, T>, Result<DatasetBase<Array2<F>, T>>> for TSneParams<F, R>
impl<T, F: Float, D: Distance<F>>
Transformer<DatasetBase<Array2<F>, T>, Result<DatasetBase<Array2<F>, T>>> for TSneParams<F, D>
{
fn transform(&self, ds: DatasetBase<Array2<F>, T>) -> Result<DatasetBase<Array2<F>, T>> {
self.check_ref()?.transform(ds)
Expand All @@ -103,17 +111,16 @@ mod tests {
// Compile-time check that the public parameter and error types are
// `Send + Sync + Sized + Unpin`; the test body does nothing at runtime.
#[test]
fn autotraits() {
// Calling this with a type parameter only compiles if the type meets every bound.
fn has_autotraits<T: Send + Sync + Sized + Unpin>() {}
has_autotraits::<TSneParams<f64, rand::distributions::Uniform<f64>>>();
has_autotraits::<TSneValidParams<f64, rand::distributions::Uniform<f64>>>();
has_autotraits::<TSneParams<f64, linfa_nn::distance::L2Dist>>();
has_autotraits::<TSneValidParams<f64, linfa_nn::distance::L2Dist>>();
has_autotraits::<TSneError>();
}

#[test]
fn iris_separate() -> Result<()> {
let ds = linfa_datasets::iris();
let rng = SmallRng::seed_from_u64(42);

let ds = TSneParams::embedding_size_with_rng(2, rng)
let ds = TSneParams::embedding_size(2)
.perplexity(10.0)
.approx_threshold(0.0)
.transform(ds)?;
Expand All @@ -123,6 +130,19 @@ mod tests {
Ok(())
}

// Embeds the iris dataset into two dimensions and checks cluster separation.
// No `approx_threshold` is set, so the constructor default (0.5) applies and `transform`
// takes its Barnes-Hut branch rather than the exact one.
// NOTE(review): `bharnes` in the test name looks like a typo for `barnes`.
#[test]
fn iris_separate_bharnes_hut() -> Result<()> {
let ds = linfa_datasets::iris();

let ds = TSneParams::embedding_size(2)
.perplexity(10.0)
.transform(ds)?;

// The embedding must keep the classes apart: silhouette score above 0.5.
assert!(ds.silhouette_score()? > 0.5);

Ok(())
}

#[test]
fn blob_separate() -> Result<()> {
let mut rng = SmallRng::seed_from_u64(42);
Expand All @@ -137,7 +157,7 @@ mod tests {
let targets = (0..200).map(|x| x < 100).collect::<Array1<_>>();
let dataset = Dataset::new(entries, targets);

let ds = TSneParams::embedding_size_with_rng(2, rng)
let ds = TSneParams::embedding_size(2)
.perplexity(60.0)
.approx_threshold(0.0)
.transform(dataset)?;
Expand Down
Loading