Add t stochastic neighbour embedding using Iris dataset (#13476)
* Added t-SNE with Iris dataset example * Added t-SNE with Iris dataset example * [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci * Updated with descriptive variables * Add descriptive variable names * [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci * Add Descriptive Variable names * Adding Descriptive variable names * Update machine_learning/t_stochastic_neighbour_embedding.py Co-authored-by: Christian Clauss <cclauss@me.com> * Update machine_learning/t_stochastic_neighbour_embedding.py Co-authored-by: Christian Clauss <cclauss@me.com> * Improved line formatting * Adding URL for t-SNE Wikipedia * Apply suggestion from @cclauss --------- Co-authored-by: pre-commit-ci[bot] <66853113+pre-commit-ci[bot]@users.noreply.github.com> Co-authored-by: Christian Clauss <cclauss@me.com>
This commit is contained in:
178
machine_learning/t_stochastic_neighbour_embedding.py
Normal file
178
machine_learning/t_stochastic_neighbour_embedding.py
Normal file
@@ -0,0 +1,178 @@
|
||||
"""
|
||||
t-distributed stochastic neighbor embedding (t-SNE)
|
||||
|
||||
For more details, see:
|
||||
https://en.wikipedia.org/wiki/T-distributed_stochastic_neighbor_embedding
|
||||
"""
|
||||
|
||||
import doctest
|
||||
|
||||
import numpy as np
|
||||
from numpy import ndarray
|
||||
from sklearn.datasets import load_iris
|
||||
|
||||
|
||||
def collect_dataset() -> tuple[ndarray, ndarray]:
|
||||
"""
|
||||
Load the Iris dataset and return features and labels.
|
||||
|
||||
Returns:
|
||||
tuple[ndarray, ndarray]: Feature matrix and target labels.
|
||||
|
||||
>>> features, targets = collect_dataset()
|
||||
>>> features.shape
|
||||
(150, 4)
|
||||
>>> targets.shape
|
||||
(150,)
|
||||
"""
|
||||
iris_dataset = load_iris()
|
||||
return np.array(iris_dataset.data), np.array(iris_dataset.target)
|
||||
|
||||
|
||||
def compute_pairwise_affinities(data_matrix: ndarray, sigma: float = 1.0) -> ndarray:
|
||||
"""
|
||||
Compute high-dimensional affinities (P matrix) using a Gaussian kernel.
|
||||
|
||||
Args:
|
||||
data_matrix: Input data of shape (n_samples, n_features).
|
||||
sigma: Gaussian kernel bandwidth.
|
||||
|
||||
Returns:
|
||||
ndarray: Symmetrized probability matrix.
|
||||
|
||||
>>> x = np.array([[0.0, 0.0], [1.0, 0.0]])
|
||||
>>> probabilities = compute_pairwise_affinities(x)
|
||||
>>> float(round(probabilities[0, 1], 3))
|
||||
0.25
|
||||
"""
|
||||
n_samples = data_matrix.shape[0]
|
||||
squared_sum = np.sum(np.square(data_matrix), axis=1)
|
||||
squared_distance = np.add(
|
||||
np.add(-2 * np.dot(data_matrix, data_matrix.T), squared_sum).T, squared_sum
|
||||
)
|
||||
|
||||
affinity_matrix = np.exp(-squared_distance / (2 * sigma**2))
|
||||
np.fill_diagonal(affinity_matrix, 0)
|
||||
|
||||
affinity_matrix /= np.sum(affinity_matrix)
|
||||
return (affinity_matrix + affinity_matrix.T) / (2 * n_samples)
|
||||
|
||||
|
||||
def compute_low_dim_affinities(embedding_matrix: ndarray) -> tuple[ndarray, ndarray]:
|
||||
"""
|
||||
Compute low-dimensional affinities (Q matrix) using a Student-t distribution.
|
||||
|
||||
Args:
|
||||
embedding_matrix: Low-dimensional embedding of shape (n_samples, n_components).
|
||||
|
||||
Returns:
|
||||
tuple[ndarray, ndarray]: (Q probability matrix, numerator matrix).
|
||||
|
||||
>>> y = np.array([[0.0, 0.0], [1.0, 0.0]])
|
||||
>>> q_matrix, numerators = compute_low_dim_affinities(y)
|
||||
>>> q_matrix.shape
|
||||
(2, 2)
|
||||
"""
|
||||
squared_sum = np.sum(np.square(embedding_matrix), axis=1)
|
||||
numerator_matrix = 1 / (
|
||||
1
|
||||
+ np.add(
|
||||
np.add(-2 * np.dot(embedding_matrix, embedding_matrix.T), squared_sum).T,
|
||||
squared_sum,
|
||||
)
|
||||
)
|
||||
np.fill_diagonal(numerator_matrix, 0)
|
||||
|
||||
q_matrix = numerator_matrix / np.sum(numerator_matrix)
|
||||
return q_matrix, numerator_matrix
|
||||
|
||||
|
||||
def apply_tsne(
|
||||
data_matrix: ndarray,
|
||||
n_components: int = 2,
|
||||
learning_rate: float = 200.0,
|
||||
n_iter: int = 500,
|
||||
) -> ndarray:
|
||||
"""
|
||||
Apply t-SNE for dimensionality reduction.
|
||||
|
||||
Args:
|
||||
data_matrix: Original dataset (features).
|
||||
n_components: Target dimension (2D or 3D).
|
||||
learning_rate: Step size for gradient descent.
|
||||
n_iter: Number of iterations.
|
||||
|
||||
Returns:
|
||||
ndarray: Low-dimensional embedding of the data.
|
||||
|
||||
>>> features, _ = collect_dataset()
|
||||
>>> embedding = apply_tsne(features, n_components=2, n_iter=50)
|
||||
>>> embedding.shape
|
||||
(150, 2)
|
||||
"""
|
||||
if n_components < 1 or n_iter < 1:
|
||||
raise ValueError("n_components and n_iter must be >= 1")
|
||||
|
||||
n_samples = data_matrix.shape[0]
|
||||
rng = np.random.default_rng()
|
||||
embedding = rng.standard_normal((n_samples, n_components)) * 1e-4
|
||||
|
||||
high_dim_affinities = compute_pairwise_affinities(data_matrix)
|
||||
high_dim_affinities = np.maximum(high_dim_affinities, 1e-12)
|
||||
|
||||
embedding_increment = np.zeros_like(embedding)
|
||||
momentum = 0.5
|
||||
|
||||
for iteration in range(n_iter):
|
||||
low_dim_affinities, numerator_matrix = compute_low_dim_affinities(embedding)
|
||||
low_dim_affinities = np.maximum(low_dim_affinities, 1e-12)
|
||||
|
||||
affinity_diff = high_dim_affinities - low_dim_affinities
|
||||
|
||||
gradient = 4 * (
|
||||
np.dot((affinity_diff * numerator_matrix), embedding)
|
||||
- np.multiply(
|
||||
np.sum(affinity_diff * numerator_matrix, axis=1)[:, np.newaxis],
|
||||
embedding,
|
||||
)
|
||||
)
|
||||
|
||||
embedding_increment = momentum * embedding_increment - learning_rate * gradient
|
||||
embedding += embedding_increment
|
||||
|
||||
if iteration == int(n_iter / 4):
|
||||
momentum = 0.8
|
||||
|
||||
return embedding
|
||||
|
||||
|
||||
def main() -> None:
|
||||
"""
|
||||
Run t-SNE on the Iris dataset and display the first 5 embeddings.
|
||||
|
||||
>>> main() # doctest: +ELLIPSIS
|
||||
t-SNE embedding (first 5 points):
|
||||
[[...
|
||||
"""
|
||||
features, _labels = collect_dataset()
|
||||
embedding = apply_tsne(features, n_components=2, n_iter=300)
|
||||
|
||||
if not isinstance(embedding, np.ndarray):
|
||||
raise TypeError("t-SNE embedding must be an ndarray")
|
||||
|
||||
print("t-SNE embedding (first 5 points):")
|
||||
print(embedding[:5])
|
||||
|
||||
# Optional visualization (Ruff/mypy compliant)
|
||||
|
||||
# import matplotlib.pyplot as plt
|
||||
# plt.scatter(embedding[:, 0], embedding[:, 1], c=labels, cmap="viridis")
|
||||
# plt.title("t-SNE Visualization of the Iris Dataset")
|
||||
# plt.xlabel("Dimension 1")
|
||||
# plt.ylabel("Dimension 2")
|
||||
# plt.show()
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
doctest.testmod()
|
||||
main()
|
||||
Reference in New Issue
Block a user