import math
import matplotlib.pyplot as plt
import torch
from IPython.display import HTML
from matplotlib.animation import FuncAnimation
from torch import tensor
from torch.distributions.multivariate_normal import MultivariateNormal
Clustering
Adapted from:
42); torch.manual_seed(
Norms
- M.S.E. \[ \Delta x^2 + \Delta y^2 = ||v||_2 = \sqrt{|v|} = \sqrt{|v|_2} \]
- M.A.E. \[ |\Delta x | + |\Delta y | \]
- R.M.S.E. \[ \sqrt{\frac{1}{n}\Sigma_{i=0}^n (x_i - \hat{x}_i)^2} = ||v||^2_2 = |v| = |v|_2 \]
Create synthetic data
= 4
n_clusters = 256
n_samples
def sample(mean):
= torch.diag(tensor([5.0, 5.0], dtype=torch.float))
covariance return MultivariateNormal(mean, covariance).sample((n_samples,))
= torch.rand(n_clusters, 2) * 70 - 35
centroids = [sample(mean) for mean in centroids]
data_by_centroid = torch.cat(data_by_centroid)
data
= plt.subplots(figsize=(4, 4))
fig, ax for subset in data_by_centroid:
*subset.T, alpha=0.4)
ax.scatter(*centroids.T, marker="x", color="black") ax.scatter(
Mean Shift
Neat algorithm that has some unique advantages compared to K-means, because it does not require setting the number of clusters in advance. It does, however, require a “bandwidth” parameter and that the clusters have a Gaussian falloff.
The algorithm is as follow:
- For each point, compute the distance between that point and all other points
- Reassign each point as the weighted average for each distance with a Gaussian kernel where the variance is the “bandwidth” hyperparameter
- Repeat until convergence
The bandwidth should be about a third of the data, so in this case its about 25.
= 2.5
bandwidth
def gaussian(x, mu, sig):
return (
1.0
/ (math.sqrt(2.0 * torch.pi) * sig)
* torch.exp(-torch.pow((x - mu) / sig, 2.0) / 2)
)
def gkernel(x):
return gaussian(x, mu=0, sig=bandwidth)
= torch.linspace(0, 10, 100)
x = gkernel(x)
y
= plt.subplots()
fig, ax
ax.plot(x, y)set(ylabel="$P.D.F.(X)$", xlabel="$X$") ax.
# Manipulate a copy of the data
= data.clone()
X
*_ = X
x, - X x
tensor([[ 0.0000, 0.0000],
[ 4.5407, 1.0199],
[ 4.5527, 1.6327],
...,
[47.1212, 9.3284],
[50.7346, 6.5993],
[44.9273, 8.0535]])
= ((x - X) ** 2).sum(axis=1).sqrt()
e e
tensor([ 0.0000, 4.6538, 4.8366, ..., 48.0356, 51.1620, 45.6434])
= gkernel(e)
weight weight
tensor([0.1596, 0.0282, 0.0246, ..., 0.0000, 0.0000, 0.0000])
Now, we have the weights and the data, but we need to combine them.
weight.shape, X.shape
(torch.Size([1024]), torch.Size([1024, 2]))
# This doesn't work
try:
= weight * X
weighted_rows except RuntimeError as e:
print(e)
The size of tensor a (1024) must match the size of tensor b (2) at non-singleton dimension 1
# This does, such that the weight is copied to both target
# dimensions by broadcasting
= weight[:, None] * X
weighted_rows weighted_rows
tensor([[4.9080, 4.4652],
[0.7397, 0.7607],
[0.6435, 0.6471],
...,
[-0.0000, 0.0000],
[-0.0000, 0.0000],
[-0.0000, 0.0000]])
# We need to take a weighted sum, but notice that the weights
# add up to 1200%. We need to normalize.
sum() weight.
tensor(12.1010)
sum(axis=0) / weight.sum() weighted_rows.
tensor([28.6881, 28.6938])
= data.clone()
X
def iteration(X):
for i, x in enumerate(X):
= x - X
delta = torch.einsum("ij,ij->i", delta, delta).sqrt()
dist = gkernel(dist)
weight = weight[:, None] * X
weighted_rows = weighted_rows.sum(axis=0) / weight.sum()
X[i, :]
for _ in range(5):
iteration(X)
*X.T)
plt.scatter(*centroids.T, marker="x") plt.scatter(
= data.clone()
X
def do_one(d):
if d:
iteration(X)
ax.clear()*X.T)
ax.scatter(*centroids.T, marker="x")
ax.scatter(
= plt.subplots()
fig, ax = FuncAnimation(fig, do_one, frames=5, interval=500, repeat=True)
anim
plt.close()
HTML(anim.to_jshtml())
How do we GPU accelerate this?
The issue is this loop:
for i, x in enumerate(X):
This is called the “kernel launching overhead”. Let’s try to batch this operation
= 5 bs
= data.clone()
X = X[:bs]
miniX miniX
tensor([[30.7565, 27.9813],
[26.2158, 26.9614],
[26.2037, 26.3486],
[25.6908, 30.2133],
[23.2542, 28.5149]])
Now, we need to compute the matrix of distances for each row, which is a \(m \times n\) matrix, where \(m\) is the bs
(batch size) and \(n\) is the dataset size.
Now, we can’t just subtract these off because of the mismatched first dimension.
try:
- miniX
X except RuntimeError as e:
print(e)
The size of tensor a (1024) must match the size of tensor b (5) at non-singleton dimension 0
Let’s try adding some unit dimensions
None, :].shape, miniX[:, None].shape X[
(torch.Size([1, 1024, 2]), torch.Size([5, 1, 2]))
These dimensions are broadcast compatible because they will result in a copy of the non-batch dimension across the new-unit dimension. Let’s see what happens when we do the subtraction now.
= X[None, :] - miniX[:, None]
delta delta.shape
torch.Size([5, 1024, 2])
Now, we can index into the \(row_i\) and \(row_j\) difference as desired
0, 0, :], delta[0, 1, :] delta[
(tensor([0., 0.]), tensor([-4.5407, -1.0199]))
Now, we just need to collapse this into a \(bs \times n\) matrix of distances
= (delta**2).sum(axis=2).sqrt()
dist dist
tensor([[ 0.0000, 4.6538, 4.8366, ..., 48.0356, 51.1620, 45.6434],
[ 4.6538, 0.0000, 0.6129, ..., 43.3835, 46.5297, 40.9945],
[ 4.8366, 0.6129, 0.0000, ..., 43.2585, 46.4482, 40.8819],
[ 5.5356, 3.2940, 3.8986, ..., 43.6154, 46.5149, 41.1672],
[ 7.5212, 3.3443, 3.6595, ..., 40.8279, 43.8168, 38.3975]])
Alternately, as an einsum
:
"ijk,ijk->ij", delta, delta).sqrt() torch.einsum(
tensor([[ 0.0000, 4.6538, 4.8366, ..., 48.0356, 51.1620, 45.6434],
[ 4.6538, 0.0000, 0.6129, ..., 43.3835, 46.5297, 40.9945],
[ 4.8366, 0.6129, 0.0000, ..., 43.2585, 46.4482, 40.8819],
[ 5.5356, 3.2940, 3.8986, ..., 43.6154, 46.5149, 41.1672],
[ 7.5212, 3.3443, 3.6595, ..., 40.8279, 43.8168, 38.3975]])
= gkernel(dist)
weights weights
tensor([[0.1596, 0.0282, 0.0246, ..., 0.0000, 0.0000, 0.0000],
[0.0282, 0.1596, 0.1549, ..., 0.0000, 0.0000, 0.0000],
[0.0246, 0.1549, 0.1596, ..., 0.0000, 0.0000, 0.0000],
[0.0138, 0.0670, 0.0473, ..., 0.0000, 0.0000, 0.0000],
[0.0017, 0.0652, 0.0547, ..., 0.0000, 0.0000, 0.0000]])
weights.shape, X.shape
(torch.Size([5, 1024]), torch.Size([1024, 2]))
= 1 / weights.sum(axis=1)
norm_constant # Note that we need to broadcast the normalization over the X and Y dimensions
= (weights @ X) * norm_constant[:, None]
reweighted reweighted
tensor([[28.6881, 28.6938],
[26.6096, 28.3607],
[26.5901, 28.1598],
[26.3962, 29.4940],
[25.2566, 28.8559]])
def mean_shift_batch(data, bs=512, iterations=5):
= data.clone()
X = len(X)
n for _ in range(iterations):
for i in range(0, n, bs):
= slice(i, min((i + bs, n)))
batch_idxs = X[batch_idxs]
batch = X[None, :] - batch[:, None]
delta = gkernel(torch.einsum("ijk,ijk->ij", delta, delta).sqrt())
weight = 1 / weight.sum(axis=1)
norm = (weight @ X) * norm[:, None]
X[batch_idxs] return X
= (
DEVICE "mps"
if torch.backends.mps.is_available()
else "cuda"
if torch.cuda.is_available()
else "cpu"
)
= data.to(DEVICE) data_acc
mean_shift_batch(data_acc)
tensor([[ 26.9455, 29.1479],
[ 26.9455, 29.1479],
[ 26.9455, 29.1479],
...,
[-16.9937, 20.3887],
[-16.9937, 20.3887],
[-16.9937, 20.3887]], device='mps:0')
Homework
Implement DBSCAN
= data.clone() X
= X.shape n, _
= 2.5
eps = 25 min_points
= torch.pow(X[None, :] - X[:, None], 2).sum(axis=2).sqrt()
norms norms
tensor([[ 0.0000, 4.6538, 4.8366, ..., 48.0356, 51.1620, 45.6434],
[ 4.6538, 0.0000, 0.6129, ..., 43.3835, 46.5297, 40.9945],
[ 4.8366, 0.6129, 0.0000, ..., 43.2585, 46.4482, 40.8819],
...,
[48.0356, 43.3835, 43.2585, ..., 0.0000, 4.5282, 2.5374],
[51.1620, 46.5297, 46.4482, ..., 4.5282, 0.0000, 5.9866],
[45.6434, 40.9945, 40.8819, ..., 2.5374, 5.9866, 0.0000]])
= torch.zeros(n)
cluster_assignment cluster_assignment
tensor([0., 0., 0., ..., 0., 0., 0.])
from functools import lru_cache
from typing import Optional
class DontReassignCluster(Exception):
...
@lru_cache
def core_point_neighbords(point_idx) -> Optional:
= norms[point_idx, :]
dists = dists < eps
neighboring_point_idxs if neighboring_point_idxs.sum() > min_points:
return neighboring_point_idxs
def assign_cluster(cluster_id, point_idx):
if cluster_assignment[point_idx]:
raise DontReassignCluster
# First, assign point to cluster
= cluster_id
cluster_assignment[point_idx]
# Then, if it is a core point, recursively assign the neighbors
= norms[point_idx, :]
dists = core_point_neighbords(point_idx)
neighboring_point_idxs if neighboring_point_idxs is not None:
for i, is_neighbor in enumerate(neighboring_point_idxs):
if is_neighbor:
try:
assign_cluster(cluster_id, i)except DontReassignCluster:
continue
= 1
current_cluster_id
for point_idx in range(n):
if core_point_neighbords(point_idx) is not None:
try:
assign_cluster(current_cluster_id, point_idx)except DontReassignCluster:
continue
else:
+= 1 current_cluster_id
= plt.subplots(figsize=(4, 4))
fig, ax for cluster in cluster_assignment.unique():
ax.scatter(*X[cluster_assignment == cluster, :].T,
=f"Cluster {int(cluster)}" if cluster else "Noise",
label
); ax.legend()