Merge pull request #178 from pavlin-policar/mnt
Maintenance
pavlin-policar authored Apr 23, 2021
2 parents 43507dd + 0c6e960 commit 1e6682c
Showing 7 changed files with 149 additions and 259 deletions.
88 changes: 52 additions & 36 deletions examples/01_simple_usage.ipynb

Large diffs are not rendered by default.

107 changes: 58 additions & 49 deletions examples/02_advanced_usage.ipynb

Large diffs are not rendered by default.

36 changes: 0 additions & 36 deletions openTSNE/callbacks.py
@@ -39,42 +39,6 @@ def __call__(self, iteration, error, embedding):
"""


class ErrorLogger(Callback):
"""Basic error logger.
This logger prints out basic information about the optimization. These
include the iteration number, error and how much time has elapsed from the
previous callback invocation.
"""

def __init__(self):
warnings.warn(
"`ErrorLogger` will be removed in upcoming version. Please use the "
"`verbose` flag instead.",
category=FutureWarning,
)
self.iter_count = 0
self.last_log_time = None

def optimization_about_to_start(self):
self.last_log_time = time.time()
self.iter_count = 0

def __call__(self, iteration, error, embedding):
now = time.time()
duration = now - self.last_log_time
self.last_log_time = now

n_iters = iteration - self.iter_count
self.iter_count = iteration

print(
"Iteration % 4d, KL divergence % 6.4f, %d iterations in %.4f sec"
% (iteration, error, n_iters, duration)
)


class VerifyExaggerationError(Callback):
"""Used to verify that the exaggeration correction implemented in
`gradient_descent` is correct."""
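
With `ErrorLogger` removed, per-iteration progress reporting goes through the `verbose` flag, as the deprecation warning above already suggested. A minimal sketch of the replacement (data and parameter values are illustrative):

# Before (deprecated): TSNE(callbacks=ErrorLogger(), callbacks_every_iters=50)
# After: let the library report progress itself via `verbose`.
from openTSNE import TSNE

tsne = TSNE(verbose=True)
# embedding = tsne.fit(X)  # X: an (n_samples, n_features) array of your data
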
10 changes: 6 additions & 4 deletions openTSNE/initialization.py
@@ -28,13 +28,13 @@ def rescale(x, inplace=False):
return x


def random(X, n_components=2, random_state=None, verbose=False):
def random(n_samples, n_components=2, random_state=None, verbose=False):
"""Initialize an embedding using samples from an isotropic Gaussian.
Parameters
----------
X: np.ndarray
The data matrix.
n_samples: Union[int, np.ndarray]
The number of samples. Also accepts a data matrix.
n_components: int
The dimension of the embedding space.
@@ -53,7 +53,9 @@ def random(X, n_components=2, random_state=None, verbose=False):
"""
random_state = check_random_state(random_state)
embedding = random_state.normal(0, 1e-4, (X.shape[0], n_components))
if isinstance(n_samples, np.ndarray):
n_samples = n_samples.shape[0]
embedding = random_state.normal(0, 1e-4, (n_samples, n_components))
return np.ascontiguousarray(embedding)


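
The `random` initializer now takes the number of samples directly while still accepting a data matrix for backwards compatibility. A minimal sketch of both call styles (array shapes are illustrative):

import numpy as np
from openTSNE import initialization

X = np.random.randn(500, 50)  # toy data matrix
emb_a = initialization.random(X.shape[0], n_components=2, random_state=42)  # pass n_samples directly
emb_b = initialization.random(X, n_components=2, random_state=42)           # a matrix still works
print(emb_a.shape, emb_b.shape)  # (500, 2) (500, 2)
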
106 changes: 1 addition & 105 deletions openTSNE/nearest_neighbors.py
@@ -108,7 +108,7 @@ class Sklearn(KNNIndex):
"sokalmichener",
"sokalsneath",
"wminkowski",
]
] + ["cosine"] # our own workaround implementation

def __init__(self, *args, **kwargs):
super().__init__(*args, **kwargs)
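
The added ``"cosine"`` entry means the `Sklearn` index accepts cosine even though scikit-learn's trees do not support it natively. A minimal sketch of building such an index, mirroring the constructor call used in the tests below (data shape and k are illustrative):

import numpy as np
from openTSNE import nearest_neighbors

X = np.random.randn(200, 30)
knn = nearest_neighbors.Sklearn(X, 15, "cosine")  # (data, k, metric), as in the tests
indices, distances = knn.build()                  # neighbors of the training points
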
@@ -205,110 +205,6 @@ def query(self, query, k):
return indices, distances


class BallTree(KNNIndex):
VALID_METRICS = neighbors.BallTree.valid_metrics + ["cosine"]

def __init__(self, *args, **kwargs):
super().__init__(*args, **kwargs)
self.__data = None

warnings.warn(
f"`nearest_neighbors.BallTree` has been superseeded by "
f"`nearest_neighbors.Sklearn` and will be removed from future versions",
category=FutureWarning,
)

def build(self):
data, k = self.data, self.k

timer = utils.Timer(
f"Finding {k} nearest neighbors using exact search using "
f"{self.metric} distance...",
verbose=self.verbose,
)
timer.__enter__()

if self.metric == "cosine":
# The nearest neighbor ranking for cosine distance is the same as
# for euclidean distance on normalized data
effective_metric = "euclidean"
effective_data = data.copy()
effective_data = (
effective_data / np.linalg.norm(effective_data, axis=1)[:, None]
)
# In order to properly compute cosine distances when querying the
# index, we need to store the original data
self.__data = data
else:
effective_metric = self.metric
effective_data = data

self.index = neighbors.NearestNeighbors(
algorithm="ball_tree",
metric=effective_metric,
metric_params=self.metric_params,
n_jobs=self.n_jobs,
)
self.index.fit(effective_data)

# Return the nearest neighbors in the training set
distances, indices = self.index.kneighbors(n_neighbors=k)

# If using cosine distance, the computed distances will be wrong and
# need to be recomputed
if self.metric == "cosine":
distances = np.vstack(
[
cdist(np.atleast_2d(x), data[idx], metric="cosine")
for x, idx in zip(data, indices)
]
)

timer.__exit__()

return indices, distances

def query(self, query, k):
timer = utils.Timer(
f"Finding {k} nearest neighbors in existing embedding using exact search...",
self.verbose,
)
timer.__enter__()

# The nearest neighbor ranking for cosine distance is the same as for
# euclidean distance on normalized data
if self.metric == "cosine":
effective_data = query.copy()
effective_data = (
effective_data / np.linalg.norm(effective_data, axis=1)[:, None]
)
else:
effective_data = query

distances, indices = self.index.kneighbors(effective_data, n_neighbors=k)

# If using cosine distance, the computed distances will be wrong and
# need to be recomputed
if self.metric == "cosine":
if self.__data is None:
raise RuntimeError(
"The original data was unavailable when querying cosine "
"distance. Did you change the distance metric after "
"building the index? Please rebuild the index using cosine "
"similarity."
)
distances = np.vstack(
[
cdist(np.atleast_2d(x), self.__data[idx], metric="cosine")
for x, idx in zip(query, indices)
]
)

timer.__exit__()

return indices, distances


class Annoy(KNNIndex):
"""Annoy KNN Index.
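
The deleted `BallTree` code documents the trick the remaining `Sklearn` index still relies on: ranking neighbors by cosine distance is equivalent to ranking by euclidean distance on L2-normalized rows, so only the final distances need to be recomputed with `cdist`. A small numpy sketch of that equivalence (illustrative data, not the library's internals verbatim):

import numpy as np
from scipy.spatial.distance import cdist

X = np.random.randn(100, 20)
Xn = X / np.linalg.norm(X, axis=1)[:, None]  # L2-normalize each row

cos_rank = np.argsort(cdist(X[:1], X, metric="cosine"), axis=1)
euc_rank = np.argsort(cdist(Xn[:1], Xn, metric="euclidean"), axis=1)
assert np.array_equal(cos_rank, euc_rank)    # identical neighbor ordering
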
49 changes: 28 additions & 21 deletions openTSNE/tsne.py
@@ -37,12 +37,23 @@ def _check_callbacks(callbacks):
def _handle_nice_params(embedding: np.ndarray, optim_params: dict) -> None:
"""Convert the user friendly params into something the optimizer can
understand."""
n_samples = embedding.shape[0]
# Handle callbacks
optim_params["callbacks"] = _check_callbacks(optim_params.get("callbacks"))
optim_params["use_callbacks"] = optim_params["callbacks"] is not None

# Handle negative gradient method
negative_gradient_method = optim_params.pop("negative_gradient_method")
# Handle `auto` negative gradient method
if isinstance(negative_gradient_method, str) and negative_gradient_method == "auto":
if n_samples < 10_000:
negative_gradient_method = "bh"
else:
negative_gradient_method = "fft"
log.info(
f"Automatically determined negative gradient method `{negative_gradient_method}`"
)

if callable(negative_gradient_method):
negative_gradient_method = negative_gradient_method
elif negative_gradient_method in {"bh", "BH", "barnes-hut"}:
@@ -78,7 +89,7 @@ def _handle_nice_params(embedding: np.ndarray, optim_params: dict) -> None:

# Determine learning rate if requested
if optim_params.get("learning_rate", "auto") == "auto":
optim_params["learning_rate"] = max(200, embedding.shape[0] / 12)
optim_params["learning_rate"] = max(200, n_samples / 12)


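The new ``auto`` setting shown above resolves to Barnes-Hut below 10,000 samples and to the FFT-based interpolation method otherwise, and the automatic learning rate now uses the same `n_samples` value. A minimal usage sketch (data sizes are illustrative):

from openTSNE import TSNE

tsne = TSNE(negative_gradient_method="auto")  # new default per this commit
# For ~5,000 rows this resolves to "bh"; for ~100,000 rows it resolves to "fft".
# embedding = tsne.fit(X)
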
def __check_init_num_samples(num_samples, required_num_samples):
Expand Down Expand Up @@ -169,7 +180,8 @@ class PartialTSNEEmbedding(np.ndarray):
using one of the following aliases: ``bh``, ``BH`` or ``barnes-hut``.
For larger data sets, the FFT accelerated interpolation method is more
appropriate and can be set using one of the following aliases: ``fft``,
``FFT`` or ``ìnterpolation``.
``FFT`` or ``ìnterpolation``. Alternatively, you can use ``auto`` to
approximately select the faster method.
theta: float
This is the trade-off parameter between speed and accuracy of the tree
@@ -290,6 +302,8 @@ def optimize(
``barnes-hut``. For larger data sets, the FFT accelerated
interpolation method is more appropriate and can be set using one of
the following aliases: ``fft``, ``FFT`` or ``ìnterpolation``.
Alternatively, you can use ``auto`` to approximately select the
faster method.
theta: float
This is the trade-off parameter between speed and accuracy of the
Expand Down Expand Up @@ -431,7 +445,8 @@ class TSNEEmbedding(np.ndarray):
using one of the following aliases: ``bh``, ``BH`` or ``barnes-hut``.
For larger data sets, the FFT accelerated interpolation method is more
appropriate and can be set using one of the following aliases: ``fft``,
``FFT`` or ``ìnterpolation``.
``FFT`` or ``ìnterpolation``. Alternatively, you can use ``auto`` to
approximately select the faster method.
theta: float
This is the trade-off parameter between speed and accuracy of the tree
@@ -490,7 +505,7 @@ def __new__(
n_interpolation_points=3,
min_num_intervals=50,
ints_in_interval=1,
negative_gradient_method="fft",
negative_gradient_method="auto",
random_state=None,
optimizer=None,
**gradient_descent_params,
@@ -571,6 +586,8 @@ def optimize(
``barnes-hut``. For larger data sets, the FFT accelerated
interpolation method is more appropriate and can be set using one of
the following aliases: ``fft``, ``FFT`` or ``ìnterpolation``.
Alternatively, you can use ``auto`` to approximately select the
faster method.
theta: float
This is the trade-off parameter between speed and accuracy of the
@@ -1000,7 +1017,8 @@ class TSNE(BaseEstimator):
This is the trade-off parameter between speed and accuracy of the tree
approximation method. Typical values range from 0.2 to 0.8. The value 0
indicates that no approximation is to be made and produces exact results
also producing longer runtime.
also producing longer runtime. Alternatively, you can use ``auto`` to
approximately select the faster method.
n_interpolation_points: int
Only used when ``negative_gradient_method="fft"`` or its other aliases.
Expand Down Expand Up @@ -1071,7 +1089,8 @@ class TSNE(BaseEstimator):
using one of the following aliases: ``bh``, ``BH`` or ``barnes-hut``.
For larger data sets, the FFT accelerated interpolation method is more
appropriate and can be set using one of the following aliases: ``fft``,
``FFT`` or ``ìnterpolation``.
``FFT`` or ``ìnterpolation``. Alternatively, you can use ``auto`` to
approximately select the faster method.
callbacks: Union[Callable, List[Callable]]
Callbacks, which will be run every ``callbacks_every_iters`` iterations.
@@ -1113,7 +1132,7 @@ def __init__(
max_step_norm=5,
n_jobs=1,
neighbors="auto",
negative_gradient_method="fft",
negative_gradient_method="auto",
callbacks=None,
callbacks_every_iters=50,
random_state=None,
@@ -1154,18 +1173,6 @@ def __init__(
self.random_state = random_state
self.verbose = verbose

@property
def neighbors_method(self):
import warnings

warnings.warn(
f"The `neighbors_method` attribute has been deprecated and will be "
f"removed in future versions. Please use the new `neighbors` "
f"attribute",
category=FutureWarning,
)
return self.neighbors

def fit(self, X=None, affinities=None, initialization=None):
"""Fit a t-SNE embedding for a given data set.
@@ -1324,7 +1331,7 @@ def prepare_initial(self, X=None, affinities=None, initialization=None):
initialization = "spectral"

# Same spiel for precomputed distance matrices
if self.metric == "precomputed" and initialization == "pca":
if self.metric == "precomputed" and isinstance(initialization, str) and initialization == "pca":
log.warning(
"Attempting to use `pca` initalization, but using precomputed "
"distance matrix! Using `spectral` initilization instead, which "
Expand Down Expand Up @@ -1361,7 +1368,7 @@ def prepare_initial(self, X=None, affinities=None, initialization=None):
)
elif initialization == "random":
embedding = initialization_scheme.random(
X,
n_samples,
self.n_components,
random_state=self.random_state,
verbose=self.verbose,
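
Since the deprecated `neighbors_method` property is removed above, code that inspected it should read the `neighbors` attribute directly; a minimal sketch (the constructor value is illustrative):

from openTSNE import TSNE

tsne = TSNE(neighbors="auto")
print(tsne.neighbors)  # "auto"; previously also exposed via the removed `neighbors_method`
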
12 changes: 4 additions & 8 deletions tests/test_nearest_neighbors.py
@@ -136,13 +136,13 @@ def test_pickle_with_built_index(self):
np.testing.assert_array_almost_equal(load_dist, orig_dist)


class TestBallTree(KNNIndexTestMixin, unittest.TestCase):
knn_index = nearest_neighbors.BallTree
class TestSklearn(KNNIndexTestMixin, unittest.TestCase):
knn_index = nearest_neighbors.Sklearn

def test_cosine_distance(self):
k = 15
# Compute cosine distance nearest neighbors using ball tree
knn_index = nearest_neighbors.BallTree(self.x1, k, "cosine")
knn_index = self.knn_index(self.x1, k, "cosine")
indices, distances = knn_index.build()

# Compute the exact nearest neighbors as a reference
@@ -160,7 +160,7 @@ def test_cosine_distance_query(self):
def test_cosine_distance_query(self):
k = 15
# Compute cosine distance nearest neighbors using ball tree
knn_index = nearest_neighbors.BallTree(self.x1, k, "cosine")
knn_index = self.knn_index(self.x1, k, "cosine")
knn_index.build()

indices, distances = knn_index.query(self.x2, k=k)
@@ -202,10 +202,6 @@ def manhattan(x, y):
)


class TestSklearn(TestBallTree):
pass


@unittest.skipIf(not is_package_installed("hnswlib"), "`hnswlib`is not installed")
class TestHNSW(KNNIndexTestMixin, unittest.TestCase):
knn_index = nearest_neighbors.HNSW
