Commit 1e6682c

Merge pull request #178 from pavlin-policar/mnt
Maintenance
2 parents: 43507dd + 0c6e960

7 files changed (+149 -259)

examples/01_simple_usage.ipynb (+52 -36)

Large diffs are not rendered by default.

examples/02_advanced_usage.ipynb (+58 -49)

Large diffs are not rendered by default.

openTSNE/callbacks.py (-36)

@@ -39,42 +39,6 @@ def __call__(self, iteration, error, embedding):
     """


-class ErrorLogger(Callback):
-    """Basic error logger.
-
-    This logger prints out basic information about the optimization. These
-    include the iteration number, error and how much time has elapsed from the
-    previous callback invocation.
-
-    """
-
-    def __init__(self):
-        warnings.warn(
-            "`ErrorLogger` will be removed in upcoming version. Please use the "
-            "`verbose` flag instead.",
-            category=FutureWarning,
-        )
-        self.iter_count = 0
-        self.last_log_time = None
-
-    def optimization_about_to_start(self):
-        self.last_log_time = time.time()
-        self.iter_count = 0
-
-    def __call__(self, iteration, error, embedding):
-        now = time.time()
-        duration = now - self.last_log_time
-        self.last_log_time = now
-
-        n_iters = iteration - self.iter_count
-        self.iter_count = iteration
-
-        print(
-            "Iteration % 4d, KL divergence % 6.4f, %d iterations in %.4f sec"
-            % (iteration, error, n_iters, duration)
-        )
-
-
 class VerifyExaggerationError(Callback):
     """Used to verify that the exaggeration correction implemented in
     `gradient_descent` is correct."""
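
The removed `ErrorLogger` had been emitting a `FutureWarning` pointing users at the `verbose` flag; this commit completes that deprecation. A minimal migration sketch (the data matrix `X` is a placeholder):

    import numpy as np
    from openTSNE import TSNE

    X = np.random.randn(1000, 50)  # placeholder data

    # Before: TSNE(callbacks=[ErrorLogger()], callbacks_every_iters=50).fit(X)
    # After: the verbose flag logs equivalent progress information
    # (iteration number, KL divergence, elapsed time)
    embedding = TSNE(verbose=True).fit(X)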

openTSNE/initialization.py (+6 -4)

@@ -28,13 +28,13 @@ def rescale(x, inplace=False):
     return x


-def random(X, n_components=2, random_state=None, verbose=False):
+def random(n_samples, n_components=2, random_state=None, verbose=False):
     """Initialize an embedding using samples from an isotropic Gaussian.

     Parameters
     ----------
-    X: np.ndarray
-        The data matrix.
+    n_samples: Union[int, np.ndarray]
+        The number of samples. Also accepts a data matrix.

     n_components: int
         The dimension of the embedding space.
@@ -53,7 +53,9 @@ def random(X, n_components=2, random_state=None, verbose=False):

     """
     random_state = check_random_state(random_state)
-    embedding = random_state.normal(0, 1e-4, (X.shape[0], n_components))
+    if isinstance(n_samples, np.ndarray):
+        n_samples = n_samples.shape[0]
+    embedding = random_state.normal(0, 1e-4, (n_samples, n_components))
     return np.ascontiguousarray(embedding)

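
With this change, `random` needs only a sample count, while a data matrix is still accepted for backwards compatibility. A small sketch of both call styles:

    import numpy as np
    from openTSNE import initialization

    X = np.random.randn(500, 30)  # placeholder data

    # Both calls produce the same (500, 2) Gaussian-initialized embedding,
    # since only X's row count is used
    emb_from_count = initialization.random(500, n_components=2, random_state=42)
    emb_from_data = initialization.random(X, n_components=2, random_state=42)
    np.testing.assert_array_equal(emb_from_count, emb_from_data)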

openTSNE/nearest_neighbors.py (+1 -105)

@@ -108,7 +108,7 @@ class Sklearn(KNNIndex):
         "sokalmichener",
         "sokalsneath",
         "wminkowski",
-    ]
+    ] + ["cosine"]  # our own workaround implementation

     def __init__(self, *args, **kwargs):
         super().__init__(*args, **kwargs)
@@ -205,110 +205,6 @@ def query(self, query, k):
         return indices, distances


-class BallTree(KNNIndex):
-    VALID_METRICS = neighbors.BallTree.valid_metrics + ["cosine"]
-
-    def __init__(self, *args, **kwargs):
-        super().__init__(*args, **kwargs)
-        self.__data = None
-
-        warnings.warn(
-            f"`nearest_neighbors.BallTree` has been superseeded by "
-            f"`nearest_neighbors.Sklearn` and will be removed from future versions",
-            category=FutureWarning,
-        )
-
-    def build(self):
-        data, k = self.data, self.k
-
-        timer = utils.Timer(
-            f"Finding {k} nearest neighbors using exact search using "
-            f"{self.metric} distance...",
-            verbose=self.verbose,
-        )
-        timer.__enter__()
-
-        if self.metric == "cosine":
-            # The nearest neighbor ranking for cosine distance is the same as
-            # for euclidean distance on normalized data
-            effective_metric = "euclidean"
-            effective_data = data.copy()
-            effective_data = (
-                effective_data / np.linalg.norm(effective_data, axis=1)[:, None]
-            )
-            # In order to properly compute cosine distances when querying the
-            # index, we need to store the original data
-            self.__data = data
-        else:
-            effective_metric = self.metric
-            effective_data = data
-
-        self.index = neighbors.NearestNeighbors(
-            algorithm="ball_tree",
-            metric=effective_metric,
-            metric_params=self.metric_params,
-            n_jobs=self.n_jobs,
-        )
-        self.index.fit(effective_data)
-
-        # Return the nearest neighbors in the training set
-        distances, indices = self.index.kneighbors(n_neighbors=k)
-
-        # If using cosine distance, the computed distances will be wrong and
-        # need to be recomputed
-        if self.metric == "cosine":
-            distances = np.vstack(
-                [
-                    cdist(np.atleast_2d(x), data[idx], metric="cosine")
-                    for x, idx in zip(data, indices)
-                ]
-            )
-
-        timer.__exit__()
-
-        return indices, distances
-
-    def query(self, query, k):
-        timer = utils.Timer(
-            f"Finding {k} nearest neighbors in existing embedding using exact search...",
-            self.verbose,
-        )
-        timer.__enter__()
-
-        # The nearest neighbor ranking for cosine distance is the same as for
-        # euclidean distance on normalized data
-        if self.metric == "cosine":
-            effective_data = query.copy()
-            effective_data = (
-                effective_data / np.linalg.norm(effective_data, axis=1)[:, None]
-            )
-        else:
-            effective_data = query
-
-        distances, indices = self.index.kneighbors(effective_data, n_neighbors=k)
-
-        # If using cosine distance, the computed distances will be wrong and
-        # need to be recomputed
-        if self.metric == "cosine":
-            if self.__data is None:
-                raise RuntimeError(
-                    "The original data was unavailable when querying cosine "
-                    "distance. Did you change the distance metric after "
-                    "building the index? Please rebuild the index using cosine "
-                    "similarity."
-                )
-            distances = np.vstack(
-                [
-                    cdist(np.atleast_2d(x), self.__data[idx], metric="cosine")
-                    for x, idx in zip(query, indices)
-                ]
-            )
-
-        timer.__exit__()
-
-        return indices, distances
-
-
 class Annoy(KNNIndex):
     """Annoy KNN Index.
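
The deleted `BallTree` index encoded a trick that the `Sklearn` index (note the appended `"cosine"` in `VALID_METRICS` above) now carries: the cosine nearest-neighbor ranking is identical to the euclidean ranking on L2-normalized rows, so a euclidean ball tree can answer cosine queries as long as the true cosine distances are recomputed afterwards. A standalone sketch of that equivalence, independent of openTSNE:

    import numpy as np
    from scipy.spatial.distance import cdist
    from sklearn.neighbors import NearestNeighbors

    data = np.random.RandomState(0).randn(200, 16)

    # A euclidean ball tree over unit-normalized rows...
    normed = data / np.linalg.norm(data, axis=1)[:, None]
    index = NearestNeighbors(algorithm="ball_tree").fit(normed)
    _, indices = index.kneighbors(normed, n_neighbors=6)
    indices = indices[:, 1:]  # drop each point itself

    # ...ranks neighbors exactly like the cosine distance
    cosine_order = np.argsort(cdist(data, data, metric="cosine"), axis=1)[:, 1:6]
    np.testing.assert_array_equal(indices, cosine_order)

    # The ball tree's euclidean distances are not cosine distances, though,
    # so they must be recomputed for the reported neighbors
    distances = np.vstack(
        [cdist(np.atleast_2d(x), data[idx], metric="cosine")
         for x, idx in zip(data, indices)]
    )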

openTSNE/tsne.py (+28 -21)

@@ -37,12 +37,23 @@ def _check_callbacks(callbacks):
 def _handle_nice_params(embedding: np.ndarray, optim_params: dict) -> None:
     """Convert the user friendly params into something the optimizer can
     understand."""
+    n_samples = embedding.shape[0]
     # Handle callbacks
     optim_params["callbacks"] = _check_callbacks(optim_params.get("callbacks"))
     optim_params["use_callbacks"] = optim_params["callbacks"] is not None

     # Handle negative gradient method
     negative_gradient_method = optim_params.pop("negative_gradient_method")
+    # Handle `auto` negative gradient method
+    if isinstance(negative_gradient_method, str) and negative_gradient_method == "auto":
+        if n_samples < 10_000:
+            negative_gradient_method = "bh"
+        else:
+            negative_gradient_method = "fft"
+        log.info(
+            f"Automatically determined negative gradient method `{negative_gradient_method}`"
+        )
+
     if callable(negative_gradient_method):
         negative_gradient_method = negative_gradient_method
     elif negative_gradient_method in {"bh", "BH", "barnes-hut"}:
@@ -78,7 +89,7 @@ def _handle_nice_params(embedding: np.ndarray, optim_params: dict) -> None:

     # Determine learning rate if requested
     if optim_params.get("learning_rate", "auto") == "auto":
-        optim_params["learning_rate"] = max(200, embedding.shape[0] / 12)
+        optim_params["learning_rate"] = max(200, n_samples / 12)


 def __check_init_num_samples(num_samples, required_num_samples):
@@ -169,7 +180,8 @@ class PartialTSNEEmbedding(np.ndarray):
         using one of the following aliases: ``bh``, ``BH`` or ``barnes-hut``.
         For larger data sets, the FFT accelerated interpolation method is more
         appropriate and can be set using one of the following aliases: ``fft``,
-        ``FFT`` or ``ìnterpolation``.
+        ``FFT`` or ``ìnterpolation``. Alternatively, you can use ``auto`` to
+        approximately select the faster method.

     theta: float
         This is the trade-off parameter between speed and accuracy of the tree
@@ -290,6 +302,8 @@ def optimize(
             ``barnes-hut``. For larger data sets, the FFT accelerated
             interpolation method is more appropriate and can be set using one of
             the following aliases: ``fft``, ``FFT`` or ``ìnterpolation``.
+            Alternatively, you can use ``auto`` to approximately select the
+            faster method.

         theta: float
             This is the trade-off parameter between speed and accuracy of the
@@ -431,7 +445,8 @@ class TSNEEmbedding(np.ndarray):
         using one of the following aliases: ``bh``, ``BH`` or ``barnes-hut``.
         For larger data sets, the FFT accelerated interpolation method is more
         appropriate and can be set using one of the following aliases: ``fft``,
-        ``FFT`` or ``ìnterpolation``.
+        ``FFT`` or ``ìnterpolation``. Alternatively, you can use ``auto`` to
+        approximately select the faster method.

     theta: float
         This is the trade-off parameter between speed and accuracy of the tree
@@ -490,7 +505,7 @@ def __new__(
         n_interpolation_points=3,
         min_num_intervals=50,
         ints_in_interval=1,
-        negative_gradient_method="fft",
+        negative_gradient_method="auto",
         random_state=None,
         optimizer=None,
         **gradient_descent_params,
@@ -571,6 +586,8 @@ def optimize(
             ``barnes-hut``. For larger data sets, the FFT accelerated
             interpolation method is more appropriate and can be set using one of
             the following aliases: ``fft``, ``FFT`` or ``ìnterpolation``.
+            Alternatively, you can use ``auto`` to approximately select the
+            faster method.

         theta: float
             This is the trade-off parameter between speed and accuracy of the
@@ -1000,7 +1017,8 @@ class TSNE(BaseEstimator):
         This is the trade-off parameter between speed and accuracy of the tree
         approximation method. Typical values range from 0.2 to 0.8. The value 0
         indicates that no approximation is to be made and produces exact results
-        also producing longer runtime.
+        also producing longer runtime. Alternatively, you can use ``auto`` to
+        approximately select the faster method.

     n_interpolation_points: int
         Only used when ``negative_gradient_method="fft"`` or its other aliases.
@@ -1071,7 +1089,8 @@ class TSNE(BaseEstimator):
         using one of the following aliases: ``bh``, ``BH`` or ``barnes-hut``.
         For larger data sets, the FFT accelerated interpolation method is more
         appropriate and can be set using one of the following aliases: ``fft``,
-        ``FFT`` or ``ìnterpolation``.
+        ``FFT`` or ``ìnterpolation``. Alternatively, you can use ``auto`` to
+        approximately select the faster method.

     callbacks: Union[Callable, List[Callable]]
         Callbacks, which will be run every ``callbacks_every_iters`` iterations.
@@ -1113,7 +1132,7 @@ def __init__(
         max_step_norm=5,
         n_jobs=1,
         neighbors="auto",
-        negative_gradient_method="fft",
+        negative_gradient_method="auto",
         callbacks=None,
         callbacks_every_iters=50,
         random_state=None,
@@ -1154,18 +1173,6 @@ def __init__(
         self.random_state = random_state
         self.verbose = verbose

-    @property
-    def neighbors_method(self):
-        import warnings
-
-        warnings.warn(
-            f"The `neighbors_method` attribute has been deprecated and will be "
-            f"removed in future versions. Please use the new `neighbors` "
-            f"attribute",
-            category=FutureWarning,
-        )
-        return self.neighbors
-
     def fit(self, X=None, affinities=None, initialization=None):
         """Fit a t-SNE embedding for a given data set.
@@ -1324,7 +1331,7 @@ def prepare_initial(self, X=None, affinities=None, initialization=None):
             initialization = "spectral"

         # Same spiel for precomputed distance matrices
-        if self.metric == "precomputed" and initialization == "pca":
+        if self.metric == "precomputed" and isinstance(initialization, str) and initialization == "pca":
             log.warning(
                 "Attempting to use `pca` initalization, but using precomputed "
                 "distance matrix! Using `spectral` initilization instead, which "
@@ -1361,7 +1368,7 @@ def prepare_initial(self, X=None, affinities=None, initialization=None):
             )
         elif initialization == "random":
             embedding = initialization_scheme.random(
-                X,
+                n_samples,
                 self.n_components,
                 random_state=self.random_state,
                 verbose=self.verbose,
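
`negative_gradient_method` now defaults to `"auto"`: Barnes-Hut below 10,000 samples, the FFT-accelerated interpolation scheme at or above that. The sketch below mirrors the new branch in `_handle_nice_params` together with the existing `"auto"` learning-rate rule; it is an illustration, not the library's internal code:

    def choose_optim_params(n_samples, negative_gradient_method="auto", learning_rate="auto"):
        """Standalone mirror of the `auto` heuristics in `_handle_nice_params`."""
        if negative_gradient_method == "auto":
            # Barnes-Hut is faster on small inputs; FFT interpolation scales better
            negative_gradient_method = "bh" if n_samples < 10_000 else "fft"
        if learning_rate == "auto":
            learning_rate = max(200, n_samples / 12)
        return negative_gradient_method, learning_rate

    choose_optim_params(5_000)   # -> ("bh", 416.666...)
    choose_optim_params(60_000)  # -> ("fft", 5000.0)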

tests/test_nearest_neighbors.py (+4 -8)

@@ -136,13 +136,13 @@ def test_pickle_with_built_index(self):
         np.testing.assert_array_almost_equal(load_dist, orig_dist)


-class TestBallTree(KNNIndexTestMixin, unittest.TestCase):
-    knn_index = nearest_neighbors.BallTree
+class TestSklearn(KNNIndexTestMixin, unittest.TestCase):
+    knn_index = nearest_neighbors.Sklearn

     def test_cosine_distance(self):
         k = 15
         # Compute cosine distance nearest neighbors using ball tree
-        knn_index = nearest_neighbors.BallTree(self.x1, k, "cosine")
+        knn_index = self.knn_index(self.x1, k, "cosine")
         indices, distances = knn_index.build()

         # Compute the exact nearest neighbors as a reference
@@ -160,7 +160,7 @@ def test_cosine_distance(self):
     def test_cosine_distance_query(self):
         k = 15
         # Compute cosine distance nearest neighbors using ball tree
-        knn_index = nearest_neighbors.BallTree(self.x1, k, "cosine")
+        knn_index = self.knn_index(self.x1, k, "cosine")
         knn_index.build()

         indices, distances = knn_index.query(self.x2, k=k)
@@ -202,10 +202,6 @@ def manhattan(x, y):
         )


-class TestSklearn(TestBallTree):
-    pass
-
-
 @unittest.skipIf(not is_package_installed("hnswlib"), "`hnswlib`is not installed")
 class TestHNSW(KNNIndexTestMixin, unittest.TestCase):
     knn_index = nearest_neighbors.HNSW
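
The test classes follow a mixin pattern: `KNNIndexTestMixin` provides generic tests that read the index class from a `knn_index` attribute, which is why retargeting the suite from `BallTree` to `Sklearn` is a two-line change. A simplified sketch of the pattern (the mixin shown here is illustrative, not the project's actual mixin):

    import unittest
    import numpy as np
    from openTSNE import nearest_neighbors

    class KNNIndexTestMixinSketch:
        """Generic tests; concrete subclasses only set `knn_index`."""
        knn_index = None

        def setUp(self):
            self.x1 = np.random.RandomState(0).randn(100, 4)

        def test_build_returns_k_neighbors(self):
            index = self.knn_index(self.x1, 15, "euclidean")
            indices, distances = index.build()
            self.assertEqual(indices.shape, (100, 15))
            self.assertEqual(distances.shape, (100, 15))

    class TestSklearnSketch(KNNIndexTestMixinSketch, unittest.TestCase):
        knn_index = nearest_neighbors.Sklearn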
