diff --git a/lib/scholar/metrics/similarity.ex b/lib/scholar/metrics/similarity.ex
new file mode 100644
index 00000000..40e263c7
--- /dev/null
+++ b/lib/scholar/metrics/similarity.ex
@@ -0,0 +1,93 @@
+defmodule Scholar.Metrics.Similarity do
+  @moduledoc """
+  Similarity metrics between 1-D tensors.
+  """
+
+  import Nx.Defn
+  import Scholar.Shared
+
+  @doc ~S"""
+  Calculates Jaccard similarity (also known as Jaccard similarity coefficient, or Jaccard index).
+
+  Jaccard similarity is a statistic used to measure similarities between two sets. Mathematically, the calculation
+  of Jaccard similarity is the ratio of set intersection over set union.
+
+  $$
+  J(A, B) = \frac{\mid A \cap B \mid}{\mid A \cup B \mid}
+  $$
+
+  Both tensors must have the same shape. Each one is treated as the set of its distinct
+  elements, so tensors with more than one dimension are flattened before counting.
+
+  ## Examples
+
+      iex> x = Nx.tensor([1.0, 5.0, 3.0, 6.7])
+      iex> y = Nx.tensor([5.0, 2.5, 3.1, 9.0])
+      iex> Scholar.Metrics.Similarity.jaccard(x, y)
+      #Nx.Tensor<
+        f32
+        0.1428571492433548
+      >
+
+      iex> x = Nx.tensor([1, 2, 3, 5, 7])
+      iex> y = Nx.tensor([1, 2, 4, 8, 9])
+      iex> Scholar.Metrics.Similarity.jaccard(x, y)
+      #Nx.Tensor<
+        f32
+        0.25
+      >
+
+      iex> x = Nx.tensor([1, 2])
+      iex> y = Nx.tensor([1, 2, 3])
+      iex> Scholar.Metrics.Similarity.jaccard(x, y)
+      ** (ArgumentError) expected input shapes to be equal, got {2} != {3}
+  """
+  defn jaccard(x, y) do
+    # We're requiring the same shape because usual use cases will have the same shape.
+    # The last axis could in theory be different on both sides.
+    assert_same_shape!(x, y)
+
+    x_size = unique_size(x)
+    y_size = unique_size(y)
+
+    union_size = unique_size(Nx.concatenate([x, y]))
+    # Inclusion-exclusion: intersection = |x| + |y| - union.
+    intersection_size = x_size + y_size - union_size
+
+    intersection_size / union_size
+  end
+
+  # Counts the distinct elements of `tensor`, whatever its rank. Scalars are rejected.
+  defnp unique_size(%Nx.Tensor{shape: shape} = tensor) do
+    case shape do
+      {} ->
+        raise "expected input shape of at least {1}, got: {}"
+
+      _ ->
+        # Flatten first: the sort and the slices used for counting work along the
+        # first axis only, which would compare whole rows on multi-dimensional input.
+        tensor
+        |> Nx.flatten()
+        |> flat_unique_size()
+    end
+  end
+
+  # Counts the distinct elements of an already flattened (rank-1) tensor.
+  defnp flat_unique_size(%Nx.Tensor{shape: shape} = flat) do
+    case shape do
+      {1} ->
+        # A single element has no successor to compare against.
+        1
+
+      _ ->
+        sorted = Nx.sort(flat)
+
+        # Once sorted, equal values are adjacent, so every change marks a new value.
+        different_from_successor? = Nx.not_equal(sorted[0..-2//1], sorted[1..-1//1])
+
+        different_from_successor?
+        |> Nx.sum()
+        |> Nx.add(1)
+    end
+  end
+end
diff --git a/test/scholar/metrics/similarity_test.exs b/test/scholar/metrics/similarity_test.exs
new file mode 100644
index 00000000..0ba642b1
--- /dev/null
+++ b/test/scholar/metrics/similarity_test.exs
@@ -0,0 +1,76 @@
+defmodule Scholar.Metrics.SimilarityTest do
+  use ExUnit.Case, async: true
+  alias Scholar.Metrics.Similarity
+  doctest Similarity
+
+  describe "jaccard/2" do
+    test "returns similarity according to sklearn jaccard_score function" do
+      x = Nx.tensor([1, 2, 3, 5, 0])
+      y = Nx.tensor([1, 30, 4, 8, 9])
+
+      assert Similarity.jaccard(x, y) == Nx.tensor(0.1111111119389534)
+    end
+
+    test "returns 100% of similarity" do
+      x = Nx.tensor([1, 2, 3])
+      y = Nx.tensor([1, 2, 3])
+
+      assert Similarity.jaccard(x, y) == Nx.tensor(1.0)
+    end
+
+    test "returns 0% of similarity" do
+      x = Nx.tensor([1, 2, 3])
+      y = Nx.tensor([4, 5, 6])
+
+      assert Similarity.jaccard(x, y) == Nx.tensor(0.0)
+    end
+
+    test "returns 20% of similarity" do
+      x = Nx.tensor([1, 2, 3])
+      y = Nx.tensor([3, 4, 5])
+
+      assert Similarity.jaccard(x, y) == Nx.tensor(0.20)
+    end
+
+    test "returns similarity when tensors have a single element" do
+      x = Nx.tensor([1])
+      y = Nx.tensor([2])
+
+      assert Similarity.jaccard(x, y) == Nx.tensor(0.0)
+    end
+
+    test "returns similarity when tensor has multiple dimensions" do
+      x = Nx.tensor([[0, 1, 1], [1, 1, 0]])
+      y = Nx.tensor([[1, 1, 1], [1, 0, 0]])
+
+      # Both tensors hold exactly the values {0, 1}, so the sets are identical.
+      assert Similarity.jaccard(x, y) == Nx.tensor(1.0)
+    end
+
+    test "counts distinct elements across every axis" do
+      x = Nx.tensor([[1, 2], [3, 4]])
+      y = Nx.tensor([[1, 2], [3, 3]])
+
+      # {1, 2, 3, 4} vs {1, 2, 3}: an intersection of 3 over a union of 4.
+      assert Similarity.jaccard(x, y) == Nx.tensor(0.75)
+    end
+
+    test "raises exception when tensors have different shapes" do
+      x = Nx.tensor([1, 2, 3, 5])
+      y = Nx.tensor([1, 30, 4, 8, 9])
+
+      assert_raise ArgumentError, "expected input shapes to be equal, got {4} != {5}", fn ->
+        Similarity.jaccard(x, y)
+      end
+    end
+
+    test "raises exception when tensors have shape zero" do
+      x = Nx.tensor(1)
+      y = Nx.tensor(1)
+
+      assert_raise RuntimeError, "expected input shape of at least {1}, got: {}", fn ->
+        Similarity.jaccard(x, y)
+      end
+    end
+  end
+end