Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Add jaccard similarity metric #20

Merged
merged 12 commits into from
Aug 18, 2022
74 changes: 74 additions & 0 deletions lib/scholar/metrics/similarity.ex
Original file line number Diff line number Diff line change
@@ -0,0 +1,74 @@
defmodule Scholar.Metrics.Similarity do
  @moduledoc """
  Similarity metrics between 1-D tensors.
  """

  import Nx.Defn
  import Scholar.Shared

  @doc ~S"""
  Computes the Jaccard similarity of two tensors (the Jaccard similarity
  coefficient, also called the Jaccard index).

  The Jaccard similarity measures how alike two sets are: it is the number of
  elements the sets share divided by the number of elements found in either set.

  $$
  J(A, B) = \frac{\mid A \cap B \mid}{\mid A \cup B \mid}
  $$

  Both tensors must have the same shape, otherwise an `ArgumentError` is raised.

  ## Examples

      iex> x = Nx.tensor([1.0, 5.0, 3.0, 6.7])
      iex> y = Nx.tensor([5.0, 2.5, 3.1, 9.0])
      iex> Scholar.Metrics.Similarity.jaccard(x, y)
      #Nx.Tensor<
        f32
        0.1428571492433548
      >

      iex> x = Nx.tensor([1, 2, 3, 5, 7])
      iex> y = Nx.tensor([1, 2, 4, 8, 9])
      iex> Scholar.Metrics.Similarity.jaccard(x, y)
      #Nx.Tensor<
        f32
        0.25
      >

      iex> x = Nx.tensor([1, 2])
      iex> y = Nx.tensor([1, 2, 3])
      iex> Scholar.Metrics.Similarity.jaccard(x, y)
      ** (ArgumentError) expected input shapes to be equal, got {2} != {3}
  """
  defn jaccard(x, y) do
    # Equal shapes are required because that is the usual use case, even though
    # the last axis could in theory differ between the two inputs.
    assert_same_shape!(x, y)

    # Count each input on its own first, so an invalid input is rejected
    # before the two tensors are concatenated.
    distinct_in_x = distinct_count(x)
    distinct_in_y = distinct_count(y)

    combined = Nx.concatenate([x, y])
    distinct_in_union = distinct_count(combined)

    # Inclusion-exclusion: |A ∩ B| = |A| + |B| - |A ∪ B|
    (distinct_in_x + distinct_in_y - distinct_in_union) / distinct_in_union
  end

  # Counts the distinct values in `tensor`: after sorting, equal values sit next
  # to each other, so the count is one more than the number of positions whose
  # value differs from the value that follows it.
  #
  # NOTE(review): for inputs of rank > 1 the two slices below are presumably
  # taken along the first axis only - confirm the resulting count is intended.
  defnp distinct_count(%Nx.Tensor{shape: shape} = tensor) do
    case shape do
      {} ->
        raise "expected input shape of at least {1}, got: {}"

      {1} ->
        # A single element has no successor to compare against.
        1

      _ ->
        ordered = Nx.sort(tensor)

        all_but_last = ordered[0..-2//1]
        all_but_first = ordered[1..-1//1]

        Nx.add(Nx.sum(Nx.not_equal(all_but_last, all_but_first)), 1)
    end
  end
end
67 changes: 67 additions & 0 deletions test/scholar/metrics/similarity_test.exs
Original file line number Diff line number Diff line change
@@ -0,0 +1,67 @@
defmodule Scholar.Metrics.SimilarityTest do
  # Every test is a pure computation on locally built tensors, so the module
  # can safely run concurrently with other test modules.
  use ExUnit.Case, async: true
  alias Scholar.Metrics.Similarity
  doctest Similarity

  describe "jaccard/2" do
    test "returns similarity according to sklearn jaccard_score function" do
      x = Nx.tensor([1, 2, 3, 5, 0])
      y = Nx.tensor([1, 30, 4, 8, 9])

      # Only 1 is shared; the union holds 9 distinct values => 1/9.
      assert Similarity.jaccard(x, y) == Nx.tensor(0.1111111119389534)
    end

    test "returns 100% of similarity" do
      x = Nx.tensor([1, 2, 3])
      y = Nx.tensor([1, 2, 3])

      assert Similarity.jaccard(x, y) == Nx.tensor(1.0)
    end

    test "returns 0% of similarity" do
      x = Nx.tensor([1, 2, 3])
      y = Nx.tensor([4, 5, 6])

      assert Similarity.jaccard(x, y) == Nx.tensor(0.0)
    end

    test "returns 20% of similarity" do
      x = Nx.tensor([1, 2, 3])
      y = Nx.tensor([3, 4, 5])

      # Only 3 is shared; the union holds 5 distinct values => 1/5.
      assert Similarity.jaccard(x, y) == Nx.tensor(0.20)
    end

    test "returns similarity when tensors have a single element" do
      x = Nx.tensor([1])
      y = Nx.tensor([2])

      assert Similarity.jaccard(x, y) == Nx.tensor(0.0)
    end

    test "returns similarity when tensor has multiple dimensions" do
      x = Nx.tensor([[0, 1, 1], [1, 1, 0]])
      y = Nx.tensor([[1, 1, 1], [1, 0, 0]])

      # NOTE(review): read as flat sets, both inputs contain exactly {0, 1} and
      # would score 1.0. The 0.5 below pins the current behaviour for rank-2
      # inputs - confirm that it is the intended result.
      assert Similarity.jaccard(x, y) == Nx.tensor(0.5)
    end

    test "raises exception when tensors have different shapes" do
      x = Nx.tensor([1, 2, 3, 5])
      y = Nx.tensor([1, 30, 4, 8, 9])

      assert_raise ArgumentError, "expected input shapes to be equal, got {4} != {5}", fn ->
        Similarity.jaccard(x, y)
      end
    end

    test "raises exception when tensors are scalars (shape {})" do
      x = Nx.tensor(1)
      y = Nx.tensor(1)

      assert_raise RuntimeError, "expected input shape of at least {1}, got: {}", fn ->
        Similarity.jaccard(x, y)
      end
    end
  end
end