Skip to content

Commit f0daae9

Browse files
Add jaccard similarity metric (#20)
Co-authored-by: Jonatan Kłosko <jonatanklosko@gmail.com>
1 parent 5d24a61 commit f0daae9

File tree

2 files changed

+141
-0
lines changed

2 files changed

+141
-0
lines changed

lib/scholar/metrics/similarity.ex

+74
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,74 @@
1+
defmodule Scholar.Metrics.Similarity do
  @moduledoc """
  Similarity metrics between 1-D tensors.
  """

  import Nx.Defn
  import Scholar.Shared

  @doc ~S"""
  Computes the Jaccard similarity of two tensors (the Jaccard index, also
  known as the Jaccard similarity coefficient).

  The values of each input are treated as a set, and the result is the number
  of values the two sets share divided by the number of values found in either
  of them:

  $$
  J(A, B) = \frac{\mid A \cap B \mid}{\mid A \cup B \mid}
  $$

  `x` and `y` must have the same shape, otherwise an `ArgumentError` is raised.
  Scalar inputs are rejected with a `RuntimeError`.

  ## Examples

      iex> x = Nx.tensor([1.0, 5.0, 3.0, 6.7])
      iex> y = Nx.tensor([5.0, 2.5, 3.1, 9.0])
      iex> Scholar.Metrics.Similarity.jaccard(x, y)
      #Nx.Tensor<
        f32
        0.1428571492433548
      >

      iex> x = Nx.tensor([1, 2, 3, 5, 7])
      iex> y = Nx.tensor([1, 2, 4, 8, 9])
      iex> Scholar.Metrics.Similarity.jaccard(x, y)
      #Nx.Tensor<
        f32
        0.25
      >

      iex> x = Nx.tensor([1, 2])
      iex> y = Nx.tensor([1, 2, 3])
      iex> Scholar.Metrics.Similarity.jaccard(x, y)
      ** (ArgumentError) expected input shapes to be equal, got {2} != {3}
  """
  defn jaccard(x, y) do
    # Equal shapes are required because that is what the usual use cases look like,
    # even though the last axis could in theory differ between the two inputs.
    assert_same_shape!(x, y)

    distinct_in_x = unique_size(x)
    distinct_in_y = unique_size(y)
    union_count = unique_size(Nx.concatenate([x, y]))

    # Inclusion-exclusion: |A ∩ B| = |A| + |B| - |A ∪ B|.
    (distinct_in_x + distinct_in_y - union_count) / union_count
  end

  # Counts the distinct values of `tensor`.
  #
  # The tensor is sorted and every element is compared with its successor. Each
  # mismatch marks the end of a run of equal values; the last run has no
  # successor to be compared with, hence the final `+ 1`.
  defnp unique_size(%Nx.Tensor{shape: shape} = tensor) do
    case shape do
      {} ->
        raise "expected input shape of at least {1}, got: {}"

      {1} ->
        # A single element is exactly one distinct value.
        1

      _ ->
        ordered = Nx.sort(tensor)
        heads = ordered[0..-2//1]
        successors = ordered[1..-1//1]

        Nx.sum(Nx.not_equal(heads, successors)) + 1
    end
  end
end
+67
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,67 @@
1+
defmodule Scholar.Metrics.SimilarityTest do
  use ExUnit.Case
  alias Scholar.Metrics.Similarity
  doctest Similarity

  describe "jaccard/2" do
    test "returns similarity according to sklearn jaccard_score function" do
      # One shared value (1) out of nine distinct values overall.
      left = Nx.tensor([1, 2, 3, 5, 0])
      right = Nx.tensor([1, 30, 4, 8, 9])

      result = Similarity.jaccard(left, right)

      assert result == Nx.tensor(0.1111111119389534)
    end

    test "returns 100% of similarity" do
      values = [1, 2, 3]

      result = Similarity.jaccard(Nx.tensor(values), Nx.tensor(values))

      assert result == Nx.tensor(1.0)
    end

    test "returns 0% of similarity" do
      left = Nx.tensor([1, 2, 3])
      right = Nx.tensor([4, 5, 6])

      result = Similarity.jaccard(left, right)

      assert result == Nx.tensor(0.0)
    end

    test "returns 20% of similarity" do
      # Only the value 3 is shared, out of five distinct values overall.
      left = Nx.tensor([1, 2, 3])
      right = Nx.tensor([3, 4, 5])

      result = Similarity.jaccard(left, right)

      assert result == Nx.tensor(0.20)
    end

    test "returns similarity when tensors have a single element" do
      left = Nx.tensor([1])
      right = Nx.tensor([2])

      result = Similarity.jaccard(left, right)

      assert result == Nx.tensor(0.0)
    end

    test "returns similarity when tensor has multiple dimensions" do
      left = Nx.tensor([[0, 1, 1], [1, 1, 0]])
      right = Nx.tensor([[1, 1, 1], [1, 0, 0]])

      result = Similarity.jaccard(left, right)

      assert result == Nx.tensor(0.5)
    end

    test "raises exception when tensors have different shapes" do
      shorter = Nx.tensor([1, 2, 3, 5])
      longer = Nx.tensor([1, 30, 4, 8, 9])
      message = "expected input shapes to be equal, got {4} != {5}"

      assert_raise ArgumentError, message, fn ->
        Similarity.jaccard(shorter, longer)
      end
    end

    test "raises exception when tensors have shape zero" do
      # Scalars have shape {} and are rejected outright.
      left = Nx.tensor(1)
      right = Nx.tensor(1)
      message = "expected input shape of at least {1}, got: {}"

      assert_raise RuntimeError, message, fn ->
        Similarity.jaccard(left, right)
      end
    end
  end
end

0 commit comments

Comments
 (0)