Skip to content

Commit 24d61a6

Browse files
ShriprajwalKShriprajwal Khbollon
committed
feat: add k-gram shingle to Jaccard/Cosine sim (#11)
* feat: add Shingle function * test: update unit tests for Cosine/Jaccard with shingle Co-authored-by: Shriprajwal K <the_daemon_lord@Shriprajwals-MacBook-Air.local> Co-authored-by: hbollon <hugo.bollon@gmail.com>
1 parent 4f1acf0 commit 24d61a6

8 files changed

+177
-30
lines changed

cosine.go

+17-5
Original file line numberDiff line numberDiff line change
@@ -8,12 +8,24 @@ import (
88
)
99

1010
// CosineSimilarity uses the cosine algorithm to return a similarity index between string vectors.
11-
// Takes two strings as parameters and return an index.
12-
// This algorithm is only effective between sentences and not unique words.
13-
func CosineSimilarity(str1, str2 string) float32 {
11+
// Takes two strings as parameters and a split length which defines the k-gram shingle length
12+
// (if zero, the strings are split on whitespace) and returns an index.
13+
func CosineSimilarity(str1, str2 string, splitLength int) float32 {
14+
if str1 == "" || str2 == "" {
15+
return 0
16+
}
17+
1418
// Split string before rune conversion for cosine calculation
15-
splittedStr1 := strings.Split(str1, " ")
16-
splittedStr2 := strings.Split(str2, " ")
19+
// If splitLength == 0 then split on whitespaces
20+
// Else use shingle algorithm
21+
var splittedStr1, splittedStr2 []string
22+
if splitLength == 0 {
23+
splittedStr1 = strings.Split(str1, " ")
24+
splittedStr2 = strings.Split(str2, " ")
25+
} else {
26+
splittedStr1 = ShingleSlice(str1, splitLength)
27+
splittedStr2 = ShingleSlice(str2, splitLength)
28+
}
1729

1830
// Conversion of split strings into rune arrays
1931
runeStr1 := make([][]rune, len(splittedStr1))

cosine_test.go

+26-1
Original file line numberDiff line numberDiff line change
@@ -36,13 +36,38 @@ func TestCosineSimilarity(t *testing.T) {
3636
}
3737
for _, tt := range tests {
3838
t.Run(tt.name, func(t *testing.T) {
39-
if got := CosineSimilarity(tt.args.str1, tt.args.str2); got != tt.want {
39+
if got := CosineSimilarity(tt.args.str1, tt.args.str2, 0); got != tt.want {
4040
t.Errorf("CosineSimilarity() = %v, want %v", got, tt.want)
4141
}
4242
})
4343
}
4444
}
4545

46+
func TestCosineShingleSimilarity(t *testing.T) {
47+
type args struct {
48+
str1 string
49+
str2 string
50+
}
51+
tests := []struct {
52+
name string
53+
args args
54+
want float32
55+
}{
56+
{"Cosine shingle sim 1", args{"Radiohead", "Carly Rae Jepsen"}, 0.09759001},
57+
{"Cosine shingle sim 2", args{"I love horror movies", "Lights out is a horror movie"}, 0.5335784},
58+
{"Cosine shingle sim 3", args{"love horror movies", "Lights out horror movie"}, 0.61977977},
59+
{"Cosine shingle sim 4", args{"私の名前はジョンです", "私の名前はジョン・ドゥです"}, 0.76980036},
60+
{"Cosine shingle sim 5", args{"🙂😄🙂😄 😄🙂😄", "🙂😄🙂😄 😄🙂😄 🙂😄🙂"}, 0.8944272},
61+
}
62+
for _, tt := range tests {
63+
t.Run(tt.name, func(t *testing.T) {
64+
if got := CosineSimilarity(tt.args.str1, tt.args.str2, 2); got != tt.want {
65+
t.Errorf("CosineSimilarity() with shingle 2 = %v, want %v", got, tt.want)
66+
}
67+
})
68+
}
69+
}
70+
4671
func Test_union(t *testing.T) {
4772
type args struct {
4873
a []string

jaccard.go

+19-7
Original file line numberDiff line numberDiff line change
@@ -5,12 +5,25 @@ import (
55
)
66

77
// JaccardSimilarity computes the Jaccard similarity coefficient between two strings.
8-
// Takes two strings as parameters and return an index.
9-
// This algorithm is only effective between sentences and not unique words.
10-
func JaccardSimilarity(str1, str2 string) float32 {
11-
// Split string before rune conversion for cosine calculation
12-
splittedStr1 := strings.Split(str1, " ")
13-
splittedStr2 := strings.Split(str2, " ")
8+
// Takes two strings as parameters and a split length which defines the k-gram shingle length
9+
// (if zero, the strings are split on whitespace) and returns an index.
10+
func JaccardSimilarity(str1, str2 string, splitLength int) float32 {
11+
if str1 == "" || str2 == "" {
12+
return 0
13+
}
14+
15+
// Split string before rune conversion for jaccard calculation
16+
// If splitLength == 0 then split on whitespaces
17+
// Else use shingle algorithm
18+
var splittedStr1, splittedStr2 []string
19+
if splitLength == 0 {
20+
splittedStr1 = strings.Split(str1, " ")
21+
splittedStr2 = strings.Split(str2, " ")
22+
} else {
23+
splittedStr1 = ShingleSlice(str1, splitLength)
24+
splittedStr2 = ShingleSlice(str2, splitLength)
25+
}
26+
1427
// Conversion of splitted string into rune array
1528
runeStr1 := make([][]rune, len(splittedStr1))
1629
for i, str := range splittedStr1 {
@@ -23,7 +36,6 @@ func JaccardSimilarity(str1, str2 string) float32 {
2336

2437
// Create union keywords slice between input strings
2538
unionStr := union(splittedStr1, splittedStr2)
26-
2739
jacc := float32(len(runeStr1) + len(runeStr2) - len(unionStr))
2840

2941
return jacc / float32(len(unionStr))

jaccard_test.go

+27-1
Original file line numberDiff line numberDiff line change
@@ -23,9 +23,35 @@ func TestJaccardSimilarity(t *testing.T) {
2323

2424
for _, tt := range tests {
2525
t.Run(tt.name, func(t *testing.T) {
26-
if got := JaccardSimilarity(tt.args.str1, tt.args.str2); got != tt.want {
26+
if got := JaccardSimilarity(tt.args.str1, tt.args.str2, 0); got != tt.want {
2727
t.Errorf("JaccardSimilarity() = %v, want %v", got, tt.want)
2828
}
2929
})
3030
}
3131
}
32+
33+
func TestJaccardShingleSimilarity(t *testing.T) {
34+
type args struct {
35+
str1 string
36+
str2 string
37+
}
38+
tests := []struct {
39+
name string
40+
args args
41+
want float32
42+
}{
43+
{"Jaccard shingle sim 1", args{"Radiohead", "Carly Rae Jepsen"}, 0.04761905},
44+
{"Jaccard shingle sim 2", args{"I love horror movies", "Lights out is a horror movie"}, 0.3548387},
45+
{"Jaccard shingle sim 3", args{"love horror movies", "Lights out horror movie"}, 0.44},
46+
{"Jaccard shingle sim 4", args{"私の名前はジョンです", "私の名前はジョン・ドゥです"}, 0.61538464},
47+
{"Jaccard shingle sim 5", args{"🙂😄🙂😄 😄🙂😄", "🙂😄🙂😄 😄🙂😄 🙂😄🙂"}, 0.8},
48+
}
49+
50+
for _, tt := range tests {
51+
t.Run(tt.name, func(t *testing.T) {
52+
if got := JaccardSimilarity(tt.args.str1, tt.args.str2, 2); got != tt.want {
53+
t.Errorf("JaccardSimilarity() with shingle 2 = %v, want %v", got, tt.want)
54+
}
55+
})
56+
}
57+
}

shingle.go

+34
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,34 @@
1+
package edlib
2+
3+
// Shingle finds the k-grams (shingles) of a string for a given k.
// It takes a string and an integer as parameters and returns a map whose keys
// are the distinct k-rune substrings of s and whose values are the number of
// times each substring occurs in s.
// It returns an empty (non-nil) map if the string is empty, if k is zero or
// negative, or if k is greater than the number of runes in s.
func Shingle(s string, k int) map[string]int {
	m := make(map[string]int)

	// A non-positive k has no meaningful k-gram. Rejecting it here also avoids
	// the slice expression runeS[i:i+k] with i+k < i, which would panic.
	if s == "" || k <= 0 {
		return m
	}

	// Work on runes so that a multi-byte character counts as a single unit.
	runeS := []rune(s)
	for i := 0; i+k <= len(runeS); i++ {
		m[string(runeS[i:i+k])]++
	}
	return m
}
17+
18+
// ShingleSlice finds the distinct k-grams (shingles) of a string for a given k.
// It takes a string and an integer as parameters and returns a slice holding
// each distinct k-rune substring of s exactly once, in order of first
// occurrence, so the result is the same on every call.
// It returns an empty slice if the string is empty, if k is zero or negative,
// or if k is greater than the number of runes in s.
func ShingleSlice(s string, k int) []string {
	// A non-positive k has no meaningful k-gram. Rejecting it here also avoids
	// the slice expression runeS[i:i+k] with i+k < i, which would panic.
	if s == "" || k <= 0 {
		return nil
	}

	// Work on runes so that a multi-byte character counts as a single unit.
	runeS := []rune(s)
	windows := len(runeS) - k + 1
	if windows <= 0 {
		return nil
	}

	out := make([]string, 0, windows)
	seen := make(map[string]struct{}, windows)
	for i := 0; i < windows; i++ {
		gram := string(runeS[i : i+k])
		if _, dup := seen[gram]; dup {
			continue
		}
		seen[gram] = struct{}{}
		out = append(out, gram)
	}
	return out
}

shingle_test.go

+37
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,37 @@
1+
package edlib
2+
3+
import (
4+
"reflect"
5+
"testing"
6+
)
7+
8+
func TestShingle(t *testing.T) {
9+
type args struct {
10+
str string
11+
k int
12+
}
13+
tests := []struct {
14+
name string
15+
args args
16+
want map[string]int
17+
}{
18+
{"shingle 1", args{"Radiohead", 2}, map[string]int{"Ra": 1, "ad": 2, "di": 1, "ea": 1, "he": 1, "io": 1, "oh": 1}},
19+
{"shingle 1-1", args{"Radiohead", 3}, map[string]int{"Rad": 1, "adi": 1, "dio": 1, "ead": 1, "hea": 1, "ioh": 1, "ohe": 1}},
20+
{"shingle 2", args{"I love horror movies", 2}, map[string]int{" h": 1, " l": 1, " m": 1, "I ": 1, "e ": 1, "es": 1, "ho": 1, "ie": 1, "lo": 1, "mo": 1, "or": 2, "ov": 2, "r ": 1, "ro": 1, "rr": 1, "ve": 1, "vi": 1}},
21+
{"shingle 3", args{"私の名前はジョンです", 2}, map[string]int{"です": 1, "の名": 1, "はジ": 1, "ジョ": 1, "ョン": 1, "ンで": 1, "前は": 1, "名前": 1, "私の": 1}},
22+
{"shingle 4", args{"🙂😄🙂😄 😄🙂😄", 2}, map[string]int{" 😄": 1, "😄 ": 1, "😄🙂": 2, "🙂😄": 3}},
23+
{"shingle 5", args{"", 100}, make(map[string]int)},
24+
{"shingle 6", args{"hello", 0}, make(map[string]int)},
25+
{"shingle 7", args{"四畳半神話大系", 7}, map[string]int{"四畳半神話大系": 1}},
26+
}
27+
28+
for _, tt := range tests {
29+
t.Run(tt.name, func(t *testing.T) {
30+
got := Shingle(tt.args.str, tt.args.k)
31+
eq := reflect.DeepEqual(got, tt.want)
32+
if !eq {
33+
t.Errorf("Shingle() = %v, want %v", got, tt.want)
34+
}
35+
})
36+
}
37+
}

string-analysis.go

+3-2
Original file line numberDiff line numberDiff line change
@@ -24,6 +24,7 @@ const (
2424

2525
// StringsSimilarity returns a similarity index [0..1] between two strings based on the edit distance algorithm given as parameter.
2626
// Use defined Algorithm type.
27+
// Through this function, Cosine and Jaccard algorithms are used with Shingle split method with a length of 2.
2728
func StringsSimilarity(str1 string, str2 string, algo Algorithm) (float32, error) {
2829
switch algo {
2930
case Levenshtein:
@@ -45,9 +46,9 @@ func StringsSimilarity(str1 string, str2 string, algo Algorithm) (float32, error
4546
case JaroWinkler:
4647
return JaroWinklerSimilarity(str1, str2), nil
4748
case Cosine:
48-
return CosineSimilarity(str1, str2), nil
49+
return CosineSimilarity(str1, str2, 2), nil
4950
case Jaccard:
50-
return JaccardSimilarity(str1, str2), nil
51+
return JaccardSimilarity(str1, str2, 2), nil
5152
default:
5253
return 0.0, errors.New("Illegal argument for algorithm method")
5354
}

string-analysis_test.go

+14-14
Original file line numberDiff line numberDiff line change
@@ -132,26 +132,26 @@ func TestStringsSimilarity(t *testing.T) {
132132
{"Cosine : Second arg empty", args{"abcde", "", Cosine}, 0.0, false},
133133
{"Cosine : Same args", args{"abcde", "abcde", Cosine}, 1.0, false},
134134
{"Cosine : No characters match", args{"abcd", "effgghh", Cosine}, 0.0, false},
135-
{"Cosine : CRATE/TRACE", args{"CRATE", "TRACE", Cosine}, 0.0, false},
136-
{"Cosine : MARTHA/MARHTA", args{"MARTHA", "MARHTA", Cosine}, 0.0, false},
137-
{"Cosine : DIXON/DICKSONX", args{"DIXON", "DICKSONX", Cosine}, 0.0, false},
138-
{"Cosine : Sentence 1", args{"Radiohead", "Carly Rae Jepsen", Cosine}, 0.0, false},
139-
{"Cosine : Sentence 2", args{"I love horror movies", "Lights out is a horror movie", Cosine}, 0.20412414, false},
140-
{"Cosine : Sentence 3", args{"love horror movies", "Lights out horror movie", Cosine}, 0.28867513, false},
135+
{"Cosine : CRATE/TRACE", args{"CRATE", "TRACE", Cosine}, 0.25, false},
136+
{"Cosine : MARTHA/MARHTA", args{"MARTHA", "MARHTA", Cosine}, 0.4, false},
137+
{"Cosine : DIXON/DICKSONX", args{"DIXON", "DICKSONX", Cosine}, 0.3779645, false},
138+
{"Cosine : Sentence 1", args{"Radiohead", "Carly Rae Jepsen", Cosine}, 0.09759001, false},
139+
{"Cosine : Sentence 2", args{"I love horror movies", "Lights out is a horror movie", Cosine}, 0.5335784, false},
140+
{"Cosine : Sentence 3", args{"love horror movies", "Lights out horror movie", Cosine}, 0.61977977, false},
141141

142142
// Jaccard method
143143
{"Jaccard : First arg empty", args{"", "abcde", Jaccard}, 0.0, false},
144144
{"Jaccard : Second arg empty", args{"abcde", "", Jaccard}, 0.0, false},
145145
{"Jaccard : Same args", args{"abcde", "abcde", Jaccard}, 1.0, false},
146146
{"Jaccard : No characters match", args{"abcd", "effgghh", Jaccard}, 0.0, false},
147-
{"Jaccard : CRATE/TRACE", args{"CRATE", "TRACE", Jaccard}, 0.0, false},
148-
{"Jaccard : MARTHA/MARHTA", args{"MARTHA", "MARHTA", Jaccard}, 0.0, false},
149-
{"Jaccard : DIXON/DICKSONX", args{"DIXON", "DICKSONX", Jaccard}, 0.0, false},
150-
{"Jaccard : Sentence 1", args{"Radiohead", "Carly Rae Jepsen", Jaccard}, 0.0, false},
151-
{"Jaccard : Sentence 2", args{"I love horror movies", "Lights out is a horror movie", Jaccard}, 1.0 / 9.0, false},
152-
{"Jaccard : Sentence 3", args{"love horror movies", "Lights out horror movie", Jaccard}, 1.0 / 6.0, false},
153-
{"Jaccard : Sentence 4", args{"私の名前はジョンです", "私の名前はジョン・ドゥです", Jaccard}, 0.0, false},
154-
{"Jaccard : Sentence 5", args{"🙂😄🙂😄 😄🙂😄", "🙂😄🙂😄 😄🙂😄 🙂😄🙂", Jaccard}, 2.0 / 3.0, false},
147+
{"Jaccard : CRATE/TRACE", args{"CRATE", "TRACE", Jaccard}, 0.14285715, false},
148+
{"Jaccard : MARTHA/MARHTA", args{"MARTHA", "MARHTA", Jaccard}, 0.25, false},
149+
{"Jaccard : DIXON/DICKSONX", args{"DIXON", "DICKSONX", Jaccard}, 0.22222222, false},
150+
{"Jaccard : Sentence 1", args{"Radiohead", "Carly Rae Jepsen", Jaccard}, 0.04761905, false},
151+
{"Jaccard : Sentence 2", args{"I love horror movies", "Lights out is a horror movie", Jaccard}, 0.3548387, false},
152+
{"Jaccard : Sentence 3", args{"love horror movies", "Lights out horror movie", Jaccard}, 0.44, false},
153+
{"Jaccard : Sentence 4", args{"私の名前はジョンです", "私の名前はジョン・ドゥです", Jaccard}, 0.61538464, false},
154+
{"Jaccard : Sentence 5", args{"🙂😄🙂😄 😄🙂😄", "🙂😄🙂😄 😄🙂😄 🙂😄🙂", Jaccard}, 0.8, false},
155155

156156
// Illegal argument error
157157
{"Undefined integer value for method", args{"abc", "abcde", 42}, 0.0, true},

0 commit comments

Comments
 (0)