Skip to content

Commit 5f65401

Browse files
Add qgram and sorensen-dice (#14)
* feat: add qgram and sorensen-dice * fix: Change function names and add Qgram and SorensenDice to string-analysis * feat(qgram): add similarity function to return an index test: fix test cases for QGram Co-authored-by: hbollon <hugo.bollon@gmail.com>
1 parent 34fcab0 commit 5f65401

7 files changed

+202
-14
lines changed

README.md

+15-14
Original file line numberDiff line numberDiff line change
@@ -44,20 +44,21 @@ Designed to be fully compatible with Unicode characters!<br>
4444
This library is 100% test covered 😁
4545

4646
## Features
47-
- [Levenshtein](https://en.wikipedia.org/wiki/Levenshtein_distance)
48-
- [LCS](https://en.wikipedia.org/wiki/Longest_common_subsequence_problem) (Longest common subsequence) with edit distance, backtrack and diff functions ✨
49-
- [Hamming](https://en.wikipedia.org/wiki/Hamming_distance)
50-
- [Damerau-Levenshtein](https://en.wikipedia.org/wiki/Damerau%E2%80%93Levenshtein_distance), with following variants :
51-
- OSA (Optimal string alignment) ✨
52-
- Adjacent transpositions ✨
53-
- [Jaro & Jaro-Winkler](https://fr.wikipedia.org/wiki/Distance_de_Jaro-Winkler) similarity algorithms ✨
54-
- [Cosine Similarity](https://en.wikipedia.org/wiki/Cosine_similarity) algorithm to compare strings ✨
55-
- [Jaccard Index](https://en.wikipedia.org/wiki/Jaccard_index)
56-
57-
- Computed similarity percentage functions based on all available edit distance algorithms in this lib ✨
58-
- Fuzzy search functions based on edit distance with unique or multiples strings output ✨
59-
- Unicode compatibility ! 🥳
60-
- And many more to come !
47+
48+
- [Levenshtein](https://en.wikipedia.org/wiki/Levenshtein_distance)
49+
- [LCS](https://en.wikipedia.org/wiki/Longest_common_subsequence_problem) (Longest common subsequence) with edit distance, backtrack and diff functions
50+
- [Hamming](https://en.wikipedia.org/wiki/Hamming_distance)
51+
- [Damerau-Levenshtein](https://en.wikipedia.org/wiki/Damerau%E2%80%93Levenshtein_distance), with following variants:
52+
- OSA (Optimal string alignment)
53+
- Adjacent transpositions
54+
- [Jaro & Jaro-Winkler](https://en.wikipedia.org/wiki/Jaro%E2%80%93Winkler_distance) similarity algorithms
55+
- [Cosine Similarity](https://en.wikipedia.org/wiki/Cosine_similarity)
56+
- [Jaccard Index](https://en.wikipedia.org/wiki/Jaccard_index)
57+
- [QGram](https://en.wikipedia.org/wiki/N-gram)
58+
- [Sorensen-Dice](https://en.wikipedia.org/wiki/S%C3%B8rensen%E2%80%93Dice_coefficient)
59+
- Computed similarity percentage functions based on all available edit distance algorithms in this lib
60+
- Fuzzy search functions based on edit distance, returning either a single string or multiple strings
61+
- Unicode compatibility 🥳
6162

6263
## Benchmarks
6364
You can check an interactive Google chart with a few benchmark cases for all similarity algorithms in this library, run through the **StringsSimilarity** function, [here](http://benchgraph.codingberg.com/q5)

qgram.go

+56
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,56 @@
1+
package edlib
2+
3+
import (
4+
"math"
5+
)
6+
7+
// QgramDistance compute the q-gram similarity between two strings
8+
// Takes two strings as parameters, a split length which defines the k-gram shingle length
9+
func QgramDistance(str1, str2 string, splitLength int) int {
10+
splittedStr1 := Shingle(str1, splitLength)
11+
splittedStr2 := Shingle(str2, splitLength)
12+
13+
union := make(map[string]int)
14+
for i := range splittedStr1 {
15+
union[i] = 0
16+
}
17+
for i := range splittedStr2 {
18+
union[i] = 0
19+
}
20+
21+
res := 0
22+
23+
for i := range union {
24+
res += int(math.Abs(float64(splittedStr1[i] - splittedStr2[i])))
25+
}
26+
27+
return res
28+
}
29+
30+
// QgramDistanceCustomNgram computes the q-gram distance between two
// caller-supplied n-gram maps.
//
// The result is the sum, over every key present in either map, of the
// absolute difference between the values the two maps hold for that key; a
// key missing from one map counts as 0 there. Nil maps are treated as empty.
// Neither input map is modified.
func QgramDistanceCustomNgram(splittedStr1, splittedStr2 map[string]int) int {
	res := 0

	// Every key of the first map, whether or not the second map has it
	// (a missing key reads as 0).
	for gram, count := range splittedStr1 {
		res += int(math.Abs(float64(count - splittedStr2[gram])))
	}

	// Keys found only in the second map; shared keys were handled above.
	for gram, count := range splittedStr2 {
		if _, shared := splittedStr1[gram]; !shared {
			res += int(math.Abs(float64(count)))
		}
	}

	return res
}
48+
49+
// QgramSimilarity compute a similarity index (between 0 and 1) between two strings from a Qgram distance
50+
// Takes two strings as parameters, a split length which defines the k-gram shingle length
51+
func QgramSimilarity(str1, str2 string, splitLength int) float32 {
52+
splittedStr1 := Shingle(str1, splitLength)
53+
splittedStr2 := Shingle(str2, splitLength)
54+
res := float32(QgramDistanceCustomNgram(splittedStr1, splittedStr2))
55+
return 1 - (res / float32(len(splittedStr1)+len(splittedStr2)))
56+
}

qgram_test.go

+35
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,35 @@
1+
package edlib
2+
3+
import (
4+
"testing"
5+
)
6+
7+
func TestQgramDistance(t *testing.T) {
8+
type args struct {
9+
str1 string
10+
str2 string
11+
splitLength int
12+
}
13+
tests := []struct {
14+
name string
15+
args args
16+
want int
17+
}{
18+
{"Qgram sim 1", args{"Radiohead", "Radiohead", 2}, 0.0},
19+
{"Qgram sim 2", args{"ABCD", "ABCE", 2}, 2.0},
20+
{"Qgram sim 3", args{"Radiohead", "Carly Rae Jepsen", 2}, 21.0},
21+
{"Qgram sim 4", args{"I love horror movies", "Lights out is a horror movie", 2}, 22.0},
22+
{"Qgram sim 5", args{"love horror movies", "Lights out horror movie", 2}, 15.0},
23+
{"Qgram sim 6", args{"私の名前はジョンです", "私の名前はジョン・ドゥです", 2}, 5},
24+
{"Qgram sim 7", args{"🙂😄🙂😄 😄🙂😄", "🙂😄🙂😄 😄🙂😄 🙂😄🙂", 2}, 4},
25+
{"Qgram sim 8", args{"", "", 2}, 0.0},
26+
}
27+
28+
for _, tt := range tests {
29+
t.Run(tt.name, func(t *testing.T) {
30+
if got := QgramDistance(tt.args.str1, tt.args.str2, tt.args.splitLength); got != tt.want {
31+
t.Errorf("QgramDistance() = %v, want %v", got, tt.want)
32+
}
33+
})
34+
}
35+
}

sorensen-dice.go

+19
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,19 @@
1+
package edlib
2+
3+
// SorensenDiceCoefficient computes the Sorensen-Dice coefficient between two strings
4+
// Takes two strings as parameters, a split length which defines the k-gram shingle length
5+
func SorensenDiceCoefficient(str1, str2 string, splitLength int) float32 {
6+
if str1 == "" && str2 == "" {
7+
return 0
8+
}
9+
shingle1 := Shingle(str1, splitLength)
10+
shingle2 := Shingle(str2, splitLength)
11+
12+
intersection := float32(0)
13+
for i := range shingle1 {
14+
if _, ok := shingle2[i]; ok {
15+
intersection++
16+
}
17+
}
18+
return 2.0 * intersection / float32(len(shingle1)+len(shingle2))
19+
}

sorensen-dice_test.go

+35
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,35 @@
1+
package edlib
2+
3+
import (
4+
"testing"
5+
)
6+
7+
func TestSorensenDiceCoefficient(t *testing.T) {
8+
type args struct {
9+
str1 string
10+
str2 string
11+
splitLength int
12+
}
13+
tests := []struct {
14+
name string
15+
args args
16+
want float32
17+
}{
18+
{"SorensenDiceCoefficient 1", args{"night", "nacht", 2}, 0.25},
19+
{"SorensenDiceCoefficient 2", args{"Radiohead", "Radiohead", 2}, 1.0},
20+
{"SorensenDiceCoefficient 3", args{"", "", 2}, 0.0},
21+
{"SorensenDiceCoefficient 4", args{"Radiohead", "Carly Rae Jepsen", 2}, 0.09090909},
22+
{"SorensenDiceCoefficient 5", args{"I love horror movies", "Lights out is a horror movie", 2}, 0.52380955},
23+
{"SorensenDiceCoefficient 6", args{"love horror movies", "Lights out horror movie", 2}, 0.6111111},
24+
{"SorensenDiceCoefficient 7", args{"私の名前はジョンです", "私の名前はジョン・ドゥです", 2}, 0.7619048},
25+
{"SorensenDiceCoefficient 8", args{"🙂😄🙂😄 😄🙂😄", "🙂😄🙂😄 😄🙂😄 🙂😄🙂", 2}, 0.8888889},
26+
}
27+
28+
for _, tt := range tests {
29+
t.Run(tt.name, func(t *testing.T) {
30+
if got := SorensenDiceCoefficient(tt.args.str1, tt.args.str2, tt.args.splitLength); got != tt.want {
31+
t.Errorf("SorensenDiceCoefficient() = %v, want %v", got, tt.want)
32+
}
33+
})
34+
}
35+
}

string-analysis.go

+6
Original file line numberDiff line numberDiff line change
@@ -20,6 +20,8 @@ const (
2020
JaroWinkler
2121
Cosine
2222
Jaccard
23+
SorensenDice
24+
Qgram
2325
)
2426

2527
// StringsSimilarity return a similarity index [0..1] between two strings based on given edit distance algorithm in parameter.
@@ -49,6 +51,10 @@ func StringsSimilarity(str1 string, str2 string, algo Algorithm) (float32, error
4951
return CosineSimilarity(str1, str2, 2), nil
5052
case Jaccard:
5153
return JaccardSimilarity(str1, str2, 2), nil
54+
case SorensenDice:
55+
return SorensenDiceCoefficient(str1, str2, 2), nil
56+
case Qgram:
57+
return QgramSimilarity(str1, str2, 2), nil
5258
default:
5359
return 0.0, errors.New("Illegal argument for algorithm method")
5460
}

string-analysis_test.go

+36
Original file line numberDiff line numberDiff line change
@@ -153,6 +153,42 @@ func TestStringsSimilarity(t *testing.T) {
153153
{"Jaccard : Sentence 4", args{"私の名前はジョンです", "私の名前はジョン・ドゥです", Jaccard}, 0.61538464, false},
154154
{"Jaccard : Sentence 5", args{"🙂😄🙂😄 😄🙂😄", "🙂😄🙂😄 😄🙂😄 🙂😄🙂", Jaccard}, 0.8, false},
155155

156+
// SorensenDice method
157+
{"SorensenDice : First arg empty", args{"", "abcde", SorensenDice}, 0.0, false},
158+
{"SorensenDice : Second arg empty", args{"abcde", "", SorensenDice}, 0.0, false},
159+
{"SorensenDice : Same args", args{"abcde", "abcde", SorensenDice}, 1.0, false},
160+
{"SorensenDice : No characters match", args{"abcd", "effgghh", SorensenDice}, 0.0, false},
161+
{"SorensenDice : CRATE/TRACE", args{"CRATE", "TRACE", SorensenDice}, 0.25, false},
162+
{"SorensenDice : MARTHA/MARHTA", args{"MARTHA", "MARHTA", SorensenDice}, 0.4, false},
163+
{"SorensenDice : DIXON/DICKSONX", args{"DIXON", "DICKSONX", SorensenDice}, 0.36363637, false},
164+
{"SorensenDice Sentence 1", args{"night", "nacht", SorensenDice}, 0.25, false},
165+
{"SorensenDice Sentence 2", args{"Radiohead", "Radiohead", SorensenDice}, 1.0, false},
166+
{"SorensenDice Sentence 3", args{"", "", SorensenDice}, 0.0, false},
167+
{"SorensenDice Sentence 4", args{"Radiohead", "Carly Rae Jepsen", SorensenDice}, 0.09090909, false},
168+
{"SorensenDice Sentence 5", args{"I love horror movies", "Lights out is a horror movie", SorensenDice}, 0.52380955, false},
169+
{"SorensenDice Sentence 6", args{"love horror movies", "Lights out horror movie", SorensenDice}, 0.6111111, false},
170+
{"SorensenDice Sentence 7", args{"私の名前はジョンです", "私の名前はジョン・ドゥです", SorensenDice}, 0.7619048, false},
171+
{"SorensenDice Sentence 8", args{"🙂😄🙂😄 😄🙂😄", "🙂😄🙂😄 😄🙂😄 🙂😄🙂", SorensenDice}, 0.8888889, false},
172+
173+
// Qgram method
174+
{"Qgram: First arg empty", args{"", "abcde", Qgram}, 0.0, false},
175+
{"Qgram : Second arg empty", args{"abcde", "", Qgram}, 0.0, false},
176+
{"Qgram : Same args", args{"abcde", "abcde", Qgram}, 1.0, false},
177+
{"Qgram : No characters match", args{"abcd", "effgghh", Qgram}, 0.0, false},
178+
{"Qgram : CRATE/TRACE", args{"CRATE", "TRACE", Qgram}, 0.25, false},
179+
{"Qgram : MARTHA/MARHTA", args{"MARTHA", "MARHTA", Qgram}, 0.39999998, false},
180+
{"Qgram : DIXON/DICKSONX", args{"DIXON", "DICKSONX", Qgram}, 0.36363637, false},
181+
{"Qgram Sentence 1", args{"Radiohead", "Radiohead", Qgram}, 1.0, false},
182+
{"Qgram Sentence 2", args{"ABCD", "ABCE", Qgram}, 0.6666666, false},
183+
{"Qgram Sentence 3", args{"Radiohead", "Carly Rae Jepsen", Qgram}, 0.04545456, false},
184+
{"Qgram Sentence 4", args{"I love horror movies", "Lights out is a horror movie", Qgram}, 0.47619045, false},
185+
{"Qgram Sentence 5", args{"love horror movies", "Lights out horror movie", Qgram}, 0.5833334, false},
186+
{"Qgram Sentence 6", args{"私の名前はジョンです", "私の名前はジョン・ドゥです", Qgram}, 0.7619048, false},
187+
{"Qgram Sentence 7", args{"🙂😄🙂😄 😄🙂😄", "🙂😄🙂😄 😄🙂😄 🙂😄🙂", Qgram}, 0.5555556, false},
188+
189+
// TODO: Must refactor compare method to handle NaN values
190+
// {"Qgram Sentence 8", args{"", "", Qgram}, float32(math.NaN()), false},
191+
156192
// Illegal argument error
157193
{"Undefined integer value for method", args{"abc", "abcde", 42}, 0.0, true},
158194
}

0 commit comments

Comments
 (0)