Add qgram and sorensen-dice (#14)

ShriprajwalK · hbollon · web-flow · commit 5f65401d50f1 · 2022-01-31T17:04:37.000+01:00
* feat: add qgram and sorensen-dice

* fix: Change function names and add Qgram and SorensenDice to string-analysis

* feat(qgram): add similarity function to return an index
test: fix test cases for QGram

Co-authored-by: hbollon <hugo.bollon@gmail.com>
diff --git a/README.md b/README.md
@@ -44,20 +44,21 @@ Designed to be fully compatible with Unicode characters!<br>
 This library is 100% test covered 😁
 
 ## Features
-- [Levenshtein](https://en.wikipedia.org/wiki/Levenshtein_distance) ✨
-- [LCS](https://en.wikipedia.org/wiki/Longest_common_subsequence_problem) (Longest common subsequence) with edit distance, backtrack and diff functions ✨
-- [Hamming](https://en.wikipedia.org/wiki/Hamming_distance) ✨
-- [Damerau-Levenshtein](https://en.wikipedia.org/wiki/Damerau%E2%80%93Levenshtein_distance), with following variants :
-    - OSA (Optimal string alignment) ✨
-    - Adjacent transpositions ✨
-- [Jaro & Jaro-Winkler](https://fr.wikipedia.org/wiki/Distance_de_Jaro-Winkler) similarity algorithms ✨
-- [Cosine Similarity](https://en.wikipedia.org/wiki/Cosine_similarity) algorithm to compare strings ✨
-- [Jaccard Index](https://en.wikipedia.org/wiki/Jaccard_index) ✨
-
-- Computed similarity percentage functions based on all available edit distance algorithms in this lib ✨
-- Fuzzy search functions based on edit distance with unique or multiples strings output ✨
-- Unicode compatibility ! 🥳
-- And many more to come !
+
+- [Levenshtein](https://en.wikipedia.org/wiki/Levenshtein_distance)
+- [LCS](https://en.wikipedia.org/wiki/Longest_common_subsequence_problem) (Longest common subsequence) with edit distance, backtrack and diff functions
+- [Hamming](https://en.wikipedia.org/wiki/Hamming_distance)
+- [Damerau-Levenshtein](https://en.wikipedia.org/wiki/Damerau%E2%80%93Levenshtein_distance), with following variants:
+  - OSA (Optimal string alignment)
+  - Adjacent transpositions
+- [Jaro & Jaro-Winkler](https://en.wikipedia.org/wiki/Jaro%E2%80%93Winkler_distance) similarity algorithms
+- [Cosine Similarity](https://en.wikipedia.org/wiki/Cosine_similarity)
+- [Jaccard Index](https://en.wikipedia.org/wiki/Jaccard_index)
+- [QGram](https://en.wikipedia.org/wiki/N-gram)
+- [Sorensen-Dice](https://en.wikipedia.org/wiki/S%C3%B8rensen%E2%80%93Dice_coefficient)
+- Computed similarity percentage functions based on all available edit distance algorithms in this lib
+- Fuzzy search functions based on edit distance, returning either a single string or multiple strings
+- Unicode compatibility 🥳
 
 ## Benchmarks
 You can check an interactive Google chart with few benchmark cases for all similarity algorithms in this library through **StringsSimilarity** function [here](http://benchgraph.codingberg.com/q5)
diff --git a/qgram.go b/qgram.go
@@ -0,0 +1,56 @@
+package edlib
+
+import (
+	"math"
+)
+
+// QgramDistance computes the q-gram distance between two strings: for every
+// q-gram found in either string, the absolute difference between the values
+// Shingle reports for it on each side is added to the result.
+// Takes two strings as parameters, and a split length which defines the k-gram shingle length.
+func QgramDistance(str1, str2 string, splitLength int) int {
+	grams1 := Shingle(str1, splitLength)
+	grams2 := Shingle(str2, splitLength)
+
+	// Gather every q-gram seen on either side; only the keys are used.
+	allGrams := make(map[string]int)
+	for gram := range grams1 {
+		allGrams[gram] = 0
+	}
+	for gram := range grams2 {
+		allGrams[gram] = 0
+	}
+
+	distance := 0
+	for gram := range allGrams {
+		// A q-gram absent from one side reads as zero there.
+		delta := float64(grams1[gram] - grams2[gram])
+		distance += int(math.Abs(delta))
+	}
+
+	return distance
+}
+
+// QgramDistanceCustomNgram computes the q-gram distance between two
+// caller-supplied n-gram maps: the sum, over every key present in either map,
+// of the absolute difference between the two values stored for that key.
+// Takes two n-gram maps as parameters; a key missing from a map counts as zero.
+func QgramDistanceCustomNgram(splittedStr1, splittedStr2 map[string]int) int {
+	distance := 0
+
+	// Keys of the first map, whether or not the second map also holds them.
+	for gram, count := range splittedStr1 {
+		distance += int(math.Abs(float64(count - splittedStr2[gram])))
+	}
+
+	// Keys found only in the second map; shared keys were handled above.
+	for gram, count := range splittedStr2 {
+		if _, shared := splittedStr1[gram]; shared {
+			continue
+		}
+		distance += int(math.Abs(float64(count)))
+	}
+
+	return distance
+}
+
+// QgramSimilarity computes a similarity index between two strings from their
+// q-gram distance: 1 - distance / (len(shingles of str1) + len(shingles of str2)).
+// Takes two strings as parameters, and a split length which defines the k-gram shingle length.
+// Returns 0 when neither string yields any q-gram (for example two empty
+// strings) instead of the NaN that dividing zero by zero would produce.
+//
+// NOTE(review): the denominator counts distinct q-grams whereas the distance
+// sums value differences, so inputs with repeated q-grams can presumably score
+// below 0 - confirm whether the total q-gram count was intended.
+func QgramSimilarity(str1, str2 string, splitLength int) float32 {
+	splittedStr1 := Shingle(str1, splitLength)
+	splittedStr2 := Shingle(str2, splitLength)
+
+	total := len(splittedStr1) + len(splittedStr2)
+	if total == 0 {
+		// No q-gram on either side: there is nothing to divide by.
+		return 0
+	}
+
+	res := float32(QgramDistanceCustomNgram(splittedStr1, splittedStr2))
+	return 1 - (res / float32(total))
+}
diff --git a/qgram_test.go b/qgram_test.go
@@ -0,0 +1,35 @@
+package edlib
+
+import (
+	"testing"
+)
+
+// TestQgramDistance runs QgramDistance over a table of string pairs and checks
+// the returned distance against the expected integer value.
+func TestQgramDistance(t *testing.T) {
+	type args struct {
+		str1        string
+		str2        string
+		splitLength int
+	}
+	cases := []struct {
+		name string
+		args args
+		want int
+	}{
+		{"Qgram sim 1", args{"Radiohead", "Radiohead", 2}, 0},
+		{"Qgram sim 2", args{"ABCD", "ABCE", 2}, 2},
+		{"Qgram sim 3", args{"Radiohead", "Carly Rae Jepsen", 2}, 21},
+		{"Qgram sim 4", args{"I love horror movies", "Lights out is a horror movie", 2}, 22},
+		{"Qgram sim 5", args{"love horror movies", "Lights out horror movie", 2}, 15},
+		{"Qgram sim 6", args{"私の名前はジョンです", "私の名前はジョン・ドゥです", 2}, 5},
+		{"Qgram sim 7", args{"🙂😄🙂😄 😄🙂😄", "🙂😄🙂😄 😄🙂😄 🙂😄🙂", 2}, 4},
+		{"Qgram sim 8", args{"", "", 2}, 0},
+	}
+
+	for _, tc := range cases {
+		t.Run(tc.name, func(t *testing.T) {
+			got := QgramDistance(tc.args.str1, tc.args.str2, tc.args.splitLength)
+			if got == tc.want {
+				return
+			}
+			t.Errorf("QgramDistance() = %v, want %v", got, tc.want)
+		})
+	}
+}
diff --git a/sorensen-dice.go b/sorensen-dice.go
@@ -0,0 +1,19 @@
+package edlib
+
+// SorensenDiceCoefficient computes the Sorensen-Dice coefficient between two strings:
+// twice the number of shingles present in both, divided by the sum of the
+// number of distinct shingles of each string.
+// Takes two strings as parameters, and a split length which defines the k-gram shingle length.
+// Returns 0 when both strings are empty, and also when neither string yields
+// any shingle, which would otherwise divide zero by zero and produce NaN.
+func SorensenDiceCoefficient(str1, str2 string, splitLength int) float32 {
+	if str1 == "" && str2 == "" {
+		return 0
+	}
+	shingle1 := Shingle(str1, splitLength)
+	shingle2 := Shingle(str2, splitLength)
+
+	total := len(shingle1) + len(shingle2)
+	if total == 0 {
+		// Non-empty inputs can still produce no shingle at all.
+		return 0
+	}
+
+	intersection := float32(0)
+	for gram := range shingle1 {
+		if _, ok := shingle2[gram]; ok {
+			intersection++
+		}
+	}
+	return 2.0 * intersection / float32(total)
+}
diff --git a/sorensen-dice_test.go b/sorensen-dice_test.go
@@ -0,0 +1,35 @@
+package edlib
+
+import (
+	"testing"
+)
+
+// TestSorensenDiceCoefficient runs SorensenDiceCoefficient over a table of
+// string pairs and compares the result with the expected float32 value.
+func TestSorensenDiceCoefficient(t *testing.T) {
+	type args struct {
+		str1        string
+		str2        string
+		splitLength int
+	}
+	cases := []struct {
+		name string
+		args args
+		want float32
+	}{
+		{"SorensenDiceCoefficient 1", args{"night", "nacht", 2}, 0.25},
+		{"SorensenDiceCoefficient 2", args{"Radiohead", "Radiohead", 2}, 1.0},
+		{"SorensenDiceCoefficient 3", args{"", "", 2}, 0.0},
+		{"SorensenDiceCoefficient 4", args{"Radiohead", "Carly Rae Jepsen", 2}, 0.09090909},
+		{"SorensenDiceCoefficient 5", args{"I love horror movies", "Lights out is a horror movie", 2}, 0.52380955},
+		{"SorensenDiceCoefficient 6", args{"love horror movies", "Lights out horror movie", 2}, 0.6111111},
+		{"SorensenDiceCoefficient 7", args{"私の名前はジョンです", "私の名前はジョン・ドゥです", 2}, 0.7619048},
+		{"SorensenDiceCoefficient 8", args{"🙂😄🙂😄 😄🙂😄", "🙂😄🙂😄 😄🙂😄 🙂😄🙂", 2}, 0.8888889},
+	}
+
+	for _, tc := range cases {
+		t.Run(tc.name, func(t *testing.T) {
+			got := SorensenDiceCoefficient(tc.args.str1, tc.args.str2, tc.args.splitLength)
+			if got == tc.want {
+				return
+			}
+			t.Errorf("SorensenDiceCoefficient() = %v, want %v", got, tc.want)
+		})
+	}
+}
diff --git a/string-analysis.go b/string-analysis.go
@@ -20,6 +20,8 @@ const (
 	JaroWinkler
 	Cosine
 	Jaccard
+	SorensenDice
+	Qgram
 )
 
 // StringsSimilarity return a similarity index [0..1] between two strings based on given edit distance algorithm in parameter.
@@ -49,6 +51,10 @@ func StringsSimilarity(str1 string, str2 string, algo Algorithm) (float32, error
 		return CosineSimilarity(str1, str2, 2), nil
 	case Jaccard:
 		return JaccardSimilarity(str1, str2, 2), nil
+	case SorensenDice:
+		return SorensenDiceCoefficient(str1, str2, 2), nil
+	case Qgram:
+		return QgramSimilarity(str1, str2, 2), nil
 	default:
 		return 0.0, errors.New("Illegal argument for algorithm method")
 	}
diff --git a/string-analysis_test.go b/string-analysis_test.go
@@ -153,6 +153,42 @@ func TestStringsSimilarity(t *testing.T) {
 		{"Jaccard : Sentence 4", args{"私の名前はジョンです", "私の名前はジョン・ドゥです", Jaccard}, 0.61538464, false},
 		{"Jaccard : Sentence 5", args{"🙂😄🙂😄 😄🙂😄", "🙂😄🙂😄 😄🙂😄 🙂😄🙂", Jaccard}, 0.8, false},
 
+		// SorensenDice method
+		{"SorensenDice : First arg empty", args{"", "abcde", SorensenDice}, 0.0, false},
+		{"SorensenDice : Second arg empty", args{"abcde", "", SorensenDice}, 0.0, false},
+		{"SorensenDice : Same args", args{"abcde", "abcde", SorensenDice}, 1.0, false},
+		{"SorensenDice : No characters match", args{"abcd", "effgghh", SorensenDice}, 0.0, false},
+		{"SorensenDice : CRATE/TRACE", args{"CRATE", "TRACE", SorensenDice}, 0.25, false},
+		{"SorensenDice : MARTHA/MARHTA", args{"MARTHA", "MARHTA", SorensenDice}, 0.4, false},
+		{"SorensenDice : DIXON/DICKSONX", args{"DIXON", "DICKSONX", SorensenDice}, 0.36363637, false},
+		{"SorensenDice Sentence 1", args{"night", "nacht", SorensenDice}, 0.25, false},
+		{"SorensenDice Sentence 2", args{"Radiohead", "Radiohead", SorensenDice}, 1.0, false},
+		{"SorensenDice Sentence 3", args{"", "", SorensenDice}, 0.0, false},
+		{"SorensenDice Sentence 4", args{"Radiohead", "Carly Rae Jepsen", SorensenDice}, 0.09090909, false},
+		{"SorensenDice Sentence 5", args{"I love horror movies", "Lights out is a horror movie", SorensenDice}, 0.52380955, false},
+		{"SorensenDice Sentence 6", args{"love horror movies", "Lights out horror movie", SorensenDice}, 0.6111111, false},
+		{"SorensenDice Sentence 7", args{"私の名前はジョンです", "私の名前はジョン・ドゥです", SorensenDice}, 0.7619048, false},
+		{"SorensenDice Sentence 8", args{"🙂😄🙂😄 😄🙂😄", "🙂😄🙂😄 😄🙂😄 🙂😄🙂", SorensenDice}, 0.8888889, false},
+
+		// Qgram method
+		{"Qgram: First arg empty", args{"", "abcde", Qgram}, 0.0, false},
+		{"Qgram : Second arg empty", args{"abcde", "", Qgram}, 0.0, false},
+		{"Qgram : Same args", args{"abcde", "abcde", Qgram}, 1.0, false},
+		{"Qgram : No characters match", args{"abcd", "effgghh", Qgram}, 0.0, false},
+		{"Qgram : CRATE/TRACE", args{"CRATE", "TRACE", Qgram}, 0.25, false},
+		{"Qgram : MARTHA/MARHTA", args{"MARTHA", "MARHTA", Qgram}, 0.39999998, false},
+		{"Qgram : DIXON/DICKSONX", args{"DIXON", "DICKSONX", Qgram}, 0.36363637, false},
+		{"Qgram Sentence 1", args{"Radiohead", "Radiohead", Qgram}, 1.0, false},
+		{"Qgram Sentence 2", args{"ABCD", "ABCE", Qgram}, 0.6666666, false},
+		{"Qgram Sentence 3", args{"Radiohead", "Carly Rae Jepsen", Qgram}, 0.04545456, false},
+		{"Qgram Sentence 4", args{"I love horror movies", "Lights out is a horror movie", Qgram}, 0.47619045, false},
+		{"Qgram Sentence 5", args{"love horror movies", "Lights out horror movie", Qgram}, 0.5833334, false},
+		{"Qgram Sentence 6", args{"私の名前はジョンです", "私の名前はジョン・ドゥです", Qgram}, 0.7619048, false},
+		{"Qgram Sentence 7", args{"🙂😄🙂😄 😄🙂😄", "🙂😄🙂😄 😄🙂😄 🙂😄🙂", Qgram}, 0.5555556, false},
+
+		// TODO: Must refactor compare method to handle NaN values
+		// {"Qgram Sentence 8", args{"", "", Qgram}, float32(math.NaN()), false},
+
 		// Illegal argument error
 		{"Undefined integer value for method", args{"abc", "abcde", 42}, 0.0, true},
 	}