-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathsimilarity.v
229 lines (212 loc) · 5.46 KB
/
similarity.v
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
module strings
@[inline]
fn min(a u16, b u16, c u16) u16 {
mut m := a
if b < m {
m = b
}
if c < m {
m = c
}
return m
}
@[inline]
fn max2(a int, b int) int {
if a < b {
return b
}
return a
}
@[inline]
fn min2(a int, b int) int {
if a < b {
return a
}
return b
}
@[inline]
fn abs2(a int, b int) int {
if a < b {
return b - a
}
return a - b
}
// levenshtein_distance uses the Levenshtein Distance algorithm to calculate
// the distance between between two strings `a` and `b` (lower is closer).
@[direct_array_access]
pub fn levenshtein_distance(a string, b string) int {
if a.len == 0 {
return b.len
}
if b.len == 0 {
return a.len
}
if a == b {
return 0
}
mut row := []u16{len: a.len + 1, init: u16(index)}
for i := 1; i < b.len + 1; i++ {
mut prev := u16(i)
for j := 1; j < a.len + 1; j++ {
mut current := row[j - 1] // match
if b[i - 1] != a[j - 1] {
// insertion, substitution, deletion
current = min(row[j - 1] + 1, prev + 1, row[j] + 1)
}
row[j - 1] = prev
prev = current
}
row[a.len] = prev
}
return row[a.len]
}
// levenshtein_distance_percentage uses the Levenshtein Distance algorithm to calculate
// how similar two strings are as a percentage (higher is closer).
pub fn levenshtein_distance_percentage(a string, b string) f32 {
d := levenshtein_distance(a, b)
l := if a.len >= b.len { a.len } else { b.len }
return (1.00 - f32(d) / f32(l)) * 100.00
}
// dice_coefficient implements the Sørensen–Dice coefficient.
// It finds the similarity between two strings, and returns a coefficient
// between 0.0 (not similar) and 1.0 (exact match).
pub fn dice_coefficient(s1 string, s2 string) f32 {
if s1.len == 0 || s2.len == 0 {
return 0.0
}
if s1 == s2 {
return 1.0
}
if s1.len < 2 || s2.len < 2 {
return 0.0
}
a := if s1.len > s2.len { s1 } else { s2 }
b := if a == s1 { s2 } else { s1 }
mut first_bigrams := map[string]int{}
for i in 0 .. a.len - 1 {
bigram := a[i..i + 2]
q := if bigram in first_bigrams { first_bigrams[bigram] + 1 } else { 1 }
first_bigrams[bigram] = q
}
mut intersection_size := 0
for i in 0 .. b.len - 1 {
bigram := b[i..i + 2]
count := if bigram in first_bigrams { first_bigrams[bigram] } else { 0 }
if count > 0 {
first_bigrams[bigram] = count - 1
intersection_size++
}
}
return (2.0 * f32(intersection_size)) / (f32(a.len) + f32(b.len) - 2)
}
// hamming_distance uses the Hamming Distance algorithm to calculate
// the distance between two strings `a` and `b` (lower is closer).
@[direct_array_access]
pub fn hamming_distance(a string, b string) int {
if a.len == 0 && b.len == 0 {
return 0
}
mut match_len := min2(a.len, b.len)
mut diff_count := abs2(a.len, b.len)
for i in 0 .. match_len {
if a[i] != b[i] {
diff_count++
}
}
return diff_count
}
// hamming_similarity uses the Hamming Distance algorithm to calculate
// the distance between two strings `a` and `b`.
// It returns a coefficient between 0.0 (not similar) and 1.0 (exact match).
pub fn hamming_similarity(a string, b string) f32 {
l := max2(a.len, b.len)
if l == 0 {
// Both are empty strings, should return 1.0
return 1.0
}
d := hamming_distance(a, b)
return 1.00 - f32(d) / f32(l)
}
// jaro_similarity uses the Jaro Distance algorithm to calculate
// the distance between two strings `a` and `b`.
// It returns a coefficient between 0.0 (not similar) and 1.0 (exact match).
@[direct_array_access]
pub fn jaro_similarity(a string, b string) f64 {
a_len := a.len
b_len := b.len
if a_len == 0 && b_len == 0 {
// Both are empty strings, should return 1.0
return 1.0
}
if a_len == 0 || b_len == 0 {
return 0
}
// Maximum distance upto which matching is allowed
match_distance := max2(a_len, b_len) / 2 - 1
mut a_matches := []bool{len: a_len}
mut b_matches := []bool{len: b_len}
mut matches := 0
mut transpositions := 0.0
// Traverse through the first string
for i in 0 .. a_len {
start := max2(0, i - match_distance)
end := min2(b_len, i + match_distance + 1)
for k in start .. end {
// If there is a match
if b_matches[k] {
continue
}
if a[i] != b[k] {
continue
}
a_matches[i] = true
b_matches[k] = true
matches++
break
}
}
// If there is no match
if matches == 0 {
return 0
}
mut k := 0
// Count number of occurrences where two characters match but
// there is a third matched character in between the indices
for i in 0 .. a_len {
if !a_matches[i] {
continue
}
// Find the next matched character in second string
for !b_matches[k] {
k++
}
if a[i] != b[k] {
transpositions++
}
k++
}
transpositions /= 2
return (matches / f64(a_len) + matches / f64(b_len) + (matches - transpositions) / matches) / 3
}
// jaro_winkler_similarity uses the Jaro Winkler Distance algorithm to calculate
// the distance between two strings `a` and `b`.
// It returns a coefficient between 0.0 (not similar) and 1.0 (exact match).
// The scaling factor(`p=0.1`) in Jaro-Winkler gives higher weight to prefix
// similarities, making it especially effective for cases where slight misspellings
// or prefixes are common.
@[direct_array_access]
pub fn jaro_winkler_similarity(a string, b string) f64 {
// Maximum of 4 characters are allowed in prefix
mut lmax := min2(4, min2(a.len, b.len))
mut l := 0
for i in 0 .. lmax {
if a[i] == b[i] {
l++
}
}
js := jaro_similarity(a, b)
// select a multiplier (Winkler suggested p=0.1) for the relative importance of the prefix for the word similarity
p := 0.1
ws := js + f64(l) * p * (1 - js)
return ws
}