Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

strings: add hamming_distance/jaro_similarity/jaro_winkler_similarity functions #22701

Merged
merged 5 commits into from
Oct 31, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
142 changes: 142 additions & 0 deletions vlib/strings/similarity.v
Original file line number Diff line number Diff line change
Expand Up @@ -12,6 +12,30 @@ fn min(a u16, b u16, c u16) u16 {
return m
}

// max2 returns the larger of the two integers `a` and `b`.
// When both are equal, that common value is returned.
@[inline]
fn max2(a int, b int) int {
	return if a < b { b } else { a }
}

// min2 returns the smaller of the two integers `a` and `b`.
// When both are equal, that common value is returned.
@[inline]
fn min2(a int, b int) int {
	return if a < b { a } else { b }
}

// abs2 returns the absolute difference between `a` and `b`,
// i.e. the larger value minus the smaller one.
@[inline]
fn abs2(a int, b int) int {
	return if a < b { b - a } else { a - b }
}

// levenshtein_distance uses the Levenshtein Distance algorithm to calculate
// the distance between two strings `a` and `b` (lower is closer).
@[direct_array_access]
Expand Down Expand Up @@ -85,3 +109,121 @@ pub fn dice_coefficient(s1 string, s2 string) f32 {
}
return (2.0 * f32(intersection_size)) / (f32(a.len) + f32(b.len) - 2)
}

// hamming_distance returns the number of byte positions at which the strings
// `a` and `b` differ (lower is closer).
// The strings are compared position by position over the length of the shorter
// one; every extra byte of the longer string counts as one more difference.
// Two empty strings have a distance of 0.
@[direct_array_access]
pub fn hamming_distance(a string, b string) int {
	common_len := min2(a.len, b.len)
	// Start from the length difference: the unpaired tail is all mismatches.
	mut distance := abs2(a.len, b.len)
	mut pos := 0
	for pos < common_len {
		if a[pos] != b[pos] {
			distance++
		}
		pos++
	}
	return distance
}

// hamming_similarity uses the Hamming Distance algorithm to calculate
// the similarity of two strings `a` and `b`.
// It returns a coefficient between 0.0 (not similar) and 1.0 (exact match):
// one minus the Hamming distance divided by the length of the longer string.
pub fn hamming_similarity(a string, b string) f32 {
	longest := max2(a.len, b.len)
	if longest > 0 {
		mismatches := hamming_distance(a, b)
		return 1.00 - f32(mismatches) / f32(longest)
	}
	// Both strings are empty, which counts as an exact match.
	return 1.0
}

// jaro_similarity uses the Jaro algorithm to calculate the similarity of
// two strings `a` and `b`, comparing them byte by byte.
// It returns a coefficient between 0.0 (not similar) and 1.0 (exact match).
// Two empty strings are an exact match (1.0); if only one of them is empty,
// the result is 0.0.
@[direct_array_access]
pub fn jaro_similarity(a string, b string) f64 {
	a_len := a.len
	b_len := b.len
	if a_len == 0 && b_len == 0 {
		// Both are empty strings, should return 1.0
		return 1.0
	}
	if a_len == 0 || b_len == 0 {
		return 0
	}

	// Maximum distance up to which matching is allowed.
	// It is clamped at 0, because for two single byte strings the formula
	// gives -1, which produces an empty search window, so that even
	// identical strings like 'a' and 'a' would be scored as 0.
	match_distance := max2(0, max2(a_len, b_len) / 2 - 1)

	mut a_matches := []bool{len: a_len}
	mut b_matches := []bool{len: b_len}
	mut matches := 0
	mut transpositions := 0.0

	// For every byte of `a`, look for the first not yet matched equal byte
	// of `b`, that lies inside the allowed window around the same position.
	for i in 0 .. a_len {
		start := max2(0, i - match_distance)
		end := min2(b_len, i + match_distance + 1)
		for k in start .. end {
			if b_matches[k] || a[i] != b[k] {
				continue
			}
			a_matches[i] = true
			b_matches[k] = true
			matches++
			break
		}
	}
	// If there is no match
	if matches == 0 {
		return 0
	}

	// Walk over the matched bytes of both strings in order, and count the
	// pairs that differ. Each transposition is seen twice, hence the halving.
	// `k` can not run past the end of `b_matches`, since both arrays contain
	// exactly `matches` true entries.
	mut k := 0
	for i in 0 .. a_len {
		if !a_matches[i] {
			continue
		}
		// Find the next matched character in second string
		for !b_matches[k] {
			k++
		}
		if a[i] != b[k] {
			transpositions++
		}
		k++
	}
	transpositions /= 2
	return (matches / f64(a_len) + matches / f64(b_len) + (matches - transpositions) / matches) / 3
}

// jaro_winkler_similarity uses the Jaro Winkler algorithm to calculate
// the similarity of two strings `a` and `b`.
// It returns a coefficient between 0.0 (not similar) and 1.0 (exact match).
// The scaling factor(`p=0.1`) in Jaro-Winkler gives higher weight to prefix
// similarities, making it especially effective for cases where slight misspellings
// or prefixes are common.
@[direct_array_access]
pub fn jaro_winkler_similarity(a string, b string) f64 {
	// Maximum of 4 characters are allowed in prefix
	lmax := min2(4, min2(a.len, b.len))
	// `l` is the length of the common prefix, so counting has to stop at the
	// first mismatch. Equal bytes after a mismatch are not part of the prefix,
	// and must not increase the bonus.
	mut l := 0
	for l < lmax && a[l] == b[l] {
		l++
	}
	js := jaro_similarity(a, b)
	// select a multiplier (Winkler suggested p=0.1) for the relative importance of the prefix for the word similarity
	p := 0.1
	// Move the Jaro score towards 1.0, proportionally to the prefix length.
	ws := js + f64(l) * p * (1 - js)
	return ws
}
49 changes: 49 additions & 0 deletions vlib/strings/similarity_test.v
Original file line number Diff line number Diff line change
Expand Up @@ -12,3 +12,52 @@ fn test_levenshtein_distance() {
assert strings.levenshtein_distance('flomax', 'volmax') == 3
assert strings.levenshtein_distance('ab', 'cd') == 2
}

// test_hamming_distance checks strings.hamming_distance on empty, identical,
// different length and same length inputs.
fn test_hamming_distance() {
// Equal strings (including two empty ones) have a distance of 0.
assert strings.hamming_distance('', '') == 0
assert strings.hamming_distance('one', 'one') == 0
// Against an empty string, the distance is the length of the other string.
assert strings.hamming_distance('', 'two') == 3
assert strings.hamming_distance('three', '') == 5
assert strings.hamming_distance('bananna', '') == 7
assert strings.hamming_distance('cats', 'hats') == 1
// Different lengths: the strings are compared position by position from the
// start, and the length difference is added to the number of mismatches.
assert strings.hamming_distance('hugs', 'shrugs') == 6
assert strings.hamming_distance('broom', 'shroom') == 5
assert strings.hamming_distance('flomax', 'volmax') == 3
assert strings.hamming_distance('ab', 'cd') == 2
}

// test_hamming_similarity checks strings.hamming_similarity on the same input
// pairs, that are used for the hamming_distance test.
fn test_hamming_similarity() {
// Equal strings (including two empty ones) are an exact match.
assert strings.hamming_similarity('', '') == 1.0
assert strings.hamming_similarity('one', 'one') == 1.0
// Against an empty string, nothing matches.
assert strings.hamming_similarity('', 'two') == 0
assert strings.hamming_similarity('three', '') == 0
assert strings.hamming_similarity('bananna', '') == 0
assert strings.hamming_similarity('cats', 'hats') == 0.75
assert strings.hamming_similarity('hugs', 'shrugs') == 0
// NOTE(review): presumably the f32 result of 1 - 5/6, written with all of its digits.
assert strings.hamming_similarity('broom', 'shroom') == 0.1666666865348816
assert strings.hamming_similarity('flomax', 'volmax') == 0.5
assert strings.hamming_similarity('ab', 'cd') == 0
}

// test_jaro_similarity checks strings.jaro_similarity on empty and identical
// inputs, and on three pairs of similar words with exact expected f64 values.
fn test_jaro_similarity() {
// Equal strings (including two empty ones) are an exact match.
assert strings.jaro_similarity('', '') == 1
assert strings.jaro_similarity('one', 'one') == 1
// If only one of the strings is empty, the similarity is 0.
assert strings.jaro_similarity('', 'two') == 0
assert strings.jaro_similarity('three', '') == 0
assert strings.jaro_similarity('bananna', '') == 0
// Pairs of similar, but not equal words.
assert strings.jaro_similarity('MARTHA', 'MARHTA') == 0.9444444444444445
assert strings.jaro_similarity('DIXON', 'DICKSONX') == 0.7666666666666666
assert strings.jaro_similarity('JELLYFISH', 'SMELLYFISH') == 0.8962962962962964
}

// test_jaro_winkler_similarity checks strings.jaro_winkler_similarity on empty
// and identical inputs, and on word pairs with and without a common prefix.
fn test_jaro_winkler_similarity() {
// Equal strings (including two empty ones) are an exact match.
assert strings.jaro_winkler_similarity('', '') == 1
assert strings.jaro_winkler_similarity('one', 'one') == 1
// If only one of the strings is empty, the similarity is 0.
assert strings.jaro_winkler_similarity('', 'two') == 0
assert strings.jaro_winkler_similarity('three', '') == 0
assert strings.jaro_winkler_similarity('bananna', '') == 0
// These pairs start with the same 4 bytes ('acco').
assert strings.jaro_winkler_similarity('accomodate', 'accommodate') == 0.9818181818181818
assert strings.jaro_winkler_similarity('accomodate', 'accompanist') == 0.8672727272727273
// No byte of the first 4 positions is equal here ('unti' vs 'hunt').
assert strings.jaro_winkler_similarity('untill', 'huntsville') == 0.8666666666666667
// The whole first string is a prefix of the second one.
assert strings.jaro_winkler_similarity('wich', 'wichita') == 0.9142857142857143
}
Loading