From 8d21f633109cf0ec60d93a3b91fa8db099c479f8 Mon Sep 17 00:00:00 2001 From: missinglink Date: Tue, 2 Nov 2021 16:43:09 +0100 Subject: [PATCH] feat(unicode): improved diacritics removal --- lib/analysis.js | 7 ++----- lib/unicode.js | 18 ++++++++++++++++++ package.json | 2 +- prototype/query.js | 1 - prototype/tokenize.js | 8 ++++---- test/lib/analysis.js | 12 ++++++++---- test/prototype/tokenize_integration.js | 2 +- 7 files changed, 34 insertions(+), 16 deletions(-) diff --git a/lib/analysis.js b/lib/analysis.js index 5f877ef1..ac6c8dfa 100644 --- a/lib/analysis.js +++ b/lib/analysis.js @@ -1,6 +1,5 @@ const lowercase = require('lower-case').lowerCase; -const removeAccents = require('remove-accents'); const unicode = require('./unicode'); const PARTIAL_TOKEN_SUFFIX = '\x26'; @@ -102,11 +101,9 @@ function normalize( input ){ return synonyms.map( function( synonym ){ return synonym.replace(/\s{2,}/g, ' ').trim(); }) - // basic normalization - // note: lowercase MUST be run before removeAccents, please don't change the order - // see: https://github.com/pelias/placeholder/pull/12 for more detail. + // normalization .map( function( synonym ){ - return removeAccents( lowercase( synonym ) ); + return lowercase( unicode.fold( synonym ) ); }) // remove empty synonyms .filter( function( synonym ){ diff --git a/lib/unicode.js b/lib/unicode.js index e0fdaab8..81c4a578 100644 --- a/lib/unicode.js +++ b/lib/unicode.js @@ -1,5 +1,6 @@ const _ = require('lodash'); const regenerate = require('regenerate'); +const accentsDiacritics = require('remove-accents-diacritics'); // non-printable control characters // ref: https://en.wikipedia.org/wiki/List_of_Unicode_characters @@ -93,4 +94,21 @@ function normalize(str) { .replace(COMBINING_MARKS, ''); } +/** + * Converts alphabetic, numeric, and symbolic characters that are not + * in the Basic Latin Unicode block (first 128 ASCII characters) to their + * ASCII equivalent, if one exists. For example, the filter changes à to a. 
+ */ +function fold(str) { + + // sanity checking + if (!_.isString(str)) { return str; } + + return accentsDiacritics.remove(str) + .normalize('NFD') + .replace(COMBINING_MARKS, '') + .normalize('NFKC'); +} + module.exports.normalize = normalize; +module.exports.fold = fold; diff --git a/package.json b/package.json index bee53e5b..4cabd1f2 100644 --- a/package.json +++ b/package.json @@ -43,7 +43,7 @@ "pelias-logger": "^1.2.1", "pelias-whosonfirst": "^5.0.0", "regenerate": "^1.4.2", - "remove-accents": "^0.4.0", + "remove-accents-diacritics": "^1.0.2", "require-dir": "^1.0.0", "sorted-intersect": "^0.1.4", "split2": "^3.0.0", diff --git a/prototype/query.js b/prototype/query.js index 8839d48d..fe9c8263 100644 --- a/prototype/query.js +++ b/prototype/query.js @@ -2,7 +2,6 @@ var async = require('async'); var util = require('util'); var Result = require('../lib/Result'); -var sorted = require('../lib/sorted'); var debug = false; function reduce( index, res ){ diff --git a/prototype/tokenize.js b/prototype/tokenize.js index 44603420..9340e0fd 100644 --- a/prototype/tokenize.js +++ b/prototype/tokenize.js @@ -1,9 +1,9 @@ // plugin for tokenize -const _ = require('lodash'), - async = require('async'), - analysis = require('../lib/analysis'), - permutations = require('../lib/permutations'); +const _ = require('lodash'); +const async = require('async'); +const analysis = require('../lib/analysis'); +const permutations = require('../lib/permutations'); function tokenize(input, cb){ diff --git a/test/lib/analysis.js b/test/lib/analysis.js index ef781e5b..7a6a76f1 100644 --- a/test/lib/analysis.js +++ b/test/lib/analysis.js @@ -15,14 +15,18 @@ module.exports.normalize = function(test, common) { // Punctuation substitutions assert( 'Straße', [ 'strasse' ] ); assert( 'Jǿ œ̆', [ 'jo oe' ] ); + assert( 'orilẹ́ede manamari', [ 'orileede manamari' ] ); + assert( 'z︠h︡ovkva', [ 'zhovkva' ] ); + assert( 'Žovkva', [ 'zovkva' ] ); + assert( 'Żółkiew', [ 'zolkiew' ] ); assert( 
'Trinidad & Tobago', [ 'trinidad and tobago' ] ); // Tests to confirm the order of function execution // see: https://github.com/pelias/placeholder/pull/12#issuecomment-302437570 - test( 'order of execution', function(t) { - t.deepEqual( analysis.normalize( 'İnceyol' ), [ 'i̇nceyol' ] ); - t.equal( analysis.normalize( 'İnceyol' )[0].length, 8 ); - t.equal( analysis.normalize( 'İ' )[0].length, 2 ); + test('order of execution', function(t) { + t.deepEqual( analysis.normalize( 'İnceyol' ), [ 'inceyol' ] ); + t.equal( analysis.normalize( 'İnceyol' )[0].length, 7 ); + t.equal( analysis.normalize( 'İ' )[0].length, 1 ); t.end(); }); diff --git a/test/prototype/tokenize_integration.js b/test/prototype/tokenize_integration.js index 21ca2868..89a8e056 100644 --- a/test/prototype/tokenize_integration.js +++ b/test/prototype/tokenize_integration.js @@ -11,7 +11,7 @@ module.exports.tokenize = function(test, util) { assert('Kelburn Wellington New Zealand', [['kelburn', 'wellington', 'new zealand']]); assert('Sydney New South Wales Australia', [['sydney', 'new south wales', 'australia']]); - assert('ケープタウン 南アフリカ', [['ケープタウン', '南アフリカ']]); + assert('ケープタウン 南アフリカ', [['ケーフタウン', '南アフリカ']]); // duplicates assert('lancaster lancaster pa', [['lancaster', 'lancaster', 'pa']]);