From 5aa4148f849cc34a1e3c33a81af7a420e2811725 Mon Sep 17 00:00:00 2001 From: Devin Gaffney Date: Thu, 6 Oct 2022 10:39:39 -0700 Subject: [PATCH] CHECK-2437 add ascii folding and other minor tweaks (#262) --- app/main/lib/language_analyzers.py | 13 +++++++++---- app/main/lib/reindex_analyzers.py | 4 ++-- 2 files changed, 11 insertions(+), 6 deletions(-) diff --git a/app/main/lib/language_analyzers.py b/app/main/lib/language_analyzers.py index bb626811..aebb6388 100644 --- a/app/main/lib/language_analyzers.py +++ b/app/main/lib/language_analyzers.py @@ -1,7 +1,7 @@ import json from elasticsearch import Elasticsearch from flask import request, current_app as app -SUPPORTED_LANGUAGES = ["en", "pt", "es", "hi", "bn"] +SUPPORTED_LANGUAGES = ["en", "pt", "es", "hi", "bn", "pt-br"] #via https://www.elastic.co/guide/en/elasticsearch/reference/current/analysis-lang-analyzer.html#bengali-analyzer SETTINGS_BY_LANGUAGE = { "en": { @@ -28,6 +28,7 @@ "rebuilt_english": { "tokenizer": "standard", "filter": [ + "asciifolding", "english_possessive_stemmer", "lowercase", "english_stop", @@ -58,6 +59,7 @@ "rebuilt_spanish": { "tokenizer": "standard", "filter": [ + "asciifolding", "lowercase", "spanish_stop", "spanish_keywords", @@ -67,7 +69,7 @@ } } }, - "pt-BR": { + "pt-br": { "analysis": { "filter": { "brazilian_stop": { @@ -87,6 +89,7 @@ "rebuilt_brazilian": { "tokenizer": "standard", "filter": [ + "asciifolding", "lowercase", "brazilian_stop", "brazilian_keywords", @@ -116,6 +119,7 @@ "rebuilt_portuguese": { "tokenizer": "standard", "filter": [ + "asciifolding", "lowercase", "portuguese_stop", "portuguese_keywords", @@ -145,6 +149,7 @@ "rebuilt_hindi": { "tokenizer": "standard", "filter": [ + "asciifolding", "lowercase", "decimal_digit", "hindi_keywords", @@ -177,14 +182,14 @@ "rebuilt_bengali": { "tokenizer": "standard", "filter": [ + "asciifolding", "lowercase", "decimal_digit", "bengali_keywords", "indic_normalization", "bengali_normalization", "bengali_stop", - "bengali_stemmer" - ] + "bengali_stemmer" ] } } } diff --git a/app/main/lib/reindex_analyzers.py b/app/main/lib/reindex_analyzers.py index 3c20935e..5dc6c680 100644 --- a/app/main/lib/reindex_analyzers.py +++ b/app/main/lib/reindex_analyzers.py @@ -31,14 +31,14 @@ def get_docs_to_transform(team_id, language=None): docs_to_transform[doc["_id"]] = prediction.language else: docs_to_transform[doc["_id"]] = language - f = open("docs_to_transform.json", "w") + f = open(f"docs_to_transform_{team_id}.json", "w") f.write(json.dumps(docs_to_transform)) f.close() return docs_to_transform def get_cached_docs_to_transform(team_id, language=None): try: - return json.loads(open("docs_to_transform.json").read()) + return json.loads(open(f"docs_to_transform_{team_id}.json").read()) except: return get_docs_to_transform(team_id, language)