Skip to content

Commit

Permalink
CHECK-2437 add ascii folding and other minor tweaks (#262)
Browse files Browse the repository at this point in the history
  • Loading branch information
DGaffney authored Oct 6, 2022
1 parent 5fe1a2e commit 5aa4148
Show file tree
Hide file tree
Showing 2 changed files with 11 additions and 6 deletions.
13 changes: 9 additions & 4 deletions app/main/lib/language_analyzers.py
Original file line number Diff line number Diff line change
@@ -1,7 +1,7 @@
import json
from elasticsearch import Elasticsearch
from flask import request, current_app as app
SUPPORTED_LANGUAGES = ["en", "pt", "es", "hi", "bn"]
SUPPORTED_LANGUAGES = ["en", "pt", "es", "hi", "bn", "pt-br"]
#via https://www.elastic.co/guide/en/elasticsearch/reference/current/analysis-lang-analyzer.html#bengali-analyzer
SETTINGS_BY_LANGUAGE = {
"en": {
Expand All @@ -28,6 +28,7 @@
"rebuilt_english": {
"tokenizer": "standard",
"filter": [
"asciifolding",
"english_possessive_stemmer",
"lowercase",
"english_stop",
Expand Down Expand Up @@ -58,6 +59,7 @@
"rebuilt_spanish": {
"tokenizer": "standard",
"filter": [
"asciifolding",
"lowercase",
"spanish_stop",
"spanish_keywords",
Expand All @@ -67,7 +69,7 @@
}
}
},
"pt-BR": {
"pt-br": {
"analysis": {
"filter": {
"brazilian_stop": {
Expand All @@ -87,6 +89,7 @@
"rebuilt_brazilian": {
"tokenizer": "standard",
"filter": [
"asciifolding",
"lowercase",
"brazilian_stop",
"brazilian_keywords",
Expand Down Expand Up @@ -116,6 +119,7 @@
"rebuilt_portuguese": {
"tokenizer": "standard",
"filter": [
"asciifolding",
"lowercase",
"portuguese_stop",
"portuguese_keywords",
Expand Down Expand Up @@ -145,6 +149,7 @@
"rebuilt_hindi": {
"tokenizer": "standard",
"filter": [
"asciifolding",
"lowercase",
"decimal_digit",
"hindi_keywords",
Expand Down Expand Up @@ -177,14 +182,14 @@
"rebuilt_bengali": {
"tokenizer": "standard",
"filter": [
"asciifolding",
"lowercase",
"decimal_digit",
"bengali_keywords",
"indic_normalization",
"bengali_normalization",
"bengali_stop",
"bengali_stemmer"
]
"bengali_stemmer" ]
}
}
}
Expand Down
4 changes: 2 additions & 2 deletions app/main/lib/reindex_analyzers.py
Original file line number Diff line number Diff line change
Expand Up @@ -31,14 +31,14 @@ def get_docs_to_transform(team_id, language=None):
docs_to_transform[doc["_id"]] = prediction.language
else:
docs_to_transform[doc["_id"]] = language
f = open("docs_to_transform.json", "w")
f = open(f"docs_to_transform_{team_id}.json", "w")
f.write(json.dumps(docs_to_transform))
f.close()
return docs_to_transform

def get_cached_docs_to_transform(team_id, language=None):
    """Return the cached docs-to-transform mapping for a team, rebuilding it on a miss.

    The cache is the per-team JSON file ``docs_to_transform_{team_id}.json``
    in the current working directory, the same path that
    ``get_docs_to_transform`` writes after computing the mapping.

    Args:
        team_id: Team identifier; used to build the cache file name and
            passed through to ``get_docs_to_transform`` on a cache miss.
        language: Optional language passed through to
            ``get_docs_to_transform`` on a cache miss. It is not used to
            validate an existing cache file.

    Returns:
        The decoded JSON content of the cache file (a mapping of document id
        to language as written by ``get_docs_to_transform``), or the freshly
        computed result when the cache is missing or unreadable.
    """
    # Read the team-scoped file so the reader agrees with the writer; the
    # old shared "docs_to_transform.json" could hold another team's data.
    cache_path = f"docs_to_transform_{team_id}.json"
    try:
        # Context manager closes the handle even if decoding fails.
        with open(cache_path) as cache_file:
            return json.load(cache_file)
    except (OSError, ValueError):
        # OSError: file missing or unreadable. ValueError: malformed JSON
        # (json.JSONDecodeError) or undecodable bytes. Both mean "no usable
        # cache", so rebuild. Anything else is a real error and propagates.
        return get_docs_to_transform(team_id, language)

Expand Down

0 comments on commit 5aa4148

Please sign in to comment.