Skip to content

Commit

Permalink
search-api: Limit supported languages
Browse files Browse the repository at this point in the history
This is to limit the sizes of the indexes.
  • Loading branch information
jnatten committed Feb 26, 2025
1 parent b198e06 commit f7ec255
Show file tree
Hide file tree
Showing 15 changed files with 95 additions and 225 deletions.
5 changes: 3 additions & 2 deletions common/src/main/scala/no/ndla/common/CirceUtil.scala
Original file line number Diff line number Diff line change
Expand Up @@ -29,14 +29,15 @@ object CirceUtil {
}
}

def tryParseAs[T](str: String)(implicit d: Decoder[T]): Try[T] = {
def tryParse(str: String): Try[Json] = {
parser
.parse(str)
.toTry
.flatMap(_.as[T].toTry)
.recoverWith { ex => Failure(CirceFailure(str, ex)) }
}

/** Parses `str` as JSON and decodes the result as `T`.
  *
  * Parsing is delegated to `tryParse`. A decoding failure is wrapped in `CirceFailure` together with the input
  * string, so the offending payload stays attached to the error whether parsing or decoding failed.
  *
  * @param str
  *   The JSON text to parse and decode.
  * @return
  *   The decoded value, or a `Failure` describing why parsing or decoding failed.
  */
def tryParseAs[T](str: String)(implicit d: Decoder[T]): Try[T] =
  tryParse(str).flatMap { json =>
    // Only the decode step is wrapped here; wrapping the whole chain would wrap parse failures a second time.
    json.as[T].toTry.recoverWith { case ex => Failure(CirceFailure(str, ex)) }
  }

/** This might throw an exception! Use with care; it should probably only be used in tests. */
def unsafeParseAs[T: Decoder](str: String): T = tryParseAs(str).get

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -58,6 +58,9 @@ trait BaseProps {
def TaxonomyUrl: String = s"http://$TaxonomyApiHost"
def disableWarmup: Boolean = booleanPropOrElse("DISABLE_WARMUP", default = false)

/** Language codes read from the `SUPPORTED_LANGUAGES` property as a comma-separated list, using the default
  * passed to `propOrElse` otherwise.
  *
  * Entries are trimmed and blank entries are dropped, so a value such as `"nb, nn"` or one with a trailing
  * comma does not produce malformed language codes.
  */
def SupportedLanguages: List[String] =
  propOrElse("SUPPORTED_LANGUAGES", "nb,nn,en,sma,se,de,es,zh,ukr")
    .split(",")
    .map(_.trim)
    .filter(_.nonEmpty)
    .toList

def ndlaFrontendUrl: String = Environment match {
case "local" => "http://localhost:30017"
case "prod" => "https://ndla.no"
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -83,32 +83,6 @@ trait IndexService {
}
}

/** Builds one text field per entry in `languageAnalyzers` for the given field.
  *
  * Each generated field is named `fieldName.<languageTag>`, has fielddata disabled and uses the analyzer of
  * its language.
  *
  * @param fieldName
  *   Name of field in mapping.
  * @param keepRaw
  *   Whether to add a keywordField named raw. Usually used for sorting, aggregations or scripts.
  * @return
  *   Sequence of FieldDefinitions for a field.
  */
protected def generateLanguageSupportedFieldList(fieldName: String, keepRaw: Boolean = false): Seq[ElasticField] =
  languageAnalyzers.map { langAnalyzer =>
    val analyzedField = textField(s"$fieldName.${langAnalyzer.languageTag.toString()}")
      .fielddata(false)
      .analyzer(langAnalyzer.analyzer)

    // The raw keyword subfield is the only difference between the two variants.
    if (keepRaw) analyzedField.fields(keywordField("raw")) else analyzedField
  }

/** Returns Sequence of DynamicTemplateRequest for a given field.
*
* @param fieldName
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -14,7 +14,7 @@ import no.ndla.common.configuration.BaseComponentRegistry
import no.ndla.network.NdlaClient
import no.ndla.network.clients.{FeideApiClient, FrontpageApiClient, MyNDLAApiClient, RedisClient}
import no.ndla.network.tapir.TapirApplication
import no.ndla.search.{BaseIndexService, Elastic4sClient}
import no.ndla.search.{BaseIndexService, Elastic4sClient, SearchLanguage}
import no.ndla.searchapi.controller.parameters.GetSearchQueryParams
import no.ndla.searchapi.controller.{InternController, SearchController, SwaggerDocControllerConfig}
import no.ndla.searchapi.integration.*
Expand Down Expand Up @@ -43,6 +43,7 @@ class ComponentRegistry(properties: SearchApiProperties)
with TaxonomyApiClient
with IndexService
with BaseIndexService
with SearchLanguage
with StrictLogging
with LearningPathApiClient
with NdlaClient
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -12,12 +12,10 @@ import com.sksamuel.elastic4s.ElasticDsl.*
import com.sksamuel.elastic4s.analysis.*
import com.sksamuel.elastic4s.fields.{ElasticField, NestedField, ObjectField}
import com.sksamuel.elastic4s.requests.indexes.IndexRequest
import com.sksamuel.elastic4s.requests.mappings.dynamictemplate.DynamicTemplateRequest
import com.typesafe.scalalogging.StrictLogging
import io.circe.Decoder
import no.ndla.common.model.domain.Content
import no.ndla.network.clients.MyNDLAApiClient
import no.ndla.search.SearchLanguage.NynorskLanguageAnalyzer
import no.ndla.search.model.domain.{BulkIndexResult, ElasticIndexingException, ReindexResult}
import no.ndla.search.{BaseIndexService, Elastic4sClient, SearchLanguage}
import no.ndla.searchapi.Props
Expand All @@ -28,7 +26,7 @@ import scala.util.{Failure, Success, Try}

trait IndexService {
this: Elastic4sClient & SearchApiClient & BaseIndexService & TaxonomyApiClient & GrepApiClient & Props &
MyNDLAApiClient =>
MyNDLAApiClient & SearchLanguage =>

trait BulkIndexingService extends BaseIndexService {

Expand All @@ -37,70 +35,18 @@ trait IndexService {
textField("trigram").analyzer("trigram"),
textField("decompounded").searchAnalyzer("standard").analyzer("compound_analyzer"),
textField("exact").analyzer("exact")
)
val subfieldsWithRaw = if (keepRaw) subfields :+ keywordField("raw") else subfields
) ++
Option.when(keepRaw)(keywordField("raw")).toList

val analyzedFields = SearchLanguage.languageAnalyzers.map(langAnalyzer => {
textField(s"$name.${langAnalyzer.languageTag.toString}")
.analyzer(langAnalyzer.analyzer)
.fields(subfieldsWithRaw)
.fields(subfields)
})

// val analyzedCodes = SearchLanguage.languageAnalyzers.map(_.languageTag.toString).toSet
// val notAnalyzedFields =
// CodeLists.iso639Definitions
// .flatMap(_.part1)
// .filterNot(analyzedCodes.contains)
// .map(x => )

// TODO: Not analyzed fields

analyzedFields
}

/** Builds the dynamic templates for a language-supported field.
  *
  * The result contains, in order: one template per language analyzer for `fieldName.<languageTag>`, one per
  * language analyzer for `*.fieldName.<languageTag>`, and finally two catch-all templates (`fieldName.*` and
  * `*.fieldName.*`) that use the `standard` analyzer.
  *
  * @param fieldName
  *   Name of field in mapping.
  * @param keepRaw
  *   Whether to add a keywordField named raw. Usually used for sorting, aggregations or scripts.
  * @return
  *   Sequence of DynamicTemplateRequest for a field.
  */
protected def generateLanguageSupportedDynamicTemplates(
    fieldName: String,
    keepRaw: Boolean = false
): Seq[DynamicTemplateRequest] = {
  val analysisSubFields = List(
    textField("trigram").analyzer("trigram"),
    textField("decompounded").searchAnalyzer("standard").analyzer("compound_analyzer"),
    textField("exact").analyzer("exact")
  )
  val subFields = if (keepRaw) analysisSubFields :+ keywordField("raw") else analysisSubFields

  // The path is used as the template name, the mapped field name and the path match.
  def templateFor(path: String, analyzer: String) =
    DynamicTemplateRequest(
      name = path,
      mapping = textField(path).analyzer(analyzer).fields(subFields),
      matchMappingType = Some("string"),
      pathMatch = Some(path)
    )

  // One template per language analyzer, with `prefix` placed in front of the field name.
  def templatesPerLanguage(prefix: String) =
    SearchLanguage.languageAnalyzers.map { languageAnalyzer =>
      templateFor(s"$prefix$fieldName.${languageAnalyzer.languageTag.toString()}", languageAnalyzer.analyzer)
    }

  val catchAllTemplates = Seq(
    templateFor(s"$fieldName.*", "standard"),
    templateFor(s"*.$fieldName.*", "standard")
  )

  templatesPerLanguage("") ++ templatesPerLanguage("*.") ++ catchAllTemplates
}

private val hyphDecompounderTokenFilter: CompoundWordTokenFilter = CompoundWordTokenFilter(
name = "hyphenation_decompounder",
`type` = HyphenationDecompounder,
Expand Down Expand Up @@ -130,7 +76,7 @@ trait IndexService {

override val analysis: Analysis =
Analysis(
analyzers = List(trigram, customExactAnalyzer, customCompoundAnalyzer, NynorskLanguageAnalyzer),
analyzers = List(trigram, customExactAnalyzer, customCompoundAnalyzer, SearchLanguage.NynorskLanguageAnalyzer),
tokenFilters = List(hyphDecompounderTokenFilter) ++ SearchLanguage.NynorskTokenFilters,
normalizers = List(lowerNormalizer)
)
Expand Down Expand Up @@ -287,37 +233,6 @@ trait IndexService {
}
}

/** Builds one analyzed text field per entry in `SearchLanguage.languageAnalyzers` for the given field.
  *
  * Each generated field is named `fieldName.<languageTag>`, uses the analyzer of its language and carries the
  * `trigram`, `decompounded` and `exact` subfields.
  *
  * @param fieldName
  *   Name of field in mapping.
  * @param keepRaw
  *   Whether to add a keywordField named raw. Usually used for sorting, aggregations or scripts.
  * @return
  *   Sequence of FieldDefinitions for a field.
  */
protected def generateLanguageSupportedFieldList(
    fieldName: String,
    keepRaw: Boolean = false
): Seq[ElasticField] = {
  // The subfields do not depend on the language, so they are assembled once up front.
  val analysisSubFields = List(
    textField("trigram").analyzer("trigram"),
    textField("decompounded").searchAnalyzer("standard").analyzer("compound_analyzer"),
    textField("exact").analyzer("exact")
  )
  val subFields: List[ElasticField] =
    if (keepRaw) analysisSubFields :+ keywordField("raw") else analysisSubFields

  for (langAnalyzer <- SearchLanguage.languageAnalyzers)
    yield textField(s"$fieldName.${langAnalyzer.languageTag.toString}")
      .analyzer(langAnalyzer.analyzer)
      .fields(subFields)
}

protected def getTaxonomyContextMapping(fieldName: String): NestedField = {
nestedField(fieldName).fields(
List(
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -15,6 +15,7 @@ import com.sksamuel.elastic4s.requests.searches.SearchResponse
import com.sksamuel.elastic4s.requests.searches.queries.Query
import com.sksamuel.elastic4s.requests.searches.queries.compound.BoolQuery
import com.typesafe.scalalogging.StrictLogging
import no.ndla.common.CirceUtil
import no.ndla.common.errors.{ValidationException, ValidationMessage}
import no.ndla.common.implicits.TryQuestionMark
import no.ndla.common.model.api.search.SearchType
Expand Down Expand Up @@ -156,9 +157,9 @@ trait MultiSearchService {
private def logShardErrors(response: RequestSuccess[SearchResponse]) = {
if (response.result.shards.failed > 0) {
response.body.map { body =>
io.circe.parser.parse(body).toTry match {
case Failure(exception) =>
logger.error(s"Got error parsing search response: $body", exception)
CirceUtil.tryParse(body) match {
case Failure(ex) =>
logger.error(s"Got error parsing search response: $body", ex)
case Success(jsonBody) =>
val failures = jsonBody.hcursor.downField("_shards").downField("failures").focus.map(_.spaces2)
failures match {
Expand All @@ -183,10 +184,6 @@ trait MultiSearchService {
val index = getSearchIndexes(settings).?
val searchToExecute = search(index)
.query(filteredSearch)
// TODO: This fails because `node` doesn't have a field indexed at "content.bla.bla"
// Even if we do dynamic mapping template, that field does not exist until data is indexed.
// This even happens for other fields in other indexes, so maybe we need to reconsider using the dynamic mapping templates.
// Since this might be a problem for other fields or other languages (especially ones where not every index has every language).
.suggestions(suggestions(settings.query.underlying, searchLanguage, settings.fallback))
.from(pagination.startAt)
.trackTotalHits(true)
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -78,7 +78,7 @@ import scala.jdk.CollectionConverters.*
import scala.util.{Failure, Success, Try}

trait SearchConverterService {
this: DraftApiClient & TaxonomyApiClient & ConverterService & Props & MyNDLAApiClient =>
this: DraftApiClient & TaxonomyApiClient & ConverterService & Props & MyNDLAApiClient & SearchLanguage =>
val searchConverterService: SearchConverterService

class SearchConverterService extends StrictLogging {
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -41,7 +41,7 @@ import scala.util.{Failure, Success, Try}

trait SearchService {
this: Elastic4sClient & IndexService & SearchConverterService & StrictLogging & Props & BaseIndexService &
ErrorHandling =>
ErrorHandling & SearchLanguage =>

trait SearchService {
import props.{DefaultLanguage, ElasticSearchScrollKeepAlive, MaxPageSize}
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -14,7 +14,7 @@ import no.ndla.database.DBUtility
import no.ndla.network.NdlaClient
import no.ndla.network.clients.{FeideApiClient, FrontpageApiClient, MyNDLAApiClient, RedisClient}
import no.ndla.network.tapir.TapirApplication
import no.ndla.search.{BaseIndexService, Elastic4sClient}
import no.ndla.search.{BaseIndexService, Elastic4sClient, SearchLanguage}
import no.ndla.searchapi.controller.parameters.GetSearchQueryParams
import no.ndla.searchapi.controller.{InternController, SearchController}
import no.ndla.searchapi.integration.*
Expand Down Expand Up @@ -43,6 +43,7 @@ trait TestEnvironment
with TaxonomyApiClient
with DBUtility
with IndexService
with SearchLanguage
with BaseIndexService
with StrictLogging
with LearningPathApiClient
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -393,21 +393,6 @@ class MultiDraftSearchServiceTest extends IntegrationSuite(EnableElasticsearchCo
search.totalCount should equal(0)
}

test("Searching with query for language not in analyzers should work as expected") {
  // "hne" is the language this test searches in; per the test name it has no dedicated analyzer.
  val settings = multiDraftSearchSettings.copy(
    query = Some(NonEmptyString.fromString("Chhattisgarhi").get),
    language = "hne",
    sort = Sort.ByRelevanceDesc
  )
  val Success(result) = multiDraftSearchService.matchingQuery(settings)

  result.totalCount should equal(1)

  val firstHit = result.summaryResults.head
  firstHit.id should equal(13)
  firstHit.title.title should equal("Chhattisgarhi title")
  firstHit.title.language should equal("hne")
}

test("metadescription is searchable") {
val Success(search) = multiDraftSearchService.matchingQuery(
multiDraftSearchSettings
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -390,21 +390,6 @@ class MultiSearchServiceTest
searchEn.totalCount should equal(0)
}

test("Searching with query for language not in analyzer should return something") {
  // "hne" is the language this test searches in; per the test name it has no dedicated analyzer.
  val settings = searchSettings.copy(
    query = Some(NonEmptyString.fromString("Chhattisgarhi").get),
    language = "hne",
    sort = Sort.ByRelevanceDesc
  )
  val Success(result) = multiSearchService.matchingQuery(settings)

  result.totalCount should equal(1)

  val firstHit = result.summaryResults.head
  firstHit.id should equal(11)
  firstHit.title.title should equal("Chhattisgarhi")
  firstHit.title.language should equal("hne")
}

test("metadescription is searchable") {
val Success(search) = multiSearchService.matchingQuery(
searchSettings.copy(
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -304,13 +304,17 @@ class SearchConverterServiceTest extends UnitSuite with TestEnvironment {
)

searchable1.contexts.size should be(2)
searchable1.contexts.map(_.domainObject.root.languageValues.map(_.value)) should be(Seq(Seq("Matte"), Seq("Historie")))
searchable1.contexts.map(_.domainObject.root.languageValues.map(_.value)) should be(
Seq(Seq("Matte"), Seq("Historie"))
)

searchable4.contexts.size should be(1)
searchable4.contexts.head.domainObject.root.languageValues.map(_.value) should be(Seq("Matte"))

searchable5.contexts.size should be(2)
searchable5.contexts.map(_.domainObject.root.languageValues.map(_.value)) should be(Seq(Seq("Historie"), Seq("Matte")))
searchable5.contexts.map(_.domainObject.root.languageValues.map(_.value)) should be(
Seq(Seq("Historie"), Seq("Matte"))
)
}

test("That invisible contexts are not indexed") {
Expand Down
7 changes: 2 additions & 5 deletions search/src/main/scala/no/ndla/search/BaseIndexService.scala
Original file line number Diff line number Diff line change
Expand Up @@ -18,17 +18,17 @@ import com.sksamuel.elastic4s.requests.mappings.MappingDefinition
import com.typesafe.scalalogging.StrictLogging
import no.ndla.common.configuration.HasBaseProps
import no.ndla.common.implicits.TryQuestionMark
import no.ndla.search.SearchLanguage.NynorskLanguageAnalyzer
import no.ndla.search.model.domain.{BulkIndexResult, ElasticIndexingException, ReindexResult}

import java.text.SimpleDateFormat
import java.util.Calendar
import scala.util.{Failure, Success, Try}

trait BaseIndexService {
this: Elastic4sClient with HasBaseProps =>
this: Elastic4sClient & HasBaseProps & SearchLanguage =>

trait BaseIndexService extends StrictLogging {
import SearchLanguage.NynorskLanguageAnalyzer
val documentType: String
val searchIndex: String
val MaxResultWindowOption: Int
Expand Down Expand Up @@ -62,9 +62,6 @@ trait BaseIndexService {
protected def buildCreateIndexRequest(indexName: String, numShards: Option[Int]): CreateIndexRequest = {
createIndex(indexName)
.shards(numShards.getOrElse(indexShards))
// NOTE: we have more than 1000 fields in some indexes, index.mapping.total_fields.limit
// is set to 2000 to avoid errors.
// .indexSetting("mapping.total_fields.limit", 10000)
.mapping(getMapping)
.indexSetting("max_result_window", MaxResultWindowOption)
.replicas(0) // Spawn with 0 replicas to make indexing faster
Expand Down
3 changes: 1 addition & 2 deletions search/src/main/scala/no/ndla/search/Elastic4sClient.scala
Original file line number Diff line number Diff line change
Expand Up @@ -38,8 +38,7 @@ trait Elastic4sClient {
request: T
)(implicit handler: Handler[T, U], mf: Manifest[U], ec: ExecutionContext): Future[Try[RequestSuccess[U]]] = {
val result = client.execute(request).map {
case failure: RequestFailure =>
Failure(NdlaSearchException(request, failure))
case failure: RequestFailure => Failure(NdlaSearchException(request, failure))
case result: RequestSuccess[U] => Success(result)
}

Expand Down
Loading

0 comments on commit f7ec255

Please sign in to comment.