Skip to content

Commit 8f704a6

Browse files
authored
Match case [EDC-2971] (#19)
* Match case [EDC-2971] Create an Analyzer to index a field with its normal case. Add a new match-case parameter to the request. Create tests for both match case and exact match. Handle search for match case. Allow searching for more than one word, returning the associated documents. * Add tests for case-sensitive / case-insensitive search and refactor the Analyzer
1 parent 25e1a47 commit 8f704a6

File tree

7 files changed

+201
-32
lines changed

7 files changed

+201
-32
lines changed

build.gradle

+28-11
Original file line numberDiff line numberDiff line change
@@ -2,9 +2,11 @@ plugins {
22
id 'java-library'
33
id 'maven-publish'
44
id 'signing'
5+
id 'com.github.johnrengelman.shadow' version '7.1.2'
6+
id 'java'
57
}
68
group 'fr.techad'
7-
version '2.0.1'
9+
version '2.1.0'
810
sourceCompatibility = 1.8
911
ext.withDependencies=project.hasProperty('withDependencies')
1012
//Used to be able to add dependencies to jar
@@ -13,8 +15,6 @@ configurations.implementation.setCanBeResolved(true)
1315
configurations.api.setCanBeResolved(true)
1416
}
1517

16-
17-
1818
java {
1919
withJavadocJar()
2020
withSourcesJar()
@@ -29,8 +29,9 @@ dependencies {
2929
implementation 'ch.qos.logback:logback-classic:1.4.4'
3030
implementation 'commons-io:commons-io:2.11.0'
3131
implementation 'org.apache.commons:commons-lang3:3.12.0'
32-
implementation 'org.apache.lucene:lucene-core:9.4.0'
33-
implementation 'org.apache.lucene:lucene-queryparser:9.4.0'
32+
implementation 'org.apache.lucene:lucene-analysis-common:9.4.2'
33+
implementation 'org.apache.lucene:lucene-core:9.4.2'
34+
implementation 'org.apache.lucene:lucene-queryparser:9.4.2'
3435
implementation 'org.jsoup:jsoup:1.15.3'
3536
implementation 'com.auth0:java-jwt:4.0.0'
3637
implementation 'net.lingala.zip4j:zip4j:2.11.2'
@@ -56,6 +57,17 @@ from {
5657
}
5758
}
5859
}
60+
61+
shadowJar {
62+
archiveBaseName.set('edc-httpd-java')
63+
archiveClassifier.set('')
64+
archiveVersion.set('2.1.0')
65+
mergeServiceFiles()
66+
manifest {
67+
attributes 'Main-Class': 'fr.techad.edc.httpd.EdcWebServer'
68+
}
69+
}
70+
5971
//Execute Junit tests
6072
test {
6173
useJUnitPlatform()
@@ -90,11 +102,15 @@ publishing {
90102
developer {
91103
name = 'Gregory Cochin'
92104
email= 'gregory.cochin@tech-advantage.com'
93-
}
105+
}
94106
developer {
95107
name = 'Mathieu Benard'
96108
email= 'mathieu.benard@tech-advantage.com'
97109
}
110+
developer {
111+
name = 'Florian Bracq'
112+
email= 'florian.bracq@tech-advantage.com'
113+
}
98114
}
99115
scm {
100116
connection = 'scm:git:git://github.com/tech-advantage/edc-httpd.git'
@@ -116,9 +132,9 @@ signing {
116132
setRequired {
117133
// signing is only required if the artifacts are to be published
118134
gradle.taskGraph.allTasks.any { it.equals( PublishToMavenRepository) }
119-
}
120-
sign publishing.publications.mavenJava
121-
sign configurations.archives
135+
}
136+
sign publishing.publications.mavenJava
137+
sign configurations.archives
122138
}
123139
if (JavaVersion.current().isJava8Compatible()) {
124140
allprojects {
@@ -128,6 +144,7 @@ if (JavaVersion.current().isJava8Compatible()) {
128144
}
129145
}
130146
artifacts {
147+
shadowJar
131148
archives sourcesJar
132149
archives javadocJar
133150
}
@@ -136,7 +153,7 @@ javadoc {
136153
options.addBooleanOption('html5', true)
137154
}
138155
}
139-
156+
tasks.build.dependsOn tasks.shadowJar
140157
task install(dependsOn: publishToMavenLocal) {
141158
group = 'Publishing'
142159
description = 'Installs artifacts to local Maven repository'
@@ -150,4 +167,4 @@ task release() {
150167
dependsOn javadoc
151168
}
152169

153-
170+
build.dependsOn(shadowJar);

src/main/java/fr/techad/edc/httpd/SearchHandler.java

+2-1
Original file line numberDiff line numberDiff line change
@@ -37,6 +37,7 @@ public void handleRequest(HttpServerExchange exchange) throws Exception {
3737
Deque<String> query = queryParameters.get("query");
3838

3939
Boolean exactMatch = BooleanUtils.toBoolean(getParamValue("match-whole-word", queryParameters));
40+
Boolean matchCase = BooleanUtils.toBoolean(getParamValue("match-case", queryParameters));
4041
String lang = getParamValue("lang", queryParameters);
4142

4243
int limitResults = 100;
@@ -50,7 +51,7 @@ public void handleRequest(HttpServerExchange exchange) throws Exception {
5051
String search = query.element();
5152
ContentSearcher contentSearcher = new ContentSearcher(config);
5253
List<DocumentationSearchResult> searchResults = contentSearcher.search(search, lang, limitResults, exactMatch,
53-
LangUtils.getDefaultLanguage(config), LangUtils.findLanguages(config));
54+
matchCase, LangUtils.getDefaultLanguage(config), LangUtils.findLanguages(config));
5455
bytes = objectMapper.writeValueAsBytes(searchResults);
5556
} else {
5657
bytes = objectMapper.writeValueAsBytes(Collections.singletonMap("error", "malformed query"));

src/main/java/fr/techad/edc/httpd/search/ContentBase.java

+2-1
Original file line numberDiff line numberDiff line change
@@ -17,7 +17,8 @@ public class ContentBase {
1717
public static final String DOC_LANGUAGE_CODE = "languageCode";
1818
public static final String DOC_LABEL = "label";
1919
public static final String DOC_TYPE = "type";
20-
public static final String DOC_CONTENT = "content";
20+
public static final String DOC_CONTENT_NORMAL_CASE = "normal_case_content";
21+
public static final String DOC_CONTENT_LOWER_CASE = "lower_case_content";
2122
public static final String DOC_URL = "url";
2223
private final Path indexPath;
2324
private final WebServerConfig config;

src/main/java/fr/techad/edc/httpd/search/ContentIndexer.java

+15-2
Original file line numberDiff line numberDiff line change
@@ -4,8 +4,11 @@
44
import com.fasterxml.jackson.databind.JsonNode;
55
import com.fasterxml.jackson.databind.ObjectMapper;
66
import fr.techad.edc.httpd.WebServerConfig;
7+
import fr.techad.edc.httpd.utils.CaseSensitiveStandardAnalyzer;
78
import org.apache.commons.io.FileUtils;
89
import org.apache.commons.io.IOUtils;
10+
import org.apache.lucene.analysis.Analyzer;
11+
import org.apache.lucene.analysis.miscellaneous.PerFieldAnalyzerWrapper;
912
import org.apache.lucene.analysis.standard.StandardAnalyzer;
1013
import org.apache.lucene.document.Document;
1114
import org.apache.lucene.document.Field;
@@ -22,9 +25,11 @@
2225
import java.io.FileInputStream;
2326
import java.io.IOException;
2427
import java.nio.charset.Charset;
28+
import java.util.HashMap;
2529
import java.util.Iterator;
2630
import java.util.List;
2731
import java.util.Map;
32+
2833
import java.util.concurrent.ConcurrentLinkedQueue;
2934

3035
/**
@@ -37,6 +42,7 @@ public class ContentIndexer extends ContentBase {
3742
private IndexWriter indexWriter;
3843
private long counter;
3944

45+
4046
public ContentIndexer(WebServerConfig webServerConfig) {
4147
super(webServerConfig);
4248
this.docBase = webServerConfig.getBase() + "/" + webServerConfig.getDocFolder() + "/";
@@ -158,7 +164,8 @@ private void indexTopic(Long strategyId, String languageCode, String strategyLab
158164
if (type.equals("DOCUMENT")) {
159165
org.jsoup.nodes.Document jsoupDoc = Jsoup.parse(new File(docBase + "/" + fileName), "UTF-8");
160166
String content = jsoupDoc.text();
161-
document.add(new TextField(DOC_CONTENT, content, Field.Store.YES));
167+
document.add(new TextField(DOC_CONTENT_NORMAL_CASE, content, Field.Store.YES));
168+
document.add(new TextField(DOC_CONTENT_LOWER_CASE, content, Field.Store.YES));
162169
}
163170
document.add(new TextField(DOC_URL, fileName, Field.Store.YES));
164171
this.indexWriter.addDocument(document);
@@ -167,7 +174,13 @@ private void indexTopic(Long strategyId, String languageCode, String strategyLab
167174

168175
private void createIndexWriter() throws IOException {
169176
FSDirectory dir = FSDirectory.open(getIndexPath());
170-
IndexWriterConfig config = new IndexWriterConfig(new StandardAnalyzer());
177+
Map<String, Analyzer> analyzerPerField = new HashMap<String, Analyzer>();
178+
// Associate analyzer to the DOC_CONTENT_NORMAL_CASE field to do case sensitive search
179+
analyzerPerField.put(DOC_CONTENT_NORMAL_CASE, new CaseSensitiveStandardAnalyzer());
180+
PerFieldAnalyzerWrapper analyzer = new PerFieldAnalyzerWrapper(
181+
new StandardAnalyzer(), analyzerPerField);
182+
183+
IndexWriterConfig config = new IndexWriterConfig(analyzer);
171184
indexWriter = new IndexWriter(dir, config);
172185
}
173186
}

src/main/java/fr/techad/edc/httpd/search/ContentSearcher.java

+28-13
Original file line numberDiff line numberDiff line change
@@ -3,7 +3,7 @@
33
import java.io.IOException;
44
import java.util.*;
55

6-
import fr.techad.edc.httpd.utils.LangUtils;
6+
import fr.techad.edc.httpd.utils.CaseSensitiveStandardAnalyzer;
77
import org.apache.commons.lang3.StringUtils;
88
import org.apache.lucene.analysis.standard.StandardAnalyzer;
99
import org.apache.lucene.document.Document;
@@ -12,7 +12,6 @@
1212
import org.apache.lucene.queryparser.classic.MultiFieldQueryParser;
1313
import org.apache.lucene.queryparser.classic.ParseException;
1414
import org.apache.lucene.queryparser.classic.QueryParser;
15-
import org.apache.lucene.queryparser.classic.QueryParserBase;
1615
import org.apache.lucene.search.IndexSearcher;
1716
import org.apache.lucene.search.Query;
1817
import org.apache.lucene.search.ScoreDoc;
@@ -29,17 +28,25 @@
2928
*/
3029
public class ContentSearcher extends ContentBase {
3130
private static final Logger LOGGER = LoggerFactory.getLogger(ContentSearcher.class);
32-
private static final String[] SEARCH_FIELDS = { DOC_LABEL, DOC_CONTENT, DOC_TYPE };
33-
private static final Map<String, Float> BOOTS;
31+
private final static String[] SEARCH_FIELD_NORMAL_CASE = { DOC_LABEL, DOC_CONTENT_NORMAL_CASE, DOC_TYPE };
32+
private final static String[] SEARCH_FIELD_LOWER_CASE = { DOC_LABEL, DOC_CONTENT_LOWER_CASE, DOC_TYPE };
33+
private static final Map<String, Float> BOOTS_NORMAL_CASE;
34+
private static final Map<String, Float> BOOTS_LOWER_CASE;
3435
static {
35-
Map<String, Float> aMap = new HashMap<>();
36-
aMap.put(DOC_LABEL, 2f);
37-
aMap.put(DOC_CONTENT, 1f);
38-
aMap.put(DOC_TYPE, .5f);
39-
BOOTS = Collections.unmodifiableMap(aMap);
36+
Map<String, Float> normalCaseMap = new HashMap<>();
37+
Map<String, Float> lowerCaseMap = new HashMap<>();
38+
normalCaseMap.put(DOC_LABEL, 2f);
39+
normalCaseMap.put(DOC_CONTENT_NORMAL_CASE, 1f);
40+
normalCaseMap.put(DOC_TYPE, .5f);
41+
lowerCaseMap.put(DOC_LABEL, 2f);
42+
lowerCaseMap.put(DOC_CONTENT_LOWER_CASE, 1f);
43+
lowerCaseMap.put(DOC_TYPE, .5f);
44+
BOOTS_NORMAL_CASE = Collections.unmodifiableMap(normalCaseMap);
45+
BOOTS_LOWER_CASE = Collections.unmodifiableMap(lowerCaseMap);
4046
}
4147

4248
private IndexSearcher indexSearcher;
49+
private QueryParser qp;
4350

4451
public ContentSearcher(WebServerConfig webServerConfig) {
4552
super(webServerConfig);
@@ -54,7 +61,7 @@ public ContentSearcher(WebServerConfig webServerConfig) {
5461
* @throws ParseException if the search parameter is malformed
5562
*/
5663
public List<DocumentationSearchResult> search(String search, String lang, int limit, boolean exact,
57-
String defaultLanguage, Set<String> languages) throws IOException, ParseException {
64+
boolean matchCase, String defaultLanguage, Set<String> languages) throws IOException, ParseException {
5865
// Handle wildcard with exact mode condition
5966
if (!exact && !search.endsWith("*")) {
6067
search = search + "*";
@@ -63,13 +70,21 @@ public List<DocumentationSearchResult> search(String search, String lang, int li
6370
List<DocumentationSearchResult> results = new ArrayList<>();
6471
LOGGER.debug("Search {}", search);
6572
createSearcher();
66-
QueryParser qp = new MultiFieldQueryParser(SEARCH_FIELDS, new StandardAnalyzer(), BOOTS);
73+
74+
if(matchCase){
75+
qp = new MultiFieldQueryParser(SEARCH_FIELD_NORMAL_CASE, new CaseSensitiveStandardAnalyzer(), BOOTS_NORMAL_CASE);
76+
} else {
77+
qp = new MultiFieldQueryParser(SEARCH_FIELD_LOWER_CASE, new StandardAnalyzer(), BOOTS_LOWER_CASE);
78+
}
79+
6780
qp.setAllowLeadingWildcard(true);
81+
qp.setDefaultOperator(QueryParser.Operator.AND);
82+
6883
String langSearch = "";
6984
if (StringUtils.isNotBlank(lang)) {
7085
langSearch = " AND languageCode:" + lang;
7186
}
72-
Query query = qp.parse(QueryParserBase.escape(search) + langSearch);
87+
Query query = qp.parse(search + langSearch);
7388
TopDocs hits = indexSearcher.search(query, limit);
7489
LOGGER.debug("Found {} results for the search '{}'", hits.totalHits, search);
7590

@@ -88,7 +103,7 @@ public List<DocumentationSearchResult> search(String search, String lang, int li
88103

89104
}
90105
if (results.isEmpty() && !defaultLanguage.equals(lang) && !languages.contains(lang)) {
91-
return search(search, defaultLanguage, limit, exact, defaultLanguage, languages);
106+
return search(search, defaultLanguage, limit, exact, matchCase, defaultLanguage, languages);
92107
}
93108
return results;
94109
}
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,25 @@
1+
package fr.techad.edc.httpd.utils;
2+
3+
import org.apache.lucene.analysis.*;
4+
import org.apache.lucene.analysis.StopwordAnalyzerBase;
5+
import org.apache.lucene.analysis.standard.StandardTokenizer;
6+
7+
public class CaseSensitiveStandardAnalyzer extends StopwordAnalyzerBase {
8+
/** Default maximum allowed token length */
9+
public static final int DEFAULT_MAX_TOKEN_LENGTH = 255;
10+
public StandardTokenizer src;
11+
12+
public CaseSensitiveStandardAnalyzer(){
13+
src = new StandardTokenizer();
14+
}
15+
16+
@Override
17+
protected Analyzer.TokenStreamComponents createComponents(final String fieldName) {
18+
return new TokenStreamComponents(
19+
r -> {
20+
src.setMaxTokenLength(DEFAULT_MAX_TOKEN_LENGTH);
21+
src.setReader(r);
22+
},
23+
new StopFilter(src, stopwords));
24+
}
25+
}

0 commit comments

Comments
 (0)