Skip to content

Commit e6f9ef7

Browse files
committed
speed! queries much faster now
1 parent 83ede0c commit e6f9ef7

File tree

3 files changed

+158
-78
lines changed

3 files changed

+158
-78
lines changed

config/geocodeur.conf

+1-1
Original file line number | Diff line number | Diff line change
@@ -18,7 +18,7 @@
1818
"timeoutSeconds": 30
1919
},
2020
"api": {
21-
"similarityThreshold": 0.6,
21+
"similarityThreshold": 0.8,
2222
},
2323
"database": {
2424
"name": "geocodeur",

src/database/setup.go

+93-15
Original file line number | Diff line number | Diff line change
@@ -84,6 +84,12 @@ func CreateDB(config settings.Config) {
8484
log.Fatalf("Failed to create index: %v", err)
8585
}
8686

87+
log.Info("Creating search rank index")
88+
err = createIndexRank(pool)
89+
if err != nil {
90+
log.Fatalf("Failed to create index: %v", err)
91+
}
92+
8793
log.Info("Creating search trgm index")
8894
err = createIndexTrgm(pool)
8995
if err != nil {
@@ -171,20 +177,20 @@ func processParquet(pool *pgxpool.Pool, path string) {
171177

172178
func processAliases(tx pgx.Tx, rec Record, id uint64) {
173179
// Add name as alias
174-
addAlias(tx, rec.ID, rec.Name, id)
180+
addAlias(tx, rec, rec.Name, id)
175181

176182
// Add aliases for name aliases
177183
for name, alias := range aliases {
178184
if rec.Name == name {
179-
addAlias(tx, rec.ID, alias, id)
185+
addAlias(tx, rec, alias, id)
180186
}
181187
}
182188

183189
// Add embedding for truncated names
184190
for _, truncation := range truncations {
185191
if strings.Contains(rec.Name, truncation) {
186192
alias := strings.Trim(strings.Replace(rec.Name, truncation, "", 1), " ")
187-
addAlias(tx, rec.ID, alias, id)
193+
addAlias(tx, rec, alias, id)
188194
}
189195
}
190196

@@ -197,13 +203,13 @@ func processAliases(tx pgx.Tx, rec Record, id uint64) {
197203
}
198204

199205
alias := rec.Name + " " + relation
200-
addAlias(tx, rec.ID, alias, id)
206+
addAlias(tx, rec, alias, id)
201207

202208
// Add entry for relation aliases
203209
for name, alias := range aliases {
204210
if relation == name {
205211
aliasEmbedding := rec.Name + " " + alias
206-
addAlias(tx, rec.ID, aliasEmbedding, id)
212+
addAlias(tx, rec, aliasEmbedding, id)
207213
}
208214
}
209215
}
@@ -217,10 +223,15 @@ func addOvertureFeature(tx pgx.Tx, rec Record, recordId uint64) error {
217223
return err
218224
}
219225

220-
func addAlias(tx pgx.Tx, id, alias string, recordId uint64) error {
226+
func addAlias(tx pgx.Tx, rec Record, alias string, recordId uint64) error {
221227
alias = strings.ToLower(alias)
222-
query := fmt.Sprintf(`INSERT INTO %s (feature_id, alias) VALUES ($1, $2)`, TABLE_SEARCH)
223-
_, err := tx.Exec(context.Background(), query, recordId, alias)
228+
classRank := getClassRank(rec.Class)
229+
subclassRank := getSubclassScore(rec.Subclass)
230+
wordCount := len(strings.Split(alias, " "))
231+
charCount := len(alias)
232+
233+
query := fmt.Sprintf(`INSERT INTO %s (feature_id, alias, class_rank, subclass_rank, word_count, char_count) VALUES ($1, $2, $3, $4, $5, $6)`, TABLE_SEARCH)
234+
_, err := tx.Exec(context.Background(), query, recordId, alias, classRank, subclassRank, wordCount, charCount)
224235

225236
return err
226237
}
@@ -240,6 +251,13 @@ func vacuum(pool *pgxpool.Pool) error {
240251
}
241252

242253
func setupDatabase(pool *pgxpool.Pool, schema string) error {
254+
if schema != "" {
255+
_, err := pool.Exec(context.Background(), fmt.Sprintf("CREATE SCHEMA IF NOT EXISTS %s;", schema))
256+
if err != nil {
257+
return fmt.Errorf("failed to create schema: %v", err)
258+
}
259+
}
260+
243261
queryExtensions := `
244262
CREATE EXTENSION IF NOT EXISTS postgis;
245263
CREATE EXTENSION IF NOT EXISTS pg_trgm;
@@ -250,12 +268,7 @@ func setupDatabase(pool *pgxpool.Pool, schema string) error {
250268
return fmt.Errorf("failed to create extensions: %v", err)
251269
}
252270

253-
if schema == "" {
254-
return nil
255-
}
256-
257-
_, err = pool.Exec(context.Background(), fmt.Sprintf("CREATE SCHEMA IF NOT EXISTS %s;", schema))
258-
return err
271+
return nil
259272
}
260273

261274
func createTableOverture(pool *pgxpool.Pool, tablespace string) error {
@@ -296,7 +309,11 @@ func createTableSearch(pool *pgxpool.Pool, tablespace string) error {
296309
297310
CREATE TABLE %[1]s (
298311
feature_id BIGINT,
299-
alias TEXT
312+
alias TEXT,
313+
class_rank INT,
314+
subclass_rank INT,
315+
word_count INT,
316+
char_count INT
300317
) %[2]s;
301318
`, TABLE_SEARCH, tablespace)
302319

@@ -313,6 +330,15 @@ func createForeignKey(pool *pgxpool.Pool) error {
313330
return err
314331
}
315332

333+
func createIndexRank(pool *pgxpool.Pool) error {
334+
query := fmt.Sprintf(`
335+
CREATE INDEX IF NOT EXISTS idx_%[1]s_class_subclass ON %[1]s USING btree (class_rank, subclass_rank);
336+
`, TABLE_SEARCH)
337+
338+
_, err := pool.Exec(context.Background(), query)
339+
return err
340+
}
341+
316342
func createIndexGeom(pool *pgxpool.Pool) error {
317343
query := fmt.Sprintf(`
318344
CREATE INDEX IF NOT EXISTS idx_%[1]s_geom ON %[1]s USING GIST (geom);
@@ -341,3 +367,55 @@ func createFTSVectorColumn(pool *pgxpool.Pool) error {
341367
_, err := pool.Exec(context.Background(), query)
342368
return err
343369
}
370+
371+
// getClassRank returns the search rank stored in the class_rank column for a
// feature class. Classes that are not listed get rank 100.
func getClassRank(class string) int {
	rank := 100

	switch class {
	case "division", "water":
		// Many division names partly consist of a water name (maas, ijssel,
		// etc.), so water is ranked the same as division.
		rank = 1
	case "road":
		rank = 2
	case "infra":
		rank = 3
	case "address":
		rank = 4
	case "zipcode":
		rank = 5
	case "poi":
		rank = 6
	}

	return rank
}
391+
392+
// getSubclassScore returns the search rank stored in the subclass_rank column
// for a feature subclass. Subclasses that are not listed get rank 100.
//
// Two independent groups share the same small numbers: area-like subclasses
// (locality through microhood) and road subclasses (motorway through
// living_street).
func getSubclassScore(subclass string) int {
	switch subclass {
	case "locality":
		return 1
	case "county":
		return 2
	case "neighborhood", "neighboorhood":
		// The misspelled form is what this function originally matched and is
		// kept for backward compatibility; the correctly spelled form is added
		// so such subclasses no longer fall through to the default rank.
		return 3
	case "microhood":
		return 4

	// Roads are ranked up to living_street; any other subclass gets the
	// default rank.
	case "motorway":
		return 1
	case "trunk":
		return 2
	case "primary":
		return 3
	case "secondary":
		return 4
	case "tertiary":
		return 5
	case "unclassified", "residential", "living_street":
		return 6

	default:
		return 100
	}
}

src/service/geocode.go

+64-62
Original file line number | Diff line number | Diff line change
@@ -148,83 +148,85 @@ func parseGeocodeResults(rows pgx.Rows) ([]GeocodeResult, error) {
148148
func createGeocodeQuery(options GeocodeOptions, input string) string {
149149
classesIn := options.ClassesToSqlArray()
150150

151+
// workaround for now since we do not have class in the search table
152+
classesIn = strings.Replace(classesIn, "'division'", "1", -1)
153+
classesIn = strings.Replace(classesIn, "'water'", "1", -1)
154+
classesIn = strings.Replace(classesIn, "'road'", "2", -1)
155+
classesIn = strings.Replace(classesIn, "'infra'", "3", -1)
156+
classesIn = strings.Replace(classesIn, "'address'", "4", -1)
157+
classesIn = strings.Replace(classesIn, "'zipcode'", "5", -1)
158+
classesIn = strings.Replace(classesIn, "'poi'", "6", -1)
159+
151160
// Conditional geometry column
152161
geometryColumn := "'' AS geom" // Default to an empty string if geometry is not included
153162
if options.IncludeGeometry {
154-
geometryColumn = "ST_AsGeoJSON(a.geom) AS geom"
163+
geometryColumn = "ST_AsGeoJSON(b.geom) AS geom"
155164
}
156165

157166
return fmt.Sprintf(`
158167
WITH fts AS (
159-
SELECT feature_id, alias, similarity(alias, $1) AS sim, 'fts' as search
160-
FROM %s AS a
161-
JOIN %s AS b ON a.feature_id = b.id
162-
WHERE a.vector_search @@ to_tsquery('simple',
163-
replace($1, ' ', ':* & ') || ':*'
164-
)
165-
AND b.class IN %s
166-
ORDER BY sim
168+
SELECT
169+
feature_id, alias, class_rank, subclass_rank, 'fts' as search
170+
FROM
171+
%[1]s
172+
WHERE
173+
ABS(word_count - array_length(string_to_array($1, ' '), 1)) < 3
174+
AND
175+
ABS(char_count - LENGTH($1)) < 30
176+
AND
177+
vector_search @@ to_tsquery('simple', replace($1, ' ', ':* & ') || ':*')
178+
AND
179+
class_rank IN %[3]s
180+
ORDER BY
181+
class_rank ASC,
182+
subclass_rank ASC
183+
LIMIT 100
167184
),
168185
trgm AS (
169-
SELECT feature_id, alias, similarity(a.alias, $1) AS sim, 'trgm' as search
170-
FROM %s AS a
171-
JOIN %s AS b ON a.feature_id = b.id
172-
WHERE a.alias %% $1
173-
AND b.class IN %s
174-
ORDER BY a.alias <-> $1
186+
SELECT feature_id, alias, class_rank, subclass_rank, 'trgm' as search
187+
FROM %[1]s
188+
WHERE
189+
ABS(word_count - array_length(string_to_array($1, ' '), 1)) < 3
190+
AND
191+
ABS(char_count - LENGTH($1)) < 30
192+
AND
193+
alias %% $1
194+
AND
195+
class_rank IN %[3]s
196+
ORDER BY
197+
class_rank ASC,
198+
subclass_rank ASC
199+
LIMIT 100
175200
),
176-
alias_results AS (
201+
search_results AS (
177202
SELECT *
178203
FROM fts
179204
UNION ALL
180205
SELECT *
181206
FROM trgm
182207
WHERE NOT EXISTS (SELECT 1 FROM fts)
183-
), ranked_aliases AS (
184-
SELECT
185-
a.id,
186-
a.name,
187-
a.class,
188-
a.subclass,
189-
array_to_string(a.divisions, ',') AS divisions,
190-
b.alias,
191-
b.sim,
192-
b.search,
193-
%s, -- Geometry column is dynamically included or excluded
194-
CASE
195-
WHEN a.class = 'division' THEN 1
196-
WHEN a.class = 'water' THEN 2
197-
WHEN a.class = 'road' THEN 3
198-
WHEN a.class = 'infra' THEN 4
199-
WHEN a.class = 'address' THEN 5
200-
WHEN a.class = 'zipcode' THEN 6
201-
WHEN a.class = 'poi' THEN 7
202-
ELSE 100
203-
END AS class_score,
204-
CASE
205-
WHEN a.subclass = 'locality' THEN 1
206-
WHEN a.subclass = 'county' THEN 2
207-
WHEN a.subclass = 'neighboorhood' THEN 3
208-
WHEN a.subclass = 'microhood' THEN 4
209-
-- roads up to living_street the rest gets a high score
210-
WHEN a.subclass = 'motorway' THEN 1
211-
WHEN a.subclass = 'trunk' THEN 2
212-
WHEN a.subclass = 'primary' THEN 3
213-
WHEN a.subclass = 'secondary' THEN 4
214-
WHEN a.subclass = 'tertiary' THEN 5
215-
WHEN a.subclass = 'unclassified' THEN 6
216-
WHEN a.subclass = 'residential' THEN 7
217-
WHEN a.subclass = 'living_street' THEN 8
218-
ELSE 100
219-
END AS subclass_score,
220-
ROW_NUMBER() OVER (PARTITION BY a.id ORDER BY similarity(b.alias, $1) DESC) AS rnk
221-
FROM %s a
222-
JOIN alias_results b ON a.id = b.feature_id
208+
),
209+
similarity as (
210+
select
211+
feature_id,
212+
alias,
213+
class_rank,
214+
subclass_rank,
215+
similarity(alias, $1) AS sim,
216+
search,
217+
ROW_NUMBER() OVER (PARTITION BY feature_id ORDER BY similarity(alias, $1) DESC) AS rnk
218+
from search_results
223219
)
224-
SELECT id, name, class, subclass, divisions, alias, search, sim, geom
225-
FROM ranked_aliases
226-
WHERE rnk = 1
227-
ORDER BY sim desc, class_score asc, subclass_score asc
228-
LIMIT %v;`,
229-
database.TABLE_SEARCH, database.TABLE_OVERTURE, classesIn, database.TABLE_SEARCH, database.TABLE_OVERTURE, classesIn, geometryColumn, database.TABLE_OVERTURE, options.Limit)
220+
SELECT
221+
b.id, b.name, b.class, b.subclass, b.divisions::varchar, a.alias, a.search, a.sim, %[4]s
222+
FROM similarity AS a
223+
INNER JOIN
224+
%[2]s AS b ON a.feature_id = b.id
225+
WHERE a.rnk = 1
226+
ORDER by
227+
sim desc,
228+
class_rank asc,
229+
subclass_rank asc
230+
LIMIT %[5]v;`,
231+
database.TABLE_SEARCH, database.TABLE_OVERTURE, classesIn, geometryColumn, options.Limit)
230232
}

0 commit comments

Comments
 (0)