Skip to content

Commit

Permalink
add PhrasePrefixQuery (#1842)
Browse files Browse the repository at this point in the history
* add PhrasePrefixQuery
  • Loading branch information
trinity-1686a authored Feb 22, 2023
1 parent c7278b3 commit 533ad99
Show file tree
Hide file tree
Showing 7 changed files with 696 additions and 2 deletions.
2 changes: 2 additions & 0 deletions src/query/mod.rs
Original file line number Diff line number Diff line change
Expand Up @@ -12,6 +12,7 @@ mod explanation;
mod fuzzy_query;
mod intersection;
mod more_like_this;
mod phrase_prefix_query;
mod phrase_query;
mod query;
mod query_parser;
Expand Down Expand Up @@ -47,6 +48,7 @@ pub(crate) use self::fuzzy_query::DfaWrapper;
pub use self::fuzzy_query::FuzzyTermQuery;
pub use self::intersection::{intersect_scorers, Intersection};
pub use self::more_like_this::{MoreLikeThisQuery, MoreLikeThisQueryBuilder};
pub use self::phrase_prefix_query::PhrasePrefixQuery;
pub use self::phrase_query::PhraseQuery;
pub use self::query::{EnableScoring, Query, QueryClone};
pub use self::query_parser::{QueryParser, QueryParserError};
Expand Down
34 changes: 34 additions & 0 deletions src/query/phrase_prefix_query/mod.rs
Original file line number Diff line number Diff line change
@@ -0,0 +1,34 @@
mod phrase_prefix_query;
mod phrase_prefix_scorer;
mod phrase_prefix_weight;

pub use phrase_prefix_query::PhrasePrefixQuery;
pub use phrase_prefix_scorer::PhrasePrefixScorer;
pub use phrase_prefix_weight::PhrasePrefixWeight;

fn prefix_end(prefix_start: &[u8]) -> Option<Vec<u8>> {
let mut res = prefix_start.to_owned();
while !res.is_empty() {
let end = res.len() - 1;
if res[end] == u8::MAX {
res.pop();
} else {
res[end] += 1;
return Some(res);
}
}
None
}

#[cfg(test)]
mod tests {
use super::*;

#[test]
fn test_prefix_end() {
assert_eq!(prefix_end(b"aaa"), Some(b"aab".to_vec()));
assert_eq!(prefix_end(b"aa\xff"), Some(b"ab".to_vec()));
assert_eq!(prefix_end(b"a\xff\xff"), Some(b"b".to_vec()));
assert_eq!(prefix_end(b"\xff\xff\xff"), None);
}
}
168 changes: 168 additions & 0 deletions src/query/phrase_prefix_query/phrase_prefix_query.rs
Original file line number Diff line number Diff line change
@@ -0,0 +1,168 @@
use std::ops::Bound;

use super::{prefix_end, PhrasePrefixWeight};
use crate::query::bm25::Bm25Weight;
use crate::query::{EnableScoring, Query, RangeQuery, Weight};
use crate::schema::{Field, IndexRecordOption, Term};

const DEFAULT_MAX_EXPANSIONS: u32 = 50;

/// `PhrasePrefixQuery` matches a specific sequence of words followed by term of which only a
/// prefix is known.
///
/// For instance the phrase prefix query for `"part t"` will match
/// the sentence
///
/// **Alan just got a part time job.**
///
/// On the other hand it will not match the sentence.
///
/// **This is my favorite part of the job.**
///
/// Using a `PhrasePrefixQuery` on a field requires positions
/// to be indexed for this field.
#[derive(Clone, Debug)]
pub struct PhrasePrefixQuery {
field: Field,
phrase_terms: Vec<(usize, Term)>,
prefix: (usize, Term),
max_expansions: u32,
}

impl PhrasePrefixQuery {
/// Creates a new `PhrasePrefixQuery` given a list of terms.
///
/// There must be at least two terms, and all terms
/// must belong to the same field.
/// Offset for each term will be same as index in the Vector
/// The last Term is a prefix and not a full value
pub fn new(terms: Vec<Term>) -> PhrasePrefixQuery {
let terms_with_offset = terms.into_iter().enumerate().collect();
PhrasePrefixQuery::new_with_offset(terms_with_offset)
}

/// Creates a new `PhrasePrefixQuery` given a list of terms and their offsets.
///
/// Can be used to provide custom offset for each term.
pub fn new_with_offset(mut terms: Vec<(usize, Term)>) -> PhrasePrefixQuery {
assert!(
!terms.is_empty(),
"A phrase prefix query is required to have at least one term."
);
terms.sort_by_key(|&(offset, _)| offset);
let field = terms[0].1.field();
assert!(
terms[1..].iter().all(|term| term.1.field() == field),
"All terms from a phrase query must belong to the same field"
);
PhrasePrefixQuery {
field,
prefix: terms.pop().unwrap(),
phrase_terms: terms,
max_expansions: DEFAULT_MAX_EXPANSIONS,
}
}

/// Maximum number of terms to which the last provided term will expand.
pub fn set_max_expansions(&mut self, value: u32) {
self.max_expansions = value;
}

/// The [`Field`] this `PhrasePrefixQuery` is targeting.
pub fn field(&self) -> Field {
self.field
}

/// `Term`s in the phrase without the associated offsets.
pub fn phrase_terms(&self) -> Vec<Term> {
// TODO should we include the last term too?
self.phrase_terms
.iter()
.map(|(_, term)| term.clone())
.collect::<Vec<Term>>()
}

/// Returns the [`PhrasePrefixWeight`] for the given phrase query given a specific `searcher`.
///
/// This function is the same as [`Query::weight()`] except it returns
/// a specialized type [`PhraseQueryWeight`] instead of a Boxed trait.
/// If the query was only one term long, this returns `None` wherease [`Query::weight`]
/// returns a boxed [`RangeWeight`]
///
/// Returns `None`, if phrase_terms is empty, which happens if the phrase prefix query was
/// built with a single term.
pub(crate) fn phrase_prefix_query_weight(
&self,
enable_scoring: EnableScoring<'_>,
) -> crate::Result<Option<PhrasePrefixWeight>> {
if self.phrase_terms.is_empty() {
return Ok(None);
}
let schema = enable_scoring.schema();
let field_entry = schema.get_field_entry(self.field);
let has_positions = field_entry
.field_type()
.get_index_record_option()
.map(IndexRecordOption::has_positions)
.unwrap_or(false);
if !has_positions {
let field_name = field_entry.name();
return Err(crate::TantivyError::SchemaError(format!(
"Applied phrase query on field {:?}, which does not have positions indexed",
field_name
)));
}
let terms = self.phrase_terms();
let bm25_weight_opt = match enable_scoring {
EnableScoring::Enabled { searcher, .. } => {
Some(Bm25Weight::for_terms(searcher, &terms)?)
}
EnableScoring::Disabled { .. } => None,
};
let weight = PhrasePrefixWeight::new(
self.phrase_terms.clone(),
self.prefix.clone(),
bm25_weight_opt,
self.max_expansions,
);
Ok(Some(weight))
}
}

impl Query for PhrasePrefixQuery {
/// Create the weight associated with a query.
///
/// See [`Weight`].
fn weight(&self, enable_scoring: EnableScoring<'_>) -> crate::Result<Box<dyn Weight>> {
if let Some(phrase_weight) = self.phrase_prefix_query_weight(enable_scoring)? {
Ok(Box::new(phrase_weight))
} else {
// There are no prefix. Let's just match the suffix.
let end_term = if let Some(end_value) = prefix_end(&self.prefix.1.value_bytes()) {
let mut end_term = Term::with_capacity(end_value.len());
end_term.set_field_and_type(self.field, self.prefix.1.typ());
end_term.append_bytes(&end_value);
Bound::Excluded(end_term)
} else {
Bound::Unbounded
};

RangeQuery::new_term_bounds(
enable_scoring
.schema()
.get_field_name(self.field)
.to_owned(),
self.prefix.1.typ(),
&Bound::Included(self.prefix.1.clone()),
&end_term,
)
.weight(enable_scoring)
}
}

fn query_terms<'a>(&'a self, visitor: &mut dyn FnMut(&'a Term, bool)) {
for (_, term) in &self.phrase_terms {
visitor(term, true);
}
}
}
Loading

0 comments on commit 533ad99

Please sign in to comment.