Skip to content

Commit

Permalink
polish: more polishing work
Browse files Browse the repository at this point in the history
Touching up docs, restyling code in places, etc.
  • Loading branch information
BurntSushi committed Mar 7, 2020
1 parent 8a368ba commit 57e588c
Show file tree
Hide file tree
Showing 16 changed files with 153 additions and 176 deletions.
1 change: 1 addition & 0 deletions Cargo.toml
Original file line number Diff line number Diff line change
Expand Up @@ -26,6 +26,7 @@ levenshtein = ["utf8-ranges"]
utf8-ranges = { version = "1.0.4", optional = true }

[dev-dependencies]
doc-comment = "0.3.1"
fnv = "1.0.6"
memmap = "0.7"
quickcheck = { version = "0.9.2", default-features = false }
Expand Down
39 changes: 16 additions & 23 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -23,11 +23,10 @@ Dual-licensed under MIT or the [UNLICENSE](http://unlicense.org).
https://docs.rs/fst

The
[`fst-regex`](https://docs.rs/fst-regex)
and
[`fst-levenshtein`](https://docs.rs/fst-levenshtein)
crates provide regular expression matching and fuzzy searching on FSTs,
respectively.
[`regex-automata`](https://docs.rs/regex-automata)
crate provides implementations of the `fst::Automaton` trait when its
`transducer` feature is enabled. This permits using DFAs compiled by
`regex-automata` to search finite state transducers produced by this crate.


### Installation
Expand All @@ -36,27 +35,21 @@ Simply add a corresponding entry to your `Cargo.toml` dependency list:

```toml,ignore
[dependencies]
fst = "0.3"
fst = "0.4"
```


### Example

This example demonstrates building a set in memory and executing a fuzzy query
against it. You'll need `fst = "0.3"` and `fst-levenshtein = "0.2"` in your
`Cargo.toml`.
against it. You'll need `fst = "0.4"` with the `levenshtein` feature enabled in
your `Cargo.toml`.

```rust
extern crate fst;
extern crate fst_levenshtein;

use std::error::Error;
use std::process;

use fst::{IntoStreamer, Set};
use fst_levenshtein::Levenshtein;
use fst::automaton::Levenshtein;

fn try_main() -> Result<(), Box<Error>> {
fn main() -> Result<(), Box<dyn std::error::Error>> {
// A convenient way to create sets in memory.
let keys = vec!["fa", "fo", "fob", "focus", "foo", "food", "foul"];
let set = Set::from_iter(keys)?;
Expand All @@ -71,13 +64,13 @@ fn try_main() -> Result<(), Box<Error>> {
assert_eq!(keys, vec!["fo", "fob", "foo", "food"]);
Ok(())
}

fn main() {
if let Err(err) = try_main() {
eprintln!("{}", err);
process::exit(1);
}
}
```

Check out the documentation for a lot more examples!


### Cargo features

* `levenshtein` - **Disabled** by default. This adds the `Levenshtein`
automaton to the `automaton` sub-module. This includes an additional
dependency on `utf8-ranges`.
18 changes: 9 additions & 9 deletions fst-bin/src/merge.rs
Original file line number Diff line number Diff line change
Expand Up @@ -32,7 +32,7 @@ impl<I> Merger<I>
where
I: Iterator<Item = Result<(BString, u64), Error>> + Send + 'static,
{
pub fn new<T, P>(it: T, output: P) -> Self
pub fn new<T, P>(it: T, output: P) -> Merger<I>
where
P: AsRef<Path>,
T: IntoIterator<IntoIter = I, Item = I::Item>,
Expand All @@ -49,35 +49,35 @@ where
}
}

pub fn value_merger<F>(mut self, f: F) -> Self
pub fn value_merger<F>(mut self, f: F) -> Merger<I>
where
F: Fn(u64, u64) -> u64 + Send + Sync + 'static,
{
self.value_merger = Some(Arc::new(f));
self
}

pub fn fd_limit(mut self, fd_limit: u32) -> Self {
pub fn fd_limit(mut self, fd_limit: u32) -> Merger<I> {
self.fd_limit = fd_limit;
self
}

pub fn batch_size(mut self, batch_size: u32) -> Self {
pub fn batch_size(mut self, batch_size: u32) -> Merger<I> {
self.batch_size = batch_size;
self
}

pub fn threads(mut self, threads: u32) -> Self {
pub fn threads(mut self, threads: u32) -> Merger<I> {
self.threads = threads;
self
}

pub fn tmp_dir<P: AsRef<Path>>(mut self, path: P) -> Self {
pub fn tmp_dir<P: AsRef<Path>>(mut self, path: P) -> Merger<I> {
self.tmp_dir = path.as_ref().to_path_buf();
self
}

pub fn keep_tmp_dir(mut self, yes: bool) -> Self {
pub fn keep_tmp_dir(mut self, yes: bool) -> Merger<I> {
self.keep_tmp_dir = yes;
self
}
Expand Down Expand Up @@ -117,7 +117,7 @@ where
for (i, union_batch) in batches.into_iter().enumerate() {
sorters.create_fst(UnionBatch {
tmp_dir: tmp_dir_path.clone(),
gen: gen,
gen,
index: i,
fsts: union_batch?,
value_merger: self.value_merger.clone(),
Expand Down Expand Up @@ -178,7 +178,7 @@ struct Sorters<B> {
}

impl<B: Batchable + Send + 'static> Sorters<B> {
fn new(threads: u32) -> Self {
fn new(threads: u32) -> Sorters<B> {
let (bsend, brecv) = chan::bounded::<B>(0);
let (rsend, rrecv) = chan::bounded(0);
for _ in 0..threads {
Expand Down
8 changes: 4 additions & 4 deletions fst-bin/src/util.rs
Original file line number Diff line number Diff line change
Expand Up @@ -103,14 +103,14 @@ type Lines = bstr::io::ByteLines<
impl ConcatLines {
pub fn new(mut inputs: Vec<PathBuf>) -> ConcatLines {
inputs.reverse(); // treat it as a stack
ConcatLines { inputs: inputs, cur: None }
ConcatLines { inputs, cur: None }
}
}

impl Iterator for ConcatLines {
type Item = io::Result<BString>;

fn next(&mut self) -> Option<Self::Item> {
fn next(&mut self) -> Option<io::Result<BString>> {
loop {
if self.cur.is_none() {
match self.inputs.pop() {
Expand Down Expand Up @@ -143,7 +143,7 @@ type Rows = csv::DeserializeRecordsIntoIter<Reader, (BString, u64)>;
impl ConcatCsv {
pub fn new(mut inputs: Vec<PathBuf>) -> ConcatCsv {
inputs.reverse(); // treat it as a stack
ConcatCsv { inputs: inputs, cur: None }
ConcatCsv { inputs, cur: None }
}

fn read_row(&mut self) -> Option<Result<(BString, u64), Error>> {
Expand All @@ -162,7 +162,7 @@ impl ConcatCsv {
impl Iterator for ConcatCsv {
type Item = Result<(BString, u64), Error>;

fn next(&mut self) -> Option<Self::Item> {
fn next(&mut self) -> Option<Result<(BString, u64), Error>> {
loop {
if self.cur.is_none() {
match self.inputs.pop() {
Expand Down
17 changes: 6 additions & 11 deletions fst-levenshtein/src/lib.rs
Original file line number Diff line number Diff line change
@@ -1,6 +1,3 @@



use std::cmp;
use std::collections::hash_map::Entry;
use std::collections::{HashMap, HashSet};
Expand Down Expand Up @@ -90,13 +87,13 @@ impl Levenshtein {
/// A `Levenshtein` value satisfies the `Automaton` trait, which means it
/// can be used with the `search` method of any finite state transducer.
#[inline]
pub fn new(query: &str, distance: u32) -> Result<Self, Error> {
pub fn new(query: &str, distance: u32) -> Result<Levenshtein, Error> {
let lev = DynamicLevenshtein {
query: query.to_owned(),
dist: distance as usize,
};
let dfa = DfaBuilder::new(lev.clone()).build()?;
Ok(Levenshtein { prog: lev, dfa: dfa })
Ok(Levenshtein { prog: lev, dfa })
}
}

Expand Down Expand Up @@ -197,10 +194,10 @@ struct DfaBuilder {
}

impl DfaBuilder {
fn new(lev: DynamicLevenshtein) -> Self {
fn new(lev: DynamicLevenshtein) -> DfaBuilder {
DfaBuilder {
dfa: Dfa { states: Vec::with_capacity(16) },
lev: lev,
lev,
cache: HashMap::with_capacity(1024),
}
}
Expand Down Expand Up @@ -251,9 +248,7 @@ impl DfaBuilder {
Entry::Occupied(v) => (*v.get(), true),
Entry::Vacant(v) => {
let is_match = self.lev.is_match(lev_state);
self.dfa
.states
.push(State { next: [None; 256], is_match: is_match });
self.dfa.states.push(State { next: [None; 256], is_match });
(*v.insert(self.dfa.states.len() - 1), false)
}
})
Expand Down Expand Up @@ -314,7 +309,7 @@ impl DfaBuilder {
}

fn new_state(&mut self, is_match: bool) -> usize {
self.dfa.states.push(State { next: [None; 256], is_match: is_match });
self.dfa.states.push(State { next: [None; 256], is_match });
self.dfa.states.len() - 1
}
}
7 changes: 5 additions & 2 deletions src/automaton/levenshtein.rs
Original file line number Diff line number Diff line change
Expand Up @@ -109,7 +109,10 @@ impl Levenshtein {
/// A `Levenshtein` value satisfies the `Automaton` trait, which means it
/// can be used with the `search` method of any finite state transducer.
#[inline]
pub fn new(query: &str, distance: u32) -> Result<Self, LevenshteinError> {
pub fn new(
query: &str,
distance: u32,
) -> Result<Levenshtein, LevenshteinError> {
let lev = DynamicLevenshtein {
query: query.to_owned(),
dist: distance as usize,
Expand Down Expand Up @@ -216,7 +219,7 @@ struct DfaBuilder {
}

impl DfaBuilder {
fn new(lev: DynamicLevenshtein) -> Self {
fn new(lev: DynamicLevenshtein) -> DfaBuilder {
DfaBuilder {
dfa: Dfa { states: Vec::with_capacity(16) },
lev,
Expand Down
47 changes: 21 additions & 26 deletions src/automaton/mod.rs
Original file line number Diff line number Diff line change
@@ -1,6 +1,5 @@
#[cfg(feature = "levenshtein")]
pub use self::levenshtein::{Levenshtein, LevenshteinError};
use self::StartsWithStateInternal::*;

#[cfg(feature = "levenshtein")]
mod levenshtein;
Expand Down Expand Up @@ -109,23 +108,23 @@ pub trait Automaton {
impl<'a, T: Automaton> Automaton for &'a T {
type State = T::State;

fn start(&self) -> Self::State {
fn start(&self) -> T::State {
(*self).start()
}

fn is_match(&self, state: &Self::State) -> bool {
fn is_match(&self, state: &T::State) -> bool {
(*self).is_match(state)
}

fn can_match(&self, state: &Self::State) -> bool {
fn can_match(&self, state: &T::State) -> bool {
(*self).can_match(state)
}

fn will_always_match(&self, state: &Self::State) -> bool {
fn will_always_match(&self, state: &T::State) -> bool {
(*self).will_always_match(state)
}

fn accept(&self, state: &Self::State, byte: u8) -> Self::State {
fn accept(&self, state: &T::State, byte: u8) -> T::State {
(*self).accept(state, byte)
}
}
Expand All @@ -138,13 +137,11 @@ impl<'a, T: Automaton> Automaton for &'a T {
/// ```rust
/// extern crate fst;
///
/// use std::error::Error;
///
/// use fst::{Automaton, IntoStreamer, Streamer, Set};
/// use fst::automaton::Str;
///
/// # fn main() { example().unwrap(); }
/// fn example() -> Result<(), Box<Error>> {
/// fn example() -> Result<(), Box<dyn std::error::Error>> {
/// let paths = vec!["/home/projects/bar", "/home/projects/foo", "/tmp/foo"];
/// let set = Set::from_iter(paths)?;
///
Expand Down Expand Up @@ -212,13 +209,11 @@ impl<'a> Automaton for Str<'a> {
/// ```rust
/// extern crate fst;
///
/// use std::error::Error;
///
/// use fst::{IntoStreamer, Streamer, Set};
/// use fst::automaton::Subsequence;
///
/// # fn main() { example().unwrap(); }
/// fn example() -> Result<(), Box<Error>> {
/// fn example() -> Result<(), Box<dyn std::error::Error>> {
/// let paths = vec!["/home/projects/bar", "/home/projects/foo", "/tmp/foo"];
/// let set = Set::from_iter(paths)?;
///
Expand Down Expand Up @@ -317,9 +312,9 @@ impl Automaton for AlwaysMatch {
pub struct StartsWith<A>(A);

/// The `Automaton` state for `StartsWith<A>`.
pub struct StartsWithState<A: Automaton>(StartsWithStateInternal<A>);
pub struct StartsWithState<A: Automaton>(StartsWithStateKind<A>);

enum StartsWithStateInternal<A: Automaton> {
enum StartsWithStateKind<A: Automaton> {
Done,
Running(A::State),
}
Expand All @@ -331,31 +326,31 @@ impl<A: Automaton> Automaton for StartsWith<A> {
StartsWithState({
let inner = self.0.start();
if self.0.is_match(&inner) {
Done
StartsWithStateKind::Done
} else {
Running(inner)
StartsWithStateKind::Running(inner)
}
})
}

fn is_match(&self, state: &StartsWithState<A>) -> bool {
match state.0 {
Done => true,
Running(_) => false,
StartsWithStateKind::Done => true,
StartsWithStateKind::Running(_) => false,
}
}

fn can_match(&self, state: &StartsWithState<A>) -> bool {
match state.0 {
Done => true,
Running(ref inner) => self.0.can_match(inner),
StartsWithStateKind::Done => true,
StartsWithStateKind::Running(ref inner) => self.0.can_match(inner),
}
}

fn will_always_match(&self, state: &StartsWithState<A>) -> bool {
match state.0 {
Done => true,
Running(_) => false,
StartsWithStateKind::Done => true,
StartsWithStateKind::Running(_) => false,
}
}

Expand All @@ -365,13 +360,13 @@ impl<A: Automaton> Automaton for StartsWith<A> {
byte: u8,
) -> StartsWithState<A> {
StartsWithState(match state.0 {
Done => Done,
Running(ref inner) => {
StartsWithStateKind::Done => StartsWithStateKind::Done,
StartsWithStateKind::Running(ref inner) => {
let next_inner = self.0.accept(inner, byte);
if self.0.is_match(&next_inner) {
Done
StartsWithStateKind::Done
} else {
Running(next_inner)
StartsWithStateKind::Running(next_inner)
}
}
})
Expand Down
Loading

0 comments on commit 57e588c

Please sign in to comment.