From 0cbe12f3416781b5ed3583105ab707e3eb79c2ad Mon Sep 17 00:00:00 2001 From: Andrew Gallant Date: Sun, 23 Feb 2020 11:58:45 -0500 Subject: [PATCH 01/23] cleanup: remove superfluous files I stopped using this stuff a very long time ago. --- Makefile | 14 -------------- ctags.rust | 11 ----------- session.vim | 1 - 3 files changed, 26 deletions(-) delete mode 100644 Makefile delete mode 100644 ctags.rust delete mode 100644 session.vim diff --git a/Makefile b/Makefile deleted file mode 100644 index 45247d9..0000000 --- a/Makefile +++ /dev/null @@ -1,14 +0,0 @@ -all: - echo Nothing to do... - -ctags: - ctags --options=ctags.rust --languages=Rust src/*.rs src/*/*.rs - -docs: - cargo doc - in-dir ./target/doc fix-perms - rscp ./target/doc/* gopher:~/www/burntsushi.net/rustdoc/ - -install: - cargo build --manifest-path ./fst-bin/Cargo.toml --release - cp fst-bin/target/release/fst $(CARGO_INSTALL_ROOT)/bin/ diff --git a/ctags.rust b/ctags.rust deleted file mode 100644 index b42edf7..0000000 --- a/ctags.rust +++ /dev/null @@ -1,11 +0,0 @@ ---langdef=Rust ---langmap=Rust:.rs ---regex-Rust=/^[ \t]*(#\[[^\]]\][ \t]*)*(pub[ \t]+)?(extern[ \t]+)?("[^"]+"[ \t]+)?(unsafe[ \t]+)?fn[ \t]+([a-zA-Z0-9_]+)/\6/f,functions,function definitions/ ---regex-Rust=/^[ \t]*(pub[ \t]+)?type[ \t]+([a-zA-Z0-9_]+)/\2/T,types,type definitions/ ---regex-Rust=/^[ \t]*(pub[ \t]+)?enum[ \t]+([a-zA-Z0-9_]+)/\2/g,enum,enumeration names/ ---regex-Rust=/^[ \t]*(pub[ \t]+)?struct[ \t]+([a-zA-Z0-9_]+)/\2/s,structure names/ ---regex-Rust=/^[ \t]*(pub[ \t]+)?mod[ \t]+([a-zA-Z0-9_]+)/\2/m,modules,module names/ ---regex-Rust=/^[ \t]*(pub[ \t]+)?static[ \t]+([a-zA-Z0-9_]+)/\2/c,consts,static constants/ ---regex-Rust=/^[ \t]*(pub[ \t]+)?trait[ \t]+([a-zA-Z0-9_]+)/\2/t,traits,traits/ ---regex-Rust=/^[ \t]*(pub[ \t]+)?impl([ \t\n]+<.*>)?[ \t]+([a-zA-Z0-9_]+)/\3/i,impls,trait implementations/ ---regex-Rust=/^[ \t]*macro_rules![ \t]+([a-zA-Z0-9_]+)/\1/d,macros,macro definitions/ diff --git a/session.vim b/session.vim 
deleted file mode 100644 index 213c956..0000000 --- a/session.vim +++ /dev/null @@ -1 +0,0 @@ -au BufWritePost *.rs silent!make ctags > /dev/null 2>&1 From 17b49ef763a98bc0cf30afd5e83bb4ef0695fc3f Mon Sep 17 00:00:00 2001 From: Andrew Gallant Date: Fri, 21 Feb 2020 16:45:04 -0500 Subject: [PATCH 02/23] style: switch to rustfmt --- benches/search.rs | 20 +- fst-levenshtein/src/error.rs | 10 +- fst-levenshtein/src/lib.rs | 51 ++-- fst-regex/src/compile.rs | 52 ++-- fst-regex/src/dfa.rs | 18 +- fst-regex/src/error.rs | 39 ++- fst-regex/src/lib.rs | 4 +- fst-regex/src/sparse.rs | 6 +- rustfmt.toml | 2 + src/automaton/mod.rs | 118 +++++---- src/lib.rs | 19 +- src/map.rs | 72 ++++-- src/raw/build.rs | 63 +++-- src/raw/common_inputs.rs | 484 +++++++++-------------------------- src/raw/counting_writer.rs | 8 +- src/raw/error.rs | 41 ++- src/raw/mmap.rs | 8 +- src/raw/mod.rs | 107 ++++---- src/raw/node.rs | 120 +++++---- src/raw/ops.rs | 114 +++++---- src/raw/pack.rs | 6 +- src/raw/registry.rs | 41 +-- src/raw/registry_minimal.rs | 2 +- src/raw/tests.rs | 63 +++-- src/set.rs | 87 ++++--- src/stream.rs | 2 +- tests/test.rs | 24 +- 27 files changed, 766 insertions(+), 815 deletions(-) create mode 100644 rustfmt.toml diff --git a/benches/search.rs b/benches/search.rs index 824bb77..5e4a315 100644 --- a/benches/search.rs +++ b/benches/search.rs @@ -2,7 +2,8 @@ extern crate fnv; extern crate fst; -#[macro_use] extern crate lazy_static; +#[macro_use] +extern crate lazy_static; extern crate test; const STR_WORDS: &'static str = include_str!("./../data/words-100000"); @@ -50,9 +51,8 @@ macro_rules! search { fn hash_fnv_contains(b: &mut Bencher) { type Fnv = BuildHasherDefault; lazy_static! { - static ref SET: HashSet = { - $keys.clone().into_iter().collect() - }; + static ref SET: HashSet = + { $keys.clone().into_iter().collect() }; } let mut i = 0; b.iter(|| { @@ -64,9 +64,8 @@ macro_rules! search { #[bench] fn hash_sip_contains(b: &mut Bencher) { lazy_static! 
{ - static ref SET: HashSet = { - $keys.clone().into_iter().collect() - }; + static ref SET: HashSet = + { $keys.clone().into_iter().collect() }; } let mut i = 0; b.iter(|| { @@ -78,9 +77,8 @@ macro_rules! search { #[bench] fn btree_contains(b: &mut Bencher) { lazy_static! { - static ref SET: BTreeSet = { - $keys.clone().into_iter().collect() - }; + static ref SET: BTreeSet = + { $keys.clone().into_iter().collect() }; } let mut i = 0; b.iter(|| { @@ -89,7 +87,7 @@ macro_rules! search { }) } } - } + }; } search!(words, ::WORDS); diff --git a/fst-levenshtein/src/error.rs b/fst-levenshtein/src/error.rs index 8233888..eda24fb 100644 --- a/fst-levenshtein/src/error.rs +++ b/fst-levenshtein/src/error.rs @@ -15,10 +15,12 @@ impl fmt::Display for Error { fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result { use self::Error::*; match *self { - TooManyStates(size_limit) => { - write!(f, "Levenshtein automaton exceeds size limit of \ - {} states", size_limit) - } + TooManyStates(size_limit) => write!( + f, + "Levenshtein automaton exceeds size limit of \ + {} states", + size_limit + ), } } } diff --git a/fst-levenshtein/src/lib.rs b/fst-levenshtein/src/lib.rs index 100025c..abbfc33 100644 --- a/fst-levenshtein/src/lib.rs +++ b/fst-levenshtein/src/lib.rs @@ -2,8 +2,8 @@ extern crate fst; extern crate utf8_ranges; use std::cmp; -use std::collections::{HashMap, HashSet}; use std::collections::hash_map::Entry; +use std::collections::{HashMap, HashSet}; use std::fmt; use utf8_ranges::{Utf8Range, Utf8Sequences}; @@ -96,17 +96,17 @@ impl Levenshtein { dist: distance as usize, }; let dfa = DfaBuilder::new(lev.clone()).build()?; - Ok(Levenshtein { - prog: lev, - dfa: dfa, - }) + Ok(Levenshtein { prog: lev, dfa: dfa }) } } impl fmt::Debug for Levenshtein { fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result { - write!(f, "Levenshtein(query: {:?}, distance: {:?})", - self.prog.query, self.prog.dist) + write!( + f, + "Levenshtein(query: {:?}, distance: {:?})", + self.prog.query, 
self.prog.dist + ) } } @@ -118,7 +118,7 @@ struct DynamicLevenshtein { impl DynamicLevenshtein { fn start(&self) -> Vec { - (0..self.query.chars().count()+1).collect() + (0..self.query.chars().count() + 1).collect() } fn is_match(&self, state: &[usize]) -> bool { @@ -130,10 +130,13 @@ impl DynamicLevenshtein { } fn accept(&self, state: &[usize], chr: Option) -> Vec { - let mut next = vec![state[0]+1]; + let mut next = vec![state[0] + 1]; for (i, c) in self.query.chars().enumerate() { let cost = if Some(c) == chr { 0 } else { 1 }; - let v = cmp::min(cmp::min(next[i]+1, state[i+1]+1), state[i]+cost); + let v = cmp::min( + cmp::min(next[i] + 1, state[i + 1] + 1), + state[i] + cost, + ); next.push(cmp::min(v, self.dist + 1)); } next @@ -144,7 +147,9 @@ impl Automaton for Levenshtein { type State = Option; #[inline] - fn start(&self) -> Option { Some(0) } + fn start(&self) -> Option { + Some(0) + } #[inline] fn is_match(&self, state: &Option) -> bool { @@ -194,9 +199,7 @@ struct DfaBuilder { impl DfaBuilder { fn new(lev: DynamicLevenshtein) -> Self { DfaBuilder { - dfa: Dfa { - states: Vec::with_capacity(16), - }, + dfa: Dfa { states: Vec::with_capacity(16) }, lev: lev, cache: HashMap::with_capacity(1024), } @@ -248,10 +251,9 @@ impl DfaBuilder { Entry::Occupied(v) => (*v.get(), true), Entry::Vacant(v) => { let is_match = self.lev.is_match(lev_state); - self.dfa.states.push(State { - next: [None; 256], - is_match: is_match, - }); + self.dfa + .states + .push(State { next: [None; 256], is_match: is_match }); (*v.insert(self.dfa.states.len() - 1), false) } }) @@ -283,13 +285,17 @@ impl DfaBuilder { ) { for seq in Utf8Sequences::new(from_chr, to_chr) { let mut fsi = from_si; - for range in &seq.as_slice()[0..seq.len()-1] { + for range in &seq.as_slice()[0..seq.len() - 1] { let tsi = self.new_state(false); self.add_utf8_range(overwrite, fsi, tsi, range); fsi = tsi; } self.add_utf8_range( - overwrite, fsi, to_si, &seq.as_slice()[seq.len()-1]); + overwrite, + fsi, + to_si, + 
&seq.as_slice()[seq.len() - 1], + ); } } @@ -308,10 +314,7 @@ impl DfaBuilder { } fn new_state(&mut self, is_match: bool) -> usize { - self.dfa.states.push(State { - next: [None; 256], - is_match: is_match, - }); + self.dfa.states.push(State { next: [None; 256], is_match: is_match }); self.dfa.states.len() - 1 } } diff --git a/fst-regex/src/compile.rs b/fst-regex/src/compile.rs index 530c32e..44c1c9e 100644 --- a/fst-regex/src/compile.rs +++ b/fst-regex/src/compile.rs @@ -1,5 +1,5 @@ -use regex_syntax::{Expr, Repeater, CharClass, ClassRange}; -use utf8_ranges::{Utf8Sequences, Utf8Sequence}; +use regex_syntax::{CharClass, ClassRange, Expr, Repeater}; +use utf8_ranges::{Utf8Sequence, Utf8Sequences}; use {Error, Inst}; @@ -10,10 +10,7 @@ pub struct Compiler { impl Compiler { pub fn new(size_limit: usize) -> Compiler { - Compiler { - size_limit: size_limit, - insts: vec![], - } + Compiler { size_limit: size_limit, insts: vec![] } } pub fn compile(mut self, ast: &Expr) -> Result, Error> { @@ -40,13 +37,17 @@ impl Compiler { | Expr::ClassBytes(..) => { return Err(From::from(Error::NoBytes)); } - Expr::Empty => {}, + Expr::Empty => {} Expr::Literal { ref chars, casei } => { for &c in chars { if casei { - self.c(&Expr::Class(CharClass::new(vec![ - ClassRange { start: c, end: c }, - ]).case_fold()))?; + self.c(&Expr::Class( + CharClass::new(vec![ClassRange { + start: c, + end: c, + }]) + .case_fold(), + ))?; } else { // One scalar value, so we're guaranteed to get a // single byte sequence. @@ -56,13 +57,18 @@ impl Compiler { } } } - Expr::AnyChar => self.c(&Expr::Class(CharClass::new(vec![ - ClassRange { start: '\u{0}', end: '\u{10FFFF}' }, - ])))?, - Expr::AnyCharNoNL => self.c(&Expr::Class(CharClass::new(vec![ - ClassRange { start: '\u{0}', end: '\u{09}' }, - ClassRange { start: '\u{0B}', end: '\u{10FFFF}' }, - ])))?, + Expr::AnyChar => { + self.c(&Expr::Class(CharClass::new(vec![ClassRange { + start: '\u{0}', + end: '\u{10FFFF}', + }])))? 
+ } + Expr::AnyCharNoNL => { + self.c(&Expr::Class(CharClass::new(vec![ + ClassRange { start: '\u{0}', end: '\u{09}' }, + ClassRange { start: '\u{0B}', end: '\u{10FFFF}' }, + ])))? + } Expr::Class(ref cls) => { self.compile_class(cls)?; } @@ -77,7 +83,7 @@ impl Compiler { return Ok(()); } let mut jmps_to_end = vec![]; - for e in &es[0..es.len()-1] { + for e in &es[0..es.len() - 1] { let split = self.empty_split(); let j1 = self.insts.len(); self.c(e)?; @@ -85,7 +91,7 @@ impl Compiler { let j2 = self.insts.len(); self.set_split(split, j1, j2); } - self.c(&es[es.len()-1])?; + self.c(&es[es.len() - 1])?; let end = self.insts.len(); for jmp_to_end in jmps_to_end { self.set_jump(jmp_to_end, end); @@ -161,7 +167,7 @@ impl Compiler { return Ok(()); } let mut jmps = vec![]; - for r in &class[0..class.len()-1] { + for r in &class[0..class.len() - 1] { let split = self.empty_split(); let j1 = self.insts.len(); self.compile_class_range(r)?; @@ -169,7 +175,7 @@ impl Compiler { let j2 = self.insts.len(); self.set_split(split, j1, j2); } - self.compile_class_range(&class[class.len()-1])?; + self.compile_class_range(&class[class.len() - 1])?; let end = self.insts.len(); for jmp in jmps { self.set_jump(jmp, end); @@ -181,8 +187,8 @@ impl Compiler { &mut self, char_range: &ClassRange, ) -> Result<(), Error> { - let mut it = Utf8Sequences::new(char_range.start, char_range.end) - .peekable(); + let mut it = + Utf8Sequences::new(char_range.start, char_range.end).peekable(); let mut jmps = vec![]; let mut utf8_ranges = it.next().expect("non-empty char class"); while it.peek().is_some() { diff --git a/fst-regex/src/dfa.rs b/fst-regex/src/dfa.rs index 67f3baa..8f0fe3b 100644 --- a/fst-regex/src/dfa.rs +++ b/fst-regex/src/dfa.rs @@ -1,8 +1,8 @@ use std::collections::{HashMap, HashSet}; use std::fmt; -use {Error, Inst}; use sparse::SparseSet; +use {Error, Inst}; const STATE_LIMIT: usize = 1_000; // currently at least 2MB >_< @@ -25,10 +25,7 @@ struct State { impl DfaBuilder { pub fn 
new(insts: Vec) -> Self { DfaBuilder { - dfa: Dfa { - insts: insts, - states: Vec::with_capacity(16), - }, + dfa: Dfa { insts: insts, states: Vec::with_capacity(16) }, cache: HashMap::with_capacity(1024), } } @@ -57,8 +54,13 @@ impl DfaBuilder { Ok(self.dfa) } - fn run_state(&mut self, cur: &mut SparseSet, next: &mut SparseSet, - state: usize, byte: u8) -> Option { + fn run_state( + &mut self, + cur: &mut SparseSet, + next: &mut SparseSet, + state: usize, + byte: u8, + ) -> Option { cur.clear(); for &ip in &self.dfa.states[state].insts { cur.add(ip); @@ -142,7 +144,7 @@ impl Dfa { Match => is_match = true, Range(s, e) => { if s <= byte && byte <= e { - self.add(to, ip+1); + self.add(to, ip + 1); } } } diff --git a/fst-regex/src/error.rs b/fst-regex/src/error.rs index 4f579c2..e2a1222 100644 --- a/fst-regex/src/error.rs +++ b/fst-regex/src/error.rs @@ -51,20 +51,31 @@ impl fmt::Display for Error { use self::Error::*; match *self { Syntax(ref err) => err.fmt(f), - CompiledTooBig(size_limit) => { - write!(f, "Compiled regex exceeds size limit of {} bytes", - size_limit) - } - TooManyStates(size_limit) => { - write!(f, "Compiled regex exceeds size limit of {} states", - size_limit) - } - NoLazy => write!(f, "Lazy reptition operators such as '+?' are \ - not allowed."), - NoWordBoundary => write!(f, "Word boundary operators are not \ - allowed."), - NoEmpty => write!(f, "Empty match operators are not allowed \ - (hopefully temporary)."), + CompiledTooBig(size_limit) => write!( + f, + "Compiled regex exceeds size limit of {} bytes", + size_limit + ), + TooManyStates(size_limit) => write!( + f, + "Compiled regex exceeds size limit of {} states", + size_limit + ), + NoLazy => write!( + f, + "Lazy reptition operators such as '+?' are \ + not allowed." + ), + NoWordBoundary => write!( + f, + "Word boundary operators are not \ + allowed." + ), + NoEmpty => write!( + f, + "Empty match operators are not allowed \ + (hopefully temporary)." 
+ ), NoBytes => write!(f, "Byte literals are not allowed."), } } diff --git a/fst-regex/src/lib.rs b/fst-regex/src/lib.rs index 8f4bb59..26d8b53 100644 --- a/fst-regex/src/lib.rs +++ b/fst-regex/src/lib.rs @@ -136,7 +136,9 @@ impl Automaton for Regex { type State = Option; #[inline] - fn start(&self) -> Option { Some(0) } + fn start(&self) -> Option { + Some(0) + } #[inline] fn is_match(&self, state: &Option) -> bool { diff --git a/fst-regex/src/sparse.rs b/fst-regex/src/sparse.rs index f63e05e..7df3ac6 100644 --- a/fst-regex/src/sparse.rs +++ b/fst-regex/src/sparse.rs @@ -6,11 +6,7 @@ pub struct SparseSet { impl SparseSet { pub fn new(size: usize) -> SparseSet { - SparseSet { - dense: vec![0; size], - sparse: vec![0; size], - size: 0, - } + SparseSet { dense: vec![0; size], sparse: vec![0; size], size: 0 } } pub fn len(&self) -> usize { diff --git a/rustfmt.toml b/rustfmt.toml new file mode 100644 index 0000000..aa37a21 --- /dev/null +++ b/rustfmt.toml @@ -0,0 +1,2 @@ +max_width = 79 +use_small_heuristics = "max" diff --git a/src/automaton/mod.rs b/src/automaton/mod.rs index 885b4e6..72843eb 100644 --- a/src/automaton/mod.rs +++ b/src/automaton/mod.rs @@ -66,31 +66,37 @@ pub trait Automaton { /// Returns an automaton that matches the strings that start with something /// this automaton matches. - fn starts_with(self) -> StartsWith where Self: Sized { + fn starts_with(self) -> StartsWith + where + Self: Sized, + { StartsWith(self) } /// Returns an automaton that matches the strings matched by either this or /// the other automaton. - fn union( - self, - rhs: Rhs, - ) -> Union where Self: Sized { + fn union(self, rhs: Rhs) -> Union + where + Self: Sized, + { Union(self, rhs) } /// Returns an automaton that matches the strings matched by both this and /// the other automaton. 
- fn intersection( - self, - rhs: Rhs, - ) -> Intersection where Self: Sized { + fn intersection(self, rhs: Rhs) -> Intersection + where + Self: Sized, + { Intersection(self, rhs) } /// Returns an automaton that matches the strings not matched by this /// automaton. - fn complement(self) -> Complement where Self: Sized { + fn complement(self) -> Complement + where + Self: Sized, + { Complement(self) } } @@ -150,7 +156,7 @@ impl<'a, T: Automaton> Automaton for &'a T { /// ``` #[derive(Clone, Debug)] pub struct Str<'a> { - string: &'a [u8] + string: &'a [u8], } impl<'a> Str<'a> { @@ -165,7 +171,9 @@ impl<'a> Automaton for Str<'a> { type State = Option; #[inline] - fn start(&self) -> Option { Some(0) } + fn start(&self) -> Option { + Some(0) + } #[inline] fn is_match(&self, pos: &Option) -> bool { @@ -222,7 +230,7 @@ impl<'a> Automaton for Str<'a> { /// ``` #[derive(Clone, Debug)] pub struct Subsequence<'a> { - subseq: &'a [u8] + subseq: &'a [u8], } impl<'a> Subsequence<'a> { @@ -238,7 +246,9 @@ impl<'a> Automaton for Subsequence<'a> { type State = usize; #[inline] - fn start(&self) -> usize { 0 } + fn start(&self) -> usize { + 0 + } #[inline] fn is_match(&self, &state: &usize) -> bool { @@ -246,7 +256,9 @@ impl<'a> Automaton for Subsequence<'a> { } #[inline] - fn can_match(&self, _: &usize) -> bool { true } + fn can_match(&self, _: &usize) -> bool { + true + } #[inline] fn will_always_match(&self, &state: &usize) -> bool { @@ -255,7 +267,9 @@ impl<'a> Automaton for Subsequence<'a> { #[inline] fn accept(&self, &state: &usize, byte: u8) -> usize { - if state == self.subseq.len() { return state; } + if state == self.subseq.len() { + return state; + } state + (byte == self.subseq[state]) as usize } } @@ -270,11 +284,26 @@ pub struct AlwaysMatch; impl Automaton for AlwaysMatch { type State = (); - #[inline] fn start(&self) -> () { () } - #[inline] fn is_match(&self, _: &()) -> bool { true } - #[inline] fn can_match(&self, _: &()) -> bool { true } - #[inline] fn 
will_always_match(&self, _: &()) -> bool { true } - #[inline] fn accept(&self, _: &(), _: u8) -> () { () } + #[inline] + fn start(&self) -> () { + () + } + #[inline] + fn is_match(&self, _: &()) -> bool { + true + } + #[inline] + fn can_match(&self, _: &()) -> bool { + true + } + #[inline] + fn will_always_match(&self, _: &()) -> bool { + true + } + #[inline] + fn accept(&self, _: &(), _: u8) -> () { + () + } } /// An automaton that matches a string that begins with something that the @@ -287,7 +316,7 @@ pub struct StartsWithState(StartsWithStateInternal); enum StartsWithStateInternal { Done, - Running(A::State) + Running(A::State), } impl Automaton for StartsWith { @@ -307,21 +336,21 @@ impl Automaton for StartsWith { fn is_match(&self, state: &StartsWithState) -> bool { match state.0 { Done => true, - Running(_) => false + Running(_) => false, } } fn can_match(&self, state: &StartsWithState) -> bool { match state.0 { Done => true, - Running(ref inner) => self.0.can_match(inner) + Running(ref inner) => self.0.can_match(inner), } } fn will_always_match(&self, state: &StartsWithState) -> bool { match state.0 { Done => true, - Running(_) => false + Running(_) => false, } } @@ -330,19 +359,17 @@ impl Automaton for StartsWith { state: &StartsWithState, byte: u8, ) -> StartsWithState { - StartsWithState( - match state.0 { - Done => Done, - Running(ref inner) => { - let next_inner = self.0.accept(inner, byte); - if self.0.is_match(&next_inner) { - Done - } else { - Running(next_inner) - } + StartsWithState(match state.0 { + Done => Done, + Running(ref inner) => { + let next_inner = self.0.accept(inner, byte); + if self.0.is_match(&next_inner) { + Done + } else { + Running(next_inner) } } - ) + }) } } @@ -357,10 +384,7 @@ impl Automaton for Union { type State = UnionState; fn start(&self) -> UnionState { - UnionState( - self.0.start(), - self.1.start() - ) + UnionState(self.0.start(), self.1.start()) } fn is_match(&self, state: &UnionState) -> bool { @@ -373,13 +397,13 @@ 
impl Automaton for Union { fn will_always_match(&self, state: &UnionState) -> bool { self.0.will_always_match(&state.0) - || self.1.will_always_match(&state.1) + || self.1.will_always_match(&state.1) } fn accept(&self, state: &UnionState, byte: u8) -> UnionState { UnionState( self.0.accept(&state.0, byte), - self.1.accept(&state.1, byte) + self.1.accept(&state.1, byte), ) } } @@ -395,10 +419,7 @@ impl Automaton for Intersection { type State = IntersectionState; fn start(&self) -> IntersectionState { - IntersectionState( - self.0.start(), - self.1.start() - ) + IntersectionState(self.0.start(), self.1.start()) } fn is_match(&self, state: &IntersectionState) -> bool { @@ -410,7 +431,8 @@ impl Automaton for Intersection { } fn will_always_match(&self, state: &IntersectionState) -> bool { - self.0.will_always_match(&state.0) && self.1.will_always_match(&state.1) + self.0.will_always_match(&state.0) + && self.1.will_always_match(&state.1) } fn accept( @@ -420,7 +442,7 @@ impl Automaton for Intersection { ) -> IntersectionState { IntersectionState( self.0.accept(&state.0, byte), - self.1.accept(&state.1, byte) + self.1.accept(&state.1, byte), ) } } diff --git a/src/lib.rs b/src/lib.rs index 2ef5e83..a29475a 100644 --- a/src/lib.rs +++ b/src/lib.rs @@ -306,11 +306,16 @@ will be returned if the automaton gets too big (tens of MB in heap usage). 
#![deny(missing_docs)] extern crate byteorder; -#[cfg(test)] extern crate fst_levenshtein; -#[cfg(test)] extern crate fst_regex; -#[cfg(feature = "mmap")] extern crate memmap; -#[cfg(test)] extern crate quickcheck; -#[cfg(test)] extern crate rand; +#[cfg(test)] +extern crate fst_levenshtein; +#[cfg(test)] +extern crate fst_regex; +#[cfg(feature = "mmap")] +extern crate memmap; +#[cfg(test)] +extern crate quickcheck; +#[cfg(test)] +extern crate rand; pub use automaton::Automaton; pub use error::{Error, Result}; @@ -318,14 +323,14 @@ pub use map::{Map, MapBuilder}; pub use set::{Set, SetBuilder}; pub use stream::{IntoStreamer, Streamer}; +mod error; #[path = "automaton/mod.rs"] mod inner_automaton; -mod error; #[path = "map.rs"] mod inner_map; -pub mod raw; #[path = "set.rs"] mod inner_set; +pub mod raw; mod stream; /// Automaton implementations for finite state transducers. diff --git a/src/map.rs b/src/map.rs index d3bf998..8c3d386 100644 --- a/src/map.rs +++ b/src/map.rs @@ -1,12 +1,12 @@ use std::fmt; -use std::iter::{self, FromIterator}; use std::io; +use std::iter::{self, FromIterator}; #[cfg(feature = "mmap")] use std::path::Path; -use automaton::{Automaton, AlwaysMatch}; +use automaton::{AlwaysMatch, Automaton}; use raw; -pub use raw::IndexedValue as IndexedValue; +pub use raw::IndexedValue; use stream::{IntoStreamer, Streamer}; use Result; @@ -117,7 +117,10 @@ impl Map { /// To build a map that streams to an arbitrary `io::Write`, use /// `MapBuilder`. pub fn from_iter(iter: I) -> Result - where K: AsRef<[u8]>, I: IntoIterator { + where + K: AsRef<[u8]>, + I: IntoIterator, + { let mut builder = MapBuilder::memory(); builder.extend_iter(iter)?; Map::from_bytes(builder.into_inner()?) @@ -577,9 +580,13 @@ impl MapBuilder { /// added, then an error is returned. Similarly, if there was a problem /// writing to the underlying writer, an error is returned. 
pub fn extend_iter(&mut self, iter: I) -> Result<()> - where K: AsRef<[u8]>, I: IntoIterator { - self.0.extend_iter(iter.into_iter() - .map(|(k, v)| (k, raw::Output::new(v)))) + where + K: AsRef<[u8]>, + I: IntoIterator, + { + self.0.extend_iter( + iter.into_iter().map(|(k, v)| (k, raw::Output::new(v))), + ) } /// Calls insert on each item in the stream. @@ -591,8 +598,10 @@ impl MapBuilder { /// added, then an error is returned. Similarly, if there was a problem /// writing to the underlying writer, an error is returned. pub fn extend_stream<'f, I, S>(&mut self, stream: I) -> Result<()> - where I: for<'a> IntoStreamer<'a, Into=S, Item=(&'a [u8], u64)>, - S: 'f + for<'a> Streamer<'a, Item=(&'a [u8], u64)> { + where + I: for<'a> IntoStreamer<'a, Into = S, Item = (&'a [u8], u64)>, + S: 'f + for<'a> Streamer<'a, Item = (&'a [u8], u64)>, + { self.0.extend_stream(StreamOutput(stream.into_stream())) } @@ -618,7 +627,6 @@ impl MapBuilder { pub fn bytes_written(&self) -> u64 { self.0.bytes_written() } - } /// A lexicographically ordered stream of key-value pairs from a map. @@ -627,7 +635,9 @@ impl MapBuilder { /// the stream. By default, no filtering is done. /// /// The `'m` lifetime parameter refers to the lifetime of the underlying map. -pub struct Stream<'m, A=AlwaysMatch>(raw::Stream<'m, A>) where A: Automaton; +pub struct Stream<'m, A = AlwaysMatch>(raw::Stream<'m, A>) +where + A: Automaton; impl<'a, 'm, A: Automaton> Streamer<'a> for Stream<'m, A> { type Item = (&'a [u8], u64); @@ -719,7 +729,7 @@ impl<'a, 'm> Streamer<'a> for Values<'m> { /// the stream. By default, no filtering is done. /// /// The `'m` lifetime parameter refers to the lifetime of the underlying map. -pub struct StreamBuilder<'m, A=AlwaysMatch>(raw::StreamBuilder<'m, A>); +pub struct StreamBuilder<'m, A = AlwaysMatch>(raw::StreamBuilder<'m, A>); impl<'m, A: Automaton> StreamBuilder<'m, A> { /// Specify a greater-than-or-equal-to bound. 
@@ -786,8 +796,10 @@ impl<'m> OpBuilder<'m> { /// The stream must emit a lexicographically ordered sequence of key-value /// pairs. pub fn add(mut self, streamable: I) -> Self - where I: for<'a> IntoStreamer<'a, Into=S, Item=(&'a [u8], u64)>, - S: 'm + for<'a> Streamer<'a, Item=(&'a [u8], u64)> { + where + I: for<'a> IntoStreamer<'a, Into = S, Item = (&'a [u8], u64)>, + S: 'm + for<'a> Streamer<'a, Item = (&'a [u8], u64)>, + { self.push(streamable); self } @@ -797,8 +809,10 @@ impl<'m> OpBuilder<'m> { /// The stream must emit a lexicographically ordered sequence of key-value /// pairs. pub fn push(&mut self, streamable: I) - where I: for<'a> IntoStreamer<'a, Into=S, Item=(&'a [u8], u64)>, - S: 'm + for<'a> Streamer<'a, Item=(&'a [u8], u64)> { + where + I: for<'a> IntoStreamer<'a, Into = S, Item = (&'a [u8], u64)>, + S: 'm + for<'a> Streamer<'a, Item = (&'a [u8], u64)>, + { self.0.push(StreamOutput(streamable.into_stream())); } @@ -983,9 +997,14 @@ impl<'m> OpBuilder<'m> { } impl<'f, I, S> Extend for OpBuilder<'f> - where I: for<'a> IntoStreamer<'a, Into=S, Item=(&'a [u8], u64)>, - S: 'f + for<'a> Streamer<'a, Item=(&'a [u8], u64)> { - fn extend(&mut self, it: T) where T: IntoIterator { +where + I: for<'a> IntoStreamer<'a, Into = S, Item = (&'a [u8], u64)>, + S: 'f + for<'a> Streamer<'a, Item = (&'a [u8], u64)>, +{ + fn extend(&mut self, it: T) + where + T: IntoIterator, + { for stream in it { self.push(stream); } @@ -993,9 +1012,14 @@ impl<'f, I, S> Extend for OpBuilder<'f> } impl<'f, I, S> FromIterator for OpBuilder<'f> - where I: for<'a> IntoStreamer<'a, Into=S, Item=(&'a [u8], u64)>, - S: 'f + for<'a> Streamer<'a, Item=(&'a [u8], u64)> { - fn from_iter(it: T) -> Self where T: IntoIterator { +where + I: for<'a> IntoStreamer<'a, Into = S, Item = (&'a [u8], u64)>, + S: 'f + for<'a> Streamer<'a, Item = (&'a [u8], u64)>, +{ + fn from_iter(it: T) -> Self + where + T: IntoIterator, + { let mut op = OpBuilder::new(); op.extend(it); op @@ -1073,7 +1097,9 @@ impl<'a, 'm> 
Streamer<'a> for SymmetricDifference<'m> { struct StreamOutput(S); impl<'a, S> Streamer<'a> for StreamOutput - where S: Streamer<'a, Item=(&'a [u8], u64)> { +where + S: Streamer<'a, Item = (&'a [u8], u64)>, +{ type Item = (&'a [u8], raw::Output); fn next(&'a mut self) -> Option { diff --git a/src/raw/build.rs b/src/raw/build.rs index 0877e82..2f59302 100644 --- a/src/raw/build.rs +++ b/src/raw/build.rs @@ -1,15 +1,15 @@ use std::io::{self, Write}; -use byteorder::{WriteBytesExt, LittleEndian}; +use byteorder::{LittleEndian, WriteBytesExt}; use error::Result; -use raw::{ - VERSION, EMPTY_ADDRESS, NONE_ADDRESS, - CompiledAddr, FstType, Output, Transition, -}; use raw::counting_writer::CountingWriter; use raw::error::Error; use raw::registry::{Registry, RegistryEntry}; +use raw::{ + CompiledAddr, FstType, Output, Transition, EMPTY_ADDRESS, NONE_ADDRESS, + VERSION, +}; // use raw::registry_minimal::{Registry, RegistryEntry}; use stream::{IntoStreamer, Streamer}; @@ -134,7 +134,9 @@ impl Builder { /// Adds a byte string to this FST with a zero output value. pub fn add(&mut self, bs: B) -> Result<()> - where B: AsRef<[u8]> { + where + B: AsRef<[u8]>, + { self.check_last_key(bs.as_ref(), false)?; self.insert_output(bs, None) } @@ -149,7 +151,9 @@ impl Builder { /// added, then an error is returned. Similarly, if there was a problem /// writing to the underlying writer, an error is returned. pub fn insert(&mut self, bs: B, val: u64) -> Result<()> - where B: AsRef<[u8]> { + where + B: AsRef<[u8]>, + { self.check_last_key(bs.as_ref(), true)?; self.insert_output(bs, Some(Output::new(val))) } @@ -163,7 +167,10 @@ impl Builder { /// added, then an error is returned. Similarly, if there was a problem /// writing to the underlying writer, an error is returned. 
pub fn extend_iter(&mut self, iter: I) -> Result<()> - where T: AsRef<[u8]>, I: IntoIterator { + where + T: AsRef<[u8]>, + I: IntoIterator, + { for (key, out) in iter { self.insert(key, out.value())?; } @@ -179,8 +186,10 @@ impl Builder { /// added, then an error is returned. Similarly, if there was a problem /// writing to the underlying writer, an error is returned. pub fn extend_stream<'f, I, S>(&mut self, stream: I) -> Result<()> - where I: for<'a> IntoStreamer<'a, Into=S, Item=(&'a [u8], Output)>, - S: 'f + for<'a> Streamer<'a, Item=(&'a [u8], Output)> { + where + I: for<'a> IntoStreamer<'a, Into = S, Item = (&'a [u8], Output)>, + S: 'f + for<'a> Streamer<'a, Item = (&'a [u8], Output)>, + { let mut stream = stream.into_stream(); while let Some((key, out)) = stream.next() { self.insert(key, out.value())?; @@ -209,7 +218,9 @@ impl Builder { } fn insert_output(&mut self, bs: B, out: Option) -> Result<()> - where B: AsRef<[u8]> { + where + B: AsRef<[u8]>, + { let bs = bs.as_ref(); if bs.is_empty() { self.len = 1; // must be first key, so length is always 1 @@ -240,12 +251,11 @@ impl Builder { fn compile_from(&mut self, istate: usize) -> Result<()> { let mut addr = NONE_ADDRESS; while istate + 1 < self.unfinished.len() { - let node = - if addr == NONE_ADDRESS { - self.unfinished.pop_empty() - } else { - self.unfinished.pop_freeze(addr) - }; + let node = if addr == NONE_ADDRESS { + self.unfinished.pop_empty() + } else { + self.unfinished.pop_freeze(addr) + }; addr = self.compile(&node)?; assert!(addr != NONE_ADDRESS); } @@ -256,7 +266,8 @@ impl Builder { fn compile(&mut self, node: &BuilderNode) -> Result { if node.is_final && node.trans.is_empty() - && node.final_output.is_zero() { + && node.final_output.is_zero() + { return Ok(EMPTY_ADDRESS); } let entry = self.registry.entry(&node); @@ -281,7 +292,8 @@ impl Builder { return Err(Error::OutOfOrder { previous: last.to_vec(), got: bs.to_vec(), - }.into()); + } + .into()); } last.clear(); for &b in bs { @@ -367,13 
+379,12 @@ impl UnfinishedNodes { } fn find_common_prefix(&mut self, bs: &[u8]) -> usize { - bs - .iter() - .zip(&self.stack) - .take_while(|&(&b, ref node)| { - node.last.as_ref().map(|t| t.inp == b).unwrap_or(false) - }) - .count() + bs.iter() + .zip(&self.stack) + .take_while(|&(&b, ref node)| { + node.last.as_ref().map(|t| t.inp == b).unwrap_or(false) + }) + .count() } fn find_common_prefix_and_set_output( diff --git a/src/raw/common_inputs.rs b/src/raw/common_inputs.rs index 231a1c1..f7d7d23 100644 --- a/src/raw/common_inputs.rs +++ b/src/raw/common_inputs.rs @@ -1,20 +1,20 @@ pub const COMMON_INPUTS: [u8; 256] = [ - 84, // '\x00' - 85, // '\x01' - 86, // '\x02' - 87, // '\x03' - 88, // '\x04' - 89, // '\x05' - 90, // '\x06' - 91, // '\x07' - 92, // '\x08' - 93, // '\t' - 94, // '\n' - 95, // '\x0b' - 96, // '\x0c' - 97, // '\r' - 98, // '\x0e' - 99, // '\x0f' + 84, // '\x00' + 85, // '\x01' + 86, // '\x02' + 87, // '\x03' + 88, // '\x04' + 89, // '\x05' + 90, // '\x06' + 91, // '\x07' + 92, // '\x08' + 93, // '\t' + 94, // '\n' + 95, // '\x0b' + 96, // '\x0c' + 97, // '\r' + 98, // '\x0e' + 99, // '\x0f' 100, // '\x10' 101, // '\x11' 102, // '\x12' @@ -32,100 +32,100 @@ pub const COMMON_INPUTS: [u8; 256] = [ 114, // '\x1e' 115, // '\x1f' 116, // ' ' - 80, // '!' + 80, // '!' 117, // '"' 118, // '#' - 79, // '$' - 39, // '%' - 30, // '&' - 81, // "'" - 75, // '(' - 74, // ')' - 82, // '*' - 57, // '+' - 66, // ',' - 16, // '-' - 12, // '.' - 2, // '/' - 19, // '0' - 20, // '1' - 21, // '2' - 27, // '3' - 32, // '4' - 29, // '5' - 35, // '6' - 36, // '7' - 37, // '8' - 34, // '9' - 24, // ':' - 73, // ';' + 79, // '$' + 39, // '%' + 30, // '&' + 81, // "'" + 75, // '(' + 74, // ')' + 82, // '*' + 57, // '+' + 66, // ',' + 16, // '-' + 12, // '.' 
+ 2, // '/' + 19, // '0' + 20, // '1' + 21, // '2' + 27, // '3' + 32, // '4' + 29, // '5' + 35, // '6' + 36, // '7' + 37, // '8' + 34, // '9' + 24, // ':' + 73, // ';' 119, // '<' - 23, // '=' + 23, // '=' 120, // '>' - 40, // '?' - 83, // '@' - 44, // 'A' - 48, // 'B' - 42, // 'C' - 43, // 'D' - 49, // 'E' - 46, // 'F' - 62, // 'G' - 61, // 'H' - 47, // 'I' - 69, // 'J' - 68, // 'K' - 58, // 'L' - 56, // 'M' - 55, // 'N' - 59, // 'O' - 51, // 'P' - 72, // 'Q' - 54, // 'R' - 45, // 'S' - 52, // 'T' - 64, // 'U' - 65, // 'V' - 63, // 'W' - 71, // 'X' - 67, // 'Y' - 70, // 'Z' - 77, // '[' + 40, // '?' + 83, // '@' + 44, // 'A' + 48, // 'B' + 42, // 'C' + 43, // 'D' + 49, // 'E' + 46, // 'F' + 62, // 'G' + 61, // 'H' + 47, // 'I' + 69, // 'J' + 68, // 'K' + 58, // 'L' + 56, // 'M' + 55, // 'N' + 59, // 'O' + 51, // 'P' + 72, // 'Q' + 54, // 'R' + 45, // 'S' + 52, // 'T' + 64, // 'U' + 65, // 'V' + 63, // 'W' + 71, // 'X' + 67, // 'Y' + 70, // 'Z' + 77, // '[' 121, // '\\' - 78, // ']' + 78, // ']' 122, // '^' - 31, // '_' + 31, // '_' 123, // '`' - 4, // 'a' - 25, // 'b' - 9, // 'c' - 17, // 'd' - 1, // 'e' - 26, // 'f' - 22, // 'g' - 13, // 'h' - 7, // 'i' - 50, // 'j' - 38, // 'k' - 14, // 'l' - 15, // 'm' - 10, // 'n' - 3, // 'o' - 8, // 'p' - 60, // 'q' - 6, // 'r' - 5, // 's' - 0, // 't' - 18, // 'u' - 33, // 'v' - 11, // 'w' - 41, // 'x' - 28, // 'y' - 53, // 'z' + 4, // 'a' + 25, // 'b' + 9, // 'c' + 17, // 'd' + 1, // 'e' + 26, // 'f' + 22, // 'g' + 13, // 'h' + 7, // 'i' + 50, // 'j' + 38, // 'k' + 14, // 'l' + 15, // 'm' + 10, // 'n' + 3, // 'o' + 8, // 'p' + 60, // 'q' + 6, // 'r' + 5, // 's' + 0, // 't' + 18, // 'u' + 33, // 'v' + 11, // 'w' + 41, // 'x' + 28, // 'y' + 53, // 'z' 124, // '{' 125, // '|' 126, // '}' - 76, // '~' + 76, // '~' 127, // '\x7f' 128, // '\x80' 129, // '\x81' @@ -258,260 +258,32 @@ pub const COMMON_INPUTS: [u8; 256] = [ ]; pub const COMMON_INPUTS_INV: [u8; 256] = [ - b't', - b'e', - b'/', - b'o', - b'a', - b's', - b'r', - b'i', - 
b'p', - b'c', - b'n', - b'w', - b'.', - b'h', - b'l', - b'm', - b'-', - b'd', - b'u', - b'0', - b'1', - b'2', - b'g', - b'=', - b':', - b'b', - b'f', - b'3', - b'y', - b'5', - b'&', - b'_', - b'4', - b'v', - b'9', - b'6', - b'7', - b'8', - b'k', - b'%', - b'?', - b'x', - b'C', - b'D', - b'A', - b'S', - b'F', - b'I', - b'B', - b'E', - b'j', - b'P', - b'T', - b'z', - b'R', - b'N', - b'M', - b'+', - b'L', - b'O', - b'q', - b'H', - b'G', - b'W', - b'U', - b'V', - b',', - b'Y', - b'K', - b'J', - b'Z', - b'X', - b'Q', - b';', - b')', - b'(', - b'~', - b'[', - b']', - b'$', - b'!', - b'\'', - b'*', - b'@', - b'\x00', - b'\x01', - b'\x02', - b'\x03', - b'\x04', - b'\x05', - b'\x06', - b'\x07', - b'\x08', - b'\t', - b'\n', - b'\x0b', - b'\x0c', - b'\r', - b'\x0e', - b'\x0f', - b'\x10', - b'\x11', - b'\x12', - b'\x13', - b'\x14', - b'\x15', - b'\x16', - b'\x17', - b'\x18', - b'\x19', - b'\x1a', - b'\x1b', - b'\x1c', - b'\x1d', - b'\x1e', - b'\x1f', - b' ', - b'"', - b'#', - b'<', - b'>', - b'\\', - b'^', - b'`', - b'{', - b'|', - b'}', - b'\x7f', - b'\x80', - b'\x81', - b'\x82', - b'\x83', - b'\x84', - b'\x85', - b'\x86', - b'\x87', - b'\x88', - b'\x89', - b'\x8a', - b'\x8b', - b'\x8c', - b'\x8d', - b'\x8e', - b'\x8f', - b'\x90', - b'\x91', - b'\x92', - b'\x93', - b'\x94', - b'\x95', - b'\x96', - b'\x97', - b'\x98', - b'\x99', - b'\x9a', - b'\x9b', - b'\x9c', - b'\x9d', - b'\x9e', - b'\x9f', - b'\xa0', - b'\xa1', - b'\xa2', - b'\xa3', - b'\xa4', - b'\xa5', - b'\xa6', - b'\xa7', - b'\xa8', - b'\xa9', - b'\xaa', - b'\xab', - b'\xac', - b'\xad', - b'\xae', - b'\xaf', - b'\xb0', - b'\xb1', - b'\xb2', - b'\xb3', - b'\xb4', - b'\xb5', - b'\xb6', - b'\xb7', - b'\xb8', - b'\xb9', - b'\xba', - b'\xbb', - b'\xbc', - b'\xbd', - b'\xbe', - b'\xbf', - b'\xc0', - b'\xc1', - b'\xc2', - b'\xc3', - b'\xc4', - b'\xc5', - b'\xc6', - b'\xc7', - b'\xc8', - b'\xc9', - b'\xca', - b'\xcb', - b'\xcc', - b'\xcd', - b'\xce', - b'\xcf', - b'\xd0', - b'\xd1', - b'\xd2', - b'\xd3', - b'\xd4', - b'\xd5', 
- b'\xd6', - b'\xd7', - b'\xd8', - b'\xd9', - b'\xda', - b'\xdb', - b'\xdc', - b'\xdd', - b'\xde', - b'\xdf', - b'\xe0', - b'\xe1', - b'\xe2', - b'\xe3', - b'\xe4', - b'\xe5', - b'\xe6', - b'\xe7', - b'\xe8', - b'\xe9', - b'\xea', - b'\xeb', - b'\xec', - b'\xed', - b'\xee', - b'\xef', - b'\xf0', - b'\xf1', - b'\xf2', - b'\xf3', - b'\xf4', - b'\xf5', - b'\xf6', - b'\xf7', - b'\xf8', - b'\xf9', - b'\xfa', - b'\xfb', - b'\xfc', - b'\xfd', - b'\xfe', - b'\xff', + b't', b'e', b'/', b'o', b'a', b's', b'r', b'i', b'p', b'c', b'n', b'w', + b'.', b'h', b'l', b'm', b'-', b'd', b'u', b'0', b'1', b'2', b'g', b'=', + b':', b'b', b'f', b'3', b'y', b'5', b'&', b'_', b'4', b'v', b'9', b'6', + b'7', b'8', b'k', b'%', b'?', b'x', b'C', b'D', b'A', b'S', b'F', b'I', + b'B', b'E', b'j', b'P', b'T', b'z', b'R', b'N', b'M', b'+', b'L', b'O', + b'q', b'H', b'G', b'W', b'U', b'V', b',', b'Y', b'K', b'J', b'Z', b'X', + b'Q', b';', b')', b'(', b'~', b'[', b']', b'$', b'!', b'\'', b'*', b'@', + b'\x00', b'\x01', b'\x02', b'\x03', b'\x04', b'\x05', b'\x06', b'\x07', + b'\x08', b'\t', b'\n', b'\x0b', b'\x0c', b'\r', b'\x0e', b'\x0f', b'\x10', + b'\x11', b'\x12', b'\x13', b'\x14', b'\x15', b'\x16', b'\x17', b'\x18', + b'\x19', b'\x1a', b'\x1b', b'\x1c', b'\x1d', b'\x1e', b'\x1f', b' ', b'"', + b'#', b'<', b'>', b'\\', b'^', b'`', b'{', b'|', b'}', b'\x7f', b'\x80', + b'\x81', b'\x82', b'\x83', b'\x84', b'\x85', b'\x86', b'\x87', b'\x88', + b'\x89', b'\x8a', b'\x8b', b'\x8c', b'\x8d', b'\x8e', b'\x8f', b'\x90', + b'\x91', b'\x92', b'\x93', b'\x94', b'\x95', b'\x96', b'\x97', b'\x98', + b'\x99', b'\x9a', b'\x9b', b'\x9c', b'\x9d', b'\x9e', b'\x9f', b'\xa0', + b'\xa1', b'\xa2', b'\xa3', b'\xa4', b'\xa5', b'\xa6', b'\xa7', b'\xa8', + b'\xa9', b'\xaa', b'\xab', b'\xac', b'\xad', b'\xae', b'\xaf', b'\xb0', + b'\xb1', b'\xb2', b'\xb3', b'\xb4', b'\xb5', b'\xb6', b'\xb7', b'\xb8', + b'\xb9', b'\xba', b'\xbb', b'\xbc', b'\xbd', b'\xbe', b'\xbf', b'\xc0', + b'\xc1', b'\xc2', b'\xc3', b'\xc4', b'\xc5', 
b'\xc6', b'\xc7', b'\xc8', + b'\xc9', b'\xca', b'\xcb', b'\xcc', b'\xcd', b'\xce', b'\xcf', b'\xd0', + b'\xd1', b'\xd2', b'\xd3', b'\xd4', b'\xd5', b'\xd6', b'\xd7', b'\xd8', + b'\xd9', b'\xda', b'\xdb', b'\xdc', b'\xdd', b'\xde', b'\xdf', b'\xe0', + b'\xe1', b'\xe2', b'\xe3', b'\xe4', b'\xe5', b'\xe6', b'\xe7', b'\xe8', + b'\xe9', b'\xea', b'\xeb', b'\xec', b'\xed', b'\xee', b'\xef', b'\xf0', + b'\xf1', b'\xf2', b'\xf3', b'\xf4', b'\xf5', b'\xf6', b'\xf7', b'\xf8', + b'\xf9', b'\xfa', b'\xfb', b'\xfc', b'\xfd', b'\xfe', b'\xff', ]; diff --git a/src/raw/counting_writer.rs b/src/raw/counting_writer.rs index 4e233b9..d893782 100644 --- a/src/raw/counting_writer.rs +++ b/src/raw/counting_writer.rs @@ -9,10 +9,7 @@ pub struct CountingWriter { impl CountingWriter { /// Wrap the given writer with a counter. pub fn new(wtr: W) -> CountingWriter { - CountingWriter { - wtr: wtr, - cnt: 0, - } + CountingWriter { wtr: wtr, cnt: 0 } } /// Return the total number of bytes written to the underlying writer. @@ -32,7 +29,6 @@ impl CountingWriter { pub fn get_ref(&self) -> &W { &self.wtr } - } impl io::Write for CountingWriter { @@ -49,8 +45,8 @@ impl io::Write for CountingWriter { #[cfg(test)] mod tests { - use std::io::Write; use super::CountingWriter; + use std::io::Write; #[test] fn counts_bytes() { diff --git a/src/raw/error.rs b/src/raw/error.rs index 6aaaadb..1871db3 100644 --- a/src/raw/error.rs +++ b/src/raw/error.rs @@ -63,25 +63,42 @@ impl fmt::Display for Error { use self::Error::*; match *self { FromUtf8(ref err) => err.fmt(f), - Version { expected, got } => { - write!(f, "\ + Version { expected, got } => write!( + f, + "\ Error opening FST: expected API version {}, got API version {}. It looks like the FST you're trying to open is either not an FST file or it was generated with a different version of the 'fst' crate. 
You'll either need to change the version of the 'fst' crate you're using, or re-generate the -FST.", expected, got) - } - Format => write!(f, "\ +FST.", + expected, got + ), + Format => write!( + f, + "\ Error opening FST: An unknown error occurred. This usually means you're trying -to read data that isn't actually an encoded FST."), - DuplicateKey { ref got } => write!(f, "\ -Error inserting duplicate key: {}.", format_bytes(&*got)), - OutOfOrder { ref previous, ref got } => write!(f, "\ +to read data that isn't actually an encoded FST." + ), + DuplicateKey { ref got } => write!( + f, + "\ +Error inserting duplicate key: {}.", + format_bytes(&*got) + ), + OutOfOrder { ref previous, ref got } => write!( + f, + "\ Error inserting out-of-order key: {}. (Previous key was {}.) Keys must be inserted in lexicographic order.", -format_bytes(&*got), format_bytes(&*previous)), - WrongType { expected, got } => write!(f, "\ -Error opening FST: expected type {}, got type {}.", expected, got), + format_bytes(&*got), + format_bytes(&*previous) + ), + WrongType { expected, got } => write!( + f, + "\ +Error opening FST: expected type {}, got type {}.", + expected, got + ), } } } diff --git a/src/raw/mmap.rs b/src/raw/mmap.rs index 2b4d25f..be63c98 100644 --- a/src/raw/mmap.rs +++ b/src/raw/mmap.rs @@ -31,11 +31,7 @@ impl MmapReadOnly { pub unsafe fn open(file: &fs::File) -> io::Result { let mmap = Mmap::map(file)?; let len = mmap.len(); - Ok(MmapReadOnly { - map: Arc::new(mmap), - offset: 0, - len: len, - }) + Ok(MmapReadOnly { map: Arc::new(mmap), offset: 0, len: len }) } /// Open a new memory map from the path given. 
@@ -82,7 +78,7 @@ impl MmapReadOnly { impl Clone for MmapReadOnly { #[inline] fn clone(&self) -> MmapReadOnly { - MmapReadOnly{ + MmapReadOnly { map: self.map.clone(), offset: self.offset, len: self.len, diff --git a/src/raw/mod.rs b/src/raw/mod.rs index 8db1487..0487bfe 100644 --- a/src/raw/mod.rs +++ b/src/raw/mod.rs @@ -26,33 +26,36 @@ use std::ops::Deref; use std::path::Path; use std::sync::Arc; -use byteorder::{ReadBytesExt, LittleEndian}; +use byteorder::{LittleEndian, ReadBytesExt}; -use automaton::{Automaton, AlwaysMatch}; +use automaton::{AlwaysMatch, Automaton}; use error::Result; use stream::{IntoStreamer, Streamer}; pub use self::build::Builder; pub use self::error::Error; -pub use self::node::{Node, Transitions}; -#[cfg(feature = "mmap")] pub use self::mmap::MmapReadOnly; +#[cfg(feature = "mmap")] +pub use self::mmap::MmapReadOnly; use self::node::node_new; +pub use self::node::{Node, Transitions}; pub use self::ops::{ - IndexedValue, OpBuilder, - Intersection, Union, Difference, SymmetricDifference, + Difference, IndexedValue, Intersection, OpBuilder, SymmetricDifference, + Union, }; mod build; mod common_inputs; mod counting_writer; mod error; -#[cfg(feature = "mmap")] mod mmap; +#[cfg(feature = "mmap")] +mod mmap; mod node; mod ops; mod pack; mod registry; mod registry_minimal; -#[cfg(test)] mod tests; +#[cfg(test)] +mod tests; /// The API version of this crate. /// @@ -357,10 +360,9 @@ impl Fst { // N bytes (no unexpected EOF). let version = (&*data).read_u64::().unwrap(); if version == 0 || version > VERSION { - return Err(Error::Version { - expected: VERSION, - got: version, - }.into()); + return Err( + Error::Version { expected: VERSION, got: version }.into() + ); } let ty = (&data[8..]).read_u64::().unwrap(); let root_addr = { @@ -392,7 +394,8 @@ impl Fst { // // This is essentially our own little checksum. 
if (root_addr == EMPTY_ADDRESS && data.len() != 32) - && root_addr + 17 != data.len() { + && root_addr + 17 != data.len() + { return Err(Error::Format.into()); } Ok(Fst { @@ -496,8 +499,10 @@ impl Fst { /// `stream` must be a lexicographically ordered sequence of byte strings /// with associated values. pub fn is_disjoint<'f, I, S>(&self, stream: I) -> bool - where I: for<'a> IntoStreamer<'a, Into=S, Item=(&'a [u8], Output)>, - S: 'f + for<'a> Streamer<'a, Item=(&'a [u8], Output)> { + where + I: for<'a> IntoStreamer<'a, Into = S, Item = (&'a [u8], Output)>, + S: 'f + for<'a> Streamer<'a, Item = (&'a [u8], Output)>, + { self.op().add(stream).intersection().next().is_none() } @@ -507,8 +512,10 @@ impl Fst { /// `stream` must be a lexicographically ordered sequence of byte strings /// with associated values. pub fn is_subset<'f, I, S>(&self, stream: I) -> bool - where I: for<'a> IntoStreamer<'a, Into=S, Item=(&'a [u8], Output)>, - S: 'f + for<'a> Streamer<'a, Item=(&'a [u8], Output)> { + where + I: for<'a> IntoStreamer<'a, Into = S, Item = (&'a [u8], Output)>, + S: 'f + for<'a> Streamer<'a, Item = (&'a [u8], Output)>, + { let mut op = self.op().add(stream).intersection(); let mut count = 0; while let Some(_) = op.next() { @@ -523,8 +530,10 @@ impl Fst { /// `stream` must be a lexicographically ordered sequence of byte strings /// with associated values. pub fn is_superset<'f, I, S>(&self, stream: I) -> bool - where I: for<'a> IntoStreamer<'a, Into=S, Item=(&'a [u8], Output)>, - S: 'f + for<'a> Streamer<'a, Item=(&'a [u8], Output)> { + where + I: for<'a> IntoStreamer<'a, Into = S, Item = (&'a [u8], Output)>, + S: 'f + for<'a> Streamer<'a, Item = (&'a [u8], Output)>, + { let mut op = self.op().add(stream).union(); let mut count = 0; while let Some(_) = op.next() { @@ -603,7 +612,7 @@ impl<'a, 'f> IntoStreamer<'a> for &'f Fst { /// the stream. By default, no filtering is done. /// /// The `'f` lifetime parameter refers to the lifetime of the underlying fst. 
-pub struct StreamBuilder<'f, A=AlwaysMatch> { +pub struct StreamBuilder<'f, A = AlwaysMatch> { fst: &'f Fst, aut: A, min: Bound, @@ -692,7 +701,10 @@ impl Bound { /// the stream. By default, no filtering is done. /// /// The `'f` lifetime parameter refers to the lifetime of the underlying fst. -pub struct Stream<'f, A=AlwaysMatch> where A: Automaton { +pub struct Stream<'f, A = AlwaysMatch> +where + A: Automaton, +{ fst: &'f Fst, aut: A, inp: Vec, @@ -744,12 +756,8 @@ impl<'f, A: Automaton> Stream<'f, A> { return; } let (key, inclusive) = match min { - Bound::Excluded(ref min) => { - (min, false) - } - Bound::Included(ref min) => { - (min, true) - } + Bound::Excluded(ref min) => (min, false), + Bound::Included(ref min) => (min, true), Bound::Unbounded => unreachable!(), }; // At this point, we need to find the starting location of `min` in @@ -770,7 +778,7 @@ impl<'f, A: Automaton> Stream<'f, A> { self.inp.push(b); self.stack.push(StreamState { node: node, - trans: i+1, + trans: i + 1, out: out, aut_state: prev_state, }); @@ -785,9 +793,10 @@ impl<'f, A: Automaton> Stream<'f, A> { // input byte. self.stack.push(StreamState { node: node, - trans: node.transitions() - .position(|t| t.inp > b) - .unwrap_or(node.len()), + trans: node + .transitions() + .position(|t| t.inp > b) + .unwrap_or(node.len()), out: out, aut_state: aut_state, }); @@ -890,7 +899,8 @@ impl<'f, 'a, A: Automaton> Streamer<'a> for Stream<'f, A> { } while let Some(state) = self.stack.pop() { if state.trans >= state.node.len() - || !self.aut.can_match(&state.aut_state) { + || !self.aut.can_match(&state.aut_state) + { if state.node.addr() != self.fst.root_addr { self.inp.pop().unwrap(); } @@ -902,9 +912,7 @@ impl<'f, 'a, A: Automaton> Streamer<'a> for Stream<'f, A> { let is_match = self.aut.is_match(&next_state); let next_node = self.fst.node(trans.addr); self.inp.push(trans.inp); - self.stack.push(StreamState { - trans: state.trans + 1, .. 
state - }); + self.stack.push(StreamState { trans: state.trans + 1, ..state }); self.stack.push(StreamState { node: next_node, trans: 0, @@ -982,8 +990,11 @@ impl Output { /// This function panics if `self > o`. #[inline] pub fn sub(self, o: Output) -> Output { - Output(self.0.checked_sub(o.0) - .expect("BUG: underflow subtraction not allowed")) + Output( + self.0 + .checked_sub(o.0) + .expect("BUG: underflow subtraction not allowed"), + ) } } @@ -1027,11 +1038,7 @@ pub struct Transition { impl Default for Transition { #[inline] fn default() -> Self { - Transition { - inp: 0, - out: Output::zero(), - addr: NONE_ADDRESS, - } + Transition { inp: 0, out: Output::zero(), addr: NONE_ADDRESS } } } @@ -1040,8 +1047,13 @@ impl fmt::Debug for Transition { if self.out.is_zero() { write!(f, "{} -> {}", self.inp as char, self.addr) } else { - write!(f, "({}, {}) -> {}", - self.inp as char, self.out.value(), self.addr) + write!( + f, + "({}, {}) -> {}", + self.inp as char, + self.out.value(), + self.addr + ) } } } @@ -1056,11 +1068,14 @@ fn u64_to_usize(n: u64) -> usize { #[cfg(not(target_pointer_width = "64"))] fn u64_to_usize(n: u64) -> usize { if n > ::std::usize::MAX as u64 { - panic!("\ + panic!( + "\ Cannot convert node address {} to a pointer sized variable. 
If this FST is very large and was generated on a system with a larger pointer size than this system, then it is not possible to read this FST on this -system.", n); +system.", + n + ); } n as usize } diff --git a/src/raw/node.rs b/src/raw/node.rs index 13d70e5..714796f 100644 --- a/src/raw/node.rs +++ b/src/raw/node.rs @@ -5,10 +5,10 @@ use std::ops::Range; use byteorder::WriteBytesExt; -use raw::{EMPTY_ADDRESS, CompiledAddr, Output, Transition, u64_to_usize}; use raw::build::BuilderNode; use raw::common_inputs::{COMMON_INPUTS, COMMON_INPUTS_INV}; use raw::pack::{pack_size, pack_uint, pack_uint_in, unpack_uint}; +use raw::{u64_to_usize, CompiledAddr, Output, Transition, EMPTY_ADDRESS}; /// The threshold (in number of transitions) at which an index is created for /// a node's transitions. This speeds up lookup time at the expense of FST @@ -59,19 +59,17 @@ pub fn node_new(version: u64, addr: CompiledAddr, data: &[u8]) -> Node { use self::State::*; let state = State::new(data, addr); match state { - EmptyFinal => { - Node { - data: &[], - version: version, - state: State::EmptyFinal, - start: EMPTY_ADDRESS, - end: EMPTY_ADDRESS, - is_final: true, - ntrans: 0, - sizes: PackSizes::new(), - final_output: Output::zero(), - } - } + EmptyFinal => Node { + data: &[], + version: version, + state: State::EmptyFinal, + start: EMPTY_ADDRESS, + end: EMPTY_ADDRESS, + is_final: true, + ntrans: 0, + sizes: PackSizes::new(), + final_output: Output::zero(), + }, OneTransNext(s) => { let data = &data[..addr + 1]; Node { @@ -149,13 +147,11 @@ impl<'f> Node<'f> { addr: s.trans_addr(self), } } - AnyTrans(s) => { - Transition { - inp: s.input(self, i), - out: s.output(self, i), - addr: s.trans_addr(self, i), - } - } + AnyTrans(s) => Transition { + inp: s.input(self, i), + out: s.output(self, i), + addr: s.trans_addr(self, i), + }, EmptyFinal => panic!("out of bounds"), } } @@ -255,14 +251,16 @@ impl<'f> Node<'f> { assert!(node.trans.len() <= 256); if node.trans.is_empty() && node.is_final 
- && node.final_output.is_zero() { + && node.final_output.is_zero() + { return Ok(()); } else if node.trans.len() != 1 || node.is_final { StateAnyTrans::compile(wtr, addr, node) } else { if !node.is_final && node.trans[0].addr == last_addr - && node.trans[0].out.is_zero() { + && node.trans[0].out.is_zero() + { StateOneTransNext::compile(wtr, addr, node.trans[0].inp) } else { StateOneTrans::compile(wtr, addr, node.trans[0]) @@ -273,7 +271,10 @@ impl<'f> Node<'f> { impl BuilderNode { pub fn compile_to( - &self, wtr: W, last_addr: CompiledAddr, addr: CompiledAddr, + &self, + wtr: W, + last_addr: CompiledAddr, + addr: CompiledAddr, ) -> io::Result<()> { Node::compile(wtr, last_addr, addr, self) } @@ -315,7 +316,9 @@ impl State { impl StateOneTransNext { fn compile( - mut wtr: W, _: CompiledAddr, input: u8, + mut wtr: W, + _: CompiledAddr, + input: u8, ) -> io::Result<()> { let mut state = StateOneTransNext::new(); state.set_common_input(input); @@ -341,7 +344,11 @@ impl StateOneTransNext { #[inline(always)] fn input_len(&self) -> usize { - if self.common_input().is_none() { 1 } else { 0 } + if self.common_input().is_none() { + 1 + } else { + 0 + } } #[inline(always)] @@ -366,14 +373,13 @@ impl StateOneTransNext { impl StateOneTrans { fn compile( - mut wtr: W, addr: CompiledAddr, trans: Transition, + mut wtr: W, + addr: CompiledAddr, + trans: Transition, ) -> io::Result<()> { let out = trans.out.value(); - let output_pack_size = if out == 0 { - 0 - } else { - pack_uint(&mut wtr, out)? - }; + let output_pack_size = + if out == 0 { 0 } else { pack_uint(&mut wtr, out)? 
}; let trans_pack_size = pack_delta(&mut wtr, addr, trans.addr)?; let mut pack_sizes = PackSizes::new(); @@ -404,14 +410,16 @@ impl StateOneTrans { #[inline(always)] fn input_len(&self) -> usize { - if self.common_input().is_none() { 1 } else { 0 } + if self.common_input().is_none() { + 1 + } else { + 0 + } } #[inline(always)] fn sizes(&self, data: &[u8]) -> PackSizes { - let i = data.len() - 1 - - self.input_len() - - 1; + let i = data.len() - 1 - self.input_len() - 1; PackSizes::decode(data[i]) } @@ -460,7 +468,9 @@ impl StateOneTrans { impl StateAnyTrans { fn compile( - mut wtr: W, addr: CompiledAddr, node: &BuilderNode, + mut wtr: W, + addr: CompiledAddr, + node: &BuilderNode, ) -> io::Result<()> { assert!(node.trans.len() <= 256); @@ -559,9 +569,7 @@ impl StateAnyTrans { #[inline(always)] fn sizes(&self, data: &[u8]) -> PackSizes { - let i = data.len() - 1 - - self.ntrans_len() - - 1; + let i = data.len() - 1 - self.ntrans_len() - 1; PackSizes::decode(data[i]) } @@ -587,7 +595,11 @@ impl StateAnyTrans { #[inline(always)] fn ntrans_len(&self) -> usize { - if self.state_ntrans().is_none() { 1 } else { 0 } + if self.state_ntrans().is_none() { + 1 + } else { + 0 + } } #[inline(always)] @@ -636,11 +648,7 @@ impl StateAnyTrans { ntrans: usize, ) -> usize { let osize = sizes.output_pack_size(); - let final_osize = if !self.is_final_state() { - 0 - } else { - osize - }; + let final_osize = if !self.is_final_state() { 0 } else { osize }; data.len() - 1 - self.ntrans_len() - 1 // pack size @@ -859,11 +867,11 @@ fn unpack_delta( #[cfg(test)] mod tests { - use quickcheck::{TestResult, quickcheck}; + use quickcheck::{quickcheck, TestResult}; - use raw::{VERSION, Builder, Transition, CompiledAddr, Fst, Output}; use raw::build::BuilderNode; - use raw::node::{Node, node_new}; + use raw::node::{node_new, Node}; + use raw::{Builder, CompiledAddr, Fst, Output, Transition, VERSION}; use stream::Streamer; const NEVER_LAST: CompiledAddr = ::std::u64::MAX as CompiledAddr; @@ -894,8 
+902,9 @@ mod tests { assert_eq!(compiled.is_final(), uncompiled.is_final); assert_eq!(compiled.len(), uncompiled.trans.len()); assert_eq!(compiled.final_output(), uncompiled.final_output); - for (ct, ut) - in compiled.transitions().zip(uncompiled.trans.iter().cloned()) { + for (ct, ut) in + compiled.transitions().zip(uncompiled.trans.iter().cloned()) + { assert_eq!(ct.inp, ut.inp); assert_eq!(ct.out, ut.out); assert_eq!(ct.addr, ut.addr); @@ -964,9 +973,12 @@ mod tests { is_final: false, final_output: Output::zero(), trans: vec![ - trans(2, b'a'), trans(3, b'b'), - trans(4, b'c'), trans(5, b'd'), - trans(6, b'e'), trans(7, b'f'), + trans(2, b'a'), + trans(3, b'b'), + trans(4, b'c'), + trans(5, b'd'), + trans(6, b'e'), + trans(7, b'f'), ], }; let (addr, buf) = compile(&bnode); diff --git a/src/raw/ops.rs b/src/raw/ops.rs index b142c11..604c8da 100644 --- a/src/raw/ops.rs +++ b/src/raw/ops.rs @@ -6,7 +6,8 @@ use raw::Output; use stream::{IntoStreamer, Streamer}; /// Permits stream operations to be hetergeneous with respect to streams. -type BoxedStream<'f> = Box Streamer<'a, Item=(&'a [u8], Output)> + 'f>; +type BoxedStream<'f> = + Box Streamer<'a, Item = (&'a [u8], Output)> + 'f>; /// A value indexed by a stream. /// @@ -59,8 +60,10 @@ impl<'f> OpBuilder<'f> { /// The stream must emit a lexicographically ordered sequence of key-value /// pairs. pub fn add(mut self, stream: I) -> Self - where I: for<'a> IntoStreamer<'a, Into=S, Item=(&'a [u8], Output)>, - S: 'f + for<'a> Streamer<'a, Item=(&'a [u8], Output)> { + where + I: for<'a> IntoStreamer<'a, Into = S, Item = (&'a [u8], Output)>, + S: 'f + for<'a> Streamer<'a, Item = (&'a [u8], Output)>, + { self.push(stream); self } @@ -70,8 +73,10 @@ impl<'f> OpBuilder<'f> { /// The stream must emit a lexicographically ordered sequence of key-value /// pairs. 
pub fn push(&mut self, stream: I) - where I: for<'a> IntoStreamer<'a, Into=S, Item=(&'a [u8], Output)>, - S: 'f + for<'a> Streamer<'a, Item=(&'a [u8], Output)> { + where + I: for<'a> IntoStreamer<'a, Into = S, Item = (&'a [u8], Output)>, + S: 'f + for<'a> Streamer<'a, Item = (&'a [u8], Output)>, + { self.streams.push(Box::new(stream.into_stream())); } @@ -164,9 +169,14 @@ impl<'f> OpBuilder<'f> { } impl<'f, I, S> Extend for OpBuilder<'f> - where I: for<'a> IntoStreamer<'a, Into=S, Item=(&'a [u8], Output)>, - S: 'f + for<'a> Streamer<'a, Item=(&'a [u8], Output)> { - fn extend(&mut self, it: T) where T: IntoIterator { +where + I: for<'a> IntoStreamer<'a, Into = S, Item = (&'a [u8], Output)>, + S: 'f + for<'a> Streamer<'a, Item = (&'a [u8], Output)>, +{ + fn extend(&mut self, it: T) + where + T: IntoIterator, + { for stream in it { self.push(stream); } @@ -174,9 +184,14 @@ impl<'f, I, S> Extend for OpBuilder<'f> } impl<'f, I, S> FromIterator for OpBuilder<'f> - where I: for<'a> IntoStreamer<'a, Into=S, Item=(&'a [u8], Output)>, - S: 'f + for<'a> Streamer<'a, Item=(&'a [u8], Output)> { - fn from_iter(it: T) -> Self where T: IntoIterator { +where + I: for<'a> IntoStreamer<'a, Into = S, Item = (&'a [u8], Output)>, + S: 'f + for<'a> Streamer<'a, Item = (&'a [u8], Output)>, +{ + fn from_iter(it: T) -> Self + where + T: IntoIterator, + { let mut op = OpBuilder::new(); op.extend(it); op @@ -251,7 +266,7 @@ impl<'a, 'f> Streamer<'a> for Intersection<'f> { } else { self.cur_slot = Some(slot); let key = self.cur_slot.as_ref().unwrap().input(); - return Some((key, &self.outs)) + return Some((key, &self.outs)); } } } @@ -283,10 +298,8 @@ impl<'a, 'f> Streamer<'a> for Difference<'f> { self.key.clear(); self.key.extend(key); self.outs.clear(); - self.outs.push(IndexedValue { - index: 0, - value: out.value(), - }); + self.outs + .push(IndexedValue { index: 0, value: out.value() }); } }; let mut unique = true; @@ -297,7 +310,7 @@ impl<'a, 'f> Streamer<'a> for Difference<'f> { 
self.heap.refill(slot); } if unique { - return Some((&self.key, &self.outs)) + return Some((&self.key, &self.outs)); } } } @@ -340,7 +353,7 @@ impl<'a, 'f> Streamer<'a> for SymmetricDifference<'f> { } else { self.cur_slot = Some(slot); let key = self.cur_slot.as_ref().unwrap().input(); - return Some((key, &self.outs)) + return Some((key, &self.outs)); } } } @@ -353,10 +366,7 @@ struct StreamHeap<'f> { impl<'f> StreamHeap<'f> { fn new(streams: Vec>) -> StreamHeap<'f> { - let mut u = StreamHeap { - rdrs: streams, - heap: BinaryHeap::new(), - }; + let mut u = StreamHeap { rdrs: streams, heap: BinaryHeap::new() }; for i in 0..u.rdrs.len() { u.refill(Slot::new(i)); } @@ -437,8 +447,8 @@ impl Slot { impl PartialOrd for Slot { fn partial_cmp(&self, other: &Slot) -> Option { (&self.input, self.output) - .partial_cmp(&(&other.input, other.output)) - .map(|ord| ord.reverse()) + .partial_cmp(&(&other.input, other.output)) + .map(|ord| ord.reverse()) } } @@ -456,7 +466,9 @@ mod tests { use super::OpBuilder; - fn s(string: &str) -> String { string.to_owned() } + fn s(string: &str) -> String { + string.to_owned() + } macro_rules! create_set_op { ($name:ident, $op:ident) => { @@ -470,7 +482,7 @@ mod tests { } keys } - } + }; } macro_rules! 
create_map_op { @@ -487,7 +499,7 @@ mod tests { } keys } - } + }; } create_set_op!(fst_union, union); @@ -501,19 +513,13 @@ mod tests { #[test] fn union_set() { - let v = fst_union(vec![ - vec!["a", "b", "c"], - vec!["x", "y", "z"], - ]); + let v = fst_union(vec![vec!["a", "b", "c"], vec!["x", "y", "z"]]); assert_eq!(v, vec!["a", "b", "c", "x", "y", "z"]); } #[test] fn union_set_dupes() { - let v = fst_union(vec![ - vec!["aa", "b", "cc"], - vec!["b", "cc", "z"], - ]); + let v = fst_union(vec![vec!["aa", "b", "cc"], vec!["b", "cc", "z"]]); assert_eq!(v, vec!["aa", "b", "cc", "z"]); } @@ -523,10 +529,17 @@ mod tests { vec![("a", 1), ("b", 2), ("c", 3)], vec![("x", 1), ("y", 2), ("z", 3)], ]); - assert_eq!(v, vec![ - (s("a"), 1), (s("b"), 2), (s("c"), 3), - (s("x"), 1), (s("y"), 2), (s("z"), 3), - ]); + assert_eq!( + v, + vec![ + (s("a"), 1), + (s("b"), 2), + (s("c"), 3), + (s("x"), 1), + (s("y"), 2), + (s("z"), 3), + ] + ); } #[test] @@ -536,17 +549,16 @@ mod tests { vec![("b", 1), ("cc", 2), ("z", 3)], vec![("b", 1)], ]); - assert_eq!(v, vec![ - (s("aa"), 1), (s("b"), 4), (s("cc"), 5), (s("z"), 3), - ]); + assert_eq!( + v, + vec![(s("aa"), 1), (s("b"), 4), (s("cc"), 5), (s("z"), 3),] + ); } #[test] fn intersect_set() { - let v = fst_intersection(vec![ - vec!["a", "b", "c"], - vec!["x", "y", "z"], - ]); + let v = + fst_intersection(vec![vec!["a", "b", "c"], vec!["x", "y", "z"]]); assert_eq!(v, Vec::::new()); } @@ -611,15 +623,9 @@ mod tests { #[test] fn difference2() { // Regression test: https://github.com/BurntSushi/fst/issues/19 - let v = fst_difference(vec![ - vec!["a", "c"], - vec!["b", "c"], - ]); + let v = fst_difference(vec![vec!["a", "c"], vec!["b", "c"]]); assert_eq!(v, vec!["a"]); - let v = fst_difference(vec![ - vec!["bar", "foo"], - vec!["baz", "foo"], - ]); + let v = fst_difference(vec![vec!["bar", "foo"], vec!["baz", "foo"]]); assert_eq!(v, vec!["bar"]); } diff --git a/src/raw/pack.rs b/src/raw/pack.rs index 983ba3b..a5926a5 100644 --- 
a/src/raw/pack.rs +++ b/src/raw/pack.rs @@ -1,6 +1,6 @@ use std::io; -use byteorder::{ByteOrder, WriteBytesExt, LittleEndian}; +use byteorder::{ByteOrder, LittleEndian, WriteBytesExt}; /// pack_uint packs the given integer in the smallest number of bytes possible, /// and writes it to the given writer. The number of bytes written is returned @@ -55,9 +55,9 @@ pub fn pack_size(n: u64) -> u8 { #[cfg(test)] mod tests { - use std::io; - use quickcheck::{QuickCheck, StdGen}; use super::*; + use quickcheck::{QuickCheck, StdGen}; + use std::io; #[test] fn prop_pack_in_out() { diff --git a/src/raw/registry.rs b/src/raw/registry.rs index b6bd069..5d9e938 100644 --- a/src/raw/registry.rs +++ b/src/raw/registry.rs @@ -1,11 +1,11 @@ -use raw::{NONE_ADDRESS, CompiledAddr}; use raw::build::BuilderNode; +use raw::{CompiledAddr, NONE_ADDRESS}; #[derive(Debug)] pub struct Registry { table: Vec, table_size: usize, // number of rows - mru_size: usize, // number of columns + mru_size: usize, // number of columns } #[derive(Debug)] @@ -94,7 +94,7 @@ impl<'a> RegistryCache<'a> { fn promote(&mut self, mut i: usize) { assert!(i < self.cells.len()); while i > 0 { - self.cells.swap(i-1, i); + self.cells.swap(i - 1, i); i -= 1; } } @@ -102,10 +102,7 @@ impl<'a> RegistryCache<'a> { impl RegistryCell { fn none() -> RegistryCell { - RegistryCell { - addr: NONE_ADDRESS, - node: BuilderNode::default(), - } + RegistryCell { addr: NONE_ADDRESS, node: BuilderNode::default() } } fn is_none(&self) -> bool { @@ -119,11 +116,9 @@ impl RegistryCell { #[cfg(test)] mod tests { - use raw::{Output, Transition}; + use super::{Registry, RegistryCache, RegistryCell, RegistryEntry}; use raw::build::BuilderNode; - use super::{ - Registry, RegistryCell, RegistryEntry, RegistryCache, - }; + use raw::{Output, Transition}; fn assert_rejected(entry: RegistryEntry) { match entry { @@ -179,23 +174,30 @@ mod tests { is_final: false, final_output: Output::zero(), trans: vec![Transition { - addr: 0, inp: b'a', out: 
Output::zero(), + addr: 0, + inp: b'a', + out: Output::zero(), }], }; assert_insert_and_found(&mut reg, &bnode); assert_not_found( - reg.entry(&BuilderNode { is_final: true, .. bnode.clone() })); + reg.entry(&BuilderNode { is_final: true, ..bnode.clone() }), + ); assert_not_found(reg.entry(&BuilderNode { trans: vec![Transition { - addr: 0, inp: b'b', out: Output::zero(), + addr: 0, + inp: b'b', + out: Output::zero(), }], - .. bnode.clone() + ..bnode.clone() })); assert_not_found(reg.entry(&BuilderNode { trans: vec![Transition { - addr: 0, inp: b'a', out: Output::new(1), + addr: 0, + inp: b'a', + out: Output::new(1), }], - .. bnode.clone() + ..bnode.clone() })); } @@ -206,9 +208,8 @@ mod tests { let bnode1 = BuilderNode { is_final: true, ..BuilderNode::default() }; assert_insert_and_found(&mut reg, &bnode1); - let bnode2 = BuilderNode { - final_output: Output::new(1), ..bnode1.clone() - }; + let bnode2 = + BuilderNode { final_output: Output::new(1), ..bnode1.clone() }; assert_insert_and_found(&mut reg, &bnode2); assert_not_found(reg.entry(&bnode1)); } diff --git a/src/raw/registry_minimal.rs b/src/raw/registry_minimal.rs index 0832de2..a92823d 100644 --- a/src/raw/registry_minimal.rs +++ b/src/raw/registry_minimal.rs @@ -13,8 +13,8 @@ use std::collections::hash_map::{Entry, HashMap}; -use raw::CompiledAddr; use raw::build::BuilderNode; +use raw::CompiledAddr; #[derive(Debug)] pub struct Registry { diff --git a/src/raw/tests.rs b/src/raw/tests.rs index 9c4b771..b1a185b 100644 --- a/src/raw/tests.rs +++ b/src/raw/tests.rs @@ -2,13 +2,16 @@ use std::sync::Arc; use automaton::AlwaysMatch; use error::Error; -use raw::{self, VERSION, Builder, Bound, Fst, Stream, Output}; +use raw::{self, Bound, Builder, Fst, Output, Stream, VERSION}; use stream::Streamer; const TEXT: &'static str = include_str!("./../../data/words-100000"); pub fn fst_set(ss: I) -> Fst - where I: IntoIterator, S: AsRef<[u8]> { +where + I: IntoIterator, + S: AsRef<[u8]>, +{ let mut bfst = 
Builder::memory(); let mut ss: Vec> = ss.into_iter().map(|s| s.as_ref().to_vec()).collect(); @@ -23,7 +26,10 @@ pub fn fst_set(ss: I) -> Fst } pub fn fst_map(ss: I) -> Fst - where I: IntoIterator, S: AsRef<[u8]> { +where + I: IntoIterator, + S: AsRef<[u8]>, +{ let mut bfst = Builder::memory(); let mut ss: Vec<(Vec, u64)> = ss.into_iter().map(|(s, o)| (s.as_ref().to_vec(), o)).collect(); @@ -98,15 +104,16 @@ test_set_fail!(fst_set_order2, "a", "b", "c", "a"); #[test] fn fst_set_100000() { - let words: Vec> = TEXT.lines() - .map(|s| s.as_bytes().to_vec()) - .collect(); + let words: Vec> = + TEXT.lines().map(|s| s.as_bytes().to_vec()).collect(); let fst = fst_set(words.clone()); assert_eq!(words, fst_inputs(&fst)); for word in &words { - assert!(fst.get(word).is_some(), - "failed to find word: {}", - ::std::str::from_utf8(word).unwrap()); + assert!( + fst.get(word).is_some(), + "failed to find word: {}", + ::std::str::from_utf8(word).unwrap() + ); } } @@ -149,8 +156,18 @@ test_map!(fst_map_two, "a", 1, "b", 2); test_map!(fst_map_many1, "a", 34786, "ab", 26); test_map!( fst_map_many2, - "a", 34786, "ab", 26, "abc", 58976, "abcd", 25, - "z", 58, "zabc", 6798 + "a", + 34786, + "ab", + 26, + "abc", + 58976, + "abcd", + 25, + "z", + 58, + "zabc", + 6798 ); test_map!(fst_map_many3, "a", 1, "ab", 0, "abc", 0); @@ -162,11 +179,11 @@ test_map_fail!(fst_map_order2, "a", 0, "b", 0, "c", 0, "a", 0); #[test] fn fst_map_100000_increments() { - let words: Vec<(Vec, u64)> = - TEXT.lines() - .enumerate() - .map(|(i, s)| (s.as_bytes().to_vec(), i as u64)) - .collect(); + let words: Vec<(Vec, u64)> = TEXT + .lines() + .enumerate() + .map(|(i, s)| (s.as_bytes().to_vec(), i as u64)) + .collect(); let fst = fst_map(words.clone()); assert_eq!(words, fst_inputs_outputs(&fst)); for &(ref word, out) in &words { @@ -176,10 +193,10 @@ fn fst_map_100000_increments() { #[test] fn fst_map_100000_lengths() { - let words: Vec<(Vec, u64)> = - TEXT.lines() - .map(|s| (s.as_bytes().to_vec(), s.len() as 
u64)) - .collect(); + let words: Vec<(Vec, u64)> = TEXT + .lines() + .map(|s| (s.as_bytes().to_vec(), s.len() as u64)) + .collect(); let fst = fst_map(words.clone()); assert_eq!(words, fst_inputs_outputs(&fst)); for &(ref word, out) in &words { @@ -477,7 +494,11 @@ fn one_vec_multiple_fsts() { let fst1 = Fst::from_shared_bytes(bytes.clone(), 0, fst1_len).unwrap(); let fst2 = Fst::from_shared_bytes( - bytes.clone(), fst1_len, bytes.len() - fst1_len).unwrap(); + bytes.clone(), + fst1_len, + bytes.len() - fst1_len, + ) + .unwrap(); assert_eq!(fst_inputs(&fst1), vec![b"bar".to_vec(), b"baz".to_vec()]); assert_eq!(fst_inputs(&fst2), vec![b"bar".to_vec(), b"foo".to_vec()]); diff --git a/src/set.rs b/src/set.rs index ef23e44..6c0d894 100644 --- a/src/set.rs +++ b/src/set.rs @@ -1,10 +1,10 @@ use std::fmt; -use std::iter::{self, FromIterator}; use std::io; +use std::iter::{self, FromIterator}; #[cfg(feature = "mmap")] use std::path::Path; -use automaton::{Automaton, AlwaysMatch}; +use automaton::{AlwaysMatch, Automaton}; use raw; use stream::{IntoStreamer, Streamer}; use Result; @@ -93,7 +93,10 @@ impl Set { /// To build a set that streams to an arbitrary `io::Write`, use /// `SetBuilder`. pub fn from_iter(iter: I) -> Result - where T: AsRef<[u8]>, I: IntoIterator { + where + T: AsRef<[u8]>, + I: IntoIterator, + { let mut builder = SetBuilder::memory(); builder.extend_iter(iter)?; Set::from_bytes(builder.into_inner()?) 
@@ -281,8 +284,10 @@ impl Set { /// assert_eq!(set1.is_disjoint(&set3), false); /// ``` pub fn is_disjoint<'f, I, S>(&self, stream: I) -> bool - where I: for<'a> IntoStreamer<'a, Into=S, Item=&'a [u8]>, - S: 'f + for<'a> Streamer<'a, Item=&'a [u8]> { + where + I: for<'a> IntoStreamer<'a, Into = S, Item = &'a [u8]>, + S: 'f + for<'a> Streamer<'a, Item = &'a [u8]>, + { self.0.is_disjoint(StreamZeroOutput(stream.into_stream())) } @@ -306,8 +311,10 @@ impl Set { /// assert_eq!(set3.is_subset(&set1), true); /// ``` pub fn is_subset<'f, I, S>(&self, stream: I) -> bool - where I: for<'a> IntoStreamer<'a, Into=S, Item=&'a [u8]>, - S: 'f + for<'a> Streamer<'a, Item=&'a [u8]> { + where + I: for<'a> IntoStreamer<'a, Into = S, Item = &'a [u8]>, + S: 'f + for<'a> Streamer<'a, Item = &'a [u8]>, + { self.0.is_subset(StreamZeroOutput(stream.into_stream())) } @@ -331,8 +338,10 @@ impl Set { /// assert_eq!(set3.is_superset(&set1), false); /// ``` pub fn is_superset<'f, I, S>(&self, stream: I) -> bool - where I: for<'a> IntoStreamer<'a, Into=S, Item=&'a [u8]>, - S: 'f + for<'a> Streamer<'a, Item=&'a [u8]> { + where + I: for<'a> IntoStreamer<'a, Into = S, Item = &'a [u8]>, + S: 'f + for<'a> Streamer<'a, Item = &'a [u8]>, + { self.0.is_superset(StreamZeroOutput(stream.into_stream())) } @@ -515,7 +524,10 @@ impl SetBuilder { /// If an error occurred while adding an element, processing is stopped /// and the error is returned. pub fn extend_iter(&mut self, iter: I) -> Result<()> - where T: AsRef<[u8]>, I: IntoIterator { + where + T: AsRef<[u8]>, + I: IntoIterator, + { for key in iter { self.0.add(key)?; } @@ -527,8 +539,10 @@ impl SetBuilder { /// Note that unlike `extend_iter`, this is not generic on the items in /// the stream. 
pub fn extend_stream<'f, I, S>(&mut self, stream: I) -> Result<()> - where I: for<'a> IntoStreamer<'a, Into=S, Item=&'a [u8]>, - S: 'f + for<'a> Streamer<'a, Item=&'a [u8]> { + where + I: for<'a> IntoStreamer<'a, Into = S, Item = &'a [u8]>, + S: 'f + for<'a> Streamer<'a, Item = &'a [u8]>, + { self.0.extend_stream(StreamZeroOutput(stream.into_stream())) } @@ -554,7 +568,6 @@ impl SetBuilder { pub fn bytes_written(&self) -> u64 { self.0.bytes_written() } - } /// A lexicographically ordered stream of keys from a set. @@ -563,7 +576,9 @@ impl SetBuilder { /// the stream. By default, no filtering is done. /// /// The `'s` lifetime parameter refers to the lifetime of the underlying set. -pub struct Stream<'s, A=AlwaysMatch>(raw::Stream<'s, A>) where A: Automaton; +pub struct Stream<'s, A = AlwaysMatch>(raw::Stream<'s, A>) +where + A: Automaton; impl<'s, A: Automaton> Stream<'s, A> { /// Creates a new set stream from an fst stream. @@ -612,7 +627,7 @@ impl<'a, 's, A: Automaton> Streamer<'a> for Stream<'s, A> { /// the stream. By default, no filtering is done. /// /// The `'s` lifetime parameter refers to the lifetime of the underlying set. -pub struct StreamBuilder<'s, A=AlwaysMatch>(raw::StreamBuilder<'s, A>); +pub struct StreamBuilder<'s, A = AlwaysMatch>(raw::StreamBuilder<'s, A>); impl<'s, A: Automaton> StreamBuilder<'s, A> { /// Specify a greater-than-or-equal-to bound. @@ -676,8 +691,10 @@ impl<'s> OpBuilder<'s> { /// The stream must emit a lexicographically ordered sequence of byte /// strings. pub fn add(mut self, streamable: I) -> Self - where I: for<'a> IntoStreamer<'a, Into=S, Item=&'a [u8]>, - S: 's + for<'a> Streamer<'a, Item=&'a [u8]> { + where + I: for<'a> IntoStreamer<'a, Into = S, Item = &'a [u8]>, + S: 's + for<'a> Streamer<'a, Item = &'a [u8]>, + { self.push(streamable); self } @@ -687,8 +704,10 @@ impl<'s> OpBuilder<'s> { /// The stream must emit a lexicographically ordered sequence of byte /// strings. 
pub fn push(&mut self, streamable: I) - where I: for<'a> IntoStreamer<'a, Into=S, Item=&'a [u8]>, - S: 's + for<'a> Streamer<'a, Item=&'a [u8]> { + where + I: for<'a> IntoStreamer<'a, Into = S, Item = &'a [u8]>, + S: 's + for<'a> Streamer<'a, Item = &'a [u8]>, + { self.0.push(StreamZeroOutput(streamable.into_stream())); } @@ -795,9 +814,14 @@ impl<'s> OpBuilder<'s> { } impl<'f, I, S> Extend for OpBuilder<'f> - where I: for<'a> IntoStreamer<'a, Into=S, Item=&'a [u8]>, - S: 'f + for<'a> Streamer<'a, Item=&'a [u8]> { - fn extend(&mut self, it: T) where T: IntoIterator { +where + I: for<'a> IntoStreamer<'a, Into = S, Item = &'a [u8]>, + S: 'f + for<'a> Streamer<'a, Item = &'a [u8]>, +{ + fn extend(&mut self, it: T) + where + T: IntoIterator, + { for stream in it { self.push(stream); } @@ -805,9 +829,14 @@ impl<'f, I, S> Extend for OpBuilder<'f> } impl<'f, I, S> FromIterator for OpBuilder<'f> - where I: for<'a> IntoStreamer<'a, Into=S, Item=&'a [u8]>, - S: 'f + for<'a> Streamer<'a, Item=&'a [u8]> { - fn from_iter(it: T) -> Self where T: IntoIterator { +where + I: for<'a> IntoStreamer<'a, Into = S, Item = &'a [u8]>, + S: 'f + for<'a> Streamer<'a, Item = &'a [u8]>, +{ + fn from_iter(it: T) -> Self + where + T: IntoIterator, + { let mut op = OpBuilder::new(); op.extend(it); op @@ -892,8 +921,8 @@ impl<'a, S: Streamer<'a>> Streamer<'a> for StreamZeroOutput { #[cfg(test)] mod tests { - use Streamer; use super::OpBuilder; + use Streamer; #[test] fn no_fsts() { @@ -929,11 +958,7 @@ mod tests { &b"fubar"[..], &b"quux"[..], ])) - .add(Iter::new(vec![ - &b"bar"[..], - &b"foofoo"[..], - &b"fubar"[..], - ])) + .add(Iter::new(vec![&b"bar"[..], &b"foofoo"[..], &b"fubar"[..]])) .intersection(); let mut got = vec![]; diff --git a/src/stream.rs b/src/stream.rs index db6ac36..e78a596 100644 --- a/src/stream.rs +++ b/src/stream.rs @@ -114,7 +114,7 @@ pub trait IntoStreamer<'a> { /// The type of the item emitted by the stream. type Item: 'a; /// The type of the stream to be constructed. 
- type Into: Streamer<'a, Item=Self::Item>; + type Into: Streamer<'a, Item = Self::Item>; /// Construct a stream from `Self`. fn into_stream(self) -> Self::Into; diff --git a/tests/test.rs b/tests/test.rs index 98af1a7..821f895 100644 --- a/tests/test.rs +++ b/tests/test.rs @@ -5,10 +5,10 @@ extern crate fst_regex; use fst_levenshtein::Levenshtein; use fst_regex::Regex; -use fst::{Automaton, IntoStreamer, Streamer}; use fst::automaton::{Str, Subsequence}; use fst::raw::{Builder, Fst, Output}; -use fst::set::{Set, OpBuilder}; +use fst::set::{OpBuilder, Set}; +use fst::{Automaton, IntoStreamer, Streamer}; static WORDS: &'static str = include_str!("../data/words-10000"); @@ -17,7 +17,10 @@ fn get_set() -> Set { } fn fst_set(ss: I) -> Fst - where I: IntoIterator, S: AsRef<[u8]> { +where + I: IntoIterator, + S: AsRef<[u8]>, +{ let mut bfst = Builder::memory(); let mut ss: Vec> = ss.into_iter().map(|s| s.as_ref().to_vec()).collect(); @@ -78,9 +81,12 @@ fn startswith_small() { let stream = set.search(lev.starts_with()).into_stream(); let keys = stream.into_strs().unwrap(); - assert_eq!(keys, vec![ - "cooing", "fo", "fob", "focus", "foo", "food", "foul", "frothing", - ]); + assert_eq!( + keys, + vec![ + "cooing", "fo", "fob", "focus", "foo", "food", "foul", "frothing", + ] + ); } #[test] @@ -129,10 +135,8 @@ fn union_large() { let lev = Levenshtein::new("foo", 3).unwrap(); let reg = Regex::new("(..)*").unwrap(); let mut stream1 = set.search((&lev).union(®)).into_stream(); - let mut stream2 = OpBuilder::new() - .add(set.search(&lev)) - .add(set.search(®)) - .union(); + let mut stream2 = + OpBuilder::new().add(set.search(&lev)).add(set.search(®)).union(); while let Some(key1) = stream1.next() { assert_eq!(stream2.next(), Some(key1)); } From d7eeeec15c0075e7eaed03e467cebdd61e1e109a Mon Sep 17 00:00:00 2001 From: Andrew Gallant Date: Fri, 21 Feb 2020 16:49:11 -0500 Subject: [PATCH 03/23] ci: switch to GitHub Actions --- .github/workflows/ci.yml | 124 
+++++++++++++++++++++++++++++++++++++++ .travis.yml | 10 ---- README.md | 11 +--- appveyor.yml | 22 ------- ci/script.sh | 22 ------- 5 files changed, 126 insertions(+), 63 deletions(-) create mode 100644 .github/workflows/ci.yml delete mode 100644 .travis.yml delete mode 100644 appveyor.yml delete mode 100755 ci/script.sh diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml new file mode 100644 index 0000000..556f970 --- /dev/null +++ b/.github/workflows/ci.yml @@ -0,0 +1,124 @@ +name: ci +on: + pull_request: + push: + branches: + - master + schedule: + - cron: '00 01 * * *' +jobs: + test: + name: test + env: + # For some builds, we use cross to test on 32-bit and big-endian + # systems. + CARGO: cargo + # When CARGO is set to CROSS, TARGET is set to `--target matrix.target`. + TARGET: + runs-on: ${{ matrix.os }} + strategy: + matrix: + build: + - pinned + - stable + - stable-32 + - stable-mips + - beta + - nightly + - macos + - win-msvc + - win-gnu + include: + - build: pinned + os: ubuntu-18.04 + rust: 1.39.0 + - build: stable + os: ubuntu-18.04 + rust: stable + - build: stable-32 + os: ubuntu-18.04 + rust: stable + target: i686-unknown-linux-gnu + - build: stable-mips + os: ubuntu-18.04 + rust: stable + target: mips64-unknown-linux-gnuabi64 + - build: beta + os: ubuntu-18.04 + rust: beta + - build: nightly + os: ubuntu-18.04 + rust: nightly + - build: macos + os: macos-latest + rust: stable + - build: win-msvc + os: windows-2019 + rust: stable + - build: win-gnu + os: windows-2019 + rust: stable-x86_64-gnu + steps: + - name: Checkout repository + uses: actions/checkout@v1 + with: + fetch-depth: 1 + + - name: Install Rust + uses: actions-rs/toolchain@v1 + with: + toolchain: ${{ matrix.rust }} + profile: minimal + override: true + + - name: Use Cross + if: matrix.target != '' + run: | + # FIXME: to work around bugs in latest cross release, install master. 
+ # See: https://github.com/rust-embedded/cross/issues/357 + cargo install --git https://github.com/rust-embedded/cross + echo "::set-env name=CARGO::cross" + echo "::set-env name=TARGET::--target ${{ matrix.target }}" + + - name: Show command used for Cargo + run: | + echo "cargo command is: ${{ env.CARGO }}" + echo "target flag is: ${{ env.TARGET }}" + + - name: Build + run: ${{ env.CARGO }} build --verbose ${{ env.TARGET }} + + - name: Build docs + run: ${{ env.CARGO }} doc --verbose ${{ env.TARGET }} + + - name: Run tests + run: ${{ env.CARGO }} test --verbose --all ${{ env.TARGET }} + + - name: Run tests without default features + run: ${{ env.CARGO }} test --verbose --lib --no-default-features ${{ env.TARGET }} + + # Disable this for now until we publish regex-automata with fst support. + # - name: Build fst CLI tool + # run: ${{ env.CARGO }} build --verbose --manifest-path fst-bin/Cargo.toml ${{ env.TARGET }} + + - name: Compile benchmarks + run: ${{ env.CARGO }} bench --manifest-path bench/Cargo.toml --verbose ${{ env.TARGET }} -- --test + + rustfmt: + name: rustfmt + runs-on: ubuntu-18.04 + steps: + - name: Checkout repository + uses: actions/checkout@v1 + with: + fetch-depth: 1 + - name: Install Rust + uses: actions-rs/toolchain@v1 + with: + toolchain: stable + override: true + profile: minimal + components: rustfmt + - name: Check formatting + run: | + cargo fmt --all -- --check diff --git a/.travis.yml b/.travis.yml deleted file mode 100644 index ec508d0..0000000 --- a/.travis.yml +++ /dev/null @@ -1,10 +0,0 @@ -language: rust -rust: - - 1.24.0 - - stable - - beta - - nightly -script: ci/script.sh -branches: - only: - - master diff --git a/README.md b/README.md index e9bcc4b..fbf4941 100644 --- a/README.md +++ b/README.md @@ -12,8 +12,7 @@ Check out my blog post Rust](http://blog.burntsushi.net/transducers/) for extensive background, examples and experiments. 
-[![Linux build status](https://travis-ci.org/BurntSushi/fst.svg?branch=master)](https://travis-ci.org/BurntSushi/fst) -[![Windows build status](https://ci.appveyor.com/api/projects/status/github/BurntSushi/fst?svg=true)](https://ci.appveyor.com/project/BurntSushi/fst) +[![Build status](https://github.com/BurntSushi/fst/workflows/ci/badge.svg)](https://github.com/BurntSushi/fst/actions) [![](http://meritbadge.herokuapp.com/fst)](https://crates.io/crates/fst) Dual-licensed under MIT or the [UNLICENSE](http://unlicense.org). @@ -21,7 +20,7 @@ Dual-licensed under MIT or the [UNLICENSE](http://unlicense.org). ### Documentation -[Full API documentation and examples.](http://burntsushi.net/rustdoc/fst/) +https://docs.rs/fst The [`fst-regex`](https://docs.rs/fst-regex) @@ -40,12 +39,6 @@ Simply add a corresponding entry to your `Cargo.toml` dependency list: fst = "0.3" ``` -And add this to your crate root: - -```rust,ignore -extern crate fst; -``` - ### Example diff --git a/appveyor.yml b/appveyor.yml deleted file mode 100644 index de0ee9f..0000000 --- a/appveyor.yml +++ /dev/null @@ -1,22 +0,0 @@ -environment: - matrix: - - TARGET: x86_64-pc-windows-gnu - BITS: 64 - MSYS2: 1 - - TARGET: i686-pc-windows-gnu - BITS: 32 - MSYS2: 1 -install: - - curl -sSf -o rustup-init.exe https://win.rustup.rs/ - - rustup-init.exe -y --default-host %TARGET% - - set PATH=%PATH%;C:\Users\appveyor\.cargo\bin - - if defined MSYS2 set PATH=C:\msys64\mingw%BITS%\bin;%PATH% - - rustc -V - - cargo -V -build: false -test_script: - - cargo build --verbose --all - - cargo test --verbose --all -branches: - only: - - master diff --git a/ci/script.sh b/ci/script.sh deleted file mode 100755 index 8eb68e8..0000000 --- a/ci/script.sh +++ /dev/null @@ -1,22 +0,0 @@ -#!/bin/sh - -set -ex - -cargo doc --verbose -cargo build --verbose -cargo build --verbose --manifest-path fst-bin/Cargo.toml - -# If we're testing on an older version of Rust, then only check that we -# can build the crate. 
This is because the dev dependencies might be updated -# more frequently, and therefore might require a newer version of Rust. -# -# This isn't ideal. It's a compromise. -if [ "$TRAVIS_RUST_VERSION" = "1.20.0" ]; then - exit -fi - -cargo test --verbose -cargo test --verbose --lib --no-default-features -if [ "$TRAVIS_RUST_VERSION" = "nightly" ]; then - cargo bench --verbose --no-run -fi From 743633aff9723337336204c6da3a65e6f48c754e Mon Sep 17 00:00:00 2001 From: Andrew Gallant Date: Fri, 21 Feb 2020 18:21:28 -0500 Subject: [PATCH 04/23] edition: switch to Rust 2018 --- Cargo.toml | 1 + benches/build.rs | 2 +- benches/search.rs | 8 ++++---- fst-bin/Cargo.toml | 1 + fst-bin/src/cmd/csv.rs | 4 ++-- fst-bin/src/cmd/dot.rs | 4 ++-- fst-bin/src/cmd/dupes.rs | 4 ++-- fst-bin/src/cmd/fuzzy.rs | 4 ++-- fst-bin/src/cmd/grep.rs | 4 ++-- fst-bin/src/cmd/map.rs | 6 +++--- fst-bin/src/cmd/node.rs | 4 ++-- fst-bin/src/cmd/range.rs | 4 ++-- fst-bin/src/cmd/rust.rs | 4 ++-- fst-bin/src/cmd/set.rs | 6 +++--- fst-bin/src/cmd/union.rs | 4 ++-- fst-bin/src/main.rs | 2 +- fst-bin/src/merge.rs | 6 +++--- fst-bin/src/util.rs | 14 ++++++------- fst-levenshtein/Cargo.toml | 1 + fst-levenshtein/src/error.rs | 4 ++-- fst-levenshtein/src/lib.rs | 8 ++++---- fst-regex/Cargo.toml | 1 + fst-regex/src/compile.rs | 2 +- fst-regex/src/dfa.rs | 12 +++++------ fst-regex/src/error.rs | 4 ++-- fst-regex/src/lib.rs | 12 +++++------ src/error.rs | 6 +++--- src/lib.rs | 21 +++++++++---------- src/map.rs | 24 +++++++++++----------- src/raw/build.rs | 12 +++++------ src/raw/error.rs | 8 ++++---- src/raw/mod.rs | 20 +++++++++--------- src/raw/node.rs | 40 ++++++++++++++++++------------------ src/raw/ops.rs | 12 +++++------ src/raw/registry.rs | 8 ++++---- src/raw/registry_minimal.rs | 4 ++-- src/raw/tests.rs | 8 ++++---- src/set.rs | 20 +++++++++--------- tests/test.rs | 6 +++--- 39 files changed, 159 insertions(+), 156 deletions(-) diff --git a/Cargo.toml b/Cargo.toml index 6c18970..7572402 100644 --- 
a/Cargo.toml +++ b/Cargo.toml @@ -12,6 +12,7 @@ repository = "https://github.com/BurntSushi/fst" readme = "README.md" keywords = ["search", "information", "retrieval", "dictionary", "map"] license = "Unlicense/MIT" +edition = "2018" [features] mmap = ["memmap"] diff --git a/benches/build.rs b/benches/build.rs index d006589..6d871c5 100644 --- a/benches/build.rs +++ b/benches/build.rs @@ -1,6 +1,6 @@ #![feature(test)] -extern crate fst; + extern crate test; use std::collections::{BTreeMap, BTreeSet, HashMap, HashSet}; diff --git a/benches/search.rs b/benches/search.rs index 5e4a315..4960f3e 100644 --- a/benches/search.rs +++ b/benches/search.rs @@ -1,7 +1,7 @@ #![feature(test)] -extern crate fnv; -extern crate fst; + + #[macro_use] extern crate lazy_static; extern crate test; @@ -90,5 +90,5 @@ macro_rules! search { }; } -search!(words, ::WORDS); -search!(wiki_urls, ::WIKI_URLS); +search!(words, crate::WORDS); +search!(wiki_urls, crate::WIKI_URLS); diff --git a/fst-bin/Cargo.toml b/fst-bin/Cargo.toml index 3bb0bf2..195b79e 100644 --- a/fst-bin/Cargo.toml +++ b/fst-bin/Cargo.toml @@ -13,6 +13,7 @@ homepage = "https://github.com/BurntSushi/fst" repository = "https://github.com/BurntSushi/fst" keywords = ["search", "information", "retrieval", "dictionary", "map"] license = "Unlicense/MIT" +edition = "2018" [[bin]] name = "fst" diff --git a/fst-bin/src/cmd/csv.rs b/fst-bin/src/cmd/csv.rs index 73f272b..0b8724b 100644 --- a/fst-bin/src/cmd/csv.rs +++ b/fst-bin/src/cmd/csv.rs @@ -3,8 +3,8 @@ use csv; use docopt::Docopt; use fst::raw as fst; -use util; -use Error; +use crate::util; +use crate::Error; const USAGE: &'static str = " Emit information in CSV format about the transducer. 
diff --git a/fst-bin/src/cmd/dot.rs b/fst-bin/src/cmd/dot.rs index 68fde89..46635e5 100644 --- a/fst-bin/src/cmd/dot.rs +++ b/fst-bin/src/cmd/dot.rs @@ -4,8 +4,8 @@ use bit_set::BitSet; use docopt::Docopt; use fst::raw as fst; -use util; -use Error; +use crate::util; +use crate::Error; const USAGE: &'static str = " Emit this transducer in the \"dot\" format. diff --git a/fst-bin/src/cmd/dupes.rs b/fst-bin/src/cmd/dupes.rs index db694c2..66516b0 100644 --- a/fst-bin/src/cmd/dupes.rs +++ b/fst-bin/src/cmd/dupes.rs @@ -5,8 +5,8 @@ use bit_set::BitSet; use docopt::Docopt; use fst::raw as fst; -use util; -use Error; +use crate::util; +use crate::Error; const USAGE: &'static str = " A simple way to show duplicate nodes. diff --git a/fst-bin/src/cmd/fuzzy.rs b/fst-bin/src/cmd/fuzzy.rs index 81d5d63..125e52a 100644 --- a/fst-bin/src/cmd/fuzzy.rs +++ b/fst-bin/src/cmd/fuzzy.rs @@ -4,8 +4,8 @@ use docopt::Docopt; use fst::raw::Fst; use fst_levenshtein::Levenshtein; -use util; -use Error; +use crate::util; +use crate::Error; const USAGE: &'static str = " Issues a fuzzy query against the given transducer. diff --git a/fst-bin/src/cmd/grep.rs b/fst-bin/src/cmd/grep.rs index 931f231..3e61f9f 100644 --- a/fst-bin/src/cmd/grep.rs +++ b/fst-bin/src/cmd/grep.rs @@ -4,8 +4,8 @@ use docopt::Docopt; use fst::raw::Fst; use fst_regex::Regex; -use util; -use Error; +use crate::util; +use crate::Error; const USAGE: &'static str = " Searches a transducer with a regular expression. diff --git a/fst-bin/src/cmd/map.rs b/fst-bin/src/cmd/map.rs index 4d86c2d..f933a80 100644 --- a/fst-bin/src/cmd/map.rs +++ b/fst-bin/src/cmd/map.rs @@ -6,9 +6,9 @@ use csv; use docopt::Docopt; use fst::MapBuilder; -use merge::Merger; -use util; -use Error; +use crate::merge::Merger; +use crate::util; +use crate::Error; const USAGE: &'static str = " Creates an ordered map backed by a finite state transducer. 
diff --git a/fst-bin/src/cmd/node.rs b/fst-bin/src/cmd/node.rs index 9773702..d24802c 100644 --- a/fst-bin/src/cmd/node.rs +++ b/fst-bin/src/cmd/node.rs @@ -1,8 +1,8 @@ use docopt::Docopt; use fst::raw::Fst; -use util; -use Error; +use crate::util; +use crate::Error; const USAGE: &'static str = " Shows a single node from the transducer. diff --git a/fst-bin/src/cmd/range.rs b/fst-bin/src/cmd/range.rs index 6cf832c..ba7cfb7 100644 --- a/fst-bin/src/cmd/range.rs +++ b/fst-bin/src/cmd/range.rs @@ -3,8 +3,8 @@ use std::io; use docopt::Docopt; use fst::raw::Fst; -use util; -use Error; +use crate::util; +use crate::Error; const USAGE: &'static str = " Issues a range query against the given transducer. diff --git a/fst-bin/src/cmd/rust.rs b/fst-bin/src/cmd/rust.rs index 1d2d927..c44d7d3 100644 --- a/fst-bin/src/cmd/rust.rs +++ b/fst-bin/src/cmd/rust.rs @@ -2,8 +2,8 @@ use std::io::{Read, Write}; use docopt::Docopt; -use util; -use Error; +use crate::util; +use crate::Error; const USAGE: &'static str = " Emit Rust source code for the given FST. diff --git a/fst-bin/src/cmd/set.rs b/fst-bin/src/cmd/set.rs index 355a2ce..684422f 100644 --- a/fst-bin/src/cmd/set.rs +++ b/fst-bin/src/cmd/set.rs @@ -5,9 +5,9 @@ use docopt::Docopt; use fst::SetBuilder; use lines::linereader::LineReader; -use merge::Merger; -use util; -use Error; +use crate::merge::Merger; +use crate::util; +use crate::Error; const USAGE: &'static str = " Creates an ordered set backed by a finite state transducer. diff --git a/fst-bin/src/cmd/union.rs b/fst-bin/src/cmd/union.rs index ec4137b..fb5860b 100644 --- a/fst-bin/src/cmd/union.rs +++ b/fst-bin/src/cmd/union.rs @@ -3,8 +3,8 @@ use std::fs; use docopt::Docopt; use fst; -use util; -use Error; +use crate::util; +use crate::Error; const USAGE: &'static str = " Unions all of the transducer inputs into a single transducer. 
diff --git a/fst-bin/src/main.rs b/fst-bin/src/main.rs index 6414e0f..4614097 100644 --- a/fst-bin/src/main.rs +++ b/fst-bin/src/main.rs @@ -36,7 +36,7 @@ mod cmd; mod merge; mod util; -pub type Error = Box; +pub type Error = Box; const USAGE: &'static str = " Usage: diff --git a/fst-bin/src/merge.rs b/fst-bin/src/merge.rs index edbf876..84cecbc 100644 --- a/fst-bin/src/merge.rs +++ b/fst-bin/src/merge.rs @@ -13,12 +13,12 @@ use fst::{self, Streamer, raw}; use num_cpus; use tempdir::TempDir; -use Error; +use crate::Error; pub struct Merger { it: I, output: PathBuf, - value_merger: Option u64 + Send + Sync + 'static>>, + value_merger: Option u64 + Send + Sync + 'static>>, fd_limit: u32, batch_size: u32, threads: u32, @@ -237,7 +237,7 @@ struct UnionBatch { gen: usize, index: usize, fsts: Vec, - value_merger: Option u64 + Send + Sync + 'static>>, + value_merger: Option u64 + Send + Sync + 'static>>, } impl Batchable for UnionBatch { diff --git a/fst-bin/src/util.rs b/fst-bin/src/util.rs index 31f8e83..7045c0d 100644 --- a/fst-bin/src/util.rs +++ b/fst-bin/src/util.rs @@ -7,7 +7,7 @@ use csv; use fst::{IntoStreamer, Streamer}; use fst::raw::Output; -use Error; +use crate::Error; pub fn escape_input(b: u8) -> String { String::from_utf8(ascii::escape_default(b).collect::>()).unwrap() @@ -15,19 +15,19 @@ pub fn escape_input(b: u8) -> String { pub fn get_buf_reader>( path: Option, -) -> io::Result>> { +) -> io::Result>> { Ok(io::BufReader::new(get_reader(path)?)) } pub fn get_buf_writer>( path: Option, -) -> io::Result>> { +) -> io::Result>> { Ok(io::BufWriter::new(get_writer(path)?)) } pub fn get_reader>( path: Option, -) -> io::Result> { +) -> io::Result> { Ok(match to_stdio(path) { None => Box::new(io::stdin()), Some(path) => Box::new(File::open(path)?), @@ -36,7 +36,7 @@ pub fn get_reader>( pub fn get_writer>( path: Option, -) -> io::Result> { +) -> io::Result> { Ok(match to_stdio(path) { None => Box::new(io::stdout()), Some(path) => Box::new(File::create(path)?), @@ 
-86,7 +86,7 @@ pub struct ConcatLines { cur: Option, } -type Lines = io::Lines>>; +type Lines = io::Lines>>; impl ConcatLines { pub fn new(mut inputs: Vec) -> ConcatLines { @@ -125,7 +125,7 @@ pub struct ConcatCsv { cur: Option, } -type Reader = Box; +type Reader = Box; type Rows = csv::DeserializeRecordsIntoIter; impl ConcatCsv { diff --git a/fst-levenshtein/Cargo.toml b/fst-levenshtein/Cargo.toml index 1cac095..f204a21 100644 --- a/fst-levenshtein/Cargo.toml +++ b/fst-levenshtein/Cargo.toml @@ -10,6 +10,7 @@ homepage = "https://github.com/BurntSushi/fst" repository = "https://github.com/BurntSushi/fst" keywords = ["search", "information", "retrieval", "dictionary", "map"] license = "Unlicense/MIT" +edition = "2018" [dependencies] fst = { path = "..", version = "0.3.1" } diff --git a/fst-levenshtein/src/error.rs b/fst-levenshtein/src/error.rs index eda24fb..24322bd 100644 --- a/fst-levenshtein/src/error.rs +++ b/fst-levenshtein/src/error.rs @@ -12,7 +12,7 @@ pub enum Error { } impl fmt::Display for Error { - fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result { + fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { use self::Error::*; match *self { TooManyStates(size_limit) => write!( @@ -33,7 +33,7 @@ impl error::Error for Error { } } - fn cause(&self) -> Option<&error::Error> { + fn cause(&self) -> Option<&dyn error::Error> { None } } diff --git a/fst-levenshtein/src/lib.rs b/fst-levenshtein/src/lib.rs index abbfc33..b8e9f84 100644 --- a/fst-levenshtein/src/lib.rs +++ b/fst-levenshtein/src/lib.rs @@ -1,5 +1,5 @@ -extern crate fst; -extern crate utf8_ranges; + + use std::cmp; use std::collections::hash_map::Entry; @@ -101,7 +101,7 @@ impl Levenshtein { } impl fmt::Debug for Levenshtein { - fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result { + fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { write!( f, "Levenshtein(query: {:?}, distance: {:?})", @@ -178,7 +178,7 @@ struct State { } impl fmt::Debug for State { - fn fmt(&self, f: &mut 
fmt::Formatter) -> fmt::Result { + fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { writeln!(f, "State {{")?; writeln!(f, " is_match: {:?}", self.is_match)?; for i in 0..256 { diff --git a/fst-regex/Cargo.toml b/fst-regex/Cargo.toml index 6e7b50a..33ce341 100644 --- a/fst-regex/Cargo.toml +++ b/fst-regex/Cargo.toml @@ -10,6 +10,7 @@ homepage = "https://github.com/BurntSushi/fst" repository = "https://github.com/BurntSushi/fst" keywords = ["search", "information", "retrieval", "dictionary", "map"] license = "Unlicense/MIT" +edition = "2018" [features] mmap = ["fst/mmap"] diff --git a/fst-regex/src/compile.rs b/fst-regex/src/compile.rs index 44c1c9e..9213668 100644 --- a/fst-regex/src/compile.rs +++ b/fst-regex/src/compile.rs @@ -1,7 +1,7 @@ use regex_syntax::{CharClass, ClassRange, Expr, Repeater}; use utf8_ranges::{Utf8Sequence, Utf8Sequences}; -use {Error, Inst}; +use crate::{Error, Inst}; pub struct Compiler { size_limit: usize, diff --git a/fst-regex/src/dfa.rs b/fst-regex/src/dfa.rs index 8f0fe3b..b9755ef 100644 --- a/fst-regex/src/dfa.rs +++ b/fst-regex/src/dfa.rs @@ -1,8 +1,8 @@ use std::collections::{HashMap, HashSet}; use std::fmt; -use sparse::SparseSet; -use {Error, Inst}; +use crate::sparse::SparseSet; +use crate::{Error, Inst}; const STATE_LIMIT: usize = 1_000; // currently at least 2MB >_< @@ -73,7 +73,7 @@ impl DfaBuilder { fn cached_state(&mut self, set: &SparseSet) -> Option { use std::collections::hash_map::Entry; - use Inst::*; + use crate::Inst::*; // There are probably many ways to optimize this routine. 
---AG @@ -117,7 +117,7 @@ impl Dfa { } fn add(&self, set: &mut SparseSet, ip: usize) { - use Inst::*; + use crate::Inst::*; if set.contains(ip) { return; @@ -134,7 +134,7 @@ impl Dfa { } fn run(&self, from: &SparseSet, to: &mut SparseSet, byte: u8) -> bool { - use Inst::*; + use crate::Inst::*; to.clear(); let mut is_match = false; for i in 0..from.len() { @@ -154,7 +154,7 @@ impl Dfa { } impl fmt::Debug for Dfa { - fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result { + fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { for (i, inst) in self.insts.iter().enumerate() { writeln!(f, "{:03} {:?}", i, inst)?; } diff --git a/fst-regex/src/error.rs b/fst-regex/src/error.rs index e2a1222..50b1031 100644 --- a/fst-regex/src/error.rs +++ b/fst-regex/src/error.rs @@ -47,7 +47,7 @@ impl From for Error { } impl fmt::Display for Error { - fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result { + fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { use self::Error::*; match *self { Syntax(ref err) => err.fmt(f), @@ -95,7 +95,7 @@ impl error::Error for Error { } } - fn cause(&self) -> Option<&error::Error> { + fn cause(&self) -> Option<&dyn error::Error> { use self::Error::*; match *self { Syntax(ref err) => Some(err), diff --git a/fst-regex/src/lib.rs b/fst-regex/src/lib.rs index 26d8b53..56b40b1 100644 --- a/fst-regex/src/lib.rs +++ b/fst-regex/src/lib.rs @@ -1,12 +1,12 @@ -extern crate fst; -extern crate regex_syntax; -extern crate utf8_ranges; + +use regex_syntax; + use std::fmt; use fst::Automaton; -pub use error::Error; +pub use crate::error::Error; mod compile; mod dfa; @@ -157,14 +157,14 @@ impl Automaton for Regex { } impl fmt::Debug for Regex { - fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result { + fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { writeln!(f, "Regex({:?})", self.original)?; self.dfa.fmt(f) } } impl fmt::Debug for Inst { - fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result { + fn fmt(&self, f: &mut fmt::Formatter<'_>) -> 
fmt::Result { use self::Inst::*; match *self { Match => write!(f, "Match"), diff --git a/src/error.rs b/src/error.rs index 3ba252b..93df638 100644 --- a/src/error.rs +++ b/src/error.rs @@ -2,7 +2,7 @@ use std::error; use std::fmt; use std::io; -use raw; +use crate::raw; /// A `Result` type alias for this crate's `Error` type. pub type Result = ::std::result::Result; @@ -32,7 +32,7 @@ impl From for Error { } impl fmt::Display for Error { - fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result { + fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { use self::Error::*; match *self { Fst(ref err) => err.fmt(f), @@ -50,7 +50,7 @@ impl error::Error for Error { } } - fn cause(&self) -> Option<&error::Error> { + fn cause(&self) -> Option<&dyn error::Error> { use self::Error::*; match *self { Fst(ref err) => Some(err), diff --git a/src/lib.rs b/src/lib.rs index a29475a..e0b93ba 100644 --- a/src/lib.rs +++ b/src/lib.rs @@ -305,23 +305,22 @@ will be returned if the automaton gets too big (tens of MB in heap usage). #![deny(missing_docs)] -extern crate byteorder; + #[cfg(test)] extern crate fst_levenshtein; #[cfg(test)] extern crate fst_regex; -#[cfg(feature = "mmap")] -extern crate memmap; + #[cfg(test)] extern crate quickcheck; #[cfg(test)] extern crate rand; -pub use automaton::Automaton; -pub use error::{Error, Result}; -pub use map::{Map, MapBuilder}; -pub use set::{Set, SetBuilder}; -pub use stream::{IntoStreamer, Streamer}; +pub use crate::automaton::Automaton; +pub use crate::error::{Error, Result}; +pub use crate::map::{Map, MapBuilder}; +pub use crate::set::{Set, SetBuilder}; +pub use crate::stream::{IntoStreamer, Streamer}; mod error; #[path = "automaton/mod.rs"] @@ -338,7 +337,7 @@ mod stream; /// This module defines a trait, `Automaton`, with several implementations /// including, but not limited to, union, intersection and complement. 
pub mod automaton { - pub use inner_automaton::*; + pub use crate::inner_automaton::*; } /// Map operations implemented by finite state transducers. @@ -357,7 +356,7 @@ pub mod automaton { /// option of specifying a merge strategy for a map's values. The rest of the /// types are streams for set operations. pub mod map { - pub use inner_map::*; + pub use crate::inner_map::*; } /// Set operations implemented by finite state transducers. @@ -380,5 +379,5 @@ pub mod map { /// of streams and executes set operations like `union` or `intersection` on /// them. The rest of the types are streams for set operations. pub mod set { - pub use inner_set::*; + pub use crate::inner_set::*; } diff --git a/src/map.rs b/src/map.rs index 8c3d386..c784151 100644 --- a/src/map.rs +++ b/src/map.rs @@ -4,11 +4,11 @@ use std::iter::{self, FromIterator}; #[cfg(feature = "mmap")] use std::path::Path; -use automaton::{AlwaysMatch, Automaton}; -use raw; -pub use raw::IndexedValue; -use stream::{IntoStreamer, Streamer}; -use Result; +use crate::automaton::{AlwaysMatch, Automaton}; +use crate::raw; +pub use crate::raw::IndexedValue; +use crate::stream::{IntoStreamer, Streamer}; +use crate::Result; /// Map is a lexicographically ordered map from byte strings to integers. 
/// @@ -193,7 +193,7 @@ impl Map { /// ]); /// ``` #[inline] - pub fn stream(&self) -> Stream { + pub fn stream(&self) -> Stream<'_> { Stream(self.0.stream()) } @@ -216,7 +216,7 @@ impl Map { /// assert_eq!(keys, vec![b"a", b"b", b"c"]); /// ``` #[inline] - pub fn keys(&self) -> Keys { + pub fn keys(&self) -> Keys<'_> { Keys(self.0.stream()) } @@ -240,7 +240,7 @@ impl Map { /// assert_eq!(values, vec![1, 2, 3]); /// ``` #[inline] - pub fn values(&self) -> Values { + pub fn values(&self) -> Values<'_> { Values(self.0.stream()) } @@ -276,7 +276,7 @@ impl Map { /// ]); /// ``` #[inline] - pub fn range(&self) -> StreamBuilder { + pub fn range(&self) -> StreamBuilder<'_> { StreamBuilder(self.0.range()) } @@ -323,7 +323,7 @@ impl Map { /// Ok(()) /// } /// ``` - pub fn search(&self, aut: A) -> StreamBuilder { + pub fn search(&self, aut: A) -> StreamBuilder<'_, A> { StreamBuilder(self.0.search(aut)) } @@ -383,7 +383,7 @@ impl Map { /// ]); /// ``` #[inline] - pub fn op(&self) -> OpBuilder { + pub fn op(&self) -> OpBuilder<'_> { OpBuilder::new().add(self) } @@ -402,7 +402,7 @@ impl Default for Map { } impl fmt::Debug for Map { - fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result { + fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { write!(f, "Map([")?; let mut stream = self.stream(); let mut first = true; diff --git a/src/raw/build.rs b/src/raw/build.rs index 2f59302..e4c3bbd 100644 --- a/src/raw/build.rs +++ b/src/raw/build.rs @@ -2,16 +2,16 @@ use std::io::{self, Write}; use byteorder::{LittleEndian, WriteBytesExt}; -use error::Result; -use raw::counting_writer::CountingWriter; -use raw::error::Error; -use raw::registry::{Registry, RegistryEntry}; -use raw::{ +use crate::error::Result; +use crate::raw::counting_writer::CountingWriter; +use crate::raw::error::Error; +use crate::raw::registry::{Registry, RegistryEntry}; +use crate::raw::{ CompiledAddr, FstType, Output, Transition, EMPTY_ADDRESS, NONE_ADDRESS, VERSION, }; // use raw::registry_minimal::{Registry, 
RegistryEntry}; -use stream::{IntoStreamer, Streamer}; +use crate::stream::{IntoStreamer, Streamer}; /// A builder for creating a finite state transducer. /// diff --git a/src/raw/error.rs b/src/raw/error.rs index 1871db3..34fb655 100644 --- a/src/raw/error.rs +++ b/src/raw/error.rs @@ -3,7 +3,7 @@ use std::fmt; use std::str; use std::string::FromUtf8Error; -use raw::FstType; +use crate::raw::FstType; /// An error that occurred while using a finite state transducer. pub enum Error { @@ -59,7 +59,7 @@ pub enum Error { } impl fmt::Display for Error { - fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result { + fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { use self::Error::*; match *self { FromUtf8(ref err) => err.fmt(f), @@ -104,7 +104,7 @@ Error opening FST: expected type {}, got type {}.", } impl fmt::Debug for Error { - fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result { + fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { fmt::Display::fmt(self, f) } } @@ -122,7 +122,7 @@ impl error::Error for Error { } } - fn cause(&self) -> Option<&error::Error> { + fn cause(&self) -> Option<&dyn error::Error> { match *self { Error::FromUtf8(ref err) => Some(err), _ => None, diff --git a/src/raw/mod.rs b/src/raw/mod.rs index 0487bfe..9604696 100644 --- a/src/raw/mod.rs +++ b/src/raw/mod.rs @@ -28,9 +28,9 @@ use std::sync::Arc; use byteorder::{LittleEndian, ReadBytesExt}; -use automaton::{AlwaysMatch, Automaton}; -use error::Result; -use stream::{IntoStreamer, Streamer}; +use crate::automaton::{AlwaysMatch, Automaton}; +use crate::error::Result; +use crate::stream::{IntoStreamer, Streamer}; pub use self::build::Builder; pub use self::error::Error; @@ -446,7 +446,7 @@ impl Fst { /// Return a lexicographically ordered stream of all key-value pairs in /// this fst. 
#[inline] - pub fn stream(&self) -> Stream { + pub fn stream(&self) -> Stream<'_> { StreamBuilder::new(self, AlwaysMatch).into_stream() } @@ -455,12 +455,12 @@ impl Fst { /// A range query returns a subset of key-value pairs in this fst in a /// range given in lexicographic order. #[inline] - pub fn range(&self) -> StreamBuilder { + pub fn range(&self) -> StreamBuilder<'_> { StreamBuilder::new(self, AlwaysMatch) } /// Executes an automaton on the keys of this map. - pub fn search(&self, aut: A) -> StreamBuilder { + pub fn search(&self, aut: A) -> StreamBuilder<'_, A> { StreamBuilder::new(self, aut) } @@ -489,7 +489,7 @@ impl Fst { /// symmetric difference on the keys of the fst. These set operations also /// allow one to specify how conflicting values are merged in the stream. #[inline] - pub fn op(&self) -> OpBuilder { + pub fn op(&self) -> OpBuilder<'_> { OpBuilder::new().add(self) } @@ -556,7 +556,7 @@ impl Fst { /// Returns the root node of this fst. #[inline(always)] - pub fn root(&self) -> Node { + pub fn root(&self) -> Node<'_> { self.node(self.root_addr) } @@ -564,7 +564,7 @@ impl Fst { /// /// Node addresses can be obtained by reading transitions on `Node` values. 
#[inline] - pub fn node(&self, addr: CompiledAddr) -> Node { + pub fn node(&self, addr: CompiledAddr) -> Node<'_> { node_new(self.version, addr, &self.data) } @@ -1043,7 +1043,7 @@ impl Default for Transition { } impl fmt::Debug for Transition { - fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result { + fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { if self.out.is_zero() { write!(f, "{} -> {}", self.inp as char, self.addr) } else { diff --git a/src/raw/node.rs b/src/raw/node.rs index 714796f..521740f 100644 --- a/src/raw/node.rs +++ b/src/raw/node.rs @@ -5,10 +5,10 @@ use std::ops::Range; use byteorder::WriteBytesExt; -use raw::build::BuilderNode; -use raw::common_inputs::{COMMON_INPUTS, COMMON_INPUTS_INV}; -use raw::pack::{pack_size, pack_uint, pack_uint_in, unpack_uint}; -use raw::{u64_to_usize, CompiledAddr, Output, Transition, EMPTY_ADDRESS}; +use crate::raw::build::BuilderNode; +use crate::raw::common_inputs::{COMMON_INPUTS, COMMON_INPUTS_INV}; +use crate::raw::pack::{pack_size, pack_uint, pack_uint_in, unpack_uint}; +use crate::raw::{u64_to_usize, CompiledAddr, Output, Transition, EMPTY_ADDRESS}; /// The threshold (in number of transitions) at which an index is created for /// a node's transitions. This speeds up lookup time at the expense of FST @@ -32,7 +32,7 @@ pub struct Node<'f> { } impl<'f> fmt::Debug for Node<'f> { - fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result { + fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { writeln!(f, "NODE@{}", self.start)?; writeln!(f, " end_addr: {}", self.end)?; writeln!(f, " size: {} bytes", self.as_slice().len())?; @@ -55,7 +55,7 @@ impl<'f> fmt::Debug for Node<'f> { /// This is a free function so that we can export it to parent modules, but /// not to consumers of this crate. 
#[inline(always)] -pub fn node_new(version: u64, addr: CompiledAddr, data: &[u8]) -> Node { +pub fn node_new(version: u64, addr: CompiledAddr, data: &[u8]) -> Node<'_> { use self::State::*; let state = State::new(data, addr); match state { @@ -357,7 +357,7 @@ impl StateOneTransNext { } #[inline(always)] - fn input(&self, node: &Node) -> u8 { + fn input(&self, node: &Node<'_>) -> u8 { if let Some(inp) = self.common_input() { inp } else { @@ -366,7 +366,7 @@ impl StateOneTransNext { } #[inline(always)] - fn trans_addr(&self, node: &Node) -> CompiledAddr { + fn trans_addr(&self, node: &Node<'_>) -> CompiledAddr { node.end as CompiledAddr - 1 } } @@ -433,7 +433,7 @@ impl StateOneTrans { } #[inline(always)] - fn input(&self, node: &Node) -> u8 { + fn input(&self, node: &Node<'_>) -> u8 { if let Some(inp) = self.common_input() { inp } else { @@ -442,7 +442,7 @@ impl StateOneTrans { } #[inline(always)] - fn output(&self, node: &Node) -> Output { + fn output(&self, node: &Node<'_>) -> Output { let osize = node.sizes.output_pack_size(); if osize == 0 { return Output::zero(); @@ -456,7 +456,7 @@ impl StateOneTrans { } #[inline(always)] - fn trans_addr(&self, node: &Node) -> CompiledAddr { + fn trans_addr(&self, node: &Node<'_>) -> CompiledAddr { let tsize = node.sizes.transition_pack_size(); let i = node.start - self.input_len() @@ -658,7 +658,7 @@ impl StateAnyTrans { } #[inline(always)] - fn trans_addr(&self, node: &Node, i: usize) -> CompiledAddr { + fn trans_addr(&self, node: &Node<'_>, i: usize) -> CompiledAddr { assert!(i < node.ntrans); let tsize = node.sizes.transition_pack_size(); let at = node.start @@ -672,7 +672,7 @@ impl StateAnyTrans { } #[inline(always)] - fn input(&self, node: &Node, i: usize) -> u8 { + fn input(&self, node: &Node<'_>, i: usize) -> u8 { let at = node.start - self.ntrans_len() - 1 // pack size @@ -683,7 +683,7 @@ impl StateAnyTrans { } #[inline(always)] - fn find_input(&self, node: &Node, b: u8) -> Option { + fn find_input(&self, node: 
&Node<'_>, b: u8) -> Option { if node.version >= 2 && node.ntrans > TRANS_INDEX_THRESHOLD { let start = node.start - self.ntrans_len() @@ -707,7 +707,7 @@ impl StateAnyTrans { } #[inline(always)] - fn output(&self, node: &Node, i: usize) -> Output { + fn output(&self, node: &Node<'_>, i: usize) -> Output { let osize = node.sizes.output_pack_size(); if osize == 0 { return Output::zero(); @@ -772,7 +772,7 @@ impl PackSizes { /// /// `'f` is the lifetime of the underlying fst and `'n` is the lifetime of /// the underlying `Node`. -pub struct Transitions<'f: 'n, 'n> { +pub struct Transitions<'f, 'n> { node: &'n Node<'f>, range: Range, } @@ -869,10 +869,10 @@ fn unpack_delta( mod tests { use quickcheck::{quickcheck, TestResult}; - use raw::build::BuilderNode; - use raw::node::{node_new, Node}; - use raw::{Builder, CompiledAddr, Fst, Output, Transition, VERSION}; - use stream::Streamer; + use crate::raw::build::BuilderNode; + use crate::raw::node::{node_new, Node}; + use crate::raw::{Builder, CompiledAddr, Fst, Output, Transition, VERSION}; + use crate::stream::Streamer; const NEVER_LAST: CompiledAddr = ::std::u64::MAX as CompiledAddr; diff --git a/src/raw/ops.rs b/src/raw/ops.rs index 604c8da..41ddfc6 100644 --- a/src/raw/ops.rs +++ b/src/raw/ops.rs @@ -2,12 +2,12 @@ use std::cmp; use std::collections::BinaryHeap; use std::iter::FromIterator; -use raw::Output; -use stream::{IntoStreamer, Streamer}; +use crate::raw::Output; +use crate::stream::{IntoStreamer, Streamer}; /// Permits stream operations to be hetergeneous with respect to streams. type BoxedStream<'f> = - Box Streamer<'a, Item = (&'a [u8], Output)> + 'f>; + Box Streamer<'a, Item = (&'a [u8], Output)> + 'f>; /// A value indexed by a stream. 
/// @@ -460,9 +460,9 @@ impl Ord for Slot { #[cfg(test)] mod tests { - use raw::tests::{fst_map, fst_set}; - use raw::Fst; - use stream::{IntoStreamer, Streamer}; + use crate::raw::tests::{fst_map, fst_set}; + use crate::raw::Fst; + use crate::stream::{IntoStreamer, Streamer}; use super::OpBuilder; diff --git a/src/raw/registry.rs b/src/raw/registry.rs index 5d9e938..88297a9 100644 --- a/src/raw/registry.rs +++ b/src/raw/registry.rs @@ -1,5 +1,5 @@ -use raw::build::BuilderNode; -use raw::{CompiledAddr, NONE_ADDRESS}; +use crate::raw::build::BuilderNode; +use crate::raw::{CompiledAddr, NONE_ADDRESS}; #[derive(Debug)] pub struct Registry { @@ -117,8 +117,8 @@ impl RegistryCell { #[cfg(test)] mod tests { use super::{Registry, RegistryCache, RegistryCell, RegistryEntry}; - use raw::build::BuilderNode; - use raw::{Output, Transition}; + use crate::raw::build::BuilderNode; + use crate::raw::{Output, Transition}; fn assert_rejected(entry: RegistryEntry) { match entry { diff --git a/src/raw/registry_minimal.rs b/src/raw/registry_minimal.rs index a92823d..663b012 100644 --- a/src/raw/registry_minimal.rs +++ b/src/raw/registry_minimal.rs @@ -13,8 +13,8 @@ use std::collections::hash_map::{Entry, HashMap}; -use raw::build::BuilderNode; -use raw::CompiledAddr; +use crate::raw::build::BuilderNode; +use crate::raw::CompiledAddr; #[derive(Debug)] pub struct Registry { diff --git a/src/raw/tests.rs b/src/raw/tests.rs index b1a185b..6ca7fc4 100644 --- a/src/raw/tests.rs +++ b/src/raw/tests.rs @@ -1,9 +1,9 @@ use std::sync::Arc; -use automaton::AlwaysMatch; -use error::Error; -use raw::{self, Bound, Builder, Fst, Output, Stream, VERSION}; -use stream::Streamer; +use crate::automaton::AlwaysMatch; +use crate::error::Error; +use crate::raw::{self, Bound, Builder, Fst, Output, Stream, VERSION}; +use crate::stream::Streamer; const TEXT: &'static str = include_str!("./../../data/words-100000"); diff --git a/src/set.rs b/src/set.rs index 6c0d894..babaa61 100644 --- a/src/set.rs +++ 
b/src/set.rs @@ -4,10 +4,10 @@ use std::iter::{self, FromIterator}; #[cfg(feature = "mmap")] use std::path::Path; -use automaton::{AlwaysMatch, Automaton}; -use raw; -use stream::{IntoStreamer, Streamer}; -use Result; +use crate::automaton::{AlwaysMatch, Automaton}; +use crate::raw; +use crate::stream::{IntoStreamer, Streamer}; +use crate::Result; /// Set is a lexicographically ordered set of byte strings. /// @@ -146,7 +146,7 @@ impl Set { /// assert_eq!(keys, vec![b"a", b"b", b"c"]); /// ``` #[inline] - pub fn stream(&self) -> Stream { + pub fn stream(&self) -> Stream<'_> { Stream(self.0.stream()) } @@ -176,7 +176,7 @@ impl Set { /// assert_eq!(keys, vec![b"b", b"c", b"d"]); /// ``` #[inline] - pub fn range(&self) -> StreamBuilder { + pub fn range(&self) -> StreamBuilder<'_> { StreamBuilder(self.0.range()) } @@ -221,7 +221,7 @@ impl Set { /// Ok(()) /// } /// ``` - pub fn search(&self, aut: A) -> StreamBuilder { + pub fn search(&self, aut: A) -> StreamBuilder<'_, A> { StreamBuilder(self.0.search(aut)) } @@ -260,7 +260,7 @@ impl Set { /// assert_eq!(keys, vec![b"a", b"b", b"c", b"y", b"z"]); /// ``` #[inline] - pub fn op(&self) -> OpBuilder { + pub fn op(&self) -> OpBuilder<'_> { OpBuilder::new().add(self) } @@ -360,7 +360,7 @@ impl Default for Set { } impl fmt::Debug for Set { - fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result { + fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { write!(f, "Set([")?; let mut stream = self.stream(); let mut first = true; @@ -922,7 +922,7 @@ impl<'a, S: Streamer<'a>> Streamer<'a> for StreamZeroOutput { #[cfg(test)] mod tests { use super::OpBuilder; - use Streamer; + use crate::Streamer; #[test] fn no_fsts() { diff --git a/tests/test.rs b/tests/test.rs index 821f895..7ffc2c0 100644 --- a/tests/test.rs +++ b/tests/test.rs @@ -1,6 +1,6 @@ -extern crate fst; -extern crate fst_levenshtein; -extern crate fst_regex; +use fst; + + use fst_levenshtein::Levenshtein; use fst_regex::Regex; From 
3e3d29609df9e2f12595ab4348c8ed588b0ab92b Mon Sep 17 00:00:00 2001 From: Andrew Gallant Date: Fri, 21 Feb 2020 18:33:54 -0500 Subject: [PATCH 05/23] polish: clean up all warnings and make things compile 'cargo fix' appears to have choked on fst-bin's use of imports. We patch it up here along with fixing all remaining warnings. --- fst-bin/Cargo.toml | 7 +++---- fst-bin/src/cmd/csv.rs | 20 ++++++++++-------- fst-bin/src/cmd/dot.rs | 42 +++++++++++++++++++++++--------------- fst-bin/src/cmd/dupes.rs | 14 ++++++------- fst-bin/src/cmd/fuzzy.rs | 5 +++-- fst-bin/src/cmd/grep.rs | 5 +++-- fst-bin/src/cmd/map.rs | 18 ++++++++-------- fst-bin/src/cmd/node.rs | 5 +++-- fst-bin/src/cmd/range.rs | 5 +++-- fst-bin/src/cmd/rust.rs | 12 +++++++---- fst-bin/src/cmd/set.rs | 24 +++++++++++----------- fst-bin/src/cmd/union.rs | 6 +++--- fst-bin/src/main.rs | 30 ++++++++------------------- fst-regex/src/error.rs | 15 +------------- src/error.rs | 10 +-------- src/raw/build.rs | 8 ++++---- src/raw/counting_writer.rs | 2 +- src/raw/error.rs | 14 +------------ src/raw/mod.rs | 24 +++++++++++----------- src/raw/node.rs | 28 ++++++++++++------------- src/raw/registry.rs | 6 +----- src/set.rs | 2 +- 22 files changed, 137 insertions(+), 165 deletions(-) diff --git a/fst-bin/Cargo.toml b/fst-bin/Cargo.toml index 195b79e..00cae31 100644 --- a/fst-bin/Cargo.toml +++ b/fst-bin/Cargo.toml @@ -22,15 +22,14 @@ path = "src/main.rs" [dependencies] bit-set = "0.4" chan = "0.1" -csv = "1.0.0-beta.3" -docopt = "0.8" +csv = "1.1.3" +docopt = "1.1" fst = { path = "..", version = "0.3" } fst-regex = { path = "../fst-regex", version = "0.2" } fst-levenshtein = { path = "../fst-levenshtein", version = "0.2" } lines = "0.0" num_cpus = "1.5" -serde = "1" -serde_derive = "1" +serde = { version = "1.0.104", features = ["derive"] } tempdir = "0.3" [profile.release] diff --git a/fst-bin/src/cmd/csv.rs b/fst-bin/src/cmd/csv.rs index 0b8724b..9a26fa2 100644 --- a/fst-bin/src/cmd/csv.rs +++ b/fst-bin/src/cmd/csv.rs 
@@ -1,7 +1,7 @@ use bit_set::BitSet; use csv; use docopt::Docopt; -use fst::raw as fst; +use serde::Deserialize; use crate::util; use crate::Error; @@ -30,13 +30,13 @@ struct Args { pub fn run(argv: Vec) -> Result<(), Error> { let args: Args = Docopt::new(USAGE) - .and_then(|d| d.argv(&argv).deserialize()) - .unwrap_or_else(|e| e.exit()); + .and_then(|d| d.argv(&argv).deserialize()) + .unwrap_or_else(|e| e.exit()); let wtr = util::get_writer(args.arg_output.as_ref())?; let mut wtr = csv::Writer::from_writer(wtr); - let fst = unsafe { fst::Fst::from_path(args.arg_input) }?; + let fst = unsafe { fst::raw::Fst::from_path(args.arg_input) }?; let mut set = BitSet::with_capacity(fst.len()); if args.cmd_edges { @@ -49,15 +49,17 @@ pub fn run(argv: Vec) -> Result<(), Error> { stack.push(t.addr); set.insert(t.addr); } - wtr.serialize(( - addr, t.addr, t.inp as char, t.out.value(), - ))?; + wtr.serialize((addr, t.addr, t.inp as char, t.out.value()))?; } } } else { wtr.serialize(( - "addr", "state", "size", - "transitions", "final", "final_output", + "addr", + "state", + "size", + "transitions", + "final", + "final_output", ))?; let mut stack = vec![fst.root().addr()]; set.insert(fst.root().addr()); diff --git a/fst-bin/src/cmd/dot.rs b/fst-bin/src/cmd/dot.rs index 46635e5..050d8aa 100644 --- a/fst-bin/src/cmd/dot.rs +++ b/fst-bin/src/cmd/dot.rs @@ -2,7 +2,7 @@ use std::io::Write; use bit_set::BitSet; use docopt::Docopt; -use fst::raw as fst; +use serde::Deserialize; use crate::util; use crate::Error; @@ -41,15 +41,12 @@ struct Args { impl Args { fn dot_state( &self, - node: &fst::Node, + node: &fst::raw::Node, i: usize, - addr: fst::CompiledAddr, + addr: fst::raw::CompiledAddr, ) -> String { - let label = if self.flag_state_names { - i.to_string() - } else { - "".to_owned() - }; + let label = + if self.flag_state_names { i.to_string() } else { "".to_owned() }; if node.is_final() { format!(" {} [label=\"{}\",peripheries=2];", addr, label) } else { @@ -60,21 +57,25 @@ impl 
Args { pub fn run(argv: Vec) -> Result<(), Error> { let args: Args = Docopt::new(USAGE) - .and_then(|d| d.argv(&argv).deserialize()) - .unwrap_or_else(|e| e.exit()); + .and_then(|d| d.argv(&argv).deserialize()) + .unwrap_or_else(|e| e.exit()); let mut wtr = util::get_buf_writer(args.arg_output.as_ref())?; - let fst = unsafe { fst::Fst::from_path(&args.arg_input) }?; + let fst = unsafe { fst::raw::Fst::from_path(&args.arg_input) }?; let mut set = BitSet::with_capacity(fst.len()); let mut stack = vec![fst.root().addr()]; - writeln!(wtr, r#" + writeln!( + wtr, + r#" digraph automaton {{ labelloc="l"; labeljust="l"; rankdir="LR"; -"#).unwrap(); +"# + ) + .unwrap(); let mut state_num = 0; while let Some(addr) = stack.pop() { if set.contains(addr) { @@ -86,11 +87,20 @@ digraph automaton {{ writeln!(wtr, "{}", args.dot_state(&node, state_num, addr)).unwrap(); for t in node.transitions() { stack.push(t.addr); - let out = if t.out.value() == 0 { "".to_owned() } else { + let out = if t.out.value() == 0 { + "".to_owned() + } else { format!("/{}", t.out.value().to_string()) }; - writeln!(wtr, " {} -> {} [label=\"{}{}\"];", - addr, t.addr, util::escape_input(t.inp), out).unwrap(); + writeln!( + wtr, + " {} -> {} [label=\"{}{}\"];", + addr, + t.addr, + util::escape_input(t.inp), + out + ) + .unwrap(); } state_num += 1; } diff --git a/fst-bin/src/cmd/dupes.rs b/fst-bin/src/cmd/dupes.rs index 66516b0..285d807 100644 --- a/fst-bin/src/cmd/dupes.rs +++ b/fst-bin/src/cmd/dupes.rs @@ -3,7 +3,7 @@ use std::io::Write; use bit_set::BitSet; use docopt::Docopt; -use fst::raw as fst; +use serde::Deserialize; use crate::util; use crate::Error; @@ -43,12 +43,12 @@ struct Args { #[derive(Clone, Debug, Hash, Eq, PartialEq)] struct FullNode { is_final: bool, - final_output: fst::Output, - trans: Vec, + final_output: fst::raw::Output, + trans: Vec, } impl FullNode { - fn from_node(node: &fst::Node) -> FullNode { + fn from_node(node: &fst::raw::Node) -> FullNode { FullNode { is_final: 
node.is_final(), final_output: node.final_output(), @@ -59,11 +59,11 @@ impl FullNode { pub fn run(argv: Vec) -> Result<(), Error> { let args: Args = Docopt::new(USAGE) - .and_then(|d| d.argv(&argv).deserialize()) - .unwrap_or_else(|e| e.exit()); + .and_then(|d| d.argv(&argv).deserialize()) + .unwrap_or_else(|e| e.exit()); let mut wtr = util::get_buf_writer(args.arg_output.as_ref())?; - let fst = unsafe { fst::Fst::from_path(args.arg_input) }?; + let fst = unsafe { fst::raw::Fst::from_path(args.arg_input) }?; let mut set = BitSet::with_capacity(fst.len()); let mut node_counts = HashMap::with_capacity(10_000); diff --git a/fst-bin/src/cmd/fuzzy.rs b/fst-bin/src/cmd/fuzzy.rs index 125e52a..0a7f143 100644 --- a/fst-bin/src/cmd/fuzzy.rs +++ b/fst-bin/src/cmd/fuzzy.rs @@ -3,6 +3,7 @@ use std::io; use docopt::Docopt; use fst::raw::Fst; use fst_levenshtein::Levenshtein; +use serde::Deserialize; use crate::util; use crate::Error; @@ -45,8 +46,8 @@ struct Args { pub fn run(argv: Vec) -> Result<(), Error> { let args: Args = Docopt::new(USAGE) - .and_then(|d| d.argv(&argv).deserialize()) - .unwrap_or_else(|e| e.exit()); + .and_then(|d| d.argv(&argv).deserialize()) + .unwrap_or_else(|e| e.exit()); let fst = unsafe { Fst::from_path(&args.arg_fst) }?; let lev = Levenshtein::new(&args.arg_query, args.flag_distance)?; let mut q = fst.search(&lev); diff --git a/fst-bin/src/cmd/grep.rs b/fst-bin/src/cmd/grep.rs index 3e61f9f..fbd0c76 100644 --- a/fst-bin/src/cmd/grep.rs +++ b/fst-bin/src/cmd/grep.rs @@ -3,6 +3,7 @@ use std::io; use docopt::Docopt; use fst::raw::Fst; use fst_regex::Regex; +use serde::Deserialize; use crate::util; use crate::Error; @@ -36,8 +37,8 @@ struct Args { pub fn run(argv: Vec) -> Result<(), Error> { let args: Args = Docopt::new(USAGE) - .and_then(|d| d.argv(&argv).deserialize()) - .unwrap_or_else(|e| e.exit()); + .and_then(|d| d.argv(&argv).deserialize()) + .unwrap_or_else(|e| e.exit()); let fst = unsafe { Fst::from_path(&args.arg_fst) }?; let lev = 
Regex::new(&args.arg_regex)?; let mut q = fst.search(&lev); diff --git a/fst-bin/src/cmd/map.rs b/fst-bin/src/cmd/map.rs index f933a80..f5c0a9b 100644 --- a/fst-bin/src/cmd/map.rs +++ b/fst-bin/src/cmd/map.rs @@ -5,6 +5,7 @@ use std::path::Path; use csv; use docopt::Docopt; use fst::MapBuilder; +use serde::Deserialize; use crate::merge::Merger; use crate::util; @@ -65,8 +66,8 @@ struct Args { pub fn run(argv: Vec) -> Result<(), Error> { let args: Args = Docopt::new(USAGE) - .and_then(|d| d.argv(&argv).deserialize()) - .unwrap_or_else(|e| e.exit()); + .and_then(|d| d.argv(&argv).deserialize()) + .unwrap_or_else(|e| e.exit()); if !args.flag_force && fs::metadata(&args.arg_output).is_ok() { fail!("Output file already exists: {}", args.arg_output); } @@ -82,9 +83,8 @@ fn run_sorted(args: &Args) -> Result<(), Error> { let mut map = MapBuilder::new(wtr)?; for input in &args.arg_input { let rdr = util::get_reader(Some(input))?; - let mut rdr = csv::ReaderBuilder::new() - .has_headers(false) - .from_reader(rdr); + let mut rdr = + csv::ReaderBuilder::new().has_headers(false).from_reader(rdr); for row in rdr.deserialize() { let (key, val): (String, u64) = row?; map.insert(key, val)?; @@ -95,9 +95,11 @@ fn run_sorted(args: &Args) -> Result<(), Error> { } fn run_unsorted(args: &Args) -> Result<(), Error> { - let inputs = - args.arg_input - .iter().map(|inp| Path::new(inp).to_path_buf()).collect(); + let inputs = args + .arg_input + .iter() + .map(|inp| Path::new(inp).to_path_buf()) + .collect(); let keys = util::ConcatCsv::new(inputs); let mut merger = Merger::new(keys, &args.arg_output); diff --git a/fst-bin/src/cmd/node.rs b/fst-bin/src/cmd/node.rs index d24802c..e6f5946 100644 --- a/fst-bin/src/cmd/node.rs +++ b/fst-bin/src/cmd/node.rs @@ -1,5 +1,6 @@ use docopt::Docopt; use fst::raw::Fst; +use serde::Deserialize; use crate::util; use crate::Error; @@ -30,8 +31,8 @@ struct Args { pub fn run(argv: Vec) -> Result<(), Error> { let args: Args = Docopt::new(USAGE) - 
.and_then(|d| d.argv(&argv).deserialize()) - .unwrap_or_else(|e| e.exit()); + .and_then(|d| d.argv(&argv).deserialize()) + .unwrap_or_else(|e| e.exit()); let mut wtr = util::get_buf_writer::<&str>(None)?; let fst = unsafe { Fst::from_path(&args.arg_fst) }?; let node = fst.node(args.arg_node_address); diff --git a/fst-bin/src/cmd/range.rs b/fst-bin/src/cmd/range.rs index ba7cfb7..361cc18 100644 --- a/fst-bin/src/cmd/range.rs +++ b/fst-bin/src/cmd/range.rs @@ -2,6 +2,7 @@ use std::io; use docopt::Docopt; use fst::raw::Fst; +use serde::Deserialize; use crate::util; use crate::Error; @@ -35,8 +36,8 @@ struct Args { pub fn run(argv: Vec) -> Result<(), Error> { let args: Args = Docopt::new(USAGE) - .and_then(|d| d.argv(&argv).deserialize()) - .unwrap_or_else(|e| e.exit()); + .and_then(|d| d.argv(&argv).deserialize()) + .unwrap_or_else(|e| e.exit()); let fst = unsafe { Fst::from_path(&args.arg_fst) }?; let mut q = fst.range(); if let Some(ref start) = args.flag_start { diff --git a/fst-bin/src/cmd/rust.rs b/fst-bin/src/cmd/rust.rs index c44d7d3..89994f5 100644 --- a/fst-bin/src/cmd/rust.rs +++ b/fst-bin/src/cmd/rust.rs @@ -1,6 +1,7 @@ use std::io::{Read, Write}; use docopt::Docopt; +use serde::Deserialize; use crate::util; use crate::Error; @@ -38,8 +39,8 @@ struct Args { pub fn run(argv: Vec) -> Result<(), Error> { let args: Args = Docopt::new(USAGE) - .and_then(|d| d.argv(&argv).deserialize()) - .unwrap_or_else(|e| e.exit()); + .and_then(|d| d.argv(&argv).deserialize()) + .unwrap_or_else(|e| e.exit()); let mut wtr = util::get_buf_writer::<&str>(None)?; let mut rdr = util::get_buf_reader(Some(&args.arg_fst))?; @@ -48,8 +49,11 @@ pub fn run(argv: Vec) -> Result<(), Error> { w!(wtr, "lazy_static! 
{{"); w!(wtr, " pub static ref {}: ::fst::raw::Fst = ", args.arg_name); - w!(wtr, " ::fst::raw::Fst::from_static_slice({}_BYTES).unwrap();", - args.arg_name); + w!( + wtr, + " ::fst::raw::Fst::from_static_slice({}_BYTES).unwrap();", + args.arg_name + ); w!(wtr, "}}\n"); w!(wtr, "const {}_BYTES: &'static [u8] = b\"\\", args.arg_name); diff --git a/fst-bin/src/cmd/set.rs b/fst-bin/src/cmd/set.rs index 684422f..03a34cb 100644 --- a/fst-bin/src/cmd/set.rs +++ b/fst-bin/src/cmd/set.rs @@ -4,6 +4,7 @@ use std::path::Path; use docopt::Docopt; use fst::SetBuilder; use lines::linereader::LineReader; +use serde::Deserialize; use crate::merge::Merger; use crate::util; @@ -59,8 +60,8 @@ struct Args { pub fn run(argv: Vec) -> Result<(), Error> { let args: Args = Docopt::new(USAGE) - .and_then(|d| d.argv(&argv).deserialize()) - .unwrap_or_else(|e| e.exit()); + .and_then(|d| d.argv(&argv).deserialize()) + .unwrap_or_else(|e| e.exit()); if !args.flag_force && fs::metadata(&args.arg_output).is_ok() { fail!("Output file already exists: {}", args.arg_output); } @@ -82,26 +83,25 @@ fn run_sorted(args: &Args) -> Result<(), Error> { if line.is_empty() { break; } - let off = if line.len() >= 2 && line[line.len()-2] == b'\r' { + let off = if line.len() >= 2 && line[line.len() - 2] == b'\r' { 2 } else { 1 }; - set.insert(&line[0..line.len()-off])?; + set.insert(&line[0..line.len() - off])?; } } set.finish().map_err(From::from) } fn run_unsorted(args: &Args) -> Result<(), Error> { - let inputs = - args.arg_input - .iter().map(|inp| Path::new(inp).to_path_buf()).collect(); - let keys = - util::ConcatLines::new(inputs) - .map(|result| { - result.map(|line| (line, 0)).map_err(From::from) - }); + let inputs = args + .arg_input + .iter() + .map(|inp| Path::new(inp).to_path_buf()) + .collect(); + let keys = util::ConcatLines::new(inputs) + .map(|result| result.map(|line| (line, 0)).map_err(From::from)); let mut merger = Merger::new(keys, &args.arg_output); merger = 
merger.fd_limit(args.flag_fd_limit); diff --git a/fst-bin/src/cmd/union.rs b/fst-bin/src/cmd/union.rs index fb5860b..2b7c25d 100644 --- a/fst-bin/src/cmd/union.rs +++ b/fst-bin/src/cmd/union.rs @@ -1,7 +1,7 @@ use std::fs; use docopt::Docopt; -use fst; +use serde::Deserialize; use crate::util; use crate::Error; @@ -30,8 +30,8 @@ struct Args { pub fn run(argv: Vec) -> Result<(), Error> { let args: Args = Docopt::new(USAGE) - .and_then(|d| d.argv(&argv).deserialize()) - .unwrap_or_else(|e| e.exit()); + .and_then(|d| d.argv(&argv).deserialize()) + .unwrap_or_else(|e| e.exit()); if !args.flag_force && fs::metadata(&args.arg_output).is_ok() { fail!("Output file already exists: {}", args.arg_output); } diff --git a/fst-bin/src/main.rs b/fst-bin/src/main.rs index 4614097..3a37100 100644 --- a/fst-bin/src/main.rs +++ b/fst-bin/src/main.rs @@ -1,25 +1,12 @@ #![allow(dead_code)] -extern crate bit_set; -extern crate chan; -extern crate csv; -extern crate docopt; -extern crate fst; -extern crate fst_levenshtein; -extern crate fst_regex; -extern crate lines; -extern crate num_cpus; -extern crate serde; -#[macro_use] -extern crate serde_derive; -extern crate tempdir; - use std::env; use std::error; -use std::process; use std::io::{self, Write}; +use std::process; use docopt::Docopt; +use serde::Deserialize; macro_rules! 
w { ($wtr:expr, $($tt:tt)*) => {{ @@ -105,10 +92,10 @@ impl Command { fn main() { let args: Args = Docopt::new(USAGE) - .and_then(|d| d.options_first(true) - .version(Some(version())) - .deserialize()) - .unwrap_or_else(|e| e.exit()); + .and_then(|d| { + d.options_first(true).version(Some(version())).deserialize() + }) + .unwrap_or_else(|e| e.exit()); let cmd = args.arg_command.expect("BUG: expected a command"); if let Err(err) = cmd.run() { writeln!(&mut io::stderr(), "{}", err).unwrap(); @@ -123,8 +110,9 @@ fn version() -> String { option_env!("CARGO_PKG_VERSION_PATCH"), ); match (maj, min, pat) { - (Some(maj), Some(min), Some(pat)) => - format!("{}.{}.{}", maj, min, pat), + (Some(maj), Some(min), Some(pat)) => { + format!("{}.{}.{}", maj, min, pat) + } _ => "N/A".to_owned(), } } diff --git a/fst-regex/src/error.rs b/fst-regex/src/error.rs index 50b1031..c757be8 100644 --- a/fst-regex/src/error.rs +++ b/fst-regex/src/error.rs @@ -82,20 +82,7 @@ impl fmt::Display for Error { } impl error::Error for Error { - fn description(&self) -> &str { - use self::Error::*; - match *self { - Syntax(ref err) => err.description(), - CompiledTooBig(_) => "compiled regex is too big", - TooManyStates(_) => "compiled regex has too many states", - NoLazy => "lazy repetition operators are not allowed", - NoWordBoundary => "word boundary operators are not allowed", - NoEmpty => "empty match operators are not allowed", - NoBytes => "byte literals are not allowed", - } - } - - fn cause(&self) -> Option<&dyn error::Error> { + fn source(&self) -> Option<&(dyn error::Error + 'static)> { use self::Error::*; match *self { Syntax(ref err) => Some(err), diff --git a/src/error.rs b/src/error.rs index 93df638..88660ee 100644 --- a/src/error.rs +++ b/src/error.rs @@ -42,15 +42,7 @@ impl fmt::Display for Error { } impl error::Error for Error { - fn description(&self) -> &str { - use self::Error::*; - match *self { - Fst(ref err) => err.description(), - Io(ref err) => err.description(), - } - } - - 
fn cause(&self) -> Option<&dyn error::Error> { + fn source(&self) -> Option<&(dyn error::Error + 'static)> { use self::Error::*; match *self { Fst(ref err) => Some(err), diff --git a/src/raw/build.rs b/src/raw/build.rs index e4c3bbd..421b94c 100644 --- a/src/raw/build.rs +++ b/src/raw/build.rs @@ -123,7 +123,7 @@ impl Builder { // Similarly for 8-15 for the fst type. wtr.write_u64::(ty)?; Ok(Builder { - wtr: wtr, + wtr, unfinished: UnfinishedNodes::new(), registry: Registry::new(10_000, 2), last: None, @@ -329,7 +329,7 @@ impl UnfinishedNodes { fn push_empty(&mut self, is_final: bool) { self.stack.push(BuilderNodeUnfinished { - node: BuilderNode { is_final: is_final, ..BuilderNode::default() }, + node: BuilderNode { is_final, ..BuilderNode::default() }, last: None, }); } @@ -368,7 +368,7 @@ impl UnfinishedNodes { } let last = self.stack.len().checked_sub(1).unwrap(); assert!(self.stack[last].last.is_none()); - self.stack[last].last = Some(LastTransition { inp: bs[0], out: out }); + self.stack[last].last = Some(LastTransition { inp: bs[0], out }); for &b in &bs[1..] { self.stack.push(BuilderNodeUnfinished { node: BuilderNode::default(), @@ -419,7 +419,7 @@ impl BuilderNodeUnfinished { self.node.trans.push(Transition { inp: trans.inp, out: trans.out, - addr: addr, + addr, }); } } diff --git a/src/raw/counting_writer.rs b/src/raw/counting_writer.rs index d893782..e0e336d 100644 --- a/src/raw/counting_writer.rs +++ b/src/raw/counting_writer.rs @@ -9,7 +9,7 @@ pub struct CountingWriter { impl CountingWriter { /// Wrap the given writer with a counter. pub fn new(wtr: W) -> CountingWriter { - CountingWriter { wtr: wtr, cnt: 0 } + CountingWriter { wtr, cnt: 0 } } /// Return the total number of bytes written to the underlying writer. 
diff --git a/src/raw/error.rs b/src/raw/error.rs index 34fb655..25a1a94 100644 --- a/src/raw/error.rs +++ b/src/raw/error.rs @@ -110,19 +110,7 @@ impl fmt::Debug for Error { } impl error::Error for Error { - fn description(&self) -> &str { - use self::Error::*; - match *self { - FromUtf8(ref err) => err.description(), - Version { .. } => "incompatible version found when opening FST", - Format => "unknown invalid format found when opening FST", - DuplicateKey { .. } => "duplicate key insertion", - OutOfOrder { .. } => "out-of-order key insertion", - WrongType { .. } => "incompatible type found when opening FST", - } - } - - fn cause(&self) -> Option<&dyn error::Error> { + fn source(&self) -> Option<&(dyn error::Error + 'static)> { match *self { Error::FromUtf8(ref err) => Some(err), _ => None, diff --git a/src/raw/mod.rs b/src/raw/mod.rs index 9604696..a4c4869 100644 --- a/src/raw/mod.rs +++ b/src/raw/mod.rs @@ -622,8 +622,8 @@ pub struct StreamBuilder<'f, A = AlwaysMatch> { impl<'f, A: Automaton> StreamBuilder<'f, A> { fn new(fst: &'f Fst, aut: A) -> Self { StreamBuilder { - fst: fst, - aut: aut, + fst, + aut, min: Bound::Unbounded, max: Bound::Unbounded, } @@ -724,8 +724,8 @@ struct StreamState<'f, S> { impl<'f, A: Automaton> Stream<'f, A> { fn new(fst: &'f Fst, aut: A, min: Bound, max: Bound) -> Self { let mut rdr = Stream { - fst: fst, - aut: aut, + fst, + aut, inp: Vec::with_capacity(16), empty_output: None, stack: vec![], @@ -777,9 +777,9 @@ impl<'f, A: Automaton> Stream<'f, A> { aut_state = self.aut.accept(&prev_state, b); self.inp.push(b); self.stack.push(StreamState { - node: node, + node, trans: i + 1, - out: out, + out, aut_state: prev_state, }); out = out.cat(t.out); @@ -792,13 +792,13 @@ impl<'f, A: Automaton> Stream<'f, A> { // first transition in this node that proceeds the current // input byte. 
self.stack.push(StreamState { - node: node, + node, trans: node .transitions() .position(|t| t.inp > b) .unwrap_or(node.len()), - out: out, - aut_state: aut_state, + out, + aut_state, }); return; } @@ -815,8 +815,8 @@ impl<'f, A: Automaton> Stream<'f, A> { self.stack.push(StreamState { node: self.fst.node(node.transition(trans - 1).addr), trans: 0, - out: out, - aut_state: aut_state, + out, + aut_state, }); } } @@ -916,7 +916,7 @@ impl<'f, 'a, A: Automaton> Streamer<'a> for Stream<'f, A> { self.stack.push(StreamState { node: next_node, trans: 0, - out: out, + out, aut_state: next_state, }); if self.end_at.exceeded_by(&self.inp) { diff --git a/src/raw/node.rs b/src/raw/node.rs index 521740f..a468fca 100644 --- a/src/raw/node.rs +++ b/src/raw/node.rs @@ -61,7 +61,7 @@ pub fn node_new(version: u64, addr: CompiledAddr, data: &[u8]) -> Node<'_> { match state { EmptyFinal => Node { data: &[], - version: version, + version, state: State::EmptyFinal, start: EMPTY_ADDRESS, end: EMPTY_ADDRESS, @@ -73,9 +73,9 @@ pub fn node_new(version: u64, addr: CompiledAddr, data: &[u8]) -> Node<'_> { OneTransNext(s) => { let data = &data[..addr + 1]; Node { - data: data, - version: version, - state: state, + data, + version, + state, start: addr, end: s.end_addr(data), is_final: false, @@ -88,14 +88,14 @@ pub fn node_new(version: u64, addr: CompiledAddr, data: &[u8]) -> Node<'_> { let data = &data[..addr + 1]; let sizes = s.sizes(data); Node { - data: data, - version: version, - state: state, + data, + version, + state, start: addr, end: s.end_addr(data, sizes), is_final: false, ntrans: 1, - sizes: sizes, + sizes, final_output: Output::zero(), } } @@ -104,14 +104,14 @@ pub fn node_new(version: u64, addr: CompiledAddr, data: &[u8]) -> Node<'_> { let sizes = s.sizes(data); let ntrans = s.ntrans(data); Node { - data: data, - version: version, - state: state, + data, + version, + state, start: addr, end: s.end_addr(version, data, sizes, ntrans), is_final: s.is_final_state(), - ntrans: ntrans, 
- sizes: sizes, + ntrans, + sizes, final_output: s.final_output(version, data, sizes, ntrans), } } @@ -925,7 +925,7 @@ mod tests { } fn trans(addr: CompiledAddr, inp: u8) -> Transition { - Transition { inp: inp, out: Output::zero(), addr: addr } + Transition { inp, out: Output::zero(), addr } } #[test] diff --git a/src/raw/registry.rs b/src/raw/registry.rs index 88297a9..da76070 100644 --- a/src/raw/registry.rs +++ b/src/raw/registry.rs @@ -30,11 +30,7 @@ impl Registry { pub fn new(table_size: usize, mru_size: usize) -> Registry { let empty_cell = RegistryCell::none(); let ncells = table_size.checked_mul(mru_size).unwrap(); - Registry { - table: vec![empty_cell; ncells], - table_size: table_size, - mru_size: mru_size, - } + Registry { table: vec![empty_cell; ncells], table_size, mru_size } } pub fn entry<'a>(&'a mut self, node: &BuilderNode) -> RegistryEntry<'a> { diff --git a/src/set.rs b/src/set.rs index babaa61..b04fded 100644 --- a/src/set.rs +++ b/src/set.rs @@ -933,7 +933,7 @@ mod tests { impl<'a> Iter<'a> { fn new(xs: Vec<&'a [u8]>) -> Iter<'a> { - Iter { i: 0, xs: xs } + Iter { i: 0, xs } } } From ad294c870ce280ea84346a3d07385b690185bd70 Mon Sep 17 00:00:00 2001 From: Andrew Gallant Date: Fri, 21 Feb 2020 18:44:11 -0500 Subject: [PATCH 06/23] levenshtein/regex: move levenshtein, remove regex This is where the Levenshtein automata originally lived. I originally moved it out of the crate in order to reduce dependencies. But we can instead just make it an optional feature. Instead of moving fst-regex back into the crate as well, we instead recommend using `regex-automata`. `regex-automata` is basically the productionized version of "compile a regex to a DFA," which is exactly what we want. 
--- Cargo.toml | 9 +- fst-bin/Cargo.toml | 5 +- fst-bin/src/cmd/fuzzy.rs | 2 +- fst-bin/src/cmd/grep.rs | 19 +- fst-levenshtein/Cargo.toml | 4 +- fst-regex/Cargo.toml | 4 +- src/automaton/levenshtein.rs | 337 +++++++++++++++++++++++++++++++++++ src/automaton/mod.rs | 5 + src/lib.rs | 155 +++++++--------- src/map.rs | 29 +-- src/set.rs | 23 +-- tests/test.rs | 83 +++++---- 12 files changed, 505 insertions(+), 170 deletions(-) create mode 100644 src/automaton/levenshtein.rs diff --git a/Cargo.toml b/Cargo.toml index 7572402..1a79e91 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -15,8 +15,9 @@ license = "Unlicense/MIT" edition = "2018" [features] -mmap = ["memmap"] default = ["mmap"] +mmap = ["memmap"] +levenshtein = ["utf8-ranges"] [[bench]] name = "build" @@ -33,11 +34,10 @@ bench = true [dependencies] byteorder = "1" memmap = { version = "0.6.0", optional = true } +utf8-ranges = { version = "1", optional = true } [dev-dependencies] fnv = "1.0.5" -fst-levenshtein = { version = "0.2", path = "fst-levenshtein" } -fst-regex = { version = "0.2", path = "fst-regex" } lazy_static = "0.2.8" quickcheck = { version = "0.7", default-features = false } rand = "0.5" @@ -48,3 +48,6 @@ debug = true [profile.bench] opt-level = 3 debug = true + +[package.metadata.docs.rs] +features = ["levenshtein"] diff --git a/fst-bin/Cargo.toml b/fst-bin/Cargo.toml index 00cae31..5987eef 100644 --- a/fst-bin/Cargo.toml +++ b/fst-bin/Cargo.toml @@ -24,11 +24,10 @@ bit-set = "0.4" chan = "0.1" csv = "1.1.3" docopt = "1.1" -fst = { path = "..", version = "0.3" } -fst-regex = { path = "../fst-regex", version = "0.2" } -fst-levenshtein = { path = "../fst-levenshtein", version = "0.2" } +fst = { path = "..", version = "0.3", features = ["levenshtein"] } lines = "0.0" num_cpus = "1.5" +regex-automata = { version = "*", path = "/home/andrew/rust/regex-automata", features = ["transducer"] } serde = { version = "1.0.104", features = ["derive"] } tempdir = "0.3" diff --git a/fst-bin/src/cmd/fuzzy.rs 
b/fst-bin/src/cmd/fuzzy.rs index 0a7f143..649c54c 100644 --- a/fst-bin/src/cmd/fuzzy.rs +++ b/fst-bin/src/cmd/fuzzy.rs @@ -1,8 +1,8 @@ use std::io; use docopt::Docopt; +use fst::automaton::Levenshtein; use fst::raw::Fst; -use fst_levenshtein::Levenshtein; use serde::Deserialize; use crate::util; diff --git a/fst-bin/src/cmd/grep.rs b/fst-bin/src/cmd/grep.rs index fbd0c76..c1b6ce7 100644 --- a/fst-bin/src/cmd/grep.rs +++ b/fst-bin/src/cmd/grep.rs @@ -2,7 +2,8 @@ use std::io; use docopt::Docopt; use fst::raw::Fst; -use fst_regex::Regex; +use regex_automata::dense; +// use regex_automata::sparse::SparseDFA; use serde::Deserialize; use crate::util; @@ -40,8 +41,20 @@ pub fn run(argv: Vec) -> Result<(), Error> { .and_then(|d| d.argv(&argv).deserialize()) .unwrap_or_else(|e| e.exit()); let fst = unsafe { Fst::from_path(&args.arg_fst) }?; - let lev = Regex::new(&args.arg_regex)?; - let mut q = fst.search(&lev); + let dense_dfa = dense::Builder::new() + .anchored(true) + .byte_classes(false) + .premultiply(false) + .build(&args.arg_regex)?; + let dfa = match dense_dfa { + dense::DenseDFA::Standard(dfa) => dfa, + _ => unreachable!(), + }; + // let dfa = match dense_dfa.to_sparse()? { + // SparseDFA::Standard(dfa) => dfa, + // _ => unreachable!(), + // }; + let mut q = fst.search(&dfa); if let Some(ref start) = args.flag_start { q = q.ge(start); } diff --git a/fst-levenshtein/Cargo.toml b/fst-levenshtein/Cargo.toml index f204a21..d2cb397 100644 --- a/fst-levenshtein/Cargo.toml +++ b/fst-levenshtein/Cargo.toml @@ -1,9 +1,9 @@ [package] name = "fst-levenshtein" -version = "0.2.1" #:version +version = "0.3.0" #:version authors = ["Andrew Gallant "] description = """ -Search finite state transducers with fuzzy queries using Levenshtein automata. +DEPRECATED. Use 'fst' crate with 'levenshtein' feature instead. 
""" documentation = "https://docs.rs/fst-levenshtein" homepage = "https://github.com/BurntSushi/fst" diff --git a/fst-regex/Cargo.toml b/fst-regex/Cargo.toml index 33ce341..f867718 100644 --- a/fst-regex/Cargo.toml +++ b/fst-regex/Cargo.toml @@ -1,9 +1,9 @@ [package] name = "fst-regex" -version = "0.2.2" #:version +version = "0.3.0" #:version authors = ["Andrew Gallant "] description = """ -Search finite state transducers with regular expression. +DEPRECATED. Use 'regex-automata' crate with 'transducer' feature instead. """ documentation = "https://docs.rs/fst-regex" homepage = "https://github.com/BurntSushi/fst" diff --git a/src/automaton/levenshtein.rs b/src/automaton/levenshtein.rs new file mode 100644 index 0000000..af15f02 --- /dev/null +++ b/src/automaton/levenshtein.rs @@ -0,0 +1,337 @@ +use std::cmp; +use std::collections::hash_map::Entry; +use std::collections::{HashMap, HashSet}; +use std::fmt; + +use utf8_ranges::{Utf8Range, Utf8Sequences}; + +use crate::automaton::Automaton; + +const STATE_LIMIT: usize = 10_000; // currently at least 20MB >_< + +/// An error that occurred while building a Levenshtein automaton. +/// +/// This error is only defined when the `levenshtein` crate feature is enabled. +#[derive(Debug)] +pub enum LevenshteinError { + /// If construction of the automaton reaches some hard-coded limit + /// on the number of states, then this error is returned. + /// + /// The number given is the limit that was exceeded. + TooManyStates(usize), +} + +impl fmt::Display for LevenshteinError { + fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { + match *self { + LevenshteinError::TooManyStates(size_limit) => write!( + f, + "Levenshtein automaton exceeds size limit of \ + {} states", + size_limit + ), + } + } +} + +impl std::error::Error for LevenshteinError {} + +/// A Unicode aware Levenshtein automaton for running efficient fuzzy queries. +/// +/// This is only defined when the `levenshtein` crate feature is enabled. 
+/// +/// A Levenshtein automata is one way to search any finite state transducer +/// for keys that *approximately* match a given query. A Levenshtein automaton +/// approximates this by returning all keys within a certain edit distance of +/// the query. The edit distance is defined by the number of insertions, +/// deletions and substitutions required to turn the query into the key. +/// Insertions, deletions and substitutions are based on +/// **Unicode characters** (where each character is a single Unicode scalar +/// value). +/// +/// # Example +/// +/// This example shows how to find all keys within an edit distance of `1` +/// from `foo`. +/// +/// ```rust +/// use fst::automaton::Levenshtein; +/// use fst::{IntoStreamer, Streamer, Set}; +/// +/// fn main() { +/// let keys = vec!["fa", "fo", "fob", "focus", "foo", "food", "foul"]; +/// let set = Set::from_iter(keys).unwrap(); +/// +/// let lev = Levenshtein::new("foo", 1).unwrap(); +/// let mut stream = set.search(&lev).into_stream(); +/// +/// let mut keys = vec![]; +/// while let Some(key) = stream.next() { +/// keys.push(key.to_vec()); +/// } +/// assert_eq!(keys, vec![ +/// "fo".as_bytes(), // 1 deletion +/// "fob".as_bytes(), // 1 substitution +/// "foo".as_bytes(), // 0 insertions/deletions/substitutions +/// "food".as_bytes(), // 1 insertion +/// ]); +/// } +/// ``` +/// +/// This example only uses ASCII characters, but it will work equally well +/// on Unicode characters. +/// +/// # Warning: experimental +/// +/// While executing this Levenshtein automaton against a finite state +/// transducer will be very fast, *constructing* an automaton may not be. +/// Namely, this implementation is a proof of concept. While I believe the +/// algorithmic complexity is not exponential, the implementation is not speedy +/// and it can use enormous amounts of memory (tens of MB before a hard-coded +/// limit will cause an error to be returned). 
+/// +/// This is important functionality, so one should count on this implementation +/// being vastly improved in the future. +pub struct Levenshtein { + prog: DynamicLevenshtein, + dfa: Dfa, +} + +impl Levenshtein { + /// Create a new Levenshtein query. + /// + /// The query finds all matching terms that are at most `distance` + /// edit operations from `query`. (An edit operation may be an insertion, + /// a deletion or a substitution.) + /// + /// If the underlying automaton becomes too big, then an error is returned. + /// + /// A `Levenshtein` value satisfies the `Automaton` trait, which means it + /// can be used with the `search` method of any finite state transducer. + #[inline] + pub fn new(query: &str, distance: u32) -> Result { + let lev = DynamicLevenshtein { + query: query.to_owned(), + dist: distance as usize, + }; + let dfa = DfaBuilder::new(lev.clone()).build()?; + Ok(Levenshtein { prog: lev, dfa }) + } +} + +impl fmt::Debug for Levenshtein { + fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { + write!( + f, + "Levenshtein(query: {:?}, distance: {:?})", + self.prog.query, self.prog.dist + ) + } +} + +#[derive(Clone)] +struct DynamicLevenshtein { + query: String, + dist: usize, +} + +impl DynamicLevenshtein { + fn start(&self) -> Vec { + (0..self.query.chars().count() + 1).collect() + } + + fn is_match(&self, state: &[usize]) -> bool { + state.last().map(|&n| n <= self.dist).unwrap_or(false) + } + + fn can_match(&self, state: &[usize]) -> bool { + state.iter().min().map(|&n| n <= self.dist).unwrap_or(false) + } + + fn accept(&self, state: &[usize], chr: Option) -> Vec { + let mut next = vec![state[0] + 1]; + for (i, c) in self.query.chars().enumerate() { + let cost = if Some(c) == chr { 0 } else { 1 }; + let v = cmp::min( + cmp::min(next[i] + 1, state[i + 1] + 1), + state[i] + cost, + ); + next.push(cmp::min(v, self.dist + 1)); + } + next + } +} + +impl Automaton for Levenshtein { + type State = Option; + + #[inline] + fn start(&self) -> 
Option { + Some(0) + } + + #[inline] + fn is_match(&self, state: &Option) -> bool { + state.map(|state| self.dfa.states[state].is_match).unwrap_or(false) + } + + #[inline] + fn can_match(&self, state: &Option) -> bool { + state.is_some() + } + + #[inline] + fn accept(&self, state: &Option, byte: u8) -> Option { + state.and_then(|state| self.dfa.states[state].next[byte as usize]) + } +} + +#[derive(Debug)] +struct Dfa { + states: Vec, +} + +struct State { + next: [Option; 256], + is_match: bool, +} + +impl fmt::Debug for State { + fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { + writeln!(f, "State {{")?; + writeln!(f, " is_match: {:?}", self.is_match)?; + for i in 0..256 { + if let Some(si) = self.next[i] { + writeln!(f, " {:?}: {:?}", i, si)?; + } + } + write!(f, "}}") + } +} + +struct DfaBuilder { + dfa: Dfa, + lev: DynamicLevenshtein, + cache: HashMap, usize>, +} + +impl DfaBuilder { + fn new(lev: DynamicLevenshtein) -> Self { + DfaBuilder { + dfa: Dfa { states: Vec::with_capacity(16) }, + lev, + cache: HashMap::with_capacity(1024), + } + } + + fn build(mut self) -> Result { + let mut stack = vec![self.lev.start()]; + let mut seen = HashSet::new(); + let query = self.lev.query.clone(); // temp work around of borrowck + while let Some(lev_state) = stack.pop() { + let dfa_si = self.cached_state(&lev_state).unwrap(); + let mismatch = self.add_mismatch_utf8_states(dfa_si, &lev_state); + if let Some((next_si, lev_next)) = mismatch { + if !seen.contains(&next_si) { + seen.insert(next_si); + stack.push(lev_next); + } + } + for (i, c) in query.chars().enumerate() { + if lev_state[i] > self.lev.dist { + continue; + } + let lev_next = self.lev.accept(&lev_state, Some(c)); + let next_si = self.cached_state(&lev_next); + if let Some(next_si) = next_si { + self.add_utf8_sequences(true, dfa_si, next_si, c, c); + if !seen.contains(&next_si) { + seen.insert(next_si); + stack.push(lev_next); + } + } + } + if self.dfa.states.len() > STATE_LIMIT { + return 
Err(LevenshteinError::TooManyStates(STATE_LIMIT)); + } + } + Ok(self.dfa) + } + + fn cached_state(&mut self, lev_state: &[usize]) -> Option { + self.cached(lev_state).map(|(si, _)| si) + } + + fn cached(&mut self, lev_state: &[usize]) -> Option<(usize, bool)> { + if !self.lev.can_match(lev_state) { + return None; + } + Some(match self.cache.entry(lev_state.to_vec()) { + Entry::Occupied(v) => (*v.get(), true), + Entry::Vacant(v) => { + let is_match = self.lev.is_match(lev_state); + self.dfa.states.push(State { next: [None; 256], is_match }); + (*v.insert(self.dfa.states.len() - 1), false) + } + }) + } + + fn add_mismatch_utf8_states( + &mut self, + from_si: usize, + lev_state: &[usize], + ) -> Option<(usize, Vec)> { + let mismatch_state = self.lev.accept(lev_state, None); + let to_si = match self.cached(&mismatch_state) { + None => return None, + Some((si, _)) => si, + // Some((si, true)) => return Some((si, mismatch_state)), + // Some((si, false)) => si, + }; + self.add_utf8_sequences(false, from_si, to_si, '\u{0}', '\u{10FFFF}'); + return Some((to_si, mismatch_state)); + } + + fn add_utf8_sequences( + &mut self, + overwrite: bool, + from_si: usize, + to_si: usize, + from_chr: char, + to_chr: char, + ) { + for seq in Utf8Sequences::new(from_chr, to_chr) { + let mut fsi = from_si; + for range in &seq.as_slice()[0..seq.len() - 1] { + let tsi = self.new_state(false); + self.add_utf8_range(overwrite, fsi, tsi, range); + fsi = tsi; + } + self.add_utf8_range( + overwrite, + fsi, + to_si, + &seq.as_slice()[seq.len() - 1], + ); + } + } + + fn add_utf8_range( + &mut self, + overwrite: bool, + from: usize, + to: usize, + range: &Utf8Range, + ) { + for b in range.start as usize..range.end as usize + 1 { + if overwrite || self.dfa.states[from].next[b].is_none() { + self.dfa.states[from].next[b] = Some(to); + } + } + } + + fn new_state(&mut self, is_match: bool) -> usize { + self.dfa.states.push(State { next: [None; 256], is_match }); + self.dfa.states.len() - 1 + } +} diff 
--git a/src/automaton/mod.rs b/src/automaton/mod.rs index 72843eb..254c8ca 100644 --- a/src/automaton/mod.rs +++ b/src/automaton/mod.rs @@ -1,5 +1,10 @@ +#[cfg(feature = "levenshtein")] +pub use self::levenshtein::{Levenshtein, LevenshteinError}; use self::StartsWithStateInternal::*; +#[cfg(feature = "levenshtein")] +mod levenshtein; + /// Automaton describes types that behave as a finite automaton. /// /// All implementors of this trait are represented by *byte based* automata. diff --git a/src/lib.rs b/src/lib.rs index e0b93ba..c85f217 100644 --- a/src/lib.rs +++ b/src/lib.rs @@ -23,23 +23,8 @@ Simply add a corresponding entry to your `Cargo.toml` dependency list: fst = "0.2" ``` -And add this to your crate root: - -```ignore -extern crate fst; -``` - The examples in this documentation will show the rest. -# Other crates - -The -[`fst-regex`](https://docs.rs/fst-regex) -and -[`fst-levenshtein`](https://docs.rs/fst-levenshtein) -crates provide regular expression matching and fuzzy searching on FSTs, -respectively. - # Overview of types and modules This crate provides the high level abstractions---namely sets and maps---in the @@ -51,7 +36,10 @@ as range queries and streams. The `raw` module permits direct interaction with finite state transducers. Namely, the states and transitions of a transducer can be directly accessed with the `raw` module. - +*/ +#![cfg_attr( + feature = "levenshtein", + doc = r##" # Example: fuzzy query This example shows how to create a set of strings in memory, and then execute @@ -60,17 +48,17 @@ of `1` of `foo`. (Edit distance is the number of character insertions, deletions or substitutions required to get from one string to another. In this case, a character is a Unicode codepoint.) -```rust -extern crate fst; -extern crate fst_levenshtein; // the fst-levenshtein crate +This requires the `levenshtein` feature in this crate to be enabled. It is not +enabled by default. 
+```rust use std::error::Error; use fst::{IntoStreamer, Streamer, Set}; -use fst_levenshtein::Levenshtein; +use fst::automaton::Levenshtein; # fn main() { example().unwrap(); } -fn example() -> Result<(), Box> { +fn example() -> Result<(), Box> { // A convenient way to create sets in memory. let keys = vec!["fa", "fo", "fob", "focus", "foo", "food", "foul"]; let set = Set::from_iter(keys)?; @@ -87,6 +75,18 @@ fn example() -> Result<(), Box> { } ``` +**Warning**: Levenshtein automatons use a lot of memory + +The construction of Levenshtein automatons should be consider "proof of +concept" quality. Namely, they do just enough to be *correct*. But they haven't +had any effort put into them to be memory conscious. + +Note that an error will be returned if a Levenshtein automaton gets too big +(tens of MB in heap usage). + +"## +)] +/*! # Example: stream a map to a file This shows how to create a `MapBuilder` that will stream construction of the @@ -132,32 +132,38 @@ assert_eq!(kvs, vec![ # Example: case insensitive search -We can perform case insensitive search on a set using a regular expression. -Note that while sets can store arbitrary byte strings, a regular expression -will only match valid UTF-8 encoded byte strings. - -```rust -extern crate fst; -extern crate fst_regex; // the fst-regex crate - -use std::error::Error; +We can perform case insensitive search on a set using a regular expression. 
We +can use the [`regex-automata`](https://docs.rs/regex-automata) crate to compile +a regular expression into an automaton: -use fst::{IntoStreamer, Streamer, Set}; -use fst_regex::Regex; +```ignore +use fst::{IntoStreamer, Set}; +use regex_automata::dense; // regex-automata crate with 'transducer' feature -# fn main() { example().unwrap(); } -fn example() -> Result<(), Box> { +fn main() -> Result<(), Box> { let set = Set::from_iter(&["FoO", "Foo", "fOO", "foo"])?; + let pattern = r"(?i)foo"; + // Setting 'anchored' is important, otherwise the regex can match anywhere + // in the key. This would cause the regex to iterate over every key in the + // FST set. + let dfa = dense::Builder::new().anchored(true).build(pattern).unwrap(); - let re = Regex::new("(?i)foo")?; - let mut stream = set.search(&re).into_stream(); - - let keys = stream.into_strs()?; + let keys = set.search(&dfa).into_stream().into_strs()?; assert_eq!(keys, vec!["FoO", "Foo", "fOO", "foo"]); + println!("{:?}", keys); Ok(()) } ``` +Note that for this to work, the `regex-automata` crate must be compiled with +the `transducer` feature enabled: + +```toml +[dependencies] +fst = "0.3" +regex-automata = { version = "0.1.9", features = ["transducer"] } +``` + # Example: searching multiple sets efficiently Since queries can search a transducer without reading the entire data structure @@ -167,55 +173,52 @@ This crate provides efficient set/map operations that allow one to combine multiple streams of search results. Each operation only uses memory proportional to the number of streams. -The example below shows how to find all keys that have at least one capital -letter that doesn't appear at the beginning of the key. The example below uses -sets, but the same operations are available on maps too. +The example below shows how to find all keys that start with `B` or `G`. The +example below uses sets, but the same operations are available on maps too. 
```rust -extern crate fst; -extern crate fst_regex; // the fst-regex crate - use std::error::Error; -use fst::{Streamer, Set}; +use fst::automaton::{Automaton, Str}; use fst::set; -use fst_regex::Regex; +use fst::{IntoStreamer, Set, Streamer}; # fn main() { example().unwrap(); } -fn example() -> Result<(), Box> { +fn example() -> Result<(), Box> { let set1 = Set::from_iter(&["AC/DC", "Aerosmith"])?; let set2 = Set::from_iter(&["Bob Seger", "Bruce Springsteen"])?; let set3 = Set::from_iter(&["George Thorogood", "Golden Earring"])?; let set4 = Set::from_iter(&["Kansas"])?; let set5 = Set::from_iter(&["Metallica"])?; - // Create the regular expression. We can reuse it to search all of the sets. - let re = Regex::new(r".+\p{Lu}.*")?; + // Create the matcher. We can reuse it to search all of the sets. + let matcher = Str::new("B") + .starts_with() + .union(Str::new("G").starts_with()); - // Build a set operation. All we need to do is add a search result stream for - // each set and ask for the union. (Other operations, like intersection and - // difference are also available.) + // Build a set operation. All we need to do is add a search result stream + // for each set and ask for the union. (Other operations, like intersection + // and difference are also available.) let mut stream = set::OpBuilder::new() - .add(set1.search(&re)) - .add(set2.search(&re)) - .add(set3.search(&re)) - .add(set4.search(&re)) - .add(set5.search(&re)) + .add(set1.search(&matcher)) + .add(set2.search(&matcher)) + .add(set3.search(&matcher)) + .add(set4.search(&matcher)) + .add(set5.search(&matcher)) .union(); - // Now collect all of the keys. Alternatively, you could build another set here - // using `SetBuilder::extend_stream`. + // Now collect all of the keys. Alternatively, you could build another set + // here using `SetBuilder::extend_stream`. 
let mut keys = vec![]; while let Some(key) = stream.next() { - keys.push(key.to_vec()); + keys.push(String::from_utf8(key.to_vec())?); } assert_eq!(keys, vec![ - "AC/DC".as_bytes(), - "Bob Seger".as_bytes(), - "Bruce Springsteen".as_bytes(), - "George Thorogood".as_bytes(), - "Golden Earring".as_bytes(), + "Bob Seger", + "Bruce Springsteen", + "George Thorogood", + "Golden Earring", ]); Ok(()) } @@ -290,32 +293,10 @@ data structures found in the standard library, such as `BTreeSet` and done, they can be merged together into one big set/map if desired. A somewhat simplistic example of this procedure can be seen in `fst-bin/src/merge.rs` from the root of this crate's repository. - -# Warning: regexes and Levenshtein automatons use a lot of memory - -The construction of automatons for both regular expressions and Levenshtein -automatons should be consider "proof of concept" quality. Namely, they do just -enough to be *correct*. But they haven't had any effort put into them to be -memory conscious. These are important parts of this library, so they will be -improved. - -Note that whether you're using regexes or Levenshtein automatons, an error -will be returned if the automaton gets too big (tens of MB in heap usage). */ #![deny(missing_docs)] - -#[cfg(test)] -extern crate fst_levenshtein; -#[cfg(test)] -extern crate fst_regex; - -#[cfg(test)] -extern crate quickcheck; -#[cfg(test)] -extern crate rand; - pub use crate::automaton::Automaton; pub use crate::error::{Error, Result}; pub use crate::map::{Map, MapBuilder}; diff --git a/src/map.rs b/src/map.rs index c784151..6bada89 100644 --- a/src/map.rs +++ b/src/map.rs @@ -292,32 +292,35 @@ impl Map { /// An implementation of regular expressions for `Automaton` is available /// in the `fst-regex` crate, which can be used to search maps. 
/// - /// ```rust - /// extern crate fst; - /// extern crate fst_regex; + /// # Example /// - /// use std::error::Error; + /// An implementation of subsequence search for `Automaton` can be used + /// to search sets: /// + /// ```rust + /// use fst::automaton::Subsequence; /// use fst::{IntoStreamer, Streamer, Map}; - /// use fst_regex::Regex; /// /// # fn main() { example().unwrap(); } - /// fn example() -> Result<(), Box> { + /// fn example() -> Result<(), Box> { /// let map = Map::from_iter(vec![ - /// ("foo", 1), ("foo1", 2), ("foo2", 3), ("foo3", 4), ("foobar", 5), + /// ("a foo bar", 1), + /// ("foo", 2), + /// ("foo1", 3), + /// ("foo2", 4), + /// ("foo3", 5), + /// ("foobar", 6), /// ]).unwrap(); /// - /// let re = Regex::new("f[a-z]+3?").unwrap(); - /// let mut stream = map.search(&re).into_stream(); + /// let matcher = Subsequence::new("for"); + /// let mut stream = map.search(&matcher).into_stream(); /// /// let mut kvs = vec![]; /// while let Some((k, v)) = stream.next() { - /// kvs.push((k.to_vec(), v)); + /// kvs.push((String::from_utf8(k.to_vec())?, v)); /// } /// assert_eq!(kvs, vec![ - /// (b"foo".to_vec(), 1), - /// (b"foo3".to_vec(), 4), - /// (b"foobar".to_vec(), 5), + /// ("a foo bar".to_string(), 1), ("foobar".to_string(), 6), /// ]); /// /// Ok(()) diff --git a/src/set.rs b/src/set.rs index b04fded..d50114c 100644 --- a/src/set.rs +++ b/src/set.rs @@ -189,33 +189,28 @@ impl Set { /// /// # Example /// - /// An implementation of regular expressions for `Automaton` is available - /// in the `fst-regex` crate, which can be used to search sets. 
+ /// An implementation of subsequence search for `Automaton` can be used + /// to search sets: /// /// ```rust - /// extern crate fst; - /// extern crate fst_regex; - /// - /// use std::error::Error; - /// + /// use fst::automaton::Subsequence; /// use fst::{IntoStreamer, Streamer, Set}; - /// use fst_regex::Regex; /// /// # fn main() { example().unwrap(); } - /// fn example() -> Result<(), Box> { + /// fn example() -> Result<(), Box> { /// let set = Set::from_iter(&[ - /// "foo", "foo1", "foo2", "foo3", "foobar", + /// "a foo bar", "foo", "foo1", "foo2", "foo3", "foobar", /// ]).unwrap(); /// - /// let re = Regex::new("f[a-z]+3?").unwrap(); - /// let mut stream = set.search(&re).into_stream(); + /// let matcher = Subsequence::new("for"); + /// let mut stream = set.search(&matcher).into_stream(); /// /// let mut keys = vec![]; /// while let Some(key) = stream.next() { - /// keys.push(key.to_vec()); + /// keys.push(String::from_utf8(key.to_vec())?); /// } /// assert_eq!(keys, vec![ - /// "foo".as_bytes(), "foo3".as_bytes(), "foobar".as_bytes(), + /// "a foo bar", "foobar", /// ]); /// /// Ok(()) diff --git a/tests/test.rs b/tests/test.rs index 7ffc2c0..4ae8e39 100644 --- a/tests/test.rs +++ b/tests/test.rs @@ -1,14 +1,10 @@ -use fst; - - - -use fst_levenshtein::Levenshtein; -use fst_regex::Regex; - +#[cfg(feature = "levenshtein")] +use fst::automaton::Levenshtein; use fst::automaton::{Str, Subsequence}; -use fst::raw::{Builder, Fst, Output}; -use fst::set::{OpBuilder, Set}; -use fst::{Automaton, IntoStreamer, Streamer}; +#[cfg(feature = "levenshtein")] +use fst::raw::{Builder, Fst}; +use fst::set::Set; +use fst::{self, Automaton, IntoStreamer, Streamer}; static WORDS: &'static str = include_str!("../data/words-10000"); @@ -16,6 +12,7 @@ fn get_set() -> Set { Set::from_iter(WORDS.lines()).unwrap() } +#[cfg(feature = "levenshtein")] fn fst_set(ss: I) -> Fst where I: IntoIterator, @@ -34,15 +31,7 @@ where fst } -#[test] -fn regex_simple() { - let set = 
fst_set(vec!["abc", "abd", "ayz", "za"]); - let re = Regex::new("a[a-z]*").unwrap(); - let mut rdr = set.search(&re).ge("abd").lt("ax").into_stream(); - assert_eq!(rdr.next(), Some(("abd".as_bytes(), Output::zero()))); - assert!(rdr.next().is_none()); -} - +#[cfg(feature = "levenshtein")] #[test] fn levenshtein_simple() { let set = fst_set(vec!["woof", "wood", "banana"]); @@ -51,6 +40,7 @@ fn levenshtein_simple() { assert_eq!(vs, vec!["wood".as_bytes(), "woof".as_bytes()]); } +#[cfg(feature = "levenshtein")] #[test] fn levenshtein_unicode() { let set = fst_set(vec!["woof", "wood", "banana", "☃snowman☃"]); @@ -59,6 +49,7 @@ fn levenshtein_unicode() { assert_eq!(vs, vec!["☃snowman☃".as_bytes()]); } +#[cfg(feature = "levenshtein")] #[test] fn complement_small() { let keys = vec!["fa", "fo", "fob", "focus", "foo", "food", "foul"]; @@ -70,6 +61,7 @@ fn complement_small() { assert_eq!(keys, vec!["fa", "focus", "foul"]); } +#[cfg(feature = "levenshtein")] #[test] fn startswith_small() { let keys = vec![ @@ -89,39 +81,44 @@ fn startswith_small() { ); } +#[cfg(feature = "levenshtein")] #[test] fn intersection_small() { - let keys = vec!["fa", "fo", "fob", "focus", "foo", "food", "foul"]; + let keys = vec!["fab", "fo", "fob", "focus", "foo", "food", "foul", "goo"]; let set = Set::from_iter(keys).unwrap(); let lev = Levenshtein::new("foo", 1).unwrap(); - let reg = Regex::new("(..)*").unwrap(); - let stream = set.search(lev.intersection(reg)).into_stream(); + let prefix = Str::new("fo").starts_with(); + let stream = set.search(lev.intersection(prefix)).into_stream(); let keys = stream.into_strs().unwrap(); - assert_eq!(keys, vec!["fo", "food"]); + assert_eq!(keys, vec!["fo", "fob", "foo", "food"]); } +#[cfg(feature = "levenshtein")] #[test] fn union_small() { - let keys = vec!["fa", "fo", "fob", "focus", "foo", "food", "foul"]; + let keys = vec!["fab", "fob", "focus", "foo", "food", "goo"]; let set = Set::from_iter(keys).unwrap(); let lev = Levenshtein::new("foo", 1).unwrap(); 
- let reg = Regex::new("(..)*").unwrap(); - let stream = set.search(lev.union(reg)).into_stream(); + let prefix = Str::new("fo").starts_with(); + let stream = set.search(lev.union(prefix)).into_stream(); let keys = stream.into_strs().unwrap(); - assert_eq!(keys, vec!["fa", "fo", "fob", "foo", "food", "foul"]); + assert_eq!(keys, vec!["fob", "focus", "foo", "food", "goo"]); } +#[cfg(feature = "levenshtein")] #[test] fn intersection_large() { + use fst::set::OpBuilder; + let set = get_set(); let lev = Levenshtein::new("foo", 3).unwrap(); - let reg = Regex::new("(..)*").unwrap(); - let mut stream1 = set.search((&lev).intersection(®)).into_stream(); + let prefix = Str::new("fa").starts_with(); + let mut stream1 = set.search((&lev).intersection(&prefix)).into_stream(); let mut stream2 = OpBuilder::new() .add(set.search(&lev)) - .add(set.search(®)) + .add(set.search(&prefix)) .intersection(); while let Some(key1) = stream1.next() { assert_eq!(stream2.next(), Some(key1)); @@ -129,14 +126,19 @@ fn intersection_large() { assert_eq!(stream2.next(), None); } +#[cfg(feature = "levenshtein")] #[test] fn union_large() { + use fst::set::OpBuilder; + let set = get_set(); let lev = Levenshtein::new("foo", 3).unwrap(); - let reg = Regex::new("(..)*").unwrap(); - let mut stream1 = set.search((&lev).union(®)).into_stream(); - let mut stream2 = - OpBuilder::new().add(set.search(&lev)).add(set.search(®)).union(); + let prefix = Str::new("fa").starts_with(); + let mut stream1 = set.search((&lev).union(&prefix)).into_stream(); + let mut stream2 = OpBuilder::new() + .add(set.search(&lev)) + .add(set.search(&prefix)) + .union(); while let Some(key1) = stream1.next() { assert_eq!(stream2.next(), Some(key1)); } @@ -166,14 +168,11 @@ fn str() { #[test] fn subsequence() { let set = get_set(); - let subseq = Subsequence::new("aab"); - let regex = Regex::new(".*a.*a.*b.*").unwrap(); - let mut stream1 = set.search(&subseq).into_stream(); - let mut stream2 = set.search(®ex).into_stream(); - while 
let Some(key1) = stream1.next() { - assert_eq!(stream2.next(), Some(key1)); - } - assert_eq!(stream2.next(), None); + let subseq = Subsequence::new("nockbunsurrundd"); + + let mut stream = set.search(&subseq).into_stream(); + assert_eq!(stream.next().unwrap(), b"bannockburnsurrounded"); + assert_eq!(stream.next(), None); } #[test] From 1aa9516f49fd87ba52d22190dfdd652855cd207d Mon Sep 17 00:00:00 2001 From: Andrew Gallant Date: Sun, 23 Feb 2020 11:39:47 -0500 Subject: [PATCH 07/23] bench: switch to criterion for benchmarks --- Cargo.toml | 16 +--- bench/Cargo.toml | 23 ++++++ bench/src/bench.rs | 185 +++++++++++++++++++++++++++++++++++++++++++++ bench/src/lib.rs | 2 + benches/build.rs | 93 ----------------------- benches/search.rs | 94 ----------------------- fst-bin/Cargo.toml | 4 - 7 files changed, 213 insertions(+), 204 deletions(-) create mode 100644 bench/Cargo.toml create mode 100644 bench/src/bench.rs create mode 100644 bench/src/lib.rs delete mode 100644 benches/build.rs delete mode 100644 benches/search.rs diff --git a/Cargo.toml b/Cargo.toml index 1a79e91..a2a9a53 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -14,23 +14,14 @@ keywords = ["search", "information", "retrieval", "dictionary", "map"] license = "Unlicense/MIT" edition = "2018" +[workspace] +members = ["bench", "fst-bin"] + [features] default = ["mmap"] mmap = ["memmap"] levenshtein = ["utf8-ranges"] -[[bench]] -name = "build" -path = "./benches/build.rs" -test = false -bench = true - -[[bench]] -name = "search" -path = "./benches/search.rs" -test = false -bench = true - [dependencies] byteorder = "1" memmap = { version = "0.6.0", optional = true } @@ -46,7 +37,6 @@ rand = "0.5" debug = true [profile.bench] -opt-level = 3 debug = true [package.metadata.docs.rs] diff --git a/bench/Cargo.toml b/bench/Cargo.toml new file mode 100644 index 0000000..49d5ce5 --- /dev/null +++ b/bench/Cargo.toml @@ -0,0 +1,23 @@ +[package] +publish = false +name = "fst-bench" +version = "0.0.1" +authors = ["Andrew Gallant 
"] +description = "Criterion benchmark suite for fst." +homepage = "https://github.com/BurntSushi/fst" +repository = "https://github.com/BurntSushi/fst" +license = "BSD-3-Clause" +edition = "2018" + +[lib] +bench = false + +[[bench]] +name = "fst" +harness = false +path = "src/bench.rs" + +[dependencies] +criterion = "0.3.1" +fnv = "1.0.6" +fst = { version = "*", path = "..", features = ["levenshtein"] } diff --git a/bench/src/bench.rs b/bench/src/bench.rs new file mode 100644 index 0000000..abfe94b --- /dev/null +++ b/bench/src/bench.rs @@ -0,0 +1,185 @@ +#![allow(warnings)] + +use std::collections::{BTreeMap, BTreeSet, HashMap, HashSet}; +use std::iter::FromIterator; +use std::sync::Arc; +use std::time::Duration; + +use fnv::FnvHashSet; +use fst::raw::{Builder, Fst}; + +use criterion::{ + criterion_group, criterion_main, Bencher, Benchmark, Criterion, Throughput, +}; + +const WORDS: &str = include_str!("../../data/words-10000"); +const WORDS_BIG: &str = include_str!("../../data/words-100000"); +const WIKI_URLS: &str = include_str!("../../data/wiki-urls-100000"); + +fn all(c: &mut Criterion) { + build(c); + search(c); +} + +fn build(c: &mut Criterion) { + define_corpus(c, "build", "fst/set", WORDS.as_bytes(), move |b| { + let words = get_words(); + b.iter(|| { + let set = fst::Set::from_iter(&words).unwrap(); + assert_eq!(words.len(), set.len()); + }); + }); + + define_corpus(c, "build", "fst/map", WORDS.as_bytes(), move |b| { + let words = get_words_outputs(); + b.iter(|| { + let map = fst::Map::from_iter(words.iter().cloned()).unwrap(); + assert_eq!(words.len(), map.len()); + }); + }); + + define_corpus(c, "build", "hash/set", WORDS.as_bytes(), move |b| { + let words = get_words(); + b.iter(|| { + let mut set = HashSet::new(); + for word in &words { + set.insert(word); + } + assert_eq!(words.len(), set.len()); + }); + }); + + define_corpus(c, "build", "hash/map", WORDS.as_bytes(), move |b| { + let words = get_words_outputs(); + b.iter(|| { + let mut map = 
HashMap::new(); + for &(ref word, len) in &words { + map.insert(word, len); + } + assert_eq!(words.len(), map.len()); + }); + }); + + define_corpus(c, "build", "btree/set", WORDS.as_bytes(), move |b| { + let words = get_words(); + b.iter(|| { + let mut set = BTreeSet::new(); + for word in &words { + set.insert(word); + } + assert_eq!(words.len(), set.len()); + }); + }); + + define_corpus(c, "build", "btree/map", WORDS.as_bytes(), move |b| { + let words = get_words_outputs(); + b.iter(|| { + let mut map = BTreeMap::new(); + for &(ref word, len) in &words { + map.insert(word, len); + } + assert_eq!(words.len(), map.len()); + }); + }); +} + +fn search(c: &mut Criterion) { + let lists = &[("words", WORDS_BIG), ("wikiurls", WIKI_URLS)]; + + for &(name, list) in lists { + let group = format!("search/{}", name); + define(c, &group, "fst/contains", move |b| { + let keys = get_keys(list); + let set = fst::Set::from_iter(&keys).unwrap(); + + let mut i = 0; + b.iter(|| { + assert!(set.contains(&keys[i])); + i = (i + 1) % keys.len(); + }); + }); + } + for &(name, list) in lists { + let group = format!("search/{}", name); + define(c, &group, "btree/contains", move |b| { + let keys = get_keys(list); + let set = BTreeSet::from_iter(&keys); + + let mut i = 0; + b.iter(|| { + assert!(set.contains(&keys[i])); + i = (i + 1) % keys.len(); + }); + }); + } + for &(name, list) in lists { + let group = format!("search/{}", name); + define(c, &group, "hash/contains", move |b| { + let keys = get_keys(list); + let set: HashSet<&String> = HashSet::from_iter(&keys); + + let mut i = 0; + b.iter(|| { + assert!(set.contains(&keys[i])); + i = (i + 1) % keys.len(); + }); + }); + } + for &(name, list) in lists { + let group = format!("search/{}", name); + define(c, &group, "hashfnv/contains", move |b| { + let keys = get_keys(list); + let set = FnvHashSet::from_iter(&keys); + + let mut i = 0; + b.iter(|| { + assert!(set.contains(&keys[i])); + i = (i + 1) % keys.len(); + }); + }); + } +} + +fn 
get_words() -> Vec { + WORDS.lines().map(|s| s.to_owned()).collect() +} + +fn get_words_outputs() -> Vec<(String, u64)> { + WORDS.lines().map(|s| (s.to_owned(), s.len() as u64)).collect() +} + +fn get_keys(s: &str) -> Vec { + s.lines().map(str::to_owned).collect() +} + +fn define( + c: &mut Criterion, + group_name: &str, + bench_name: &str, + bench: impl FnMut(&mut Bencher) + 'static, +) { + let benchmark = Benchmark::new(bench_name, bench) + .sample_size(50) + .warm_up_time(Duration::from_millis(500)) + .measurement_time(Duration::from_secs(3)); + c.bench(group_name, benchmark); +} + +fn define_corpus( + c: &mut Criterion, + group_name: &str, + bench_name: &str, + corpus: &[u8], + bench: impl FnMut(&mut Bencher) + 'static, +) { + let tput = Throughput::Bytes(corpus.len() as u64); + let benchmark = Benchmark::new(bench_name, bench) + .throughput(tput) + .sample_size(30) + .warm_up_time(Duration::from_millis(500)) + .measurement_time(Duration::from_secs(5)); + c.bench(group_name, benchmark); +} + +criterion_group!(g, all); +criterion_main!(g); diff --git a/bench/src/lib.rs b/bench/src/lib.rs new file mode 100644 index 0000000..3b67349 --- /dev/null +++ b/bench/src/lib.rs @@ -0,0 +1,2 @@ +// This is purposely empty. See src/bench.rs instead. We use src/bench.rs +// to avoid including the same file in multiple build targets. 
diff --git a/benches/build.rs b/benches/build.rs deleted file mode 100644 index 6d871c5..0000000 --- a/benches/build.rs +++ /dev/null @@ -1,93 +0,0 @@ -#![feature(test)] - - -extern crate test; - -use std::collections::{BTreeMap, BTreeSet, HashMap, HashSet}; - -use fst::raw::{Builder, Fst}; -use test::Bencher; - -const WORDS: &'static str = include_str!("./../data/words-10000"); - -fn get_words() -> Vec { - WORDS.lines().map(|s| s.to_owned()).collect() -} - -fn get_words_outputs() -> Vec<(String, u64)> { - WORDS.lines().map(|s| (s.to_owned(), s.len() as u64)).collect() -} - -#[bench] -fn build_fst_set(b: &mut Bencher) { - let words = get_words(); - b.bytes = WORDS.len() as u64; - b.iter(|| { - let mut bfst = Builder::memory(); - for word in &words { - bfst.add(word).unwrap(); - } - Fst::from_bytes(bfst.into_inner().unwrap()).unwrap(); - }); -} - -#[bench] -fn build_fst_map(b: &mut Bencher) { - let words = get_words_outputs(); - b.bytes = WORDS.len() as u64; - b.iter(|| { - let mut bfst = Builder::memory(); - for &(ref word, len) in &words { - bfst.insert(word, len).unwrap(); - } - Fst::from_bytes(bfst.into_inner().unwrap()).unwrap(); - }); -} - -#[bench] -fn build_hash_set(b: &mut Bencher) { - let words = get_words(); - b.bytes = WORDS.len() as u64; - b.iter(|| { - let mut set = HashSet::new(); - for word in &words { - set.insert(word); - } - }); -} - -#[bench] -fn build_hash_map(b: &mut Bencher) { - let words = get_words_outputs(); - b.bytes = WORDS.len() as u64; - b.iter(|| { - let mut set = HashMap::new(); - for &(ref word, len) in &words { - set.insert(word, len); - } - }); -} - -#[bench] -fn build_btree_set(b: &mut Bencher) { - let words = get_words(); - b.bytes = WORDS.len() as u64; - b.iter(|| { - let mut set = BTreeSet::new(); - for word in &words { - set.insert(word); - } - }); -} - -#[bench] -fn build_btree_map(b: &mut Bencher) { - let words = get_words_outputs(); - b.bytes = WORDS.len() as u64; - b.iter(|| { - let mut set = BTreeMap::new(); - for &(ref 
word, len) in &words { - set.insert(word, len); - } - }); -} diff --git a/benches/search.rs b/benches/search.rs deleted file mode 100644 index 4960f3e..0000000 --- a/benches/search.rs +++ /dev/null @@ -1,94 +0,0 @@ -#![feature(test)] - - - -#[macro_use] -extern crate lazy_static; -extern crate test; - -const STR_WORDS: &'static str = include_str!("./../data/words-100000"); -const STR_WIKI_URLS: &'static str = include_str!("./../data/wiki-urls-100000"); - -fn get_keys(s: &'static str) -> Vec { - s.lines().map(str::to_owned).collect() -} - -lazy_static! { - static ref WORDS: Vec = get_keys(STR_WORDS); - static ref WIKI_URLS: Vec = get_keys(STR_WIKI_URLS); -} - -macro_rules! search { - ($name:ident, $keys:expr) => { - mod $name { - use std::collections::{BTreeSet, HashSet}; - use std::hash::BuildHasherDefault; - - use fnv::FnvHasher; - use fst::raw::{Builder, Fst}; - use test::Bencher; - - #[bench] - fn fst_contains(b: &mut Bencher) { - lazy_static! { - static ref FST: Fst = { - let mut bfst = Builder::memory(); - for word in $keys.iter() { - bfst.add(word).unwrap(); - } - let bytes = bfst.into_inner().unwrap(); - Fst::from_bytes(bytes).unwrap() - }; - } - let mut i = 0; - b.iter(|| { - i = (i + 1) % $keys.len(); - assert!(FST.contains_key(&$keys[i])); - }) - } - - #[bench] - fn hash_fnv_contains(b: &mut Bencher) { - type Fnv = BuildHasherDefault; - lazy_static! { - static ref SET: HashSet = - { $keys.clone().into_iter().collect() }; - } - let mut i = 0; - b.iter(|| { - i = (i + 1) % $keys.len(); - assert!(SET.contains(&$keys[i])); - }) - } - - #[bench] - fn hash_sip_contains(b: &mut Bencher) { - lazy_static! { - static ref SET: HashSet = - { $keys.clone().into_iter().collect() }; - } - let mut i = 0; - b.iter(|| { - i = (i + 1) % $keys.len(); - assert!(SET.contains(&$keys[i])); - }) - } - - #[bench] - fn btree_contains(b: &mut Bencher) { - lazy_static! 
{ - static ref SET: BTreeSet = - { $keys.clone().into_iter().collect() }; - } - let mut i = 0; - b.iter(|| { - i = (i + 1) % $keys.len(); - assert!(SET.contains(&$keys[i])); - }) - } - } - }; -} - -search!(words, crate::WORDS); -search!(wiki_urls, crate::WIKI_URLS); diff --git a/fst-bin/Cargo.toml b/fst-bin/Cargo.toml index 5987eef..fe654f6 100644 --- a/fst-bin/Cargo.toml +++ b/fst-bin/Cargo.toml @@ -30,7 +30,3 @@ num_cpus = "1.5" regex-automata = { version = "*", path = "/home/andrew/rust/regex-automata", features = ["transducer"] } serde = { version = "1.0.104", features = ["derive"] } tempdir = "0.3" - -[profile.release] -debug = true -opt-level = 3 From 20f7a90323558cd2db88cfec1acd38bfeda582a6 Mon Sep 17 00:00:00 2001 From: Andrew Gallant Date: Sun, 23 Feb 2020 11:55:28 -0500 Subject: [PATCH 08/23] perf: specialize registry case of two cells This is good for a modest ~15% performance improvement in the build/fst/set benchmark. --- src/raw/registry.rs | 16 ++++++++++++++++ 1 file changed, 16 insertions(+) diff --git a/src/raw/registry.rs b/src/raw/registry.rs index da76070..3b00a5b 100644 --- a/src/raw/registry.rs +++ b/src/raw/registry.rs @@ -72,6 +72,22 @@ impl<'a> RegistryCache<'a> { cell.node.clone_from(node); RegistryEntry::NotFound(cell) } + } else if self.cells.len() == 2 { + let cell1 = &mut self.cells[0]; + if !cell1.is_none() && &cell1.node == node { + return RegistryEntry::Found(cell1.addr); + } + + let cell2 = &mut self.cells[1]; + if !cell2.is_none() && &cell2.node == node { + let addr = cell2.addr; + self.cells.swap(0, 1); + return RegistryEntry::Found(addr); + } + + self.cells[1].node.clone_from(node); + self.cells.swap(0, 1); + RegistryEntry::NotFound(&mut self.cells[0]) } else { let find = |c: &RegistryCell| !c.is_none() && &c.node == node; if let Some(i) = self.cells.iter().position(find) { From c7b65fb7230b104d5a2835360dc42a71240cd55e Mon Sep 17 00:00:00 2001 From: Andrew Gallant Date: Sun, 23 Feb 2020 15:11:14 -0500 Subject: [PATCH 09/23] 
style: minor touchups --- src/error.rs | 2 +- src/raw/error.rs | 5 ++--- src/raw/mod.rs | 2 +- src/raw/node.rs | 10 +++++++--- src/raw/tests.rs | 2 +- 5 files changed, 12 insertions(+), 9 deletions(-) diff --git a/src/error.rs b/src/error.rs index 88660ee..1f68761 100644 --- a/src/error.rs +++ b/src/error.rs @@ -5,7 +5,7 @@ use std::io; use crate::raw; /// A `Result` type alias for this crate's `Error` type. -pub type Result = ::std::result::Result; +pub type Result = std::result::Result; /// An error that encapsulates all possible errors in this crate. #[derive(Debug)] diff --git a/src/raw/error.rs b/src/raw/error.rs index 25a1a94..c151af4 100644 --- a/src/raw/error.rs +++ b/src/raw/error.rs @@ -1,4 +1,3 @@ -use std::error; use std::fmt; use std::str; use std::string::FromUtf8Error; @@ -109,8 +108,8 @@ impl fmt::Debug for Error { } } -impl error::Error for Error { - fn source(&self) -> Option<&(dyn error::Error + 'static)> { +impl std::error::Error for Error { + fn source(&self) -> Option<&(dyn std::error::Error + 'static)> { match *self { Error::FromUtf8(ref err) => Some(err), _ => None, diff --git a/src/raw/mod.rs b/src/raw/mod.rs index a4c4869..421dab3 100644 --- a/src/raw/mod.rs +++ b/src/raw/mod.rs @@ -1067,7 +1067,7 @@ fn u64_to_usize(n: u64) -> usize { #[inline] #[cfg(not(target_pointer_width = "64"))] fn u64_to_usize(n: u64) -> usize { - if n > ::std::usize::MAX as u64 { + if n > std::usize::MAX as u64 { panic!( "\ Cannot convert node address {} to a pointer sized variable. 
If this FST diff --git a/src/raw/node.rs b/src/raw/node.rs index a468fca..febd31a 100644 --- a/src/raw/node.rs +++ b/src/raw/node.rs @@ -8,7 +8,9 @@ use byteorder::WriteBytesExt; use crate::raw::build::BuilderNode; use crate::raw::common_inputs::{COMMON_INPUTS, COMMON_INPUTS_INV}; use crate::raw::pack::{pack_size, pack_uint, pack_uint_in, unpack_uint}; -use crate::raw::{u64_to_usize, CompiledAddr, Output, Transition, EMPTY_ADDRESS}; +use crate::raw::{ + u64_to_usize, CompiledAddr, Output, Transition, EMPTY_ADDRESS, +}; /// The threshold (in number of transitions) at which an index is created for /// a node's transitions. This speeds up lookup time at the expense of FST @@ -871,10 +873,12 @@ mod tests { use crate::raw::build::BuilderNode; use crate::raw::node::{node_new, Node}; - use crate::raw::{Builder, CompiledAddr, Fst, Output, Transition, VERSION}; + use crate::raw::{ + Builder, CompiledAddr, Fst, Output, Transition, VERSION, + }; use crate::stream::Streamer; - const NEVER_LAST: CompiledAddr = ::std::u64::MAX as CompiledAddr; + const NEVER_LAST: CompiledAddr = std::u64::MAX as CompiledAddr; #[test] fn prop_emits_inputs() { diff --git a/src/raw/tests.rs b/src/raw/tests.rs index 6ca7fc4..4a05f3c 100644 --- a/src/raw/tests.rs +++ b/src/raw/tests.rs @@ -112,7 +112,7 @@ fn fst_set_100000() { assert!( fst.get(word).is_some(), "failed to find word: {}", - ::std::str::from_utf8(word).unwrap() + std::str::from_utf8(word).unwrap() ); } } From ab44e3e005778a0919689f348458db5dec62ea7d Mon Sep 17 00:00:00 2001 From: Andrew Gallant Date: Sun, 23 Feb 2020 17:19:12 -0500 Subject: [PATCH 10/23] api: make FSTs generic over AsRef<[u8]> This change was not nearly as bad as I thought it would be. Instead of trying to provide all the different possible ways to store some bytes, we instead make FSTs maximally flexible by accepting any type that can cheaply provide byte slice. 
This should resolve a number of issues with folks constructing FSTs in ways that weren't supported by the old constructors. As a bonus, we no longer need to directly depend on a specific implementation of memory maps. Conveniently, the `Mmap` type in the `memmap` crate already implements `AsRef<[u8]>`, so using memory maps is as simple as let mmap = memmap::Mmap::map(&File::open("something.fst").unwrap()); let fst = Fst::new(mmap).unwrap(); Fixes #92, Fixes #94, Fixes #97 --- Cargo.toml | 5 +- fst-bin/Cargo.toml | 1 + fst-bin/src/cmd/csv.rs | 2 +- fst-bin/src/cmd/dot.rs | 2 +- fst-bin/src/cmd/dupes.rs | 2 +- fst-bin/src/cmd/fuzzy.rs | 3 +- fst-bin/src/cmd/grep.rs | 8 +- fst-bin/src/cmd/mod.rs | 16 --- fst-bin/src/cmd/node.rs | 3 +- fst-bin/src/cmd/range.rs | 3 +- fst-bin/src/cmd/rust.rs | 6 +- fst-bin/src/cmd/union.rs | 3 +- fst-bin/src/merge.rs | 42 +++--- fst-bin/src/util.rs | 20 ++- fst-regex/Cargo.toml | 4 - src/lib.rs | 20 ++- src/map.rs | 114 ++++++--------- src/raw/build.rs | 10 +- src/raw/error.rs | 12 +- src/raw/mmap.rs | 87 ------------ src/raw/mod.rs | 295 ++++++++++++++++++--------------------- src/raw/node.rs | 6 +- src/raw/ops.rs | 6 +- src/raw/tests.rs | 58 ++++---- src/set.rs | 113 ++++++--------- tests/test.rs | 10 +- 26 files changed, 346 insertions(+), 505 deletions(-) delete mode 100644 src/raw/mmap.rs diff --git a/Cargo.toml b/Cargo.toml index a2a9a53..bd7d405 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -18,18 +18,17 @@ edition = "2018" members = ["bench", "fst-bin"] [features] -default = ["mmap"] -mmap = ["memmap"] +default = [] levenshtein = ["utf8-ranges"] [dependencies] byteorder = "1" -memmap = { version = "0.6.0", optional = true } utf8-ranges = { version = "1", optional = true } [dev-dependencies] fnv = "1.0.5" lazy_static = "0.2.8" +memmap = "0.7" quickcheck = { version = "0.7", default-features = false } rand = "0.5" diff --git a/fst-bin/Cargo.toml b/fst-bin/Cargo.toml index fe654f6..c38bd83 100644 --- a/fst-bin/Cargo.toml +++ 
b/fst-bin/Cargo.toml @@ -26,6 +26,7 @@ csv = "1.1.3" docopt = "1.1" fst = { path = "..", version = "0.3", features = ["levenshtein"] } lines = "0.0" +memmap = "0.7" num_cpus = "1.5" regex-automata = { version = "*", path = "/home/andrew/rust/regex-automata", features = ["transducer"] } serde = { version = "1.0.104", features = ["derive"] } diff --git a/fst-bin/src/cmd/csv.rs b/fst-bin/src/cmd/csv.rs index 9a26fa2..2989f96 100644 --- a/fst-bin/src/cmd/csv.rs +++ b/fst-bin/src/cmd/csv.rs @@ -36,7 +36,7 @@ pub fn run(argv: Vec) -> Result<(), Error> { let wtr = util::get_writer(args.arg_output.as_ref())?; let mut wtr = csv::Writer::from_writer(wtr); - let fst = unsafe { fst::raw::Fst::from_path(args.arg_input) }?; + let fst = unsafe { util::mmap_fst(args.arg_input)? }; let mut set = BitSet::with_capacity(fst.len()); if args.cmd_edges { diff --git a/fst-bin/src/cmd/dot.rs b/fst-bin/src/cmd/dot.rs index 050d8aa..13f0c23 100644 --- a/fst-bin/src/cmd/dot.rs +++ b/fst-bin/src/cmd/dot.rs @@ -61,7 +61,7 @@ pub fn run(argv: Vec) -> Result<(), Error> { .unwrap_or_else(|e| e.exit()); let mut wtr = util::get_buf_writer(args.arg_output.as_ref())?; - let fst = unsafe { fst::raw::Fst::from_path(&args.arg_input) }?; + let fst = unsafe { util::mmap_fst(&args.arg_input)? }; let mut set = BitSet::with_capacity(fst.len()); let mut stack = vec![fst.root().addr()]; diff --git a/fst-bin/src/cmd/dupes.rs b/fst-bin/src/cmd/dupes.rs index 285d807..42d9fcc 100644 --- a/fst-bin/src/cmd/dupes.rs +++ b/fst-bin/src/cmd/dupes.rs @@ -63,7 +63,7 @@ pub fn run(argv: Vec) -> Result<(), Error> { .unwrap_or_else(|e| e.exit()); let mut wtr = util::get_buf_writer(args.arg_output.as_ref())?; - let fst = unsafe { fst::raw::Fst::from_path(args.arg_input) }?; + let fst = unsafe { util::mmap_fst(args.arg_input)? 
}; let mut set = BitSet::with_capacity(fst.len()); let mut node_counts = HashMap::with_capacity(10_000); diff --git a/fst-bin/src/cmd/fuzzy.rs b/fst-bin/src/cmd/fuzzy.rs index 649c54c..a8ed249 100644 --- a/fst-bin/src/cmd/fuzzy.rs +++ b/fst-bin/src/cmd/fuzzy.rs @@ -2,7 +2,6 @@ use std::io; use docopt::Docopt; use fst::automaton::Levenshtein; -use fst::raw::Fst; use serde::Deserialize; use crate::util; @@ -48,7 +47,7 @@ pub fn run(argv: Vec) -> Result<(), Error> { let args: Args = Docopt::new(USAGE) .and_then(|d| d.argv(&argv).deserialize()) .unwrap_or_else(|e| e.exit()); - let fst = unsafe { Fst::from_path(&args.arg_fst) }?; + let fst = unsafe { util::mmap_fst(&args.arg_fst)? }; let lev = Levenshtein::new(&args.arg_query, args.flag_distance)?; let mut q = fst.search(&lev); if let Some(ref start) = args.flag_start { diff --git a/fst-bin/src/cmd/grep.rs b/fst-bin/src/cmd/grep.rs index c1b6ce7..09513ee 100644 --- a/fst-bin/src/cmd/grep.rs +++ b/fst-bin/src/cmd/grep.rs @@ -1,9 +1,7 @@ use std::io; use docopt::Docopt; -use fst::raw::Fst; use regex_automata::dense; -// use regex_automata::sparse::SparseDFA; use serde::Deserialize; use crate::util; @@ -40,7 +38,7 @@ pub fn run(argv: Vec) -> Result<(), Error> { let args: Args = Docopt::new(USAGE) .and_then(|d| d.argv(&argv).deserialize()) .unwrap_or_else(|e| e.exit()); - let fst = unsafe { Fst::from_path(&args.arg_fst) }?; + let fst = unsafe { util::mmap_fst(&args.arg_fst)? }; let dense_dfa = dense::Builder::new() .anchored(true) .byte_classes(false) @@ -50,10 +48,6 @@ pub fn run(argv: Vec) -> Result<(), Error> { dense::DenseDFA::Standard(dfa) => dfa, _ => unreachable!(), }; - // let dfa = match dense_dfa.to_sparse()? 
{ - // SparseDFA::Standard(dfa) => dfa, - // _ => unreachable!(), - // }; let mut q = fst.search(&dfa); if let Some(ref start) = args.flag_start { q = q.ge(start); diff --git a/fst-bin/src/cmd/mod.rs b/fst-bin/src/cmd/mod.rs index e76f157..ff283e8 100644 --- a/fst-bin/src/cmd/mod.rs +++ b/fst-bin/src/cmd/mod.rs @@ -9,19 +9,3 @@ pub mod range; pub mod rust; pub mod set; pub mod union; - -// If compile times become unruly, comment out unused modules above and use -// the following macro to satisfying the compiler. -// macro_rules! unused { - // ($($name:ident),*) => { - // $( - // pub mod $name { - // pub fn run(_: Vec) -> Result<(), ::Error> { - // unimplemented!() - // } - // } - // )* - // } -// } -// -// unused! { csv, dot, fuzzy, grep, map, union } diff --git a/fst-bin/src/cmd/node.rs b/fst-bin/src/cmd/node.rs index e6f5946..5364965 100644 --- a/fst-bin/src/cmd/node.rs +++ b/fst-bin/src/cmd/node.rs @@ -1,5 +1,4 @@ use docopt::Docopt; -use fst::raw::Fst; use serde::Deserialize; use crate::util; @@ -34,7 +33,7 @@ pub fn run(argv: Vec) -> Result<(), Error> { .and_then(|d| d.argv(&argv).deserialize()) .unwrap_or_else(|e| e.exit()); let mut wtr = util::get_buf_writer::<&str>(None)?; - let fst = unsafe { Fst::from_path(&args.arg_fst) }?; + let fst = unsafe { util::mmap_fst(&args.arg_fst)? }; let node = fst.node(args.arg_node_address); w!(wtr, "{:?}", node); Ok(()) diff --git a/fst-bin/src/cmd/range.rs b/fst-bin/src/cmd/range.rs index 361cc18..2252109 100644 --- a/fst-bin/src/cmd/range.rs +++ b/fst-bin/src/cmd/range.rs @@ -1,7 +1,6 @@ use std::io; use docopt::Docopt; -use fst::raw::Fst; use serde::Deserialize; use crate::util; @@ -38,7 +37,7 @@ pub fn run(argv: Vec) -> Result<(), Error> { let args: Args = Docopt::new(USAGE) .and_then(|d| d.argv(&argv).deserialize()) .unwrap_or_else(|e| e.exit()); - let fst = unsafe { Fst::from_path(&args.arg_fst) }?; + let fst = unsafe { util::mmap_fst(&args.arg_fst)? 
}; let mut q = fst.range(); if let Some(ref start) = args.flag_start { q = q.ge(start); diff --git a/fst-bin/src/cmd/rust.rs b/fst-bin/src/cmd/rust.rs index 89994f5..c48cbb3 100644 --- a/fst-bin/src/cmd/rust.rs +++ b/fst-bin/src/cmd/rust.rs @@ -49,11 +49,7 @@ pub fn run(argv: Vec) -> Result<(), Error> { w!(wtr, "lazy_static! {{"); w!(wtr, " pub static ref {}: ::fst::raw::Fst = ", args.arg_name); - w!( - wtr, - " ::fst::raw::Fst::from_static_slice({}_BYTES).unwrap();", - args.arg_name - ); + w!(wtr, " ::fst::raw::Fst::new({}_BYTES).unwrap();", args.arg_name); w!(wtr, "}}\n"); w!(wtr, "const {}_BYTES: &'static [u8] = b\"\\", args.arg_name); diff --git a/fst-bin/src/cmd/union.rs b/fst-bin/src/cmd/union.rs index 2b7c25d..2517957 100644 --- a/fst-bin/src/cmd/union.rs +++ b/fst-bin/src/cmd/union.rs @@ -41,7 +41,8 @@ pub fn run(argv: Vec) -> Result<(), Error> { let mut sets = vec![]; for set_path in &args.arg_input { - sets.push(unsafe { fst::Set::from_path(set_path) }?); + let fst = unsafe { util::mmap_fst(set_path)? 
}; + sets.push(fst::Set::from(fst)); } let union = sets.iter().collect::().union(); merged.extend_stream(union)?; diff --git a/fst-bin/src/merge.rs b/fst-bin/src/merge.rs index 84cecbc..81ba237 100644 --- a/fst-bin/src/merge.rs +++ b/fst-bin/src/merge.rs @@ -9,10 +9,11 @@ use std::sync::Arc; use std::thread; use chan; -use fst::{self, Streamer, raw}; +use fst::{self, raw, Streamer}; use num_cpus; use tempdir::TempDir; +use crate::util; use crate::Error; pub struct Merger { @@ -29,10 +30,14 @@ pub struct Merger { type KV = (String, u64); impl Merger -where I: Iterator> + Send + 'static { +where + I: Iterator> + Send + 'static, +{ pub fn new(it: T, output: P) -> Self - where P: AsRef, - T: IntoIterator { + where + P: AsRef, + T: IntoIterator, + { Merger { it: it.into_iter(), output: output.as_ref().to_path_buf(), @@ -46,7 +51,9 @@ where I: Iterator> + Send + 'static { } pub fn value_merger(mut self, f: F) -> Self - where F: Fn(u64, u64) -> u64 + Send + Sync + 'static { + where + F: Fn(u64, u64) -> u64 + Send + Sync + 'static, + { self.value_merger = Some(Arc::new(f)); self } @@ -132,9 +139,11 @@ fn batcher( batch_size: u32, threads: u32, ) -> chan::Receiver, Error>> -where T: Send + 'static, - IT: Iterator> + Send + 'static, - I: IntoIterator + Send + 'static { +where + T: Send + 'static, + IT: Iterator> + Send + 'static, + I: IntoIterator + Send + 'static, +{ let batch_size = batch_size as usize; let (send, recv) = chan::sync(cmp::min(1, threads as usize / 3)); let it = it.into_iter(); @@ -182,10 +191,7 @@ impl Sorters { rsend.send(results); }); } - Sorters { - send: bsend, - results: rrecv, - } + Sorters { send: bsend, results: rrecv } } fn create_fst(&self, batch: B) { @@ -223,7 +229,9 @@ impl Batchable for KvBatch { for &(ref k, v) in &self.kvs { match builder.insert(k, v) { Ok(_) => {} - Err(fst::Error::Fst(fst::raw::Error::DuplicateKey { .. })) => {} + Err(fst::Error::Fst(fst::raw::Error::DuplicateKey { + .. 
+ })) => {} Err(err) => return Err(From::from(err)), } } @@ -249,15 +257,15 @@ impl Batchable for UnionBatch { let mut fsts = vec![]; for path in &self.fsts { - fsts.push(unsafe { raw::Fst::from_path(path) }?); + fsts.push(unsafe { util::mmap_fst(path)? }); } let mut union = fsts.iter().collect::().union(); while let Some((key, outputs)) = union.next() { let mut merged = 0; if let Some(ref value_merger) = self.value_merger { - merged = outputs.iter().fold(0, |merged, iv| { - value_merger(merged, iv.value) - }); + merged = outputs + .iter() + .fold(0, |merged, iv| value_merger(merged, iv.value)); } builder.insert(key, merged)?; } diff --git a/fst-bin/src/util.rs b/fst-bin/src/util.rs index 7045c0d..babd88a 100644 --- a/fst-bin/src/util.rs +++ b/fst-bin/src/util.rs @@ -4,11 +4,18 @@ use std::io::{self, BufRead}; use std::path::{Path, PathBuf}; use csv; +use fst::raw::{Fst, Output}; use fst::{IntoStreamer, Streamer}; -use fst::raw::Output; +use memmap::Mmap; use crate::Error; +pub unsafe fn mmap_fst>(path: P) -> Result, Error> { + let mmap = Mmap::map(&File::open(path)?)?; + let fst = Fst::new(mmap)?; + Ok(fst) +} + pub fn escape_input(b: u8) -> String { String::from_utf8(ascii::escape_default(b).collect::>()).unwrap() } @@ -61,9 +68,11 @@ pub fn print_stream<'f, W, I, S>( outputs: bool, stream: I, ) -> Result<(), Error> -where W: io::Write, - I: for<'a> IntoStreamer<'a, Into=S, Item=(&'a [u8], Output)>, - S: 'f + for<'a> Streamer<'a, Item=(&'a [u8], Output)> { +where + W: io::Write, + I: for<'a> IntoStreamer<'a, Into = S, Item = (&'a [u8], Output)>, + S: 'f + for<'a> Streamer<'a, Item = (&'a [u8], Output)>, +{ let mut stream = stream.into_stream(); if outputs { let mut wtr = csv::Writer::from_writer(wtr); @@ -86,7 +95,8 @@ pub struct ConcatLines { cur: Option, } -type Lines = io::Lines>>; +type Lines = + io::Lines>>; impl ConcatLines { pub fn new(mut inputs: Vec) -> ConcatLines { diff --git a/fst-regex/Cargo.toml b/fst-regex/Cargo.toml index f867718..d145117 100644 
--- a/fst-regex/Cargo.toml +++ b/fst-regex/Cargo.toml @@ -12,10 +12,6 @@ keywords = ["search", "information", "retrieval", "dictionary", "map"] license = "Unlicense/MIT" edition = "2018" -[features] -mmap = ["fst/mmap"] -default = ["mmap"] - [dependencies] fst = { path = "..", version = "0.3.1", default-features = false } regex-syntax = "0.3" diff --git a/src/lib.rs b/src/lib.rs index c85f217..03048aa 100644 --- a/src/lib.rs +++ b/src/lib.rs @@ -87,11 +87,16 @@ Note that an error will be returned if a Levenshtein automaton gets too big "## )] /*! -# Example: stream a map to a file +# Example: stream to a file and memory map it for searching This shows how to create a `MapBuilder` that will stream construction of the map to a file. Notably, this will never store the entire transducer in memory. -Instead, only constant memory is required. +Instead, only constant memory is required during construction. + +For the search phase, we use the +[`memmap`](https://crates.io/memmap) +crate to make the file available as a `&[u8]` without necessarily reading it +all into memory (the operating system will automatically handle that for you). ```rust,no_run # fn example() -> Result<(), fst::Error> { @@ -99,6 +104,7 @@ use std::fs::File; use std::io; use fst::{IntoStreamer, Streamer, Map, MapBuilder}; +use memmap::Mmap; // This is where we'll write our map to. let mut wtr = io::BufWriter::new(File::create("map.fst")?); @@ -115,7 +121,8 @@ build.finish()?; // At this point, the map has been constructed. Now we'd like to search it. // This creates a memory map, which enables searching the map without loading // all of it into memory. -let map = unsafe { Map::from_path("map.fst") }?; +let mmap = unsafe { Mmap::map(&File::open("map.fst")?)? }; +let map = Map::new(mmap)?; // Query for keys that are greater than or equal to clarence. let mut stream = map.range().ge("clarence").into_stream(); @@ -239,8 +246,9 @@ actually reading the entire set/map into memory. 
This use case is served well by *memory maps*, which lets one assign the entire contents of a file to a contiguous region of virtual memory. -Indeed, this crate encourages this mode of operation. Both sets and maps have -methods for memory mapping a finite state transducer from disk. +Indeed, this crate encourages this mode of operation. Both sets and maps can +be constructed from anything that provides an `AsRef<[u8]>` implementation, +which any memory map should. This is particularly important for long running processes that use this crate, since it enables the operating system to determine which regions of your @@ -255,7 +263,7 @@ solid state drives where seek time is eliminated. Nevertheless, solid state drives are not ubiquitous and it is possible that the OS will not be smart enough to keep your memory mapped transducers in the page cache. In that case, it is advisable to load the entire transducer into your process's memory (e.g., -`Set::from_bytes`). +calling `Set::new` with a `Vec`). # Streams diff --git a/src/map.rs b/src/map.rs index 6bada89..c53b930 100644 --- a/src/map.rs +++ b/src/map.rs @@ -1,8 +1,6 @@ use std::fmt; use std::io; use std::iter::{self, FromIterator}; -#[cfg(feature = "mmap")] -use std::path::Path; use crate::automaton::{AlwaysMatch, Automaton}; use crate::raw; @@ -18,8 +16,7 @@ use crate::Result; /// /// A key feature of `Map` is that it can be serialized to disk compactly. Its /// underlying representation is built such that the `Map` can be memory mapped -/// (`Map::from_path`) and searched without necessarily loading the entire -/// map into memory. +/// and searched without necessarily loading the entire map into memory. /// /// It supports most common operations associated with maps, such as key /// lookup and search. 
It also supports set operations on its keys along with @@ -54,41 +51,36 @@ use crate::Result; /// Keys will always be byte strings; however, we may grow more conveniences /// around dealing with them (such as a serialization/deserialization step, /// although it isn't clear where exactly this should live). -pub struct Map(raw::Fst); - -impl Map { - /// Opens a map stored at the given file path via a memory map. - /// - /// The map must have been written with a compatible finite state - /// transducer builder (`MapBuilder` qualifies). If the format is invalid - /// or if there is a mismatch between the API version of this library - /// and the map, then an error is returned. - /// - /// This is unsafe because Rust programs cannot guarantee that memory - /// backed by a memory mapped file won't be mutably aliased. It is up to - /// the caller to enforce that the memory map is not modified while it is - /// opened. - #[cfg(feature = "mmap")] - pub unsafe fn from_path>(path: P) -> Result { - raw::Fst::from_path(path).map(Map) - } +pub struct Map>(raw::Fst); - /// Creates a map from its representation as a raw byte sequence. +impl Map> { + /// Create a `Map` from an iterator of lexicographically ordered byte + /// strings and associated values. /// - /// Note that this operation is very cheap (no allocations and no copies). + /// If the iterator does not yield unique keys in lexicographic order, then + /// an error is returned. /// - /// The map must have been written with a compatible finite state - /// transducer builder (`MapBuilder` qualifies). If the format is invalid - /// or if there is a mismatch between the API version of this library - /// and the map, then an error is returned. - pub fn from_bytes(bytes: Vec) -> Result { - raw::Fst::from_bytes(bytes).map(Map) + /// Note that this is a convenience function to build a map in memory. + /// To build a map that streams to an arbitrary `io::Write`, use + /// `MapBuilder`. 
+ pub fn from_iter(iter: I) -> Result>> + where + K: AsRef<[u8]>, + I: IntoIterator, + { + let mut builder = MapBuilder::memory(); + builder.extend_iter(iter)?; + Map::new(builder.into_inner()?) } +} +impl> Map { /// Creates a map from its representation as a raw byte sequence. /// - /// This accepts a static byte slice, which may be useful if the FST data is - /// embedded into the program. + /// This accepts anything that can be cheaply converted to a `&[u8]`. The + /// caller is responsible for guaranteeing that the given bytes refer to + /// a valid FST. While memory safety will not be violated by invalid input, + /// a panic could occur while reading the FST at any point. /// /// # Example /// @@ -101,29 +93,10 @@ impl Map { /// # }; /// # static FST: &[u8] = &[]; /// - /// let map = Map::from_static_slice(FST).unwrap(); + /// let map = Map::new(FST).unwrap(); /// ``` - pub fn from_static_slice(bytes: &'static [u8]) -> Result { - raw::Fst::from_static_slice(bytes).map(Map) - } - - /// Create a `Map` from an iterator of lexicographically ordered byte - /// strings and associated values. - /// - /// If the iterator does not yield unique keys in lexicographic order, then - /// an error is returned. - /// - /// Note that this is a convenience function to build a map in memory. - /// To build a map that streams to an arbitrary `io::Write`, use - /// `MapBuilder`. - pub fn from_iter(iter: I) -> Result - where - K: AsRef<[u8]>, - I: IntoIterator, - { - let mut builder = MapBuilder::memory(); - builder.extend_iter(iter)?; - Map::from_bytes(builder.into_inner()?) + pub fn new(data: D) -> Result> { + raw::Fst::new(data).map(Map) } /// Tests the membership of a single key. @@ -392,19 +365,19 @@ impl Map { /// Returns a reference to the underlying raw finite state transducer. 
#[inline] - pub fn as_fst(&self) -> &raw::Fst { + pub fn as_fst(&self) -> &raw::Fst { &self.0 } } -impl Default for Map { +impl Default for Map> { #[inline] - fn default() -> Map { + fn default() -> Map> { Map::from_iter(iter::empty::<(&[u8], u64)>()).unwrap() } } -impl fmt::Debug for Map { +impl> fmt::Debug for Map { fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { write!(f, "Map([")?; let mut stream = self.stream(); @@ -421,27 +394,27 @@ impl fmt::Debug for Map { } // Construct a map from an Fst object. -impl From for Map { +impl> From> for Map { #[inline] - fn from(fst: raw::Fst) -> Map { + fn from(fst: raw::Fst) -> Map { Map(fst) } } /// Returns the underlying finite state transducer. -impl AsRef for Map { +impl> AsRef> for Map { #[inline] - fn as_ref(&self) -> &raw::Fst { + fn as_ref(&self) -> &raw::Fst { &self.0 } } -impl<'m, 'a> IntoStreamer<'a> for &'m Map { +impl<'m, 'a, D: AsRef<[u8]>> IntoStreamer<'a> for &'m Map { type Item = (&'a [u8], u64); type Into = Stream<'m>; #[inline] - fn into_stream(self) -> Self::Into { + fn into_stream(self) -> Stream<'m> { Stream(self.0.stream()) } } @@ -493,7 +466,7 @@ impl<'m, 'a> IntoStreamer<'a> for &'m Map { /// let bytes = build.into_inner().unwrap(); /// /// // At this point, the map has been constructed, but here's how to read it. -/// let map = Map::from_bytes(bytes).unwrap(); +/// let map = Map::new(bytes).unwrap(); /// let mut stream = map.into_stream(); /// let mut kvs = vec![]; /// while let Some((k, v)) = stream.next() { @@ -527,7 +500,9 @@ impl<'m, 'a> IntoStreamer<'a> for &'m Map { /// build.finish().unwrap(); /// /// // At this point, the map has been constructed, but here's how to read it. -/// let map = unsafe { Map::from_path("map.fst").unwrap() }; +/// // NOTE: Normally, one would memory map a file instead of reading its +/// // entire contents on to the heap. 
+/// let map = Map::new(std::fs::read("map.fst").unwrap()).unwrap(); /// let mut stream = map.into_stream(); /// let mut kvs = vec![]; /// while let Some((k, v)) = stream.next() { @@ -548,9 +523,10 @@ impl MapBuilder> { MapBuilder(raw::Builder::memory()) } - /// Finishes the construction of the map and returning it. - pub fn into_map(self) -> Map { - self.into_inner().and_then(Map::from_bytes).unwrap() + /// Finishes the construction of the map and returns it. + #[inline] + pub fn into_map(self) -> Map> { + Map(self.0.into_fst()) } } diff --git a/src/raw/build.rs b/src/raw/build.rs index 421b94c..ddceaa4 100644 --- a/src/raw/build.rs +++ b/src/raw/build.rs @@ -7,8 +7,8 @@ use crate::raw::counting_writer::CountingWriter; use crate::raw::error::Error; use crate::raw::registry::{Registry, RegistryEntry}; use crate::raw::{ - CompiledAddr, FstType, Output, Transition, EMPTY_ADDRESS, NONE_ADDRESS, - VERSION, + CompiledAddr, Fst, FstType, Output, Transition, EMPTY_ADDRESS, + NONE_ADDRESS, VERSION, }; // use raw::registry_minimal::{Registry, RegistryEntry}; use crate::stream::{IntoStreamer, Streamer}; @@ -103,6 +103,12 @@ impl Builder> { pub fn memory() -> Self { Builder::new(Vec::with_capacity(10 * (1 << 10))).unwrap() } + + /// Finishes construction of the FST and returns it. + #[inline] + pub fn into_fst(self) -> Fst> { + self.into_inner().and_then(Fst::new).unwrap() + } } impl Builder { diff --git a/src/raw/error.rs b/src/raw/error.rs index c151af4..284fccc 100644 --- a/src/raw/error.rs +++ b/src/raw/error.rs @@ -26,7 +26,10 @@ pub enum Error { /// An unexpected error occurred while reading a finite state transducer. /// Usually this occurs because the data is corrupted or is not actually /// a finite state transducer serialized by this library. - Format, + Format { + /// The number of bytes given to the FST constructor. + size: usize, + }, /// A duplicate key was inserted into a finite state transducer, which is /// not allowed. 
DuplicateKey { @@ -72,11 +75,12 @@ to change the version of the 'fst' crate you're using, or re-generate the FST.", expected, got ), - Format => write!( + Format { size } => write!( f, "\ -Error opening FST: An unknown error occurred. This usually means you're trying -to read data that isn't actually an encoded FST." +Error opening FST with size {} bytes: An unknown error occurred. This +usually means you're trying to read data that isn't actually an encoded FST.", + size ), DuplicateKey { ref got } => write!( f, diff --git a/src/raw/mmap.rs b/src/raw/mmap.rs deleted file mode 100644 index be63c98..0000000 --- a/src/raw/mmap.rs +++ /dev/null @@ -1,87 +0,0 @@ -use std::fs; -use std::io; -use std::path::Path; -use std::sync::Arc; - -use memmap::Mmap; - -/// A read only view into a memory map. -/// -/// Opening a memory map is unsafe because we cannot guarantee that its -/// underlying memory is not mutated by external processes. This read only -/// memory map guarantees that consumers can at least never modify the -/// underlying data. -/// -/// It is principally useful for controlling which region of a file an `Fst` -/// reads. -pub struct MmapReadOnly { - map: Arc, - offset: usize, - len: usize, -} - -impl MmapReadOnly { - /// Create a new memory map from an existing file handle. - /// - /// This is unsafe because Rust programs cannot guarantee that memory - /// backed by a memory mapped file won't be mutably aliased. It is up to - /// the caller to enforce that the memory map is not modified while it is - /// opened. - #[inline] - pub unsafe fn open(file: &fs::File) -> io::Result { - let mmap = Mmap::map(file)?; - let len = mmap.len(); - Ok(MmapReadOnly { map: Arc::new(mmap), offset: 0, len: len }) - } - - /// Open a new memory map from the path given. - /// - /// This is unsafe because Rust programs cannot guarantee that memory - /// backed by a memory mapped file won't be mutably aliased. 
It is up to - /// the caller to enforce that the memory map is not modified while it is - /// opened. - pub unsafe fn open_path>( - path: P, - ) -> io::Result { - MmapReadOnly::open(&fs::File::open(path)?) - } - - /// Returns the size in byte of the memory map. - /// - /// If it is a range, the size is the size of the range. - #[inline] - pub fn len(&self) -> usize { - self.len - } - - /// Slice this memory map to a new `offset` and `len`. - /// - /// If the new range is outside the bounds of `self`, then this method - /// panics. - #[inline] - pub fn range(&self, offset: usize, len: usize) -> MmapReadOnly { - assert!(offset + len <= self.len); - MmapReadOnly { - map: self.map.clone(), - offset: self.offset + offset, - len: len, - } - } - - /// Read the memory map as a `&[u8]`. - #[inline] - pub fn as_slice(&self) -> &[u8] { - &self.map[self.offset..self.offset + self.len] - } -} - -impl Clone for MmapReadOnly { - #[inline] - fn clone(&self) -> MmapReadOnly { - MmapReadOnly { - map: self.map.clone(), - offset: self.offset, - len: self.len, - } - } -} diff --git a/src/raw/mod.rs b/src/raw/mod.rs index 421dab3..034939c 100644 --- a/src/raw/mod.rs +++ b/src/raw/mod.rs @@ -18,13 +18,8 @@ option of specifying a merge strategy for output values. Most of the rest of the types are streams from set operations. 
*/ -use std::borrow::Cow; use std::cmp; use std::fmt; -use std::ops::Deref; -#[cfg(feature = "mmap")] -use std::path::Path; -use std::sync::Arc; use byteorder::{LittleEndian, ReadBytesExt}; @@ -34,8 +29,6 @@ use crate::stream::{IntoStreamer, Streamer}; pub use self::build::Builder; pub use self::error::Error; -#[cfg(feature = "mmap")] -pub use self::mmap::MmapReadOnly; use self::node::node_new; pub use self::node::{Node, Transitions}; pub use self::ops::{ @@ -47,8 +40,6 @@ mod build; mod common_inputs; mod counting_writer; mod error; -#[cfg(feature = "mmap")] -mod mmap; mod node; mod ops; mod pack; @@ -277,42 +268,20 @@ pub type CompiledAddr = usize; /// (excellent for in depth overview) /// * [Comparison of Construction Algorithms for Minimal, Acyclic, Deterministic, Finite-State Automata from Sets of Strings](http://www.cs.mun.ca/~harold/Courses/Old/CS4750/Diary/q3p2qx4lv71m5vew.pdf) /// (excellent for surface level overview) -pub struct Fst { +pub struct Fst> { + meta: Meta, + data: D, +} + +#[derive(Debug)] +struct Meta { version: u64, - data: FstData, root_addr: CompiledAddr, ty: FstType, len: usize, } -impl Fst { - /// Opens a transducer stored at the given file path via a memory map. - /// - /// The fst must have been written with a compatible finite state - /// transducer builder (`Builder` qualifies). If the format is invalid or - /// if there is a mismatch between the API version of this library and the - /// fst, then an error is returned. - /// - /// This is unsafe because Rust programs cannot guarantee that memory - /// backed by a memory mapped file won't be mutably aliased. It is up to - /// the caller to enforce that the memory map is not modified while it is - /// opened. - #[cfg(feature = "mmap")] - pub unsafe fn from_path>(path: P) -> Result { - Fst::from_mmap(MmapReadOnly::open_path(path)?) - } - - /// Opens a transducer from a `MmapReadOnly`. - /// - /// This is useful if a transducer is serialized to only a part of a file. 
- /// A `MmapReadOnly` lets one control which region of the file is used for - /// the transducer. - #[cfg(feature = "mmap")] - #[inline] - pub fn from_mmap(mmap: MmapReadOnly) -> Result { - Fst::new(FstData::Mmap(mmap)) - } - +impl> Fst { /// Creates a transducer from its representation as a raw byte sequence. /// /// Note that this operation is very cheap (no allocations and no copies). @@ -322,55 +291,29 @@ impl Fst { /// if there is a mismatch between the API version of this library and the /// fst, then an error is returned. #[inline] - pub fn from_bytes(bytes: Vec) -> Result { - Fst::new(FstData::Cow(Cow::Owned(bytes))) - } - - /// Creates a transducer from its representation as a raw byte sequence. - /// - /// This accepts a static byte slice, which may be useful if the Fst - /// is embedded into source code. - #[inline] - pub fn from_static_slice(bytes: &'static [u8]) -> Result { - Fst::new(FstData::Cow(Cow::Borrowed(bytes))) - } - - /// Creates a transducer from a shared vector at the given offset and - /// length. - /// - /// This permits creating multiple transducers from a single region of - /// owned memory. - #[inline] - pub fn from_shared_bytes( - bytes: Arc>, - offset: usize, - len: usize, - ) -> Result { - Fst::new(FstData::Shared { vec: bytes, offset: offset, len: len }) - } - - fn new(data: FstData) -> Result { - if data.len() < 32 { - return Err(Error::Format.into()); + pub fn new(data: D) -> Result { + let bytes = data.as_ref(); + if bytes.len() < 32 { + return Err(Error::Format { size: bytes.len() }.into()); } // The read_u64 unwraps below are OK because they can never fail. // They can only fail when there is an IO error or if there is an // unexpected EOF. However, we are reading from a byte slice (no // IO errors possible) and we've confirmed the byte slice is at least // N bytes (no unexpected EOF). 
- let version = (&*data).read_u64::().unwrap(); + let version = (&bytes[0..]).read_u64::().unwrap(); if version == 0 || version > VERSION { return Err( Error::Version { expected: VERSION, got: version }.into() ); } - let ty = (&data[8..]).read_u64::().unwrap(); + let ty = (&bytes[8..]).read_u64::().unwrap(); let root_addr = { - let mut last = &data[data.len() - 8..]; + let mut last = &bytes[bytes.len() - 8..]; u64_to_usize(last.read_u64::().unwrap()) }; let len = { - let mut last2 = &data[data.len() - 16..]; + let mut last2 = &bytes[bytes.len() - 16..]; u64_to_usize(last2.read_u64::().unwrap()) }; // The root node is always the last node written, so its address should @@ -391,63 +334,34 @@ impl Fst { // special address `0`. In that case, the FST is the smallest it can // be: the version, type, root address and number of nodes. That's // 32 bytes (8 byte u64 each). - // - // This is essentially our own little checksum. - if (root_addr == EMPTY_ADDRESS && data.len() != 32) - && root_addr + 17 != data.len() + if (root_addr == EMPTY_ADDRESS && bytes.len() != 32) + && root_addr + 17 != bytes.len() { - return Err(Error::Format.into()); + return Err(Error::Format { size: bytes.len() }.into()); } - Ok(Fst { - version: version, - data: data, - root_addr: root_addr, - ty: ty, - len: len, - }) + let meta = Meta { version, root_addr, ty, len }; + Ok(Fst { meta, data }) } /// Retrieves the value associated with a key. /// /// If the key does not exist, then `None` is returned. - #[inline(never)] + #[inline] pub fn get>(&self, key: B) -> Option { - let mut node = self.root(); - let mut out = Output::zero(); - for &b in key.as_ref() { - node = match node.find_input(b) { - None => return None, - Some(i) => { - let t = node.transition(i); - out = out.cat(t.out); - self.node(t.addr) - } - } - } - if !node.is_final() { - None - } else { - Some(out.cat(node.final_output())) - } + self.as_ref().get(key.as_ref()) } /// Returns true if and only if the given key is in this FST. 
+ #[inline] pub fn contains_key>(&self, key: B) -> bool { - let mut node = self.root(); - for &b in key.as_ref() { - node = match node.find_input(b) { - None => return false, - Some(i) => self.node(node.transition_addr(i)), - } - } - node.is_final() + self.as_ref().contains_key(key.as_ref()) } /// Return a lexicographically ordered stream of all key-value pairs in /// this fst. #[inline] pub fn stream(&self) -> Stream<'_> { - StreamBuilder::new(self, AlwaysMatch).into_stream() + StreamBuilder::new(self.as_ref(), AlwaysMatch).into_stream() } /// Return a builder for range queries. @@ -456,30 +370,31 @@ impl Fst { /// range given in lexicographic order. #[inline] pub fn range(&self) -> StreamBuilder<'_> { - StreamBuilder::new(self, AlwaysMatch) + StreamBuilder::new(self.as_ref(), AlwaysMatch) } /// Executes an automaton on the keys of this map. + #[inline] pub fn search(&self, aut: A) -> StreamBuilder<'_, A> { - StreamBuilder::new(self, aut) + StreamBuilder::new(self.as_ref(), aut) } /// Returns the number of keys in this fst. #[inline] pub fn len(&self) -> usize { - self.len + self.as_ref().len() } /// Returns true if and only if this fst has no keys. #[inline] pub fn is_empty(&self) -> bool { - self.len == 0 + self.as_ref().is_empty() } /// Returns the number of bytes used by this fst. #[inline] pub fn size(&self) -> usize { - self.data.len() + self.as_ref().size() } /// Creates a new fst operation with this fst added to it. @@ -498,6 +413,7 @@ impl Fst { /// /// `stream` must be a lexicographically ordered sequence of byte strings /// with associated values. + #[inline] pub fn is_disjoint<'f, I, S>(&self, stream: I) -> bool where I: for<'a> IntoStreamer<'a, Into = S, Item = (&'a [u8], Output)>, @@ -511,6 +427,7 @@ impl Fst { /// /// `stream` must be a lexicographically ordered sequence of byte strings /// with associated values. 
+ #[inline] pub fn is_subset<'f, I, S>(&self, stream: I) -> bool where I: for<'a> IntoStreamer<'a, Into = S, Item = (&'a [u8], Output)>, @@ -529,6 +446,7 @@ impl Fst { /// /// `stream` must be a lexicographically ordered sequence of byte strings /// with associated values. + #[inline] pub fn is_superset<'f, I, S>(&self, stream: I) -> bool where I: for<'a> IntoStreamer<'a, Into = S, Item = (&'a [u8], Output)>, @@ -551,13 +469,13 @@ impl Fst { /// the meaning of 0-255 unspecified. #[inline] pub fn fst_type(&self) -> FstType { - self.ty + self.as_ref().fst_type() } /// Returns the root node of this fst. #[inline(always)] pub fn root(&self) -> Node<'_> { - self.node(self.root_addr) + self.as_ref().root() } /// Returns the node at the given address. @@ -565,38 +483,117 @@ impl Fst { /// Node addresses can be obtained by reading transitions on `Node` values. #[inline] pub fn node(&self, addr: CompiledAddr) -> Node<'_> { - node_new(self.version, addr, &self.data) + self.as_ref().node(addr) } /// Returns a copy of the binary contents of this FST. #[inline] pub fn to_vec(&self) -> Vec { - self.data.to_vec() + self.as_ref().to_vec() } /// Returns the binary contents of this FST. 
#[inline] pub fn as_bytes(&self) -> &[u8] { - &self.data + self.as_ref().as_bytes() } - fn empty_final_output(&self) -> Option { - let root = self.root(); - if root.is_final() { - Some(root.final_output()) - } else { - None - } + #[inline] + fn as_ref(&self) -> FstRef { + FstRef { meta: &self.meta, data: self.data.as_ref() } } } -impl<'a, 'f> IntoStreamer<'a> for &'f Fst { +impl<'a, 'f, D: AsRef<[u8]>> IntoStreamer<'a> for &'f Fst { type Item = (&'a [u8], Output); type Into = Stream<'f>; #[inline] fn into_stream(self) -> Self::Into { - StreamBuilder::new(self, AlwaysMatch).into_stream() + StreamBuilder::new(self.as_ref(), AlwaysMatch).into_stream() + } +} + +struct FstRef<'f> { + meta: &'f Meta, + data: &'f [u8], +} + +impl<'f> FstRef<'f> { + fn get(&self, key: &[u8]) -> Option { + let mut node = self.root(); + let mut out = Output::zero(); + for &b in key { + node = match node.find_input(b) { + None => return None, + Some(i) => { + let t = node.transition(i); + out = out.cat(t.out); + self.node(t.addr) + } + } + } + if !node.is_final() { + None + } else { + Some(out.cat(node.final_output())) + } + } + + fn contains_key(&self, key: &[u8]) -> bool { + let mut node = self.root(); + for &b in key { + node = match node.find_input(b) { + None => return false, + Some(i) => self.node(node.transition_addr(i)), + } + } + node.is_final() + } + + fn len(&self) -> usize { + self.meta.len + } + + fn is_empty(&self) -> bool { + self.meta.len == 0 + } + + fn size(&self) -> usize { + self.as_bytes().len() + } + + fn fst_type(&self) -> FstType { + self.meta.ty + } + + fn root_addr(&self) -> CompiledAddr { + self.meta.root_addr + } + + fn root(&self) -> Node<'f> { + self.node(self.root_addr()) + } + + fn node(&self, addr: CompiledAddr) -> Node<'f> { + node_new(self.meta.version, addr, self.as_bytes()) + } + + fn to_vec(&self) -> Vec { + self.as_bytes().to_vec() + } + + fn as_bytes(&self) -> &'f [u8] { + self.data + } + + fn empty_final_output(&self) -> Option { + let root = 
self.root(); + if root.is_final() { + Some(root.final_output()) + } else { + None + } } } @@ -613,14 +610,14 @@ impl<'a, 'f> IntoStreamer<'a> for &'f Fst { /// /// The `'f` lifetime parameter refers to the lifetime of the underlying fst. pub struct StreamBuilder<'f, A = AlwaysMatch> { - fst: &'f Fst, + fst: FstRef<'f>, aut: A, min: Bound, max: Bound, } impl<'f, A: Automaton> StreamBuilder<'f, A> { - fn new(fst: &'f Fst, aut: A) -> Self { + fn new(fst: FstRef<'f>, aut: A) -> Self { StreamBuilder { fst, aut, @@ -705,7 +702,7 @@ pub struct Stream<'f, A = AlwaysMatch> where A: Automaton, { - fst: &'f Fst, + fst: FstRef<'f>, aut: A, inp: Vec, empty_output: Option, @@ -722,7 +719,7 @@ struct StreamState<'f, S> { } impl<'f, A: Automaton> Stream<'f, A> { - fn new(fst: &'f Fst, aut: A, min: Bound, max: Bound) -> Self { + fn new(fst: FstRef<'f>, aut: A, min: Bound, max: Bound) -> Stream<'f, A> { let mut rdr = Stream { fst, aut, @@ -901,7 +898,7 @@ impl<'f, 'a, A: Automaton> Streamer<'a> for Stream<'f, A> { if state.trans >= state.node.len() || !self.aut.can_match(&state.aut_state) { - if state.node.addr() != self.fst.root_addr { + if state.node.addr() != self.fst.root_addr() { self.inp.pop().unwrap(); } continue; @@ -998,32 +995,6 @@ impl Output { } } -enum FstData { - Cow(Cow<'static, [u8]>), - Shared { - vec: Arc>, - offset: usize, - len: usize, - }, - #[cfg(feature = "mmap")] - Mmap(MmapReadOnly), -} - -impl Deref for FstData { - type Target = [u8]; - - fn deref(&self) -> &[u8] { - match *self { - FstData::Cow(ref v) => &**v, - FstData::Shared { ref vec, offset, len } => { - &vec[offset..offset + len] - } - #[cfg(feature = "mmap")] - FstData::Mmap(ref v) => v.as_slice(), - } - } -} - /// A transition from one note to another. 
#[derive(Copy, Clone, Hash, Eq, PartialEq)] pub struct Transition { diff --git a/src/raw/node.rs b/src/raw/node.rs index febd31a..989d6ac 100644 --- a/src/raw/node.rs +++ b/src/raw/node.rs @@ -873,9 +873,7 @@ mod tests { use crate::raw::build::BuilderNode; use crate::raw::node::{node_new, Node}; - use crate::raw::{ - Builder, CompiledAddr, Fst, Output, Transition, VERSION, - }; + use crate::raw::{Builder, CompiledAddr, Output, Transition, VERSION}; use crate::stream::Streamer; const NEVER_LAST: CompiledAddr = std::u64::MAX as CompiledAddr; @@ -890,7 +888,7 @@ mod tests { for word in &bs { bfst.add(word).unwrap(); } - let fst = Fst::from_bytes(bfst.into_inner().unwrap()).unwrap(); + let fst = bfst.into_fst(); let mut rdr = fst.stream(); let mut words = vec![]; while let Some(w) = rdr.next() { diff --git a/src/raw/ops.rs b/src/raw/ops.rs index 41ddfc6..a9e79a1 100644 --- a/src/raw/ops.rs +++ b/src/raw/ops.rs @@ -473,7 +473,8 @@ mod tests { macro_rules! create_set_op { ($name:ident, $op:ident) => { fn $name(sets: Vec>) -> Vec { - let fsts: Vec = sets.into_iter().map(fst_set).collect(); + let fsts: Vec> = + sets.into_iter().map(fst_set).collect(); let op: OpBuilder = fsts.iter().collect(); let mut stream = op.$op().into_stream(); let mut keys = vec![]; @@ -488,7 +489,8 @@ mod tests { macro_rules! 
create_map_op { ($name:ident, $op:ident) => { fn $name(sets: Vec>) -> Vec<(String, u64)> { - let fsts: Vec = sets.into_iter().map(fst_map).collect(); + let fsts: Vec> = + sets.into_iter().map(fst_map).collect(); let op: OpBuilder = fsts.iter().collect(); let mut stream = op.$op().into_stream(); let mut keys = vec![]; diff --git a/src/raw/tests.rs b/src/raw/tests.rs index 4a05f3c..4e9b53d 100644 --- a/src/raw/tests.rs +++ b/src/raw/tests.rs @@ -1,5 +1,3 @@ -use std::sync::Arc; - use crate::automaton::AlwaysMatch; use crate::error::Error; use crate::raw::{self, Bound, Builder, Fst, Output, Stream, VERSION}; @@ -7,7 +5,7 @@ use crate::stream::Streamer; const TEXT: &'static str = include_str!("./../../data/words-100000"); -pub fn fst_set(ss: I) -> Fst +pub fn fst_set(ss: I) -> Fst> where I: IntoIterator, S: AsRef<[u8]>, @@ -16,16 +14,16 @@ where let mut ss: Vec> = ss.into_iter().map(|s| s.as_ref().to_vec()).collect(); ss.sort(); + ss.dedup(); for s in ss.iter().into_iter() { bfst.add(s).unwrap(); } - let fst = Fst::from_bytes(bfst.into_inner().unwrap()).unwrap(); - ss.dedup(); + let fst = bfst.into_fst(); assert_eq!(fst.len(), ss.len()); fst } -pub fn fst_map(ss: I) -> Fst +pub fn fst_map(ss: I) -> Fst> where I: IntoIterator, S: AsRef<[u8]>, @@ -38,10 +36,10 @@ where for (s, o) in ss.into_iter() { bfst.insert(s, o).unwrap(); } - Fst::from_bytes(bfst.into_inner().unwrap()).unwrap() + bfst.into_fst() } -pub fn fst_inputs(fst: &Fst) -> Vec> { +pub fn fst_inputs>(fst: &Fst) -> Vec> { let mut words = vec![]; let mut rdr = fst.stream(); while let Some((word, _)) = rdr.next() { @@ -50,7 +48,9 @@ pub fn fst_inputs(fst: &Fst) -> Vec> { words } -pub fn fst_inputs_outputs(fst: &Fst) -> Vec<(Vec, u64)> { +pub fn fst_inputs_outputs>( + fst: &Fst, +) -> Vec<(Vec, u64)> { let mut words = vec![]; let mut rdr = fst.stream(); while let Some((word, out)) = rdr.next() { @@ -206,7 +206,7 @@ fn fst_map_100000_lengths() { #[test] fn invalid_version() { - match Fst::from_bytes(vec![0; 32]) { 
+ match Fst::new(vec![0; 32]) { Err(Error::Fst(raw::Error::Version { got, .. })) => assert_eq!(got, 0), Err(err) => panic!("expected version error, got {:?}", err), Ok(_) => panic!("expected version error, got FST"), @@ -219,7 +219,7 @@ fn invalid_version_crate_too_old() { let mut buf = vec![0; 32]; LittleEndian::write_u64(&mut buf, VERSION + 1); - match Fst::from_bytes(buf) { + match Fst::new(buf) { Err(Error::Fst(raw::Error::Version { got, .. })) => { assert_eq!(got, VERSION + 1); } @@ -230,8 +230,8 @@ fn invalid_version_crate_too_old() { #[test] fn invalid_format() { - match Fst::from_bytes(vec![0; 0]) { - Err(Error::Fst(raw::Error::Format)) => {} + match Fst::new(vec![0; 0]) { + Err(Error::Fst(raw::Error::Format { .. })) => {} Err(err) => panic!("expected format error, got {:?}", err), Ok(_) => panic!("expected format error, got FST"), } @@ -256,14 +256,18 @@ macro_rules! test_range { #[test] fn $name() { let items: Vec<&'static str> = vec![$($s),*]; - let items: Vec<_> = - items.into_iter().enumerate() - .map(|(i, k)| (k, i as u64)).collect(); + let items: Vec<_> = items + .into_iter() + .enumerate() + .map(|(i, k)| (k, i as u64)) + .collect(); let fst = fst_map(items.clone()); - let mut rdr = Stream::new(&fst, AlwaysMatch, $min, $max); + let mut rdr = Stream::new(fst.as_ref(), AlwaysMatch, $min, $max); for i in $imin..$imax { - assert_eq!(rdr.next().unwrap(), - (items[i].0.as_bytes(), Output::new(items[i].1))); + assert_eq!( + rdr.next().unwrap(), + (items[i].0.as_bytes(), Output::new(items[i].1)), + ); } assert_eq!(rdr.next(), None); } @@ -490,15 +494,13 @@ fn one_vec_multiple_fsts() { let mut bfst2 = Builder::new(bytes).unwrap(); bfst2.add(b"bar").unwrap(); bfst2.add(b"foo").unwrap(); - let bytes = Arc::new(bfst2.into_inner().unwrap()); - - let fst1 = Fst::from_shared_bytes(bytes.clone(), 0, fst1_len).unwrap(); - let fst2 = Fst::from_shared_bytes( - bytes.clone(), - fst1_len, - bytes.len() - fst1_len, - ) - .unwrap(); + + let bytes = 
bfst2.into_inner().unwrap(); + let slice1 = &bytes[0..fst1_len]; + let slice2 = &bytes[fst1_len..bytes.len()]; + + let fst1 = Fst::new(slice1).unwrap(); + let fst2 = Fst::new(slice2).unwrap(); assert_eq!(fst_inputs(&fst1), vec![b"bar".to_vec(), b"baz".to_vec()]); assert_eq!(fst_inputs(&fst2), vec![b"bar".to_vec(), b"foo".to_vec()]); diff --git a/src/set.rs b/src/set.rs index d50114c..6ae5fc2 100644 --- a/src/set.rs +++ b/src/set.rs @@ -1,8 +1,6 @@ use std::fmt; use std::io; use std::iter::{self, FromIterator}; -#[cfg(feature = "mmap")] -use std::path::Path; use crate::automaton::{AlwaysMatch, Automaton}; use crate::raw; @@ -17,8 +15,7 @@ use crate::Result; /// /// A key feature of `Set` is that it can be serialized to disk compactly. Its /// underlying representation is built such that the `Set` can be memory mapped -/// (`Set::from_path`) and searched without necessarily loading the entire -/// set into memory. +/// and searched without necessarily loading the entire set into memory. /// /// It supports most common operations associated with sets, such as /// membership, union, intersection, subset/superset, etc. It also supports @@ -29,42 +26,36 @@ use crate::Result; /// /// 1. Once constructed, a `Set` can never be modified. /// 2. Sets must be constructed with lexicographically ordered byte sequences. -pub struct Set(raw::Fst); +pub struct Set>(raw::Fst); -impl Set { - /// Opens a set stored at the given file path via a memory map. - /// - /// The set must have been written with a compatible finite state - /// transducer builder (`SetBuilder` qualifies). If the format is invalid - /// or if there is a mismatch between the API version of this library - /// and the set, then an error is returned. - /// - /// This is unsafe because Rust programs cannot guarantee that memory - /// backed by a memory mapped file won't be mutably aliased. It is up to - /// the caller to enforce that the memory map is not modified while it is - /// opened. 
- #[cfg(feature = "mmap")] - pub unsafe fn from_path>(path: P) -> Result { - raw::Fst::from_path(path).map(Set) - } - - /// Creates a set from its representation as a raw byte sequence. +impl Set> { + /// Create a `Set` from an iterator of lexicographically ordered byte + /// strings. /// - /// Note that this operation is very cheap (no allocations and no copies). + /// If the iterator does not yield values in lexicographic order, then an + /// error is returned. /// - /// The set must have been written with a compatible finite state - /// transducer builder (`SetBuilder` qualifies). If the format is invalid - /// or if there is a mismatch between the API version of this library - /// and the set, then an error is returned. - #[inline] - pub fn from_bytes(bytes: Vec) -> Result { - raw::Fst::from_bytes(bytes).map(Set) + /// Note that this is a convenience function to build a set in memory. + /// To build a set that streams to an arbitrary `io::Write`, use + /// `SetBuilder`. + pub fn from_iter(iter: I) -> Result>> + where + T: AsRef<[u8]>, + I: IntoIterator, + { + let mut builder = SetBuilder::memory(); + builder.extend_iter(iter)?; + Set::new(builder.into_inner()?) } +} +impl> Set { /// Creates a set from its representation as a raw byte sequence. /// - /// This accepts a static byte slice, which may be useful if the FST data is - /// embedded into the program. + /// This accepts anything that can be cheaply converted to a `&[u8]`. The + /// caller is responsible for guaranteeing that the given bytes refer to + /// a valid FST. While memory safety will not be violated by invalid input, + /// a panic could occur while reading the FST at any point. 
/// /// # Example /// @@ -77,29 +68,10 @@ impl Set { /// # }; /// # static FST: &[u8] = &[]; /// - /// let set = Set::from_static_slice(FST).unwrap(); + /// let set = Set::new(FST).unwrap(); /// ``` - pub fn from_static_slice(bytes: &'static [u8]) -> Result { - raw::Fst::from_static_slice(bytes).map(Set) - } - - /// Create a `Set` from an iterator of lexicographically ordered byte - /// strings. - /// - /// If the iterator does not yield values in lexicographic order, then an - /// error is returned. - /// - /// Note that this is a convenience function to build a set in memory. - /// To build a set that streams to an arbitrary `io::Write`, use - /// `SetBuilder`. - pub fn from_iter(iter: I) -> Result - where - T: AsRef<[u8]>, - I: IntoIterator, - { - let mut builder = SetBuilder::memory(); - builder.extend_iter(iter)?; - Set::from_bytes(builder.into_inner()?) + pub fn new(data: D) -> Result> { + raw::Fst::new(data).map(Set) } /// Tests the membership of a single key. @@ -342,19 +314,19 @@ impl Set { /// Returns a reference to the underlying raw finite state transducer. #[inline] - pub fn as_fst(&self) -> &raw::Fst { + pub fn as_fst(&self) -> &raw::Fst { &self.0 } } -impl Default for Set { +impl Default for Set> { #[inline] - fn default() -> Set { + fn default() -> Set> { Set::from_iter(iter::empty::<&[u8]>()).unwrap() } } -impl fmt::Debug for Set { +impl> fmt::Debug for Set { fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { write!(f, "Set([")?; let mut stream = self.stream(); @@ -371,14 +343,14 @@ impl fmt::Debug for Set { } /// Returns the underlying finite state transducer. 
-impl AsRef for Set { +impl> AsRef> for Set { #[inline] - fn as_ref(&self) -> &raw::Fst { + fn as_ref(&self) -> &raw::Fst { &self.0 } } -impl<'s, 'a> IntoStreamer<'a> for &'s Set { +impl<'s, 'a, D: AsRef<[u8]>> IntoStreamer<'a> for &'s Set { type Item = &'a [u8]; type Into = Stream<'s>; @@ -389,9 +361,9 @@ impl<'s, 'a> IntoStreamer<'a> for &'s Set { } // Construct a set from an Fst object. -impl From for Set { +impl> From> for Set { #[inline] - fn from(fst: raw::Fst) -> Set { + fn from(fst: raw::Fst) -> Set { Set(fst) } } @@ -441,7 +413,7 @@ impl From for Set { /// let bytes = build.into_inner().unwrap(); /// /// // At this point, the set has been constructed, but here's how to read it. -/// let set = Set::from_bytes(bytes).unwrap(); +/// let set = Set::new(bytes).unwrap(); /// let mut stream = set.into_stream(); /// let mut keys = vec![]; /// while let Some(key) = stream.next() { @@ -473,7 +445,9 @@ impl From for Set { /// build.finish().unwrap(); /// /// // At this point, the set has been constructed, but here's how to read it. -/// let set = unsafe { Set::from_path("set.fst").unwrap() }; +/// // NOTE: Normally, one would memory map a file instead of reading its +/// // entire contents on to the heap. +/// let set = Set::new(std::fs::read("set.fst").unwrap()).unwrap(); /// let mut stream = set.into_stream(); /// let mut keys = vec![]; /// while let Some(key) = stream.next() { @@ -492,9 +466,10 @@ impl SetBuilder> { SetBuilder(raw::Builder::memory()) } - /// Finishes the construction of the set and returning it. - pub fn into_set(self) -> Set { - self.into_inner().and_then(Set::from_bytes).unwrap() + /// Finishes the construction of the set and returns it. 
+ #[inline] + pub fn into_set(self) -> Set> { + Set(self.0.into_fst()) } } diff --git a/tests/test.rs b/tests/test.rs index 4ae8e39..c7d733c 100644 --- a/tests/test.rs +++ b/tests/test.rs @@ -8,12 +8,12 @@ use fst::{self, Automaton, IntoStreamer, Streamer}; static WORDS: &'static str = include_str!("../data/words-10000"); -fn get_set() -> Set { +fn get_set() -> Set> { Set::from_iter(WORDS.lines()).unwrap() } #[cfg(feature = "levenshtein")] -fn fst_set(ss: I) -> Fst +fn fst_set(ss: I) -> Fst> where I: IntoIterator, S: AsRef<[u8]>, @@ -25,7 +25,7 @@ where for s in ss.iter().into_iter() { bfst.add(s).unwrap(); } - let fst = Fst::from_bytes(bfst.into_inner().unwrap()).unwrap(); + let fst = bfst.into_fst(); ss.dedup(); assert_eq!(fst.len(), ss.len()); fst @@ -177,9 +177,9 @@ fn subsequence() { #[test] fn implements_default() { - let map: fst::Map = Default::default(); + let map: fst::Map> = Default::default(); assert!(map.is_empty()); - let set: fst::Set = Default::default(); + let set: fst::Set> = Default::default(); assert!(set.is_empty()); } From 8d248b71d49b7813a8ba1ca98598f95092ba6ba3 Mon Sep 17 00:00:00 2001 From: Andrew Gallant Date: Sun, 23 Feb 2020 19:00:18 -0500 Subject: [PATCH 11/23] fst-bin: update dependencies and add some polish Mostly this just upgrades to the newest versions of everything, and replaces deprecated crates (tempdir, chan) with their newer variants (tempfile, crossbeam-channel). We also use BString in several places instead of String so as to permit indexing any keys and not just UTF-8 keys. This is the calm before the storm. The next commit will switch from Docopt to Clap. 
--- Cargo.toml | 11 +++++------ fst-bin/Cargo.toml | 8 ++++---- fst-bin/src/cmd/set.rs | 20 ++++++++------------ fst-bin/src/merge.rs | 31 ++++++++++++++++--------------- fst-bin/src/util.rs | 22 ++++++++++++---------- 5 files changed, 45 insertions(+), 47 deletions(-) diff --git a/Cargo.toml b/Cargo.toml index bd7d405..b399cbb 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -22,15 +22,14 @@ default = [] levenshtein = ["utf8-ranges"] [dependencies] -byteorder = "1" -utf8-ranges = { version = "1", optional = true } +byteorder = "1.3.4" +utf8-ranges = { version = "1.0.4", optional = true } [dev-dependencies] -fnv = "1.0.5" -lazy_static = "0.2.8" +fnv = "1.0.6" memmap = "0.7" -quickcheck = { version = "0.7", default-features = false } -rand = "0.5" +quickcheck = { version = "0.9.2", default-features = false } +rand = "0.7.3" [profile.release] debug = true diff --git a/fst-bin/Cargo.toml b/fst-bin/Cargo.toml index c38bd83..0c40d83 100644 --- a/fst-bin/Cargo.toml +++ b/fst-bin/Cargo.toml @@ -20,14 +20,14 @@ name = "fst" path = "src/main.rs" [dependencies] -bit-set = "0.4" -chan = "0.1" +bit-set = "0.5.1" +bstr = "0.2.11" +crossbeam-channel = "0.4.2" csv = "1.1.3" docopt = "1.1" fst = { path = "..", version = "0.3", features = ["levenshtein"] } -lines = "0.0" memmap = "0.7" num_cpus = "1.5" regex-automata = { version = "*", path = "/home/andrew/rust/regex-automata", features = ["transducer"] } serde = { version = "1.0.104", features = ["derive"] } -tempdir = "0.3" +tempfile = "3.1.0" diff --git a/fst-bin/src/cmd/set.rs b/fst-bin/src/cmd/set.rs index 03a34cb..83605bf 100644 --- a/fst-bin/src/cmd/set.rs +++ b/fst-bin/src/cmd/set.rs @@ -1,9 +1,10 @@ use std::fs; +use std::io; use std::path::Path; +use bstr::io::BufReadExt; use docopt::Docopt; use fst::SetBuilder; -use lines::linereader::LineReader; use serde::Deserialize; use crate::merge::Merger; @@ -77,19 +78,14 @@ fn run_sorted(args: &Args) -> Result<(), Error> { let mut set = SetBuilder::new(wtr)?; for input in 
&args.arg_input { let rdr = util::get_buf_reader(Some(input))?; - let mut lines = LineReader::new(rdr); - loop { - let line = lines.read_line()?; + rdr.for_byte_line(|line| { if line.is_empty() { - break; + return Ok(false); } - let off = if line.len() >= 2 && line[line.len() - 2] == b'\r' { - 2 - } else { - 1 - }; - set.insert(&line[0..line.len() - off])?; - } + set.insert(line) + .map_err(|e| io::Error::new(io::ErrorKind::Other, e))?; + Ok(true) + })?; } set.finish().map_err(From::from) } diff --git a/fst-bin/src/merge.rs b/fst-bin/src/merge.rs index 81ba237..8e5a692 100644 --- a/fst-bin/src/merge.rs +++ b/fst-bin/src/merge.rs @@ -1,5 +1,3 @@ -#![allow(dead_code)] - use std::cmp; use std::env; use std::fs::{self, File}; @@ -8,10 +6,11 @@ use std::path::{Path, PathBuf}; use std::sync::Arc; use std::thread; -use chan; +use bstr::BString; +use crossbeam_channel as chan; use fst::{self, raw, Streamer}; use num_cpus; -use tempdir::TempDir; +use tempfile; use crate::util; use crate::Error; @@ -27,11 +26,11 @@ pub struct Merger { keep_tmp_dir: bool, } -type KV = (String, u64); +type KV = (BString, u64); impl Merger where - I: Iterator> + Send + 'static, + I: Iterator> + Send + 'static, { pub fn new(it: T, output: P) -> Self where @@ -84,7 +83,9 @@ where } pub fn merge(self) -> Result<(), Error> { - let tmp_dir = TempDir::new_in(&self.tmp_dir, "rust-fst")?; + let tmp_dir = tempfile::Builder::new() + .prefix("rust-fst") + .tempdir_in(&self.tmp_dir)?; let tmp_dir_path = Arc::new(tmp_dir.path().to_path_buf()); // Do the initial round of creating FSTs for every batch in the input. 
@@ -145,27 +146,27 @@ where I: IntoIterator + Send + 'static, { let batch_size = batch_size as usize; - let (send, recv) = chan::sync(cmp::min(1, threads as usize / 3)); + let (send, recv) = chan::bounded(cmp::min(1, threads as usize / 3)); let it = it.into_iter(); thread::spawn(move || { let mut batch = Vec::with_capacity(batch_size); for item in it { match item { Err(err) => { - send.send(Err(From::from(err))); + send.send(Err(From::from(err))).unwrap(); return; } Ok(item) => { batch.push(item); if batch.len() >= batch_size { - send.send(Ok(batch)); + send.send(Ok(batch)).unwrap(); batch = Vec::with_capacity(batch_size); } } } } if !batch.is_empty() { - send.send(Ok(batch)); + send.send(Ok(batch)).unwrap(); } }); recv @@ -178,8 +179,8 @@ struct Sorters { impl Sorters { fn new(threads: u32) -> Self { - let (bsend, brecv) = chan::sync::(0); // not sure why B is explicit - let (rsend, rrecv) = chan::sync(0); + let (bsend, brecv) = chan::bounded::(0); + let (rsend, rrecv) = chan::bounded(0); for _ in 0..threads { let brecv = brecv.clone(); let rsend = rsend.clone(); @@ -188,14 +189,14 @@ impl Sorters { for mut batch in brecv { results.push(batch.create_fst()); } - rsend.send(results); + rsend.send(results).unwrap(); }); } Sorters { send: bsend, results: rrecv } } fn create_fst(&self, batch: B) { - self.send.send(batch); + self.send.send(batch).unwrap(); } fn results(self) -> Vec> { diff --git a/fst-bin/src/util.rs b/fst-bin/src/util.rs index babd88a..a166529 100644 --- a/fst-bin/src/util.rs +++ b/fst-bin/src/util.rs @@ -1,8 +1,9 @@ use std::ascii; use std::fs::File; -use std::io::{self, BufRead}; +use std::io; use std::path::{Path, PathBuf}; +use bstr::{io::BufReadExt, BString}; use csv; use fst::raw::{Fst, Output}; use fst::{IntoStreamer, Streamer}; @@ -95,8 +96,9 @@ pub struct ConcatLines { cur: Option, } -type Lines = - io::Lines>>; +type Lines = bstr::io::ByteLines< + io::BufReader>, +>; impl ConcatLines { pub fn new(mut inputs: Vec) -> ConcatLines { @@ -106,7 
+108,7 @@ impl ConcatLines { } impl Iterator for ConcatLines { - type Item = io::Result; + type Item = io::Result; fn next(&mut self) -> Option { loop { @@ -118,13 +120,13 @@ impl Iterator for ConcatLines { Err(err) => return Some(Err(err)), Ok(rdr) => rdr, }; - self.cur = Some(rdr.lines()); + self.cur = Some(rdr.byte_lines()); } } } - match self.cur.as_mut().and_then(|lines| Iterator::next(lines)) { + match self.cur.as_mut().and_then(|lines| lines.next()) { None => self.cur = None, - Some(r) => return Some(r), + Some(r) => return Some(r.map(BString::from)), } } } @@ -136,7 +138,7 @@ pub struct ConcatCsv { } type Reader = Box; -type Rows = csv::DeserializeRecordsIntoIter; +type Rows = csv::DeserializeRecordsIntoIter; impl ConcatCsv { pub fn new(mut inputs: Vec) -> ConcatCsv { @@ -144,7 +146,7 @@ impl ConcatCsv { ConcatCsv { inputs: inputs, cur: None } } - fn read_row(&mut self) -> Option> { + fn read_row(&mut self) -> Option> { let rdr = match self.cur { None => return None, Some(ref mut rdr) => rdr, @@ -158,7 +160,7 @@ impl ConcatCsv { } impl Iterator for ConcatCsv { - type Item = Result<(String, u64), Error>; + type Item = Result<(BString, u64), Error>; fn next(&mut self) -> Option { loop { From a1305f4bd9a13a26ff8db25ae4fa8f5f8fbffa10 Mon Sep 17 00:00:00 2001 From: Andrew Gallant Date: Sun, 23 Feb 2020 22:44:04 -0500 Subject: [PATCH 12/23] fst-bin: drop docopt in favor of clap This was a grueling refactor, but it's done. We now correctly handle non-UTF-8 arguments in most places. We also switch to anyhow for error handling, so setting RUST_LIB_BACKTRACE when an error occurs will show something useful (assuming it was built with a nightly compiler). 
--- fst-bin/Cargo.toml | 3 +- fst-bin/src/app.rs | 404 +++++++++++++++++++++++++++++++++++++++ fst-bin/src/cmd/csv.rs | 148 +++++++------- fst-bin/src/cmd/dot.rs | 152 +++++++-------- fst-bin/src/cmd/dupes.rs | 124 ++++++------ fst-bin/src/cmd/fuzzy.rs | 90 ++++----- fst-bin/src/cmd/grep.rs | 95 ++++----- fst-bin/src/cmd/map.rs | 181 ++++++++---------- fst-bin/src/cmd/node.rs | 56 +++--- fst-bin/src/cmd/range.rs | 66 +++---- fst-bin/src/cmd/rust.rs | 109 +++++------ fst-bin/src/cmd/set.rs | 167 ++++++++-------- fst-bin/src/cmd/union.rs | 80 ++++---- fst-bin/src/main.rs | 123 ++---------- src/raw/error.rs | 15 +- 15 files changed, 1026 insertions(+), 787 deletions(-) create mode 100644 fst-bin/src/app.rs diff --git a/fst-bin/Cargo.toml b/fst-bin/Cargo.toml index 0c40d83..bae1a0b 100644 --- a/fst-bin/Cargo.toml +++ b/fst-bin/Cargo.toml @@ -20,11 +20,12 @@ name = "fst" path = "src/main.rs" [dependencies] +anyhow = "1.0.26" bit-set = "0.5.1" bstr = "0.2.11" +clap = { version = "2.33.0", default-features = false } crossbeam-channel = "0.4.2" csv = "1.1.3" -docopt = "1.1" fst = { path = "..", version = "0.3", features = ["levenshtein"] } memmap = "0.7" num_cpus = "1.5" diff --git a/fst-bin/src/app.rs b/fst-bin/src/app.rs new file mode 100644 index 0000000..393989b --- /dev/null +++ b/fst-bin/src/app.rs @@ -0,0 +1,404 @@ +const ABOUT: &str = "\ +A command line tool for building, searching and inspecting FSTs. +"; + +const ABOUT_CSV: &str = "\ +Emit information in CSV format about the transducer. + +If is not set, then CSV data is emitted to stdout. +"; + +const ABOUT_DOT: &str = "\ +Emit this transducer in the \"dot\" format. + +If is not set, then the \"dot\" description is emitted to stdout. + +Generally, usage of this command should look like this: + + $ fst dot your-transducer.fst | dot -Tpng > your-transducer.png + $ $YOUR_FAVORITE_IMAGE_VIEWER your-transducer.png + +Where 'dot' is a command line utility that is part of graphviz. 
+ +If your transducer contains output values, then they are shown as labels on +transitions. Zero output values are omitted. +"; + +const ABOUT_DUPES: &str = "\ +A simple way to show duplicate nodes. + +This is meant to be a diagnostic tool to view duplicate nodes in the +transducer. Every duplicate node represents a missed opportunity for more +compression. A minimal transducer should have precisely zero duplicate nodes. + +WARNING: This stores all nodes in the transducer in memory, decompressed. This +may be expensive in both time and space depending on the size of your +transducer. + +If is omitted, then diagnostic data is emitted to stdout. +"; + +const ABOUT_FUZZY: &str = "\ +Issues a fuzzy query against the given transducer. + +A fuzzy query returns all search results within a particular edit distance +of the query given. + +WARNING: This works by building a Levenshtein automaton, which is currently +rather expensive (in time and space) with a big edit distance. This will be +improved in the future. +"; + +const ABOUT_GREP: &str = "\ +Searches a transducer with a regular expression. + +WARNING: This works by building a regular expression automaton, which can be +quite expensive depending on how big the regex is. If this becomes a problem, +consider disabling Unicode support in the regex via '(?-u)'. +"; + +const ABOUT_MAP: &str = "\ +Creates an ordered map backed by a finite state transducer. + +The input to this command should be a CSV file with exactly two columns and no +headers. The first column should be the key and the second column should be a +value that can be interpreted as an unsigned 64 bit integer. + +If your input is already sorted, then pass the --sorted flag to make +construction much faster. If you use --sorted and the data is not sorted, +then this will return an error when it sees an out-of-order key. +"; + +const ABOUT_NODE: &str = "\ +Shows a single node from the transducer. + +The input to this command is the node's address. 
An address may be found either +from debugging a transducer in code, or from the output of the 'fst csv' +command. + +If the address does not point to a valid node, then the executable may panic or +abort without ceremony. +"; + +const ABOUT_RANGE: &str = "\ +Issues a range query against the given transducer. + +A range query returns all search results within a particular range. + +If neither the start nor the end of the range is specified, then all entries +in the transducer are shown. +"; + +const ABOUT_RUST: &str = "\ +Emit Rust source code for the given FST. + +This reads the FST given and emits it as Rust source code with one +constant defined: + + {NAME}_BYTES + +And a `lazy_static!` ref for: + + {NAME} + +Where {NAME} is taken from the name given as an argument. + +The latter definition corresponds to calling `Fst::new({NAME}_BYTES)`. +This makes it possible to trivially use pre-built FSTs in your program. +"; + +const ABOUT_SET: &str = "\ +Creates an ordered set backed by a finite state transducer. + +The input to this command should be one or more files with one key per line. + +If your input is already sorted, then pass the --sorted flag to make +construction much faster. If you use --sorted and the data is not sorted, +then this will return an error when it sees an out-of-order key. +"; + +const ABOUT_UNION: &str = "\ +Unions all of the transducer inputs into a single transducer. + +Any output values are dropped. Stated differently, the resulting transducer is +always a set. + +Usage: + fst union [options] ... + fst union --help + +Options: + -h, --help Show this help message. + -f, --force Overwrites the output if a file already exists. 
+"; + +pub fn app() -> clap::App<'static, 'static> { + let cmd = |name, about| { + clap::SubCommand::with_name(name) + .author(clap::crate_authors!()) + .version(clap::crate_version!()) + .about(about) + }; + let pos = |name| clap::Arg::with_name(name); + let flag = |name| clap::Arg::with_name(name).long(name); + + let csv_arg_input = pos("input") + .required(true) + .help("The FST to extract information from."); + let csv_arg_output = pos("output").help( + "The CSV file to write information to. \ + When absent, print to stdout.", + ); + let csv = cmd("csv", ABOUT_CSV) + .subcommand( + cmd("edges", "Emit information about edges in an FST.") + .arg(csv_arg_input.clone()) + .arg(csv_arg_output.clone()), + ) + .subcommand( + cmd("nodes", "Emit information about nodes in an FST.") + .arg(csv_arg_input.clone()) + .arg(csv_arg_output.clone()), + ); + + let dot = cmd("dot", ABOUT_DOT) + .arg(pos("input").required(true).help("The FST to visualize.")) + .arg(pos("output").help( + "An optional file path to write Dot output to. \ + When empty, output is written to stdout.", + )) + .arg(flag("state-names").help( + " When set, states will be labeled with an arbitrary number.", + )); + + let dupes = + cmd("dupes", ABOUT_DUPES) + .arg(pos("input").required(true).help("The FST to query.")) + .arg(pos("output").help( + "An optional file path to write output to. \ + When empty, output is written to stdout.", + )) + .arg( + flag("limit") + .default_value("10") + .help("Show this many duplicate nodes."), + ) + .arg(flag("min").default_value("20").help( + "Only show duplicates nodes with this many reoccurrences.", + )); + + let fuzzy = cmd("fuzzy", ABOUT_FUZZY) + .arg(pos("input").required(true).help("The FST to query.")) + .arg(pos("query").required(true).help("The fuzzy query.")) + .arg(flag("distance").short("d").default_value("1").help( + "All terms in the FST within this distance are shown. 
The \ + distance is measured in the number of character insertions, \ + deletions and substitutions required to transform the query \ + to a particular term. A \"character\" in this context refers \ + to a single Unicode codepoint.", + )) + .arg( + flag("outputs") + .short("o") + .help("When set, output values are shown as CSV data."), + ) + .arg( + flag("start") + .short("s") + .help("Only show results greater than or equal to this."), + ) + .arg( + flag("end") + .short("e") + .help("Only show results less than or equal to this."), + ); + + let grep = cmd("grep", ABOUT_GREP) + .arg(pos("input").required(true).help("The FST to query.")) + .arg(pos("regex").required(true).help("The regex.")) + .arg( + flag("outputs") + .short("o") + .help("When set, output values are shown as CSV data."), + ) + .arg( + flag("start") + .short("s") + .help("Only show results greater than or equal to this."), + ) + .arg( + flag("end") + .short("e") + .help("Only show results less than or equal to this."), + ); + + let map = cmd("map", ABOUT_MAP) + .arg( + pos("input") + .required(true) + .multiple(true) + .help("A file containing a key per line."), + ) + .arg( + pos("output") + .required(true) + .help("The destination file path to write the FST."), + ) + .arg(flag("force").help( + "Overwrites the output if the destination file already exists.", + )) + .arg(flag("sorted").help( + "Set this if the input data is already lexicographically \ + sorted. This will make construction much faster. Note that \ + when this is set, most of the other flags (like --fd-limit \ + and --batch-size) are not relevant.", + )) + .arg(flag("max").help( + "When building an FST from unsorted data, this merges output \ + values by taking the maximum. The default is to sum them.", + )) + .arg(flag("min").help( + "When building an FST from unsorted data, this merges output \ + values by taking the minimum. 
The default is to sum them.", + )) + .arg(flag("fd-limit").default_value("15").help( + "The maximum number of file descriptors to have open in a \ + single worker thread.", + )) + .arg(flag("batch-size").default_value("100000").help( + "The number of keys to collect in each batch. N.B. This is the \ + primary factor in how much memory this process uses.", + )) + .arg(flag("threads").default_value("0").help( + "The number of simultaneous workers to run. The default of 0 \ + will use the number of logical CPUs reported by your system.", + )) + .arg(flag("tmp-dir").help( + "A temporary directory used to store intermediate transducers. \ + This defaults to the default temporary directory reported by \ + your system.", + )) + .arg(flag("keep-tmp-dir").help( + "Does not delete the temporary directory. Useful for debugging.", + )); + + let node = cmd("node", ABOUT_NODE) + .arg(pos("input").required(true).help("The FST to inspect.")) + .arg( + pos("node-address") + .required(true) + .help("The address of the node to print."), + ); + + let range = cmd("range", ABOUT_RANGE) + .arg( + pos("input") + .required(true) + .help("The FST to run a range query against."), + ) + .arg( + flag("outputs") + .short("o") + .help("When set, output values are shown as CSV data."), + ) + .arg( + flag("start") + .short("s") + .takes_value(true) + .help("Only show results greater than or equal to this."), + ) + .arg( + flag("end") + .short("e") + .takes_value(true) + .help("Only show results less than or equal to this."), + ); + + let rust = cmd("rust", ABOUT_RUST) + .arg( + pos("input") + .required(true) + .help("The FST to generate Rust code for."), + ) + .arg( + pos("name") + .required(true) + .help("The name of the FST to use in the Rust source code."), + ); + + let set = cmd("set", ABOUT_SET) + .arg( + pos("input") + .required(true) + .multiple(true) + .help("One or more files containing a key per line."), + ) + .arg( + pos("output") + .required(true) + .help("The destination file path to 
write the FST."), + ) + .arg(flag("force").help( + "Overwrites the output if the destination file already exists.", + )) + .arg(flag("sorted").help( + "Set this if the input data is already lexicographically \ + sorted. This will make construction much faster. Note that \ + when this is set, most of the other flags (like --fd-limit \ + and --batch-size) are not relevant.", + )) + .arg(flag("fd-limit").default_value("15").help( + "The maximum number of file descriptors to have open in a \ + single worker thread.", + )) + .arg(flag("batch-size").default_value("100000").help( + "The number of keys to collect in each batch. N.B. This is the \ + primary factor in how much memory this process uses.", + )) + .arg(flag("threads").default_value("0").help( + "The number of simultaneous workers to run. The default of 0 \ + will use the number of logical CPUs reported by your system.", + )) + .arg(flag("tmp-dir").help( + "A temporary directory used to store intermediate transducers. \ + This defaults to the default temporary directory reported by \ + your system.", + )) + .arg(flag("keep-tmp-dir").help( + "Does not delete the temporary directory. 
Useful for debugging.", + )); + + let union = cmd("union", ABOUT_UNION) + .arg( + pos("input") + .required(true) + .multiple(true) + .help("One or more files containing a key per line."), + ) + .arg( + pos("output") + .required(true) + .help("The destination file path to write the FST."), + ) + .arg(flag("force").help( + "Overwrites the output if the destination file already exists.", + )); + + clap::App::new("ucd-generate") + .author(clap::crate_authors!()) + .version(clap::crate_version!()) + .about(ABOUT) + .max_term_width(100) + .setting(clap::AppSettings::UnifiedHelpMessage) + .subcommand(csv) + .subcommand(dot) + .subcommand(dupes) + .subcommand(fuzzy) + .subcommand(grep) + .subcommand(map) + .subcommand(node) + .subcommand(range) + .subcommand(rust) + .subcommand(set) + .subcommand(union) +} diff --git a/fst-bin/src/cmd/csv.rs b/fst-bin/src/cmd/csv.rs index 2989f96..3d23195 100644 --- a/fst-bin/src/cmd/csv.rs +++ b/fst-bin/src/cmd/csv.rs @@ -1,86 +1,102 @@ +use std::path::PathBuf; + use bit_set::BitSet; use csv; -use docopt::Docopt; -use serde::Deserialize; use crate::util; use crate::Error; -const USAGE: &'static str = " -Emit information in CSV format about the transducer. - -If is not set, then CSV data is emitted to stdout. - -Usage: - fst csv edges [] - fst csv nodes [] - fst csv --help - -Options: - -h, --help Display this message. 
-"; +pub fn run(matches: &clap::ArgMatches) -> Result<(), Error> { + Args::new(matches).and_then(|args| args.run()) +} -#[derive(Debug, Deserialize)] +#[derive(Debug)] struct Args { - cmd_edges: bool, - cmd_nodes: bool, - arg_input: String, - arg_output: Option, + input: PathBuf, + output: Option, + which: Which, } -pub fn run(argv: Vec) -> Result<(), Error> { - let args: Args = Docopt::new(USAGE) - .and_then(|d| d.argv(&argv).deserialize()) - .unwrap_or_else(|e| e.exit()); +#[derive(Debug)] +enum Which { + Edges, + Nodes, +} - let wtr = util::get_writer(args.arg_output.as_ref())?; - let mut wtr = csv::Writer::from_writer(wtr); +impl Args { + fn new(m: &clap::ArgMatches) -> Result { + let (which, m) = match m.subcommand() { + ("edges", Some(m)) => (Which::Edges, m), + ("nodes", Some(m)) => (Which::Nodes, m), + (unknown, _) => { + anyhow::bail!("unrecognized csv sub-command: {}", unknown) + } + }; + Ok(Args { + input: m.value_of_os("input").map(PathBuf::from).unwrap(), + output: m.value_of_os("output").map(PathBuf::from), + which, + }) + } - let fst = unsafe { util::mmap_fst(args.arg_input)? }; - let mut set = BitSet::with_capacity(fst.len()); + fn run(&self) -> Result<(), Error> { + let wtr = util::get_writer(self.output.as_ref())?; + let mut wtr = csv::Writer::from_writer(wtr); - if args.cmd_edges { - wtr.serialize(("addr_in", "addr_out", "input", "output"))?; - let mut stack = vec![fst.root().addr()]; - set.insert(fst.root().addr()); - while let Some(addr) = stack.pop() { - for t in fst.node(addr).transitions() { - if !set.contains(t.addr) { - stack.push(t.addr); - set.insert(t.addr); + let fst = unsafe { util::mmap_fst(&self.input)? 
}; + let mut set = BitSet::with_capacity(fst.len()); + + match self.which { + Which::Edges => { + wtr.serialize(("addr_in", "addr_out", "input", "output"))?; + let mut stack = vec![fst.root().addr()]; + set.insert(fst.root().addr()); + while let Some(addr) = stack.pop() { + for t in fst.node(addr).transitions() { + if !set.contains(t.addr) { + stack.push(t.addr); + set.insert(t.addr); + } + wtr.serialize(( + addr, + t.addr, + t.inp as char, + t.out.value(), + ))?; + } } - wtr.serialize((addr, t.addr, t.inp as char, t.out.value()))?; } - } - } else { - wtr.serialize(( - "addr", - "state", - "size", - "transitions", - "final", - "final_output", - ))?; - let mut stack = vec![fst.root().addr()]; - set.insert(fst.root().addr()); - while let Some(addr) = stack.pop() { - let node = fst.node(addr); - for t in node.transitions() { - if !set.contains(t.addr) { - stack.push(t.addr); - set.insert(t.addr); + Which::Nodes => { + wtr.serialize(( + "addr", + "state", + "size", + "transitions", + "final", + "final_output", + ))?; + let mut stack = vec![fst.root().addr()]; + set.insert(fst.root().addr()); + while let Some(addr) = stack.pop() { + let node = fst.node(addr); + for t in node.transitions() { + if !set.contains(t.addr) { + stack.push(t.addr); + set.insert(t.addr); + } + } + let row = &[ + node.addr().to_string(), + node.state().to_string(), + node.as_slice().len().to_string(), + node.len().to_string(), + node.is_final().to_string(), + node.final_output().value().to_string(), + ]; + wtr.write_record(row.iter())?; } } - let row = &[ - node.addr().to_string(), - node.state().to_string(), - node.as_slice().len().to_string(), - node.len().to_string(), - node.is_final().to_string(), - node.final_output().value().to_string(), - ]; - wtr.write_record(row.iter())?; } + wtr.flush().map_err(From::from) } - wtr.flush().map_err(From::from) } diff --git a/fst-bin/src/cmd/dot.rs b/fst-bin/src/cmd/dot.rs index 13f0c23..d131bb4 100644 --- a/fst-bin/src/cmd/dot.rs +++ 
b/fst-bin/src/cmd/dot.rs @@ -1,44 +1,82 @@ use std::io::Write; +use std::path::PathBuf; use bit_set::BitSet; -use docopt::Docopt; -use serde::Deserialize; use crate::util; use crate::Error; -const USAGE: &'static str = " -Emit this transducer in the \"dot\" format. - -If is not set, then the \"dot\" description is emitted to stdout. - -Generally, usage of this command should look like this: - - $ fst dot your-transducer.fst | dot -Tpng > your-transducer.png - $ $YOUR_FAVORITE_IMAGE_VIEWER your-transducer.png - -Note that `dot` is a command line utility that is part of graphviz. - -If your transducer contains output values, then they are shown as labels on -transitions. Zero output values are omitted. - -Usage: - fst dot [options] [] - fst dot --help - -Options: - -h, --help Show this help message. - --state-names When set, states will be labeled with arbitrary number. -"; +pub fn run(matches: &clap::ArgMatches) -> Result<(), Error> { + Args::new(matches).and_then(|args| args.run()) +} -#[derive(Debug, Deserialize)] +#[derive(Debug)] struct Args { - arg_input: String, - arg_output: Option, - flag_state_names: bool, + input: PathBuf, + output: Option, + state_names: bool, } impl Args { + fn new(m: &clap::ArgMatches) -> Result { + Ok(Args { + input: m.value_of_os("input").map(PathBuf::from).unwrap(), + output: m.value_of_os("output").map(PathBuf::from), + state_names: m.is_present("state-names"), + }) + } + + fn run(&self) -> Result<(), Error> { + let mut wtr = util::get_buf_writer(self.output.as_ref())?; + let fst = unsafe { util::mmap_fst(&self.input)? 
}; + let mut set = BitSet::with_capacity(fst.len()); + + let mut stack = vec![fst.root().addr()]; + + writeln!( + wtr, + r#" + digraph automaton {{ + labelloc="l"; + labeljust="l"; + rankdir="LR"; + "# + ) + .unwrap(); + let mut state_num = 0; + while let Some(addr) = stack.pop() { + if set.contains(addr) { + continue; + } + set.insert(addr); + + let node = fst.node(addr); + writeln!(wtr, "{}", self.dot_state(&node, state_num, addr)) + .unwrap(); + for t in node.transitions() { + stack.push(t.addr); + let out = if t.out.value() == 0 { + "".to_owned() + } else { + format!("/{}", t.out.value().to_string()) + }; + writeln!( + wtr, + " {} -> {} [label=\"{}{}\"];", + addr, + t.addr, + util::escape_input(t.inp), + out + ) + .unwrap(); + } + state_num += 1; + } + writeln!(wtr, "}}").unwrap(); + wtr.flush()?; + Ok(()) + } + fn dot_state( &self, node: &fst::raw::Node, @@ -46,7 +84,7 @@ impl Args { addr: fst::raw::CompiledAddr, ) -> String { let label = - if self.flag_state_names { i.to_string() } else { "".to_owned() }; + if self.state_names { i.to_string() } else { "".to_owned() }; if node.is_final() { format!(" {} [label=\"{}\",peripheries=2];", addr, label) } else { @@ -54,57 +92,3 @@ impl Args { } } } - -pub fn run(argv: Vec) -> Result<(), Error> { - let args: Args = Docopt::new(USAGE) - .and_then(|d| d.argv(&argv).deserialize()) - .unwrap_or_else(|e| e.exit()); - - let mut wtr = util::get_buf_writer(args.arg_output.as_ref())?; - let fst = unsafe { util::mmap_fst(&args.arg_input)? 
}; - let mut set = BitSet::with_capacity(fst.len()); - - let mut stack = vec![fst.root().addr()]; - - writeln!( - wtr, - r#" -digraph automaton {{ - labelloc="l"; - labeljust="l"; - rankdir="LR"; -"# - ) - .unwrap(); - let mut state_num = 0; - while let Some(addr) = stack.pop() { - if set.contains(addr) { - continue; - } - set.insert(addr); - - let node = fst.node(addr); - writeln!(wtr, "{}", args.dot_state(&node, state_num, addr)).unwrap(); - for t in node.transitions() { - stack.push(t.addr); - let out = if t.out.value() == 0 { - "".to_owned() - } else { - format!("/{}", t.out.value().to_string()) - }; - writeln!( - wtr, - " {} -> {} [label=\"{}{}\"];", - addr, - t.addr, - util::escape_input(t.inp), - out - ) - .unwrap(); - } - state_num += 1; - } - writeln!(wtr, "}}").unwrap(); - wtr.flush()?; - Ok(()) -} diff --git a/fst-bin/src/cmd/dupes.rs b/fst-bin/src/cmd/dupes.rs index 42d9fcc..e27a55d 100644 --- a/fst-bin/src/cmd/dupes.rs +++ b/fst-bin/src/cmd/dupes.rs @@ -1,43 +1,13 @@ use std::collections::HashMap; use std::io::Write; +use std::path::PathBuf; use bit_set::BitSet; -use docopt::Docopt; -use serde::Deserialize; -use crate::util; -use crate::Error; +use crate::{util, Error}; -const USAGE: &'static str = " -A simple way to show duplicate nodes. - -This is meant to be a diagnostic tool to view duplicate nodes in the -transducer. Every duplicate node represents a missed opportunity for more -compression. A minimal transducer should have precisely zero duplicate nodes. - -WARNING: This stores all nodes in the transducer in memory, decompressed. This -may be expensive in both time and space depending on the size of your -transducer. - -If is omitted, then diagnostic data is emitted to stdout. - -Usage: - fst dupes [options] [] - fst dupes --help - -Options: - -h, --help Display this message. - --limit ARG Show this many duplicate nodes. [default: 10] - --min ARG Only show duplicate nodes with this many - reoccurrences. 
[default: 20] -"; - -#[derive(Debug, Deserialize)] -struct Args { - arg_input: String, - arg_output: Option, - flag_limit: usize, - flag_min: i32, +pub fn run(matches: &clap::ArgMatches) -> Result<(), Error> { + Args::new(matches).and_then(|args| args.run()) } #[derive(Clone, Debug, Hash, Eq, PartialEq)] @@ -57,48 +27,62 @@ impl FullNode { } } -pub fn run(argv: Vec) -> Result<(), Error> { - let args: Args = Docopt::new(USAGE) - .and_then(|d| d.argv(&argv).deserialize()) - .unwrap_or_else(|e| e.exit()); +#[derive(Debug)] +struct Args { + input: PathBuf, + output: Option, + limit: usize, + min: i32, +} - let mut wtr = util::get_buf_writer(args.arg_output.as_ref())?; - let fst = unsafe { util::mmap_fst(args.arg_input)? }; - let mut set = BitSet::with_capacity(fst.len()); - let mut node_counts = HashMap::with_capacity(10_000); +impl Args { + fn new(m: &clap::ArgMatches) -> Result { + Ok(Args { + input: m.value_of_os("input").map(PathBuf::from).unwrap(), + output: m.value_of_os("output").map(PathBuf::from), + limit: m.value_of_lossy("limit").unwrap().parse()?, + min: m.value_of_lossy("min").unwrap().parse()?, + }) + } - let mut stack = vec![fst.root().addr()]; - while let Some(addr) = stack.pop() { - if set.contains(addr) { - continue; + fn run(&self) -> Result<(), Error> { + let mut wtr = util::get_buf_writer(self.output.as_ref())?; + let fst = unsafe { util::mmap_fst(&self.input)? 
}; + let mut set = BitSet::with_capacity(fst.len()); + let mut node_counts = HashMap::with_capacity(10_000); + + let mut stack = vec![fst.root().addr()]; + while let Some(addr) = stack.pop() { + if set.contains(addr) { + continue; + } + set.insert(addr); + + let full_node = FullNode::from_node(&fst.node(addr)); + for t in &full_node.trans { + stack.push(t.addr); + } + *node_counts.entry(full_node).or_insert(0) += 1; } - set.insert(addr); - let full_node = FullNode::from_node(&fst.node(addr)); - for t in &full_node.trans { - stack.push(t.addr); - } - *node_counts.entry(full_node).or_insert(0) += 1; - } + let total = node_counts.values().fold(0, |n, c| n + c); + let unique = node_counts.len(); + let mut counts: Vec<(FullNode, i32)> = + node_counts.into_iter().filter(|&(_, c)| c > self.min).collect(); + counts.sort_by(|&(_, ref c1), &(_, ref c2)| c1.cmp(c2).reverse()); - let total = node_counts.values().fold(0, |n, c| n + c); - let unique = node_counts.len(); - let min = args.flag_min; - let mut counts: Vec<(FullNode, i32)> = - node_counts.into_iter().filter(|&(_, c)| c > min).collect(); - counts.sort_by(|&(_, ref c1), &(_, ref c2)| c1.cmp(c2).reverse()); + w!(wtr, "Total nodes: {}", total); + w!(wtr, "Unique nodes: {}", unique); + w!(wtr, "Nodes with duplicates: {}", counts.len()); + w!(wtr, "----------------------------------"); - w!(wtr, "Total nodes: {}", total); - w!(wtr, "Unique nodes: {}", unique); - w!(wtr, "Nodes with duplicates: {}", counts.len()); - w!(wtr, "----------------------------------"); + for &(ref fnode, count) in counts.iter().take(self.limit) { + w!(wtr, "Duplicated {} times", count); + w!(wtr, "{:#?}", fnode); + w!(wtr, "----------------------------------"); + } - for &(ref fnode, count) in counts.iter().take(args.flag_limit) { - w!(wtr, "Duplicated {} times", count); - w!(wtr, "{:#?}", fnode); - w!(wtr, "----------------------------------"); + wtr.flush()?; + Ok(()) } - - wtr.flush()?; - Ok(()) } diff --git a/fst-bin/src/cmd/fuzzy.rs 
b/fst-bin/src/cmd/fuzzy.rs index a8ed249..1e35ec1 100644 --- a/fst-bin/src/cmd/fuzzy.rs +++ b/fst-bin/src/cmd/fuzzy.rs @@ -1,60 +1,54 @@ use std::io; +use std::path::PathBuf; -use docopt::Docopt; +use bstr::{BString, ByteVec}; use fst::automaton::Levenshtein; -use serde::Deserialize; -use crate::util; -use crate::Error; +use crate::{util, Error}; -const USAGE: &'static str = " -Issues a fuzzy query against the given transducer. - -A fuzzy query returns all search results within a particular edit distance -of the query given. - -WARNING: This works by building a Levenshtein automaton, which is currently -rather expensive (in time and space) with a big edit distance. This will be -improved in the future. - -Usage: - fst fuzzy [options] - fst fuzzy --help - -Options: - -h, --help Display this message. - -d, --distance ARG All terms in the fst within this distance are shown. - The distance is measured in the number of character - insertions, deletions and substitutions on the query - to get the term. In this case, a \"character\" is a - single Unicode codepoint. [default: 1] - -o, --outputs When set, output values are shown as CSV data. - -s, --start ARG Only show results greater than or equal to this. - -e, --end ARG Only show results less than or equal to this. -"; +pub fn run(matches: &clap::ArgMatches) -> Result<(), Error> { + Args::new(matches).and_then(|args| args.run()) +} -#[derive(Debug, Deserialize)] +#[derive(Debug)] struct Args { - arg_fst: String, - arg_query: String, - flag_distance: u32, - flag_outputs: bool, - flag_start: Option, - flag_end: Option, + input: PathBuf, + query: String, + distance: u32, + outputs: bool, + start: Option, + end: Option, } -pub fn run(argv: Vec) -> Result<(), Error> { - let args: Args = Docopt::new(USAGE) - .and_then(|d| d.argv(&argv).deserialize()) - .unwrap_or_else(|e| e.exit()); - let fst = unsafe { util::mmap_fst(&args.arg_fst)? 
}; - let lev = Levenshtein::new(&args.arg_query, args.flag_distance)?; - let mut q = fst.search(&lev); - if let Some(ref start) = args.flag_start { - q = q.ge(start); +impl Args { + fn new(m: &clap::ArgMatches) -> Result { + Ok(Args { + input: m.value_of_os("input").map(PathBuf::from).unwrap(), + query: m + .value_of_os("query") + .map(|v| v.to_string_lossy().into_owned()) + .unwrap(), + distance: m.value_of_lossy("distance").unwrap().parse()?, + outputs: m.is_present("outputs"), + start: m + .value_of_os("start") + .map(|v| Vec::from_os_str_lossy(v).into_owned().into()), + end: m + .value_of_os("end") + .map(|v| Vec::from_os_str_lossy(v).into_owned().into()), + }) } - if let Some(ref end) = args.flag_end { - q = q.le(end); + + fn run(&self) -> Result<(), Error> { + let fst = unsafe { util::mmap_fst(&self.input)? }; + let lev = Levenshtein::new(&self.query, self.distance)?; + let mut q = fst.search(&lev); + if let Some(ref start) = self.start { + q = q.ge(start); + } + if let Some(ref end) = self.end { + q = q.le(end); + } + util::print_stream(io::BufWriter::new(io::stdout()), self.outputs, q) } - util::print_stream(io::BufWriter::new(io::stdout()), args.flag_outputs, q) } diff --git a/fst-bin/src/cmd/grep.rs b/fst-bin/src/cmd/grep.rs index 09513ee..cc84f71 100644 --- a/fst-bin/src/cmd/grep.rs +++ b/fst-bin/src/cmd/grep.rs @@ -1,59 +1,60 @@ use std::io; +use std::path::PathBuf; -use docopt::Docopt; +use bstr::{BString, ByteVec}; use regex_automata::dense; -use serde::Deserialize; -use crate::util; -use crate::Error; +use crate::{util, Error}; -const USAGE: &'static str = " -Searches a transducer with a regular expression. - -WARNING: This works by building a regular expression automaton, which is -currently rather expensive (in time and space) with complex regexes. This -will be improved in the future. - -Usage: - fst grep [options] - fst grep --help - -Options: - -h, --help Display this message. - -o, --outputs When set, output values are shown as CSV data. 
- -s, --start ARG Only show results greater than or equal to this. - -e, --end ARG Only show results less than or equal to this. -"; +pub fn run(matches: &clap::ArgMatches) -> Result<(), Error> { + Args::new(matches).and_then(|args| args.run()) +} -#[derive(Debug, Deserialize)] +#[derive(Debug)] struct Args { - arg_fst: String, - arg_regex: String, - flag_outputs: bool, - flag_start: Option, - flag_end: Option, + input: PathBuf, + regex: String, + outputs: bool, + start: Option, + end: Option, } -pub fn run(argv: Vec) -> Result<(), Error> { - let args: Args = Docopt::new(USAGE) - .and_then(|d| d.argv(&argv).deserialize()) - .unwrap_or_else(|e| e.exit()); - let fst = unsafe { util::mmap_fst(&args.arg_fst)? }; - let dense_dfa = dense::Builder::new() - .anchored(true) - .byte_classes(false) - .premultiply(false) - .build(&args.arg_regex)?; - let dfa = match dense_dfa { - dense::DenseDFA::Standard(dfa) => dfa, - _ => unreachable!(), - }; - let mut q = fst.search(&dfa); - if let Some(ref start) = args.flag_start { - q = q.ge(start); +impl Args { + fn new(m: &clap::ArgMatches) -> Result { + Ok(Args { + input: m.value_of_os("input").map(PathBuf::from).unwrap(), + regex: m + .value_of_os("regex") + .map(|v| v.to_string_lossy().into_owned()) + .unwrap(), + outputs: m.is_present("outputs"), + start: m + .value_of_os("start") + .map(|v| Vec::from_os_str_lossy(v).into_owned().into()), + end: m + .value_of_os("end") + .map(|v| Vec::from_os_str_lossy(v).into_owned().into()), + }) } - if let Some(ref end) = args.flag_end { - q = q.le(end); + + fn run(&self) -> Result<(), Error> { + let fst = unsafe { util::mmap_fst(&self.input)? 
}; + let dense_dfa = dense::Builder::new() + .anchored(true) + .byte_classes(true) + .premultiply(true) + .build(&self.regex)?; + let dfa = match dense_dfa { + dense::DenseDFA::PremultipliedByteClass(dfa) => dfa, + _ => unreachable!(), + }; + let mut q = fst.search(&dfa); + if let Some(ref start) = self.start { + q = q.ge(start); + } + if let Some(ref end) = self.end { + q = q.le(end); + } + util::print_stream(io::BufWriter::new(io::stdout()), self.outputs, q) } - util::print_stream(io::BufWriter::new(io::stdout()), args.flag_outputs, q) } diff --git a/fst-bin/src/cmd/map.rs b/fst-bin/src/cmd/map.rs index f5c0a9b..d6e356e 100644 --- a/fst-bin/src/cmd/map.rs +++ b/fst-bin/src/cmd/map.rs @@ -1,9 +1,7 @@ use std::cmp; use std::fs; -use std::path::Path; +use std::path::{Path, PathBuf}; -use csv; -use docopt::Docopt; use fst::MapBuilder; use serde::Deserialize; @@ -11,113 +9,98 @@ use crate::merge::Merger; use crate::util; use crate::Error; -const USAGE: &'static str = " -Creates an ordered map backed by a finite state transducer. - -The input to this command should be a CSV file with exactly two columns. -The first column should be the key and the second column should be a value -that can be interpreted as an unsigned 64 bit integer. - -Usage: - fst map [options] ... - fst map --help - -Options: - -h, --help Display this message. - -f, --force Overwrites the output if a file already exists. - --sorted Set this if the input data is already lexicographically - sorted. This will make construction much faster. - --max When building an FST from unsorted data, this merges - output values by taking the max. The default is to sum - them. - --min When building an FST from unsorted data, this merges - output values by taking the min. The default is to sum - them. - --fd-limit ARG The maximum number of file descriptors to have open in - a single worker thread. [default: 15] - --batch-size ARG The number of keys to collect in each batch. - N.B. 
This is the primary factor in how much memory - this process uses. - [default: 100000] - --threads ARG The number of simultaneous workers to run. - This defaults to the number of logical CPUs reported - by your system. - --tmp-dir ARG A temporary directory used to store intermediate - transducers. This defaults to the default temporary - directory reported by your system. - --keep-tmp-dir Does not delete the temporary directory. Useful for - debugging. -"; +pub fn run(matches: &clap::ArgMatches) -> Result<(), Error> { + Args::new(matches).and_then(|args| args.run()) +} #[derive(Debug, Deserialize)] struct Args { - arg_input: Vec, - arg_output: String, - flag_force: bool, - flag_sorted: bool, - flag_fd_limit: u32, - flag_batch_size: u32, - flag_threads: Option, - flag_tmp_dir: Option, - flag_keep_tmp_dir: bool, - flag_max: bool, - flag_min: bool, + input: Vec, + output: PathBuf, + force: bool, + sorted: bool, + fd_limit: u32, + batch_size: u32, + threads: Option, + tmp_dir: Option, + keep_tmp_dir: bool, + max: bool, + min: bool, } -pub fn run(argv: Vec) -> Result<(), Error> { - let args: Args = Docopt::new(USAGE) - .and_then(|d| d.argv(&argv).deserialize()) - .unwrap_or_else(|e| e.exit()); - if !args.flag_force && fs::metadata(&args.arg_output).is_ok() { - fail!("Output file already exists: {}", args.arg_output); +impl Args { + fn new(m: &clap::ArgMatches) -> Result { + let threads = m.value_of_lossy("threads").unwrap().parse()?; + Ok(Args { + input: m + .values_of_os("input") + .unwrap() + .map(PathBuf::from) + .collect(), + output: m.value_of_os("output").map(PathBuf::from).unwrap(), + force: m.is_present("force"), + sorted: m.is_present("sorted"), + fd_limit: m.value_of_lossy("fd-limit").unwrap().parse()?, + batch_size: m.value_of_lossy("batch-size").unwrap().parse()?, + threads: if threads == 0 { None } else { Some(threads) }, + tmp_dir: m.value_of_os("tmp-dir").map(PathBuf::from), + keep_tmp_dir: m.is_present("keep-tmp-dir"), + max: m.is_present("max"), + min: 
m.is_present("min"), + }) } - if args.flag_sorted { - run_sorted(&args) - } else { - run_unsorted(&args) + + fn run(&self) -> Result<(), Error> { + if !self.force && fs::metadata(&self.output).is_ok() { + anyhow::bail!("Output file already exists: {:?}", self.output); + } + if self.sorted { + self.run_sorted() + } else { + self.run_unsorted() + } } -} -fn run_sorted(args: &Args) -> Result<(), Error> { - let wtr = util::get_buf_writer(Some(&args.arg_output))?; - let mut map = MapBuilder::new(wtr)?; - for input in &args.arg_input { - let rdr = util::get_reader(Some(input))?; - let mut rdr = - csv::ReaderBuilder::new().has_headers(false).from_reader(rdr); - for row in rdr.deserialize() { - let (key, val): (String, u64) = row?; - map.insert(key, val)?; + fn run_sorted(&self) -> Result<(), Error> { + let wtr = util::get_buf_writer(Some(&self.output))?; + let mut map = MapBuilder::new(wtr)?; + for input in &self.input { + let mut rdr = csv::ReaderBuilder::new() + .has_headers(false) + .from_reader(util::get_reader(Some(input))?); + for row in rdr.deserialize() { + let (key, val): (String, u64) = row?; + map.insert(key, val)?; + } } + map.finish().map_err(From::from) } - map.finish()?; - Ok(()) -} -fn run_unsorted(args: &Args) -> Result<(), Error> { - let inputs = args - .arg_input - .iter() - .map(|inp| Path::new(inp).to_path_buf()) - .collect(); - let keys = util::ConcatCsv::new(inputs); + fn run_unsorted(&self) -> Result<(), Error> { + let inputs = self + .input + .iter() + .map(|inp| Path::new(inp).to_path_buf()) + .collect(); + let keys = util::ConcatCsv::new(inputs); - let mut merger = Merger::new(keys, &args.arg_output); - merger = merger.fd_limit(args.flag_fd_limit); - merger = merger.batch_size(args.flag_batch_size); - merger = merger.keep_tmp_dir(args.flag_keep_tmp_dir); - if let Some(threads) = args.flag_threads { - merger = merger.threads(threads); - } - if let Some(ref tmp_dir) = args.flag_tmp_dir { - merger = merger.tmp_dir(tmp_dir); - } - if args.flag_max { 
- merger = merger.value_merger(|x, y| cmp::max(x, y)); - } else if args.flag_min { - merger = merger.value_merger(|x, y| cmp::min(x, y)); - } else { - merger = merger.value_merger(|x, y| x + y); + let mut merger = Merger::new(keys, &self.output); + merger = merger.fd_limit(self.fd_limit); + merger = merger.batch_size(self.batch_size); + merger = merger.keep_tmp_dir(self.keep_tmp_dir); + if let Some(threads) = self.threads { + merger = merger.threads(threads); + } + if let Some(ref tmp_dir) = self.tmp_dir { + merger = merger.tmp_dir(tmp_dir); + } + if self.max { + merger = merger.value_merger(|x, y| cmp::max(x, y)); + } else if self.min { + merger = merger.value_merger(|x, y| cmp::min(x, y)); + } else { + merger = merger.value_merger(|x, y| x + y); + } + merger.merge() } - merger.merge() } diff --git a/fst-bin/src/cmd/node.rs b/fst-bin/src/cmd/node.rs index 5364965..b524996 100644 --- a/fst-bin/src/cmd/node.rs +++ b/fst-bin/src/cmd/node.rs @@ -1,40 +1,30 @@ -use docopt::Docopt; -use serde::Deserialize; +use std::path::PathBuf; -use crate::util; -use crate::Error; +use crate::{util, Error}; -const USAGE: &'static str = " -Shows a single node from the transducer. - -The input to this command is the node's address. An address may be found either -from debugging a transducer in code, or from the output of the 'fst csv' -command. - -If the address does not point to a valid node, then the executable may panic or -abort without ceremony. - -Usage: - fst node [options] - fst node --help - -Options: - -h, --help Show this help message. 
-"; +pub fn run(matches: &clap::ArgMatches) -> Result<(), Error> { + Args::new(matches).and_then(|args| args.run()) +} -#[derive(Debug, Deserialize)] +#[derive(Debug)] struct Args { - arg_fst: String, - arg_node_address: usize, + input: PathBuf, + node_address: usize, } -pub fn run(argv: Vec) -> Result<(), Error> { - let args: Args = Docopt::new(USAGE) - .and_then(|d| d.argv(&argv).deserialize()) - .unwrap_or_else(|e| e.exit()); - let mut wtr = util::get_buf_writer::<&str>(None)?; - let fst = unsafe { util::mmap_fst(&args.arg_fst)? }; - let node = fst.node(args.arg_node_address); - w!(wtr, "{:?}", node); - Ok(()) +impl Args { + fn new(m: &clap::ArgMatches) -> Result { + Ok(Args { + input: m.value_of_os("input").map(PathBuf::from).unwrap(), + node_address: m.value_of_lossy("node-address").unwrap().parse()?, + }) + } + + fn run(&self) -> Result<(), Error> { + let mut wtr = util::get_buf_writer::<&str>(None)?; + let fst = unsafe { util::mmap_fst(&self.input)? }; + let node = fst.node(self.node_address); + w!(wtr, "{:?}", node); + Ok(()) + } } diff --git a/fst-bin/src/cmd/range.rs b/fst-bin/src/cmd/range.rs index 2252109..d104ab2 100644 --- a/fst-bin/src/cmd/range.rs +++ b/fst-bin/src/cmd/range.rs @@ -1,49 +1,43 @@ +use std::ffi::OsString; use std::io; +use std::path::PathBuf; -use docopt::Docopt; -use serde::Deserialize; +use bstr::ByteVec; use crate::util; use crate::Error; -const USAGE: &'static str = " -Issues a range query against the given transducer. - -A range query returns all search results within a particular. - -If neither the start or the end of the range is specified, then all entries -in the transducer are shown. - -Usage: - fst range [options] - fst range --help - -Options: - -h, --help Display this message. - -o, --outputs When set, output values are shown as CSV data. - -s, --start ARG Only show results greater than or equal to this. - -e, --end ARG Only show results less than or equal to this. 
-"; +pub fn run(matches: &clap::ArgMatches) -> Result<(), Error> { + Args::new(matches).and_then(|args| args.run()) +} -#[derive(Debug, Deserialize)] +#[derive(Debug)] struct Args { - arg_fst: String, - flag_outputs: bool, - flag_start: Option, - flag_end: Option, + input: PathBuf, + outputs: bool, + start: Option, + end: Option, } -pub fn run(argv: Vec) -> Result<(), Error> { - let args: Args = Docopt::new(USAGE) - .and_then(|d| d.argv(&argv).deserialize()) - .unwrap_or_else(|e| e.exit()); - let fst = unsafe { util::mmap_fst(&args.arg_fst)? }; - let mut q = fst.range(); - if let Some(ref start) = args.flag_start { - q = q.ge(start); +impl Args { + fn new(m: &clap::ArgMatches) -> Result { + Ok(Args { + input: m.value_of_os("input").map(PathBuf::from).unwrap(), + outputs: m.is_present("outputs"), + start: m.value_of_os("start").map(|v| v.to_os_string()), + end: m.value_of_os("end").map(|v| v.to_os_string()), + }) } - if let Some(ref end) = args.flag_end { - q = q.le(end); + + fn run(&self) -> Result<(), Error> { + let fst = unsafe { util::mmap_fst(&self.input)? }; + let mut q = fst.range(); + if let Some(ref start) = self.start { + q = q.ge(Vec::from_os_str_lossy(start)); + } + if let Some(ref end) = self.end { + q = q.le(Vec::from_os_str_lossy(end)); + } + util::print_stream(io::BufWriter::new(io::stdout()), self.outputs, q) } - util::print_stream(io::BufWriter::new(io::stdout()), args.flag_outputs, q) } diff --git a/fst-bin/src/cmd/rust.rs b/fst-bin/src/cmd/rust.rs index c48cbb3..a9ff9fa 100644 --- a/fst-bin/src/cmd/rust.rs +++ b/fst-bin/src/cmd/rust.rs @@ -1,73 +1,58 @@ use std::io::{Read, Write}; +use std::path::PathBuf; -use docopt::Docopt; -use serde::Deserialize; +use crate::{util, Error}; -use crate::util; -use crate::Error; - -const USAGE: &'static str = " -Emit Rust source code for the given FST. 
- -This reads the FST given and emits it as Rust source code with one -constant defined: - - {NAME}_BYTES - -And a `lazy_static!` ref for: - - {NAME} - -Where {NAME} is taken from the name given as an argument. - -The latter definition corresponds to calling `Fst::from_bytes({NAME}_BYTES)`. -This makes it possible to trivially use pre-built FSTs in your program. - -Usage: - fst rust [options] - fst node --help - -Options: - -h, --help Show this help message. -"; +pub fn run(matches: &clap::ArgMatches) -> Result<(), Error> { + Args::new(matches).and_then(|args| args.run()) +} -#[derive(Debug, Deserialize)] +#[derive(Debug)] struct Args { - arg_fst: String, - arg_name: String, + input: PathBuf, + name: String, } -pub fn run(argv: Vec) -> Result<(), Error> { - let args: Args = Docopt::new(USAGE) - .and_then(|d| d.argv(&argv).deserialize()) - .unwrap_or_else(|e| e.exit()); - let mut wtr = util::get_buf_writer::<&str>(None)?; - let mut rdr = util::get_buf_reader(Some(&args.arg_fst))?; - - let mut bytes = vec![]; - rdr.read_to_end(&mut bytes)?; - - w!(wtr, "lazy_static! 
{{"); - w!(wtr, " pub static ref {}: ::fst::raw::Fst = ", args.arg_name); - w!(wtr, " ::fst::raw::Fst::new({}_BYTES).unwrap();", args.arg_name); - w!(wtr, "}}\n"); +impl Args { + fn new(m: &clap::ArgMatches) -> Result { + Ok(Args { + input: m.value_of_os("input").map(PathBuf::from).unwrap(), + name: m + .value_of_os("name") + .map(|v| v.to_string_lossy().into_owned()) + .unwrap(), + }) + } - w!(wtr, "const {}_BYTES: &'static [u8] = b\"\\", args.arg_name); - let mut column = 0; - for b in bytes { - let escaped = if (b as char).is_whitespace() { - format!("\\x{:02x}", b) - } else { - util::escape_input(b) - }; - if column + escaped.len() >= 79 { - column = 0; - write!(wtr, "\\\n")?; + fn run(&self) -> Result<(), Error> { + let mut wtr = util::get_buf_writer::<&str>(None)?; + let mut rdr = util::get_buf_reader(Some(&self.input))?; + + let mut bytes = vec![]; + rdr.read_to_end(&mut bytes)?; + + w!(wtr, "lazy_static! {{"); + w!(wtr, " pub static ref {}: ::fst::raw::Fst = ", self.name); + w!(wtr, " ::fst::raw::Fst::new({}_BYTES).unwrap();", self.name); + w!(wtr, "}}\n"); + + w!(wtr, "const {}_BYTES: &'static [u8] = b\"\\", self.name); + let mut column = 0; + for b in bytes { + let escaped = if (b as char).is_whitespace() { + format!("\\x{:02x}", b) + } else { + util::escape_input(b) + }; + if column + escaped.len() >= 79 { + column = 0; + write!(wtr, "\\\n")?; + } + column += escaped.len(); + write!(wtr, "{}", escaped)?; } - column += escaped.len(); - write!(wtr, "{}", escaped)?; - } - w!(wtr, "\\\n\";"); + w!(wtr, "\\\n\";"); - Ok(()) + Ok(()) + } } diff --git a/fst-bin/src/cmd/set.rs b/fst-bin/src/cmd/set.rs index 83605bf..722f291 100644 --- a/fst-bin/src/cmd/set.rs +++ b/fst-bin/src/cmd/set.rs @@ -1,113 +1,98 @@ use std::fs; use std::io; -use std::path::Path; +use std::path::{Path, PathBuf}; use bstr::io::BufReadExt; -use docopt::Docopt; use fst::SetBuilder; -use serde::Deserialize; use crate::merge::Merger; use crate::util; use crate::Error; -const USAGE: &'static 
str = " -Creates an ordered set backed by a finite state transducer. - -The input to this command should be one or more files with one key per line. - -If your input is already sorted, then pass the --sorted flag to make -construction much faster. - -Usage: - fst set [options] ... - fst set --help - -Options: - -h, --help Display this message. - -f, --force Overwrites the output if a file already exists. - --sorted Set this if the input data is already lexicographically - sorted. This will make construction much faster. - Note that when this is set, all of the options below are - ignored/not relevant. - --fd-limit ARG The maximum number of file descriptors to have open in - a single worker thread. [default: 15] - --batch-size ARG The number of keys to collect in each batch. - N.B. This is the primary factor in how much memory - this process uses. - [default: 100000] - --threads ARG The number of simultaneous workers to run. - This defaults to the number of logical CPUs reported - by your system. - --tmp-dir ARG A temporary directory used to store intermediate - transducers. This defaults to the default temporary - directory reported by your system. - --keep-tmp-dir Does not delete the temporary directory. Useful for - debugging. 
-"; +pub fn run(matches: &clap::ArgMatches) -> Result<(), Error> { + Args::new(matches).and_then(|args| args.run()) +} -#[derive(Debug, Deserialize)] +#[derive(Debug)] struct Args { - arg_input: Vec, - arg_output: String, - flag_force: bool, - flag_sorted: bool, - flag_fd_limit: u32, - flag_batch_size: u32, - flag_threads: Option, - flag_tmp_dir: Option, - flag_keep_tmp_dir: bool, + input: Vec, + output: PathBuf, + force: bool, + sorted: bool, + fd_limit: u32, + batch_size: u32, + threads: Option, + tmp_dir: Option, + keep_tmp_dir: bool, } -pub fn run(argv: Vec) -> Result<(), Error> { - let args: Args = Docopt::new(USAGE) - .and_then(|d| d.argv(&argv).deserialize()) - .unwrap_or_else(|e| e.exit()); - if !args.flag_force && fs::metadata(&args.arg_output).is_ok() { - fail!("Output file already exists: {}", args.arg_output); +impl Args { + fn new(m: &clap::ArgMatches) -> Result { + let threads = m.value_of_lossy("threads").unwrap().parse()?; + Ok(Args { + input: m + .values_of_os("input") + .unwrap() + .map(PathBuf::from) + .collect(), + output: m.value_of_os("output").map(PathBuf::from).unwrap(), + force: m.is_present("force"), + sorted: m.is_present("sorted"), + fd_limit: m.value_of_lossy("fd-limit").unwrap().parse()?, + batch_size: m.value_of_lossy("batch-size").unwrap().parse()?, + threads: if threads == 0 { None } else { Some(threads) }, + tmp_dir: m.value_of_os("tmp-dir").map(PathBuf::from), + keep_tmp_dir: m.is_present("keep-tmp-dir"), + }) } - if args.flag_sorted { - run_sorted(&args) - } else { - run_unsorted(&args) + + fn run(&self) -> Result<(), Error> { + if !self.force && fs::metadata(&self.output).is_ok() { + anyhow::bail!("Output file already exists: {:?}", self.output); + } + if self.sorted { + self.run_sorted() + } else { + self.run_unsorted() + } } -} -fn run_sorted(args: &Args) -> Result<(), Error> { - let wtr = util::get_buf_writer(Some(&args.arg_output))?; - let mut set = SetBuilder::new(wtr)?; - for input in &args.arg_input { - let rdr = 
util::get_buf_reader(Some(input))?; - rdr.for_byte_line(|line| { - if line.is_empty() { - return Ok(false); - } - set.insert(line) - .map_err(|e| io::Error::new(io::ErrorKind::Other, e))?; - Ok(true) - })?; + fn run_sorted(&self) -> Result<(), Error> { + let wtr = util::get_buf_writer(Some(&self.output))?; + let mut set = SetBuilder::new(wtr)?; + for input in &self.input { + let rdr = util::get_buf_reader(Some(input))?; + rdr.for_byte_line(|line| { + if line.is_empty() { + return Ok(false); + } + set.insert(line) + .map_err(|e| io::Error::new(io::ErrorKind::Other, e))?; + Ok(true) + })?; + } + set.finish().map_err(From::from) } - set.finish().map_err(From::from) -} -fn run_unsorted(args: &Args) -> Result<(), Error> { - let inputs = args - .arg_input - .iter() - .map(|inp| Path::new(inp).to_path_buf()) - .collect(); - let keys = util::ConcatLines::new(inputs) - .map(|result| result.map(|line| (line, 0)).map_err(From::from)); + fn run_unsorted(&self) -> Result<(), Error> { + let inputs = self + .input + .iter() + .map(|inp| Path::new(inp).to_path_buf()) + .collect(); + let keys = util::ConcatLines::new(inputs) + .map(|result| result.map(|line| (line, 0)).map_err(From::from)); - let mut merger = Merger::new(keys, &args.arg_output); - merger = merger.fd_limit(args.flag_fd_limit); - merger = merger.batch_size(args.flag_batch_size); - merger = merger.keep_tmp_dir(args.flag_keep_tmp_dir); - if let Some(threads) = args.flag_threads { - merger = merger.threads(threads); - } - if let Some(ref tmp_dir) = args.flag_tmp_dir { - merger = merger.tmp_dir(tmp_dir); + let mut merger = Merger::new(keys, &self.output); + merger = merger.fd_limit(self.fd_limit); + merger = merger.batch_size(self.batch_size); + merger = merger.keep_tmp_dir(self.keep_tmp_dir); + if let Some(threads) = self.threads { + merger = merger.threads(threads); + } + if let Some(ref tmp_dir) = self.tmp_dir { + merger = merger.tmp_dir(tmp_dir); + } + merger.merge() } - merger.merge() } diff --git 
a/fst-bin/src/cmd/union.rs b/fst-bin/src/cmd/union.rs index 2517957..6c1fffd 100644 --- a/fst-bin/src/cmd/union.rs +++ b/fst-bin/src/cmd/union.rs @@ -1,51 +1,51 @@ use std::fs; +use std::path::PathBuf; -use docopt::Docopt; -use serde::Deserialize; +use crate::{util, Error}; -use crate::util; -use crate::Error; - -const USAGE: &'static str = " -Unions all of the transducer inputs into a single transducer. - -Any output values are dropped. Stated differently, the output transducer is -always a set. - -Usage: - fst union [options] ... - fst union --help - -Options: - -h, --help Show this help message. - -f, --force Overwrites the output if a file already exists. -"; +pub fn run(matches: &clap::ArgMatches) -> Result<(), Error> { + Args::new(matches).and_then(|args| args.run()) +} -#[derive(Debug, Deserialize)] +#[derive(Debug)] struct Args { - arg_input: Vec, - arg_output: String, - flag_force: bool, + input: Vec, + output: PathBuf, + force: bool, } -pub fn run(argv: Vec) -> Result<(), Error> { - let args: Args = Docopt::new(USAGE) - .and_then(|d| d.argv(&argv).deserialize()) - .unwrap_or_else(|e| e.exit()); - if !args.flag_force && fs::metadata(&args.arg_output).is_ok() { - fail!("Output file already exists: {}", args.arg_output); +impl Args { + fn new(m: &clap::ArgMatches) -> Result { + Ok(Args { + input: m + .values_of_os("input") + .unwrap() + .map(PathBuf::from) + .collect(), + output: m.value_of_os("output").map(PathBuf::from).unwrap(), + force: m.is_present("force"), + }) } - let wtr = util::get_buf_writer(Some(&args.arg_output))?; - let mut merged = fst::SetBuilder::new(wtr)?; - - let mut sets = vec![]; - for set_path in &args.arg_input { - let fst = unsafe { util::mmap_fst(set_path)? 
}; - sets.push(fst::Set::from(fst)); + fn run(&self) -> Result<(), Error> { + if !self.force && fs::metadata(&self.output).is_ok() { + anyhow::bail!( + "Output file already exists: {}", + self.output.display() + ); + } + + let wtr = util::get_buf_writer(Some(&self.output))?; + let mut merged = fst::SetBuilder::new(wtr)?; + + let mut sets = vec![]; + for set_path in &self.input { + let fst = unsafe { util::mmap_fst(set_path)? }; + sets.push(fst::Set::from(fst)); + } + let union = sets.iter().collect::().union(); + merged.extend_stream(union)?; + merged.finish()?; + Ok(()) } - let union = sets.iter().collect::().union(); - merged.extend_stream(union)?; - merged.finish()?; - Ok(()) } diff --git a/fst-bin/src/main.rs b/fst-bin/src/main.rs index 3a37100..d61d712 100644 --- a/fst-bin/src/main.rs +++ b/fst-bin/src/main.rs @@ -1,12 +1,4 @@ -#![allow(dead_code)] - -use std::env; -use std::error; -use std::io::{self, Write}; -use std::process; - -use docopt::Docopt; -use serde::Deserialize; +use anyhow::Error; macro_rules! w { ($wtr:expr, $($tt:tt)*) => {{ @@ -15,104 +7,31 @@ macro_rules! w { }} } -macro_rules! fail { - ($($tt:tt)*) => { return Err(From::from(format!($($tt)*))); } -} - +mod app; mod cmd; mod merge; mod util; -pub type Error = Box; - -const USAGE: &'static str = " -Usage: - fst [...] - fst --help - fst --version - -Commands: - csv Emit statistics about nodes or edges. - dot Emit a dot representation of an FST. - dupes Emit diagnostic info about frequency of duplicate nodes. - fuzzy Run fuzzy queries based on edit distance. - grep Search an FST with a regex. - map Create a new map of key-value pairs. - node Show a single node. - range Run range queries. - rust Emit a Rust source code for this FST. - set Create a new set of keys. - union Union two or more FSTs. - -Options: - -h, --help Show this help message. - -v, --version Show version. 
-"; - -#[derive(Debug, Deserialize)] -struct Args { - arg_command: Option, -} - -#[derive(Debug, Deserialize)] -enum Command { - Csv, - Dot, - Dupes, - Fuzzy, - Grep, - Map, - Node, - Range, - Rust, - Set, - Union, -} - -impl Command { - fn run(self) -> Result<(), Error> { - use self::Command::*; - - let argv: Vec = env::args().collect(); - match self { - Csv => cmd::csv::run(argv), - Dot => cmd::dot::run(argv), - Dupes => cmd::dupes::run(argv), - Fuzzy => cmd::fuzzy::run(argv), - Grep => cmd::grep::run(argv), - Map => cmd::map::run(argv), - Node => cmd::node::run(argv), - Range => cmd::range::run(argv), - Rust => cmd::rust::run(argv), - Set => cmd::set::run(argv), - Union => cmd::union::run(argv), +fn main() -> Result<(), Error> { + match crate::app::app().get_matches().subcommand() { + ("csv", Some(m)) => cmd::csv::run(m), + ("dot", Some(m)) => cmd::dot::run(m), + ("dupes", Some(m)) => cmd::dupes::run(m), + ("fuzzy", Some(m)) => cmd::fuzzy::run(m), + ("grep", Some(m)) => cmd::grep::run(m), + ("map", Some(m)) => cmd::map::run(m), + ("node", Some(m)) => cmd::node::run(m), + ("range", Some(m)) => cmd::range::run(m), + ("rust", Some(m)) => cmd::rust::run(m), + ("set", Some(m)) => cmd::set::run(m), + ("union", Some(m)) => cmd::union::run(m), + ("", _) => { + app::app().print_help()?; + println!(""); + Ok(()) } - } -} - -fn main() { - let args: Args = Docopt::new(USAGE) - .and_then(|d| { - d.options_first(true).version(Some(version())).deserialize() - }) - .unwrap_or_else(|e| e.exit()); - let cmd = args.arg_command.expect("BUG: expected a command"); - if let Err(err) = cmd.run() { - writeln!(&mut io::stderr(), "{}", err).unwrap(); - process::exit(1); - } -} - -fn version() -> String { - let (maj, min, pat) = ( - option_env!("CARGO_PKG_VERSION_MAJOR"), - option_env!("CARGO_PKG_VERSION_MINOR"), - option_env!("CARGO_PKG_VERSION_PATCH"), - ); - match (maj, min, pat) { - (Some(maj), Some(min), Some(pat)) => { - format!("{}.{}.{}", maj, min, pat) + (unknown, _) => { + 
Err(anyhow::anyhow!("unrecognized command: {}", unknown)) } - _ => "N/A".to_owned(), } } diff --git a/src/raw/error.rs b/src/raw/error.rs index 284fccc..d63fe04 100644 --- a/src/raw/error.rs +++ b/src/raw/error.rs @@ -68,9 +68,9 @@ impl fmt::Display for Error { Version { expected, got } => write!( f, "\ -Error opening FST: expected API version {}, got API version {}. -It looks like the FST you're trying to open is either not an FST file or it -was generated with a different version of the 'fst' crate. You'll either need +Error opening FST: expected API version {}, got API version {}. \ +It looks like the FST you're trying to open is either not an FST file or it \ +was generated with a different version of the 'fst' crate. You'll either need \ to change the version of the 'fst' crate you're using, or re-generate the FST.", expected, got @@ -78,20 +78,19 @@ FST.", Format { size } => write!( f, "\ -Error opening FST with size {} bytes: An unknown error occurred. This +Error opening FST with size {} bytes: An unknown error occurred. This \ usually means you're trying to read data that isn't actually an encoded FST.", size ), DuplicateKey { ref got } => write!( f, - "\ -Error inserting duplicate key: {}.", + "Error inserting duplicate key: '{}'.", format_bytes(&*got) ), OutOfOrder { ref previous, ref got } => write!( f, "\ -Error inserting out-of-order key: {}. (Previous key was {}.) Keys must be +Error inserting out-of-order key: '{}'. (Previous key was '{}'.) 
Keys must be \ inserted in lexicographic order.", format_bytes(&*got), format_bytes(&*previous) @@ -99,7 +98,7 @@ inserted in lexicographic order.", WrongType { expected, got } => write!( f, "\ -Error opening FST: expected type {}, got type {}.", +Error opening FST: expected type '{}', got type '{}'.", expected, got ), } From 0614f0470cf5a1a8a5a3e1ceecdc4a49ae9b0210 Mon Sep 17 00:00:00 2001 From: Andrew Gallant Date: Fri, 28 Feb 2020 12:33:59 -0500 Subject: [PATCH 13/23] fst-bin: add knobs for reverse and minimization This adds a couple environment variables that makes it easy to configure DFA construction with a few more knobs. This is useful for benchmarking various options. --- fst-bin/src/cmd/grep.rs | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/fst-bin/src/cmd/grep.rs b/fst-bin/src/cmd/grep.rs index cc84f71..13d353b 100644 --- a/fst-bin/src/cmd/grep.rs +++ b/fst-bin/src/cmd/grep.rs @@ -38,11 +38,17 @@ impl Args { } fn run(&self) -> Result<(), Error> { + let reverse = + std::env::var("FST_BIN_DFA_REVERSE").map_or(false, |v| v == "1"); + let minimize = + std::env::var("FST_BIN_DFA_MINIMIZE").map_or(false, |v| v == "1"); let fst = unsafe { util::mmap_fst(&self.input)? }; let dense_dfa = dense::Builder::new() .anchored(true) + .minimize(minimize) .byte_classes(true) .premultiply(true) + .reverse(reverse) .build(&self.regex)?; let dfa = match dense_dfa { dense::DenseDFA::PremultipliedByteClass(dfa) => dfa, From b6c85d7a656eba38f85459d70e840acc3fe441a3 Mon Sep 17 00:00:00 2001 From: Andrew Gallant Date: Thu, 5 Mar 2020 10:44:24 -0500 Subject: [PATCH 14/23] raw: add get_key/get_key_into APIs This provides the ability to map from a value back to its corresponding key. This only works when values are monotonically increasing, which is unfortunate, but it's still worth exposing. Since this is only useful is limited circumstances, we don't make this available on the fst::Map type, but instead provide it on raw::Fst only. 
Closes #16 --- src/raw/mod.rs | 52 ++++++++++++++++++++++++++++++++++++++++++++++ src/raw/tests.rs | 36 +++++++++++++++++++++++++++++++++ 2 files changed, 88 insertions(+) diff --git a/src/raw/mod.rs b/src/raw/mod.rs index 034939c..e023f54 100644 --- a/src/raw/mod.rs +++ b/src/raw/mod.rs @@ -357,6 +357,39 @@ impl> Fst { self.as_ref().contains_key(key.as_ref()) } + /// Retrieves the key associated with the given value. + /// + /// This is like `get_key_into`, but will return the key itself without + /// allowing the caller to reuse an allocation. + /// + /// If the given value does not exist, then `None` is returned. + /// + /// If the values in this FST are not monotonically increasing when sorted + /// lexicographically by key, then this routine has unspecified behavior. + #[inline] + pub fn get_key(&self, value: u64) -> Option> { + let mut key = vec![]; + if self.get_key_into(value, &mut key) { + Some(key) + } else { + None + } + } + + /// Retrieves the key associated with the given value. + /// + /// If the given value does not exist, then `false` is returned. In this + /// case, the contents of `key` are unspecified. + /// + /// The given buffer is not cleared before the key is written to it. + /// + /// If the values in this FST are not monotonically increasing when sorted + /// lexicographically by key, then this routine has unspecified behavior. + #[inline] + pub fn get_key_into(&self, value: u64, key: &mut Vec) -> bool { + self.as_ref().get_key_into(value, key) + } + /// Return a lexicographically ordered stream of all key-value pairs in /// this fst. 
#[inline] @@ -551,6 +584,25 @@ impl<'f> FstRef<'f> { node.is_final() } + fn get_key_into(&self, mut value: u64, key: &mut Vec) -> bool { + let mut node = self.root(); + while value != 0 || !node.is_final() { + let trans = node + .transitions() + .take_while(|t| t.out.value() <= value) + .last(); + node = match trans { + None => return false, + Some(t) => { + value -= t.out.value(); + key.push(t.inp); + self.node(t.addr) + } + }; + } + true + } + fn len(&self) -> usize { self.meta.len } diff --git a/src/raw/tests.rs b/src/raw/tests.rs index 4e9b53d..c32a217 100644 --- a/src/raw/tests.rs +++ b/src/raw/tests.rs @@ -517,3 +517,39 @@ fn bytes_written() { let footer_size = 24; assert_eq!(counted_len + footer_size, fst1_len); } + +#[test] +fn get_key_simple() { + let map = fst_map(vec![("abc", 2), ("xyz", 3)]); + assert_eq!(map.get_key(0), None); + assert_eq!(map.get_key(1), None); + assert_eq!(map.get_key(2), Some(b"abc".to_vec())); + assert_eq!(map.get_key(3), Some(b"xyz".to_vec())); + assert_eq!(map.get_key(4), None); +} + +#[test] +fn get_key_words() { + let words: Vec<(Vec, u64)> = TEXT + .lines() + .enumerate() + .map(|(i, line)| (line.as_bytes().to_vec(), i as u64)) + .collect(); + let map = fst_map(words.clone()); + for (key, value) in words { + assert_eq!(map.get_key(value), Some(key)); + } +} + +#[test] +fn get_key_words_discontiguous() { + let words: Vec<(Vec, u64)> = TEXT + .lines() + .enumerate() + .map(|(i, line)| (line.as_bytes().to_vec(), i as u64 * 2)) + .collect(); + let map = fst_map(words.clone()); + for (key, value) in words { + assert_eq!(map.get_key(value), Some(key)); + } +} From 1953f197c5be1e8ff2d899683591215fe07aa8da Mon Sep 17 00:00:00 2001 From: Andrew Gallant Date: Thu, 5 Mar 2020 18:55:22 -0500 Subject: [PATCH 15/23] raw: add from_iter_{set,map} convenient constructors These mirror the constructors on Set and Map. 
--- src/raw/mod.rs | 45 +++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 45 insertions(+) diff --git a/src/raw/mod.rs b/src/raw/mod.rs index e023f54..fc70167 100644 --- a/src/raw/mod.rs +++ b/src/raw/mod.rs @@ -281,6 +281,51 @@ struct Meta { len: usize, } +impl Fst> { + /// Create a new FST from an iterator of lexicographically ordered byte + /// strings. Every key's value is set to `0`. + /// + /// If the iterator does not yield values in lexicographic order, then an + /// error is returned. + /// + /// Note that this is a convenience function to build an FST in memory. + /// To build an FST that streams to an arbitrary `io::Write`, use + /// `raw::Builder`. + pub fn from_iter_set(iter: I) -> Result>> + where + K: AsRef<[u8]>, + I: IntoIterator, + { + let mut builder = Builder::memory(); + for k in iter { + builder.add(k)?; + } + Ok(builder.into_fst()) + } + + /// Create a new FST from an iterator of lexicographically ordered byte + /// strings. The iterator should consist of tuples, where the first element + /// is the byte string and the second element is its corresponding value. + /// + /// If the iterator does not yield unique keys in lexicographic order, then + /// an error is returned. + /// + /// Note that this is a convenience function to build an FST in memory. + /// To build an FST that streams to an arbitrary `io::Write`, use + /// `raw::Builder`. + pub fn from_iter_map(iter: I) -> Result>> + where + K: AsRef<[u8]>, + I: IntoIterator, + { + let mut builder = Builder::memory(); + for (k, v) in iter { + builder.insert(k, v)?; + } + Ok(builder.into_fst()) + } +} + impl> Fst { /// Creates a transducer from its representation as a raw byte sequence. /// From 3235075a7fc26ff3308eabd3af5bd6fb7346cbdd Mon Sep 17 00:00:00 2001 From: Andrew Gallant Date: Thu, 5 Mar 2020 19:23:28 -0500 Subject: [PATCH 16/23] tmp: remove fst-bin from workspace We'll re-add once we put out a new release of regex-automata. 
--- Cargo.toml | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/Cargo.toml b/Cargo.toml index b399cbb..e6b780c 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -15,7 +15,8 @@ license = "Unlicense/MIT" edition = "2018" [workspace] -members = ["bench", "fst-bin"] +members = ["bench"] +exclude = ["fst-bin"] [features] default = [] From 5ca477696bddebb044a8fb77069c7a69cb10ab6f Mon Sep 17 00:00:00 2001 From: Andrew Gallant Date: Thu, 5 Mar 2020 20:32:49 -0500 Subject: [PATCH 17/23] streams: add "stream with state" APIs This is apparently useful in some cases when access to the underlying automaton's state can produce useful information about a match. Regrettably, the implementation requires the states to satisfy `Clone`, as we would otherwise need to execute the state transition twice. In practice, states are usually `Copy` and quite small. Closes #60, Closes #61 --- src/map.rs | 161 +++++++++++++++++++++++++- src/raw/mod.rs | 299 ++++++++++++++++++++++++++++++++++++------------- src/set.rs | 168 ++++++++++++++++++++++++--- 3 files changed, 530 insertions(+), 98 deletions(-) diff --git a/src/map.rs b/src/map.rs index c53b930..d6e2053 100644 --- a/src/map.rs +++ b/src/map.rs @@ -263,12 +263,13 @@ impl> Map { /// # Example /// /// An implementation of regular expressions for `Automaton` is available - /// in the `fst-regex` crate, which can be used to search maps. + /// in the `regex-automata` crate with the `fst1` feature enabled, which + /// can be used to search maps. /// /// # Example /// /// An implementation of subsequence search for `Automaton` can be used - /// to search sets: + /// to search maps: /// /// ```rust /// use fst::automaton::Subsequence; @@ -303,6 +304,62 @@ impl> Map { StreamBuilder(self.0.search(aut)) } + /// Executes an automaton on the keys of this map and yields matching + /// keys along with the corresponding matching states in the given + /// automaton. 
+ /// + /// Note that this returns a `StreamWithStateBuilder`, which can be used to + /// add a range query to the search (see the `range` method). + /// + /// Memory requirements are the same as described on `Map::stream`. + /// + #[cfg_attr( + feature = "levenshtein", + doc = r##" +# Example + +An implementation of fuzzy search using Levenshtein automata can be used +to search maps: + +```rust +use fst::automaton::Levenshtein; +use fst::{IntoStreamer, Streamer, Map}; + +# fn main() { example().unwrap(); } +fn example() -> Result<(), Box> { + let map = Map::from_iter(vec![ + ("foo", 1), + ("foob", 2), + ("foobar", 3), + ("fozb", 4), + ]).unwrap(); + + let query = Levenshtein::new("foo", 2)?; + let mut stream = map.search_with_state(&query).into_stream(); + + let mut kvs = vec![]; + while let Some((k, v, s)) = stream.next() { + kvs.push((String::from_utf8(k.to_vec())?, v, s)); + } + // Currently, there isn't much interesting that you can do with the states. + assert_eq!(kvs, vec![ + ("foo".to_string(), 1, Some(183)), + ("foob".to_string(), 2, Some(123)), + ("fozb".to_string(), 4, Some(83)), + ]); + + Ok(()) +} +``` +"## + )] + pub fn search_with_state( + &self, + aut: A, + ) -> StreamWithStateBuilder<'_, A> { + StreamWithStateBuilder(self.0.search_with_state(aut)) + } + /// Returns the number of elements in this map. #[inline] pub fn len(&self) -> usize { @@ -667,6 +724,30 @@ impl<'m, A: Automaton> Stream<'m, A> { } } +/// A lexicographically ordered stream of key-value-state triples from a map +/// and an automaton. +/// +/// The key-values are from the map while the states are from the automaton. +/// +/// The `A` type parameter corresponds to an optional automaton to filter +/// the stream. By default, no filtering is done. +/// +/// The `'m` lifetime parameter refers to the lifetime of the underlying map. 
+pub struct StreamWithState<'m, A = AlwaysMatch>(raw::StreamWithState<'m, A>) +where + A: Automaton; + +impl<'a, 'm, A: 'a + Automaton> Streamer<'a> for StreamWithState<'m, A> +where + A::State: Clone, +{ + type Item = (&'a [u8], u64, A::State); + + fn next(&'a mut self) -> Option<(&'a [u8], u64, A::State)> { + self.0.next().map(|(key, out, state)| (key, out.value(), state)) + } +} + /// A lexicographically ordered stream of keys from a map. /// /// The `'m` lifetime parameter refers to the lifetime of the underlying map. @@ -712,22 +793,22 @@ pub struct StreamBuilder<'m, A = AlwaysMatch>(raw::StreamBuilder<'m, A>); impl<'m, A: Automaton> StreamBuilder<'m, A> { /// Specify a greater-than-or-equal-to bound. - pub fn ge>(self, bound: T) -> Self { + pub fn ge>(self, bound: T) -> StreamBuilder<'m, A> { StreamBuilder(self.0.ge(bound)) } /// Specify a greater-than bound. - pub fn gt>(self, bound: T) -> Self { + pub fn gt>(self, bound: T) -> StreamBuilder<'m, A> { StreamBuilder(self.0.gt(bound)) } /// Specify a less-than-or-equal-to bound. - pub fn le>(self, bound: T) -> Self { + pub fn le>(self, bound: T) -> StreamBuilder<'m, A> { StreamBuilder(self.0.le(bound)) } /// Specify a less-than bound. - pub fn lt>(self, bound: T) -> Self { + pub fn lt>(self, bound: T) -> StreamBuilder<'m, A> { StreamBuilder(self.0.lt(bound)) } } @@ -741,6 +822,74 @@ impl<'m, 'a, A: Automaton> IntoStreamer<'a> for StreamBuilder<'m, A> { } } +/// A builder for constructing range queries on streams that include automaton +/// states. +/// +/// In general, one should use `StreamBuilder` unless you have a specific need +/// for accessing the states of the underlying automaton that is being used to +/// filter this stream. +/// +/// Once all bounds are set, one should call `into_stream` to get a +/// `Stream`. +/// +/// Bounds are not additive. That is, if `ge` is called twice on the same +/// builder, then the second setting wins. 
+/// +/// The `A` type parameter corresponds to an optional automaton to filter +/// the stream. By default, no filtering is done. +/// +/// The `'m` lifetime parameter refers to the lifetime of the underlying map. +pub struct StreamWithStateBuilder<'m, A = AlwaysMatch>( + raw::StreamWithStateBuilder<'m, A>, +); + +impl<'m, A: Automaton> StreamWithStateBuilder<'m, A> { + /// Specify a greater-than-or-equal-to bound. + pub fn ge>( + self, + bound: T, + ) -> StreamWithStateBuilder<'m, A> { + StreamWithStateBuilder(self.0.ge(bound)) + } + + /// Specify a greater-than bound. + pub fn gt>( + self, + bound: T, + ) -> StreamWithStateBuilder<'m, A> { + StreamWithStateBuilder(self.0.gt(bound)) + } + + /// Specify a less-than-or-equal-to bound. + pub fn le>( + self, + bound: T, + ) -> StreamWithStateBuilder<'m, A> { + StreamWithStateBuilder(self.0.le(bound)) + } + + /// Specify a less-than bound. + pub fn lt>( + self, + bound: T, + ) -> StreamWithStateBuilder<'m, A> { + StreamWithStateBuilder(self.0.lt(bound)) + } +} + +impl<'m, 'a, A: 'a + Automaton> IntoStreamer<'a> + for StreamWithStateBuilder<'m, A> +where + A::State: Clone, +{ + type Item = (&'a [u8], u64, A::State); + type Into = StreamWithState<'m, A>; + + fn into_stream(self) -> StreamWithState<'m, A> { + StreamWithState(self.0.into_stream()) + } +} + /// A builder for collecting map streams on which to perform set operations /// on the keys of maps. /// diff --git a/src/raw/mod.rs b/src/raw/mod.rs index fc70167..67386cd 100644 --- a/src/raw/mod.rs +++ b/src/raw/mod.rs @@ -451,12 +451,23 @@ impl> Fst { StreamBuilder::new(self.as_ref(), AlwaysMatch) } - /// Executes an automaton on the keys of this map. + /// Executes an automaton on the keys of this FST. #[inline] pub fn search(&self, aut: A) -> StreamBuilder<'_, A> { StreamBuilder::new(self.as_ref(), aut) } + /// Executes an automaton on the keys of this FST and yields matching + /// keys along with the corresponding matching states in the given + /// automaton. 
+ #[inline] + pub fn search_with_state( + &self, + aut: A, + ) -> StreamWithStateBuilder<'_, A> { + StreamWithStateBuilder::new(self.as_ref(), aut) + } + /// Returns the number of keys in this fst. #[inline] pub fn len(&self) -> usize { @@ -724,25 +735,25 @@ impl<'f, A: Automaton> StreamBuilder<'f, A> { } /// Specify a greater-than-or-equal-to bound. - pub fn ge>(mut self, bound: T) -> Self { + pub fn ge>(mut self, bound: T) -> StreamBuilder<'f, A> { self.min = Bound::Included(bound.as_ref().to_owned()); self } /// Specify a greater-than bound. - pub fn gt>(mut self, bound: T) -> Self { + pub fn gt>(mut self, bound: T) -> StreamBuilder<'f, A> { self.min = Bound::Excluded(bound.as_ref().to_owned()); self } /// Specify a less-than-or-equal-to bound. - pub fn le>(mut self, bound: T) -> Self { + pub fn le>(mut self, bound: T) -> StreamBuilder<'f, A> { self.max = Bound::Included(bound.as_ref().to_owned()); self } /// Specify a less-than bound. - pub fn lt>(mut self, bound: T) -> Self { + pub fn lt>(mut self, bound: T) -> StreamBuilder<'f, A> { self.max = Bound::Excluded(bound.as_ref().to_owned()); self } @@ -757,6 +768,90 @@ impl<'a, 'f, A: Automaton> IntoStreamer<'a> for StreamBuilder<'f, A> { } } +/// A builder for constructing range queries on streams that include automaton +/// states. +/// +/// In general, one should use `StreamBuilder` unless you have a specific need +/// for accessing the states of the underlying automaton that is being used to +/// filter this stream. +/// +/// Once all bounds are set, one should call `into_stream` to get a +/// `Stream`. +/// +/// Bounds are not additive. That is, if `ge` is called twice on the same +/// builder, then the second setting wins. +/// +/// The `A` type parameter corresponds to an optional automaton to filter +/// the stream. By default, no filtering is done. +/// +/// The `'f` lifetime parameter refers to the lifetime of the underlying fst. 
+pub struct StreamWithStateBuilder<'f, A = AlwaysMatch> { + fst: FstRef<'f>, + aut: A, + min: Bound, + max: Bound, +} + +impl<'f, A: Automaton> StreamWithStateBuilder<'f, A> { + fn new(fst: FstRef<'f>, aut: A) -> StreamWithStateBuilder<'f, A> { + StreamWithStateBuilder { + fst, + aut, + min: Bound::Unbounded, + max: Bound::Unbounded, + } + } + + /// Specify a greater-than-or-equal-to bound. + pub fn ge>( + mut self, + bound: T, + ) -> StreamWithStateBuilder<'f, A> { + self.min = Bound::Included(bound.as_ref().to_owned()); + self + } + + /// Specify a greater-than bound. + pub fn gt>( + mut self, + bound: T, + ) -> StreamWithStateBuilder<'f, A> { + self.min = Bound::Excluded(bound.as_ref().to_owned()); + self + } + + /// Specify a less-than-or-equal-to bound. + pub fn le>( + mut self, + bound: T, + ) -> StreamWithStateBuilder<'f, A> { + self.max = Bound::Included(bound.as_ref().to_owned()); + self + } + + /// Specify a less-than bound. + pub fn lt>( + mut self, + bound: T, + ) -> StreamWithStateBuilder<'f, A> { + self.max = Bound::Excluded(bound.as_ref().to_owned()); + self + } +} + +impl<'a, 'f, A: 'a + Automaton> IntoStreamer<'a> + for StreamWithStateBuilder<'f, A> +where + A::State: Clone, +{ + type Item = (&'a [u8], Output, A::State); + type Into = StreamWithState<'f, A>; + + fn into_stream(self) -> StreamWithState<'f, A> { + StreamWithState::new(self.fst, self.aut, self.min, self.max) + } +} + #[derive(Debug)] enum Bound { Included(Vec), @@ -795,7 +890,94 @@ impl Bound { /// the stream. By default, no filtering is done. /// /// The `'f` lifetime parameter refers to the lifetime of the underlying fst. -pub struct Stream<'f, A = AlwaysMatch> +pub struct Stream<'f, A: Automaton = AlwaysMatch>(StreamWithState<'f, A>); + +impl<'f, A: Automaton> Stream<'f, A> { + fn new(fst: FstRef<'f>, aut: A, min: Bound, max: Bound) -> Stream<'f, A> { + Stream(StreamWithState::new(fst, aut, min, max)) + } + + /// Convert this stream into a vector of byte strings and outputs. 
+ /// + /// Note that this creates a new allocation for every key in the stream. + pub fn into_byte_vec(mut self) -> Vec<(Vec, u64)> { + let mut vs = vec![]; + while let Some((k, v)) = self.next() { + vs.push((k.to_vec(), v.value())); + } + vs + } + + /// Convert this stream into a vector of Unicode strings and outputs. + /// + /// If any key is not valid UTF-8, then iteration on the stream is stopped + /// and a UTF-8 decoding error is returned. + /// + /// Note that this creates a new allocation for every key in the stream. + pub fn into_str_vec(mut self) -> Result> { + let mut vs = vec![]; + while let Some((k, v)) = self.next() { + let k = String::from_utf8(k.to_vec()).map_err(Error::from)?; + vs.push((k, v.value())); + } + Ok(vs) + } + + /// Convert this stream into a vector of byte strings. + /// + /// Note that this creates a new allocation for every key in the stream. + pub fn into_byte_keys(mut self) -> Vec> { + let mut vs = vec![]; + while let Some((k, _)) = self.next() { + vs.push(k.to_vec()); + } + vs + } + + /// Convert this stream into a vector of Unicode strings. + /// + /// If any key is not valid UTF-8, then iteration on the stream is stopped + /// and a UTF-8 decoding error is returned. + /// + /// Note that this creates a new allocation for every key in the stream. + pub fn into_str_keys(mut self) -> Result> { + let mut vs = vec![]; + while let Some((k, _)) = self.next() { + let k = String::from_utf8(k.to_vec()).map_err(Error::from)?; + vs.push(k); + } + Ok(vs) + } + + /// Convert this stream into a vector of outputs. 
+ pub fn into_values(mut self) -> Vec { + let mut vs = vec![]; + while let Some((_, v)) = self.next() { + vs.push(v.value()); + } + vs + } +} + +impl<'f, 'a, A: Automaton> Streamer<'a> for Stream<'f, A> { + type Item = (&'a [u8], Output); + + fn next(&'a mut self) -> Option { + self.0.next_with(|_| ()).map(|(key, out, _)| (key, out)) + } +} + +/// A lexicographically ordered stream of key-value-state triples from an fst +/// and an automaton. +/// +/// The key-values are from the underyling FSTP while the states are from the +/// automaton. +/// +/// The `A` type parameter corresponds to an optional automaton to filter +/// the stream. By default, no filtering is done. +/// +/// The `'m` lifetime parameter refers to the lifetime of the underlying map. +pub struct StreamWithState<'f, A = AlwaysMatch> where A: Automaton, { @@ -815,9 +997,14 @@ struct StreamState<'f, S> { aut_state: S, } -impl<'f, A: Automaton> Stream<'f, A> { - fn new(fst: FstRef<'f>, aut: A, min: Bound, max: Bound) -> Stream<'f, A> { - let mut rdr = Stream { +impl<'f, A: Automaton> StreamWithState<'f, A> { + fn new( + fst: FstRef<'f>, + aut: A, + min: Bound, + max: Bound, + ) -> StreamWithState<'f, A> { + let mut rdr = StreamWithState { fst, aut, inp: Vec::with_capacity(16), @@ -916,79 +1103,19 @@ impl<'f, A: Automaton> Stream<'f, A> { } } - /// Convert this stream into a vector of byte strings and outputs. - /// - /// Note that this creates a new allocation for every key in the stream. - pub fn into_byte_vec(mut self) -> Vec<(Vec, u64)> { - let mut vs = vec![]; - while let Some((k, v)) = self.next() { - vs.push((k.to_vec(), v.value())); - } - vs - } - - /// Convert this stream into a vector of Unicode strings and outputs. - /// - /// If any key is not valid UTF-8, then iteration on the stream is stopped - /// and a UTF-8 decoding error is returned. - /// - /// Note that this creates a new allocation for every key in the stream. 
- pub fn into_str_vec(mut self) -> Result> { - let mut vs = vec![]; - while let Some((k, v)) = self.next() { - let k = String::from_utf8(k.to_vec()).map_err(Error::from)?; - vs.push((k, v.value())); - } - Ok(vs) - } - - /// Convert this stream into a vector of byte strings. - /// - /// Note that this creates a new allocation for every key in the stream. - pub fn into_byte_keys(mut self) -> Vec> { - let mut vs = vec![]; - while let Some((k, _)) = self.next() { - vs.push(k.to_vec()); - } - vs - } - - /// Convert this stream into a vector of Unicode strings. - /// - /// If any key is not valid UTF-8, then iteration on the stream is stopped - /// and a UTF-8 decoding error is returned. - /// - /// Note that this creates a new allocation for every key in the stream. - pub fn into_str_keys(mut self) -> Result> { - let mut vs = vec![]; - while let Some((k, _)) = self.next() { - let k = String::from_utf8(k.to_vec()).map_err(Error::from)?; - vs.push(k); - } - Ok(vs) - } - - /// Convert this stream into a vector of outputs. 
- pub fn into_values(mut self) -> Vec { - let mut vs = vec![]; - while let Some((_, v)) = self.next() { - vs.push(v.value()); - } - vs - } -} - -impl<'f, 'a, A: Automaton> Streamer<'a> for Stream<'f, A> { - type Item = (&'a [u8], Output); - - fn next(&'a mut self) -> Option { + fn next_with( + &mut self, + mut map: impl FnMut(&A::State) -> T, + ) -> Option<(&[u8], Output, T)> { if let Some(out) = self.empty_output.take() { if self.end_at.exceeded_by(&[]) { self.stack.clear(); return None; } - if self.aut.is_match(&self.aut.start()) { - return Some((&[], out)); + + let start = self.aut.start(); + if self.aut.is_match(&start) { + return Some((&[], out, map(&start))); } } while let Some(state) = self.stack.pop() { @@ -1003,6 +1130,7 @@ impl<'f, 'a, A: Automaton> Streamer<'a> for Stream<'f, A> { let trans = state.node.transition(state.trans); let out = state.out.cat(trans.out); let next_state = self.aut.accept(&state.aut_state, trans.inp); + let t = map(&next_state); let is_match = self.aut.is_match(&next_state); let next_node = self.fst.node(trans.addr); self.inp.push(trans.inp); @@ -1019,13 +1147,28 @@ impl<'f, 'a, A: Automaton> Streamer<'a> for Stream<'f, A> { return None; } if next_node.is_final() && is_match { - return Some((&self.inp, out.cat(next_node.final_output()))); + return Some(( + &self.inp, + out.cat(next_node.final_output()), + t, + )); } } None } } +impl<'a, 'f, A: 'a + Automaton> Streamer<'a> for StreamWithState<'f, A> +where + A::State: Clone, +{ + type Item = (&'a [u8], Output, A::State); + + fn next(&'a mut self) -> Option<(&'a [u8], Output, A::State)> { + self.next_with(|state| state.clone()) + } +} + /// An output is a value that is associated with a key in a finite state /// transducer. 
/// diff --git a/src/set.rs b/src/set.rs index 6ae5fc2..412a5d0 100644 --- a/src/set.rs +++ b/src/set.rs @@ -192,6 +192,62 @@ impl> Set { StreamBuilder(self.0.search(aut)) } + /// Executes an automaton on the values of this set and yields matching + /// values along with the corresponding matching states in the given + /// automaton. + /// + /// Note that this returns a `StreamWithStateBuilder`, which can be used to + /// add a range query to the search (see the `range` method). + /// + /// Memory requirements are the same as described on `Map::stream`. + /// + #[cfg_attr( + feature = "levenshtein", + doc = r##" +# Example + +An implementation of fuzzy search using Levenshtein automata can be used +to search sets: + +```rust +use fst::automaton::Levenshtein; +use fst::{IntoStreamer, Streamer, Set}; + +# fn main() { example().unwrap(); } +fn example() -> Result<(), Box> { + let set = Set::from_iter(vec![ + "foo", + "foob", + "foobar", + "fozb", + ]).unwrap(); + + let query = Levenshtein::new("foo", 2)?; + let mut stream = set.search_with_state(&query).into_stream(); + + let mut vs = vec![]; + while let Some((v, s)) = stream.next() { + vs.push((String::from_utf8(v.to_vec())?, s)); + } + // Currently, there isn't much interesting that you can do with the states. + assert_eq!(vs, vec![ + ("foo".to_string(), Some(183)), + ("foob".to_string(), Some(123)), + ("fozb".to_string(), Some(83)), + ]); + + Ok(()) +} +``` +"## + )] + pub fn search_with_state( + &self, + aut: A, + ) -> StreamWithStateBuilder<'_, A> { + StreamWithStateBuilder(self.0.search_with_state(aut)) + } + /// Returns the number of elements in this set. #[inline] pub fn len(&self) -> usize { @@ -551,14 +607,6 @@ where A: Automaton; impl<'s, A: Automaton> Stream<'s, A> { - /// Creates a new set stream from an fst stream. - /// - /// Not part of the public API, but useful in sibling module `map`. 
- #[doc(hidden)] - pub fn new(fst_stream: raw::Stream<'s, A>) -> Self { - Stream(fst_stream) - } - /// Convert this stream into a vector of Unicode strings. /// /// If any key is not valid UTF-8, then iteration on the stream is stopped @@ -580,11 +628,35 @@ impl<'s, A: Automaton> Stream<'s, A> { impl<'a, 's, A: Automaton> Streamer<'a> for Stream<'s, A> { type Item = &'a [u8]; - fn next(&'a mut self) -> Option { + fn next(&'a mut self) -> Option<&'a [u8]> { self.0.next().map(|(key, _)| key) } } +/// A lexicographically ordered stream of key-state pairs from a set and +/// an automaton. +/// +/// The keys are from the set while the states are from the automaton. +/// +/// The `A` type parameter corresponds to an optional automaton to filter +/// the stream. By default, no filtering is done. +/// +/// The `'m` lifetime parameter refers to the lifetime of the underlying set. +pub struct StreamWithState<'m, A = AlwaysMatch>(raw::StreamWithState<'m, A>) +where + A: Automaton; + +impl<'a, 'm, A: 'a + Automaton> Streamer<'a> for StreamWithState<'m, A> +where + A::State: Clone, +{ + type Item = (&'a [u8], A::State); + + fn next(&'a mut self) -> Option<(&'a [u8], A::State)> { + self.0.next().map(|(key, _, state)| (key, state)) + } +} + /// A builder for constructing range queries on streams. /// /// Once all bounds are set, one should call `into_stream` to get a @@ -601,22 +673,22 @@ pub struct StreamBuilder<'s, A = AlwaysMatch>(raw::StreamBuilder<'s, A>); impl<'s, A: Automaton> StreamBuilder<'s, A> { /// Specify a greater-than-or-equal-to bound. - pub fn ge>(self, bound: T) -> Self { + pub fn ge>(self, bound: T) -> StreamBuilder<'s, A> { StreamBuilder(self.0.ge(bound)) } /// Specify a greater-than bound. - pub fn gt>(self, bound: T) -> Self { + pub fn gt>(self, bound: T) -> StreamBuilder<'s, A> { StreamBuilder(self.0.gt(bound)) } /// Specify a less-than-or-equal-to bound. 
- pub fn le>(self, bound: T) -> Self { + pub fn le>(self, bound: T) -> StreamBuilder<'s, A> { StreamBuilder(self.0.le(bound)) } /// Specify a less-than bound. - pub fn lt>(self, bound: T) -> Self { + pub fn lt>(self, bound: T) -> StreamBuilder<'s, A> { StreamBuilder(self.0.lt(bound)) } } @@ -625,11 +697,79 @@ impl<'s, 'a, A: Automaton> IntoStreamer<'a> for StreamBuilder<'s, A> { type Item = &'a [u8]; type Into = Stream<'s, A>; - fn into_stream(self) -> Self::Into { + fn into_stream(self) -> Stream<'s, A> { Stream(self.0.into_stream()) } } +/// A builder for constructing range queries on streams that include automaton +/// states. +/// +/// In general, one should use `StreamBuilder` unless you have a specific need +/// for accessing the states of the underlying automaton that is being used to +/// filter this stream. +/// +/// Once all bounds are set, one should call `into_stream` to get a +/// `Stream`. +/// +/// Bounds are not additive. That is, if `ge` is called twice on the same +/// builder, then the second setting wins. +/// +/// The `A` type parameter corresponds to an optional automaton to filter +/// the stream. By default, no filtering is done. +/// +/// The `'s` lifetime parameter refers to the lifetime of the underlying set. +pub struct StreamWithStateBuilder<'s, A = AlwaysMatch>( + raw::StreamWithStateBuilder<'s, A>, +); + +impl<'s, A: Automaton> StreamWithStateBuilder<'s, A> { + /// Specify a greater-than-or-equal-to bound. + pub fn ge>( + self, + bound: T, + ) -> StreamWithStateBuilder<'s, A> { + StreamWithStateBuilder(self.0.ge(bound)) + } + + /// Specify a greater-than bound. + pub fn gt>( + self, + bound: T, + ) -> StreamWithStateBuilder<'s, A> { + StreamWithStateBuilder(self.0.gt(bound)) + } + + /// Specify a less-than-or-equal-to bound. + pub fn le>( + self, + bound: T, + ) -> StreamWithStateBuilder<'s, A> { + StreamWithStateBuilder(self.0.le(bound)) + } + + /// Specify a less-than bound. 
+ pub fn lt>( + self, + bound: T, + ) -> StreamWithStateBuilder<'s, A> { + StreamWithStateBuilder(self.0.lt(bound)) + } +} + +impl<'s, 'a, A: 'a + Automaton> IntoStreamer<'a> + for StreamWithStateBuilder<'s, A> +where + A::State: Clone, +{ + type Item = (&'a [u8], A::State); + type Into = StreamWithState<'s, A>; + + fn into_stream(self) -> StreamWithState<'s, A> { + StreamWithState(self.0.into_stream()) + } +} + /// A builder for collecting set streams on which to perform set operations. /// /// Set operations include intersection, union, difference and symmetric From b6814a4eaabff692812dedd09fe775a2cc91aae2 Mon Sep 17 00:00:00 2001 From: Andrew Gallant Date: Fri, 6 Mar 2020 08:48:38 -0500 Subject: [PATCH 18/23] crc: add CRC32C to every FST This modifies the FST builder to compute the CRC32C checksum as the FST is being constructed. It is written as the last 4 bytes of the FST. We also add a new `verify` routine on `raw::Fst` that permits callers to check whether the FST's integrity is intact. Since verification could be quite expensive for very large FSTs, we do not do this by default. We avoid a dependency on a CRC32C crate since hand-rolling it ourselves is very simple and not much code. I tried using a SIMD version of CRC32C, but I couldn't benchmark a difference. In particular, I suspect that the FST writing process is doing a lot of small writes, so the checksummer doesn't get much opportunity to checksum a lot of bytes at once. It's not quite clear how to fix this. We could use our own buffer internally, but then the caller wouldn't be able to use their own buffered reader, which is a bit weird. But not without precedent. In any case, the overhead of checksumming during construction is virtually negligible, so we don't sweat it for now. 
--- build.rs | 124 +++++++++++++++++++++++++++++++++++++ src/bytes.rs | 92 +++++++++++++++++++++++++++ src/lib.rs | 1 + src/raw/build.rs | 10 ++- src/raw/counting_writer.rs | 17 ++++- src/raw/crc32.rs | 59 ++++++++++++++++++ src/raw/crc32_table.rs | 2 + src/raw/error.rs | 13 ++++ src/raw/mod.rs | 53 ++++++++++++---- src/raw/tests.rs | 42 ++++++++++++- 10 files changed, 392 insertions(+), 21 deletions(-) create mode 100644 build.rs create mode 100644 src/bytes.rs create mode 100644 src/raw/crc32.rs create mode 100644 src/raw/crc32_table.rs diff --git a/build.rs b/build.rs new file mode 100644 index 0000000..283164f --- /dev/null +++ b/build.rs @@ -0,0 +1,124 @@ +use std::env; +use std::fs::File; +use std::io::{self, Write}; +use std::path::{Path, PathBuf}; + +const CASTAGNOLI_POLY: u32 = 0x82f63b78; + +type Result = std::result::Result>; + +fn main() { + if let Err(err) = try_main() { + panic!("{}", err); + } +} + +fn try_main() -> Result<()> { + let out_dir = match env::var_os("OUT_DIR") { + None => { + return Err(From::from("OUT_DIR environment variable not defined")) + } + Some(out_dir) => PathBuf::from(out_dir), + }; + write_tag_lookup_table(&out_dir)?; + write_crc_tables(&out_dir)?; + Ok(()) +} + +fn write_tag_lookup_table(out_dir: &Path) -> Result<()> { + let out_path = out_dir.join("tag.rs"); + let mut out = io::BufWriter::new(File::create(out_path)?); + + writeln!(out, "pub const TAG_LOOKUP_TABLE: [u16; 256] = [")?; + for b in 0u8..=255 { + writeln!(out, " {},", tag_entry(b))?; + } + writeln!(out, "];")?; + Ok(()) +} + +fn tag_entry(b: u8) -> u16 { + let b = b as u16; + match b & 0b00000011 { + 0b00 => { + let lit_len = (b >> 2) + 1; + if lit_len <= 60 { + lit_len + } else { + assert!(lit_len <= 64); + (lit_len - 60) << 11 + } + } + 0b01 => { + let len = 4 + ((b >> 2) & 0b111); + let offset = (b >> 5) & 0b111; + (1 << 11) | (offset << 8) | len + } + 0b10 => { + let len = 1 + (b >> 2); + (2 << 11) | len + } + 0b11 => { + let len = 1 + (b >> 2); + (4 << 11) | 
len + } + _ => unreachable!(), + } +} + +fn write_crc_tables(out_dir: &Path) -> Result<()> { + let out_path = out_dir.join("crc32_table.rs"); + let mut out = io::BufWriter::new(File::create(out_path)?); + + let table = make_table(CASTAGNOLI_POLY); + let table16 = make_table16(CASTAGNOLI_POLY); + + writeln!(out, "pub const TABLE: [u32; 256] = [")?; + for &x in table.iter() { + writeln!(out, " {},", x)?; + } + writeln!(out, "];\n")?; + + writeln!(out, "pub const TABLE16: [[u32; 256]; 16] = [")?; + for table in table16.iter() { + writeln!(out, " [")?; + for &x in table.iter() { + writeln!(out, " {},", x)?; + } + writeln!(out, " ],")?; + } + writeln!(out, "];")?; + + out.flush()?; + + Ok(()) +} + +fn make_table16(poly: u32) -> [[u32; 256]; 16] { + let mut tab = [[0; 256]; 16]; + tab[0] = make_table(poly); + for i in 0..256 { + let mut crc = tab[0][i]; + for j in 1..16 { + crc = (crc >> 8) ^ tab[0][crc as u8 as usize]; + tab[j][i] = crc; + } + } + tab +} + +fn make_table(poly: u32) -> [u32; 256] { + let mut tab = [0; 256]; + for i in 0u32..256u32 { + let mut crc = i; + for _ in 0..8 { + if crc & 1 == 1 { + crc = (crc >> 1) ^ poly; + } else { + crc >>= 1; + } + } + tab[i as usize] = crc; + } + tab +} diff --git a/src/bytes.rs b/src/bytes.rs new file mode 100644 index 0000000..852941c --- /dev/null +++ b/src/bytes.rs @@ -0,0 +1,92 @@ +#![allow(warnings)] + +use std::convert::TryInto; +use std::io; + +/// Read a u16 in little endian format from the beginning of the given slice. +/// This panics if the slice has length less than 2. +pub fn read_u16_le(slice: &[u8]) -> u16 { + u16::from_le_bytes(slice[..2].try_into().unwrap()) +} + +/// Read a u24 (returned as a u32 with the most significant 8 bits always set +/// to 0) in little endian format from the beginning of the given slice. This +/// panics if the slice has length less than 3. 
+pub fn read_u24_le(slice: &[u8]) -> u32 { + slice[0] as u32 | (slice[1] as u32) << 8 | (slice[2] as u32) << 16 +} + +/// Read a u32 in little endian format from the beginning of the given slice. +/// This panics if the slice has length less than 4. +pub fn read_u32_le(slice: &[u8]) -> u32 { + u32::from_le_bytes(slice[..4].try_into().unwrap()) +} + +/// Like read_u32_le, but from an io::Read implementation. If io::Read does +/// not yield at least 4 bytes, then this returns an unexpected EOF error. +pub fn io_read_u32_le(mut rdr: R) -> io::Result { + let mut buf = [0; 4]; + rdr.read_exact(&mut buf)?; + Ok(u32::from_le_bytes(buf)) +} + +/// Write a u16 in little endian format to the beginning of the given slice. +/// This panics if the slice has length less than 2. +pub fn write_u16_le(n: u16, slice: &mut [u8]) { + assert!(slice.len() >= 2); + let bytes = n.to_le_bytes(); + slice[0] = bytes[0]; + slice[1] = bytes[1]; +} + +/// Write a u24 (given as a u32 where the most significant 8 bits are ignored) +/// in little endian format to the beginning of the given slice. This panics +/// if the slice has length less than 3. +pub fn write_u24_le(n: u32, slice: &mut [u8]) { + slice[0] = n as u8; + slice[1] = (n >> 8) as u8; + slice[2] = (n >> 16) as u8; +} + +/// Write a u32 in little endian format to the beginning of the given slice. +/// This panics if the slice has length less than 4. 
+pub fn write_u32_le(n: u32, slice: &mut [u8]) { + assert!(slice.len() >= 4); + let bytes = n.to_le_bytes(); + slice[0] = bytes[0]; + slice[1] = bytes[1]; + slice[2] = bytes[2]; + slice[3] = bytes[3]; +} + +/// https://developers.google.com/protocol-buffers/docs/encoding#varints +pub fn write_varu64(data: &mut [u8], mut n: u64) -> usize { + let mut i = 0; + while n >= 0b1000_0000 { + data[i] = (n as u8) | 0b1000_0000; + n >>= 7; + i += 1; + } + data[i] = n as u8; + i + 1 +} + +/// https://developers.google.com/protocol-buffers/docs/encoding#varints +pub fn read_varu64(data: &[u8]) -> (u64, usize) { + let mut n: u64 = 0; + let mut shift: u32 = 0; + for (i, &b) in data.iter().enumerate() { + if b < 0b1000_0000 { + return match (b as u64).checked_shl(shift) { + None => (0, 0), + Some(b) => (n | b, i + 1), + }; + } + match ((b as u64) & 0b0111_1111).checked_shl(shift) { + None => return (0, 0), + Some(b) => n |= b, + } + shift += 7; + } + (0, 0) +} diff --git a/src/lib.rs b/src/lib.rs index 03048aa..0538fb3 100644 --- a/src/lib.rs +++ b/src/lib.rs @@ -311,6 +311,7 @@ pub use crate::map::{Map, MapBuilder}; pub use crate::set::{Set, SetBuilder}; pub use crate::stream::{IntoStreamer, Streamer}; +mod bytes; mod error; #[path = "automaton/mod.rs"] mod inner_automaton; diff --git a/src/raw/build.rs b/src/raw/build.rs index ddceaa4..f4e6495 100644 --- a/src/raw/build.rs +++ b/src/raw/build.rs @@ -1,4 +1,4 @@ -use std::io::{self, Write}; +use std::io; use byteorder::{LittleEndian, WriteBytesExt}; @@ -219,8 +219,12 @@ impl Builder { let root_addr = self.compile(&root_node)?; self.wtr.write_u64::(self.len as u64)?; self.wtr.write_u64::(root_addr as u64)?; - self.wtr.flush()?; - Ok(self.wtr.into_inner()) + + let sum = self.wtr.masked_checksum(); + let mut wtr = self.wtr.into_inner(); + wtr.write_u32::(sum)?; + wtr.flush()?; + Ok(wtr) } fn insert_output(&mut self, bs: B, out: Option) -> Result<()> diff --git a/src/raw/counting_writer.rs b/src/raw/counting_writer.rs index 
e0e336d..70d9315 100644 --- a/src/raw/counting_writer.rs +++ b/src/raw/counting_writer.rs @@ -1,15 +1,18 @@ use std::io; -/// Wraps any writer and counts bytes written. +use crate::raw::crc32::CheckSummer; + +/// Wraps any writer that counts and checksums bytes written. pub struct CountingWriter { wtr: W, cnt: u64, + summer: CheckSummer, } impl CountingWriter { /// Wrap the given writer with a counter. pub fn new(wtr: W) -> CountingWriter { - CountingWriter { wtr, cnt: 0 } + CountingWriter { wtr, cnt: 0, summer: CheckSummer::new() } } /// Return the total number of bytes written to the underlying writer. @@ -20,6 +23,15 @@ impl CountingWriter { self.cnt } + /// Returns the masked CRC32C checksum of the bytes written so far. + /// + /// This "masked" checksum is the same one used by the Snappy frame format. + /// Masking is supposed to make the checksum robust with respect to data + /// that contains the checksum itself. + pub fn masked_checksum(&self) -> u32 { + self.summer.masked() + } + /// Unwrap the counting writer and return the inner writer. pub fn into_inner(self) -> W { self.wtr @@ -33,6 +45,7 @@ impl CountingWriter { impl io::Write for CountingWriter { fn write(&mut self, buf: &[u8]) -> io::Result { + self.summer.update(buf); let n = self.wtr.write(buf)?; self.cnt += n as u64; Ok(n) diff --git a/src/raw/crc32.rs b/src/raw/crc32.rs new file mode 100644 index 0000000..06b4442 --- /dev/null +++ b/src/raw/crc32.rs @@ -0,0 +1,59 @@ +use crate::bytes; +use crate::raw::crc32_table::{TABLE, TABLE16}; + +/// Provides a simple API to perform a rolling CRC32C checksum. +#[derive(Clone, Copy, Debug)] +pub struct CheckSummer { + sum: u32, +} + +impl CheckSummer { + /// Create a new checksummer that can compute CRC32C checksums on arbitrary + /// bytes. + pub fn new() -> CheckSummer { + CheckSummer { sum: 0 } + } + + /// Returns the "masked" CRC32 checksum of the data so far using the + /// Castagnoli polynomial. 
This "masked" checksum is the same one used + /// by the Snappy frame format. Masking is supposed to make the checksum + /// robust with respect to data that contains the checksum itself. + pub fn masked(&self) -> u32 { + let sum = self.sum; + (sum.wrapping_shr(15) | sum.wrapping_shl(17)).wrapping_add(0xA282EAD8) + } + + /// Update the current checksum with the checksum for the given bytes. + pub fn update(&mut self, buf: &[u8]) { + self.sum = crc32c_slice16(self.sum, buf); + } +} + +/// Returns the CRC32 checksum of `buf` using the Castagnoli polynomial. +fn crc32c_slice16(prev: u32, mut buf: &[u8]) -> u32 { + let mut crc: u32 = !prev; + while buf.len() >= 16 { + crc ^= bytes::read_u32_le(buf); + crc = TABLE16[0][buf[15] as usize] + ^ TABLE16[1][buf[14] as usize] + ^ TABLE16[2][buf[13] as usize] + ^ TABLE16[3][buf[12] as usize] + ^ TABLE16[4][buf[11] as usize] + ^ TABLE16[5][buf[10] as usize] + ^ TABLE16[6][buf[9] as usize] + ^ TABLE16[7][buf[8] as usize] + ^ TABLE16[8][buf[7] as usize] + ^ TABLE16[9][buf[6] as usize] + ^ TABLE16[10][buf[5] as usize] + ^ TABLE16[11][buf[4] as usize] + ^ TABLE16[12][(crc >> 24) as u8 as usize] + ^ TABLE16[13][(crc >> 16) as u8 as usize] + ^ TABLE16[14][(crc >> 8) as u8 as usize] + ^ TABLE16[15][(crc) as u8 as usize]; + buf = &buf[16..]; + } + for &b in buf { + crc = TABLE[((crc as u8) ^ b) as usize] ^ (crc >> 8); + } + !crc +} diff --git a/src/raw/crc32_table.rs b/src/raw/crc32_table.rs new file mode 100644 index 0000000..7821b5d --- /dev/null +++ b/src/raw/crc32_table.rs @@ -0,0 +1,2 @@ +// Generated by build.rs. +include!(concat!(env!("OUT_DIR"), "/crc32_table.rs")); diff --git a/src/raw/error.rs b/src/raw/error.rs index d63fe04..6d2bf05 100644 --- a/src/raw/error.rs +++ b/src/raw/error.rs @@ -30,6 +30,14 @@ pub enum Error { /// The number of bytes given to the FST constructor. size: usize, }, + /// An error that is returned if verification of an FST fails because of a + /// checksum mismatch. 
+ ChecksumMismatch { + /// The checksum that was expected. + expected: u32, + /// The checksum that was actually computed. + got: u32, + }, /// A duplicate key was inserted into a finite state transducer, which is /// not allowed. DuplicateKey { @@ -82,6 +90,11 @@ Error opening FST with size {} bytes: An unknown error occurred. This \ usually means you're trying to read data that isn't actually an encoded FST.", size ), + ChecksumMismatch { expected, got } => write!( + f, + "FST verification failed: expected checksum of {} but got {}", + expected, got, + ), DuplicateKey { ref got } => write!( f, "Error inserting duplicate key: '{}'.", diff --git a/src/raw/mod.rs b/src/raw/mod.rs index 67386cd..e6f72d5 100644 --- a/src/raw/mod.rs +++ b/src/raw/mod.rs @@ -39,6 +39,8 @@ pub use self::ops::{ mod build; mod common_inputs; mod counting_writer; +mod crc32; +mod crc32_table; mod error; mod node; mod ops; @@ -279,6 +281,7 @@ struct Meta { root_addr: CompiledAddr, ty: FstType, len: usize, + checksum: u32, } impl Fst> { @@ -329,16 +332,19 @@ impl Fst> { impl> Fst { /// Creates a transducer from its representation as a raw byte sequence. /// - /// Note that this operation is very cheap (no allocations and no copies). + /// This operation is intentionally very cheap (no allocations and no + /// copies). In particular, no verification on the integrity of the + /// FST is performed. Callers may opt into integrity checks via the + /// [`Fst::verify`](struct.Fst.html#method.verify) method. /// /// The fst must have been written with a compatible finite state /// transducer builder (`Builder` qualifies). If the format is invalid or /// if there is a mismatch between the API version of this library and the /// fst, then an error is returned. 
#[inline] - pub fn new(data: D) -> Result { + pub fn new(data: D) -> Result> { let bytes = data.as_ref(); - if bytes.len() < 32 { + if bytes.len() < 36 { return Err(Error::Format { size: bytes.len() }.into()); } // The read_u64 unwraps below are OK because they can never fail. @@ -353,20 +359,23 @@ impl> Fst { ); } let ty = (&bytes[8..]).read_u64::().unwrap(); + let checksum = + (&bytes[bytes.len() - 4..]).read_u32::().unwrap(); let root_addr = { - let mut last = &bytes[bytes.len() - 8..]; + let mut last = &bytes[bytes.len() - 12..]; u64_to_usize(last.read_u64::().unwrap()) }; let len = { - let mut last2 = &bytes[bytes.len() - 16..]; + let mut last2 = &bytes[bytes.len() - 20..]; u64_to_usize(last2.read_u64::().unwrap()) }; // The root node is always the last node written, so its address should // be near the end. After the root node is written, we still have to - // write the root *address* and the number of keys in the FST. - // That's 16 bytes. The extra byte comes from the fact that the root - // address points to the last byte in the root node, rather than the - // byte immediately following the root node. + // write the root *address* and the number of keys in the FST, along + // with the checksum. That's 20 bytes. The extra byte used below (21 + // and not 20) comes from the fact that the root address points to + // the last byte in the root node, rather than the byte immediately + // following the root node. // // If this check passes, it is still possible that the FST is invalid // but probably unlikely. If this check reports a false positive, then @@ -378,13 +387,13 @@ impl> Fst { // has a root node that is empty and final, which means it has the // special address `0`. In that case, the FST is the smallest it can // be: the version, type, root address and number of nodes. That's - // 32 bytes (8 byte u64 each). - if (root_addr == EMPTY_ADDRESS && bytes.len() != 32) - && root_addr + 17 != bytes.len() + // 36 bytes (8 byte u64 each). 
+ if (root_addr == EMPTY_ADDRESS && bytes.len() != 36) + && root_addr + 21 != bytes.len() { return Err(Error::Format { size: bytes.len() }.into()); } - let meta = Meta { version, root_addr, ty, len }; + let meta = Meta { version, root_addr, ty, len, checksum }; Ok(Fst { meta, data }) } @@ -486,6 +495,24 @@ impl> Fst { self.as_ref().size() } + /// Attempts to verify this FST by computing its checksum. + /// + /// This will scan over all of the bytes in the underlying FST, so this + /// may be an expensive operation depending on the size of the FST. + #[inline] + pub fn verify(&self) -> Result<()> { + use crate::raw::crc32::CheckSummer; + + let expected = self.as_ref().meta.checksum; + let mut summer = CheckSummer::new(); + summer.update(&self.as_bytes()[..self.as_bytes().len() - 4]); + let got = summer.masked(); + if expected == got { + return Ok(()); + } + Err(Error::ChecksumMismatch { expected, got }.into()) + } + /// Creates a new fst operation with this fst added to it. /// /// The `OpBuilder` type can be used to add additional fst streams diff --git a/src/raw/tests.rs b/src/raw/tests.rs index c32a217..d681f0b 100644 --- a/src/raw/tests.rs +++ b/src/raw/tests.rs @@ -206,7 +206,7 @@ fn fst_map_100000_lengths() { #[test] fn invalid_version() { - match Fst::new(vec![0; 32]) { + match Fst::new(vec![0; 36]) { Err(Error::Fst(raw::Error::Version { got, .. })) => assert_eq!(got, 0), Err(err) => panic!("expected version error, got {:?}", err), Ok(_) => panic!("expected version error, got FST"), @@ -217,7 +217,7 @@ fn invalid_version() { fn invalid_version_crate_too_old() { use byteorder::{ByteOrder, LittleEndian}; - let mut buf = vec![0; 32]; + let mut buf = vec![0; 36]; LittleEndian::write_u64(&mut buf, VERSION + 1); match Fst::new(buf) { Err(Error::Fst(raw::Error::Version { got, .. 
})) => { @@ -514,7 +514,7 @@ fn bytes_written() { let counted_len = bfst1.bytes_written(); let bytes = bfst1.into_inner().unwrap(); let fst1_len = bytes.len() as u64; - let footer_size = 24; + let footer_size = 28; assert_eq!(counted_len + footer_size, fst1_len); } @@ -553,3 +553,39 @@ fn get_key_words_discontiguous() { assert_eq!(map.get_key(value), Some(key)); } } + +#[test] +fn verify_ok_nonempty() { + let words: Vec<(Vec, u64)> = TEXT + .lines() + .enumerate() + .map(|(i, line)| (line.as_bytes().to_vec(), i as u64 * 2)) + .collect(); + let map = fst_map(words.clone()); + assert!(map.verify().is_ok()); +} + +#[test] +fn verify_ok_empty() { + let map = fst_map(Vec::<(&str, u64)>::new()); + assert!(map.verify().is_ok()); +} + +#[test] +fn verify_err() { + let mut b = Builder::memory(); + b.add(b"bar").unwrap(); + b.add(b"baz").unwrap(); + let mut bytes = b.into_inner().unwrap(); + + { + let fst = Fst::new(&bytes).unwrap(); + assert!(fst.verify().is_ok()); + } + + bytes[17] = b'\xFF'; + { + let fst = Fst::new(&bytes).unwrap(); + assert!(fst.verify().is_err()); + } +} From 7f8edc636d4ac450c411471cc498b259f0888eb9 Mon Sep 17 00:00:00 2001 From: Andrew Gallant Date: Sat, 7 Mar 2020 12:39:09 -0500 Subject: [PATCH 19/23] deps: remove byteorder Now that std provides safe APIs for converting between raw bytes and integers, we can use those instead. We still define a few utility functions for convenience though. The packing code is definitely not as ideally efficient as what byteorder does, since we use safe code. But I was not able to observe a difference in benchmarks. fst now has no required dependencies! 
--- Cargo.toml | 1 - src/bytes.rs | 169 ++++++++++++++++++++++++++++++----------------- src/raw/build.rs | 13 ++-- src/raw/mod.rs | 19 +++--- src/raw/node.rs | 54 ++++++++------- src/raw/pack.rs | 74 --------------------- src/raw/tests.rs | 4 +- 7 files changed, 152 insertions(+), 182 deletions(-) delete mode 100644 src/raw/pack.rs diff --git a/Cargo.toml b/Cargo.toml index e6b780c..b135a67 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -23,7 +23,6 @@ default = [] levenshtein = ["utf8-ranges"] [dependencies] -byteorder = "1.3.4" utf8-ranges = { version = "1.0.4", optional = true } [dev-dependencies] diff --git a/src/bytes.rs b/src/bytes.rs index 852941c..ff5a8e3 100644 --- a/src/bytes.rs +++ b/src/bytes.rs @@ -1,92 +1,139 @@ -#![allow(warnings)] - use std::convert::TryInto; use std::io; -/// Read a u16 in little endian format from the beginning of the given slice. -/// This panics if the slice has length less than 2. -pub fn read_u16_le(slice: &[u8]) -> u16 { - u16::from_le_bytes(slice[..2].try_into().unwrap()) -} - -/// Read a u24 (returned as a u32 with the most significant 8 bits always set -/// to 0) in little endian format from the beginning of the given slice. This -/// panics if the slice has length less than 3. -pub fn read_u24_le(slice: &[u8]) -> u32 { - slice[0] as u32 | (slice[1] as u32) << 8 | (slice[2] as u32) << 16 -} - /// Read a u32 in little endian format from the beginning of the given slice. /// This panics if the slice has length less than 4. pub fn read_u32_le(slice: &[u8]) -> u32 { u32::from_le_bytes(slice[..4].try_into().unwrap()) } -/// Like read_u32_le, but from an io::Read implementation. If io::Read does -/// not yield at least 4 bytes, then this returns an unexpected EOF error. -pub fn io_read_u32_le(mut rdr: R) -> io::Result { - let mut buf = [0; 4]; - rdr.read_exact(&mut buf)?; - Ok(u32::from_le_bytes(buf)) +/// Read a u64 in little endian format from the beginning of the given slice. +/// This panics if the slice has length less than 8. 
+pub fn read_u64_le(slice: &[u8]) -> u64 { + u64::from_le_bytes(slice[..8].try_into().unwrap()) } -/// Write a u16 in little endian format to the beginning of the given slice. -/// This panics if the slice has length less than 2. -pub fn write_u16_le(n: u16, slice: &mut [u8]) { - assert!(slice.len() >= 2); +/// Write a u32 in little endian format to the beginning of the given slice. +/// This panics if the slice has length less than 4. +pub fn write_u32_le(n: u32, slice: &mut [u8]) { + assert!(slice.len() >= 4); let bytes = n.to_le_bytes(); slice[0] = bytes[0]; slice[1] = bytes[1]; + slice[2] = bytes[2]; + slice[3] = bytes[3]; } -/// Write a u24 (given as a u32 where the most significant 8 bits are ignored) -/// in little endian format to the beginning of the given slice. This panics -/// if the slice has length less than 3. -pub fn write_u24_le(n: u32, slice: &mut [u8]) { - slice[0] = n as u8; - slice[1] = (n >> 8) as u8; - slice[2] = (n >> 16) as u8; +/// Like write_u32_le, but to an io::Write implementation. If every byte could +/// not be writen, then this returns an error. +pub fn io_write_u32_le(n: u32, mut wtr: W) -> io::Result<()> { + let mut buf = [0; 4]; + write_u32_le(n, &mut buf); + wtr.write_all(&buf) } -/// Write a u32 in little endian format to the beginning of the given slice. -/// This panics if the slice has length less than 4. -pub fn write_u32_le(n: u32, slice: &mut [u8]) { - assert!(slice.len() >= 4); +/// Write a u64 in little endian format to the beginning of the given slice. +/// This panics if the slice has length less than 8. +pub fn write_u64_le(n: u64, slice: &mut [u8]) { + assert!(slice.len() >= 8); let bytes = n.to_le_bytes(); slice[0] = bytes[0]; slice[1] = bytes[1]; slice[2] = bytes[2]; slice[3] = bytes[3]; + slice[4] = bytes[4]; + slice[5] = bytes[5]; + slice[6] = bytes[6]; + slice[7] = bytes[7]; +} + +/// Like write_u64_le, but to an io::Write implementation. If every byte could +/// not be writen, then this returns an error. 
+pub fn io_write_u64_le(n: u64, mut wtr: W) -> io::Result<()> { + let mut buf = [0; 8]; + write_u64_le(n, &mut buf); + wtr.write_all(&buf) } -/// https://developers.google.com/protocol-buffers/docs/encoding#varints -pub fn write_varu64(data: &mut [u8], mut n: u64) -> usize { - let mut i = 0; - while n >= 0b1000_0000 { - data[i] = (n as u8) | 0b1000_0000; - n >>= 7; - i += 1; +/// pack_uint packs the given integer in the smallest number of bytes possible, +/// and writes it to the given writer. The number of bytes written is returned +/// on success. +pub fn pack_uint(wtr: W, n: u64) -> io::Result { + let nbytes = pack_size(n); + pack_uint_in(wtr, n, nbytes).map(|_| nbytes) +} + +/// pack_uint_in is like pack_uint, but always uses the number of bytes given +/// to pack the number given. +/// +/// `nbytes` must be >= pack_size(n) and <= 8, where `pack_size(n)` is the +/// smallest number of bytes that can store the integer given. +pub fn pack_uint_in( + mut wtr: W, + mut n: u64, + nbytes: u8, +) -> io::Result<()> { + assert!(1 <= nbytes && nbytes <= 8); + let mut buf = [0u8; 8]; + for i in 0..nbytes { + buf[i as usize] = n as u8; + n = n >> 8; } - data[i] = n as u8; - i + 1 + wtr.write_all(&buf[..nbytes as usize])?; + Ok(()) } -/// https://developers.google.com/protocol-buffers/docs/encoding#varints -pub fn read_varu64(data: &[u8]) -> (u64, usize) { - let mut n: u64 = 0; - let mut shift: u32 = 0; - for (i, &b) in data.iter().enumerate() { - if b < 0b1000_0000 { - return match (b as u64).checked_shl(shift) { - None => (0, 0), - Some(b) => (n | b, i + 1), - }; - } - match ((b as u64) & 0b0111_1111).checked_shl(shift) { - None => return (0, 0), - Some(b) => n |= b, +/// unpack_uint is the dual of pack_uint. It unpacks the integer at the current +/// position in `slice` after reading `nbytes` bytes. +/// +/// `nbytes` must be >= 1 and <= 8. 
+pub fn unpack_uint(slice: &[u8], nbytes: u8) -> u64 { + assert!(1 <= nbytes && nbytes <= 8); + + let mut n = 0; + for (i, &b) in slice[..nbytes as usize].iter().enumerate() { + n = n | ((b as u64) << (8 * i)); + } + n +} + +/// pack_size returns the smallest number of bytes that can encode `n`. +pub fn pack_size(n: u64) -> u8 { + if n < 1 << 8 { + 1 + } else if n < 1 << 16 { + 2 + } else if n < 1 << 24 { + 3 + } else if n < 1 << 32 { + 4 + } else if n < 1 << 40 { + 5 + } else if n < 1 << 48 { + 6 + } else if n < 1 << 56 { + 7 + } else { + 8 + } +} + +#[cfg(test)] +mod tests { + use super::*; + use quickcheck::{QuickCheck, StdGen}; + use std::io; + + #[test] + fn prop_pack_in_out() { + fn p(num: u64) -> bool { + let mut buf = io::Cursor::new(vec![]); + let size = pack_uint(&mut buf, num).unwrap(); + buf.set_position(0); + num == unpack_uint(buf.get_ref(), size) } - shift += 7; + QuickCheck::new() + .gen(StdGen::new(::rand::thread_rng(), 257)) // pick byte boundary + .quickcheck(p as fn(u64) -> bool); } - (0, 0) } diff --git a/src/raw/build.rs b/src/raw/build.rs index f4e6495..5dc33b0 100644 --- a/src/raw/build.rs +++ b/src/raw/build.rs @@ -1,7 +1,6 @@ use std::io; -use byteorder::{LittleEndian, WriteBytesExt}; - +use crate::bytes; use crate::error::Result; use crate::raw::counting_writer::CountingWriter; use crate::raw::error::Error; @@ -125,9 +124,9 @@ impl Builder { // Don't allow any nodes to have address 0-7. We use these to encode // the API version. We also use addresses `0` and `1` as special // sentinel values, so they should never correspond to a real node. - wtr.write_u64::(VERSION)?; + bytes::io_write_u64_le(VERSION, &mut wtr)?; // Similarly for 8-15 for the fst type. 
- wtr.write_u64::(ty)?; + bytes::io_write_u64_le(ty, &mut wtr)?; Ok(Builder { wtr, unfinished: UnfinishedNodes::new(), @@ -217,12 +216,12 @@ impl Builder { self.compile_from(0)?; let root_node = self.unfinished.pop_root(); let root_addr = self.compile(&root_node)?; - self.wtr.write_u64::(self.len as u64)?; - self.wtr.write_u64::(root_addr as u64)?; + bytes::io_write_u64_le(self.len as u64, &mut self.wtr)?; + bytes::io_write_u64_le(root_addr as u64, &mut self.wtr)?; let sum = self.wtr.masked_checksum(); let mut wtr = self.wtr.into_inner(); - wtr.write_u32::(sum)?; + bytes::io_write_u32_le(sum, &mut wtr)?; wtr.flush()?; Ok(wtr) } diff --git a/src/raw/mod.rs b/src/raw/mod.rs index e6f72d5..11e2bd9 100644 --- a/src/raw/mod.rs +++ b/src/raw/mod.rs @@ -21,9 +21,8 @@ Most of the rest of the types are streams from set operations. use std::cmp; use std::fmt; -use byteorder::{LittleEndian, ReadBytesExt}; - use crate::automaton::{AlwaysMatch, Automaton}; +use crate::bytes; use crate::error::Result; use crate::stream::{IntoStreamer, Streamer}; @@ -44,7 +43,6 @@ mod crc32_table; mod error; mod node; mod ops; -mod pack; mod registry; mod registry_minimal; #[cfg(test)] @@ -352,22 +350,21 @@ impl> Fst { // unexpected EOF. However, we are reading from a byte slice (no // IO errors possible) and we've confirmed the byte slice is at least // N bytes (no unexpected EOF). 
- let version = (&bytes[0..]).read_u64::().unwrap(); + let version = bytes::read_u64_le(&bytes); if version == 0 || version > VERSION { return Err( Error::Version { expected: VERSION, got: version }.into() ); } - let ty = (&bytes[8..]).read_u64::().unwrap(); - let checksum = - (&bytes[bytes.len() - 4..]).read_u32::().unwrap(); + let ty = bytes::read_u64_le(&bytes[8..]); + let checksum = bytes::read_u32_le(&bytes[bytes.len() - 4..]); let root_addr = { - let mut last = &bytes[bytes.len() - 12..]; - u64_to_usize(last.read_u64::().unwrap()) + let last = &bytes[bytes.len() - 12..]; + u64_to_usize(bytes::read_u64_le(last)) }; let len = { - let mut last2 = &bytes[bytes.len() - 20..]; - u64_to_usize(last2.read_u64::().unwrap()) + let last2 = &bytes[bytes.len() - 20..]; + u64_to_usize(bytes::read_u64_le(last2)) }; // The root node is always the last node written, so its address should // be near the end. After the root node is written, we still have to diff --git a/src/raw/node.rs b/src/raw/node.rs index 989d6ac..2bc8dd3 100644 --- a/src/raw/node.rs +++ b/src/raw/node.rs @@ -3,11 +3,9 @@ use std::fmt; use std::io; use std::ops::Range; -use byteorder::WriteBytesExt; - +use crate::bytes; use crate::raw::build::BuilderNode; use crate::raw::common_inputs::{COMMON_INPUTS, COMMON_INPUTS_INV}; -use crate::raw::pack::{pack_size, pack_uint, pack_uint_in, unpack_uint}; use crate::raw::{ u64_to_usize, CompiledAddr, Output, Transition, EMPTY_ADDRESS, }; @@ -325,9 +323,10 @@ impl StateOneTransNext { let mut state = StateOneTransNext::new(); state.set_common_input(input); if state.common_input().is_none() { - wtr.write_u8(input)?; + wtr.write_all(&[input])?; } - wtr.write_u8(state.0).map_err(From::from) + wtr.write_all(&[state.0])?; + Ok(()) } #[inline(always)] @@ -381,20 +380,21 @@ impl StateOneTrans { ) -> io::Result<()> { let out = trans.out.value(); let output_pack_size = - if out == 0 { 0 } else { pack_uint(&mut wtr, out)? 
}; + if out == 0 { 0 } else { bytes::pack_uint(&mut wtr, out)? }; let trans_pack_size = pack_delta(&mut wtr, addr, trans.addr)?; let mut pack_sizes = PackSizes::new(); pack_sizes.set_output_pack_size(output_pack_size); pack_sizes.set_transition_pack_size(trans_pack_size); - wtr.write_u8(pack_sizes.encode())?; + wtr.write_all(&[pack_sizes.encode()])?; let mut state = StateOneTrans::new(); state.set_common_input(trans.inp); if state.common_input().is_none() { - wtr.write_u8(trans.inp)?; + wtr.write_all(&[trans.inp])?; } - wtr.write_u8(state.0).map_err(From::from) + wtr.write_all(&[state.0])?; + Ok(()) } fn new() -> Self { @@ -454,7 +454,7 @@ impl StateOneTrans { - self.input_len() - 1 // pack size - tsize - osize; - Output::new(unpack_uint(&node.data[i..], osize as u8)) + Output::new(bytes::unpack_uint(&node.data[i..], osize as u8)) } #[inline(always)] @@ -477,11 +477,11 @@ impl StateAnyTrans { assert!(node.trans.len() <= 256); let mut tsize = 0; - let mut osize = pack_size(node.final_output.value()); + let mut osize = bytes::pack_size(node.final_output.value()); let mut any_outs = !node.final_output.is_zero(); for t in &node.trans { tsize = cmp::max(tsize, pack_delta_size(addr, t.addr)); - osize = cmp::max(osize, pack_size(t.out.value())); + osize = cmp::max(osize, bytes::pack_size(t.out.value())); any_outs = any_outs || !t.out.is_zero(); } @@ -499,17 +499,21 @@ impl StateAnyTrans { if any_outs { if node.is_final { - pack_uint_in(&mut wtr, node.final_output.value(), osize)?; + bytes::pack_uint_in( + &mut wtr, + node.final_output.value(), + osize, + )?; } for t in node.trans.iter().rev() { - pack_uint_in(&mut wtr, t.out.value(), osize)?; + bytes::pack_uint_in(&mut wtr, t.out.value(), osize)?; } } for t in node.trans.iter().rev() { pack_delta_in(&mut wtr, addr, t.addr, tsize)?; } for t in node.trans.iter().rev() { - wtr.write_u8(t.inp)?; + wtr.write_all(&[t.inp])?; } if node.trans.len() > TRANS_INDEX_THRESHOLD { // A value of 255 indicates that no transition exists 
for the byte @@ -523,18 +527,19 @@ impl StateAnyTrans { wtr.write_all(&index)?; } - wtr.write_u8(pack_sizes.encode())?; + wtr.write_all(&[pack_sizes.encode()])?; if state.state_ntrans().is_none() { if node.trans.len() == 256 { // 256 can't be represented in a u8, so we abuse the fact that // the # of transitions can never be 1 here, since 1 is always // encoded in the state byte. - wtr.write_u8(1)?; + wtr.write_all(&[1])?; } else { - wtr.write_u8(node.trans.len() as u8)?; + wtr.write_all(&[node.trans.len() as u8])?; } } - wtr.write_u8(state.0).map_err(From::from) + wtr.write_all(&[state.0])?; + Ok(()) } #[inline(always)] @@ -638,7 +643,7 @@ impl StateAnyTrans { - self.total_trans_size(version, sizes, ntrans) - (ntrans * osize) // output values - osize; // the desired output value - Output::new(unpack_uint(&data[at..], osize as u8)) + Output::new(bytes::unpack_uint(&data[at..], osize as u8)) } #[inline(always)] @@ -720,7 +725,7 @@ impl StateAnyTrans { - self.total_trans_size(node.version, node.sizes, node.ntrans) - (i * osize) // the previous outputs - osize; // the desired output value - Output::new(unpack_uint(&node.data[at..], osize as u8)) + Output::new(bytes::unpack_uint(&node.data[at..], osize as u8)) } } @@ -840,7 +845,7 @@ fn pack_delta_in( } else { node_addr - trans_addr }; - pack_uint_in(wtr, delta_addr as u64, nbytes) + bytes::pack_uint_in(wtr, delta_addr as u64, nbytes) } fn pack_delta_size(node_addr: CompiledAddr, trans_addr: CompiledAddr) -> u8 { @@ -849,16 +854,15 @@ fn pack_delta_size(node_addr: CompiledAddr, trans_addr: CompiledAddr) -> u8 { } else { node_addr - trans_addr }; - pack_size(delta_addr as u64) + bytes::pack_size(delta_addr as u64) } -#[inline(always)] fn unpack_delta( slice: &[u8], trans_pack_size: usize, node_addr: usize, ) -> CompiledAddr { - let delta = unpack_uint(slice, trans_pack_size as u8); + let delta = bytes::unpack_uint(slice, trans_pack_size as u8); let delta_addr = u64_to_usize(delta); if delta_addr == EMPTY_ADDRESS { 
EMPTY_ADDRESS diff --git a/src/raw/pack.rs b/src/raw/pack.rs deleted file mode 100644 index a5926a5..0000000 --- a/src/raw/pack.rs +++ /dev/null @@ -1,74 +0,0 @@ -use std::io; - -use byteorder::{ByteOrder, LittleEndian, WriteBytesExt}; - -/// pack_uint packs the given integer in the smallest number of bytes possible, -/// and writes it to the given writer. The number of bytes written is returned -/// on success. -pub fn pack_uint(wtr: W, n: u64) -> io::Result { - let nbytes = pack_size(n); - pack_uint_in(wtr, n, nbytes).map(|_| nbytes) -} - -/// pack_uint_in is like pack_uint, but always uses the number of bytes given -/// to pack the number given. -/// -/// `nbytes` must be >= pack_size(n) and <= 8, where `pack_size(n)` is the -/// smallest number of bytes that can store the integer given. -pub fn pack_uint_in( - mut wtr: W, - n: u64, - nbytes: u8, -) -> io::Result<()> { - wtr.write_uint::(n, nbytes as usize).map_err(From::from) -} - -/// unpack_uint is the dual of pack_uint. It unpacks the integer at the current -/// position in `slice` after reading `nbytes` bytes. -/// -/// `nbytes` must be >= 1 and <= 8. -#[inline(always)] -pub fn unpack_uint(slice: &[u8], nbytes: u8) -> u64 { - LittleEndian::read_uint(slice, nbytes as usize) -} - -/// pack_size returns the smallest number of bytes that can encode `n`. 
-pub fn pack_size(n: u64) -> u8 { - if n < 1 << 8 { - 1 - } else if n < 1 << 16 { - 2 - } else if n < 1 << 24 { - 3 - } else if n < 1 << 32 { - 4 - } else if n < 1 << 40 { - 5 - } else if n < 1 << 48 { - 6 - } else if n < 1 << 56 { - 7 - } else { - 8 - } -} - -#[cfg(test)] -mod tests { - use super::*; - use quickcheck::{QuickCheck, StdGen}; - use std::io; - - #[test] - fn prop_pack_in_out() { - fn p(num: u64) -> bool { - let mut buf = io::Cursor::new(vec![]); - let size = pack_uint(&mut buf, num).unwrap(); - buf.set_position(0); - num == unpack_uint(buf.get_ref(), size) - } - QuickCheck::new() - .gen(StdGen::new(::rand::thread_rng(), 257)) // pick byte boundary - .quickcheck(p as fn(u64) -> bool); - } -} diff --git a/src/raw/tests.rs b/src/raw/tests.rs index d681f0b..c9e77a5 100644 --- a/src/raw/tests.rs +++ b/src/raw/tests.rs @@ -215,10 +215,8 @@ fn invalid_version() { #[test] fn invalid_version_crate_too_old() { - use byteorder::{ByteOrder, LittleEndian}; - let mut buf = vec![0; 36]; - LittleEndian::write_u64(&mut buf, VERSION + 1); + crate::bytes::write_u64_le(VERSION + 1, &mut buf); match Fst::new(buf) { Err(Error::Fst(raw::Error::Version { got, .. 
})) => { assert_eq!(got, VERSION + 1); From 3972944bcde97570b978bad1c0c31110fbe56103 Mon Sep 17 00:00:00 2001 From: Andrew Gallant Date: Sat, 7 Mar 2020 13:05:53 -0500 Subject: [PATCH 20/23] polish: get rid of hack used before pub(crate) --- src/raw/mod.rs | 3 +- src/raw/node.rs | 139 ++++++++++++++++++++++++------------------------ 2 files changed, 70 insertions(+), 72 deletions(-) diff --git a/src/raw/mod.rs b/src/raw/mod.rs index 11e2bd9..d56f9b3 100644 --- a/src/raw/mod.rs +++ b/src/raw/mod.rs @@ -28,7 +28,6 @@ use crate::stream::{IntoStreamer, Streamer}; pub use self::build::Builder; pub use self::error::Error; -use self::node::node_new; pub use self::node::{Node, Transitions}; pub use self::ops::{ Difference, IndexedValue, Intersection, OpBuilder, SymmetricDifference, @@ -708,7 +707,7 @@ impl<'f> FstRef<'f> { } fn node(&self, addr: CompiledAddr) -> Node<'f> { - node_new(self.meta.version, addr, self.as_bytes()) + Node::new(self.meta.version, addr, self.as_bytes()) } fn to_vec(&self) -> Vec { diff --git a/src/raw/node.rs b/src/raw/node.rs index 2bc8dd3..c0c4e41 100644 --- a/src/raw/node.rs +++ b/src/raw/node.rs @@ -48,77 +48,76 @@ impl<'f> fmt::Debug for Node<'f> { } } -/// Creates a new note at the address given. -/// -/// `data` should be a slice to an entire FST. -/// -/// This is a free function so that we can export it to parent modules, but -/// not to consumers of this crate. -#[inline(always)] -pub fn node_new(version: u64, addr: CompiledAddr, data: &[u8]) -> Node<'_> { - use self::State::*; - let state = State::new(data, addr); - match state { - EmptyFinal => Node { - data: &[], - version, - state: State::EmptyFinal, - start: EMPTY_ADDRESS, - end: EMPTY_ADDRESS, - is_final: true, - ntrans: 0, - sizes: PackSizes::new(), - final_output: Output::zero(), - }, - OneTransNext(s) => { - let data = &data[..addr + 1]; - Node { - data, +impl<'f> Node<'f> { + /// Creates a new note at the address given. + /// + /// `data` should be a slice to an entire FST. 
+ pub(crate) fn new( + version: u64, + addr: CompiledAddr, + data: &[u8], + ) -> Node<'_> { + use self::State::*; + let state = State::new(data, addr); + match state { + EmptyFinal => Node { + data: &[], version, - state, - start: addr, - end: s.end_addr(data), - is_final: false, + state: State::EmptyFinal, + start: EMPTY_ADDRESS, + end: EMPTY_ADDRESS, + is_final: true, + ntrans: 0, sizes: PackSizes::new(), - ntrans: 1, final_output: Output::zero(), + }, + OneTransNext(s) => { + let data = &data[..addr + 1]; + Node { + data, + version, + state, + start: addr, + end: s.end_addr(data), + is_final: false, + sizes: PackSizes::new(), + ntrans: 1, + final_output: Output::zero(), + } } - } - OneTrans(s) => { - let data = &data[..addr + 1]; - let sizes = s.sizes(data); - Node { - data, - version, - state, - start: addr, - end: s.end_addr(data, sizes), - is_final: false, - ntrans: 1, - sizes, - final_output: Output::zero(), + OneTrans(s) => { + let data = &data[..addr + 1]; + let sizes = s.sizes(data); + Node { + data, + version, + state, + start: addr, + end: s.end_addr(data, sizes), + is_final: false, + ntrans: 1, + sizes, + final_output: Output::zero(), + } } - } - AnyTrans(s) => { - let data = &data[..addr + 1]; - let sizes = s.sizes(data); - let ntrans = s.ntrans(data); - Node { - data, - version, - state, - start: addr, - end: s.end_addr(version, data, sizes, ntrans), - is_final: s.is_final_state(), - ntrans, - sizes, - final_output: s.final_output(version, data, sizes, ntrans), + AnyTrans(s) => { + let data = &data[..addr + 1]; + let sizes = s.sizes(data); + let ntrans = s.ntrans(data); + Node { + data, + version, + state, + start: addr, + end: s.end_addr(version, data, sizes, ntrans), + is_final: s.is_final_state(), + ntrans, + sizes, + final_output: s.final_output(version, data, sizes, ntrans), + } } } } -} - -impl<'f> Node<'f> { /// Returns an iterator over all transitions in this node in lexicographic /// order. 
#[inline] @@ -876,7 +875,7 @@ mod tests { use quickcheck::{quickcheck, TestResult}; use crate::raw::build::BuilderNode; - use crate::raw::node::{node_new, Node}; + use crate::raw::node::Node; use crate::raw::{Builder, CompiledAddr, Output, Transition, VERSION}; use crate::stream::Streamer; @@ -926,7 +925,7 @@ mod tests { fn roundtrip(bnode: &BuilderNode) -> bool { let (addr, bytes) = compile(bnode); - let node = node_new(VERSION, addr, &bytes); + let node = Node::new(VERSION, addr, &bytes); nodes_equal(&node, &bnode) } @@ -942,7 +941,7 @@ mod tests { trans: vec![], }; let (addr, buf) = compile(&bnode); - let node = node_new(VERSION, addr, &buf); + let node = Node::new(VERSION, addr, &buf); assert_eq!(node.as_slice().len(), 3); roundtrip(&bnode); } @@ -955,7 +954,7 @@ mod tests { trans: vec![trans(20, b'a')], }; let (addr, buf) = compile(&bnode); - let node = node_new(VERSION, addr, &buf); + let node = Node::new(VERSION, addr, &buf); assert_eq!(node.as_slice().len(), 3); roundtrip(&bnode); } @@ -968,7 +967,7 @@ mod tests { trans: vec![trans(2, b'\xff')], }; let (addr, buf) = compile(&bnode); - let node = node_new(VERSION, addr, &buf); + let node = Node::new(VERSION, addr, &buf); assert_eq!(node.as_slice().len(), 4); roundtrip(&bnode); } @@ -988,7 +987,7 @@ mod tests { ], }; let (addr, buf) = compile(&bnode); - let node = node_new(VERSION, addr, &buf); + let node = Node::new(VERSION, addr, &buf); assert_eq!(node.as_slice().len(), 14); roundtrip(&bnode); } @@ -1001,7 +1000,7 @@ mod tests { trans: (0..256).map(|i| trans(0, i as u8)).collect(), }; let (addr, buf) = compile(&bnode); - let node = node_new(VERSION, addr, &buf); + let node = Node::new(VERSION, addr, &buf); assert_eq!(node.transitions().count(), 256); assert_eq!(node.len(), node.transitions().count()); roundtrip(&bnode); From 8a368babe6a94b9e855daadf5e9cd34758268e31 Mon Sep 17 00:00:00 2001 From: Andrew Gallant Date: Sat, 7 Mar 2020 13:08:32 -0500 Subject: [PATCH 21/23] perf: tweak inline annotations 
MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit This replaces `inline(always)` annotations with `inline` (except for one case, in which we add a comment). Notably, inlining the methods on `Bound` appear to provide a nice boost: group c01 c02 ----- --- --- search/wikiurls/fst/contains 1.06 428.7±9.53ns 1.00 404.8±4.58ns search/words/fst/contains 1.05 199.8±1.74ns 1.00 191.0±3.34ns --- src/bytes.rs | 10 +++++ src/raw/mod.rs | 18 +++++++- src/raw/node.rs | 114 ++++++++++++++++++++++++++---------------------- 3 files changed, 90 insertions(+), 52 deletions(-) diff --git a/src/bytes.rs b/src/bytes.rs index ff5a8e3..35da411 100644 --- a/src/bytes.rs +++ b/src/bytes.rs @@ -3,18 +3,21 @@ use std::io; /// Read a u32 in little endian format from the beginning of the given slice. /// This panics if the slice has length less than 4. +#[inline] pub fn read_u32_le(slice: &[u8]) -> u32 { u32::from_le_bytes(slice[..4].try_into().unwrap()) } /// Read a u64 in little endian format from the beginning of the given slice. /// This panics if the slice has length less than 8. +#[inline] pub fn read_u64_le(slice: &[u8]) -> u64 { u64::from_le_bytes(slice[..8].try_into().unwrap()) } /// Write a u32 in little endian format to the beginning of the given slice. /// This panics if the slice has length less than 4. +#[inline] pub fn write_u32_le(n: u32, slice: &mut [u8]) { assert!(slice.len() >= 4); let bytes = n.to_le_bytes(); @@ -26,6 +29,7 @@ pub fn write_u32_le(n: u32, slice: &mut [u8]) { /// Like write_u32_le, but to an io::Write implementation. If every byte could /// not be writen, then this returns an error. +#[inline] pub fn io_write_u32_le(n: u32, mut wtr: W) -> io::Result<()> { let mut buf = [0; 4]; write_u32_le(n, &mut buf); @@ -34,6 +38,7 @@ pub fn io_write_u32_le(n: u32, mut wtr: W) -> io::Result<()> { /// Write a u64 in little endian format to the beginning of the given slice. /// This panics if the slice has length less than 8. 
+#[inline] pub fn write_u64_le(n: u64, slice: &mut [u8]) { assert!(slice.len() >= 8); let bytes = n.to_le_bytes(); @@ -49,6 +54,7 @@ pub fn write_u64_le(n: u64, slice: &mut [u8]) { /// Like write_u64_le, but to an io::Write implementation. If every byte could /// not be writen, then this returns an error. +#[inline] pub fn io_write_u64_le(n: u64, mut wtr: W) -> io::Result<()> { let mut buf = [0; 8]; write_u64_le(n, &mut buf); @@ -58,6 +64,7 @@ pub fn io_write_u64_le(n: u64, mut wtr: W) -> io::Result<()> { /// pack_uint packs the given integer in the smallest number of bytes possible, /// and writes it to the given writer. The number of bytes written is returned /// on success. +#[inline] pub fn pack_uint(wtr: W, n: u64) -> io::Result { let nbytes = pack_size(n); pack_uint_in(wtr, n, nbytes).map(|_| nbytes) @@ -68,6 +75,7 @@ pub fn pack_uint(wtr: W, n: u64) -> io::Result { /// /// `nbytes` must be >= pack_size(n) and <= 8, where `pack_size(n)` is the /// smallest number of bytes that can store the integer given. +#[inline] pub fn pack_uint_in( mut wtr: W, mut n: u64, @@ -87,6 +95,7 @@ pub fn pack_uint_in( /// position in `slice` after reading `nbytes` bytes. /// /// `nbytes` must be >= 1 and <= 8. +#[inline] pub fn unpack_uint(slice: &[u8], nbytes: u8) -> u64 { assert!(1 <= nbytes && nbytes <= 8); @@ -98,6 +107,7 @@ pub fn unpack_uint(slice: &[u8], nbytes: u8) -> u64 { } /// pack_size returns the smallest number of bytes that can encode `n`. +#[inline] pub fn pack_size(n: u64) -> u8 { if n < 1 << 8 { 1 diff --git a/src/raw/mod.rs b/src/raw/mod.rs index d56f9b3..62c7a46 100644 --- a/src/raw/mod.rs +++ b/src/raw/mod.rs @@ -585,7 +585,7 @@ impl> Fst { } /// Returns the root node of this fst. 
- #[inline(always)] + #[inline] pub fn root(&self) -> Node<'_> { self.as_ref().root() } @@ -632,6 +632,7 @@ struct FstRef<'f> { } impl<'f> FstRef<'f> { + #[inline] fn get(&self, key: &[u8]) -> Option { let mut node = self.root(); let mut out = Output::zero(); @@ -652,6 +653,7 @@ impl<'f> FstRef<'f> { } } + #[inline] fn contains_key(&self, key: &[u8]) -> bool { let mut node = self.root(); for &b in key { @@ -663,6 +665,7 @@ impl<'f> FstRef<'f> { node.is_final() } + #[inline] fn get_key_into(&self, mut value: u64, key: &mut Vec) -> bool { let mut node = self.root(); while value != 0 || !node.is_final() { @@ -682,42 +685,52 @@ impl<'f> FstRef<'f> { true } + #[inline] fn len(&self) -> usize { self.meta.len } + #[inline] fn is_empty(&self) -> bool { self.meta.len == 0 } + #[inline] fn size(&self) -> usize { self.as_bytes().len() } + #[inline] fn fst_type(&self) -> FstType { self.meta.ty } + #[inline] fn root_addr(&self) -> CompiledAddr { self.meta.root_addr } + #[inline] fn root(&self) -> Node<'f> { self.node(self.root_addr()) } + #[inline] fn node(&self, addr: CompiledAddr) -> Node<'f> { Node::new(self.meta.version, addr, self.as_bytes()) } + #[inline] fn to_vec(&self) -> Vec { self.as_bytes().to_vec() } + #[inline] fn as_bytes(&self) -> &'f [u8] { self.data } + #[inline] fn empty_final_output(&self) -> Option { let root = self.root(); if root.is_final() { @@ -883,6 +896,7 @@ enum Bound { } impl Bound { + #[inline] fn exceeded_by(&self, inp: &[u8]) -> bool { match *self { Bound::Included(ref v) => inp > v, @@ -891,6 +905,7 @@ impl Bound { } } + #[inline] fn is_empty(&self) -> bool { match *self { Bound::Included(ref v) => v.is_empty(), @@ -899,6 +914,7 @@ impl Bound { } } + #[inline] fn is_inclusive(&self) -> bool { match *self { Bound::Excluded(_) => false, diff --git a/src/raw/node.rs b/src/raw/node.rs index c0c4e41..0e05bc0 100644 --- a/src/raw/node.rs +++ b/src/raw/node.rs @@ -128,9 +128,13 @@ impl<'f> Node<'f> { /// Returns the transition at index `i`. 
#[inline(always)] pub fn transition(&self, i: usize) -> Transition { - use self::State::*; + // The `inline(always)` annotation on this function appears to + // dramatically speed up FST traversal. In particular, measuring the + // time it takes to run `fst range something-big.fst` shows almost a 2x + // improvement with this single `inline(always)` annotation. + match self.state { - OneTransNext(s) => { + State::OneTransNext(s) => { assert!(i == 0); Transition { inp: s.input(self), @@ -138,7 +142,7 @@ impl<'f> Node<'f> { addr: s.trans_addr(self), } } - OneTrans(s) => { + State::OneTrans(s) => { assert!(i == 0); Transition { inp: s.input(self), @@ -146,17 +150,17 @@ impl<'f> Node<'f> { addr: s.trans_addr(self), } } - AnyTrans(s) => Transition { + State::AnyTrans(s) => Transition { inp: s.input(self, i), out: s.output(self, i), addr: s.trans_addr(self, i), }, - EmptyFinal => panic!("out of bounds"), + State::EmptyFinal => panic!("out of bounds"), } } /// Returns the transition address of the `i`th transition. - #[inline(always)] + #[inline] pub fn transition_addr(&self, i: usize) -> CompiledAddr { use self::State::*; match self.state { @@ -176,7 +180,7 @@ impl<'f> Node<'f> { /// Finds the `i`th transition corresponding to the given input byte. /// /// If no transition for this byte exists, then `None` is returned. - #[inline(always)] + #[inline] pub fn find_input(&self, b: u8) -> Option { use self::State::*; match self.state { @@ -191,14 +195,14 @@ impl<'f> Node<'f> { /// If this node is final and has a terminal output value, then it is /// returned. Otherwise, a zero output is returned. - #[inline(always)] + #[inline] pub fn final_output(&self) -> Output { self.final_output } /// Returns true if and only if this node corresponds to a final or "match" /// state in the finite state transducer. - #[inline(always)] + #[inline] pub fn is_final(&self) -> bool { self.is_final } @@ -206,31 +210,31 @@ impl<'f> Node<'f> { /// Returns the number of transitions in this node. 
/// /// The maximum number of transitions is 256. - #[inline(always)] + #[inline] pub fn len(&self) -> usize { self.ntrans } /// Returns true if and only if this node has zero transitions. - #[inline(always)] + #[inline] pub fn is_empty(&self) -> bool { self.ntrans == 0 } /// Return the address of this node. - #[inline(always)] + #[inline] pub fn addr(&self) -> CompiledAddr { self.start } #[doc(hidden)] - #[inline(always)] + #[inline] pub fn as_slice(&self) -> &[u8] { &self.data[self.end..] } #[doc(hidden)] - #[inline(always)] + #[inline] pub fn state(&self) -> &'static str { use self::State::*; match self.state { @@ -298,7 +302,6 @@ struct StateOneTrans(u8); struct StateAnyTrans(u8); impl State { - #[inline(always)] fn new(data: &[u8], addr: CompiledAddr) -> State { use self::State::*; if addr == EMPTY_ADDRESS { @@ -328,21 +331,22 @@ impl StateOneTransNext { Ok(()) } - #[inline(always)] + #[inline] fn new() -> Self { StateOneTransNext(0b11_000000) } + #[inline] fn set_common_input(&mut self, input: u8) { self.0 = (self.0 & 0b11_000000) | common_idx(input, 0b111111); } - #[inline(always)] + #[inline] fn common_input(&self) -> Option { common_input(self.0 & 0b00_111111) } - #[inline(always)] + #[inline] fn input_len(&self) -> usize { if self.common_input().is_none() { 1 @@ -351,12 +355,12 @@ impl StateOneTransNext { } } - #[inline(always)] + #[inline] fn end_addr(&self, data: &[u8]) -> usize { data.len() - 1 - self.input_len() } - #[inline(always)] + #[inline] fn input(&self, node: &Node<'_>) -> u8 { if let Some(inp) = self.common_input() { inp @@ -365,7 +369,7 @@ impl StateOneTransNext { } } - #[inline(always)] + #[inline] fn trans_addr(&self, node: &Node<'_>) -> CompiledAddr { node.end as CompiledAddr - 1 } @@ -396,20 +400,22 @@ impl StateOneTrans { Ok(()) } + #[inline] fn new() -> Self { StateOneTrans(0b10_000000) } + #[inline] fn set_common_input(&mut self, input: u8) { self.0 = (self.0 & 0b10_000000) | common_idx(input, 0b111111); } - #[inline(always)] + 
#[inline] fn common_input(&self) -> Option { common_input(self.0 & 0b00_111111) } - #[inline(always)] + #[inline] fn input_len(&self) -> usize { if self.common_input().is_none() { 1 @@ -418,13 +424,13 @@ impl StateOneTrans { } } - #[inline(always)] + #[inline] fn sizes(&self, data: &[u8]) -> PackSizes { let i = data.len() - 1 - self.input_len() - 1; PackSizes::decode(data[i]) } - #[inline(always)] + #[inline] fn end_addr(&self, data: &[u8], sizes: PackSizes) -> usize { data.len() - 1 - self.input_len() @@ -433,7 +439,7 @@ impl StateOneTrans { - sizes.output_pack_size() } - #[inline(always)] + #[inline] fn input(&self, node: &Node<'_>) -> u8 { if let Some(inp) = self.common_input() { inp @@ -442,7 +448,7 @@ impl StateOneTrans { } } - #[inline(always)] + #[inline] fn output(&self, node: &Node<'_>) -> Output { let osize = node.sizes.output_pack_size(); if osize == 0 { @@ -456,7 +462,7 @@ impl StateOneTrans { Output::new(bytes::unpack_uint(&node.data[i..], osize as u8)) } - #[inline(always)] + #[inline] fn trans_addr(&self, node: &Node<'_>) -> CompiledAddr { let tsize = node.sizes.transition_pack_size(); let i = node.start @@ -541,29 +547,31 @@ impl StateAnyTrans { Ok(()) } - #[inline(always)] + #[inline] fn new() -> Self { StateAnyTrans(0b00_000000) } + #[inline] fn set_final_state(&mut self, yes: bool) { if yes { self.0 |= 0b01_000000; } } - #[inline(always)] + #[inline] fn is_final_state(&self) -> bool { self.0 & 0b01_000000 == 0b01_000000 } + #[inline] fn set_state_ntrans(&mut self, n: u8) { if n <= 0b00_111111 { self.0 = (self.0 & 0b11_000000) | n; } } - #[inline(always)] + #[inline] fn state_ntrans(&self) -> Option { let n = self.0 & 0b00_111111; if n == 0 { @@ -573,13 +581,13 @@ impl StateAnyTrans { } } - #[inline(always)] + #[inline] fn sizes(&self, data: &[u8]) -> PackSizes { let i = data.len() - 1 - self.ntrans_len() - 1; PackSizes::decode(data[i]) } - #[inline(always)] + #[inline] fn total_trans_size( &self, version: u64, @@ -590,7 +598,7 @@ impl 
StateAnyTrans { ntrans + (ntrans * sizes.transition_pack_size()) + index_size } - #[inline(always)] + #[inline] fn trans_index_size(&self, version: u64, ntrans: usize) -> usize { if version >= 2 && ntrans > TRANS_INDEX_THRESHOLD { 256 @@ -599,7 +607,7 @@ impl StateAnyTrans { } } - #[inline(always)] + #[inline] fn ntrans_len(&self) -> usize { if self.state_ntrans().is_none() { 1 @@ -608,7 +616,7 @@ impl StateAnyTrans { } } - #[inline(always)] + #[inline] fn ntrans(&self, data: &[u8]) -> usize { if let Some(n) = self.state_ntrans() { n as usize @@ -624,7 +632,7 @@ impl StateAnyTrans { } } - #[inline(always)] + #[inline] fn final_output( &self, version: u64, @@ -645,7 +653,7 @@ impl StateAnyTrans { Output::new(bytes::unpack_uint(&data[at..], osize as u8)) } - #[inline(always)] + #[inline] fn end_addr( &self, version: u64, @@ -663,7 +671,7 @@ impl StateAnyTrans { - final_osize // final output } - #[inline(always)] + #[inline] fn trans_addr(&self, node: &Node<'_>, i: usize) -> CompiledAddr { assert!(i < node.ntrans); let tsize = node.sizes.transition_pack_size(); @@ -677,7 +685,7 @@ impl StateAnyTrans { unpack_delta(&node.data[at..], tsize, node.end) } - #[inline(always)] + #[inline] fn input(&self, node: &Node<'_>, i: usize) -> u8 { let at = node.start - self.ntrans_len() @@ -688,7 +696,7 @@ impl StateAnyTrans { node.data[at] } - #[inline(always)] + #[inline] fn find_input(&self, node: &Node<'_>, b: u8) -> Option { if node.version >= 2 && node.ntrans > TRANS_INDEX_THRESHOLD { let start = node.start @@ -712,7 +720,7 @@ impl StateAnyTrans { } } - #[inline(always)] + #[inline] fn output(&self, node: &Node<'_>, i: usize) -> Output { let osize = node.sizes.output_pack_size(); if osize == 0 { @@ -736,39 +744,39 @@ impl StateAnyTrans { struct PackSizes(u8); impl PackSizes { - #[inline(always)] + #[inline] fn new() -> Self { PackSizes(0) } - #[inline(always)] + #[inline] fn decode(v: u8) -> Self { PackSizes(v) } - #[inline(always)] + #[inline] fn encode(&self) -> u8 { self.0 } 
- #[inline(always)] + #[inline] fn set_transition_pack_size(&mut self, size: u8) { assert!(size <= 8); self.0 = (self.0 & 0b0000_1111) | (size << 4); } - #[inline(always)] + #[inline] fn transition_pack_size(&self) -> usize { ((self.0 & 0b1111_0000) >> 4) as usize } - #[inline(always)] + #[inline] fn set_output_pack_size(&mut self, size: u8) { assert!(size <= 8); self.0 = (self.0 & 0b1111_0000) | size; } - #[inline(always)] + #[inline] fn output_pack_size(&self) -> usize { (self.0 & 0b0000_1111) as usize } @@ -802,7 +810,7 @@ impl<'f, 'n> Iterator for Transitions<'f, 'n> { /// /// Nevertheless, the *caller* may have a priori knowledge that could be /// supplied to the builder manually, which could then be embedded in the FST. -#[inline(always)] +#[inline] fn common_idx(input: u8, max: u8) -> u8 { let val = ((COMMON_INPUTS[input as usize] as u32 + 1) % 256) as u8; if val > max { @@ -814,7 +822,7 @@ fn common_idx(input: u8, max: u8) -> u8 { /// common_input translates a common input index stored in a serialized FST /// to the corresponding byte. 
-#[inline(always)] +#[inline] fn common_input(idx: u8) -> Option { if idx == 0 { None @@ -823,6 +831,7 @@ fn common_input(idx: u8) -> Option { } } +#[inline] fn pack_delta( wtr: W, node_addr: CompiledAddr, @@ -833,6 +842,7 @@ fn pack_delta( Ok(nbytes) } +#[inline] fn pack_delta_in( wtr: W, node_addr: CompiledAddr, @@ -847,6 +857,7 @@ fn pack_delta_in( bytes::pack_uint_in(wtr, delta_addr as u64, nbytes) } +#[inline] fn pack_delta_size(node_addr: CompiledAddr, trans_addr: CompiledAddr) -> u8 { let delta_addr = if trans_addr == EMPTY_ADDRESS { EMPTY_ADDRESS @@ -856,6 +867,7 @@ fn pack_delta_size(node_addr: CompiledAddr, trans_addr: CompiledAddr) -> u8 { bytes::pack_size(delta_addr as u64) } +#[inline] fn unpack_delta( slice: &[u8], trans_pack_size: usize, From 650480cb242b5825e6b6581f80bbfd67f43f8036 Mon Sep 17 00:00:00 2001 From: Andrew Gallant Date: Sat, 7 Mar 2020 14:36:08 -0500 Subject: [PATCH 22/23] polish: more polishing work Touching up docs, restyling code in places, etc. --- .github/workflows/ci.yml | 2 +- Cargo.toml | 1 + README.md | 39 ++++++++++-------------- fst-bin/src/merge.rs | 18 +++++------ fst-bin/src/util.rs | 8 ++--- fst-levenshtein/src/lib.rs | 17 ++++------- src/automaton/levenshtein.rs | 7 +++-- src/automaton/mod.rs | 47 +++++++++++++--------------- src/error.rs | 15 ++++----- src/lib.rs | 17 +++++------ src/map.rs | 26 ++++++++-------- src/raw/build.rs | 8 ++--- src/raw/error.rs | 17 +++++------ src/raw/mod.rs | 16 +++++----- src/raw/node.rs | 59 +++++++++++++++++------------------- src/raw/ops.rs | 14 ++++----- src/set.rs | 20 ++++++------ 17 files changed, 154 insertions(+), 177 deletions(-) diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml index 556f970..1f506ed 100644 --- a/.github/workflows/ci.yml +++ b/.github/workflows/ci.yml @@ -31,7 +31,7 @@ jobs: include: - build: pinned os: ubuntu-18.04 - rust: 1.39.0 + rust: 1.40.0 - build: stable os: ubuntu-18.04 rust: stable diff --git a/Cargo.toml b/Cargo.toml index 
b135a67..f70f108 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -26,6 +26,7 @@ levenshtein = ["utf8-ranges"] utf8-ranges = { version = "1.0.4", optional = true } [dev-dependencies] +doc-comment = "0.3.1" fnv = "1.0.6" memmap = "0.7" quickcheck = { version = "0.9.2", default-features = false } diff --git a/README.md b/README.md index fbf4941..6052082 100644 --- a/README.md +++ b/README.md @@ -23,11 +23,10 @@ Dual-licensed under MIT or the [UNLICENSE](http://unlicense.org). https://docs.rs/fst The -[`fst-regex`](https://docs.rs/fst-regex) -and -[`fst-levenshtein`](https://docs.rs/fst-levenshtein) -crates provide regular expression matching and fuzzy searching on FSTs, -respectively. +[`regex-automata`](https://docs.rs/regex-automata) +crate provides implementations of the `fst::Automata` trait when its +`transducer` feature is enabled. This permits using DFAs compiled by +`regex-automata` to search finite state transducers produced by this crate. ### Installation @@ -36,27 +35,21 @@ Simply add a corresponding entry to your `Cargo.toml` dependency list: ```toml,ignore [dependencies] -fst = "0.3" +fst = "0.4" ``` ### Example This example demonstrates building a set in memory and executing a fuzzy query -against it. You'll need `fst = "0.3"` and `fst-levenshtein = "0.2"` in your -`Cargo.toml`. +against it. You'll need `fst = "0.4"` with the `levenshtein` feature enabled in +your `Cargo.toml`. ```rust -extern crate fst; -extern crate fst_levenshtein; - -use std::error::Error; -use std::process; - use fst::{IntoStreamer, Set}; -use fst_levenshtein::Levenshtein; +use fst::automaton::Levenshtein; -fn try_main() -> Result<(), Box> { +fn main() -> Result<(), Box> { // A convenient way to create sets in memory. 
let keys = vec!["fa", "fo", "fob", "focus", "foo", "food", "foul"]; let set = Set::from_iter(keys)?; @@ -71,13 +64,13 @@ fn try_main() -> Result<(), Box> { assert_eq!(keys, vec!["fo", "fob", "foo", "food"]); Ok(()) } - -fn main() { - if let Err(err) = try_main() { - eprintln!("{}", err); - process::exit(1); - } -} ``` Check out the documentation for a lot more examples! + + +### Cargo features + +* `levenshtein` - **Disabled** by default. This adds the `Levenshtein` + automaton to the `automaton` sub-module. This includes an additional + dependency on `utf8-ranges`. diff --git a/fst-bin/src/merge.rs b/fst-bin/src/merge.rs index 8e5a692..fe880ff 100644 --- a/fst-bin/src/merge.rs +++ b/fst-bin/src/merge.rs @@ -32,7 +32,7 @@ impl Merger where I: Iterator> + Send + 'static, { - pub fn new(it: T, output: P) -> Self + pub fn new(it: T, output: P) -> Merger where P: AsRef, T: IntoIterator, @@ -49,7 +49,7 @@ where } } - pub fn value_merger(mut self, f: F) -> Self + pub fn value_merger(mut self, f: F) -> Merger where F: Fn(u64, u64) -> u64 + Send + Sync + 'static, { @@ -57,27 +57,27 @@ where self } - pub fn fd_limit(mut self, fd_limit: u32) -> Self { + pub fn fd_limit(mut self, fd_limit: u32) -> Merger { self.fd_limit = fd_limit; self } - pub fn batch_size(mut self, batch_size: u32) -> Self { + pub fn batch_size(mut self, batch_size: u32) -> Merger { self.batch_size = batch_size; self } - pub fn threads(mut self, threads: u32) -> Self { + pub fn threads(mut self, threads: u32) -> Merger { self.threads = threads; self } - pub fn tmp_dir>(mut self, path: P) -> Self { + pub fn tmp_dir>(mut self, path: P) -> Merger { self.tmp_dir = path.as_ref().to_path_buf(); self } - pub fn keep_tmp_dir(mut self, yes: bool) -> Self { + pub fn keep_tmp_dir(mut self, yes: bool) -> Merger { self.keep_tmp_dir = yes; self } @@ -117,7 +117,7 @@ where for (i, union_batch) in batches.into_iter().enumerate() { sorters.create_fst(UnionBatch { tmp_dir: tmp_dir_path.clone(), - gen: gen, + gen, index: i, 
fsts: union_batch?, value_merger: self.value_merger.clone(), @@ -178,7 +178,7 @@ struct Sorters { } impl Sorters { - fn new(threads: u32) -> Self { + fn new(threads: u32) -> Sorters { let (bsend, brecv) = chan::bounded::(0); let (rsend, rrecv) = chan::bounded(0); for _ in 0..threads { diff --git a/fst-bin/src/util.rs b/fst-bin/src/util.rs index a166529..dfe4970 100644 --- a/fst-bin/src/util.rs +++ b/fst-bin/src/util.rs @@ -103,14 +103,14 @@ type Lines = bstr::io::ByteLines< impl ConcatLines { pub fn new(mut inputs: Vec) -> ConcatLines { inputs.reverse(); // treat it as a stack - ConcatLines { inputs: inputs, cur: None } + ConcatLines { inputs, cur: None } } } impl Iterator for ConcatLines { type Item = io::Result; - fn next(&mut self) -> Option { + fn next(&mut self) -> Option> { loop { if self.cur.is_none() { match self.inputs.pop() { @@ -143,7 +143,7 @@ type Rows = csv::DeserializeRecordsIntoIter; impl ConcatCsv { pub fn new(mut inputs: Vec) -> ConcatCsv { inputs.reverse(); // treat it as a stack - ConcatCsv { inputs: inputs, cur: None } + ConcatCsv { inputs, cur: None } } fn read_row(&mut self) -> Option> { @@ -162,7 +162,7 @@ impl ConcatCsv { impl Iterator for ConcatCsv { type Item = Result<(BString, u64), Error>; - fn next(&mut self) -> Option { + fn next(&mut self) -> Option> { loop { if self.cur.is_none() { match self.inputs.pop() { diff --git a/fst-levenshtein/src/lib.rs b/fst-levenshtein/src/lib.rs index b8e9f84..e5c65f9 100644 --- a/fst-levenshtein/src/lib.rs +++ b/fst-levenshtein/src/lib.rs @@ -1,6 +1,3 @@ - - - use std::cmp; use std::collections::hash_map::Entry; use std::collections::{HashMap, HashSet}; @@ -90,13 +87,13 @@ impl Levenshtein { /// A `Levenshtein` value satisfies the `Automaton` trait, which means it /// can be used with the `search` method of any finite state transducer. 
#[inline] - pub fn new(query: &str, distance: u32) -> Result { + pub fn new(query: &str, distance: u32) -> Result { let lev = DynamicLevenshtein { query: query.to_owned(), dist: distance as usize, }; let dfa = DfaBuilder::new(lev.clone()).build()?; - Ok(Levenshtein { prog: lev, dfa: dfa }) + Ok(Levenshtein { prog: lev, dfa }) } } @@ -197,10 +194,10 @@ struct DfaBuilder { } impl DfaBuilder { - fn new(lev: DynamicLevenshtein) -> Self { + fn new(lev: DynamicLevenshtein) -> DfaBuilder { DfaBuilder { dfa: Dfa { states: Vec::with_capacity(16) }, - lev: lev, + lev, cache: HashMap::with_capacity(1024), } } @@ -251,9 +248,7 @@ impl DfaBuilder { Entry::Occupied(v) => (*v.get(), true), Entry::Vacant(v) => { let is_match = self.lev.is_match(lev_state); - self.dfa - .states - .push(State { next: [None; 256], is_match: is_match }); + self.dfa.states.push(State { next: [None; 256], is_match }); (*v.insert(self.dfa.states.len() - 1), false) } }) @@ -314,7 +309,7 @@ impl DfaBuilder { } fn new_state(&mut self, is_match: bool) -> usize { - self.dfa.states.push(State { next: [None; 256], is_match: is_match }); + self.dfa.states.push(State { next: [None; 256], is_match }); self.dfa.states.len() - 1 } } diff --git a/src/automaton/levenshtein.rs b/src/automaton/levenshtein.rs index af15f02..9db9748 100644 --- a/src/automaton/levenshtein.rs +++ b/src/automaton/levenshtein.rs @@ -109,7 +109,10 @@ impl Levenshtein { /// A `Levenshtein` value satisfies the `Automaton` trait, which means it /// can be used with the `search` method of any finite state transducer. 
#[inline] - pub fn new(query: &str, distance: u32) -> Result { + pub fn new( + query: &str, + distance: u32, + ) -> Result { let lev = DynamicLevenshtein { query: query.to_owned(), dist: distance as usize, @@ -216,7 +219,7 @@ struct DfaBuilder { } impl DfaBuilder { - fn new(lev: DynamicLevenshtein) -> Self { + fn new(lev: DynamicLevenshtein) -> DfaBuilder { DfaBuilder { dfa: Dfa { states: Vec::with_capacity(16) }, lev, diff --git a/src/automaton/mod.rs b/src/automaton/mod.rs index 254c8ca..a36392a 100644 --- a/src/automaton/mod.rs +++ b/src/automaton/mod.rs @@ -1,6 +1,5 @@ #[cfg(feature = "levenshtein")] pub use self::levenshtein::{Levenshtein, LevenshteinError}; -use self::StartsWithStateInternal::*; #[cfg(feature = "levenshtein")] mod levenshtein; @@ -109,23 +108,23 @@ pub trait Automaton { impl<'a, T: Automaton> Automaton for &'a T { type State = T::State; - fn start(&self) -> Self::State { + fn start(&self) -> T::State { (*self).start() } - fn is_match(&self, state: &Self::State) -> bool { + fn is_match(&self, state: &T::State) -> bool { (*self).is_match(state) } - fn can_match(&self, state: &Self::State) -> bool { + fn can_match(&self, state: &T::State) -> bool { (*self).can_match(state) } - fn will_always_match(&self, state: &Self::State) -> bool { + fn will_always_match(&self, state: &T::State) -> bool { (*self).will_always_match(state) } - fn accept(&self, state: &Self::State, byte: u8) -> Self::State { + fn accept(&self, state: &T::State, byte: u8) -> T::State { (*self).accept(state, byte) } } @@ -138,13 +137,11 @@ impl<'a, T: Automaton> Automaton for &'a T { /// ```rust /// extern crate fst; /// -/// use std::error::Error; -/// /// use fst::{Automaton, IntoStreamer, Streamer, Set}; /// use fst::automaton::Str; /// /// # fn main() { example().unwrap(); } -/// fn example() -> Result<(), Box> { +/// fn example() -> Result<(), Box> { /// let paths = vec!["/home/projects/bar", "/home/projects/foo", "/tmp/foo"]; /// let set = Set::from_iter(paths)?; /// @@ 
-212,13 +209,11 @@ impl<'a> Automaton for Str<'a> { /// ```rust /// extern crate fst; /// -/// use std::error::Error; -/// /// use fst::{IntoStreamer, Streamer, Set}; /// use fst::automaton::Subsequence; /// /// # fn main() { example().unwrap(); } -/// fn example() -> Result<(), Box> { +/// fn example() -> Result<(), Box> { /// let paths = vec!["/home/projects/bar", "/home/projects/foo", "/tmp/foo"]; /// let set = Set::from_iter(paths)?; /// @@ -317,9 +312,9 @@ impl Automaton for AlwaysMatch { pub struct StartsWith(A); /// The `Automaton` state for `StartsWith`. -pub struct StartsWithState(StartsWithStateInternal); +pub struct StartsWithState(StartsWithStateKind); -enum StartsWithStateInternal { +enum StartsWithStateKind { Done, Running(A::State), } @@ -331,31 +326,31 @@ impl Automaton for StartsWith { StartsWithState({ let inner = self.0.start(); if self.0.is_match(&inner) { - Done + StartsWithStateKind::Done } else { - Running(inner) + StartsWithStateKind::Running(inner) } }) } fn is_match(&self, state: &StartsWithState) -> bool { match state.0 { - Done => true, - Running(_) => false, + StartsWithStateKind::Done => true, + StartsWithStateKind::Running(_) => false, } } fn can_match(&self, state: &StartsWithState) -> bool { match state.0 { - Done => true, - Running(ref inner) => self.0.can_match(inner), + StartsWithStateKind::Done => true, + StartsWithStateKind::Running(ref inner) => self.0.can_match(inner), } } fn will_always_match(&self, state: &StartsWithState) -> bool { match state.0 { - Done => true, - Running(_) => false, + StartsWithStateKind::Done => true, + StartsWithStateKind::Running(_) => false, } } @@ -365,13 +360,13 @@ impl Automaton for StartsWith { byte: u8, ) -> StartsWithState { StartsWithState(match state.0 { - Done => Done, - Running(ref inner) => { + StartsWithStateKind::Done => StartsWithStateKind::Done, + StartsWithStateKind::Running(ref inner) => { let next_inner = self.0.accept(inner, byte); if self.0.is_match(&next_inner) { - Done + 
StartsWithStateKind::Done } else { - Running(next_inner) + StartsWithStateKind::Running(next_inner) } } }) diff --git a/src/error.rs b/src/error.rs index 1f68761..dc89955 100644 --- a/src/error.rs +++ b/src/error.rs @@ -1,4 +1,3 @@ -use std::error; use std::fmt; use std::io; @@ -33,20 +32,18 @@ impl From for Error { impl fmt::Display for Error { fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { - use self::Error::*; match *self { - Fst(ref err) => err.fmt(f), - Io(ref err) => err.fmt(f), + Error::Fst(ref err) => err.fmt(f), + Error::Io(ref err) => err.fmt(f), } } } -impl error::Error for Error { - fn source(&self) -> Option<&(dyn error::Error + 'static)> { - use self::Error::*; +impl std::error::Error for Error { + fn source(&self) -> Option<&(dyn std::error::Error + 'static)> { match *self { - Fst(ref err) => Some(err), - Io(ref err) => Some(err), + Error::Fst(ref err) => Some(err), + Error::Io(ref err) => Some(err), } } } diff --git a/src/lib.rs b/src/lib.rs index 0538fb3..04c3364 100644 --- a/src/lib.rs +++ b/src/lib.rs @@ -20,7 +20,7 @@ Simply add a corresponding entry to your `Cargo.toml` dependency list: ```ignore [dependencies] -fst = "0.2" +fst = "0.4" ``` The examples in this documentation will show the rest. @@ -52,13 +52,11 @@ This requires the `levenshtein` feature in this crate to be enabled. It is not enabled by default. ```rust -use std::error::Error; - use fst::{IntoStreamer, Streamer, Set}; use fst::automaton::Levenshtein; # fn main() { example().unwrap(); } -fn example() -> Result<(), Box> { +fn example() -> Result<(), Box> { // A convenient way to create sets in memory. 
let keys = vec!["fa", "fo", "fob", "focus", "foo", "food", "foul"]; let set = Set::from_iter(keys)?; @@ -167,8 +165,8 @@ the `transducer` feature enabled: ```toml [dependencies] -fst = "0.3" -regex-automata = { version = "0.1.9", features = ["transducer"] } +fst = "0.4" +regex-automata = { version = "0.1.8", features = ["transducer"] } ``` # Example: searching multiple sets efficiently @@ -184,14 +182,12 @@ The example below shows how to find all keys that start with `B` or `G`. The example below uses sets, but the same operations are available on maps too. ```rust -use std::error::Error; - use fst::automaton::{Automaton, Str}; use fst::set; use fst::{IntoStreamer, Set, Streamer}; # fn main() { example().unwrap(); } -fn example() -> Result<(), Box> { +fn example() -> Result<(), Box> { let set1 = Set::from_iter(&["AC/DC", "Aerosmith"])?; let set2 = Set::from_iter(&["Bob Seger", "Bruce Springsteen"])?; let set3 = Set::from_iter(&["George Thorogood", "Golden Earring"])?; @@ -305,6 +301,9 @@ data structures found in the standard library, such as `BTreeSet` and #![deny(missing_docs)] +#[cfg(all(feature = "levenshtein", doctest))] +doc_comment::doctest!("../README.md"); + pub use crate::automaton::Automaton; pub use crate::error::{Error, Result}; pub use crate::map::{Map, MapBuilder}; diff --git a/src/map.rs b/src/map.rs index d6e2053..b2b40f5 100644 --- a/src/map.rs +++ b/src/map.rs @@ -576,7 +576,7 @@ pub struct MapBuilder(raw::Builder); impl MapBuilder> { /// Create a builder that builds a map in memory. 
#[inline] - pub fn memory() -> Self { + pub fn memory() -> MapBuilder> { MapBuilder(raw::Builder::memory()) } @@ -678,7 +678,7 @@ where impl<'a, 'm, A: Automaton> Streamer<'a> for Stream<'m, A> { type Item = (&'a [u8], u64); - fn next(&'a mut self) -> Option { + fn next(&'a mut self) -> Option<(&'a [u8], u64)> { self.0.next().map(|(key, out)| (key, out.value())) } } @@ -757,7 +757,7 @@ impl<'a, 'm> Streamer<'a> for Keys<'m> { type Item = &'a [u8]; #[inline] - fn next(&'a mut self) -> Option { + fn next(&'a mut self) -> Option<&'a [u8]> { self.0.next().map(|(key, _)| key) } } @@ -772,7 +772,7 @@ impl<'a, 'm> Streamer<'a> for Values<'m> { type Item = u64; #[inline] - fn next(&'a mut self) -> Option { + fn next(&'a mut self) -> Option { self.0.next().map(|(_, out)| out.value()) } } @@ -817,7 +817,7 @@ impl<'m, 'a, A: Automaton> IntoStreamer<'a> for StreamBuilder<'m, A> { type Item = (&'a [u8], u64); type Into = Stream<'m, A>; - fn into_stream(self) -> Self::Into { + fn into_stream(self) -> Stream<'m, A> { Stream(self.0.into_stream()) } } @@ -912,7 +912,7 @@ pub struct OpBuilder<'m>(raw::OpBuilder<'m>); impl<'m> OpBuilder<'m> { /// Create a new set operation builder. #[inline] - pub fn new() -> Self { + pub fn new() -> OpBuilder<'m> { OpBuilder(raw::OpBuilder::new()) } @@ -923,7 +923,7 @@ impl<'m> OpBuilder<'m> { /// /// The stream must emit a lexicographically ordered sequence of key-value /// pairs. 
- pub fn add(mut self, streamable: I) -> Self + pub fn add(mut self, streamable: I) -> OpBuilder<'m> where I: for<'a> IntoStreamer<'a, Into = S, Item = (&'a [u8], u64)>, S: 'm + for<'a> Streamer<'a, Item = (&'a [u8], u64)>, @@ -1144,7 +1144,7 @@ where I: for<'a> IntoStreamer<'a, Into = S, Item = (&'a [u8], u64)>, S: 'f + for<'a> Streamer<'a, Item = (&'a [u8], u64)>, { - fn from_iter(it: T) -> Self + fn from_iter(it: T) -> OpBuilder<'f> where T: IntoIterator, { @@ -1163,7 +1163,7 @@ impl<'a, 'm> Streamer<'a> for Union<'m> { type Item = (&'a [u8], &'a [IndexedValue]); #[inline] - fn next(&'a mut self) -> Option { + fn next(&'a mut self) -> Option<(&'a [u8], &'a [IndexedValue])> { self.0.next() } } @@ -1178,7 +1178,7 @@ impl<'a, 'm> Streamer<'a> for Intersection<'m> { type Item = (&'a [u8], &'a [IndexedValue]); #[inline] - fn next(&'a mut self) -> Option { + fn next(&'a mut self) -> Option<(&'a [u8], &'a [IndexedValue])> { self.0.next() } } @@ -1197,7 +1197,7 @@ impl<'a, 'm> Streamer<'a> for Difference<'m> { type Item = (&'a [u8], &'a [IndexedValue]); #[inline] - fn next(&'a mut self) -> Option { + fn next(&'a mut self) -> Option<(&'a [u8], &'a [IndexedValue])> { self.0.next() } } @@ -1212,7 +1212,7 @@ impl<'a, 'm> Streamer<'a> for SymmetricDifference<'m> { type Item = (&'a [u8], &'a [IndexedValue]); #[inline] - fn next(&'a mut self) -> Option { + fn next(&'a mut self) -> Option<(&'a [u8], &'a [IndexedValue])> { self.0.next() } } @@ -1230,7 +1230,7 @@ where { type Item = (&'a [u8], raw::Output); - fn next(&'a mut self) -> Option { + fn next(&'a mut self) -> Option<(&'a [u8], raw::Output)> { self.0.next().map(|(k, v)| (k, raw::Output::new(v))) } } diff --git a/src/raw/build.rs b/src/raw/build.rs index 5dc33b0..642fab4 100644 --- a/src/raw/build.rs +++ b/src/raw/build.rs @@ -99,7 +99,7 @@ struct LastTransition { impl Builder> { /// Create a builder that builds an fst in memory. 
#[inline] - pub fn memory() -> Self { + pub fn memory() -> Builder> { Builder::new(Vec::with_capacity(10 * (1 << 10))).unwrap() } @@ -447,7 +447,7 @@ impl BuilderNodeUnfinished { } impl Clone for BuilderNode { - fn clone(&self) -> Self { + fn clone(&self) -> BuilderNode { BuilderNode { is_final: self.is_final, final_output: self.final_output, @@ -455,7 +455,7 @@ impl Clone for BuilderNode { } } - fn clone_from(&mut self, source: &Self) { + fn clone_from(&mut self, source: &BuilderNode) { self.is_final = source.is_final; self.final_output = source.final_output; self.trans.clear(); @@ -464,7 +464,7 @@ impl Clone for BuilderNode { } impl Default for BuilderNode { - fn default() -> Self { + fn default() -> BuilderNode { BuilderNode { is_final: false, final_output: Output::zero(), diff --git a/src/raw/error.rs b/src/raw/error.rs index 6d2bf05..d4e6d81 100644 --- a/src/raw/error.rs +++ b/src/raw/error.rs @@ -70,10 +70,9 @@ pub enum Error { impl fmt::Display for Error { fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { - use self::Error::*; match *self { - FromUtf8(ref err) => err.fmt(f), - Version { expected, got } => write!( + Error::FromUtf8(ref err) => err.fmt(f), + Error::Version { expected, got } => write!( f, "\ Error opening FST: expected API version {}, got API version {}. \ @@ -83,24 +82,24 @@ to change the version of the 'fst' crate you're using, or re-generate the FST.", expected, got ), - Format { size } => write!( + Error::Format { size } => write!( f, "\ Error opening FST with size {} bytes: An unknown error occurred. 
This \ usually means you're trying to read data that isn't actually an encoded FST.", size ), - ChecksumMismatch { expected, got } => write!( + Error::ChecksumMismatch { expected, got } => write!( f, "FST verification failed: expected checksum of {} but got {}", expected, got, ), - DuplicateKey { ref got } => write!( + Error::DuplicateKey { ref got } => write!( f, "Error inserting duplicate key: '{}'.", format_bytes(&*got) ), - OutOfOrder { ref previous, ref got } => write!( + Error::OutOfOrder { ref previous, ref got } => write!( f, "\ Error inserting out-of-order key: '{}'. (Previous key was '{}'.) Keys must be \ @@ -108,7 +107,7 @@ inserted in lexicographic order.", format_bytes(&*got), format_bytes(&*previous) ), - WrongType { expected, got } => write!( + Error::WrongType { expected, got } => write!( f, "\ Error opening FST: expected type '{}', got type '{}'.", @@ -135,7 +134,7 @@ impl std::error::Error for Error { impl From for Error { #[inline] - fn from(err: FromUtf8Error) -> Self { + fn from(err: FromUtf8Error) -> Error { Error::FromUtf8(err) } } diff --git a/src/raw/mod.rs b/src/raw/mod.rs index 62c7a46..1f4c8af 100644 --- a/src/raw/mod.rs +++ b/src/raw/mod.rs @@ -26,10 +26,10 @@ use crate::bytes; use crate::error::Result; use crate::stream::{IntoStreamer, Streamer}; -pub use self::build::Builder; -pub use self::error::Error; -pub use self::node::{Node, Transitions}; -pub use self::ops::{ +pub use crate::raw::build::Builder; +pub use crate::raw::error::Error; +pub use crate::raw::node::{Node, Transitions}; +pub use crate::raw::ops::{ Difference, IndexedValue, Intersection, OpBuilder, SymmetricDifference, Union, }; @@ -621,7 +621,7 @@ impl<'a, 'f, D: AsRef<[u8]>> IntoStreamer<'a> for &'f Fst { type Into = Stream<'f>; #[inline] - fn into_stream(self) -> Self::Into { + fn into_stream(self) -> Stream<'f> { StreamBuilder::new(self.as_ref(), AlwaysMatch).into_stream() } } @@ -761,7 +761,7 @@ pub struct StreamBuilder<'f, A = AlwaysMatch> { } impl<'f, A: 
Automaton> StreamBuilder<'f, A> { - fn new(fst: FstRef<'f>, aut: A) -> Self { + fn new(fst: FstRef<'f>, aut: A) -> StreamBuilder<'f, A> { StreamBuilder { fst, aut, @@ -1001,7 +1001,7 @@ impl<'f, A: Automaton> Stream<'f, A> { impl<'f, 'a, A: Automaton> Streamer<'a> for Stream<'f, A> { type Item = (&'a [u8], Output); - fn next(&'a mut self) -> Option { + fn next(&'a mut self) -> Option<(&'a [u8], Output)> { self.0.next_with(|_| ()).map(|(key, out, _)| (key, out)) } } @@ -1287,7 +1287,7 @@ pub struct Transition { impl Default for Transition { #[inline] - fn default() -> Self { + fn default() -> Transition { Transition { inp: 0, out: Output::zero(), addr: NONE_ADDRESS } } } diff --git a/src/raw/node.rs b/src/raw/node.rs index 0e05bc0..8a9b95b 100644 --- a/src/raw/node.rs +++ b/src/raw/node.rs @@ -57,10 +57,9 @@ impl<'f> Node<'f> { addr: CompiledAddr, data: &[u8], ) -> Node<'_> { - use self::State::*; let state = State::new(data, addr); match state { - EmptyFinal => Node { + State::EmptyFinal => Node { data: &[], version, state: State::EmptyFinal, @@ -71,7 +70,7 @@ impl<'f> Node<'f> { sizes: PackSizes::new(), final_output: Output::zero(), }, - OneTransNext(s) => { + State::OneTransNext(s) => { let data = &data[..addr + 1]; Node { data, @@ -85,7 +84,7 @@ impl<'f> Node<'f> { final_output: Output::zero(), } } - OneTrans(s) => { + State::OneTrans(s) => { let data = &data[..addr + 1]; let sizes = s.sizes(data); Node { @@ -100,7 +99,7 @@ impl<'f> Node<'f> { final_output: Output::zero(), } } - AnyTrans(s) => { + State::AnyTrans(s) => { let data = &data[..addr + 1]; let sizes = s.sizes(data); let ntrans = s.ntrans(data); @@ -162,18 +161,17 @@ impl<'f> Node<'f> { /// Returns the transition address of the `i`th transition. 
#[inline] pub fn transition_addr(&self, i: usize) -> CompiledAddr { - use self::State::*; match self.state { - OneTransNext(s) => { + State::OneTransNext(s) => { assert!(i == 0); s.trans_addr(self) } - OneTrans(s) => { + State::OneTrans(s) => { assert!(i == 0); s.trans_addr(self) } - AnyTrans(s) => s.trans_addr(self, i), - EmptyFinal => panic!("out of bounds"), + State::AnyTrans(s) => s.trans_addr(self, i), + State::EmptyFinal => panic!("out of bounds"), } } @@ -182,14 +180,13 @@ impl<'f> Node<'f> { /// If no transition for this byte exists, then `None` is returned. #[inline] pub fn find_input(&self, b: u8) -> Option { - use self::State::*; match self.state { - OneTransNext(s) if s.input(self) == b => Some(0), - OneTransNext(_) => None, - OneTrans(s) if s.input(self) == b => Some(0), - OneTrans(_) => None, - AnyTrans(s) => s.find_input(self, b), - EmptyFinal => None, + State::OneTransNext(s) if s.input(self) == b => Some(0), + State::OneTransNext(_) => None, + State::OneTrans(s) if s.input(self) == b => Some(0), + State::OneTrans(_) => None, + State::AnyTrans(s) => s.find_input(self, b), + State::EmptyFinal => None, } } @@ -236,12 +233,11 @@ impl<'f> Node<'f> { #[doc(hidden)] #[inline] pub fn state(&self) -> &'static str { - use self::State::*; match self.state { - OneTransNext(_) => "OTN", - OneTrans(_) => "OT", - AnyTrans(_) => "AT", - EmptyFinal => "EF", + State::OneTransNext(_) => "OTN", + State::OneTrans(_) => "OT", + State::AnyTrans(_) => "AT", + State::EmptyFinal => "EF", } } @@ -303,15 +299,14 @@ struct StateAnyTrans(u8); impl State { fn new(data: &[u8], addr: CompiledAddr) -> State { - use self::State::*; if addr == EMPTY_ADDRESS { - return EmptyFinal; + return State::EmptyFinal; } let v = data[addr]; match (v & 0b11_000000) >> 6 { - 0b11 => OneTransNext(StateOneTransNext(v)), - 0b10 => OneTrans(StateOneTrans(v)), - _ => AnyTrans(StateAnyTrans(v)), + 0b11 => State::OneTransNext(StateOneTransNext(v)), + 0b10 => State::OneTrans(StateOneTrans(v)), + _ => 
State::AnyTrans(StateAnyTrans(v)), } } } @@ -332,7 +327,7 @@ impl StateOneTransNext { } #[inline] - fn new() -> Self { + fn new() -> StateOneTransNext { StateOneTransNext(0b11_000000) } @@ -401,7 +396,7 @@ impl StateOneTrans { } #[inline] - fn new() -> Self { + fn new() -> StateOneTrans { StateOneTrans(0b10_000000) } @@ -548,7 +543,7 @@ impl StateAnyTrans { } #[inline] - fn new() -> Self { + fn new() -> StateAnyTrans { StateAnyTrans(0b00_000000) } @@ -745,12 +740,12 @@ struct PackSizes(u8); impl PackSizes { #[inline] - fn new() -> Self { + fn new() -> PackSizes { PackSizes(0) } #[inline] - fn decode(v: u8) -> Self { + fn decode(v: u8) -> PackSizes { PackSizes(v) } diff --git a/src/raw/ops.rs b/src/raw/ops.rs index a9e79a1..9baf855 100644 --- a/src/raw/ops.rs +++ b/src/raw/ops.rs @@ -48,7 +48,7 @@ pub struct OpBuilder<'f> { impl<'f> OpBuilder<'f> { /// Create a new set operation builder. #[inline] - pub fn new() -> Self { + pub fn new() -> OpBuilder<'f> { OpBuilder { streams: vec![] } } @@ -59,7 +59,7 @@ impl<'f> OpBuilder<'f> { /// /// The stream must emit a lexicographically ordered sequence of key-value /// pairs. 
- pub fn add(mut self, stream: I) -> Self + pub fn add(mut self, stream: I) -> OpBuilder<'f> where I: for<'a> IntoStreamer<'a, Into = S, Item = (&'a [u8], Output)>, S: 'f + for<'a> Streamer<'a, Item = (&'a [u8], Output)>, @@ -188,7 +188,7 @@ where I: for<'a> IntoStreamer<'a, Into = S, Item = (&'a [u8], Output)>, S: 'f + for<'a> Streamer<'a, Item = (&'a [u8], Output)>, { - fn from_iter(it: T) -> Self + fn from_iter(it: T) -> OpBuilder<'f> where T: IntoIterator, { @@ -210,7 +210,7 @@ pub struct Union<'f> { impl<'a, 'f> Streamer<'a> for Union<'f> { type Item = (&'a [u8], &'a [IndexedValue]); - fn next(&'a mut self) -> Option { + fn next(&'a mut self) -> Option<(&'a [u8], &'a [IndexedValue])> { if let Some(slot) = self.cur_slot.take() { self.heap.refill(slot); } @@ -244,7 +244,7 @@ pub struct Intersection<'f> { impl<'a, 'f> Streamer<'a> for Intersection<'f> { type Item = (&'a [u8], &'a [IndexedValue]); - fn next(&'a mut self) -> Option { + fn next(&'a mut self) -> Option<(&'a [u8], &'a [IndexedValue])> { if let Some(slot) = self.cur_slot.take() { self.heap.refill(slot); } @@ -290,7 +290,7 @@ pub struct Difference<'f> { impl<'a, 'f> Streamer<'a> for Difference<'f> { type Item = (&'a [u8], &'a [IndexedValue]); - fn next(&'a mut self) -> Option { + fn next(&'a mut self) -> Option<(&'a [u8], &'a [IndexedValue])> { loop { match self.set.next() { None => return None, @@ -329,7 +329,7 @@ pub struct SymmetricDifference<'f> { impl<'a, 'f> Streamer<'a> for SymmetricDifference<'f> { type Item = (&'a [u8], &'a [IndexedValue]); - fn next(&'a mut self) -> Option { + fn next(&'a mut self) -> Option<(&'a [u8], &'a [IndexedValue])> { if let Some(slot) = self.cur_slot.take() { self.heap.refill(slot); } diff --git a/src/set.rs b/src/set.rs index 412a5d0..3af8538 100644 --- a/src/set.rs +++ b/src/set.rs @@ -411,7 +411,7 @@ impl<'s, 'a, D: AsRef<[u8]>> IntoStreamer<'a> for &'s Set { type Into = Stream<'s>; #[inline] - fn into_stream(self) -> Self::Into { + fn into_stream(self) -> 
Stream<'s> { Stream(self.0.stream()) } } @@ -518,7 +518,7 @@ pub struct SetBuilder(raw::Builder); impl SetBuilder> { /// Create a builder that builds a set in memory. #[inline] - pub fn memory() -> Self { + pub fn memory() -> SetBuilder> { SetBuilder(raw::Builder::memory()) } @@ -789,7 +789,7 @@ pub struct OpBuilder<'s>(raw::OpBuilder<'s>); impl<'s> OpBuilder<'s> { /// Create a new set operation builder. #[inline] - pub fn new() -> Self { + pub fn new() -> OpBuilder<'s> { OpBuilder(raw::OpBuilder::new()) } @@ -800,7 +800,7 @@ impl<'s> OpBuilder<'s> { /// /// The stream must emit a lexicographically ordered sequence of byte /// strings. - pub fn add(mut self, streamable: I) -> Self + pub fn add(mut self, streamable: I) -> OpBuilder<'s> where I: for<'a> IntoStreamer<'a, Into = S, Item = &'a [u8]>, S: 's + for<'a> Streamer<'a, Item = &'a [u8]>, @@ -943,7 +943,7 @@ where I: for<'a> IntoStreamer<'a, Into = S, Item = &'a [u8]>, S: 'f + for<'a> Streamer<'a, Item = &'a [u8]>, { - fn from_iter(it: T) -> Self + fn from_iter(it: T) -> OpBuilder<'f> where T: IntoIterator, { @@ -962,7 +962,7 @@ impl<'a, 's> Streamer<'a> for Union<'s> { type Item = &'a [u8]; #[inline] - fn next(&'a mut self) -> Option { + fn next(&'a mut self) -> Option<&'a [u8]> { self.0.next().map(|(key, _)| key) } } @@ -976,7 +976,7 @@ impl<'a, 's> Streamer<'a> for Intersection<'s> { type Item = &'a [u8]; #[inline] - fn next(&'a mut self) -> Option { + fn next(&'a mut self) -> Option<&'a [u8]> { self.0.next().map(|(key, _)| key) } } @@ -994,7 +994,7 @@ impl<'a, 's> Streamer<'a> for Difference<'s> { type Item = &'a [u8]; #[inline] - fn next(&'a mut self) -> Option { + fn next(&'a mut self) -> Option<&'a [u8]> { self.0.next().map(|(key, _)| key) } } @@ -1009,7 +1009,7 @@ impl<'a, 's> Streamer<'a> for SymmetricDifference<'s> { type Item = &'a [u8]; #[inline] - fn next(&'a mut self) -> Option { + fn next(&'a mut self) -> Option<&'a [u8]> { self.0.next().map(|(key, _)| key) } } @@ -1024,7 +1024,7 @@ struct 
StreamZeroOutput(S); impl<'a, S: Streamer<'a>> Streamer<'a> for StreamZeroOutput { type Item = (S::Item, raw::Output); - fn next(&'a mut self) -> Option { + fn next(&'a mut self) -> Option<(S::Item, raw::Output)> { self.0.next().map(|key| (key, raw::Output::zero())) } } From a4c4b32f9015af40b61bbc7701a0c3036145c4d6 Mon Sep 17 00:00:00 2001 From: Andrew Gallant Date: Sun, 8 Mar 2020 10:26:27 -0400 Subject: [PATCH 23/23] api: remove AsRef<[u8]> from struct definitions This isn't actually necessary. I put them there originally because I thought that it added more clarity to the intended use of an FST. But the constructors should make it clear enough. --- src/map.rs | 2 +- src/raw/mod.rs | 2 +- src/set.rs | 2 +- 3 files changed, 3 insertions(+), 3 deletions(-) diff --git a/src/map.rs b/src/map.rs index b2b40f5..68bf8e4 100644 --- a/src/map.rs +++ b/src/map.rs @@ -51,7 +51,7 @@ use crate::Result; /// Keys will always be byte strings; however, we may grow more conveniences /// around dealing with them (such as a serialization/deserialization step, /// although it isn't clear where exactly this should live). -pub struct Map>(raw::Fst); +pub struct Map(raw::Fst); impl Map> { /// Create a `Map` from an iterator of lexicographically ordered byte diff --git a/src/raw/mod.rs b/src/raw/mod.rs index 1f4c8af..9bae180 100644 --- a/src/raw/mod.rs +++ b/src/raw/mod.rs @@ -267,7 +267,7 @@ pub type CompiledAddr = usize; /// (excellent for in depth overview) /// * [Comparison of Construction Algorithms for Minimal, Acyclic, Deterministic, Finite-State Automata from Sets of Strings](http://www.cs.mun.ca/~harold/Courses/Old/CS4750/Diary/q3p2qx4lv71m5vew.pdf) /// (excellent for surface level overview) -pub struct Fst> { +pub struct Fst { meta: Meta, data: D, } diff --git a/src/set.rs b/src/set.rs index 3af8538..a3419c6 100644 --- a/src/set.rs +++ b/src/set.rs @@ -26,7 +26,7 @@ use crate::Result; /// /// 1. Once constructed, a `Set` can never be modified. /// 2. 
Sets must be constructed with lexicographically ordered byte sequences. -pub struct Set>(raw::Fst); +pub struct Set(raw::Fst); impl Set> { /// Create a `Set` from an iterator of lexicographically ordered byte