diff --git a/.gitignore b/.gitignore index 1db66a9..4170c07 100644 --- a/.gitignore +++ b/.gitignore @@ -4,3 +4,4 @@ Cargo.lock *.swo *.swn target/ +.idea/ diff --git a/Cargo.toml b/Cargo.toml index b654b1b..98301a8 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -7,3 +7,7 @@ edition = "2018" [dependencies] byteorder = "1.3.2" flate2 = "1.0.12" +thiserror = "1.0.30" +rassert-rs = "1.0" +memmap2 = "0.5" +advisory-lock = "0.3" diff --git a/README.md b/README.md index 9b86951..8ee1577 100644 --- a/README.md +++ b/README.md @@ -16,10 +16,12 @@ Example Usage Citing from the crates documentation: ``` +use dict::Dict; + fn main() { let index_file = "/usr/share/dictd/freedict-lat-deu.index"; let dict_file = "/usr/share/dictd/freedict-lat-deu.dict.dz"; - let mut latdeu = dict::load_dictionary_from_file(dict_file, index_file).unwrap(); + let mut latdeu = Dict::from_file(dict_file, index_file).unwrap(); // hey: rust! println!("{}", latdeu.lookup("ferrugo").unwrap()); } diff --git a/src/compressed.rs b/src/compressed.rs new file mode 100644 index 0000000..33f39db --- /dev/null +++ b/src/compressed.rs @@ -0,0 +1,219 @@ +use byteorder::{ByteOrder, LittleEndian, ReadBytesExt}; +use rassert_rs::rassert; +use std::io::{self, Read, Seek, SeekFrom}; + +use super::{DictError, DictReader, MAX_BYTES_FOR_BUFFER}; +use DictError::*; + +/// Compressed (gzip) Dict reader +/// +/// This reader can read compressed .dict files with the file name suffix .dz. +/// This format is documented in RFC 1952 and in `man dictzip`. An example implementation can be +/// found in the dict daemon (dictd) in `data.c`. 
+pub struct Compressed { + /// Compressed buffer + pub(crate) buf: B, + + /// Length of an uncompressed chunk + pub(crate) uchunk_length: usize, + + /// End of compressed data + pub(crate) end_compressed_data: u64, + + /// Offsets in file where new compressed chunks start + pub(crate) chunk_offsets: Vec, + + /// Total size of uncompressed file + pub(crate) ufile_length: u64, +} + +/// Byte mask to query for existence of FEXTRA field in the flags byte of a `.dz` file +pub const GZ_FEXTRA: u8 = 0b0000_0100; + +/// Byte mask to query for the existence of a file name in a `.dz` file +pub const GZ_FNAME: u8 = 0b0000_1000; + +/// Byte mask to query for the existence of a comment in a `.dz` file +pub const GZ_COMMENT: u8 = 0b0001_0000; + +/// Byte mask to detect that a comment is contained in a `.dz` file +pub const GZ_FHCRC: u8 = 0b0000_0010; + +/// A (gz) chunk, representing length and offset within the compressed file +#[derive(Debug)] +struct Chunk { + offset: u64, + length: usize, +} + +impl Compressed { + pub fn new(mut buf: B) -> Result { + let mut header = vec![0; 12]; + + // Check header + buf.read_exact(&mut header)?; + rassert!(&header[0..2] == &[0x1F, 0x8B], InvalidFileFormat("Not in gzip format".into())); + + // Check for FEXTRA flag + let flags = header[3]; + rassert!(flags & GZ_FEXTRA != 0, InvalidFileFormat("Extra flag (FLG.FEXTRA) not set. 
Not in gzip + dzip format.".into())); + + // Read length of FEXTRA field + let xlen = LittleEndian::read_u16(&header[10..12]); + + // Read FEXTRA field + let mut fextra = vec![0; xlen as usize]; + buf.read_exact(&mut fextra)?; + rassert!(&fextra[0..2] == b"RA", InvalidFileFormat("No dictzip info found in FEXTRA header (behind XLEN, in SI1SI2 fields".into())); + + // Check subfield length + let subfield_length = LittleEndian::read_u16(&fextra[2..4]); + rassert!(subfield_length == xlen - 4, InvalidFileFormat( + "The length of the subfield should be the same as the FEXTRA field, \ + ignoring the additional length information and the file format identification".into() + )); + + // Check dictzip version + let version = LittleEndian::read_u16(&fextra[4..6]); + rassert!(version == 1, InvalidFileFormat("Unimplemented dictzip version, only version 1 supported".into())); + + // Before compression, the file is split into evenly-sized chunks and the + // size information is put right after the version information + let uchunk_length = LittleEndian::read_u16(&fextra[6..8]) as usize; + let chunk_count = LittleEndian::read_u16(&fextra[8..10]); + rassert!(chunk_count != 0, InvalidFileFormat("No compressed chunks in file or broken header information".into())); + + // Compute number of possible chunks which would fit into the FEXTRA field. + // Used for validity check, first 10 bytes of FEXTRA are header information, + // the rest are 2-byte, little-endian numbers. + let max_chunks = ((fextra.len() - 10) / 2) as u16; + rassert!(max_chunks == chunk_count, InvalidFileFormat(format!( + "Expected {} chunks according to dictzip header, but the FEXTRA field accomodate {}. Possibly broken file.", + chunk_count, max_chunks + ))); + + // If filename bit set, skip nul-terminated filename + if flags & GZ_FNAME != 0 { + while buf.read_u8()? != b'\0' {} + } + + // Skip comment + if flags & GZ_COMMENT != 0 { + while buf.read_u8()? 
!= b'\0' {} + } + + // Skip CRC bytes + if flags & GZ_FHCRC != 0 { + buf.seek(SeekFrom::Current(2))?; + } + + // Save length of each compressed chunk + let mut chunk_offsets = Vec::with_capacity(chunk_count as usize); + + // Save position of last compressed byte + // Note: This might not be EOF, could be followed by CRC checksum. + let mut end_compressed_data = buf.seek(SeekFrom::Current(0))?; + + // After the various header bytes parsed above, the list of chunk lengths + // can be found (slice for easier indexing) + let chunks_from_header = &fextra[10..(10 + chunk_count * 2) as usize]; + let chunk_sizes = chunks_from_header + .chunks(2) + .map(|slice| LittleEndian::read_u16(slice) as u64); + + // Push all chunk offsets + for size in chunk_sizes { + chunk_offsets.push(end_compressed_data); + end_compressed_data += size; + } + + rassert!(chunk_offsets.len() == chunk_count as usize, InvalidFileFormat( + "The read number of compressed chunks in the .dz file must be equivalent \ + to the number of chunks actually found in the file".into() + )); + + // Read uncompressed file length + buf.seek(SeekFrom::Start(end_compressed_data as u64))?; + let ufile_length = buf.read_i32::()? 
as u64; + + Ok(Self { + buf, + chunk_offsets, + end_compressed_data, + uchunk_length, + ufile_length, + }) + } + + /// Inflate a dictdz chunk + fn inflate(&self, data: Vec) -> Result, DictError> { + let mut decoder = flate2::Decompress::new(false); + let mut decoded = vec![0; self.uchunk_length]; + decoder.decompress(&data, &mut decoded, flate2::FlushDecompress::None)?; + + Ok(decoded) + } + + fn get_chunks_for(&self, start_offset: u64, length: u64) -> Result, DictError> { + let mut chunks = Vec::new(); + let start = start_offset as usize / self.uchunk_length; + let end = (start_offset + length) as usize / self.uchunk_length; + for id in start..=end { + let offset = self.chunk_offsets[id]; + let length = match self.chunk_offsets.get(id + 1) { + Some(next) => next - offset, + None => self.end_compressed_data - offset, + } as usize; + + chunks.push(Chunk { offset, length }); + } + + Ok(chunks) + } +} + +impl DictReader for Compressed { + fn fetch_definition(&mut self, start_offset: u64, length: u64) -> Result { + rassert!(length <= MAX_BYTES_FOR_BUFFER, MemoryError); + rassert!(start_offset + length < self.ufile_length, IoError(io::Error::new(io::ErrorKind::UnexpectedEof, + "Seek beyond the end of uncompressed data was requested." + ))); + + let mut data = Vec::new(); + for chunk in self.get_chunks_for(start_offset, length)? 
{ + let pos = self.buf.seek(SeekFrom::Start(chunk.offset))?; + rassert!(pos == chunk.offset, IoError(io::Error::new(io::ErrorKind::Other, format!( + "Attempted to seek to {} but new position is {}", + chunk.offset, pos + )))); + + let mut definition = vec![0; chunk.length]; + self.buf.read_exact(&mut definition)?; + data.push(self.inflate(definition)?); + } + + // Cut definition, convert to string + let cut_front = start_offset as usize % self.uchunk_length; + + let data = match data.len() { + 0 => unreachable!(), + 1 => data[0][cut_front..cut_front + length as usize].to_vec(), + n => { + let mut tmp = data[0][cut_front..].to_vec(); + + // First vec has been inserted into tmp, therefore skip first and last chunk, too + for text in data.iter().skip(1).take(n - 2) { + tmp.extend_from_slice(text); + } + + // Add last chunk to tmp, omitting stuff after word definition end + let remaining_bytes = (length as usize + cut_front) % self.uchunk_length; + tmp.extend_from_slice(&data[n - 1][..remaining_bytes]); + tmp + } + }; + + Ok(String::from_utf8(data)?) + } +} + diff --git a/src/dictreader.rs b/src/dictreader.rs deleted file mode 100644 index 93735e8..0000000 --- a/src/dictreader.rs +++ /dev/null @@ -1,327 +0,0 @@ -//! Open and read .dict or .dict.dz files -//! -//! This module contains traits and structs to work with uncompressed .dict and compressed .dict.dz -//! files. These files contain the actual dictionary content. While these readers return the -//! definitions, they do not do any post-processing. Definitions are normally plain text, but they -//! could be HTML, or anything else, in theory (although plain text is the de facto default). -//! -//! To understand some of the constants defined in this module or to understand the internals of -//! the DictReaderDz struct, it is advisable to have a brief look at -//! [the GZip standard](https://tools.ietf.org/html/rfc1952). 
- -use byteorder::*; -use std::ffi::OsStr; -use std::path::Path; -use std::fs::File; -use std::io; -use std::io::{BufReader, BufRead, Read, Seek, SeekFrom}; - -use crate::errors::DictError; - -/// limit size of a word buffer, so that malicious index files cannot request too much memory for a -/// translation -pub static MAX_BYTES_FOR_BUFFER: u64 = 1_048_576; // no headword definition is larger than 1M - -/// byte mask to query for existence of FEXTRA field in the flags byte of a `.dz` file -pub static GZ_FEXTRA: u8 = 0b0000_0100; -/// byte mask to query for the existence of a file name in a `.dz` file -pub static GZ_FNAME: u8 = 0b0000_1000; // indicates whether a file name is contained in the archive -/// byte mask to query for the existence of a comment in a `.dz` file -pub static GZ_COMMENT: u8 = 0b0001_0000; // indicates, whether a comment is present -/// byte mask to detect that a comment is contained in a `.dz` file -pub static GZ_FHCRC: u8 = 0b0000_0010; - - -/// .dict file format: either compressed or uncompressed -/// A dictionary (content) reader -/// -/// This type abstracts from the underlying seek operations required for lookup -/// of headwords and provides easy methods to search for a word given a certain -/// offset and length. Users of a type which implements this trait don't need to care about compression -/// of the dictionary. -pub trait DictReader { - /// fetch the definition from the dictionary at offset and length - fn fetch_definition(&mut self, start_offset: u64, length: u64) -> Result; -} - -/// Raw Dict reader -/// -/// This reader can read uncompressed .dict files. -pub struct DictReaderRaw { - dict_data: B, - total_length: u64, -} - -impl DictReaderRaw { - /// Get a new DictReader from a Reader. 
- pub fn new(mut dict_data: B) -> Result, DictError> { - let end = dict_data.seek(SeekFrom::End(0))?; - Ok(DictReaderRaw { dict_data, total_length: end }) - } -} - -impl DictReader for DictReaderRaw { - /// fetch definition from dictionary - fn fetch_definition(&mut self, start_offset: u64, length: u64) -> Result { - if length > MAX_BYTES_FOR_BUFFER { - return Err(DictError::MemoryError); - } - if (start_offset + length) > self.total_length { - return Err(DictError::IoError(io::Error::new(io::ErrorKind::UnexpectedEof, "a \ - seek beyond the end of uncompressed data was requested"))); - } - - self.dict_data.seek(SeekFrom::Start(start_offset))?; - let mut read_data = vec![0; length as usize]; - let bytes_read = self.dict_data.read(read_data.as_mut_slice())? as u64; - if bytes_read != length { // reading from end of file? - return Err(DictError::IoError(io::Error::new( - io::ErrorKind::UnexpectedEof, "seek beyond end of file"))); - } - Ok(String::from_utf8(read_data)?) - } -} - -/// Load a [DictReader](trait.DictReader.html) from file. -/// -/// This function loads a [Dictreader](trait.DictReader.html) from a file and transparently selects -/// the correct reader using the file type extension, so the callee doesn't need to care about -/// compression (`.dz`). -/// -/// # Errors -/// -/// The function can return a `DictError`, which can either occur if a I/O error occurs, or when -/// the GZ compressed file is invalid. -pub fn load_dict>(path: P) -> Result, DictError> { - if path.as_ref().extension() == Some(OsStr::new("dz")) { - let reader = File::open(path)?; - Ok(Box::new(DictReaderDz::new(reader)?)) - } else { - let reader = BufReader::new(File::open(path)?); - Ok(Box::new(DictReaderRaw::new(reader)?)) - } -} - - -// ----------------------------------------------------------------------------- -// gzip handling - -/// Gzip Dict reader -/// -/// This reader can read compressed .dict files with the file name suffix .dz. 
-/// This format is documented in RFC 1952 and in `man dictzip`. An example implementation can be -/// found in the dict daemon (dictd) in `data.c`. -pub struct DictReaderDz { - /// compressed DZ dictionary - dzdict: B, - /// length of an uncompressed chunk - uchunk_length: usize, - /// end of compressed data - end_compressed_data: usize, - /// offsets in file where a new compressed chunk starts - chunk_offsets: Vec, - /// total size of uncompressed file - ufile_length: u64, // has u64 to be quicker in comparing to offsets -} - -#[derive(Debug)] -// a (GZ) chunk, representing length and offset withing the compressed file -struct Chunk { - offset: usize, - length: usize, -} - -impl DictReaderDz { - /// Get a new DictReader from a Reader. - pub fn new(dzdict: B) -> Result, DictError> { - let mut buffered_dzdict = BufReader::new(dzdict); - let mut header = vec![0u8; 12]; - buffered_dzdict.read_exact(&mut header)?; - if header[0..2] != [0x1F, 0x8B] { - return Err(DictError::InvalidFileFormat("Not in gzip format".into(), None)); - } - - let flags = &header[3]; // bitmap of gzip attributes - if (flags & GZ_FEXTRA) == 0 { // check whether FLG.FEXTRA is set - return Err(DictError::InvalidFileFormat("Extra flag (FLG.FEXTRA) \ - not set, not in gzip + dzip format".into(), None)); - } - - // read XLEN, length of extra FEXTRA field - let xlen = LittleEndian::read_u16(&header[10..12]); - - // read FEXTRA data - let mut fextra = vec![0u8; xlen as usize]; - buffered_dzdict.read_exact(&mut fextra)?; - - if fextra[0..2] != [b'R', b'A'] { - return Err(DictError::InvalidFileFormat("No dictzip info found in FEXTRA \ - header (behind XLEN, in SI1SI2 fields)".into(), None)); - } - - let length_subfield = LittleEndian::read_u16(&fextra[2..4]); - assert_eq!(length_subfield, xlen - 4, "the length of the subfield \ - should be the same as the fextra field, ignoring the \ - additional length information and the file format identification"); - let subf_version = 
LittleEndian::read_u16(&fextra[4..6]); - if subf_version != 1 { - return Err(DictError::InvalidFileFormat("Unimplemented dictzip \ - version, only ver 1 supported".into(), None)); - } - - // before compression, the file is split into evenly-sized chunks and the size information - // is put right after the version information: - let uchunk_length = LittleEndian::read_u16(&fextra[6..8]); - // number of chunks in the file - let chunk_count = LittleEndian::read_u16(&fextra[8..10]); - if chunk_count == 0 { - return Err(DictError::InvalidFileFormat("No compressed chunks in \ - file or broken header information".into(), None)); - } - - // compute number of possible chunks which would fit into the FEXTRA field; used for - // validity check. first 10 bytes of FEXTRA are header information, the rest are 2-byte, - // little-endian numbers. - let numbers_chunks_which_would_fit = ((fextra.len() - 10) / 2) as u16; // each chunk represented by u16 == 2 bytes - // check that number of claimed chunks fits within given size for subfield - if numbers_chunks_which_would_fit != chunk_count { - return Err(DictError::InvalidFileFormat(format!("Expected {} chunks \ - according to dictzip header, but the FEXTRA field can \ - accomodate {}; possibly broken file", chunk_count, - numbers_chunks_which_would_fit), None)); - } - - // if file name bit set, seek beyond the 0-terminated file name, we don't care - if (flags & GZ_FNAME) != 0 { - let mut tmp = Vec::new(); - buffered_dzdict.read_until(b'\0', &mut tmp)?; - } - - // seek past comment, if any - if (flags & GZ_COMMENT) != 0 { - let mut tmp = Vec::new(); - buffered_dzdict.read_until(b'\0', &mut tmp)?; - } - - // skip CRC stuff, 2 bytes - if (flags & GZ_FHCRC) != 0 { - buffered_dzdict.seek(SeekFrom::Current(2))?; - } - - // save length of each compressed chunk - let mut chunk_offsets = Vec::with_capacity(chunk_count as usize); - // save position of last compressed byte (this is NOT EOF, could be followed by CRC checksum) - let mut 
end_compressed_data = buffered_dzdict.seek(SeekFrom::Current(0))? as usize; - // after the various header bytes parsed above, the list of chunk lengths can be found (slice for easier indexing) - let chunks_from_header = &fextra[10usize..(10 + chunk_count * 2) as usize]; - - // iterate over each 2nd byte, parse u16 - for index in (0..chunks_from_header.len()).filter(|i| (i%2)==0) { - let index = index as usize; - let compressed_len = LittleEndian::read_u16(&chunks_from_header[index..(index + 2)]) as usize; - chunk_offsets.push(end_compressed_data); - end_compressed_data += compressed_len; - } - assert_eq!(chunk_offsets.len() as u16, chunk_count, "The read number of compressed chunks in \ - the .dz file must be equivalent to the number of chunks actually found in the file.\n"); - - // read uncompressed file length - buffered_dzdict.seek(SeekFrom::Start(end_compressed_data as u64))?; - let uncompressed = buffered_dzdict.read_i32::()?; - - Ok(DictReaderDz { dzdict: buffered_dzdict.into_inner(), - chunk_offsets, - end_compressed_data, - uchunk_length: uchunk_length as usize, - ufile_length: uncompressed as u64 }) - } - - fn get_chunks_for(&self, start_offset: u64, length: u64) -> Result, DictError> { - let mut chunks = Vec::new(); - let start_chunk = start_offset as usize / self.uchunk_length; - let end_chunk = (start_offset + length) as usize / self.uchunk_length; - for id in start_chunk..=end_chunk { - let chunk_length = match self.chunk_offsets.get(id+1) { - Some(next) => next - self.chunk_offsets[id], - None => self.end_compressed_data - self.chunk_offsets[id], - }; - chunks.push(Chunk { offset: self.chunk_offsets[id], length: chunk_length }); - } - - Ok(chunks) - } - - // inflate a dictdz chunk - fn inflate(&self, data: Vec) -> Result, DictError> { - let mut decoder = flate2::Decompress::new(false); - let mut decoded = vec![0u8; self.uchunk_length]; - decoder.decompress(data.as_slice(), decoded.as_mut_slice(), flate2::FlushDecompress::None)?; - Ok(decoded) - } -} - 
-impl DictReader for DictReaderDz { - // Fetch definition from the dictionary. - fn fetch_definition(&mut self, start_offset: u64, length: u64) -> Result { - if length > MAX_BYTES_FOR_BUFFER { - return Err(DictError::MemoryError); - } - if (start_offset + length) > self.ufile_length { - return Err(DictError::IoError(io::Error::new(io::ErrorKind::UnexpectedEof, "a \ - seek beyond the end of uncompressed data was requested"))); - } - let mut data = Vec::new(); - for chunk in self.get_chunks_for(start_offset, length)? { - let pos = self.dzdict.seek(SeekFrom::Start(chunk.offset as u64))?; - if pos != (chunk.offset as u64) { - return Err(DictError::IoError(io::Error::new(io::ErrorKind::Other, format!( - "attempted to seek to {} but new position is {}", - chunk.offset, pos)))); - } - let mut definition = vec![0u8; chunk.length]; - self.dzdict.read_exact(&mut definition)?; - data.push(self.inflate(definition)?); - }; - - // cut definition, convert to string - let cut_front = start_offset as usize % self.uchunk_length; - // join the chunks to one vector, only keeping the content of the definition - let data = match data.len() { - 0 => panic!(), - 1 => data[0][cut_front .. cut_front + length as usize].to_vec(), - n => { - let mut tmp = data[0][cut_front..].to_vec(); - // first vec has been inserted into tmp, therefore skip first and last chunk, too - for text in data.iter().skip(1).take(n-2) { - tmp.extend_from_slice(text); - } - // add last chunk to tmp, omitting stuff after word definition end - let remaining_bytes = (length as usize + cut_front) % self.uchunk_length; - tmp.extend_from_slice(&data[n-1][..remaining_bytes]); - tmp - }, - }; - Ok(String::from_utf8(data)?) 
- } -} - -#[cfg(test)] -mod tests { - use super::*; - - fn load_resource(name: &str) -> ::std::fs::File { - let mut path = ::std::path::PathBuf::from(env!("CARGO_MANIFEST_DIR")); - path.push("tests"); - path.push("assets"); - path.push(name); - ::std::fs::File::open(path).unwrap() - } - - #[test] - fn test_number_of_parsed_chunks_is_correct() { - let rsrc = load_resource("lat-deu.dict.dz"); - let d = DictReaderDz::new(rsrc).unwrap(); - assert_eq!(d.chunk_offsets.len(), 7); - } -} - diff --git a/src/error.rs b/src/error.rs new file mode 100644 index 0000000..a980e95 --- /dev/null +++ b/src/error.rs @@ -0,0 +1,41 @@ +use std::io; +use std::string::FromUtf8Error; + +/// Error type, representing the errors which can be returned by the libdict library. +/// +/// This enum represents a handful of custom errors and wraps `io:::Error` and +/// `string::FromUtf8Error`. +#[derive(Debug, thiserror::Error)] +pub enum DictError { + /// Invalid character within the index file. Contains detailed positions within the index file. + #[error("Invalid character '{0}' found on line: {1} at position {2}.")] + InvalidCharacter(char, usize, usize), + + /// Occurs whenever a line in an index file misses a column. + #[error("Not enough tab-separated columns in index file, expected 3. Line: {0}")] + MissingColumnInIndex(usize), + + /// Invalid file format. Contains additional context of the error. + #[error("Encountered an invalid file format. Context: {0:?}")] + InvalidFileFormat(String), + + /// This reports a malicious/malformed index file, which requests a buffer which is too large. + #[error("Requested too much memory. Headword definitions are never larger than 1 MB. The index file is malicious or malformed.")] + MemoryError, + + /// This reports words which are not present in the dictionary. + #[error("Word \"{0}\" not found.")] + WordNotFound(String), + + /// A wrapped io::Error. 
+ #[error("Encountered an IO error.")] + IoError(#[from] io::Error), + + /// A wrapped string::FromUtf8Error. + #[error("Encountered a UTF-8 error.")] + Utf8Error(#[from] FromUtf8Error), + + /// Errors thrown by the flate2 crate - not really descriptive errors, though. + #[error("Encountered a decompression error.")] + Deflate(#[from] flate2::DecompressError), +} diff --git a/src/errors.rs b/src/errors.rs deleted file mode 100644 index 75ed349..0000000 --- a/src/errors.rs +++ /dev/null @@ -1,100 +0,0 @@ -//! Errors for the Dict dictionary crate. -use std::error; - -/// Error type, representing the errors which can be returned by the libdict library. -/// -/// This enum represents a handful of custom errors and wraps `io:::Error` and -/// `string::FromUtf8Error`. -#[derive(Debug)] -pub enum DictError { - /// Invalid character, e.g. within the index file; the error contains the erroneous character, - /// and optionally line and position. - InvalidCharacter(char, Option, Option), - /// Occurs whenever a line in an index file misses a column. - MissingColumnInIndex(usize), - /// Invalid file format, contains an explanation an optional path to the - /// file with the invalid file format. - InvalidFileFormat(String, Option), - /// This reports a malicious / malformed index file, which requests a buffer which is too large. - MemoryError, - /// This reports words which are not present in the dictionary. - WordNotFound(String), - /// A wrapped io::Error. - IoError(::std::io::Error), - /// A wrapped Utf8Error. - Utf8Error(::std::string::FromUtf8Error), - /// errors thrown by the flate2 crate - not really descriptive errors, though. 
- DeflateError(flate2::DecompressError), -} - -impl ::std::fmt::Display for DictError { - fn fmt(&self, f: &mut ::std::fmt::Formatter) -> ::std::fmt::Result { - match *self { - DictError::IoError(ref e) => e.fmt(f), - DictError::Utf8Error(ref e) => e.fmt(f), - DictError::DeflateError(ref err) => write!(f, "Error while using \ - the flate2 crate: {:?}", err), - DictError::MemoryError => write!(f, "not enough memory available"), - DictError::WordNotFound(ref word) => write!(f, "Word not found: {}", word), - DictError::InvalidCharacter(ref ch, ref line, ref pos) => { - let mut ret = write!(f, "Invalid character {}", ch); - if let Some(ln) = *line { - ret = write!(f, " on line {}", ln); - } - if let Some(pos) = *pos { - ret = write!(f, " at position {}", pos); - } - ret - }, - DictError::MissingColumnInIndex(ref lnum) => write!(f, "line {}: not \ - enough -separated columns found, expected 3", lnum), - DictError::InvalidFileFormat(ref explanation, ref path) => - write!(f, "{}{}", path.clone().unwrap_or_else(String::new), explanation) - } - } -} - -impl error::Error for DictError { - fn description(&self) -> &str { - match *self { - DictError::InvalidCharacter(_, _, _) => "invalid character", - DictError::MemoryError => "not enough memory available", - DictError::WordNotFound(_) => "word not found", - DictError::MissingColumnInIndex(_) => - "not enough -separated columns given", - DictError::InvalidFileFormat(ref _explanation, ref _path) => "could not \ - determine file format", - DictError::IoError(ref err) => err.description(), - DictError::DeflateError(_) => "invalid data, couldn't inflate", - DictError::Utf8Error(ref err) => err.description(), - } - } - - fn cause(&self) -> Option<&dyn error::Error> { - match *self { - DictError::IoError(ref err) => err.source(), - DictError::Utf8Error(ref err) => err.source(), - _ => None, - } - } -} - -// allow seamless coercion from::Error -impl From<::std::io::Error> for DictError { - fn from(err: ::std::io::Error) -> DictError 
{ - DictError::IoError(err) - } -} - -impl From<::std::string::FromUtf8Error> for DictError { - fn from(err: ::std::string::FromUtf8Error) -> DictError { - DictError::Utf8Error(err) - } -} - -impl From for DictError { - fn from(err: flate2::DecompressError) -> DictError { - DictError::DeflateError(err) - } -} - diff --git a/src/index/mod.rs b/src/index/mod.rs new file mode 100644 index 0000000..8b0854b --- /dev/null +++ b/src/index/mod.rs @@ -0,0 +1,16 @@ +mod parsing; + +use crate::DictError; +use std::{collections::HashMap, io::{Seek, Read, BufReader}}; + +pub struct Index { + pub words: HashMap, +} + +impl Index { + pub fn new(reader: R) -> Result { + let buf_reader = BufReader::new(reader); + parsing::parse(buf_reader) + } +} + diff --git a/src/index/parsing.rs b/src/index/parsing.rs new file mode 100644 index 0000000..5ce7ae4 --- /dev/null +++ b/src/index/parsing.rs @@ -0,0 +1,66 @@ +use super::Index; +use crate::DictError; +use std::collections::HashMap; +use std::io::BufRead; +use DictError::*; + +#[derive(Default)] +struct Context { + line: usize, + pos: usize, +} + +pub fn parse(reader: R) -> Result { + let mut ctx = Context::default(); + let mut words = HashMap::new(); + + for line in reader.lines() { + let (word, start_offset, length) = parse_line(&mut ctx, line?)?; + words.insert(word, (start_offset, length)); + } + + Ok(Index { words }) +} + +fn parse_line(ctx: &mut Context, line: String) -> Result<(String, u64, u64), DictError> { + let mut split = line.split('\t'); + + // 1st column + let word = split.next().ok_or(MissingColumnInIndex(ctx.line))?; + + // 2nd column - offset into file + ctx.pos = word.len(); + let s = split.next().ok_or(MissingColumnInIndex(ctx.line))?; + let start_offset = decode_number(&ctx, s)?; + + // 3rd column - entry length + ctx.pos += s.len(); + let s = split.next().ok_or(MissingColumnInIndex(ctx.line))?; + let length = decode_number(&ctx, s)?; + + // Advance context to new line + ctx.line += 1; + ctx.pos = 0; + + 
Ok((word.into(), start_offset, length)) +} + +fn decode_number(ctx: &Context, s: &str) -> Result { + let mut index = 0u64; + for (i, ch) in s.chars().rev().enumerate() { + index += get_base(ctx, ch)? * 64u64.pow(i as u32); + } + + Ok(index) +} + +fn get_base(ctx: &Context, ch: char) -> Result { + match ch { + 'A'..='Z' => Ok((ch as u64) - 65), // 'A' should become 0 + 'a'..='z' => Ok((ch as u64) - 71), // 'a' should become 26 + '0'..='9' => Ok((ch as u64) + 4), // 0 should become 52 + '+' => Ok(62), + '/' => Ok(63), + _ => Err(InvalidCharacter(ch, ctx.line, ctx.pos)), + } +} diff --git a/src/indexing.rs b/src/indexing.rs deleted file mode 100644 index e2394d1..0000000 --- a/src/indexing.rs +++ /dev/null @@ -1,105 +0,0 @@ -//! Parse and decode `*.index` files. -//! -//! Each dictionary file (`*.dict.?)`) is accompanied by a `*.index` file containing a list of -//! words, together with its (byte) position in the dict file and its (byte) length. This module -//! provides functions to parse this index file. -//! -//! The position and the length of a definition is given in a semi-base64 encoding. It uses all -//! Latin letters (upper and lower case), all digits and additionally, `+` and `/`: -//! -//! `ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789+/` -//! -//! The calculation works as follows: `sum += x * 64.pow(i` -//! -//! - `i` is the position within the string to calculate the number from and counts from right to -//! left, starting at 0. -//! - `x` is the index within the array given above, i.e. `'a' == 26`. -//! -//! The sum makes up the index. -use std::path::Path; -use std::collections::HashMap; -use std::io::{BufRead, BufReader}; -use std::fs::File; - -use crate::errors::DictError; -use crate::errors::DictError::*; - -/// Datastructure to hold the word → (position, length) information. -pub type Index = HashMap; - -/// Get the assigned number for a character -/// If the character was unknown, an empty Err(()) is returned. 
-#[inline] -fn get_base(input: char) -> Result { - match input { - 'A' ..= 'Z' => Ok((input as u64) - 65), // 'A' should become 0 - 'a' ..= 'z' => Ok((input as u64) - 71), // 'a' should become 26, ... - '0' ..= '9' => Ok((input as u64) + 4), // 0 should become 52 - '+' => Ok(62), - '/' => Ok(63), - _ => Err(()), - } -} - -/// Decode a number from a given String. -/// -/// This function decodes a number from the format described in the module documentation. If -/// unknown characters/bytes are encountered, a `DictError` is returned. -/// -/// # Example -/// -/// ``` -/// use dict::indexing::decode_number; -/// -/// fn main() { -/// let myoffset = "3W/"; -/// let myoffset = decode_number(myoffset).unwrap(); -/// assert_eq!(myoffset, 226751); -/// } -/// ``` -pub fn decode_number(word: &str) -> Result { - let mut index = 0u64; - for (i, character) in word.chars().rev().enumerate() { - index += match get_base(character) { - Ok(x) => x * 64u64.pow(i as u32), - Err(_) => return Err(InvalidCharacter(character, None, Some(i))), - }; - } - Ok(index) -} - -fn parse_line(line: &str, line_number: usize) -> Result<(&str, u64, u64), DictError> { - let mut split = line.split('\t'); - let word = split.next().ok_or(MissingColumnInIndex(line_number))?; - - // second column: offset into file - let start_offset = split.next().ok_or(MissingColumnInIndex(line_number))?; - let start_offset = decode_number(start_offset)?; - - // get entry length - let length = split.next().ok_or(MissingColumnInIndex(line_number))?; - let length = decode_number(length)?; - - Ok((word, start_offset, length)) -} - -/// Parse the index for a dictionary from a given BufRead compatible object. 
-pub fn parse_index(br: B) -> Result { - let mut index = HashMap::new(); - - for (line_number, line) in br.lines().enumerate() { - let line = line?; - let (word, start_offset, length) = parse_line(&line, line_number)?; - index.entry(word.to_string()).or_insert((start_offset, length)); - } - - Ok(index) -} - -/// Parse the index for a dictionary from a given path. -pub fn parse_index_from_file>(path: P) -> Result { - let file = File::open(path)?; - let file = BufReader::new(&file); - parse_index(file) -} - diff --git a/src/lib.rs b/src/lib.rs index c5696f9..6fd510b 100644 --- a/src/lib.rs +++ b/src/lib.rs @@ -8,24 +8,34 @@ //! The usage is straight forward: //! //! ```rust,no_run +//! use dict::Dict; +//! //! fn main() { //! let index_file = "/usr/share/dictd/freedict-lat-deu.index"; //! let dict_file = "/usr/share/dictd/freedict-lat-deu.dict.dz"; -//! let mut latdeu = dict::load_dictionary_from_file(dict_file, index_file).unwrap(); +//! let mut latdeu = Dict::from_file(dict_file, index_file).unwrap(); //! // hey: rust! //! println!("{}", latdeu.lookup("ferrugo").unwrap()); //! } //! ``` -pub mod dictreader; -pub mod errors; -pub mod indexing; - -use self::dictreader::DictReader; -use self::indexing::Index; +pub mod compressed; +mod error; +pub mod index; +mod reader; +mod uncompressed; +mod mmap; +pub use compressed::Compressed; +pub use error::DictError; +use mmap::MmapCursor; +pub use reader::{DictReader, MAX_BYTES_FOR_BUFFER}; +pub use uncompressed::Uncompressed; +pub use index::Index; +use std::ffi::OsStr; +use std::fs::File; +use std::io::BufReader; use std::path::Path; -use std::collections::HashMap; /// A dictionary wrapper. /// @@ -34,26 +44,72 @@ use std::collections::HashMap; /// information. It provides a convenience function to look up headwords directly, without caring /// about the details of the index and the underlying dict format. /// For an example, please see the [crate documentation](index.html). 
-pub struct Dictionary { - dict_reader: Box, - word_index: HashMap +pub struct Dict { + pub(crate) reader: Box, + pub(crate) index: Index, } -impl Dictionary { +impl Dict { + /// Creates a Dict with a BufReader. + pub fn from_file>(dict_path: P, index_path: P) -> Result { + let dict_reader = BufReader::new(File::open(&dict_path)?); + let index_reader = BufReader::new(File::open(&index_path)?); + + let reader: Box = if dict_path.as_ref().extension() == Some(OsStr::new("dz")) { + Box::new(Compressed::new(dict_reader)?) + } else { + Box::new(Uncompressed::new(dict_reader)?) + }; + + Ok(Self { + reader, + index: Index::new(index_reader)?, + }) + } + + /// Creates a Dict with an Mmap reader. + /// + /// # Note + /// In order for updates to the dictionary to happen soundly, the updater must use advisory + /// locks to exclusively lock the dictionary. + pub fn from_file_mmap>(dict_path: P, index_path: P) -> Result { + let dict_reader = MmapCursor::new(&dict_path)?; + let index_reader = BufReader::new(File::open(&index_path)?); + + let reader: Box = if dict_path.as_ref().extension() == Some(OsStr::new("dz")) { + Box::new(Compressed::new(dict_reader)?) + } else { + Box::new(Uncompressed::new(dict_reader)?) + }; + + Ok(Self { + reader, + index: Index::new(index_reader)?, + }) + } + + /// Creates a Dict from an existing DictReader and Index. + pub fn from_existing(reader: Box, index: Index) -> Result { + Ok(Self { reader, index }) + } + /// Look up a word in a dictionary. /// /// Words are looked up in the index and then retrieved from the dict file. If no word was /// found, `DictError::WordNotFound` is returned. Other errors all result from the parsing of /// the underlying files. 
- pub fn lookup(&mut self, word: &str) -> Result { - let &(start, length) = self.word_index.get(&word.to_lowercase()).ok_or_else(|| - errors::DictError::WordNotFound(word.into()))?; - self.dict_reader.fetch_definition(start, length) + pub fn lookup(&mut self, word: &str) -> Result { + let &(start, length) = self + .index + .words + .get(&word.to_lowercase()) + .ok_or_else(|| DictError::WordNotFound(word.into()))?; + self.reader.fetch_definition(start, length) } /// Check whether a word is contained in the index pub fn contains(&self, word: &str) -> bool { - self.word_index.get(&word.to_lowercase()).is_some() + self.index.words.get(&word.to_lowercase()).is_some() } /// Case-sensitive member check. @@ -62,17 +118,16 @@ impl Dictionary { /// it's lower case or not. This can help to avoid an additional allocation, if the caller can /// be sure that the string is already lower case. pub fn contains_unchecked(&self, word: &str) -> bool { - self.word_index.get(word).is_some() + self.index.words.get(word).is_some() } /// Get the short name. /// /// This returns the short name of a dictionary. This corresponds to the /// value passed to the `-s` option of `dictfmt`. - pub fn short_name(&mut self) -> Result { + pub fn short_name(&mut self) -> Result { self.lookup("00-database-short") .or_else(|_| self.lookup("00databaseshort")) - // Some dictionaries contain the headword in their entry, others don't: .map(|def| { let start = if def.starts_with("00-database-short") { 17 @@ -84,42 +139,83 @@ impl Dictionary { } } -/// Load dictionary from given paths -/// -/// A dictionary is made of an index and a dictionary (data) file, both are opened from the given -/// input file names. Gzipped files with the suffix `.dz` will be handled automatically. 
-pub fn load_dictionary_from_file>(content_fn: P, index_fn: P) -> Result { - let dreader = dictreader::load_dict(content_fn)?; - let index = indexing::parse_index_from_file(index_fn)?; - Ok(Dictionary { dict_reader: dreader, word_index: index }) -} - -/// Load dictionary from given [DictReader](dictreader/index.html) and [Index](indexing/type.Index.html). -/// -/// A dictionary is made of an index and a dictionary (data). Both are required for look up. This -/// function allows abstraction from the underlying source by only requiring a -/// [dictReader](dictreader) as trait object. This way, dictionaries from RAM or similar can be -/// implemented. -pub fn load_dictionary(content: Box, index: Index) -> Dictionary { - Dictionary { dict_reader: content, word_index: index } -} - #[cfg(test)] mod tests { use super::*; + use std::fs::File; + use std::path::PathBuf; + + fn get_asset_path() -> PathBuf { + PathBuf::from(env!("CARGO_MANIFEST_DIR")) + .join("tests") + .join("assets") + } + + fn get_resource(name: &str) -> PathBuf { + get_asset_path().join(name) + } + + fn load_resource(name: &str) -> File { + let res = get_resource(name); + File::open(res).unwrap() + } + + fn resource_over_bufreader(file: File) -> BufReader { + BufReader::new(file) + } + + fn resource_over_mmap(file: File) -> MmapCursor { + MmapCursor::from_file(file).unwrap() + } + + fn example_dictionary() -> Result { + let dict = get_asset_path().join("lat-deu.dict.dz"); + let index = get_asset_path().join("lat-deu.index"); - fn example_dictionary() -> Result { - let path = ::std::path::PathBuf::from(env!("CARGO_MANIFEST_DIR")) - .join("tests/assets"); - load_dictionary_from_file(path.join("lat-deu.dict.dz"), - path.join("lat-deu.index")) + Dict::from_file(dict, index) + } + + fn example_dictionary_mmap() -> Result { + let dict = get_asset_path().join("lat-deu.dict.dz"); + let index = get_asset_path().join("lat-deu.index"); + + Dict::from_file_mmap(dict, index) } #[test] fn test_getting_short_name() { let 
mut dict = example_dictionary().unwrap(); - assert_eq!(dict.short_name().ok(), - Some("Latin - German FreeDict dictionary ver. 0.4".to_string())); + + assert_eq!( + dict.short_name().ok(), + Some("Latin - German FreeDict dictionary ver. 0.4".to_string()) + ); + } + + #[test] + fn test_number_of_parsed_chunks_is_correct() { + let dict_file = resource_over_bufreader(load_resource("lat-deu.dict.dz")); + let reader = Compressed::new(dict_file).unwrap(); + + assert_eq!(reader.chunk_offsets.len(), 7); + } + + #[test] + fn test_getting_short_name_mmap() { + let mut dict = example_dictionary_mmap().unwrap(); + + assert_eq!( + dict.short_name().ok(), + Some("Latin - German FreeDict dictionary ver. 0.4".to_string()) + ); + } + + #[test] + fn test_number_of_parsed_chunks_is_correct_mmap() { + let dict_file = resource_over_mmap(load_resource("lat-deu.dict.dz")); + let reader = Compressed::new(dict_file).unwrap(); + + assert_eq!(reader.chunk_offsets.len(), 7); } } + diff --git a/src/mmap/mod.rs b/src/mmap/mod.rs new file mode 100644 index 0000000..5143c99 --- /dev/null +++ b/src/mmap/mod.rs @@ -0,0 +1,86 @@ +use std::{io, fs::File, path::Path}; +use advisory_lock::{AdvisoryFileLock, FileLockMode, FileLockError}; +use memmap2::Mmap; + +pub struct MmapCursor { + file: File, + mmap: Mmap, + pos: u64, +} + +impl MmapCursor { + pub fn new(path: impl AsRef) -> io::Result { + let file = File::open(path)?; + let mmap = unsafe { Mmap::map(&file)? }; + + Ok(Self { + file, + mmap, + pos: 0, + }) + } + + pub fn from_file(file: File) -> io::Result { + let mmap = unsafe { Mmap::map(&file)? }; + + Ok(Self { + file, + mmap, + pos: 0, + }) + } + + pub fn remaining_slice(&self) -> &[u8] { + let len = self.pos.min(self.mmap.len() as u64); + &self.mmap[(len as usize)..] 
+ } +} + +impl io::Read for MmapCursor { + fn read(&mut self, buf: &mut [u8]) -> io::Result { + self.file.lock(FileLockMode::Shared).or_else(|e| match e { + FileLockError::AlreadyLocked => unreachable!(), // File is in a blocking lock, shouldn't happen? + FileLockError::Io(e) => Err(e), + })?; + + let n = io::Read::read(&mut self.remaining_slice(), buf)?; + self.pos += n as u64; + Ok(n) + } +} + +impl io::Seek for MmapCursor { + fn seek(&mut self, pos: io::SeekFrom) -> io::Result { + self.file.lock(FileLockMode::Shared).or_else(|e| match e { + FileLockError::AlreadyLocked => unreachable!(), // File is in a blocking lock, shouldn't happen? + FileLockError::Io(e) => Err(e), + })?; + + let (base_pos, offset) = match pos { + io::SeekFrom::Start(n) => { + self.pos = n; + return Ok(n); + } + io::SeekFrom::End(n) => (self.mmap.len() as u64, n), + io::SeekFrom::Current(n) => (self.pos, n), + }; + + let new_pos = if offset >= 0 { + u64::checked_add(base_pos, offset as u64) + } else { + u64::checked_add(base_pos, offset.unsigned_abs()) + }; + + match new_pos { + Some(n) => { + self.pos = n; + Ok(n) + } + None => Err(io::Error::new( + io::ErrorKind::InvalidInput, + "invalid seek to a negative or overflowing position", + )), + } + } +} + diff --git a/src/reader.rs b/src/reader.rs new file mode 100644 index 0000000..6ca0939 --- /dev/null +++ b/src/reader.rs @@ -0,0 +1,11 @@ +use super::DictError; + +pub trait DictReader { + fn fetch_definition(&mut self, start_offset: u64, length: u64) -> Result; +} + +/// Limit size of a word buffer +/// +/// Headword definitions are never larger than 1 MB, so prevent malicious or malformed index files +/// from requesting too much memory for a translation. 
+pub const MAX_BYTES_FOR_BUFFER: u64 = 1_048_576; diff --git a/src/uncompressed.rs b/src/uncompressed.rs new file mode 100644 index 0000000..162326c --- /dev/null +++ b/src/uncompressed.rs @@ -0,0 +1,39 @@ +use super::{DictError, DictReader, MAX_BYTES_FOR_BUFFER}; +use rassert_rs::rassert; +use std::io::{self, Read, Seek, SeekFrom}; +use DictError::*; + +/// Uncompressed Dict reader +/// +/// This reader can read uncompressed .dict files. +pub struct Uncompressed { + pub(crate) buf: B, + pub(crate) length: u64, +} + +impl Uncompressed { + pub fn new(mut buf: B) -> Result { + let length = buf.seek(SeekFrom::End(0))?; + + Ok(Self { buf, length }) + } +} + +impl DictReader for Uncompressed { + fn fetch_definition(&mut self, start_offset: u64, length: u64) -> Result { + rassert!(length <= MAX_BYTES_FOR_BUFFER, MemoryError); + rassert!(start_offset + length <= self.length, IoError(io::Error::new(io::ErrorKind::UnexpectedEof, + "Seek beyond the end of uncompressed data was requested." + ))); + + self.buf.seek(SeekFrom::Start(start_offset))?; + let mut read_data = vec![0; length as usize]; + let bytes_read = self.buf.read(&mut read_data)? as u64; + rassert!(bytes_read == length, IoError(io::Error::new(io::ErrorKind::UnexpectedEof, + "Seek beyond end of file" + ))); + + Ok(String::from_utf8(read_data)?) 
+ } +} + diff --git a/tests/dict.rs b/tests/dict.rs new file mode 100644 index 0000000..b43fd08 --- /dev/null +++ b/tests/dict.rs @@ -0,0 +1,330 @@ +use std::fs::File; +use std::io::{BufReader, Cursor, Read}; +use std::path::PathBuf; + +use dict::index::Index; +use dict::*; + +fn get_asset_path() -> PathBuf { + PathBuf::from(env!("CARGO_MANIFEST_DIR")) + .join("tests") + .join("assets") +} + +fn get_resource(name: &str) -> PathBuf { + get_asset_path().join(name) +} + +fn load_resource(name: &str) -> File { + let res = get_resource(name); + File::open(res).unwrap() +} + +// Uncompressed dict reader + +#[test] +fn correct_position() { + let reader = Cursor::new("Ignore me: important"); + let mut dict = Uncompressed::new(reader).unwrap(); + let def = dict.fetch_definition(11, 9).unwrap(); + + assert_eq!(def, "important"); +} + +#[test] +fn seeking_to_start() { + let reader = Cursor::new("abcdefg"); + let mut dict = Uncompressed::new(reader).unwrap(); + let def = dict.fetch_definition(0, 3).unwrap(); + + assert_eq!(def, "abc"); +} + +#[test] +#[should_panic] +fn seeking_beyond_file() { + let reader = Cursor::new("xyz is too short ;)"); + let mut dict = Uncompressed::new(reader).unwrap(); + dict.fetch_definition(66642, 18).unwrap(); +} + +#[test] +#[should_panic] +fn reading_beyond_file_boundary() { + let reader = Cursor::new("blablablup"); + let mut dict = Uncompressed::new(reader).unwrap(); + dict.fetch_definition(0, 424242).unwrap(); +} + +#[test] +#[should_panic] +fn length_too_large() { + let reader = Cursor::new("blablablup"); + let mut dict = Uncompressed::new(reader).unwrap(); + dict.fetch_definition(0, 424242).unwrap(); +} + +// Compressed dict reader + +#[test] +#[should_panic] +fn wrong_file_id() { + let data = Cursor::new(vec![0x1F, 0x8C]); + Compressed::new(data).unwrap(); +} + +#[test] +fn right_file_id() { + let file = load_resource("lat-deu.dict.dz"); + Compressed::new(file).unwrap(); +} + +#[test] +#[should_panic] +fn no_fextra_field() { + let mut file 
= load_resource("lat-deu.dict.dz"); + let mut data = Vec::new(); + file.read_to_end(&mut data).unwrap(); + + // Reset flags field to 0 + data[3] = 0; + + let data = Cursor::new(data); + Compressed::new(data).unwrap(); +} + +#[test] +#[should_panic] +fn invalid_si_bytes() { + // si1si2 are the identification for the dictzip extension + let mut file = load_resource("lat-deu.dict.dz"); + let mut data = Vec::new(); + file.read_to_end(&mut data).unwrap(); + + // Zero the SI1/SI2 dictzip identification bytes + data[12] = 0; + data[13] = 0; + + let data = Cursor::new(data); + Compressed::new(data).unwrap(); +} + +#[test] +#[should_panic] +fn invalid_version_number() { + // dictzip format specifies a field called "VER" + let mut file = load_resource("lat-deu.dict.dz"); + let mut data = Vec::new(); + file.read_to_end(&mut data).unwrap(); + + // Reset version field to 0 + data[16] = 0; + data[17] = 0; + + let data = Cursor::new(data); + Compressed::new(data).unwrap(); +} + +#[test] +#[should_panic] +fn mismatched_subfield_and_fextra_length() { + // the "FEXTRA" length (also called XLEN in the specification) contains the additional header + // information for the dictzip format.
This field has a header on its own and hence it is + // necessary to check whether both match and whether non-matching field lengths are detected + let mut file = load_resource("lat-deu.dict.dz"); + let mut data = Vec::new(); + file.read_to_end(&mut data).unwrap(); + + // Corrupt the XLEN (FEXTRA length) field + data[14] = 8; + data[15] = 9; + + let data = Cursor::new(data); + Compressed::new(data).unwrap(); +} + +#[test] +#[should_panic] +fn chunk_count_is_zero() { + let mut file = load_resource("lat-deu.dict.dz"); + let mut data = Vec::new(); + file.read_to_end(&mut data).unwrap(); + + // Reset chunk count to 0 + data[20] = 0; + data[21] = 0; + + let data = Cursor::new(data); + Compressed::new(data).unwrap(); +} + +#[test] +#[should_panic] +fn mismatched_chunk_count_and_xlen() { + let mut file = load_resource("lat-deu.dict.dz"); + let mut data = Vec::new(); + file.read_to_end(&mut data).unwrap(); + + // Corrupt the chunk count so it no longer matches XLEN + data[20] = 8; + data[21] = 9; + + let data = Cursor::new(data); + Compressed::new(data).unwrap(); +} + +#[test] +fn word_doesnt_exist() { + let dict_path = get_resource("lat-deu.dict.dz"); + let index_path = get_resource("lat-deu.index"); + let mut dict = Dict::from_file(dict_path, index_path).unwrap(); + + assert!(dict.lookup("testtesttest").is_err()); +} + +#[test] +fn word_does_exist() { + let dict_path = get_resource("lat-deu.dict.dz"); + let index_path = get_resource("lat-deu.index"); + let mut dict = Dict::from_file(dict_path, index_path).unwrap(); + let word = dict.lookup("mater").unwrap(); + + assert!(word.starts_with("mater")); +} + +#[test] +fn get_word_from_first_chunk() { + let dict_path = get_resource("lat-deu.dict.dz"); + let index_path = get_resource("lat-deu.index"); + let mut dict = Dict::from_file(dict_path, index_path).unwrap(); + let word = dict.lookup("amo").unwrap(); + + assert!(word.starts_with("amo")); +} + +#[test] +fn get_word_from_last_chunk() { + let dict_path = get_resource("lat-deu.dict.dz"); + let index_path = 
get_resource("lat-deu.index"); + let mut dict = Dict::from_file(dict_path, index_path).unwrap(); + let word = dict.lookup("vultus").unwrap(); + + assert!(word.starts_with("vultus")); +} + +#[test] +fn get_word_split_at_chunk_border() { + let dict_path = get_resource("lat-deu.dict.dz"); + let index_path = get_resource("lat-deu.index"); + let mut dict = Dict::from_file(dict_path, index_path).unwrap(); + let word = dict.lookup("circumfero").unwrap(); + + // For the above dictionary, the chunk (or block) length of each uncompressed chunk is 58315; + // Exactly there, the definition circumfero is split into two pieces: + assert!(word.starts_with("circumfero")); + assert!(word.ends_with("herumtreiben\n")); +} + +#[test] +fn comment_parsing_correct() { + let mut file = load_resource("lat-deu.dict.dz"); + let mut data = Vec::new(); + file.read_to_end(&mut data).unwrap(); + + // Set comment bit to 1 + data[3] |= dict::compressed::GZ_COMMENT; + + // Add comment after file name; the header itself is for this particular file 36 bytes + 13 + // bytes file name (byte 13 is 0-byte) + let mut newdata: Vec = Vec::with_capacity(data.len() - 13); + newdata.extend(&data[0..49]); + newdata.extend(b"hi there\0"); // Insert comment + newdata.extend(&data[49..]); + + let index_reader = BufReader::new(File::open(get_resource("lat-deu.index")).unwrap()); + let index = Index::new(index_reader).unwrap(); + let data = Cursor::new(newdata); + let reader = Box::new(Compressed::new(data).unwrap()); + let mut dict = Dict::from_existing(reader, index).unwrap(); + let word = dict.lookup("mater").unwrap(); + + assert!(word.starts_with("mater")); +} + +#[test] +fn no_filename_correct() { + let mut file = load_resource("lat-deu.dict.dz"); + let mut data = Vec::new(); + file.read_to_end(&mut data).unwrap(); + + // Reset fname bit to 0 + data[3] &= !dict::compressed::GZ_FNAME; + + // flags byte of gz header + // remove file name from file; there are various fields in the gz header, which I won't repeat; 
+ // together with the bytes in fextra (listing 7 compressed chunks), the file name starts at + // position 36. If you want to check the maths, have a look at src/dictreader.rs. The file name + // is 13 bytes long, so these need to be extracted: + let mut newdata: Vec = Vec::with_capacity(data.len() - 13); + newdata.extend(&data[0..36]); + newdata.extend(&data[49..]); + + let index_reader = BufReader::new(File::open(get_resource("lat-deu.index")).unwrap()); + let index = Index::new(index_reader).unwrap(); + let data = Cursor::new(newdata); + let reader = Box::new(Compressed::new(data).unwrap()); + let mut dict = Dict::from_existing(reader, index).unwrap(); + let word = dict.lookup("mater").unwrap(); + + assert!(word.starts_with("mater")); +} + +#[test] +#[should_panic] +fn seek_beyond_end_of_file() { + let mut file = load_resource("lat-deu.dict.dz"); + let mut data = Vec::new(); + file.read_to_end(&mut data).unwrap(); + + let data = Cursor::new(data); + let mut dict = Compressed::new(data).unwrap(); + dict.fetch_definition(9999999999u64, 888u64).unwrap(); +} + + +// Mmap tests + +#[test] +fn get_word_from_first_chunk_mmap() { + let dict_path = get_resource("lat-deu.dict.dz"); + let index_path = get_resource("lat-deu.index"); + let mut dict = Dict::from_file_mmap(dict_path, index_path).unwrap(); + let word = dict.lookup("amo").unwrap(); + + assert!(word.starts_with("amo")); +} + +#[test] +fn get_word_from_last_chunk_mmap() { + let dict_path = get_resource("lat-deu.dict.dz"); + let index_path = get_resource("lat-deu.index"); + let mut dict = Dict::from_file_mmap(dict_path, index_path).unwrap(); + let word = dict.lookup("vultus").unwrap(); + + assert!(word.starts_with("vultus")); +} + +#[test] +fn get_word_split_at_chunk_border_mmap() { + let dict_path = get_resource("lat-deu.dict.dz"); + let index_path = get_resource("lat-deu.index"); + let mut dict = Dict::from_file_mmap(dict_path, index_path).unwrap(); + let word = dict.lookup("circumfero").unwrap(); + + // For the 
above dictionary, the chunk (or block) length of each uncompressed chunk is 58315; + // Exactly there, the definition circumfero is split into two pieces: + assert!(word.starts_with("circumfero")); + assert!(word.ends_with("herumtreiben\n")); +} + + diff --git a/tests/index.rs b/tests/index.rs new file mode 100644 index 0000000..c158cb4 --- /dev/null +++ b/tests/index.rs @@ -0,0 +1,48 @@ +use dict::*; +use std::io::Cursor; + +// Index parsing + +#[test] +#[should_panic] +fn invalid_line() { + let reader = Cursor::new("blabla\nblublbub yo"); + Index::new(reader).unwrap(); +} + +#[test] +#[should_panic] +fn invalid_column() { + let reader = Cursor::new("only one\t(tab) character"); + Index::new(reader).unwrap(); +} + +#[test] +fn good_line() { + let reader = Cursor::new("word\toffset\tlength"); + let index = Index::new(reader).unwrap(); + + assert_eq!( + *index.words.get("word").unwrap(), + (43478075309, 40242121569) + ); +} + +#[test] +fn two_entries_parsed() { + let reader = Cursor::new("word\toffset\tlength\nanother\ta0b\tc"); + let index = Index::new(reader).unwrap(); + + assert_eq!( + *index.words.get("word").unwrap(), + (43478075309, 40242121569) + ); + assert_eq!(*index.words.get("another").unwrap(), (109851, 28)); +} + +#[test] +#[should_panic] +fn number_parsing_fails() { + let reader = Cursor::new("valid word\tinvalid_offset\tDA"); + Index::new(reader).unwrap(); +} diff --git a/tests/test_dictreader.rs b/tests/test_dictreader.rs deleted file mode 100644 index 4a99f60..0000000 --- a/tests/test_dictreader.rs +++ /dev/null @@ -1,282 +0,0 @@ -use std::fs::File; -use std::io::{Cursor, Read}; -use std::path::PathBuf; - -use dict::*; -use dict::dictreader::*; - -type StringFile = Cursor; - -fn get_asset_path(fname: &str) -> PathBuf { - let mut path = PathBuf::from(env!("CARGO_MANIFEST_DIR")); - path.push("tests"); - path.push("assets"); - path.push(fname); - path -} - -// load test resource from tests/assets -fn load_resource(name: &str) -> File { - let path = 
get_asset_path(name); - File::open(path).unwrap() -} - - -fn str2file(input: &str) -> StringFile { - let input = input.to_string(); - // Cursor<&[u8]> implements Read and Seek - Cursor::new(input) -} - - -fn mk_dict(x: StringFile) -> dictreader::DictReaderRaw { - dictreader::DictReaderRaw::new(x).unwrap() -} - -#[test] -fn test_that_dictreader_does_to_correct_position() { - let text = str2file("Ignore me: important"); - assert_eq!(mk_dict(text).fetch_definition(11, 9).unwrap(), "important"); -} - -#[test] -fn test_that_seeking_to_beginning_works() { - let text = str2file("abcdefg"); - assert_eq!(mk_dict(text).fetch_definition(0, 3).unwrap(), "abc"); -} - -#[test] -#[should_panic] -fn test_that_seeking_beyond_file_is_caught() { - let text = str2file("xyz is too short ;)"); - mk_dict(text).fetch_definition(66642, 18).unwrap(); -} - -#[test] -#[should_panic] -fn test_that_reading_beyond_file_boundary_is_caught() { - let text = str2file("blablablup"); - mk_dict(text).fetch_definition(0, 424242).unwrap(); -} - -#[test] -#[should_panic] -fn test_error_if_length_is_too_large() { - let mut longfile = String::with_capacity(dictreader::MAX_BYTES_FOR_BUFFER as usize + 10); - for _ in 0..(dictreader::MAX_BYTES_FOR_BUFFER+10) { - longfile.push('u'); - } - let text = str2file(&longfile); - mk_dict(text).fetch_definition(0, dictreader::MAX_BYTES_FOR_BUFFER+1).unwrap(); -} - -//////////////////////////////////////////////////////////////////////////////// -// test dict.dz reader - - -#[test] -#[should_panic] -fn test_files_with_incorrect_file_id_are_detected() { - let data = Cursor::new(vec![0x1F, 0x8C]); - DictReaderDz::new(data).unwrap(); -} - -#[test] -fn test_files_with_correct_file_id_work() { - let file = load_resource("lat-deu.dict.dz"); - DictReaderDz::new(file).unwrap(); -} - -#[test] -#[should_panic] -fn test_gzip_files_without_fextra_panic() { - let mut rsrc = load_resource("lat-deu.dict.dz"); - let mut data = Vec::new(); - rsrc.read_to_end(&mut data).unwrap(); - // 
reset flags field to 0 - data[3] = 0; - let data = Cursor::new(data); - DictReaderDz::new(data).unwrap(); -} - -#[test] -#[should_panic] -fn test_that_file_with_invalid_si_bytes_is_reported() { - // si1si2 are the identification for the dictzip extension - let mut rsrc = load_resource("lat-deu.dict.dz"); - let mut data = Vec::new(); - rsrc.read_to_end(&mut data).unwrap(); - // reset flags field to 0 - data[12] = 0; - data[13] = 0; - let data = Cursor::new(data); - DictReaderDz::new(data).unwrap(); -} - -#[test] -#[should_panic] -fn test_gzip_with_invalid_version_num_are_reported() { - // the dictzip format specifies a field called "VER" - let mut rsrc = load_resource("lat-deu.dict.dz"); - let mut data = Vec::new(); - rsrc.read_to_end(&mut data).unwrap(); - // reset version field to 0 - data[16] = 0; - data[17] = 0; - let data = Cursor::new(data); - DictReaderDz::new(data).unwrap(); -} - -#[test] -#[should_panic] -fn test_mismatching_subfield_length_and_fextra_length_is_reported() { - // the "FEXTRA" length (also called XLEN in the specification) contains the additional header - // information for the dictzip format. This field has a header on its own and hence it is - // necessary to check whether both match and whether non-matching field lengths are detected - // the dictzip format specifies a field called "VER" - let mut rsrc = load_resource("lat-deu.dict.dz"); - let mut data = Vec::new(); - rsrc.read_to_end(&mut data).unwrap(); - // reset flags field to 0 - data[14] = 0; - data[15] = 0; - let data = Cursor::new(data); - DictReaderDz::new(data).unwrap(); -} - -#[test] -#[should_panic] -fn test_chunk_count_may_not_be_0() { - // the "FEXTRA" length (also called XLEN in the specification) contains the additional header - // information for the dictzip format. 
This field has a header on its own and hence it is - // necessary to check whether both match and whether non-matching field lengths are detected - // the dictzip format specifies a field called "VER" - let mut rsrc = load_resource("lat-deu.dict.dz"); - let mut data = Vec::new(); - rsrc.read_to_end(&mut data).unwrap(); - // reset chunk count to 0 - data[20] = 0; - data[21] = 0; - let data = Cursor::new(data); - DictReaderDz::new(data).unwrap(); -} - - -#[test] -#[should_panic] -fn test_chunk_count_and_xlen_must_match() { - // doc see above - let mut rsrc = load_resource("lat-deu.dict.dz"); - let mut data = Vec::new(); - rsrc.read_to_end(&mut data).unwrap(); - // reset chunk count to 0 - data[20] = 8; - data[21] = 9; - let data = Cursor::new(data); - DictReaderDz::new(data).unwrap(); -} - -#[test] -fn test_retrieval_of_a_word_which_doesnt_exist_yields_error() { - let dictdz = get_asset_path("lat-deu.dict.dz"); - let index = get_asset_path("lat-deu.index"); - let mut dict = load_dictionary_from_file(dictdz, index).unwrap(); - assert!(dict.lookup("testtesttest").is_err()); -} - -#[test] -fn test_retrieval_of_a_word_which_exists_works() { - let dictdz = get_asset_path("lat-deu.dict.dz"); - let index = get_asset_path("lat-deu.index"); - let mut dict = load_dictionary_from_file(dictdz, index).unwrap(); - let word = dict.lookup("mater"); - let word = word.unwrap(); - assert!(word.starts_with("mater")); -} - -#[test] -fn test_that_word_from_first_chunk_works() { - let dictdz = get_asset_path("lat-deu.dict.dz"); - let index = get_asset_path("lat-deu.index"); - let mut dict = load_dictionary_from_file(dictdz, index).unwrap(); - let word = dict.lookup("amo").unwrap(); - assert!(word.starts_with("amo")); -} - -#[test] -fn test_lookup_into_last_chunk_works() { - let dictdz = get_asset_path("lat-deu.dict.dz"); - let index = get_asset_path("lat-deu.index"); - let mut dict = load_dictionary_from_file(dictdz, index).unwrap(); - let word = dict.lookup("vultus").unwrap(); - 
assert!(word.starts_with("vultus")); -} - -#[test] -fn test_that_definitions_wrapping_around_chunk_border_are_extracted_correctly() { - let dictdz = get_asset_path("lat-deu.dict.dz"); - let index = get_asset_path("lat-deu.index"); - let mut dict = load_dictionary_from_file(dictdz, index).unwrap(); - // for the above dictionary, the chunk (or block) length of each uncompressed chunk is 58315; - // exactly there, the definition circumfero is split into two pieces: - let word = dict.lookup("circumfero").unwrap(); - assert!(word.starts_with("circumfero")); - // last word from definition must be present, too - assert!(word.ends_with("herumtreiben\n")); -} - -#[test] -fn test_files_with_comment_is_parsed_correctly() { - // file in assets has no comment, so add one - let mut rsrc = load_resource("lat-deu.dict.dz"); - let mut data = Vec::new(); - rsrc.read_to_end(&mut data).unwrap(); - // set comment bit to 1 - data[3] |= dictreader::GZ_COMMENT; - // add comment _after_file name; the header itself is for this particular file 36 bytes + 13 - // bytes file name (byte 13 is 0-byte) - let mut newdata: Vec = Vec::with_capacity(data.len() - 13); - newdata.extend(&data[0..49]); - // "h", "i", " ", "t", "h", "e", "r", "e" - newdata.extend(vec![104u8, 105u8, 32u8, 116u8, 104u8, 101u8, 114u8, 101u8, 0u8]); - newdata.extend(&data[49..]); - - let data = dict::dictreader::DictReaderDz::new(Cursor::new(newdata)).unwrap(); - let index = dict::indexing::parse_index_from_file(get_asset_path("lat-deu.index")).unwrap(); - let mut dict = dict::load_dictionary(Box::new(data), index); - let word = dict.lookup("mater"); - let word = word.unwrap(); - assert!(word.starts_with("mater")); -} - -#[test] -fn test_file_without_file_name_is_parsed_correctly() { - let mut rsrc = load_resource("lat-deu.dict.dz"); - let mut data = Vec::new(); - rsrc.read_to_end(&mut data).unwrap(); - // reset fname bit to 0 - data[3] &= !dictreader::GZ_FNAME; // flags byte of gz header - // remove file name from file; 
there are various fields in the gz header, which I won't repeat; - // together with the bytes in fextra (listing 7 compressed chunks), the file name starts at - // position 36. If you want to check the maths, have a look at src/dictreader.rs. The file name - // is 13 bytes long, so these need to be extracted: - let mut newdata: Vec = Vec::with_capacity(data.len() - 13); - newdata.extend(&data[0..36]); - newdata.extend(&data[49..]); - - let data = dict::dictreader::DictReaderDz::new(Cursor::new(newdata)).unwrap(); - let index = dict::indexing::parse_index_from_file(get_asset_path("lat-deu.index")).unwrap(); - let mut dict = dict::load_dictionary(Box::new(data), index); - let word = dict.lookup("mater"); - let word = word.unwrap(); - assert!(word.starts_with("mater")); -} - -#[test] -#[should_panic] -fn test_that_seek_beyond_end_of_file_is_detected() { - let dictdz = get_asset_path("lat-deu.dict.dz"); - let mut dict = dictreader::load_dict(dictdz).unwrap(); - dict.fetch_definition(9999999999u64, 888u64).unwrap(); -} - diff --git a/tests/test_indexing.rs b/tests/test_indexing.rs deleted file mode 100644 index c10ebed..0000000 --- a/tests/test_indexing.rs +++ /dev/null @@ -1,94 +0,0 @@ -use dict::indexing::*; - -use std::io::Cursor; - -//////////////////////////////////////////////////////////////////////////////// -// Test single-character calculations -/////////////////////////////////////////////////////////////////////////////// - -#[test] -fn test_that_uppercase_letters_get_correct_number() { - assert_eq!(dict::indexing::decode_number("A").unwrap(), 0); - assert_eq!(dict::indexing::decode_number("M").unwrap(), 12); - assert_eq!(dict::indexing::decode_number("Z").unwrap(), 25); -} - -#[test] -fn test_that_lowercase_letters_get_correct_number() { - assert_eq!(dict::indexing::decode_number("a").unwrap(), 26); - assert_eq!(dict::indexing::decode_number("m").unwrap(), 38); - assert_eq!(dict::indexing::decode_number("z").unwrap(), 51); -} - -#[test] -fn 
test_that_characters_get_correct_number() { - assert_eq!(dict::indexing::decode_number("0").unwrap(), 52); - assert_eq!(dict::indexing::decode_number("9").unwrap(), 61); -} - -#[test] -fn test_that_slash_and_plus_get_correct_number() { - assert_eq!(dict::indexing::decode_number("+").unwrap(), 62); - assert_eq!(dict::indexing::decode_number("/").unwrap(), 63); -} - -#[test] -fn test_that_unknown_characters_return_error() { - assert!(dict::indexing::decode_number("*").is_err(), 99999); -} - -//////////////////////////////////////////////////////////////////////////////// -// Test multi-character-calculations calculations -/////////////////////////////////////////////////////////////////////////////// - -#[test] -fn test_that_big_offsets_work() { - assert_eq!(dict::indexing::decode_number("3fW2").unwrap(), 14546358); -} - -#[test] -fn test_that_short_strings_work() { - assert_eq!(dict::indexing::decode_number("c").unwrap(), 28); -} - -//////////////////////////////////////////////////////////////////////////////// -// Test parse_index -//////////////////////////////////////////////////////////////////////////////// - -fn mk_file(input: &str) -> Box> { - let input = input.to_string(); - // Cursor<&[u8]> implements BufRead already - Box::new(Cursor::new(input)) -} - -#[test] -#[should_panic] -fn test_that_invalid_line_causes_error() { - parse_index(*mk_file("blabla\nblublbub yo")).unwrap(); -} - -#[test] -#[should_panic] -fn test_only_one_tab_causes_panic() { - parse_index(*mk_file("only one\t(tab) character")).unwrap(); -} - -#[test] -fn test_that_normal_entry_works() { - let index = parse_index(*mk_file("word\toffset\tlength")).unwrap(); - assert_eq!(*(index.get("word").unwrap()), (43478075309, 40242121569)); -} - -#[test] -fn test_that_two_entries_are_parsed() { - let index = parse_index(*mk_file("word\toffset\tlength\nanother\ta0b\tc")).unwrap(); - assert_eq!(*(index.get("word").unwrap()), (43478075309, 40242121569)); - assert_eq!(*(index.get("another").unwrap()), 
(109851, 28)); -} - -#[test] -#[should_panic] -fn test_that_number_parsing_errors_are_propagated() { - parse_index(*mk_file("valid word\tinvalid_offset\tDA")).unwrap(); -} -