crc: add CRC32C to every FST

This modifies the FST builder to compute the CRC32C checksum as the FST is being constructed. It is written as the last 4 bytes of the FST. We also add a new `verify` routine on `raw::Fst` that permits callers to check whether the FST's integrity is intact. Since verification could be quite expensive for very large FSTs, we do not do this by default. We avoid a dependency on a CRC32C crate since hand-rolling it ourselves is very simple and not much code. I tried using a SIMD version of CRC32C, but I couldn't benchmark a difference. In particular, I suspect that the FST writing process is doing a lot of small writes, so the checksummer doesn't get much opportunity to checksum a lot of bytes at once. It's not quite clear how to fix this. We could use our own buffer internally, but then the caller wouldn't be able to use their own buffered writer, which is a bit weird, though not without precedent. In any case, the overhead of checksumming during construction is virtually negligible, so we don't sweat it for now.
BurntSushi · Mar 7, 2020 · 954fa87 · 954fa87
1 parent d2ea6f3
commit 954fa87
Show file tree

Hide file tree

Showing 10 changed files with 420 additions and 21 deletions.
diff --git a/build.rs b/build.rs
@@ -0,0 +1,124 @@
+use std::env;
+use std::fs::File;
+use std::io::{self, Write};
+use std::path::{Path, PathBuf};
+
+const CASTAGNOLI_POLY: u32 = 0x82f63b78;
+
+type Result<T> = std::result::Result<T, Box<dyn std::error::Error>>;
+
+/// Build-script entry point: runs `try_main` and aborts the build with the
+/// error's display text if it fails.
+fn main() {
+    try_main().unwrap_or_else(|err| panic!("{}", err));
+}
+
+/// Resolves the `OUT_DIR` environment variable and writes the generated
+/// source files into that directory.
+///
+/// Returns an error if `OUT_DIR` is not set or if either table fails to be
+/// written.
+fn try_main() -> Result<()> {
+    let out_dir = env::var_os("OUT_DIR")
+        .map(PathBuf::from)
+        .ok_or("OUT_DIR environment variable not defined")?;
+    write_tag_lookup_table(&out_dir)?;
+    write_crc_tables(&out_dir)
+}
+
+/// Writes `tag.rs` into `out_dir`, containing a `TAG_LOOKUP_TABLE` constant
+/// with one `tag_entry` value for every possible byte (0 through 255).
+///
+/// Returns an error if the file cannot be created or written.
+fn write_tag_lookup_table(out_dir: &Path) -> Result<()> {
+    let out_path = out_dir.join("tag.rs");
+    let mut out = io::BufWriter::new(File::create(out_path)?);
+
+    writeln!(out, "pub const TAG_LOOKUP_TABLE: [u16; 256] = [")?;
+    for b in 0u8..=255 {
+        writeln!(out, "    {},", tag_entry(b))?;
+    }
+    writeln!(out, "];")?;
+
+    // Flush explicitly: dropping a `BufWriter` discards any error from the
+    // final write, which could leave a truncated file behind unnoticed.
+    out.flush()?;
+
+    Ok(())
+}
+
+/// Computes the lookup-table entry for a single tag byte.
+///
+/// The low two bits of `b` select one of four cases. In every case the low
+/// bits of the result carry a length and bits 11 and up carry a small count:
+///
+/// * `0b00`: the length is `(b >> 2) + 1`. Lengths up to 60 are returned
+///   as-is; lengths 61..=64 are returned as `(length - 60) << 11`.
+/// * `0b01`: the length is `4 + ((b >> 2) & 0b111)`, bits 8..=10 hold
+///   `(b >> 5) & 0b111`, and the count is 1.
+/// * `0b10`: the length is `1 + (b >> 2)` and the count is 2.
+/// * `0b11`: the length is `1 + (b >> 2)` and the count is 4.
+fn tag_entry(b: u8) -> u16 {
+    let byte = u16::from(b);
+    let kind = byte & 0b11;
+    let upper = byte >> 2;
+
+    if kind == 0b00 {
+        let lit_len = upper + 1;
+        if lit_len > 60 {
+            assert!(lit_len <= 64);
+            return (lit_len - 60) << 11;
+        }
+        return lit_len;
+    }
+
+    if kind == 0b01 {
+        let len = 4 + (upper & 0b111);
+        let offset = (byte >> 5) & 0b111;
+        return (1 << 11) | (offset << 8) | len;
+    }
+
+    // Only 0b10 and 0b11 remain; they differ solely in the count field.
+    let count: u16 = if kind == 0b10 { 2 } else { 4 };
+    (count << 11) | (1 + upper)
+}
+
+/// Writes `crc32_table.rs` into `out_dir`.
+///
+/// The generated file defines two constants built from `CASTAGNOLI_POLY`:
+/// `TABLE` (256 entries) and `TABLE16` (16 tables of 256 entries each).
+/// Returns an error if the file cannot be created, written or flushed.
+fn write_crc_tables(out_dir: &Path) -> Result<()> {
+    let single = make_table(CASTAGNOLI_POLY);
+    let sliced = make_table16(CASTAGNOLI_POLY);
+
+    let file = File::create(out_dir.join("crc32_table.rs"))?;
+    let mut out = io::BufWriter::new(file);
+
+    writeln!(out, "pub const TABLE: [u32; 256] = [")?;
+    for entry in single.iter() {
+        writeln!(out, "    {},", entry)?;
+    }
+    writeln!(out, "];\n")?;
+
+    writeln!(out, "pub const TABLE16: [[u32; 256]; 16] = [")?;
+    for row in sliced.iter() {
+        writeln!(out, "    [")?;
+        for entry in row.iter() {
+            writeln!(out, "        {},", entry)?;
+        }
+        writeln!(out, "    ],")?;
+    }
+    writeln!(out, "];")?;
+
+    out.flush()?;
+    Ok(())
+}
+
+/// Builds the 16 lookup tables for the given polynomial.
+///
+/// Table 0 is the plain byte-wise table from `make_table`. Each following
+/// table `j` is derived from the previous one by advancing every entry by one
+/// more byte step: `(crc >> 8) ^ table0[low byte of crc]`.
+fn make_table16(poly: u32) -> [[u32; 256]; 16] {
+    let base = make_table(poly);
+    let mut tables = [[0u32; 256]; 16];
+    tables[0] = base;
+    for (i, &seed) in base.iter().enumerate() {
+        let mut crc = seed;
+        for table in tables.iter_mut().skip(1) {
+            crc = (crc >> 8) ^ base[(crc & 0xFF) as usize];
+            table[i] = crc;
+        }
+    }
+    tables
+}
+
+/// Builds the 256-entry byte-wise CRC lookup table for `poly`.
+///
+/// Entry `i` is the result of running `i` through eight shift-right steps,
+/// XORing in `poly` whenever the bit shifted out is set.
+fn make_table(poly: u32) -> [u32; 256] {
+    let mut table = [0u32; 256];
+    for (i, slot) in table.iter_mut().enumerate() {
+        *slot = (0..8).fold(i as u32, |crc, _| {
+            let shifted = crc >> 1;
+            if crc & 1 == 1 {
+                shifted ^ poly
+            } else {
+                shifted
+            }
+        });
+    }
+    table
+}
diff --git a/src/bytes.rs b/src/bytes.rs
@@ -0,0 +1,120 @@
+#![allow(warnings)]
+
+use std::convert::TryInto;
+use std::io;
+
+/// Decodes a little endian `u16` from the first two bytes of `slice`.
+///
+/// # Panics
+///
+/// Panics if `slice` is shorter than 2 bytes.
+pub fn read_u16_le(slice: &[u8]) -> u16 {
+    let mut raw = [0u8; 2];
+    raw.copy_from_slice(&slice[..2]);
+    u16::from_le_bytes(raw)
+}
+
+/// Decodes a little endian 24-bit integer from the first three bytes of
+/// `slice`. The result is returned as a `u32` whose top 8 bits are zero.
+///
+/// # Panics
+///
+/// Panics if `slice` is shorter than 3 bytes.
+pub fn read_u24_le(slice: &[u8]) -> u32 {
+    let (lo, mid, hi) = (slice[0], slice[1], slice[2]);
+    u32::from(lo) | (u32::from(mid) << 8) | (u32::from(hi) << 16)
+}
+
+/// Decodes a little endian `u32` from the first four bytes of `slice`.
+///
+/// # Panics
+///
+/// Panics if `slice` is shorter than 4 bytes.
+pub fn read_u32_le(slice: &[u8]) -> u32 {
+    let mut raw = [0u8; 4];
+    raw.copy_from_slice(&slice[..4]);
+    u32::from_le_bytes(raw)
+}
+
+/// Reads exactly four bytes from `rdr` and decodes them as a little endian
+/// `u32`.
+///
+/// # Errors
+///
+/// Returns whatever error `read_exact` reports; in particular, an unexpected
+/// EOF error when the reader yields fewer than 4 bytes.
+pub fn io_read_u32_le<R: io::Read>(mut rdr: R) -> io::Result<u32> {
+    let mut raw = [0u8; 4];
+    rdr.read_exact(&mut raw).map(|()| u32::from_le_bytes(raw))
+}
+
+/// Encodes `n` as little endian into the first two bytes of `slice`. Any
+/// bytes after the first two are left untouched.
+///
+/// # Panics
+///
+/// Panics if `slice` is shorter than 2 bytes.
+pub fn write_u16_le(n: u16, slice: &mut [u8]) {
+    assert!(slice.len() >= 2);
+    slice[..2].copy_from_slice(&n.to_le_bytes());
+}
+
+/// Write a u24 (given as a u32 where the most significant 8 bits are ignored)
+/// in little endian format to the beginning of the given slice.
+///
+/// # Panics
+///
+/// Panics if the slice has length less than 3. The length is checked before
+/// anything is written, so a too-short slice is left unmodified (matching
+/// `write_u16_le` and `write_u32_le`).
+pub fn write_u24_le(n: u32, slice: &mut [u8]) {
+    // Check up front so a short slice is never partially overwritten.
+    assert!(slice.len() >= 3);
+    slice[0] = n as u8;
+    slice[1] = (n >> 8) as u8;
+    slice[2] = (n >> 16) as u8;
+}
+
+/// Encodes `n` as little endian into the first four bytes of `slice`. Any
+/// bytes after the first four are left untouched.
+///
+/// # Panics
+///
+/// Panics if `slice` is shorter than 4 bytes.
+pub fn write_u32_le(n: u32, slice: &mut [u8]) {
+    assert!(slice.len() >= 4);
+    slice[..4].copy_from_slice(&n.to_le_bytes());
+}
+
+/// Encodes `n` as a varint at the start of `data` and returns the number of
+/// bytes written.
+///
+/// Each output byte carries 7 bits of `n`, least significant group first;
+/// the high bit is set on every byte except the last.
+///
+/// See: https://developers.google.com/protocol-buffers/docs/encoding#varints
+///
+/// # Panics
+///
+/// Panics if `data` is too short to hold the encoding.
+pub fn write_varu64(data: &mut [u8], mut n: u64) -> usize {
+    let mut written = 0;
+    loop {
+        let low = (n & 0b0111_1111) as u8;
+        n >>= 7;
+        if n == 0 {
+            // Final group: no continuation bit.
+            data[written] = low;
+            return written + 1;
+        }
+        data[written] = low | 0b1000_0000;
+        written += 1;
+    }
+}
+
+/// Decodes a varint from the start of `data`.
+///
+/// Returns `(value, bytes_read)` on success. Returns `(0, 0)` if `data` ends
+/// before a terminating byte (one without the high bit set) is found, or if
+/// the encoded value does not fit in a `u64`.
+///
+/// See: https://developers.google.com/protocol-buffers/docs/encoding#varints
+pub fn read_varu64(data: &[u8]) -> (u64, usize) {
+    let mut n: u64 = 0;
+    let mut shift: u32 = 0;
+    for (i, &b) in data.iter().enumerate() {
+        let payload = u64::from(b & 0b0111_1111);
+        // `checked_shl` only rejects shifts of 64 or more. At a shift of 63
+        // a payload above 1 would silently lose its high bits, so also
+        // verify that the shift round-trips.
+        let shifted = match payload.checked_shl(shift) {
+            Some(v) if v >> shift == payload => v,
+            _ => return (0, 0),
+        };
+        n |= shifted;
+        if b < 0b1000_0000 {
+            return (n, i + 1);
+        }
+        shift += 7;
+    }
+    (0, 0)
+}
+
+/// Loads a little endian encoded `u32` from a possibly unaligned pointer.
+///
+/// # Safety
+///
+/// `data` must point to at least 4 readable bytes.
+pub unsafe fn loadu_u32_le(data: *const u8) -> u32 {
+    // Interpret the native-endian load as little endian; this is a no-op on
+    // little endian targets and a byte swap on big endian ones.
+    u32::from_le(loadu_u32_ne(data))
+}
+
+/// Loads a native endian encoded `u32` from a possibly unaligned pointer.
+///
+/// # Safety
+///
+/// `data` must point to at least 4 readable bytes.
+pub unsafe fn loadu_u32_ne(data: *const u8) -> u32 {
+    std::ptr::read_unaligned(data as *const u32)
+}
+
+/// Loads a little endian encoded `u64` from a possibly unaligned pointer.
+///
+/// # Safety
+///
+/// `data` must point to at least 8 readable bytes.
+pub unsafe fn loadu_u64_le(data: *const u8) -> u64 {
+    // Interpret the native-endian load as little endian; this is a no-op on
+    // little endian targets and a byte swap on big endian ones.
+    u64::from_le(loadu_u64_ne(data))
+}
+
+/// Loads a native endian encoded `u64` from a possibly unaligned pointer.
+///
+/// # Safety
+///
+/// `data` must point to at least 8 readable bytes.
+pub unsafe fn loadu_u64_ne(data: *const u8) -> u64 {
+    std::ptr::read_unaligned(data as *const u64)
+}
diff --git a/src/lib.rs b/src/lib.rs
@@ -308,6 +308,7 @@ pub use crate::map::{Map, MapBuilder};
 pub use crate::set::{Set, SetBuilder};
 pub use crate::stream::{IntoStreamer, Streamer};
 
+mod bytes;
 mod error;
 #[path = "automaton/mod.rs"]
 mod inner_automaton;

diff --git a/src/raw/build.rs b/src/raw/build.rs
@@ -1,4 +1,4 @@
-use std::io::{self, Write};
+use std::io;
 
 use byteorder::{LittleEndian, WriteBytesExt};
 
@@ -219,8 +219,12 @@ impl<W: io::Write> Builder<W> {
         let root_addr = self.compile(&root_node)?;
         self.wtr.write_u64::<LittleEndian>(self.len as u64)?;
         self.wtr.write_u64::<LittleEndian>(root_addr as u64)?;
-        self.wtr.flush()?;
-        Ok(self.wtr.into_inner())
+
+        let sum = self.wtr.masked_checksum();
+        let mut wtr = self.wtr.into_inner();
+        wtr.write_u32::<LittleEndian>(sum)?;
+        wtr.flush()?;
+        Ok(wtr)
     }
 
     fn insert_output<B>(&mut self, bs: B, out: Option<Output>) -> Result<()>

diff --git a/src/raw/counting_writer.rs b/src/raw/counting_writer.rs
@@ -1,15 +1,18 @@
 use std::io;
 
-/// Wraps any writer and counts bytes written.
+use crate::raw::crc32::CheckSummer;
+
+/// Wraps any writer that counts and checksums bytes written.
 pub struct CountingWriter<W> {
     wtr: W,
     cnt: u64,
+    summer: CheckSummer,
 }
 
 impl<W: io::Write> CountingWriter<W> {
     /// Wrap the given writer with a counter.
     pub fn new(wtr: W) -> CountingWriter<W> {
-        CountingWriter { wtr, cnt: 0 }
+        CountingWriter { wtr, cnt: 0, summer: CheckSummer::new() }
     }
 
     /// Return the total number of bytes written to the underlying writer.
@@ -20,6 +23,15 @@ impl<W: io::Write> CountingWriter<W> {
         self.cnt
     }
 
+    /// Returns the masked CRC32C checksum of the bytes written so far.
+    ///
+    /// This "masked" checksum is the same one used by the Snappy frame format.
+    /// Masking is supposed to make the checksum robust with respect to data
+    /// that contains the checksum itself.
+    pub fn masked_checksum(&self) -> u32 {
+        self.summer.masked()
+    }
+
     /// Unwrap the counting writer and return the inner writer.
     pub fn into_inner(self) -> W {
         self.wtr
@@ -33,6 +45,7 @@ impl<W: io::Write> CountingWriter<W> {
 
 impl<W: io::Write> io::Write for CountingWriter<W> {
     fn write(&mut self, buf: &[u8]) -> io::Result<usize> {
+        self.summer.update(buf);
         let n = self.wtr.write(buf)?;
         self.cnt += n as u64;
         Ok(n)

diff --git a/src/raw/crc32.rs b/src/raw/crc32.rs
@@ -0,0 +1,59 @@
+use crate::bytes;
+use crate::raw::crc32_table::{TABLE, TABLE16};
+
+/// Incrementally computes a CRC32C checksum over bytes fed in via `update`.
+#[derive(Clone, Copy, Debug)]
+pub struct CheckSummer {
+    /// The checksum of every byte passed to `update` so far.
+    sum: u32,
+}
+
+impl CheckSummer {
+    /// Creates a checksummer whose running checksum starts at zero.
+    pub fn new() -> CheckSummer {
+        Self { sum: 0 }
+    }
+
+    /// Returns the "masked" form of the checksum accumulated so far.
+    ///
+    /// The mask rotates the checksum right by 15 bits and adds a fixed
+    /// constant (wrapping). This "masked" checksum is the same one used by
+    /// the Snappy frame format. Masking is supposed to make the checksum
+    /// robust with respect to data that contains the checksum itself.
+    pub fn masked(&self) -> u32 {
+        self.sum.rotate_right(15).wrapping_add(0xA282EAD8)
+    }
+
+    /// Folds `buf` into the running checksum.
+    pub fn update(&mut self, buf: &[u8]) {
+        let continued = crc32c_slice16(self.sum, buf);
+        self.sum = continued;
+    }
+}
+
+/// Returns the CRC32 checksum of `buf` using the Castagnoli polynomial,
+/// continuing from the previous checksum `prev`.
+///
+/// Passing the result of an earlier call as `prev` extends that checksum
+/// with the bytes in `buf`. `CheckSummer` starts its chain with a `prev` of
+/// zero.
+fn crc32c_slice16(prev: u32, mut buf: &[u8]) -> u32 {
+    // The working state is the bitwise inverse of the externally visible
+    // checksum; it is inverted again on return.
+    let mut crc: u32 = !prev;
+    // Fast path: consume 16 bytes per iteration using the 16 lookup tables.
+    while buf.len() >= 16 {
+        // Fold the first four input bytes (little endian) into the state;
+        // they are then looked up via `crc` in tables 12..=15 below.
+        crc ^= bytes::read_u32_le(buf);
+        // The remaining twelve bytes are looked up directly, with the last
+        // byte of the chunk using table 0 and earlier bytes using
+        // successively higher-numbered tables.
+        crc = TABLE16[0][buf[15] as usize]
+            ^ TABLE16[1][buf[14] as usize]
+            ^ TABLE16[2][buf[13] as usize]
+            ^ TABLE16[3][buf[12] as usize]
+            ^ TABLE16[4][buf[11] as usize]
+            ^ TABLE16[5][buf[10] as usize]
+            ^ TABLE16[6][buf[9] as usize]
+            ^ TABLE16[7][buf[8] as usize]
+            ^ TABLE16[8][buf[7] as usize]
+            ^ TABLE16[9][buf[6] as usize]
+            ^ TABLE16[10][buf[5] as usize]
+            ^ TABLE16[11][buf[4] as usize]
+            ^ TABLE16[12][(crc >> 24) as u8 as usize]
+            ^ TABLE16[13][(crc >> 16) as u8 as usize]
+            ^ TABLE16[14][(crc >> 8) as u8 as usize]
+            ^ TABLE16[15][(crc) as u8 as usize];
+        buf = &buf[16..];
+    }
+    // Tail: fewer than 16 bytes remain, so process them one at a time with
+    // the single byte-wise table.
+    for &b in buf {
+        crc = TABLE[((crc as u8) ^ b) as usize] ^ (crc >> 8);
+    }
+    !crc
+}
diff --git a/src/raw/crc32_table.rs b/src/raw/crc32_table.rs
@@ -0,0 +1,2 @@
+// Generated by build.rs.
+include!(concat!(env!("OUT_DIR"), "/crc32_table.rs"));
Original file line number	Diff line number	Diff line change
		@@ -0,0 +1,2 @@
		// Generated by build.rs.
		include!(concat!(env!("OUT_DIR"), "/crc32_table.rs"));