Skip to content

Commit

Permalink
crc: add CRC32C to every FST
Browse files Browse the repository at this point in the history
This modifies the FST builder to compute the CRC32C checksum as the FST
is being constructed. It is written as the last 4 bytes of the FST.

We also add a new `verify` routine on `raw::Fst` that permits callers to
check whether the FST's integrity is intact. Since verification could be
quite expensive for very large FSTs, we do not do this by default.

We avoid a dependency on a CRC32C crate since hand-rolling it ourselves
is very simple and not much code. I tried using a SIMD version of
CRC32C, but I couldn't benchmark a difference. In particular, I suspect
that the FST writing process is doing a lot of small writes, so the
checksummer doesn't get much opportunity to checksum a lot of bytes at
once.

It's not quite clear how to fix this. We could use our own buffer
internally, but then the caller wouldn't be able to use their own
buffered writer, which is a bit weird. But not without precedent.

In any case, the overhead of checksumming during construction is
virtually negligible, so we don't sweat it for now.
  • Loading branch information
BurntSushi committed Mar 7, 2020
1 parent d2ea6f3 commit 954fa87
Show file tree
Hide file tree
Showing 10 changed files with 420 additions and 21 deletions.
124 changes: 124 additions & 0 deletions build.rs
Original file line number Diff line number Diff line change
@@ -0,0 +1,124 @@
use std::env;
use std::fs::File;
use std::io::{self, Write};
use std::path::{Path, PathBuf};

const CASTAGNOLI_POLY: u32 = 0x82f63b78;

type Result<T> = std::result::Result<T, Box<dyn std::error::Error>>;

/// Build-script entry point: runs `try_main` and panics with the error's
/// `Display` output if it fails.
fn main() {
    try_main().unwrap_or_else(|err| panic!("{}", err));
}

/// Resolves the output directory from the `OUT_DIR` environment variable and
/// writes the generated tag and CRC table sources into it.
///
/// Fails if `OUT_DIR` is not set or if either table cannot be written.
fn try_main() -> Result<()> {
    let out_dir = env::var_os("OUT_DIR")
        .map(PathBuf::from)
        .ok_or("OUT_DIR environment variable not defined")?;

    write_tag_lookup_table(&out_dir)?;
    write_crc_tables(&out_dir)
}

/// Writes `tag.rs` into `out_dir`, containing a `TAG_LOOKUP_TABLE` constant
/// with one `tag_entry` value for every possible byte.
///
/// Returns an error if the file cannot be created or written.
fn write_tag_lookup_table(out_dir: &Path) -> Result<()> {
    let out_path = out_dir.join("tag.rs");
    let mut out = io::BufWriter::new(File::create(out_path)?);

    writeln!(out, "pub const TAG_LOOKUP_TABLE: [u16; 256] = [")?;
    for b in 0u8..=255 {
        writeln!(out, " {},", tag_entry(b))?;
    }
    writeln!(out, "];")?;

    // Flush explicitly: dropping a `BufWriter` discards any error from
    // writing out its remaining buffer, which could leave a truncated file
    // behind without failing the build.
    out.flush()?;
    Ok(())
}

/// Computes the lookup-table entry for one tag byte.
///
/// The low two bits of `b` select the entry kind; the remaining six bits are
/// packed into a `u16` together with a small value stored at bit 11 and up.
/// NOTE(review): the layout looks like the Snappy element tag encoding —
/// confirm against the decoder that consumes `TAG_LOOKUP_TABLE`.
fn tag_entry(b: u8) -> u16 {
    let byte = u16::from(b);
    let upper6 = byte >> 2;
    match byte & 0b11 {
        0b00 => {
            let lit_len = upper6 + 1;
            if lit_len > 60 {
                // Values 61..=64 are stored as 1..=4 in the high bits.
                assert!(lit_len <= 64);
                (lit_len - 60) << 11
            } else {
                lit_len
            }
        }
        0b01 => {
            let len = 4 + (upper6 & 0b111);
            let offset = (byte >> 5) & 0b111;
            (1 << 11) | (offset << 8) | len
        }
        0b10 => (2 << 11) | (upper6 + 1),
        0b11 => (4 << 11) | (upper6 + 1),
        _ => unreachable!(),
    }
}

/// Writes `crc32_table.rs` into `out_dir`, containing the `TABLE` and
/// `TABLE16` constants generated for the Castagnoli polynomial.
///
/// Returns an error if the file cannot be created, written or flushed.
fn write_crc_tables(out_dir: &Path) -> Result<()> {
    let file = File::create(out_dir.join("crc32_table.rs"))?;
    let mut out = io::BufWriter::new(file);

    let single = make_table(CASTAGNOLI_POLY);
    let sixteen = make_table16(CASTAGNOLI_POLY);

    // The byte-at-a-time table.
    writeln!(out, "pub const TABLE: [u32; 256] = [")?;
    for entry in single.iter() {
        writeln!(out, " {},", entry)?;
    }
    writeln!(out, "];\n")?;

    // The sixteen tables used to process 16 bytes per step.
    writeln!(out, "pub const TABLE16: [[u32; 256]; 16] = [")?;
    for row in sixteen.iter() {
        writeln!(out, " [")?;
        for entry in row.iter() {
            writeln!(out, " {},", entry)?;
        }
        writeln!(out, " ],")?;
    }
    writeln!(out, "];")?;

    out.flush()?;
    Ok(())
}

/// Builds the sixteen 256-entry tables for `poly`.
///
/// Table 0 is the plain `make_table` output. Each later table is derived
/// from the previous one by advancing every entry through table 0 by one
/// more byte.
fn make_table16(poly: u32) -> [[u32; 256]; 16] {
    let base = make_table(poly);
    let mut tables = [[0u32; 256]; 16];
    tables[0] = base;
    for (i, &seed) in base.iter().enumerate() {
        let mut crc = seed;
        for row in tables.iter_mut().skip(1) {
            crc = (crc >> 8) ^ base[crc as u8 as usize];
            row[i] = crc;
        }
    }
    tables
}

/// Builds the 256-entry byte-wise CRC table for the polynomial `poly`.
///
/// Each entry is its index shifted right eight times, XOR-ing in `poly`
/// whenever the bit shifted out is set.
fn make_table(poly: u32) -> [u32; 256] {
    let mut table = [0u32; 256];
    for (byte, slot) in table.iter_mut().enumerate() {
        *slot = (0..8).fold(byte as u32, |crc, _| {
            let shifted = crc >> 1;
            if crc & 1 == 0 {
                shifted
            } else {
                shifted ^ poly
            }
        });
    }
    table
}
120 changes: 120 additions & 0 deletions src/bytes.rs
Original file line number Diff line number Diff line change
@@ -0,0 +1,120 @@
#![allow(warnings)]

use std::convert::TryInto;
use std::io;

/// Decodes a little endian u16 from the first two bytes of `slice`.
///
/// # Panics
///
/// Panics if `slice` is shorter than 2 bytes.
pub fn read_u16_le(slice: &[u8]) -> u16 {
    let head: [u8; 2] = slice[..2].try_into().unwrap();
    u16::from_le_bytes(head)
}

/// Decodes a little endian u24 from the first three bytes of `slice`.
///
/// The value is returned as a u32 whose most significant 8 bits are zero.
///
/// # Panics
///
/// Panics if `slice` is shorter than 3 bytes.
pub fn read_u24_le(slice: &[u8]) -> u32 {
    u32::from_le_bytes([slice[0], slice[1], slice[2], 0])
}

/// Decodes a little endian u32 from the first four bytes of `slice`.
///
/// # Panics
///
/// Panics if `slice` is shorter than 4 bytes.
pub fn read_u32_le(slice: &[u8]) -> u32 {
    let head: [u8; 4] = slice[..4].try_into().unwrap();
    u32::from_le_bytes(head)
}

/// Reads exactly four bytes from `rdr` and decodes them as a little endian
/// u32.
///
/// # Errors
///
/// Propagates any error from `read_exact`, including the unexpected-EOF
/// error produced when the reader yields fewer than 4 bytes.
pub fn io_read_u32_le<R: io::Read>(mut rdr: R) -> io::Result<u32> {
    let mut raw = [0u8; 4];
    rdr.read_exact(&mut raw).map(|()| u32::from_le_bytes(raw))
}

/// Encodes `n` as little endian into the first two bytes of `slice`.
///
/// # Panics
///
/// Panics, before writing anything, if `slice` is shorter than 2 bytes.
pub fn write_u16_le(n: u16, slice: &mut [u8]) {
    assert!(slice.len() >= 2);
    slice[..2].copy_from_slice(&n.to_le_bytes());
}

/// Write a u24 (given as a u32 where the most significant 8 bits are ignored)
/// in little endian format to the beginning of the given slice.
///
/// # Panics
///
/// Panics if the slice has length less than 3. The length is checked up
/// front, as in `write_u16_le` and `write_u32_le`, so a too-short slice is
/// left untouched rather than partially overwritten.
pub fn write_u24_le(n: u32, slice: &mut [u8]) {
    assert!(slice.len() >= 3);
    let bytes = n.to_le_bytes();
    slice[..3].copy_from_slice(&bytes[..3]);
}

/// Encodes `n` as little endian into the first four bytes of `slice`.
///
/// # Panics
///
/// Panics, before writing anything, if `slice` is shorter than 4 bytes.
pub fn write_u32_le(n: u32, slice: &mut [u8]) {
    assert!(slice.len() >= 4);
    slice[..4].copy_from_slice(&n.to_le_bytes());
}

/// Encodes `n` as a varint at the start of `data` and returns the number of
/// bytes written.
///
/// Each output byte carries seven bits of `n`, least significant group
/// first, with the high bit set on every byte except the last. See
/// https://developers.google.com/protocol-buffers/docs/encoding#varints
///
/// # Panics
///
/// Panics if `data` is too short to hold the encoding.
pub fn write_varu64(data: &mut [u8], mut n: u64) -> usize {
    let mut written = 0;
    loop {
        let low7 = (n as u8) & 0b0111_1111;
        n >>= 7;
        if n == 0 {
            // Final group: no continuation bit.
            data[written] = low7;
            return written + 1;
        }
        data[written] = low7 | 0b1000_0000;
        written += 1;
    }
}

/// Decodes a varint from the start of `data`.
///
/// Returns the decoded value and the number of bytes consumed. See
/// https://developers.google.com/protocol-buffers/docs/encoding#varints
///
/// A return value of `(0, 0)` signals failure: `data` was empty, ended
/// before a terminating byte (one without the high bit set), or encoded a
/// value that does not fit in a u64.
pub fn read_varu64(data: &[u8]) -> (u64, usize) {
    let mut n: u64 = 0;
    let mut shift: u32 = 0;
    for (i, &b) in data.iter().enumerate() {
        let payload = u64::from(b & 0b0111_1111);
        // `checked_shl` only rejects shift amounts >= 64; it does not notice
        // payload bits pushed off the top (e.g. a 10th byte greater than 1).
        // Shifting back and comparing catches that overflow too.
        let shifted = match payload.checked_shl(shift) {
            Some(v) if v >> shift == payload => v,
            _ => return (0, 0),
        };
        n |= shifted;
        if b < 0b1000_0000 {
            return (n, i + 1);
        }
        shift += 7;
    }
    (0, 0)
}

/// Loads a little endian u32 from a possibly unaligned pointer.
///
/// # Safety
///
/// `data` must point to at least 4 readable bytes.
pub unsafe fn loadu_u32_le(data: *const u8) -> u32 {
    // Interpret the native-endian load as little endian: a byte swap on
    // big endian targets and a no-op on little endian ones.
    u32::from_le(loadu_u32_ne(data))
}

/// Loads a native endian u32 from a possibly unaligned pointer.
///
/// # Safety
///
/// `data` must point to at least 4 readable bytes.
pub unsafe fn loadu_u32_ne(data: *const u8) -> u32 {
    // SAFETY: the caller guarantees 4 readable bytes at `data`, and
    // `read_unaligned` places no alignment requirement on the pointer.
    std::ptr::read_unaligned(data.cast::<u32>())
}

/// Loads a little endian u64 from a possibly unaligned pointer.
///
/// # Safety
///
/// `data` must point to at least 8 readable bytes.
pub unsafe fn loadu_u64_le(data: *const u8) -> u64 {
    // Interpret the native-endian load as little endian: a byte swap on
    // big endian targets and a no-op on little endian ones.
    u64::from_le(loadu_u64_ne(data))
}

/// Loads a native endian u64 from a possibly unaligned pointer.
///
/// # Safety
///
/// `data` must point to at least 8 readable bytes.
pub unsafe fn loadu_u64_ne(data: *const u8) -> u64 {
    // SAFETY: the caller guarantees 8 readable bytes at `data`, and
    // `read_unaligned` places no alignment requirement on the pointer.
    std::ptr::read_unaligned(data.cast::<u64>())
}
1 change: 1 addition & 0 deletions src/lib.rs
Original file line number Diff line number Diff line change
Expand Up @@ -308,6 +308,7 @@ pub use crate::map::{Map, MapBuilder};
pub use crate::set::{Set, SetBuilder};
pub use crate::stream::{IntoStreamer, Streamer};

mod bytes;
mod error;
#[path = "automaton/mod.rs"]
mod inner_automaton;
Expand Down
10 changes: 7 additions & 3 deletions src/raw/build.rs
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
use std::io::{self, Write};
use std::io;

use byteorder::{LittleEndian, WriteBytesExt};

Expand Down Expand Up @@ -219,8 +219,12 @@ impl<W: io::Write> Builder<W> {
let root_addr = self.compile(&root_node)?;
self.wtr.write_u64::<LittleEndian>(self.len as u64)?;
self.wtr.write_u64::<LittleEndian>(root_addr as u64)?;
self.wtr.flush()?;
Ok(self.wtr.into_inner())

let sum = self.wtr.masked_checksum();
let mut wtr = self.wtr.into_inner();
wtr.write_u32::<LittleEndian>(sum)?;
wtr.flush()?;
Ok(wtr)
}

fn insert_output<B>(&mut self, bs: B, out: Option<Output>) -> Result<()>
Expand Down
17 changes: 15 additions & 2 deletions src/raw/counting_writer.rs
Original file line number Diff line number Diff line change
@@ -1,15 +1,18 @@
use std::io;

/// Wraps any writer and counts bytes written.
use crate::raw::crc32::CheckSummer;

/// Wraps a writer, counting and checksumming the bytes written through it.
pub struct CountingWriter<W> {
/// The wrapped writer that receives the bytes.
wtr: W,
/// Running total of bytes the wrapped writer has reported as written.
cnt: u64,
/// Running checksum state, updated on every write.
summer: CheckSummer,
}

impl<W: io::Write> CountingWriter<W> {
/// Wrap the given writer with a counter.
pub fn new(wtr: W) -> CountingWriter<W> {
CountingWriter { wtr, cnt: 0 }
CountingWriter { wtr, cnt: 0, summer: CheckSummer::new() }
}

/// Return the total number of bytes written to the underlying writer.
Expand All @@ -20,6 +23,15 @@ impl<W: io::Write> CountingWriter<W> {
self.cnt
}

/// Reports the masked CRC32C checksum of everything written so far.
///
/// The masking matches the one used by the Snappy frame format; it is meant
/// to keep the checksum robust when the checksummed data itself contains
/// the checksum.
pub fn masked_checksum(&self) -> u32 {
    let summer = &self.summer;
    summer.masked()
}

/// Unwrap the counting writer and return the inner writer.
pub fn into_inner(self) -> W {
self.wtr
Expand All @@ -33,6 +45,7 @@ impl<W: io::Write> CountingWriter<W> {

impl<W: io::Write> io::Write for CountingWriter<W> {
fn write(&mut self, buf: &[u8]) -> io::Result<usize> {
    // Write first, then checksum only the bytes the inner writer accepted.
    // `write` may accept fewer than `buf.len()` bytes (or fail), and callers
    // such as `write_all` then retry with the remainder; checksumming all of
    // `buf` up front would count those bytes more than once.
    let n = self.wtr.write(buf)?;
    self.summer.update(&buf[..n]);
    self.cnt += n as u64;
    Ok(n)
Expand Down
59 changes: 59 additions & 0 deletions src/raw/crc32.rs
Original file line number Diff line number Diff line change
@@ -0,0 +1,59 @@
use crate::bytes;
use crate::raw::crc32_table::{TABLE, TABLE16};

/// Incrementally computes a CRC32C checksum over bytes supplied in pieces.
#[derive(Clone, Copy, Debug)]
pub struct CheckSummer {
    /// Checksum of all bytes passed to `update` so far.
    sum: u32,
}

impl CheckSummer {
    /// Creates a checksummer that has not yet seen any bytes.
    pub fn new() -> CheckSummer {
        CheckSummer { sum: 0 }
    }

    /// Returns the "masked" form of the current CRC32 (Castagnoli) checksum.
    ///
    /// This is the same masking used by the Snappy frame format: the sum is
    /// rotated right by 15 bits and a constant is added. Masking is supposed
    /// to make the checksum robust with respect to data that contains the
    /// checksum itself.
    pub fn masked(&self) -> u32 {
        self.sum.rotate_right(15).wrapping_add(0xA282EAD8)
    }

    /// Folds `buf` into the running checksum.
    pub fn update(&mut self, buf: &[u8]) {
        let running = self.sum;
        self.sum = crc32c_slice16(running, buf);
    }
}

/// Continues the CRC32 (Castagnoli polynomial) checksum `prev` over `buf`
/// and returns the updated checksum.
///
/// Full 16-byte blocks are folded in one step each using the sixteen
/// `TABLE16` tables; any trailing bytes are folded in one at a time with
/// `TABLE`.
fn crc32c_slice16(prev: u32, buf: &[u8]) -> u32 {
    // The running state is kept bit-inverted between the entry and exit
    // inversions, so a previous result can be fed straight back in.
    let mut crc: u32 = !prev;
    let mut blocks = buf.chunks_exact(16);
    for block in blocks.by_ref() {
        crc ^= bytes::read_u32_le(block);
        crc = TABLE16[0][block[15] as usize]
            ^ TABLE16[1][block[14] as usize]
            ^ TABLE16[2][block[13] as usize]
            ^ TABLE16[3][block[12] as usize]
            ^ TABLE16[4][block[11] as usize]
            ^ TABLE16[5][block[10] as usize]
            ^ TABLE16[6][block[9] as usize]
            ^ TABLE16[7][block[8] as usize]
            ^ TABLE16[8][block[7] as usize]
            ^ TABLE16[9][block[6] as usize]
            ^ TABLE16[10][block[5] as usize]
            ^ TABLE16[11][block[4] as usize]
            ^ TABLE16[12][(crc >> 24) as u8 as usize]
            ^ TABLE16[13][(crc >> 16) as u8 as usize]
            ^ TABLE16[14][(crc >> 8) as u8 as usize]
            ^ TABLE16[15][(crc) as u8 as usize];
    }
    for &b in blocks.remainder() {
        crc = (crc >> 8) ^ TABLE[((crc as u8) ^ b) as usize];
    }
    !crc
}
2 changes: 2 additions & 0 deletions src/raw/crc32_table.rs
Original file line number Diff line number Diff line change
@@ -0,0 +1,2 @@
// Generated by build.rs.
include!(concat!(env!("OUT_DIR"), "/crc32_table.rs"));
Loading

0 comments on commit 954fa87

Please sign in to comment.