Skip to content

Commit

Permalink
feat!: Check the hash when reading via File::at() just like git, …
Browse files Browse the repository at this point in the history
…or skip the check.

Note that indices written with `index.skipHash=true` will be vastly
faster to read by a factor of 2 or more.
  • Loading branch information
Byron committed Aug 27, 2023
1 parent 3ff5ac0 commit 61c2e34
Show file tree
Hide file tree
Showing 7 changed files with 76 additions and 15 deletions.
2 changes: 1 addition & 1 deletion gix-index/src/access/mod.rs
Original file line number Diff line number Diff line change
Expand Up @@ -344,7 +344,7 @@ mod tests {
let file = PathBuf::from("tests")
.join("fixtures")
.join(Path::new("loose_index").join("conflicting-file.git-index"));
let file = crate::File::at(file, gix_hash::Kind::Sha1, Default::default()).expect("valid file");
let file = crate::File::at(file, gix_hash::Kind::Sha1, false, Default::default()).expect("valid file");
assert_eq!(
file.entries().len(),
3,
Expand Down
2 changes: 2 additions & 0 deletions gix-index/src/extension/link.rs
Original file line number Diff line number Diff line change
Expand Up @@ -72,6 +72,7 @@ impl Link {
self,
split_index: &mut crate::File,
object_hash: gix_hash::Kind,
skip_hash: bool,
options: crate::decode::Options,
) -> Result<(), crate::file::init::Error> {
let shared_index_path = split_index
Expand All @@ -82,6 +83,7 @@ impl Link {
let mut shared_index = crate::File::at(
&shared_index_path,
object_hash,
skip_hash,
crate::decode::Options {
expected_checksum: self.shared_index_checksum.into(),
..options
Expand Down
49 changes: 43 additions & 6 deletions gix-index/src/file/init.rs
Original file line number Diff line number Diff line change
Expand Up @@ -26,16 +26,18 @@ pub use error::Error;
/// Initialization
impl File {
/// Try to open the index file at `path` with `options`, assuming `object_hash` is used throughout the file, or create a new
/// index that merely exists in memory and is empty.
/// index that merely exists in memory and is empty. `skip_hash` will increase the performance by a factor of 2, at the cost of
/// possibly not detecting corruption.
///
/// Note that the `path` will not be written if it doesn't exist.
pub fn at_or_default(
path: impl Into<PathBuf>,
object_hash: gix_hash::Kind,
skip_hash: bool,
options: decode::Options,
) -> Result<Self, Error> {
let path = path.into();
Ok(match Self::at(&path, object_hash, options) {
Ok(match Self::at(&path, object_hash, skip_hash, options) {
Ok(f) => f,
Err(Error::Io(err)) if err.kind() == std::io::ErrorKind::NotFound => {
File::from_state(State::new(object_hash), path)
Expand All @@ -44,25 +46,60 @@ impl File {
})
}

/// Open an index file at `path` with `options`, assuming `object_hash` is used throughout the file.
/// Open an index file at `path` with `options`, assuming `object_hash` is used throughout the file. If `skip_hash` is `true`,
/// we will not get or compare the checksum of the index at all, which generally increases performance of this method by a factor
/// of 2 or more.
///
/// Note that the verification of the file hash depends on `options`, and even then it's performed after the file was read and not
/// before it is read. That way, invalid files would see a more descriptive error message as we try to parse them.
pub fn at(path: impl Into<PathBuf>, object_hash: gix_hash::Kind, options: decode::Options) -> Result<Self, Error> {
pub fn at(
path: impl Into<PathBuf>,
object_hash: gix_hash::Kind,
skip_hash: bool,
options: decode::Options,
) -> Result<Self, Error> {
let _span = gix_features::trace::detail!("gix_index::File::at()");
let path = path.into();
let (data, mtime) = {
// SAFETY: we have to take the risk of somebody changing the file underneath. Git never writes into the same file.
let file = std::fs::File::open(&path)?;
// SAFETY: we have to take the risk of somebody changing the file underneath. Git never writes into the same file.
#[allow(unsafe_code)]
let data = unsafe { Mmap::map(&file)? };

if !skip_hash {
// Note that even though it's trivial to offload this into a thread, which is worth it for all but the smallest
// index files, we choose more safety here just like git does and don't even try to decode the index if the hashes
// don't match.
// Thanks to `skip_hash`, we can get performance and it's under caller control, at the cost of some safety.
let expected = gix_hash::ObjectId::from(&data[data.len() - object_hash.len_in_bytes()..]);
if !expected.is_null() {
let _span = gix_features::trace::detail!("gix::open_index::hash_index", path = ?path);
let meta = file.metadata()?;
let num_bytes_to_hash = meta.len() - object_hash.len_in_bytes() as u64;
let actual_hash = gix_features::hash::bytes(
&file,
num_bytes_to_hash as usize,
object_hash,
&mut gix_features::progress::Discard,
&Default::default(),
)?;

if actual_hash != expected {
return Err(Error::Decode(decode::Error::ChecksumMismatch {
actual_checksum: actual_hash,
expected_checksum: expected,
}));
}
}
}

(data, filetime::FileTime::from_last_modification_time(&file.metadata()?))
};

let (state, checksum) = State::from_bytes(&data, mtime, object_hash, options)?;
let mut file = File { state, path, checksum };
if let Some(mut link) = file.link.take() {
link.dissolve_into(&mut file, object_hash, options)?;
link.dissolve_into(&mut file, object_hash, skip_hash, options)?;
}

Ok(file)
Expand Down
4 changes: 3 additions & 1 deletion gix-index/tests/index/file/init.rs
Original file line number Diff line number Diff line change
Expand Up @@ -6,6 +6,7 @@ mod at_or_new {
gix_index::File::at_or_default(
Generated("v4_more_files_IEOT").to_path(),
gix_hash::Kind::Sha1,
false,
Default::default(),
)
.expect("file exists and can be opened");
Expand All @@ -16,6 +17,7 @@ mod at_or_new {
let index = gix_index::File::at_or_default(
"__definitely no file that exists ever__",
gix_hash::Kind::Sha1,
false,
Default::default(),
)
.expect("file is defaulting to a new one");
Expand Down Expand Up @@ -46,7 +48,7 @@ mod from_state {
let new_index_path = tmp.path().join(fixture.to_name());
assert!(!new_index_path.exists());

let index = gix_index::File::at(fixture.to_path(), gix_hash::Kind::Sha1, Default::default())?;
let index = gix_index::File::at(fixture.to_path(), gix_hash::Kind::Sha1, false, Default::default())?;
let mut index = gix_index::File::from_state(index.into(), new_index_path.clone());
assert!(index.checksum().is_none());
assert_eq!(index.path(), new_index_path);
Expand Down
18 changes: 14 additions & 4 deletions gix-index/tests/index/file/read.rs
Original file line number Diff line number Diff line change
Expand Up @@ -19,13 +19,14 @@ fn verify(index: gix_index::File) -> gix_index::File {

pub(crate) fn loose_file(name: &str) -> gix_index::File {
let path = loose_file_path(name);
let file = gix_index::File::at(path, gix_hash::Kind::Sha1, Default::default()).unwrap();
let file = gix_index::File::at(path, gix_hash::Kind::Sha1, false, Default::default()).unwrap();
verify(file)
}
pub(crate) fn try_file(name: &str) -> Result<gix_index::File, gix_index::file::init::Error> {
let file = gix_index::File::at(
crate::fixture_index_path(name),
gix_hash::Kind::Sha1,
false,
Default::default(),
)?;
Ok(verify(file))
Expand All @@ -34,7 +35,7 @@ pub(crate) fn file(name: &str) -> gix_index::File {
try_file(name).unwrap()
}
fn file_opt(name: &str, opts: gix_index::decode::Options) -> gix_index::File {
let file = gix_index::File::at(crate::fixture_index_path(name), gix_hash::Kind::Sha1, opts).unwrap();
let file = gix_index::File::at(crate::fixture_index_path(name), gix_hash::Kind::Sha1, false, opts).unwrap();
verify(file)
}

Expand Down Expand Up @@ -140,6 +141,7 @@ fn split_index_without_any_extension() {
let file = gix_index::File::at(
find_shared_index_for(crate::fixture_index_path("v2_split_index")),
gix_hash::Kind::Sha1,
false,
Default::default(),
)
.unwrap();
Expand Down Expand Up @@ -337,8 +339,15 @@ fn split_index_and_regular_index_of_same_content_are_indeed_the_same() {
)
.unwrap();

let split =
verify(gix_index::File::at(base.join("split/.git/index"), gix_hash::Kind::Sha1, Default::default()).unwrap());
let split = verify(
gix_index::File::at(
base.join("split/.git/index"),
gix_hash::Kind::Sha1,
false,
Default::default(),
)
.unwrap(),
);

assert!(
split.link().is_none(),
Expand All @@ -349,6 +358,7 @@ fn split_index_and_regular_index_of_same_content_are_indeed_the_same() {
gix_index::File::at(
base.join("regular/.git/index"),
gix_hash::Kind::Sha1,
false,
Default::default(),
)
.unwrap(),
Expand Down
14 changes: 12 additions & 2 deletions gix-index/tests/index/file/write.rs
Original file line number Diff line number Diff line change
Expand Up @@ -50,7 +50,12 @@ fn skip_hash() -> crate::Result {
skip_hash: false,
})?;

let actual = gix_index::File::at(&path, expected.checksum().expect("present").kind(), Default::default())?;
let actual = gix_index::File::at(
&path,
expected.checksum().expect("present").kind(),
false,
Default::default(),
)?;
assert_eq!(
actual.checksum(),
expected.checksum(),
Expand All @@ -62,7 +67,12 @@ fn skip_hash() -> crate::Result {
skip_hash: true,
})?;

let actual = gix_index::File::at(&path, expected.checksum().expect("present").kind(), Default::default())?;
let actual = gix_index::File::at(
&path,
expected.checksum().expect("present").kind(),
false,
Default::default(),
)?;
assert_eq!(actual.checksum(), None, "no hash is produced in this case");

Ok(())
Expand Down
2 changes: 1 addition & 1 deletion gix-index/tests/index/mod.rs
Original file line number Diff line number Diff line change
Expand Up @@ -50,7 +50,7 @@ impl Fixture {
}

pub fn open(&self) -> gix_index::File {
gix_index::File::at(self.to_path(), gix_hash::Kind::Sha1, Default::default())
gix_index::File::at(self.to_path(), gix_hash::Kind::Sha1, false, Default::default())
.expect("fixtures are always readable")
}
}

0 comments on commit 61c2e34

Please sign in to comment.