diff --git a/src/read.rs b/src/read.rs
index 9e0d27010..39686012d 100644
--- a/src/read.rs
+++ b/src/read.rs
@@ -8,10 +8,7 @@ use crate::crc32::Crc32Reader;
 use crate::extra_fields::{ExtendedTimestamp, ExtraField};
 use crate::read::zip_archive::{Shared, SharedBuilder};
 use crate::result::{ZipError, ZipResult};
-use crate::spec::{
-    self, FixedSizeBlock, Pod, Zip32CentralDirectoryEnd, Zip64CDELocatorBlock,
-    Zip64CentralDirectoryEnd, ZIP64_ENTRY_THR,
-};
+use crate::spec::{self, CentralDirectoryEndInfo, DataAndPosition, FixedSizeBlock, Pod};
 use crate::types::{
     AesMode, AesVendorVersion, DateTime, System, ZipCentralEntryBlock, ZipFileData,
     ZipLocalEntryBlock,
@@ -26,7 +23,6 @@ use std::mem;
 use std::mem::size_of;
 use std::ops::Deref;
 use std::path::{Path, PathBuf};
-use std::rc::Rc;
 use std::sync::{Arc, OnceLock};

 mod config;
@@ -42,6 +38,8 @@ pub(crate) mod lzma;
 #[cfg(feature = "xz")]
 pub(crate) mod xz;

+pub(crate) mod magic_finder;
+
 // Put the struct declaration in a private module to convince rustdoc to display ZipArchive nicely
 pub(crate) mod zip_archive {
     use indexmap::IndexMap;
@@ -56,6 +54,8 @@ pub(crate) mod zip_archive {
         // This isn't yet used anywhere, but it is here for use cases in the future.
         #[allow(dead_code)]
         pub(super) config: super::Config,
+        pub(crate) comment: Box<[u8]>,
+        pub(crate) zip64_comment: Option<Box<[u8]>>,
     }

     #[derive(Debug)]
@@ -69,7 +69,7 @@ pub(crate) mod zip_archive {
     }

     impl SharedBuilder {
-        pub fn build(self) -> Shared {
+        pub fn build(self, comment: Box<[u8]>, zip64_comment: Option<Box<[u8]>>) -> Shared {
             let mut index_map = IndexMap::with_capacity(self.files.len());
             self.files.into_iter().for_each(|file| {
                 index_map.insert(file.file_name.clone(), file);
@@ -79,6 +79,8 @@ pub(crate) mod zip_archive {
                 offset: self.offset,
                 dir_start: self.dir_start,
                 config: self.config,
+                comment,
+                zip64_comment,
             }
         }
     }
@@ -108,7 +110,6 @@ pub(crate) mod zip_archive {
     pub struct ZipArchive<R> {
         pub(super) reader: R,
         pub(super) shared: Arc<Shared>,
-        pub(super) comment: Arc<[u8]>,
     }
 }

@@ -360,6 +361,7 @@ fn find_data_start(
         block.file_name_length as u64 + block.extra_field_length as u64;
     let data_start =
         data.header_start + size_of::<ZipLocalEntryBlock>() as u64 + variable_fields_len;
+
     // Set the value so we don't have to read it again.
     match data.data_start.set(data_start) {
         Ok(()) => (),
@@ -369,6 +371,7 @@ fn find_data_start(
             debug_assert_eq!(*data.data_start.get().unwrap(), data_start);
         }
     }
+
     Ok(data_start)
 }

@@ -434,17 +437,62 @@ pub(crate) fn make_reader(
 pub(crate) struct CentralDirectoryInfo {
     pub(crate) archive_offset: u64,
     pub(crate) directory_start: u64,
-    pub(crate) cde_position: u64,
     pub(crate) number_of_files: usize,
     pub(crate) disk_number: u32,
     pub(crate) disk_with_central_directory: u32,
-    pub(crate) is_zip64: bool,
+}
+
+impl<'a> TryFrom<&'a CentralDirectoryEndInfo> for CentralDirectoryInfo {
+    type Error = ZipError;
+
+    fn try_from(value: &'a CentralDirectoryEndInfo) -> Result<Self, Self::Error> {
+        let (relative_cd_offset, number_of_files, disk_number, disk_with_central_directory) =
+            match &value.eocd64 {
+                Some(DataAndPosition { data: eocd64, ..
}) => { + if eocd64.number_of_files_on_this_disk > eocd64.number_of_files { + return Err(InvalidArchive( + "ZIP64 footer indicates more files on this disk than in the whole archive", + )); + } else if eocd64.version_needed_to_extract > eocd64.version_made_by { + return Err(InvalidArchive( + "ZIP64 footer indicates a new version is needed to extract this archive than the \ + version that wrote it", + )); + } + ( + eocd64.central_directory_offset, + eocd64.number_of_files as usize, + eocd64.disk_number, + eocd64.disk_with_central_directory, + ) + } + _ => ( + value.eocd.data.central_directory_offset as u64, + value.eocd.data.number_of_files_on_this_disk as usize, + value.eocd.data.disk_number as u32, + value.eocd.data.disk_with_central_directory as u32, + ), + }; + + let directory_start = relative_cd_offset + .checked_add(value.archive_offset) + .ok_or(InvalidArchive("Invalid central directory size or offset"))?; + + Ok(Self { + archive_offset: value.archive_offset, + directory_start, + number_of_files, + disk_number, + disk_with_central_directory, + }) + } } impl ZipArchive { pub(crate) fn from_finalized_writer( files: IndexMap, ZipFileData>, comment: Box<[u8]>, + zip64_comment: Option>, reader: R, central_start: u64, ) -> ZipResult { @@ -459,12 +507,10 @@ impl ZipArchive { config: Config { archive_offset: ArchiveOffset::Known(initial_offset), }, + comment, + zip64_comment, }); - Ok(Self { - reader, - shared, - comment: comment.into(), - }) + Ok(Self { reader, shared }) } /// Total size of the files in the archive, if it can be known. Doesn't include directories or @@ -549,264 +595,36 @@ impl ZipArchive { Ok(new_files) } - fn get_directory_info_zip32( - config: &Config, - reader: &mut R, - footer: &Zip32CentralDirectoryEnd, - cde_start_pos: u64, - ) -> ZipResult { - let archive_offset = match config.archive_offset { - ArchiveOffset::Known(n) => n, - ArchiveOffset::FromCentralDirectory | ArchiveOffset::Detect => { - // Some zip files have data prepended to them, resulting in the - // offsets all being too small. Get the amount of error by comparing - // the actual file position we found the CDE at with the offset - // recorded in the CDE. - let mut offset = cde_start_pos - .checked_sub(footer.central_directory_size as u64) - .and_then(|x| x.checked_sub(footer.central_directory_offset as u64)) - .ok_or(InvalidArchive("Invalid central directory size or offset"))?; - - if config.archive_offset == ArchiveOffset::Detect { - // Check whether the archive offset makes sense by peeking at the directory start. If it - // doesn't, fall back to using no archive offset. This supports zips with the central - // directory entries somewhere other than directly preceding the end of central directory. 
- reader.seek(SeekFrom::Start( - offset + footer.central_directory_offset as u64, - ))?; - let mut buf = [0; 4]; - reader.read_exact(&mut buf)?; - if spec::Magic::from_le_bytes(buf) - != spec::Magic::CENTRAL_DIRECTORY_HEADER_SIGNATURE - { - offset = 0; - } - } - - offset - } - }; - - let directory_start = footer.central_directory_offset as u64 + archive_offset; - let number_of_files = footer.number_of_files_on_this_disk as usize; - Ok(CentralDirectoryInfo { - archive_offset, - directory_start, - number_of_files, - disk_number: footer.disk_number as u32, - disk_with_central_directory: footer.disk_with_central_directory as u32, - cde_position: cde_start_pos, - is_zip64: false, - }) - } - - const fn order_lower_upper_bounds(a: u64, b: u64) -> (u64, u64) { - if a > b { - (b, a) - } else { - (a, b) - } - } + /// Get the directory start offset and number of files. This is done in a + /// separate function to ease the control flow design. + pub(crate) fn get_metadata(config: Config, reader: &mut R) -> ZipResult { + // End of the probed region, initially set to the end of the file + let file_len = reader.seek(io::SeekFrom::End(0))?; + let mut end_exclusive = file_len; - fn get_directory_info_zip64( - config: &Config, - reader: &mut R, - cde_start_pos: u64, - ) -> ZipResult>> { - // See if there's a ZIP64 footer. The ZIP64 locator if present will - // have its signature 20 bytes in front of the standard footer. The - // standard footer, in turn, is 22+N bytes large, where N is the - // comment length. Therefore: - reader.seek(SeekFrom::Start( - cde_start_pos - .checked_sub(size_of::() as u64) - .ok_or(InvalidArchive( - "No room for ZIP64 locator before central directory end", - ))?, - ))?; - let locator64 = spec::Zip64CentralDirectoryEndLocator::parse(reader)?; - - // We need to reassess `archive_offset`. We know where the ZIP64 - // central-directory-end structure *should* be, but unfortunately we - // don't know how to precisely relate that location to our current - // actual offset in the file, since there may be junk at its - // beginning. Therefore we need to perform another search, as in - // read::Zip32CentralDirectoryEnd::find_and_parse, except now we search - // forward. There may be multiple results because of Zip64 central-directory signatures in - // ZIP comment data. - - let search_upper_bound = cde_start_pos - .checked_sub( - (size_of::() - + size_of::()) as u64, - ) - .ok_or(InvalidArchive( - "File cannot contain ZIP64 central directory end", - ))?; - - let (lower, upper) = Self::order_lower_upper_bounds( - locator64.end_of_central_directory_offset, - search_upper_bound, - ); + loop { + // Find the EOCD and possibly EOCD64 entries and determine the archive offset. + let cde = spec::find_central_directory( + reader, + config.archive_offset, + end_exclusive, + file_len, + )?; - let search_results = Zip64CentralDirectoryEnd::find_and_parse(reader, lower, upper)?; - let results: Vec> = - search_results.into_iter().map(|(footer64, archive_offset)| { - let archive_offset = match config.archive_offset { - ArchiveOffset::Known(n) => n, - ArchiveOffset::FromCentralDirectory => archive_offset, - ArchiveOffset::Detect => { - archive_offset.checked_add(footer64.central_directory_offset) - .and_then(|start| { - // Check whether the archive offset makes sense by peeking at the directory start. - // - // If any errors occur or no header signature is found, fall back to no offset to see if that works. 
- reader.seek(SeekFrom::Start(start)).ok()?; - let mut buf = [0; 4]; - reader.read_exact(&mut buf).ok()?; - if spec::Magic::from_le_bytes(buf) != spec::Magic::CENTRAL_DIRECTORY_HEADER_SIGNATURE { - None - } else { - Some(archive_offset) - } - }) - .unwrap_or(0) - } - }; - let directory_start = footer64 - .central_directory_offset - .checked_add(archive_offset) - .ok_or(InvalidArchive( - "Invalid central directory size or offset", - ))?; - if directory_start > search_upper_bound { - Err(InvalidArchive( - "Invalid central directory size or offset", - )) - } else if footer64.number_of_files_on_this_disk > footer64.number_of_files { - Err(InvalidArchive( - "ZIP64 footer indicates more files on this disk than in the whole archive", - )) - } else if footer64.version_needed_to_extract > footer64.version_made_by { - Err(InvalidArchive( - "ZIP64 footer indicates a new version is needed to extract this archive than the \ - version that wrote it", - )) - } else { - Ok(CentralDirectoryInfo { - archive_offset, - directory_start, - number_of_files: footer64.number_of_files as usize, - disk_number: footer64.disk_number, - disk_with_central_directory: footer64.disk_with_central_directory, - cde_position: cde_start_pos, - is_zip64: true, - }) - } - }).collect(); - Ok(results) - } + // Turn EOCD into internal representation. + let Ok(shared) = CentralDirectoryInfo::try_from(&cde) + .and_then(|info| Self::read_central_header(info, config, reader)) + else { + // The next EOCD candidate should start before the current one. + end_exclusive = cde.eocd.position; + continue; + }; - /// Get the directory start offset and number of files. This is done in a - /// separate function to ease the control flow design. - pub(crate) fn get_metadata( - config: Config, - reader: &mut R, - ) -> ZipResult<(Zip32CentralDirectoryEnd, Shared)> { - let mut invalid_errors_32 = Vec::new(); - let mut unsupported_errors_32 = Vec::new(); - let mut invalid_errors_64 = Vec::new(); - let mut unsupported_errors_64 = Vec::new(); - let mut ok_results = Vec::new(); - let cde_locations = Zip32CentralDirectoryEnd::find_and_parse(reader)?; - cde_locations - .into_vec() - .into_iter() - .for_each(|(footer, cde_start_pos)| { - let zip32_result = - Self::get_directory_info_zip32(&config, reader, &footer, cde_start_pos); - Self::sort_result( - zip32_result, - &mut invalid_errors_32, - &mut unsupported_errors_32, - &mut ok_results, - &footer, - ); - let mut inner_results = Vec::with_capacity(1); - // Check if file has a zip64 footer - let zip64_vec_result = - Self::get_directory_info_zip64(&config, reader, cde_start_pos); - Self::sort_result( - zip64_vec_result, - &mut invalid_errors_64, - &mut unsupported_errors_64, - &mut inner_results, - &(), - ); - inner_results.into_iter().for_each(|(_, results)| { - results.into_iter().for_each(|result| { - Self::sort_result( - result, - &mut invalid_errors_64, - &mut unsupported_errors_64, - &mut ok_results, - &footer, - ); - }); - }); - }); - ok_results.sort_by_key(|(_, result)| { - ( - u64::MAX - result.cde_position, // try the last one first - !result.is_zip64, // try ZIP64 first - ) - }); - let mut best_result = None; - for (footer, result) in ok_results { - let mut inner_result = Vec::with_capacity(1); - let is_zip64 = result.is_zip64; - Self::sort_result( - Self::read_central_header(result, config, reader), - if is_zip64 { - &mut invalid_errors_64 - } else { - &mut invalid_errors_32 - }, - if is_zip64 { - &mut unsupported_errors_64 - } else { - &mut unsupported_errors_32 - }, - &mut inner_result, - &(), - 
); - if let Some((_, shared)) = inner_result.into_iter().next() { - if shared.files.len() == footer.number_of_files as usize - || (is_zip64 && footer.number_of_files == ZIP64_ENTRY_THR as u16) - { - best_result = Some((footer, shared)); - break; - } else { - if is_zip64 { - &mut invalid_errors_64 - } else { - &mut invalid_errors_32 - } - .push(InvalidArchive("wrong number of files")) - } - } + return Ok(shared.build( + cde.eocd.data.zip_file_comment, + cde.eocd64.map(|v| v.data.extensible_data_sector), + )); } - let Some((footer, shared)) = best_result else { - return Err(unsupported_errors_32 - .into_iter() - .chain(unsupported_errors_64) - .chain(invalid_errors_32) - .chain(invalid_errors_64) - .next() - .unwrap()); - }; - reader.seek(SeekFrom::Start(shared.dir_start))?; - Ok((Rc::try_unwrap(footer).unwrap(), shared.build())) } fn read_central_header( @@ -821,15 +639,22 @@ impl ZipArchive { } else { dir_info.number_of_files }; + if dir_info.disk_number != dir_info.disk_with_central_directory { return unsupported_zip_error("Support for multi-disk files is not implemented"); } + + if file_capacity.saturating_mul(size_of::()) > isize::MAX as usize { + return unsupported_zip_error("Oversized central directory"); + } + let mut files = Vec::with_capacity(file_capacity); reader.seek(SeekFrom::Start(dir_info.directory_start))?; for _ in 0..dir_info.number_of_files { - let file = central_header_to_zip_file(reader, dir_info.archive_offset)?; + let file = central_header_to_zip_file(reader, &dir_info)?; files.push(file); } + Ok(SharedBuilder { files, offset: dir_info.archive_offset, @@ -838,22 +663,6 @@ impl ZipArchive { }) } - fn sort_result( - result: ZipResult, - invalid_errors: &mut Vec, - unsupported_errors: &mut Vec, - ok_results: &mut Vec<(U, T)>, - footer: &U, - ) { - match result { - Err(ZipError::UnsupportedArchive(e)) => { - unsupported_errors.push(ZipError::UnsupportedArchive(e)) - } - Err(e) => invalid_errors.push(e), - Ok(o) => ok_results.push((footer.clone(), o)), - } - } - /// Returns the verification value and salt for the AES encryption of the file /// /// It fails if the file number is invalid. @@ -902,15 +711,12 @@ impl ZipArchive { /// /// This uses the central directory record of the ZIP file, and ignores local file headers. pub fn with_config(config: Config, mut reader: R) -> ZipResult> { - reader.seek(SeekFrom::Start(0))?; - if let Ok((footer, shared)) = Self::get_metadata(config, &mut reader) { - return Ok(ZipArchive { - reader, - shared: shared.into(), - comment: footer.zip_file_comment.into(), - }); - } - Err(InvalidArchive("No valid central directory found")) + let shared = Self::get_metadata(config, &mut reader)?; + + Ok(ZipArchive { + reader, + shared: shared.into(), + }) } /// Extract a Zip archive into a directory, overwriting files if they @@ -1050,7 +856,12 @@ impl ZipArchive { /// Get the comment of the zip archive. pub fn comment(&self) -> &[u8] { - &self.comment + &self.shared.comment + } + + /// Get the ZIP64 comment of the zip archive, if it is ZIP64. + pub fn zip64_comment(&self) -> Option<&[u8]> { + self.shared.zip64_comment.as_deref() } /// Returns an iterator over all the file and directory names in this archive. @@ -1235,21 +1046,36 @@ const fn unsupported_zip_error(detail: &'static str) -> ZipResult { /// Parse a central directory entry to collect the information for the file. 
 pub(crate) fn central_header_to_zip_file<R: Read + Seek>(
     reader: &mut R,
-    archive_offset: u64,
+    central_directory: &CentralDirectoryInfo,
 ) -> ZipResult<ZipFileData> {
     let central_header_start = reader.stream_position()?;

     // Parse central header
     let block = ZipCentralEntryBlock::parse(reader)?;
-    let file =
-        central_header_to_zip_file_inner(reader, archive_offset, central_header_start, block)?;
+
+    let file = central_header_to_zip_file_inner(
+        reader,
+        central_directory.archive_offset,
+        central_header_start,
+        block,
+    )?;
+
     let central_header_end = reader.stream_position()?;
+
+    if file.header_start >= central_directory.directory_start {
+        return Err(InvalidArchive(
+            "A local file entry can't start after the central directory",
+        ));
+    }
+
     let data_start = find_data_start(&file, reader)?;
-    if data_start > central_header_start {
+
+    if data_start > central_directory.directory_start {
         return Err(InvalidArchive(
-            "A file can't start after its central-directory header",
+            "File data can't start after the central directory",
         ));
     }
+
     reader.seek(SeekFrom::Start(central_header_end))?;
     Ok(file)
 }
diff --git a/src/read/config.rs b/src/read/config.rs
index 583b40248..b00c58a49 100644
--- a/src/read/config.rs
+++ b/src/read/config.rs
@@ -16,6 +16,7 @@ pub enum ArchiveOffset {
     #[default]
     Detect,
     /// Use the central directory length and offset to determine the start of the archive.
+    #[deprecated(since = "2.3.0", note = "use `Detect` instead")]
     FromCentralDirectory,
     /// Specify a fixed archive offset.
     Known(u64),
diff --git a/src/read/magic_finder.rs b/src/read/magic_finder.rs
new file mode 100644
index 000000000..a1b47903d
--- /dev/null
+++ b/src/read/magic_finder.rs
@@ -0,0 +1,279 @@
+use std::io::{Read, Seek, SeekFrom};
+
+use memchr::memmem::{Finder, FinderRev};
+
+use crate::result::ZipResult;
+
+pub trait FinderDirection<'a> {
+    fn new(needle: &'a [u8]) -> Self;
+    fn reset_cursor(bounds: (u64, u64), window_size: usize) -> u64;
+    fn scope_window(window: &[u8], mid_window_offset: usize) -> (&[u8], usize);
+
+    fn needle(&self) -> &[u8];
+    fn find(&self, haystack: &[u8]) -> Option<usize>;
+    fn move_cursor(&self, cursor: u64, bounds: (u64, u64), window_size: usize) -> Option<u64>;
+    fn move_scope(&self, offset: usize) -> usize;
+}
+
+pub struct Forward<'a>(Finder<'a>);
+impl<'a> FinderDirection<'a> for Forward<'a> {
+    fn new(needle: &'a [u8]) -> Self {
+        Self(Finder::new(needle))
+    }
+
+    fn reset_cursor((start_inclusive, _): (u64, u64), _: usize) -> u64 {
+        start_inclusive
+    }
+
+    fn scope_window(window: &[u8], mid_window_offset: usize) -> (&[u8], usize) {
+        (&window[mid_window_offset..], mid_window_offset)
+    }
+
+    fn find(&self, haystack: &[u8]) -> Option<usize> {
+        self.0.find(haystack)
+    }
+
+    fn needle(&self) -> &[u8] {
+        self.0.needle()
+    }
+
+    fn move_cursor(&self, cursor: u64, bounds: (u64, u64), window_size: usize) -> Option<u64> {
+        let magic_overlap = self.needle().len().saturating_sub(1) as u64;
+        let next = cursor.saturating_add(window_size as u64 - magic_overlap);
+
+        if next >= bounds.1 {
+            None
+        } else {
+            Some(next)
+        }
+    }
+
+    fn move_scope(&self, offset: usize) -> usize {
+        offset + self.needle().len()
+    }
+}
+
+pub struct Backwards<'a>(FinderRev<'a>);
+impl<'a> FinderDirection<'a> for Backwards<'a> {
+    fn new(needle: &'a [u8]) -> Self {
+        Self(FinderRev::new(needle))
+    }
+
+    fn reset_cursor(bounds: (u64, u64), window_size: usize) -> u64 {
+        bounds
+            .1
+            .saturating_sub(window_size as u64)
+            .clamp(bounds.0, bounds.1)
+    }
+
+    fn scope_window(window: &[u8], mid_window_offset: usize) -> (&[u8], usize) {
+        (&window[..mid_window_offset], 0)
+    }
+
+    fn find(&self, haystack: &[u8]) -> Option<usize> {
+        self.0.rfind(haystack)
+    }
+
+    fn needle(&self) -> &[u8] {
+        self.0.needle()
+    }
+
+    fn move_cursor(&self, cursor: u64, bounds: (u64, u64), window_size: usize) -> Option<u64> {
+        let magic_overlap = self.needle().len().saturating_sub(1) as u64;
+
+        if cursor <= bounds.0 {
+            None
+        } else {
+            Some(
+                cursor
+                    .saturating_add(magic_overlap)
+                    .saturating_sub(window_size as u64)
+                    .clamp(bounds.0, bounds.1),
+            )
+        }
+    }
+
+    fn move_scope(&self, offset: usize) -> usize {
+        offset
+    }
+}
+
+/// A utility for finding magic symbols from the end of a seekable reader.
+///
+/// Can be repurposed to recycle the internal buffer.
+pub struct MagicFinder<Direction> {
+    buffer: Box<[u8]>,
+    pub(self) finder: Direction,
+    cursor: u64,
+    mid_buffer_offset: Option<usize>,
+    bounds: (u64, u64),
+}
+
+impl<'a, T: FinderDirection<'a>> MagicFinder<T> {
+    /// Create a new magic bytes finder to look within specific bounds.
+    pub fn new(magic_bytes: &'a [u8], start_inclusive: u64, end_exclusive: u64) -> Self {
+        const BUFFER_SIZE: usize = 2048;
+
+        // Smaller buffer size would be unable to locate bytes.
+        // Equal buffer size would stall (the window could not be moved).
+        debug_assert!(BUFFER_SIZE >= magic_bytes.len());
+
+        Self {
+            buffer: vec![0; BUFFER_SIZE].into_boxed_slice(),
+            finder: T::new(magic_bytes),
+            cursor: T::reset_cursor((start_inclusive, end_exclusive), BUFFER_SIZE),
+            mid_buffer_offset: None,
+            bounds: (start_inclusive, end_exclusive),
+        }
+    }
+
+    /// Repurpose the finder for different bytes or bounds.
+    pub fn repurpose(&mut self, magic_bytes: &'a [u8], bounds: (u64, u64)) -> &mut Self {
+        debug_assert!(self.buffer.len() >= magic_bytes.len());
+
+        self.finder = T::new(magic_bytes);
+        self.cursor = T::reset_cursor(bounds, self.buffer.len());
+        self.bounds = bounds;
+
+        // Reset the mid-buffer offset, to invalidate buffer content.
+        self.mid_buffer_offset = None;
+
+        self
+    }
+
+    /// Find the next magic bytes in the direction specified in the type.
+    pub fn next<R: Read + Seek>(&mut self, reader: &mut R) -> ZipResult<Option<u64>> {
+        loop {
+            if self.cursor < self.bounds.0 || self.cursor >= self.bounds.1 {
+                // The finder is consumed
+                break;
+            }
+
+            /* Position the window and ensure correct length */
+            let window_start = self.cursor;
+            let window_end = self
+                .cursor
+                .saturating_add(self.buffer.len() as u64)
+                .min(self.bounds.1);
+
+            if window_end <= window_start {
+                // Short-circuit on zero-sized windows to prevent loop
+                break;
+            }
+
+            let window = &mut self.buffer[..(window_end - window_start) as usize];
+
+            if self.mid_buffer_offset.is_none() {
+                reader.seek(SeekFrom::Start(window_start))?;
+                reader.read_exact(window)?;
+            }
+
+            let (window, window_start_offset) = match self.mid_buffer_offset {
+                Some(mid_buffer_offset) => T::scope_window(window, mid_buffer_offset),
+                None => (&*window, 0usize),
+            };
+
+            if let Some(offset) = self.finder.find(window) {
+                let magic_pos = window_start + window_start_offset as u64 + offset as u64;
+                reader.seek(SeekFrom::Start(magic_pos))?;
+
+                self.mid_buffer_offset = Some(self.finder.move_scope(window_start_offset + offset));
+
+                return Ok(Some(magic_pos));
+            }
+
+            self.mid_buffer_offset = None;
+
+            match self
+                .finder
+                .move_cursor(self.cursor, self.bounds, self.buffer.len())
+            {
+                Some(new_cursor) => {
+                    self.cursor = new_cursor;
+                }
+                None => {
+                    // Destroy the finder when we've reached the end of the bounds.
+                    self.bounds.0 = self.bounds.1;
+                    break;
+                }
+            }
+        }
+
+        Ok(None)
+    }
+}
+
+/// A magic bytes finder with an optimistic guess that is tried before
+/// the inner finder begins searching from end. This enables much faster
+/// lookup in files without appended junk, because the magic bytes will be
+/// found directly.
+///
+/// The guess can be marked as mandatory to produce an error. This is useful
+/// if the ArchiveOffset is known and auto-detection is not desired.
+pub struct OptimisticMagicFinder<Direction> {
+    inner: MagicFinder<Direction>,
+    initial_guess: Option<(u64, bool)>,
+}
+
+/// This is a temporary restriction, to avoid heap allocation in [`Self::next_back`].
+///
+/// We only use magic bytes of size 4 at the moment.
+const STACK_BUFFER_SIZE: usize = 8;
+
+impl<'a, Direction: FinderDirection<'a>> OptimisticMagicFinder<Direction> {
+    /// Create a new empty optimistic magic bytes finder.
+    pub fn new_empty() -> Self {
+        Self {
+            inner: MagicFinder::new(&[], 0, 0),
+            initial_guess: None,
+        }
+    }
+
+    /// Repurpose the finder for different bytes, bounds and initial guesses.
+    pub fn repurpose(
+        &mut self,
+        magic_bytes: &'a [u8],
+        bounds: (u64, u64),
+        initial_guess: Option<(u64, bool)>,
+    ) -> &mut Self {
+        debug_assert!(magic_bytes.len() <= STACK_BUFFER_SIZE);
+
+        self.inner.repurpose(magic_bytes, bounds);
+        self.initial_guess = initial_guess;
+
+        self
+    }
+
+    /// Equivalent to `next_back`, with an optional initial guess attempted before
+    /// proceeding with reading from the back of the reader.
+    pub fn next<R: Read + Seek>(&mut self, reader: &mut R) -> ZipResult<Option<u64>> {
+        if let Some((v, mandatory)) = self.initial_guess {
+            reader.seek(SeekFrom::Start(v))?;
+
+            let mut buffer = [0; STACK_BUFFER_SIZE];
+            let buffer = &mut buffer[..self.inner.finder.needle().len()];
+
+            // Attempt to match only if there's enough space for the needle
+            if v.saturating_add(buffer.len() as u64) <= self.inner.bounds.1 {
+                reader.read_exact(buffer)?;
+
+                // If a match is found, yield it.
+                if self.inner.finder.needle() == buffer {
+                    self.initial_guess.take();
+                    reader.seek(SeekFrom::Start(v))?;
+                    return Ok(Some(v));
+                }
+            }
+
+            // If a match is not found, but the initial guess was mandatory, return an error.
+            if mandatory {
+                return Ok(None);
+            }
+
+            // If the initial guess was not mandatory, remove it, as it was not found.
+            self.initial_guess.take();
+        }
+
+        self.inner.next(reader)
+    }
+}
diff --git a/src/spec.rs b/src/spec.rs
index 0bd89a9a7..3d5318872 100644
--- a/src/spec.rs
+++ b/src/spec.rs
@@ -1,11 +1,11 @@
 #![macro_use]

+use crate::read::magic_finder::{Backwards, Forward, MagicFinder, OptimisticMagicFinder};
+use crate::read::ArchiveOffset;
 use crate::result::{ZipError, ZipResult};
 use core::mem;
-use memchr::memmem::FinderRev;
 use std::io;
 use std::io::prelude::*;
-use std::rc::Rc;
 use std::slice;

 /// "Magic" header values used in the zip spec to locate metadata records.
@@ -22,6 +22,7 @@ impl Magic { } #[inline(always)] + #[allow(dead_code)] pub const fn from_le_bytes(bytes: [u8; 4]) -> Self { Self(u32::from_le_bytes(bytes)) } @@ -289,7 +290,7 @@ pub(crate) struct Zip32CentralDirectoryEnd { } impl Zip32CentralDirectoryEnd { - fn block_and_comment(self) -> ZipResult<(Zip32CDEBlock, Box<[u8]>)> { + fn into_block_and_comment(self) -> (Zip32CDEBlock, Box<[u8]>) { let Self { disk_number, disk_with_central_directory, @@ -307,12 +308,10 @@ impl Zip32CentralDirectoryEnd { number_of_files, central_directory_size, central_directory_offset, - zip_file_comment_length: zip_file_comment - .len() - .try_into() - .map_err(|_| ZipError::InvalidArchive("File comment must be less than 64 KiB"))?, + zip_file_comment_length: zip_file_comment.len() as u16, }; - Ok((block, zip_file_comment)) + + (block, zip_file_comment) } pub fn parse(reader: &mut T) -> ZipResult { @@ -329,7 +328,15 @@ impl Zip32CentralDirectoryEnd { } = Zip32CDEBlock::parse(reader)?; let mut zip_file_comment = vec![0u8; zip_file_comment_length as usize].into_boxed_slice(); - reader.read_exact(&mut zip_file_comment)?; + if let Err(e) = reader.read_exact(&mut zip_file_comment) { + if e.kind() == io::ErrorKind::UnexpectedEof { + return Err(ZipError::InvalidArchive( + "EOCD comment exceeds file boundary", + )); + } + + return Err(e.into()); + } Ok(Zip32CentralDirectoryEnd { disk_number, @@ -342,99 +349,23 @@ impl Zip32CentralDirectoryEnd { }) } - #[allow(clippy::type_complexity)] - pub fn find_and_parse( - reader: &mut T, - ) -> ZipResult, u64)]>> { - let mut results = vec![]; - let file_length = reader.seek(io::SeekFrom::End(0))?; - - if file_length < mem::size_of::() as u64 { - return Err(ZipError::InvalidArchive("Invalid zip header")); - } - - // The End Of Central Directory Record should be the last thing in - // the file and so searching the last 65557 bytes of the file should - // be enough. However, not all zips are well-formed and other - // programs may consume zips with extra junk at the end without - // error, so we go back 128K to be compatible with them. 128K is - // arbitrary, but it matches what Info-Zip does. - const EOCDR_SEARCH_SIZE: u64 = 128 * 1024; - let search_lower_bound = file_length.saturating_sub(EOCDR_SEARCH_SIZE); - - const END_WINDOW_SIZE: usize = 8192; - /* TODO: use static_assertions!() */ - debug_assert!(END_WINDOW_SIZE > mem::size_of::()); - - const SIG_BYTES: [u8; mem::size_of::()] = - Magic::CENTRAL_DIRECTORY_END_SIGNATURE.to_le_bytes(); - let finder = FinderRev::new(&SIG_BYTES); - - let mut window_start: u64 = file_length.saturating_sub(END_WINDOW_SIZE as u64); - let mut window = [0u8; END_WINDOW_SIZE]; - while window_start >= search_lower_bound { - /* Go to the start of the window in the file. */ - reader.seek(io::SeekFrom::Start(window_start))?; - - /* Identify how many bytes to read (this may be less than the window size for files - * smaller than END_WINDOW_SIZE). */ - let end = (window_start + END_WINDOW_SIZE as u64).min(file_length); - let cur_len = (end - window_start) as usize; - debug_assert!(cur_len > 0); - debug_assert!(cur_len <= END_WINDOW_SIZE); - let cur_window: &mut [u8] = &mut window[..cur_len]; - /* Read the window into the bytes! */ - reader.read_exact(cur_window)?; - - /* Find instances of the magic signature. */ - for offset in finder.rfind_iter(cur_window) { - let cde_start_pos = window_start + offset as u64; - reader.seek(io::SeekFrom::Start(cde_start_pos))?; - /* Drop any headers that don't parse. 
*/ - if let Ok(cde) = Self::parse(reader) { - results.push((Rc::new(cde), cde_start_pos)); - } - } + pub fn write(self, writer: &mut T) -> ZipResult<()> { + let (block, comment) = self.into_block_and_comment(); - /* We always want to make sure we go allllll the way back to the start of the file if - * we can't find it elsewhere. However, our `while` condition doesn't check that. So we - * avoid infinite looping by checking at the end of the loop. */ - if window_start == search_lower_bound { - break; - } - /* Shift the window by END_WINDOW_SIZE bytes, but make sure to cover matches that - * overlap our nice neat window boundaries! */ - window_start = (window_start - /* NB: To catch matches across window boundaries, we need to make our blocks overlap - * by the width of the pattern to match. */ - + mem::size_of::() as u64) - /* This should never happen, but make sure we don't go past the end of the file. */ - .min(file_length); - window_start = window_start - .saturating_sub( - /* Shift the window upon each iteration so we search END_WINDOW_SIZE bytes at - * once (unless limited by file_length). */ - END_WINDOW_SIZE as u64, - ) - /* This will never go below the value of `search_lower_bound`, so we have a special - * `if window_start == search_lower_bound` check above. */ - .max(search_lower_bound); - } - if results.is_empty() { - Err(ZipError::InvalidArchive( - "Could not find central directory end", - )) - } else { - Ok(results.into_boxed_slice()) + if comment.len() > u16::MAX as usize { + return Err(ZipError::InvalidArchive( + "EOCD comment length exceeds u16::MAX", + )); } - } - pub fn write(self, writer: &mut T) -> ZipResult<()> { - let (block, comment) = self.block_and_comment()?; block.write(writer)?; writer.write_all(&comment)?; Ok(()) } + + pub fn may_be_zip64(&self) -> bool { + self.number_of_files == u16::MAX || self.central_directory_offset == u32::MAX + } } #[derive(Copy, Clone)] @@ -551,6 +482,7 @@ impl FixedSizeBlock for Zip64CDEBlock { } pub(crate) struct Zip64CentralDirectoryEnd { + pub record_size: u64, pub version_made_by: u16, pub version_needed_to_extract: u16, pub disk_number: u32, @@ -559,13 +491,13 @@ pub(crate) struct Zip64CentralDirectoryEnd { pub number_of_files: u64, pub central_directory_size: u64, pub central_directory_offset: u64, - //pub extensible_data_sector: Vec, <-- We don't do anything with this at the moment. + pub extensible_data_sector: Box<[u8]>, } impl Zip64CentralDirectoryEnd { - pub fn parse(reader: &mut T) -> ZipResult { + pub fn parse(reader: &mut T, max_size: u64) -> ZipResult { let Zip64CDEBlock { - // record_size, + record_size, version_made_by, version_needed_to_extract, disk_number, @@ -576,7 +508,20 @@ impl Zip64CentralDirectoryEnd { central_directory_offset, .. 
} = Zip64CDEBlock::parse(reader)?; + + if record_size < 44 { + return Err(ZipError::InvalidArchive("Low EOCD64 record size")); + } else if record_size.saturating_add(12) > max_size { + return Err(ZipError::InvalidArchive( + "EOCD64 extends beyond EOCD64 locator", + )); + } + + let mut zip_file_comment = vec![0u8; record_size as usize - 44].into_boxed_slice(); + reader.read_exact(&mut zip_file_comment)?; + Ok(Self { + record_size, version_made_by, version_needed_to_extract, disk_number, @@ -585,94 +530,13 @@ impl Zip64CentralDirectoryEnd { number_of_files, central_directory_size, central_directory_offset, + extensible_data_sector: zip_file_comment, }) } - pub fn find_and_parse( - reader: &mut T, - search_lower_bound: u64, - search_upper_bound: u64, - ) -> ZipResult> { - let mut results = Vec::new(); - - const END_WINDOW_SIZE: usize = 2048; - /* TODO: use static_assertions!() */ - debug_assert!(END_WINDOW_SIZE > mem::size_of::()); - - const SIG_BYTES: [u8; mem::size_of::()] = - Magic::ZIP64_CENTRAL_DIRECTORY_END_SIGNATURE.to_le_bytes(); - let finder = FinderRev::new(&SIG_BYTES); - - let mut window_start: u64 = search_upper_bound - .saturating_sub(END_WINDOW_SIZE as u64) - .max(search_lower_bound); - let mut window = [0u8; END_WINDOW_SIZE]; - while window_start >= search_lower_bound { - reader.seek(io::SeekFrom::Start(window_start))?; - - /* Identify how many bytes to read (this may be less than the window size for files - * smaller than END_WINDOW_SIZE). */ - let end = (window_start + END_WINDOW_SIZE as u64).min(search_upper_bound); - - debug_assert!(end >= window_start); - let cur_len = (end - window_start) as usize; - if cur_len == 0 { - break; - } - debug_assert!(cur_len <= END_WINDOW_SIZE); - let cur_window: &mut [u8] = &mut window[..cur_len]; - /* Read the window into the bytes! */ - reader.read_exact(cur_window)?; - - /* Find instances of the magic signature. */ - for offset in finder.rfind_iter(cur_window) { - let cde_start_pos = window_start + offset as u64; - reader.seek(io::SeekFrom::Start(cde_start_pos))?; - - debug_assert!(cde_start_pos >= search_lower_bound); - let archive_offset = cde_start_pos - search_lower_bound; - let cde = Self::parse(reader)?; - - results.push((cde, archive_offset)); - } - - /* We always want to make sure we go allllll the way back to the start of the file if - * we can't find it elsewhere. However, our `while` condition doesn't check that. So we - * avoid infinite looping by checking at the end of the loop. */ - if window_start == search_lower_bound { - break; - } - /* Shift the window by END_WINDOW_SIZE bytes, but make sure to cover matches that - * overlap our nice neat window boundaries! */ - window_start = (window_start - /* NB: To catch matches across window boundaries, we need to make our blocks overlap - * by the width of the pattern to match. */ - + mem::size_of::() as u64) - /* This may never happen, but make sure we don't go past the end of the specified - * range. */ - .min(search_upper_bound); - window_start = window_start - .saturating_sub( - /* Shift the window upon each iteration so we search END_WINDOW_SIZE bytes at - * once (unless limited by search_upper_bound). */ - END_WINDOW_SIZE as u64, - ) - /* This will never go below the value of `search_lower_bound`, so we have a special - * `if window_start == search_lower_bound` check above. 
 */
-            .max(search_lower_bound);
-        }
-
-        if results.is_empty() {
-            Err(ZipError::InvalidArchive(
-                "Could not find ZIP64 central directory end",
-            ))
-        } else {
-            Ok(results)
-        }
-    }
-
-    pub fn block(self) -> Zip64CDEBlock {
+    pub fn into_block_and_comment(self) -> (Zip64CDEBlock, Box<[u8]>) {
         let Self {
+            record_size,
             version_made_by,
             version_needed_to_extract,
             disk_number,
@@ -681,27 +545,277 @@ impl Zip64CentralDirectoryEnd {
             number_of_files,
             central_directory_size,
             central_directory_offset,
+            extensible_data_sector,
         } = self;
-        Zip64CDEBlock {
-            magic: Zip64CDEBlock::MAGIC,
-            /* currently unused */
-            record_size: 44,
-            version_made_by,
-            version_needed_to_extract,
-            disk_number,
-            disk_with_central_directory,
-            number_of_files_on_this_disk,
-            number_of_files,
-            central_directory_size,
-            central_directory_offset,
-        }
+
+        (
+            Zip64CDEBlock {
+                magic: Zip64CDEBlock::MAGIC,
+                record_size,
+                version_made_by,
+                version_needed_to_extract,
+                disk_number,
+                disk_with_central_directory,
+                number_of_files_on_this_disk,
+                number_of_files,
+                central_directory_size,
+                central_directory_offset,
+            },
+            extensible_data_sector,
+        )
     }

     pub fn write<T: Write>(self, writer: &mut T) -> ZipResult<()> {
-        self.block().write(writer)
+        let (block, comment) = self.into_block_and_comment();
+        block.write(writer)?;
+        writer.write_all(&comment)?;
+        Ok(())
     }
 }

+pub(crate) struct DataAndPosition<T> {
+    pub data: T,
+    #[allow(dead_code)]
+    pub position: u64,
+}
+
+impl<T> From<(T, u64)> for DataAndPosition<T> {
+    fn from(value: (T, u64)) -> Self {
+        Self {
+            data: value.0,
+            position: value.1,
+        }
+    }
+}
+
+pub(crate) struct CentralDirectoryEndInfo {
+    pub eocd: DataAndPosition<Zip32CentralDirectoryEnd>,
+    pub eocd64: Option<DataAndPosition<Zip64CentralDirectoryEnd>>,
+
+    pub archive_offset: u64,
+}
+
+/// Finds the EOCD and possibly the EOCD64 block and determines the archive offset.
+///
+/// In the best case scenario (no prepended junk), this function will not backtrack
+/// in the reader.
+pub(crate) fn find_central_directory<R: Read + Seek>(
+    reader: &mut R,
+    archive_offset: ArchiveOffset,
+    end_exclusive: u64,
+    file_len: u64,
+) -> ZipResult<CentralDirectoryEndInfo> {
+    const EOCD_SIG_BYTES: [u8; mem::size_of::<Magic>()] =
+        Magic::CENTRAL_DIRECTORY_END_SIGNATURE.to_le_bytes();
+
+    const EOCD64_SIG_BYTES: [u8; mem::size_of::<Magic>()] =
+        Magic::ZIP64_CENTRAL_DIRECTORY_END_SIGNATURE.to_le_bytes();
+
+    const CDFH_SIG_BYTES: [u8; mem::size_of::<Magic>()] =
+        Magic::CENTRAL_DIRECTORY_HEADER_SIGNATURE.to_le_bytes();
+
+    // Instantiate the mandatory finder
+    let mut eocd_finder = MagicFinder::<Backwards<'_>>::new(&EOCD_SIG_BYTES, 0, end_exclusive);
+    let mut subfinder: Option<OptimisticMagicFinder<Forward<'_>>> = None;
+
+    // Keep the last errors for cases of improper EOCD instances.
+    let mut parsing_error = None;
+
+    while let Some(eocd_offset) = eocd_finder.next(reader)? {
+        // Attempt to parse the EOCD block
+        let eocd = match Zip32CentralDirectoryEnd::parse(reader) {
+            Ok(eocd) => eocd,
+            Err(e) => {
+                if parsing_error.is_none() {
+                    parsing_error = Some(e);
+                }
+                continue;
+            }
+        };
+
+        // ! Relaxed (inequality) due to garbage-after-comment Python files
+        // Consistency check: the EOCD comment must terminate before the end of file
+        if eocd.zip_file_comment.len() as u64 + eocd_offset + 22 > file_len {
+            parsing_error = Some(ZipError::InvalidArchive("Invalid EOCD comment length"));
+            continue;
+        }
+
+        let zip64_metadata = if eocd.may_be_zip64() {
+            fn try_read_eocd64_locator(
+                reader: &mut (impl Read + Seek),
+                eocd_offset: u64,
+            ) -> ZipResult<(u64, Zip64CentralDirectoryEndLocator)> {
+                if eocd_offset < mem::size_of::<Zip64CDELocatorBlock>() as u64 {
+                    return Err(ZipError::InvalidArchive(
+                        "EOCD64 Locator does not fit in file",
+                    ));
+                }
+
+                let locator64_offset = eocd_offset - mem::size_of::<Zip64CDELocatorBlock>() as u64;
+
+                reader.seek(io::SeekFrom::Start(locator64_offset))?;
+                Ok((
+                    locator64_offset,
+                    Zip64CentralDirectoryEndLocator::parse(reader)?,
+                ))
+            }
+
+            try_read_eocd64_locator(reader, eocd_offset).ok()
+        } else {
+            None
+        };
+
+        let Some((locator64_offset, locator64)) = zip64_metadata else {
+            // Branch out for zip32
+            let relative_cd_offset = eocd.central_directory_offset as u64;
+
+            // If the archive is empty, there is nothing more to be checked, the archive is correct.
+            if eocd.number_of_files == 0 {
+                return Ok(CentralDirectoryEndInfo {
+                    eocd: (eocd, eocd_offset).into(),
+                    eocd64: None,
+                    archive_offset: eocd_offset.saturating_sub(relative_cd_offset),
+                });
+            }
+
+            // Consistency check: the CD relative offset cannot be after the EOCD
+            if relative_cd_offset >= eocd_offset {
+                parsing_error = Some(ZipError::InvalidArchive("Invalid CDFH offset in EOCD"));
+                continue;
+            }
+
+            // Attempt to find the first CDFH
+            let subfinder = subfinder
+                .get_or_insert_with(OptimisticMagicFinder::new_empty)
+                .repurpose(
+                    &CDFH_SIG_BYTES,
+                    // The CDFH must be before the EOCD and after the relative offset,
+                    // because prepended junk can only move it forward.
+                    (relative_cd_offset, eocd_offset),
+                    match archive_offset {
+                        ArchiveOffset::Known(n) => {
+                            Some((relative_cd_offset.saturating_add(n).min(eocd_offset), true))
+                        }
+                        _ => Some((relative_cd_offset, false)),
+                    },
+                );
+
+            // Consistency check: find the first CDFH
+            if let Some(cd_offset) = subfinder.next(reader)? {
+                // The first CDFH will define the archive offset
+                let archive_offset = cd_offset - relative_cd_offset;
+
+                return Ok(CentralDirectoryEndInfo {
+                    eocd: (eocd, eocd_offset).into(),
+                    eocd64: None,
+                    archive_offset,
+                });
+            }
+
+            parsing_error = Some(ZipError::InvalidArchive("No CDFH found"));
+            continue;
+        };
+
+        // Consistency check: the EOCD64 offset must be before EOCD64 Locator offset
+        if locator64.end_of_central_directory_offset >= locator64_offset {
+            parsing_error = Some(ZipError::InvalidArchive("Invalid EOCD64 Locator CD offset"));
+            continue;
+        }
+
+        if locator64.number_of_disks > 1 {
+            parsing_error = Some(ZipError::InvalidArchive(
+                "Multi-disk ZIP files are not supported",
+            ));
+            continue;
+        }
+
+        // This was hidden inside a function to collect errors in a single place.
+        // Once try blocks are stabilized, this can go away.
+        fn try_read_eocd64<R: Read + Seek>(
+            reader: &mut R,
+            locator64: &Zip64CentralDirectoryEndLocator,
+            expected_length: u64,
+        ) -> ZipResult<Zip64CentralDirectoryEnd> {
+            let z64 = Zip64CentralDirectoryEnd::parse(reader, expected_length)?;
+
+            // Consistency check: EOCD64 locator should agree with the EOCD64
+            if z64.disk_with_central_directory != locator64.disk_with_central_directory {
+                return Err(ZipError::InvalidArchive(
+                    "Invalid EOCD64: inconsistency with Locator data",
+                ));
+            }
+
+            // Consistency check: the EOCD64 must have the expected length
+            if z64.record_size + 12 != expected_length {
+                return Err(ZipError::InvalidArchive(
+                    "Invalid EOCD64: inconsistent length",
+                ));
+            }
+
+            Ok(z64)
+        }
+
+        // Attempt to find the EOCD64 with an initial guess
+        let subfinder = subfinder
+            .get_or_insert_with(OptimisticMagicFinder::new_empty)
+            .repurpose(
+                &EOCD64_SIG_BYTES,
+                (locator64.end_of_central_directory_offset, locator64_offset),
+                match archive_offset {
+                    ArchiveOffset::Known(n) => Some((
+                        locator64
+                            .end_of_central_directory_offset
+                            .saturating_add(n)
+                            .min(locator64_offset),
+                        true,
+                    )),
+                    _ => Some((locator64.end_of_central_directory_offset, false)),
+                },
+            );

+        // Consistency check: Find the EOCD64
+        let mut local_error = None;
+        while let Some(eocd64_offset) = subfinder.next(reader)? {
+            let archive_offset = eocd64_offset - locator64.end_of_central_directory_offset;
+
+            match try_read_eocd64(
+                reader,
+                &locator64,
+                locator64_offset.saturating_sub(eocd64_offset),
+            ) {
+                Ok(eocd64) => {
+                    if eocd64_offset
+                        < eocd64
+                            .number_of_files
+                            .saturating_mul(
+                                mem::size_of::<ZipCentralEntryBlock>() as u64
+                            )
+                            .saturating_add(eocd64.central_directory_offset)
+                    {
+                        local_error = Some(ZipError::InvalidArchive(
+                            "Invalid EOCD64: inconsistent number of files",
+                        ));
+                        continue;
+                    }
+
+                    return Ok(CentralDirectoryEndInfo {
+                        eocd: (eocd, eocd_offset).into(),
+                        eocd64: Some((eocd64, eocd64_offset).into()),
+                        archive_offset,
+                    });
+                }
+                Err(e) => {
+                    local_error = Some(e);
+                }
+            }
+        }
+
+        parsing_error = local_error.or(Some(ZipError::InvalidArchive("Could not find EOCD64")));
+    }
+
+    Err(parsing_error.unwrap_or(ZipError::InvalidArchive("Could not find EOCD")))
+}
+
 pub(crate) fn is_dir(filename: &str) -> bool {
     filename
         .chars()
diff --git a/src/write.rs b/src/write.rs
index 8d077b595..96b72a2f3 100644
--- a/src/write.rs
+++ b/src/write.rs
@@ -160,6 +160,7 @@ pub(crate) mod zip_writer {
         pub(super) writing_to_file: bool,
         pub(super) writing_raw: bool,
         pub(super) comment: Box<[u8]>,
+        pub(super) zip64_comment: Option<Box<[u8]>>,
         pub(super) flush_on_finish_file: bool,
     }

@@ -628,19 +629,19 @@ impl<A: Read + Write + Seek> ZipWriter<A> {
     /// This uses the given read configuration to initially read the archive.
     pub fn new_append_with_config(config: Config, mut readwriter: A) -> ZipResult<ZipWriter<A>> {
         readwriter.seek(SeekFrom::Start(0))?;
-        if let Ok((footer, shared)) = ZipArchive::get_metadata(config, &mut readwriter) {
-            Ok(ZipWriter {
-                inner: Storer(MaybeEncrypted::Unencrypted(readwriter)),
-                files: shared.files,
-                stats: Default::default(),
-                writing_to_file: false,
-                comment: footer.zip_file_comment,
-                writing_raw: true, // avoid recomputing the last file's header
-                flush_on_finish_file: false,
-            })
-        } else {
-            Err(InvalidArchive("No central-directory end header found"))
-        }
+
+        let shared = ZipArchive::get_metadata(config, &mut readwriter)?;
+
+        Ok(ZipWriter {
+            inner: Storer(MaybeEncrypted::Unencrypted(readwriter)),
+            files: shared.files,
+            stats: Default::default(),
+            writing_to_file: false,
+            comment: shared.comment,
+            zip64_comment: shared.zip64_comment,
+            writing_raw: true, // avoid recomputing the last file's header
+            flush_on_finish_file: false,
+        })
     }

     /// `flush_on_finish_file` is designed to support a streaming `inner` that may unload flushed
@@ -774,8 +775,11 @@ impl<A: Read + Write + Seek> ZipWriter<A> {
         let central_start = self.finalize()?;
         let inner = mem::replace(&mut self.inner, Closed).unwrap();
         let comment = mem::take(&mut self.comment);
+        let zip64_comment = mem::take(&mut self.zip64_comment);
         let files = mem::take(&mut self.files);
-        let archive = ZipArchive::from_finalized_writer(files, comment, inner, central_start)?;
+
+        let archive =
+            ZipArchive::from_finalized_writer(files, comment, zip64_comment, inner, central_start)?;
         Ok(archive)
     }
 }
@@ -794,6 +798,7 @@ impl<W: Write + Seek> ZipWriter<W> {
             writing_to_file: false,
             writing_raw: false,
             comment: Box::new([]),
+            zip64_comment: None,
             flush_on_finish_file: false,
         }
     }
@@ -832,6 +837,35 @@ impl<W: Write + Seek> ZipWriter<W> {
         &self.comment
     }

+    /// Set ZIP64 archive comment.
+    pub fn set_zip64_comment<S>(&mut self, comment: Option<S>)
+    where
+        S: Into<Box<str>>,
+    {
+        self.set_raw_zip64_comment(comment.map(|v| v.into().into_boxed_bytes()))
+    }
+
+    /// Set ZIP64 archive comment.
+    ///
+    /// This sets the raw bytes of the comment. The comment
+    /// is typically expected to be encoded in UTF-8.
+    pub fn set_raw_zip64_comment(&mut self, comment: Option<Box<[u8]>>) {
+        self.zip64_comment = comment;
+    }
+
+    /// Get ZIP64 archive comment.
+    pub fn get_zip64_comment(&mut self) -> Option<Result<&str, Utf8Error>> {
+        self.get_raw_zip64_comment().map(from_utf8)
+    }
+
+    /// Get ZIP64 archive comment.
+    ///
+    /// This returns the raw bytes of the comment. The comment
+    /// is typically expected to be encoded in UTF-8.
+    pub fn get_raw_zip64_comment(&self) -> Option<&[u8]> {
+        self.zip64_comment.as_deref()
+    }
+
     /// Set the file length and crc32 manually.
     ///
     /// # Safety
@@ -1516,11 +1550,15 @@ impl<W: Write + Seek> ZipWriter<W> {
             version_needed = version_needed.max(file.version_needed());
         }
         let central_size = writer.stream_position()?
- central_start; - - if self.files.len() > spec::ZIP64_ENTRY_THR + let is64 = self.files.len() > spec::ZIP64_ENTRY_THR || central_size.max(central_start) > spec::ZIP64_BYTES_THR - { + || self.zip64_comment.is_some(); + + if is64 { + let comment = self.zip64_comment.clone().unwrap_or_default(); + let zip64_footer = spec::Zip64CentralDirectoryEnd { + record_size: comment.len() as u64 + 44, version_made_by: version_needed, version_needed_to_extract: version_needed, disk_number: 0, @@ -1529,6 +1567,7 @@ impl ZipWriter { number_of_files: self.files.len() as u64, central_directory_size: central_size, central_directory_offset: central_start, + extensible_data_sector: comment, }; zip64_footer.write(writer)?; diff --git a/tests/prepended_garbage.rs b/tests/prepended_garbage.rs new file mode 100644 index 000000000..dad30e54a --- /dev/null +++ b/tests/prepended_garbage.rs @@ -0,0 +1,24 @@ +use std::io::Cursor; +use zip::ZipArchive; + +#[test] +fn test_prepended_garbage() { + let mut v = vec![0, 1, 2, 3]; + v.extend_from_slice(include_bytes!("../tests/data/extended_timestamp.zip")); + + let mut archive = ZipArchive::new(Cursor::new(v)).expect("couldn't open test zip file"); + + assert_eq!(2, archive.len()); + + for file_idx in 0..archive.len() { + let file = archive.by_index(file_idx).unwrap(); + let outpath = file.enclosed_name().unwrap(); + + println!( + "Entry {} has name \"{}\" ({} bytes)", + file_idx, + outpath.display(), + file.size() + ); + } +}
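
A minimal usage sketch (not part of the patch) of how the new ZIP64 comment surface is expected to round-trip through the writer and reader changes above; the file name, contents, and `SimpleFileOptions` are illustrative only:

use std::io::{Cursor, Write};
use zip::write::SimpleFileOptions;
use zip::ZipWriter;

fn zip64_comment_roundtrip() -> zip::result::ZipResult<()> {
    let mut writer = ZipWriter::new(Cursor::new(Vec::new()));
    // Per the finalize() change above, setting a ZIP64 comment forces the EOCD64 record.
    writer.set_zip64_comment(Some("stored in the EOCD64 extensible data sector"));
    writer.start_file("hello.txt", SimpleFileOptions::default())?;
    writer.write_all(b"hello")?;

    // The finished archive exposes the comment through the new accessor.
    let archive = writer.finish_into_readable()?;
    assert_eq!(
        archive.zip64_comment(),
        Some(&b"stored in the EOCD64 extensible data sector"[..])
    );
    Ok(())
}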
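With `FromCentralDirectory` deprecated, a caller that knows the exact amount of prepended data can pin the offset via `ArchiveOffset::Known`, which `find_central_directory` treats as a mandatory initial guess for the optimistic finders. A hedged sketch, assuming `Config` and `ArchiveOffset` remain re-exported from `zip::read` and that `Config` implements `Default`:

use std::io::Cursor;
use zip::read::{ArchiveOffset, Config};
use zip::ZipArchive;

fn open_with_known_junk_len(
    bytes: Vec<u8>,
    junk_len: u64,
) -> zip::result::ZipResult<ZipArchive<Cursor<Vec<u8>>>> {
    // The Known offset seeds the optimistic guess for the first CDFH / EOCD64 position,
    // so no backward search is attempted when the guess is correct.
    let mut config = Config::default();
    config.archive_offset = ArchiveOffset::Known(junk_len);
    ZipArchive::with_config(config, Cursor::new(bytes))
}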