Skip to content

Commit

Permalink
feat: support both directions in MagicFinder, correctly find first CDFH
Browse files Browse the repository at this point in the history
  • Loading branch information
RisaI committed Oct 20, 2024
1 parent 0f7b326 commit 37a431b
Show file tree
Hide file tree
Showing 3 changed files with 174 additions and 57 deletions.
182 changes: 134 additions & 48 deletions src/read/magic_finder.rs
Original file line number Diff line number Diff line change
@@ -1,21 +1,115 @@
use std::io::{Read, Seek, SeekFrom};

use memchr::memmem::FinderRev;
use memchr::memmem::{Finder, FinderRev};

use crate::result::ZipResult;

pub trait FinderDirection<'a> {
fn new(needle: &'a [u8]) -> Self;
fn reset_cursor(bounds: (u64, u64), window_size: usize) -> u64;
fn scope_window(window: &[u8], mid_window_offset: usize) -> &[u8];

fn needle(&self) -> &[u8];
fn find(&self, haystack: &[u8]) -> Option<usize>;
fn move_cursor(&self, cursor: u64, bounds: (u64, u64), window_size: usize) -> Option<u64>;
fn move_scope(&self, offset: usize) -> usize;
}

pub struct Forward<'a>(Finder<'a>);
impl<'a> FinderDirection<'a> for Forward<'a> {
fn new(needle: &'a [u8]) -> Self {
Self(Finder::new(needle))
}

fn reset_cursor((start_inclusive, _): (u64, u64), _: usize) -> u64 {
start_inclusive
}

fn scope_window(window: &[u8], mid_window_offset: usize) -> &[u8] {
&window[mid_window_offset..]
}

fn find(&self, haystack: &[u8]) -> Option<usize> {
self.0.find(haystack)
}

fn needle(&self) -> &[u8] {
self.0.needle()
}

fn move_cursor(&self, cursor: u64, bounds: (u64, u64), window_size: usize) -> Option<u64> {
let magic_overlap = self.needle().len().saturating_sub(1) as u64;
let next = cursor.saturating_add(window_size as u64 - magic_overlap);

if next >= bounds.1 {
None
} else {
Some(next)
}
}

fn move_scope(&self, offset: usize) -> usize {
offset + self.needle().len()
}
}

pub struct Backwards<'a>(FinderRev<'a>);
impl<'a> FinderDirection<'a> for Backwards<'a> {
fn new(needle: &'a [u8]) -> Self {
Self(FinderRev::new(needle))
}

fn reset_cursor(bounds: (u64, u64), window_size: usize) -> u64 {
bounds
.1
.saturating_sub(window_size as u64)
.clamp(bounds.0, bounds.1)
}

fn scope_window(window: &[u8], mid_window_offset: usize) -> &[u8] {
&window[..mid_window_offset]
}

fn find(&self, haystack: &[u8]) -> Option<usize> {
self.0.rfind(haystack)
}

fn needle(&self) -> &[u8] {
self.0.needle()
}

fn move_cursor(&self, cursor: u64, bounds: (u64, u64), window_size: usize) -> Option<u64> {
let magic_overlap = self.needle().len().saturating_sub(1) as u64;

if cursor <= bounds.0 {
None
} else {
Some(
cursor
.saturating_add(magic_overlap)
.saturating_sub(window_size as u64)
.clamp(bounds.0, bounds.1),
)
}
}

fn move_scope(&self, offset: usize) -> usize {
offset
}
}

/// A utility for finding magic symbols from the end of a seekable reader.
///
/// Can be repurposed to recycle the internal buffer.
pub struct MagicFinder<'a> {
pub struct MagicFinder<Direction> {
buffer: Box<[u8]>,
pub(self) finder: FinderRev<'a>,
pub(self) finder: Direction,
cursor: u64,
mid_buffer_offset: Option<usize>,
bounds: (u64, u64),
}

impl<'a> MagicFinder<'a> {
impl<'a, T: FinderDirection<'a>> MagicFinder<T> {
/// Create a new magic bytes finder to look within specific bounds.
pub fn new(magic_bytes: &'a [u8], start_inclusive: u64, end_exclusive: u64) -> Self {
const BUFFER_SIZE: usize = 2048;
Expand All @@ -26,24 +120,19 @@ impl<'a> MagicFinder<'a> {

Self {
buffer: vec![0; BUFFER_SIZE].into_boxed_slice(),
finder: FinderRev::new(magic_bytes),
cursor: end_exclusive
.saturating_sub(BUFFER_SIZE as u64)
.clamp(start_inclusive, end_exclusive),
finder: T::new(magic_bytes),
cursor: T::reset_cursor((start_inclusive, end_exclusive), BUFFER_SIZE),
mid_buffer_offset: None,
bounds: (start_inclusive, end_exclusive),
}
}

/// Repurpose the finder for different bytes or bounds.
pub fn repurpose(&mut self, magic_bytes: &'a [u8], bounds: (u64, u64)) -> &mut Self {
debug_assert!(self.buffer.len() > magic_bytes.len());
debug_assert!(self.buffer.len() >= magic_bytes.len());

self.finder = FinderRev::new(magic_bytes);
self.cursor = bounds
.1
.saturating_sub(self.buffer.len() as u64)
.clamp(bounds.0, bounds.1);
self.finder = T::new(magic_bytes);
self.cursor = T::reset_cursor(bounds, self.buffer.len());
self.bounds = bounds;

// Reset the mid-buffer offset, to invalidate buffer content.
Expand All @@ -52,14 +141,10 @@ impl<'a> MagicFinder<'a> {
self
}

/// Find the next magic bytes from the end of the reader.
///
/// Similar in functionality to a double ended iterator, except
/// it propagates errors first and doesn't hold on to the reader
/// between items.
pub fn next_back<R: Read + Seek>(&mut self, reader: &mut R) -> ZipResult<Option<u64>> {
/// Find the next magic bytes in the direction specified in the type.
pub fn next<R: Read + Seek>(&mut self, reader: &mut R) -> ZipResult<Option<u64>> {
loop {
if self.cursor < self.bounds.0 {
if self.cursor < self.bounds.0 || self.cursor >= self.bounds.1 {
// The finder is consumed
break;
}
Expand All @@ -83,34 +168,35 @@ impl<'a> MagicFinder<'a> {
reader.read_exact(window)?;
}

let mid_buffer_offset = self.mid_buffer_offset.unwrap_or(window.len());
let window = &mut window[..mid_buffer_offset];
let window = match self.mid_buffer_offset {
Some(mid_buffer_offset) => T::scope_window(window, mid_buffer_offset),
None => window,
};

if let Some(offset) = self.finder.rfind(window) {
if let Some(offset) = self.finder.find(window) {
let magic_pos = window_start + offset as u64;
reader.seek(SeekFrom::Start(magic_pos))?;

self.mid_buffer_offset = Some(offset);
self.mid_buffer_offset = Some(self.finder.move_scope(offset));

return Ok(Some(magic_pos));
}

self.mid_buffer_offset = None;

/* We always want to make sure we go allllll the way back to the start of the file if
* we can't find it elsewhere. However, our `while` condition doesn't check that. So we
* avoid infinite looping by checking at the end of the loop. */
if window_start == self.bounds.0 {
self.bounds.0 = self.bounds.1;
break;
match self
.finder
.move_cursor(self.cursor, self.bounds, self.buffer.len())
{
Some(new_cursor) => {
self.cursor = new_cursor;
}
None => {
// Destroy the finder when we've reached the end of the bounds.
self.bounds.0 = self.bounds.1;
break;
}
}

/* Move cursor to the next chunk, cover magic at boundary by shifting by needle length. */
self.cursor = self
.cursor
.saturating_add(self.finder.needle().len() as u64 - 1)
.saturating_sub(self.buffer.len() as u64)
.clamp(self.bounds.0, self.bounds.1);
}

Ok(None)
Expand All @@ -124,8 +210,8 @@ impl<'a> MagicFinder<'a> {
///
/// The guess can be marked as mandatory to produce an error. This is useful
/// if the ArchiveOffset is known and auto-detection is not desired.
pub struct OptimisticMagicFinder<'a> {
inner: MagicFinder<'a>,
pub struct OptimisticMagicFinder<Direction> {
inner: MagicFinder<Direction>,
initial_guess: Option<(u64, bool)>,
}

Expand All @@ -134,7 +220,7 @@ pub struct OptimisticMagicFinder<'a> {
/// We only use magic bytes of size 4 at the moment.
const STACK_BUFFER_SIZE: usize = 8;

impl<'a> OptimisticMagicFinder<'a> {
impl<'a, Direction: FinderDirection<'a>> OptimisticMagicFinder<Direction> {
/// Create a new empty optimistic magic bytes finder.
pub fn new_empty() -> Self {
Self {
Expand All @@ -159,8 +245,8 @@ impl<'a> OptimisticMagicFinder<'a> {
}

/// Equivalent to `next_back`, with an optional initial guess attempted before
/// proceeding with reading from the back of the file.
pub fn next_back<R: Read + Seek>(&mut self, reader: &mut R) -> ZipResult<Option<u64>> {
/// proceeding with reading from the back of the reader.
pub fn next<R: Read + Seek>(&mut self, reader: &mut R) -> ZipResult<Option<u64>> {
if let Some((v, mandatory)) = self.initial_guess {
reader.seek(SeekFrom::Start(v))?;

Expand All @@ -172,7 +258,7 @@ impl<'a> OptimisticMagicFinder<'a> {
reader.read_exact(buffer)?;

// If a match is found, yield it.
if self.inner.finder.rfind(&buffer).is_some() {
if self.inner.finder.needle() == buffer {
self.initial_guess.take();
reader.seek(SeekFrom::Start(v))?;
return Ok(Some(v));
Expand All @@ -182,12 +268,12 @@ impl<'a> OptimisticMagicFinder<'a> {
// If a match is not found, but the initial guess was mandatory, return an error.
if mandatory {
return Ok(None);
} else {
// If the initial guess was not mandatory, remove it, as it was not found.
self.initial_guess.take();
}

// If the initial guess was not mandatory, remove it, as it was not found.
self.initial_guess.take();
}

self.inner.next_back(reader)
self.inner.next(reader)
}
}
25 changes: 16 additions & 9 deletions src/spec.rs
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
#![macro_use]

use crate::read::magic_finder::{MagicFinder, OptimisticMagicFinder};
use crate::read::magic_finder::{Backwards, Forward, MagicFinder, OptimisticMagicFinder};
use crate::read::ArchiveOffset;
use crate::result::{ZipError, ZipResult};
use core::mem;
Expand Down Expand Up @@ -350,7 +350,7 @@ impl Zip32CentralDirectoryEnd {
Ok(())
}

pub fn is_zip64(&self) -> bool {
pub fn may_be_zip64(&self) -> bool {
self.number_of_files == u16::MAX || self.central_directory_offset == u32::MAX
}
}
Expand Down Expand Up @@ -593,13 +593,13 @@ pub(crate) fn find_central_directory<R: Read + Seek>(
Magic::CENTRAL_DIRECTORY_HEADER_SIGNATURE.to_le_bytes();

// Instantiate the mandatory finder
let mut eocd_finder = MagicFinder::new(&EOCD_SIG_BYTES, 0, end_exclusive);
let mut subfinder: Option<OptimisticMagicFinder<'static>> = None;
let mut eocd_finder = MagicFinder::<Backwards<'static>>::new(&EOCD_SIG_BYTES, 0, end_exclusive);
let mut subfinder: Option<OptimisticMagicFinder<Forward<'static>>> = None;

// Keep the last errors for cases of improper EOCD instances.
let mut parsing_error = None;

while let Some(eocd_offset) = eocd_finder.next_back(reader)? {
while let Some(eocd_offset) = eocd_finder.next(reader)? {
// Attempt to parse the EOCD block
let eocd = match Zip32CentralDirectoryEnd::parse(reader) {
Ok(eocd) => eocd,
Expand All @@ -619,15 +619,15 @@ pub(crate) fn find_central_directory<R: Read + Seek>(
}

// Branch out for zip32
if !eocd.is_zip64() {
if !eocd.may_be_zip64() {
let relative_cd_offset = eocd.central_directory_offset as u64;

// If the archive is empty, there is nothing more to be checked, the archive is correct.
if eocd.number_of_files == 0 {
return Ok(CentralDirectoryEndInfo {
eocd: (eocd, eocd_offset).into(),
eocd64: None,
archive_offset: eocd_offset - relative_cd_offset,
archive_offset: eocd_offset.saturating_sub(relative_cd_offset),
});
}

Expand All @@ -654,7 +654,7 @@ pub(crate) fn find_central_directory<R: Read + Seek>(
);

// Consistency check: find the first CDFH
if let Some(cd_offset) = subfinder.next_back(reader)? {
if let Some(cd_offset) = subfinder.next(reader)? {
// The first CDFH will define the archive offset
let archive_offset = cd_offset - relative_cd_offset;

Expand All @@ -669,6 +669,13 @@ pub(crate) fn find_central_directory<R: Read + Seek>(
continue;
}

if eocd_offset < mem::size_of::<Zip64CDELocatorBlock>() as u64 {
parsing_error = Some(ZipError::InvalidArchive(
"EOCD64 Locator does not fit in file",
));
continue;
}

let locator64_offset = eocd_offset - mem::size_of::<Zip64CDELocatorBlock>() as u64;

// Consistency check: the EOCD64 locator must be present at the specific offset
Expand Down Expand Up @@ -740,7 +747,7 @@ pub(crate) fn find_central_directory<R: Read + Seek>(

// Consistency check: Find the EOCD64
let mut local_error = None;
while let Some(eocd64_offset) = subfinder.next_back(reader)? {
while let Some(eocd64_offset) = subfinder.next(reader)? {
let archive_offset = eocd64_offset - locator64.end_of_central_directory_offset;

match try_read_eocd64(
Expand Down
24 changes: 24 additions & 0 deletions tests/prepended_garbage.rs
Original file line number Diff line number Diff line change
@@ -0,0 +1,24 @@
use std::io;
use zip::ZipArchive;

#[test]
fn test_prepended_garbage() {
let mut v = vec![0, 1, 2, 3];
v.extend_from_slice(include_bytes!("../tests/data/extended_timestamp.zip"));

let mut archive = ZipArchive::new(io::Cursor::new(v)).expect("couldn't open test zip file");

assert_eq!(2, archive.len());

for file_idx in 0..archive.len() {
let file = archive.by_index(file_idx).unwrap();
let outpath = file.enclosed_name().unwrap();

println!(
"Entry {} has name \"{}\" ({} bytes)",
file_idx,
outpath.display(),
file.size()
);
}
}

0 comments on commit 37a431b

Please sign in to comment.