Skip to content

Commit

Permalink
Use u64 offsets to be able to read 4GB+ files on 32-bit targets
Browse files Browse the repository at this point in the history
  • Loading branch information
Mingun authored and dralley committed Jun 24, 2024
1 parent d2c75ab commit d71978e
Show file tree
Hide file tree
Showing 6 changed files with 79 additions and 74 deletions.
5 changes: 5 additions & 0 deletions Changelog.md
Original file line number Diff line number Diff line change
Expand Up @@ -17,12 +17,17 @@

### Bug Fixes

- [#751]: Fix internal overflow when read 4GB+ files on 32-bit targets using `Reader<impl BufRead>` readers.

### Misc Changes

- [#760]: `Attribute::decode_and_unescape_value` and `Attribute::decode_and_unescape_value_with` now
accepts `Decoder` instead of `Reader`. Use `Reader::decoder()` to get it.
- [#760]: `Writer::write_event` now consumes event. Use `Event::borrow()` if you want to keep ownership.
- [#751]: Type of `Reader::error_position()` and `Reader::buffer_position()` changed from `usize` to `u64`.
- [#751]: Type alias `Span` changed from `Range<usize>` to `Range<u64>`.

[#751]: https://github.com/tafia/quick-xml/issues/751
[#760]: https://github.com/tafia/quick-xml/pull/760


Expand Down
28 changes: 14 additions & 14 deletions src/reader/buffered_reader.rs
Original file line number Diff line number Diff line change
Expand Up @@ -53,7 +53,7 @@ macro_rules! impl_buffered_source {
$($async)? fn read_text $(<$lf>)? (
&mut self,
buf: &'b mut Vec<u8>,
position: &mut usize,
position: &mut u64,
) -> ReadTextResult<'b, &'b mut Vec<u8>> {
let mut read = 0;
let start = buf.len();
Expand All @@ -79,7 +79,7 @@ macro_rules! impl_buffered_source {

let used = i + 1;
self $(.$reader)? .consume(used);
read += used;
read += used as u64;

*position += read;
return ReadTextResult::UpToMarkup(&buf[start..]);
Expand All @@ -89,7 +89,7 @@ macro_rules! impl_buffered_source {

let used = available.len();
self $(.$reader)? .consume(used);
read += used;
read += used as u64;
}
}
}
Expand All @@ -103,7 +103,7 @@ macro_rules! impl_buffered_source {
&mut self,
byte: u8,
buf: &'b mut Vec<u8>,
position: &mut usize,
position: &mut u64,
) -> io::Result<(&'b [u8], bool)> {
// search byte must be within the ascii range
debug_assert!(byte.is_ascii());
Expand All @@ -127,7 +127,7 @@ macro_rules! impl_buffered_source {

let used = i + 1;
self $(.$reader)? .consume(used);
read += used;
read += used as u64;

*position += read;
return Ok((&buf[start..], true));
Expand All @@ -137,7 +137,7 @@ macro_rules! impl_buffered_source {

let used = available.len();
self $(.$reader)? .consume(used);
read += used;
read += used as u64;
}
}
}
Expand All @@ -151,7 +151,7 @@ macro_rules! impl_buffered_source {
&mut self,
mut parser: P,
buf: &'b mut Vec<u8>,
position: &mut usize,
position: &mut u64,
) -> Result<&'b [u8]> {
let mut read = 0;
let start = buf.len();
Expand All @@ -171,7 +171,7 @@ macro_rules! impl_buffered_source {

// +1 for `>` which we do not include
self $(.$reader)? .consume(i + 1);
read += i + 1;
read += i as u64 + 1;

*position += read;
return Ok(&buf[start..]);
Expand All @@ -182,7 +182,7 @@ macro_rules! impl_buffered_source {

let used = available.len();
self $(.$reader)? .consume(used);
read += used;
read += used as u64;
}

*position += read;
Expand All @@ -193,7 +193,7 @@ macro_rules! impl_buffered_source {
$($async)? fn read_bang_element $(<$lf>)? (
&mut self,
buf: &'b mut Vec<u8>,
position: &mut usize,
position: &mut u64,
) -> Result<(BangType, &'b [u8])> {
// Peeked one bang ('!') before being called, so it's guaranteed to
// start with it.
Expand All @@ -216,7 +216,7 @@ macro_rules! impl_buffered_source {
buf.extend_from_slice(consumed);

self $(.$reader)? .consume(used);
read += used;
read += used as u64;

*position += read;
return Ok((bang_type, &buf[start..]));
Expand All @@ -225,7 +225,7 @@ macro_rules! impl_buffered_source {

let used = available.len();
self $(.$reader)? .consume(used);
read += used;
read += used as u64;
}
}
Err(ref e) if e.kind() == io::ErrorKind::Interrupted => continue,
Expand All @@ -241,14 +241,14 @@ macro_rules! impl_buffered_source {
}

#[inline]
$($async)? fn skip_whitespace(&mut self, position: &mut usize) -> io::Result<()> {
$($async)? fn skip_whitespace(&mut self, position: &mut u64) -> io::Result<()> {
loop {
break match self $(.$reader)? .fill_buf() $(.$await)? {
Ok(n) => {
let count = n.iter().position(|b| !is_whitespace(*b)).unwrap_or(n.len());
if count > 0 {
self $(.$reader)? .consume(count);
*position += count;
*position += count as u64;
continue;
} else {
Ok(())
Expand Down
19 changes: 10 additions & 9 deletions src/reader/mod.rs
Original file line number Diff line number Diff line change
Expand Up @@ -416,7 +416,7 @@ pub use ns_reader::NsReader;
pub use pi::PiParser;

/// Range of input in bytes, that corresponds to some piece of XML
pub type Span = Range<usize>;
pub type Span = Range<u64>;

////////////////////////////////////////////////////////////////////////////////////////////////////

Expand Down Expand Up @@ -619,7 +619,8 @@ impl<R> Reader<R> {
/// let mut buf = Vec::new();
///
/// fn into_line_and_column(reader: Reader<Cursor<&[u8]>>) -> (usize, usize) {
/// let end_pos = reader.buffer_position();
/// // We known that size cannot exceed usize::MAX because we created parser from single &[u8]
/// let end_pos = reader.buffer_position() as usize;
/// let mut cursor = reader.into_inner();
/// let s = String::from_utf8(cursor.into_inner()[0..end_pos].to_owned())
/// .expect("can't make a string");
Expand Down Expand Up @@ -667,7 +668,7 @@ impl<R> Reader<R> {
}

/// Gets the current byte position in the input data.
pub const fn buffer_position(&self) -> usize {
pub const fn buffer_position(&self) -> u64 {
// when internal state is InsideMarkup, we have actually read until '<',
// which we don't want to show
if let ParseState::InsideMarkup = self.state.state {
Expand All @@ -688,7 +689,7 @@ impl<R> Reader<R> {
/// markup element (i. e. to the `<` character).
///
/// This position is always `<= buffer_position()`.
pub const fn error_position(&self) -> usize {
pub const fn error_position(&self) -> u64 {
self.state.last_error_offset
}

Expand Down Expand Up @@ -813,7 +814,7 @@ trait XmlSource<'r, B> {
/// - `position`: Will be increased by amount of bytes consumed
///
/// [events]: crate::events::Event
fn read_text(&mut self, buf: B, position: &mut usize) -> ReadTextResult<'r, B>;
fn read_text(&mut self, buf: B, position: &mut u64) -> ReadTextResult<'r, B>;

/// Read input until `byte` is found or end of input is reached.
///
Expand Down Expand Up @@ -845,7 +846,7 @@ trait XmlSource<'r, B> {
&mut self,
byte: u8,
buf: B,
position: &mut usize,
position: &mut u64,
) -> io::Result<(&'r [u8], bool)>;

/// Read input until processing instruction is finished.
Expand All @@ -867,7 +868,7 @@ trait XmlSource<'r, B> {
/// reader which provides bytes fed into the parser.
///
/// [events]: crate::events::Event
fn read_with<P>(&mut self, parser: P, buf: B, position: &mut usize) -> Result<&'r [u8]>
fn read_with<P>(&mut self, parser: P, buf: B, position: &mut u64) -> Result<&'r [u8]>
where
P: Parser;

Expand All @@ -886,14 +887,14 @@ trait XmlSource<'r, B> {
/// - `position`: Will be increased by amount of bytes consumed
///
/// [events]: crate::events::Event
fn read_bang_element(&mut self, buf: B, position: &mut usize) -> Result<(BangType, &'r [u8])>;
fn read_bang_element(&mut self, buf: B, position: &mut u64) -> Result<(BangType, &'r [u8])>;

/// Consume and discard all the whitespace until the next non-whitespace
/// character or EOF.
///
/// # Parameters
/// - `position`: Will be increased by amount of bytes consumed
fn skip_whitespace(&mut self, position: &mut usize) -> io::Result<()>;
fn skip_whitespace(&mut self, position: &mut u64) -> io::Result<()>;

/// Return one character without consuming it, so that future `read_*` calls
/// will still include it. On EOF, return `None`.
Expand Down
37 changes: 18 additions & 19 deletions src/reader/slice_reader.rs
Original file line number Diff line number Diff line change
Expand Up @@ -229,7 +229,10 @@ impl<'a> Reader<&'a [u8]> {
let buffer = self.reader;
let span = self.read_to_end(end)?;

self.decoder().decode(&buffer[0..span.len()])
let len = span.end - span.start;
// SAFETY: `span` can only contain indexes up to usize::MAX because it
// was created from offsets from a single &[u8] slice
self.decoder().decode(&buffer[0..len as usize])
}
}

Expand Down Expand Up @@ -258,21 +261,21 @@ impl<'a> XmlSource<'a, ()> for &'a [u8] {
}

#[inline]
fn read_text(&mut self, _buf: (), position: &mut usize) -> ReadTextResult<'a, ()> {
fn read_text(&mut self, _buf: (), position: &mut u64) -> ReadTextResult<'a, ()> {
match memchr::memchr(b'<', self) {
Some(0) => {
*position += 1;
*self = &self[1..];
ReadTextResult::Markup(())
}
Some(i) => {
*position += i + 1;
*position += i as u64 + 1;
let bytes = &self[..i];
*self = &self[i + 1..];
ReadTextResult::UpToMarkup(bytes)
}
None => {
*position += self.len();
*position += self.len() as u64;
let bytes = &self[..];
*self = &[];
ReadTextResult::UpToEof(bytes)
Expand All @@ -285,70 +288,66 @@ impl<'a> XmlSource<'a, ()> for &'a [u8] {
&mut self,
byte: u8,
_buf: (),
position: &mut usize,
position: &mut u64,
) -> io::Result<(&'a [u8], bool)> {
// search byte must be within the ascii range
debug_assert!(byte.is_ascii());

if let Some(i) = memchr::memchr(byte, self) {
*position += i + 1;
*position += i as u64 + 1;
let bytes = &self[..i];
*self = &self[i + 1..];
Ok((bytes, true))
} else {
*position += self.len();
*position += self.len() as u64;
let bytes = &self[..];
*self = &[];
Ok((bytes, false))
}
}

#[inline]
fn read_with<P>(&mut self, mut parser: P, _buf: (), position: &mut usize) -> Result<&'a [u8]>
fn read_with<P>(&mut self, mut parser: P, _buf: (), position: &mut u64) -> Result<&'a [u8]>
where
P: Parser,
{
if let Some(i) = parser.feed(self) {
// +1 for `>` which we do not include
*position += i + 1;
*position += i as u64 + 1;
let bytes = &self[..i];
*self = &self[i + 1..];
return Ok(bytes);
}

*position += self.len();
*position += self.len() as u64;
Err(Error::Syntax(P::eof_error()))
}

#[inline]
fn read_bang_element(
&mut self,
_buf: (),
position: &mut usize,
) -> Result<(BangType, &'a [u8])> {
fn read_bang_element(&mut self, _buf: (), position: &mut u64) -> Result<(BangType, &'a [u8])> {
// Peeked one bang ('!') before being called, so it's guaranteed to
// start with it.
debug_assert_eq!(self[0], b'!');

let bang_type = BangType::new(self[1..].first().copied())?;

if let Some((bytes, i)) = bang_type.parse(&[], self) {
*position += i;
*position += i as u64;
*self = &self[i..];
return Ok((bang_type, bytes));
}

*position += self.len();
*position += self.len() as u64;
Err(bang_type.to_err())
}

#[inline]
fn skip_whitespace(&mut self, position: &mut usize) -> io::Result<()> {
fn skip_whitespace(&mut self, position: &mut u64) -> io::Result<()> {
let whitespaces = self
.iter()
.position(|b| !is_whitespace(*b))
.unwrap_or(self.len());
*position += whitespaces;
*position += whitespaces as u64;
*self = &self[whitespaces..];
Ok(())
}
Expand Down
Loading

0 comments on commit d71978e

Please sign in to comment.