Use u64 offsets to be able to read 4GB+ files on 32-bit targets

tafia · Jun 24, 2024 · d71978e · d71978e
1 parent d2c75ab
commit d71978e
Show file tree

Hide file tree

Showing 6 changed files with 79 additions and 74 deletions.
diff --git a/Changelog.md b/Changelog.md
@@ -17,12 +17,17 @@
 
 ### Bug Fixes
 
+- [#751]: Fix internal overflow when read 4GB+ files on 32-bit targets using `Reader<impl BufRead>` readers.
+
 ### Misc Changes
 
 - [#760]: `Attribute::decode_and_unescape_value` and `Attribute::decode_and_unescape_value_with` now
   accepts `Decoder` instead of `Reader`. Use `Reader::decoder()` to get it.
 - [#760]: `Writer::write_event` now consumes event. Use `Event::borrow()` if you want to keep ownership.
+- [#751]: Type of `Reader::error_position()` and `Reader::buffer_position()` changed from `usize` to `u64`.
+- [#751]: Type alias `Span` changed from `Range<usize>` to `Range<u64>`.
 
+[#751]: https://github.com/tafia/quick-xml/issues/751
 [#760]: https://github.com/tafia/quick-xml/pull/760
 
 

diff --git a/src/reader/buffered_reader.rs b/src/reader/buffered_reader.rs
@@ -53,7 +53,7 @@ macro_rules! impl_buffered_source {
         $($async)? fn read_text $(<$lf>)? (
             &mut self,
             buf: &'b mut Vec<u8>,
-            position: &mut usize,
+            position: &mut u64,
         ) -> ReadTextResult<'b, &'b mut Vec<u8>> {
             let mut read = 0;
             let start = buf.len();
@@ -79,7 +79,7 @@ macro_rules! impl_buffered_source {
 
                         let used = i + 1;
                         self $(.$reader)? .consume(used);
-                        read += used;
+                        read += used as u64;
 
                         *position += read;
                         return ReadTextResult::UpToMarkup(&buf[start..]);
@@ -89,7 +89,7 @@ macro_rules! impl_buffered_source {
 
                         let used = available.len();
                         self $(.$reader)? .consume(used);
-                        read += used;
+                        read += used as u64;
                     }
                 }
             }
@@ -103,7 +103,7 @@ macro_rules! impl_buffered_source {
             &mut self,
             byte: u8,
             buf: &'b mut Vec<u8>,
-            position: &mut usize,
+            position: &mut u64,
         ) -> io::Result<(&'b [u8], bool)> {
             // search byte must be within the ascii range
             debug_assert!(byte.is_ascii());
@@ -127,7 +127,7 @@ macro_rules! impl_buffered_source {
 
                         let used = i + 1;
                         self $(.$reader)? .consume(used);
-                        read += used;
+                        read += used as u64;
 
                         *position += read;
                         return Ok((&buf[start..], true));
@@ -137,7 +137,7 @@ macro_rules! impl_buffered_source {
 
                         let used = available.len();
                         self $(.$reader)? .consume(used);
-                        read += used;
+                        read += used as u64;
                     }
                 }
             }
@@ -151,7 +151,7 @@ macro_rules! impl_buffered_source {
             &mut self,
             mut parser: P,
             buf: &'b mut Vec<u8>,
-            position: &mut usize,
+            position: &mut u64,
         ) -> Result<&'b [u8]> {
             let mut read = 0;
             let start = buf.len();
@@ -171,7 +171,7 @@ macro_rules! impl_buffered_source {
 
                     // +1 for `>` which we do not include
                     self $(.$reader)? .consume(i + 1);
-                    read += i + 1;
+                    read += i as u64 + 1;
 
                     *position += read;
                     return Ok(&buf[start..]);
@@ -182,7 +182,7 @@ macro_rules! impl_buffered_source {
 
                 let used = available.len();
                 self $(.$reader)? .consume(used);
-                read += used;
+                read += used as u64;
             }
 
             *position += read;
@@ -193,7 +193,7 @@ macro_rules! impl_buffered_source {
         $($async)? fn read_bang_element $(<$lf>)? (
             &mut self,
             buf: &'b mut Vec<u8>,
-            position: &mut usize,
+            position: &mut u64,
         ) -> Result<(BangType, &'b [u8])> {
             // Peeked one bang ('!') before being called, so it's guaranteed to
             // start with it.
@@ -216,7 +216,7 @@ macro_rules! impl_buffered_source {
                             buf.extend_from_slice(consumed);
 
                             self $(.$reader)? .consume(used);
-                            read += used;
+                            read += used as u64;
 
                             *position += read;
                             return Ok((bang_type, &buf[start..]));
@@ -225,7 +225,7 @@ macro_rules! impl_buffered_source {
 
                             let used = available.len();
                             self $(.$reader)? .consume(used);
-                            read += used;
+                            read += used as u64;
                         }
                     }
                     Err(ref e) if e.kind() == io::ErrorKind::Interrupted => continue,
@@ -241,14 +241,14 @@ macro_rules! impl_buffered_source {
         }
 
         #[inline]
-        $($async)? fn skip_whitespace(&mut self, position: &mut usize) -> io::Result<()> {
+        $($async)? fn skip_whitespace(&mut self, position: &mut u64) -> io::Result<()> {
             loop {
                 break match self $(.$reader)? .fill_buf() $(.$await)? {
                     Ok(n) => {
                         let count = n.iter().position(|b| !is_whitespace(*b)).unwrap_or(n.len());
                         if count > 0 {
                             self $(.$reader)? .consume(count);
-                            *position += count;
+                            *position += count as u64;
                             continue;
                         } else {
                             Ok(())

diff --git a/src/reader/mod.rs b/src/reader/mod.rs
@@ -416,7 +416,7 @@ pub use ns_reader::NsReader;
 pub use pi::PiParser;
 
 /// Range of input in bytes, that corresponds to some piece of XML
-pub type Span = Range<usize>;
+pub type Span = Range<u64>;
 
 ////////////////////////////////////////////////////////////////////////////////////////////////////
 
@@ -619,7 +619,8 @@ impl<R> Reader<R> {
     /// let mut buf = Vec::new();
     ///
     /// fn into_line_and_column(reader: Reader<Cursor<&[u8]>>) -> (usize, usize) {
-    ///     let end_pos = reader.buffer_position();
+    ///     // We known that size cannot exceed usize::MAX because we created parser from single &[u8]
+    ///     let end_pos = reader.buffer_position() as usize;
     ///     let mut cursor = reader.into_inner();
     ///     let s = String::from_utf8(cursor.into_inner()[0..end_pos].to_owned())
     ///         .expect("can't make a string");
@@ -667,7 +668,7 @@ impl<R> Reader<R> {
     }
 
     /// Gets the current byte position in the input data.
-    pub const fn buffer_position(&self) -> usize {
+    pub const fn buffer_position(&self) -> u64 {
         // when internal state is InsideMarkup, we have actually read until '<',
         // which we don't want to show
         if let ParseState::InsideMarkup = self.state.state {
@@ -688,7 +689,7 @@ impl<R> Reader<R> {
     /// markup element (i. e. to the `<` character).
     ///
     /// This position is always `<= buffer_position()`.
-    pub const fn error_position(&self) -> usize {
+    pub const fn error_position(&self) -> u64 {
         self.state.last_error_offset
     }
 
@@ -813,7 +814,7 @@ trait XmlSource<'r, B> {
     /// - `position`: Will be increased by amount of bytes consumed
     ///
     /// [events]: crate::events::Event
-    fn read_text(&mut self, buf: B, position: &mut usize) -> ReadTextResult<'r, B>;
+    fn read_text(&mut self, buf: B, position: &mut u64) -> ReadTextResult<'r, B>;
 
     /// Read input until `byte` is found or end of input is reached.
     ///
@@ -845,7 +846,7 @@ trait XmlSource<'r, B> {
         &mut self,
         byte: u8,
         buf: B,
-        position: &mut usize,
+        position: &mut u64,
     ) -> io::Result<(&'r [u8], bool)>;
 
     /// Read input until processing instruction is finished.
@@ -867,7 +868,7 @@ trait XmlSource<'r, B> {
     /// reader which provides bytes fed into the parser.
     ///
     /// [events]: crate::events::Event
-    fn read_with<P>(&mut self, parser: P, buf: B, position: &mut usize) -> Result<&'r [u8]>
+    fn read_with<P>(&mut self, parser: P, buf: B, position: &mut u64) -> Result<&'r [u8]>
     where
         P: Parser;
 
@@ -886,14 +887,14 @@ trait XmlSource<'r, B> {
     /// - `position`: Will be increased by amount of bytes consumed
     ///
     /// [events]: crate::events::Event
-    fn read_bang_element(&mut self, buf: B, position: &mut usize) -> Result<(BangType, &'r [u8])>;
+    fn read_bang_element(&mut self, buf: B, position: &mut u64) -> Result<(BangType, &'r [u8])>;
 
     /// Consume and discard all the whitespace until the next non-whitespace
     /// character or EOF.
     ///
     /// # Parameters
     /// - `position`: Will be increased by amount of bytes consumed
-    fn skip_whitespace(&mut self, position: &mut usize) -> io::Result<()>;
+    fn skip_whitespace(&mut self, position: &mut u64) -> io::Result<()>;
 
     /// Return one character without consuming it, so that future `read_*` calls
     /// will still include it. On EOF, return `None`.

diff --git a/src/reader/slice_reader.rs b/src/reader/slice_reader.rs
@@ -229,7 +229,10 @@ impl<'a> Reader<&'a [u8]> {
         let buffer = self.reader;
         let span = self.read_to_end(end)?;
 
-        self.decoder().decode(&buffer[0..span.len()])
+        let len = span.end - span.start;
+        // SAFETY: `span` can only contain indexes up to usize::MAX because it
+        // was created from offsets from a single &[u8] slice
+        self.decoder().decode(&buffer[0..len as usize])
     }
 }
 
@@ -258,21 +261,21 @@ impl<'a> XmlSource<'a, ()> for &'a [u8] {
     }
 
     #[inline]
-    fn read_text(&mut self, _buf: (), position: &mut usize) -> ReadTextResult<'a, ()> {
+    fn read_text(&mut self, _buf: (), position: &mut u64) -> ReadTextResult<'a, ()> {
         match memchr::memchr(b'<', self) {
             Some(0) => {
                 *position += 1;
                 *self = &self[1..];
                 ReadTextResult::Markup(())
             }
             Some(i) => {
-                *position += i + 1;
+                *position += i as u64 + 1;
                 let bytes = &self[..i];
                 *self = &self[i + 1..];
                 ReadTextResult::UpToMarkup(bytes)
             }
             None => {
-                *position += self.len();
+                *position += self.len() as u64;
                 let bytes = &self[..];
                 *self = &[];
                 ReadTextResult::UpToEof(bytes)
@@ -285,70 +288,66 @@ impl<'a> XmlSource<'a, ()> for &'a [u8] {
         &mut self,
         byte: u8,
         _buf: (),
-        position: &mut usize,
+        position: &mut u64,
     ) -> io::Result<(&'a [u8], bool)> {
         // search byte must be within the ascii range
         debug_assert!(byte.is_ascii());
 
         if let Some(i) = memchr::memchr(byte, self) {
-            *position += i + 1;
+            *position += i as u64 + 1;
             let bytes = &self[..i];
             *self = &self[i + 1..];
             Ok((bytes, true))
         } else {
-            *position += self.len();
+            *position += self.len() as u64;
             let bytes = &self[..];
             *self = &[];
             Ok((bytes, false))
         }
     }
 
     #[inline]
-    fn read_with<P>(&mut self, mut parser: P, _buf: (), position: &mut usize) -> Result<&'a [u8]>
+    fn read_with<P>(&mut self, mut parser: P, _buf: (), position: &mut u64) -> Result<&'a [u8]>
     where
         P: Parser,
     {
         if let Some(i) = parser.feed(self) {
             // +1 for `>` which we do not include
-            *position += i + 1;
+            *position += i as u64 + 1;
             let bytes = &self[..i];
             *self = &self[i + 1..];
             return Ok(bytes);
         }
 
-        *position += self.len();
+        *position += self.len() as u64;
         Err(Error::Syntax(P::eof_error()))
     }
 
     #[inline]
-    fn read_bang_element(
-        &mut self,
-        _buf: (),
-        position: &mut usize,
-    ) -> Result<(BangType, &'a [u8])> {
+    fn read_bang_element(&mut self, _buf: (), position: &mut u64) -> Result<(BangType, &'a [u8])> {
         // Peeked one bang ('!') before being called, so it's guaranteed to
         // start with it.
         debug_assert_eq!(self[0], b'!');
 
         let bang_type = BangType::new(self[1..].first().copied())?;
 
         if let Some((bytes, i)) = bang_type.parse(&[], self) {
-            *position += i;
+            *position += i as u64;
             *self = &self[i..];
             return Ok((bang_type, bytes));
         }
 
-        *position += self.len();
+        *position += self.len() as u64;
         Err(bang_type.to_err())
     }
 
     #[inline]
-    fn skip_whitespace(&mut self, position: &mut usize) -> io::Result<()> {
+    fn skip_whitespace(&mut self, position: &mut u64) -> io::Result<()> {
         let whitespaces = self
             .iter()
             .position(|b| !is_whitespace(*b))
             .unwrap_or(self.len());
-        *position += whitespaces;
+        *position += whitespaces as u64;
         *self = &self[whitespaces..];
         Ok(())
     }