Implement Shlex with bytes::Shlex.

danielparks · danielparks · commit 0c786d42a288 · 2023-09-04T12:31:49.000-07:00
diff --git a/src/bytes.rs b/src/bytes.rs
@@ -5,8 +5,9 @@
 
 //! [`Shlex`] and friends for byte strings.
 //!
-//! This may be more convenient if you are working with byte slices (`[u8]`)
-//! or types that are wrappers around bytes, such as [`OsStr`](std::ffi::OsStr):
+//! This is used internally by the [outer module](crate), and may be more
+//! convenient if you are working with byte slices (`[u8]`) or types that are
+//! wrappers around bytes, such as [`OsStr`](std::ffi::OsStr):
 //!
 //! ```rust
 //! #[cfg(unix)] {
diff --git a/src/lib.rs b/src/lib.rs
@@ -12,8 +12,9 @@
 //! This implementation also deviates from the Python version in not treating `\r` specially, which
 //! I believe is more compliant.
 //!
-//! The algorithms in this crate are oblivious to UTF-8 high bytes, so they iterate over the bytes
-//! directly as a micro-optimization.
+//! This is a string-friendly wrapper around the [bytes] module that works on the underlying byte
+//! slices. The algorithms in this crate are oblivious to UTF-8 high bytes, so working directly
+//! with bytes is a safe micro-optimization.
 //!
 //! Disabling the `std` feature (which is enabled by default) will allow the crate to work in
 //! `no_std` environments, where the `alloc` crate, and a global allocator, are available.
@@ -33,122 +34,38 @@ pub mod bytes;
 
 /// An iterator that takes an input string and splits it into the words using the same syntax as
 /// the POSIX shell.
-pub struct Shlex<'a> {
-    in_iter: core::str::Bytes<'a>,
-    /// The number of newlines read so far, plus one.
-    pub line_no: usize,
-    /// An input string is erroneous if it ends while inside a quotation or right after an
-    /// unescaped backslash.  Since Iterator does not have a mechanism to return an error, if that
-    /// happens, Shlex just throws out the last token, ends the iteration, and sets 'had_error' to
-    /// true; best to check it after you're done iterating.
-    pub had_error: bool,
-}
+///
+/// See [`bytes::Shlex`].
+pub struct Shlex<'a>(bytes::Shlex<'a>);
 
 impl<'a> Shlex<'a> {
     pub fn new(in_str: &'a str) -> Self {
-        Shlex {
-            in_iter: in_str.bytes(),
-            line_no: 1,
-            had_error: false,
-        }
-    }
-
-    fn parse_word(&mut self, mut ch: u8) -> Option<String> {
-        let mut result: Vec<u8> = Vec::new();
-        loop {
-            match ch as char {
-                '"' => if let Err(()) = self.parse_double(&mut result) {
-                    self.had_error = true;
-                    return None;
-                },
-                '\'' => if let Err(()) = self.parse_single(&mut result) {
-                    self.had_error = true;
-                    return None;
-                },
-                '\\' => if let Some(ch2) = self.next_char() {
-                    if ch2 != '\n' as u8 { result.push(ch2); }
-                } else {
-                    self.had_error = true;
-                    return None;
-                },
-                ' ' | '\t' | '\n' => { break; },
-                _ => { result.push(ch as u8); },
-            }
-            if let Some(ch2) = self.next_char() { ch = ch2; } else { break; }
-        }
-        unsafe { Some(String::from_utf8_unchecked(result)) }
+        Self(bytes::Shlex::new(in_str.as_bytes()))
     }
+}
 
-    fn parse_double(&mut self, result: &mut Vec<u8>) -> Result<(), ()> {
-        loop {
-            if let Some(ch2) = self.next_char() {
-                match ch2 as char {
-                    '\\' => {
-                        if let Some(ch3) = self.next_char() {
-                            match ch3 as char {
-                                // \$ => $
-                                '$' | '`' | '"' | '\\' => { result.push(ch3); },
-                                // \<newline> => nothing
-                                '\n' => {},
-                                // \x => =x
-                                _ => { result.push('\\' as u8); result.push(ch3); }
-                            }
-                        } else {
-                            return Err(());
-                        }
-                    },
-                    '"' => { return Ok(()); },
-                    _ => { result.push(ch2); },
-                }
-            } else {
-                return Err(());
-            }
-        }
+impl<'a> Iterator for Shlex<'a> {
+    type Item = String;
+    fn next(&mut self) -> Option<String> {
+        self.0.next().map(|byte_word| {
+            // Safety: given valid UTF-8, bytes::Shlex will always return valid UTF-8.
+            unsafe { String::from_utf8_unchecked(byte_word) }
+        })
     }
+}
 
-    fn parse_single(&mut self, result: &mut Vec<u8>) -> Result<(), ()> {
-        loop {
-            if let Some(ch2) = self.next_char() {
-                match ch2 as char {
-                    '\'' => { return Ok(()); },
-                    _ => { result.push(ch2); },
-                }
-            } else {
-                return Err(());
-            }
-        }
-    }
+impl<'a> core::ops::Deref for Shlex<'a> {
+    type Target = bytes::Shlex<'a>;
 
-    fn next_char(&mut self) -> Option<u8> {
-        let res = self.in_iter.next();
-        if res == Some('\n' as u8) { self.line_no += 1; }
-        res
+    fn deref(&self) -> &Self::Target {
+        &self.0
     }
 }
 
-impl<'a> Iterator for Shlex<'a> {
-    type Item = String;
-    fn next(&mut self) -> Option<String> {
-        if let Some(mut ch) = self.next_char() {
-            // skip initial whitespace
-            loop {
-                match ch as char {
-                    ' ' | '\t' | '\n' => {},
-                    '#' => {
-                        while let Some(ch2) = self.next_char() {
-                            if ch2 as char == '\n' { break; }
-                        }
-                    },
-                    _ => { break; }
-                }
-                if let Some(ch2) = self.next_char() { ch = ch2; } else { return None; }
-            }
-            self.parse_word(ch)
-        } else { // no initial character
-            None
-        }
+impl<'a> core::ops::DerefMut for Shlex<'a> {
+    fn deref_mut(&mut self) -> &mut Self::Target {
+        &mut self.0
     }
-
 }
 
 /// Convenience function that consumes the whole string at once.  Returns None if the input was