Skip to content

Commit 0c786d4

Browse files
committed
Implement Shlex with bytes::Shlex.
1 parent 879d212 commit 0c786d4

File tree

2 files changed

+26
-108
lines changed

2 files changed

+26
-108
lines changed

src/bytes.rs

+3-2
Original file line numberDiff line numberDiff line change
@@ -5,8 +5,9 @@
55

66
//! [`Shlex`] and friends for byte strings.
77
//!
8-
//! This may be more convenient if you are working with byte slices (`[u8]`)
9-
//! or types that are wrappers around bytes, such as [`OsStr`](std::ffi::OsStr):
8+
//! This is used internally by the [outer module](crate), and may be more
9+
//! convenient if you are working with byte slices (`[u8]`) or types that are
10+
//! wrappers around bytes, such as [`OsStr`](std::ffi::OsStr):
1011
//!
1112
//! ```rust
1213
//! #[cfg(unix)] {

src/lib.rs

+23-106
Original file line numberDiff line numberDiff line change
@@ -12,8 +12,9 @@
1212
//! This implementation also deviates from the Python version in not treating `\r` specially, which
1313
//! I believe is more compliant.
1414
//!
15-
//! The algorithms in this crate are oblivious to UTF-8 high bytes, so they iterate over the bytes
16-
//! directly as a micro-optimization.
15+
//! This is a string-friendly wrapper around the [bytes] module that works on the underlying byte
16+
//! slices. The algorithms in this crate are oblivious to UTF-8 high bytes, so working directly
17+
//! with bytes is a safe micro-optimization.
1718
//!
1819
//! Disabling the `std` feature (which is enabled by default) will allow the crate to work in
1920
//! `no_std` environments, where the `alloc` crate, and a global allocator, are available.
@@ -33,122 +34,38 @@ pub mod bytes;
3334

3435
/// An iterator that takes an input string and splits it into the words using the same syntax as
3536
/// the POSIX shell.
36-
pub struct Shlex<'a> {
37-
in_iter: core::str::Bytes<'a>,
38-
/// The number of newlines read so far, plus one.
39-
pub line_no: usize,
40-
/// An input string is erroneous if it ends while inside a quotation or right after an
41-
/// unescaped backslash. Since Iterator does not have a mechanism to return an error, if that
42-
/// happens, Shlex just throws out the last token, ends the iteration, and sets 'had_error' to
43-
/// true; best to check it after you're done iterating.
44-
pub had_error: bool,
45-
}
37+
///
38+
/// See [`bytes::Shlex`].
39+
pub struct Shlex<'a>(bytes::Shlex<'a>);
4640

4741
impl<'a> Shlex<'a> {
4842
pub fn new(in_str: &'a str) -> Self {
49-
Shlex {
50-
in_iter: in_str.bytes(),
51-
line_no: 1,
52-
had_error: false,
53-
}
54-
}
55-
56-
fn parse_word(&mut self, mut ch: u8) -> Option<String> {
57-
let mut result: Vec<u8> = Vec::new();
58-
loop {
59-
match ch as char {
60-
'"' => if let Err(()) = self.parse_double(&mut result) {
61-
self.had_error = true;
62-
return None;
63-
},
64-
'\'' => if let Err(()) = self.parse_single(&mut result) {
65-
self.had_error = true;
66-
return None;
67-
},
68-
'\\' => if let Some(ch2) = self.next_char() {
69-
if ch2 != '\n' as u8 { result.push(ch2); }
70-
} else {
71-
self.had_error = true;
72-
return None;
73-
},
74-
' ' | '\t' | '\n' => { break; },
75-
_ => { result.push(ch as u8); },
76-
}
77-
if let Some(ch2) = self.next_char() { ch = ch2; } else { break; }
78-
}
79-
unsafe { Some(String::from_utf8_unchecked(result)) }
43+
Self(bytes::Shlex::new(in_str.as_bytes()))
8044
}
45+
}
8146

82-
fn parse_double(&mut self, result: &mut Vec<u8>) -> Result<(), ()> {
83-
loop {
84-
if let Some(ch2) = self.next_char() {
85-
match ch2 as char {
86-
'\\' => {
87-
if let Some(ch3) = self.next_char() {
88-
match ch3 as char {
89-
// \$ => $
90-
'$' | '`' | '"' | '\\' => { result.push(ch3); },
91-
// \<newline> => nothing
92-
'\n' => {},
93-
// \x => \x
94-
_ => { result.push('\\' as u8); result.push(ch3); }
95-
}
96-
} else {
97-
return Err(());
98-
}
99-
},
100-
'"' => { return Ok(()); },
101-
_ => { result.push(ch2); },
102-
}
103-
} else {
104-
return Err(());
105-
}
106-
}
47+
impl<'a> Iterator for Shlex<'a> {
48+
type Item = String;
49+
fn next(&mut self) -> Option<String> {
50+
self.0.next().map(|byte_word| {
51+
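// SAFETY: `new` only accepts `&str`, so the input is valid UTF-8, and given valid UTF-8, bytes::Shlex will always return valid UTF-8. NOTE(review): `DerefMut` exposes `&mut bytes::Shlex` — confirm the inner iterator cannot be swapped for one over non-UTF-8 bytes.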
52+
unsafe { String::from_utf8_unchecked(byte_word) }
53+
})
10754
}
55+
}
10856

109-
fn parse_single(&mut self, result: &mut Vec<u8>) -> Result<(), ()> {
110-
loop {
111-
if let Some(ch2) = self.next_char() {
112-
match ch2 as char {
113-
'\'' => { return Ok(()); },
114-
_ => { result.push(ch2); },
115-
}
116-
} else {
117-
return Err(());
118-
}
119-
}
120-
}
57+
impl<'a> core::ops::Deref for Shlex<'a> {
58+
type Target = bytes::Shlex<'a>;
12159

122-
fn next_char(&mut self) -> Option<u8> {
123-
let res = self.in_iter.next();
124-
if res == Some('\n' as u8) { self.line_no += 1; }
125-
res
60+
fn deref(&self) -> &Self::Target {
61+
&self.0
12662
}
12763
}
12864

129-
impl<'a> Iterator for Shlex<'a> {
130-
type Item = String;
131-
fn next(&mut self) -> Option<String> {
132-
if let Some(mut ch) = self.next_char() {
133-
// skip initial whitespace
134-
loop {
135-
match ch as char {
136-
' ' | '\t' | '\n' => {},
137-
'#' => {
138-
while let Some(ch2) = self.next_char() {
139-
if ch2 as char == '\n' { break; }
140-
}
141-
},
142-
_ => { break; }
143-
}
144-
if let Some(ch2) = self.next_char() { ch = ch2; } else { return None; }
145-
}
146-
self.parse_word(ch)
147-
} else { // no initial character
148-
None
149-
}
65+
impl<'a> core::ops::DerefMut for Shlex<'a> {
66+
fn deref_mut(&mut self) -> &mut Self::Target {
67+
&mut self.0
15068
}
151-
15269
}
15370

15471
/// Convenience function that consumes the whole string at once. Returns None if the input was

0 commit comments

Comments
 (0)