Skip to content

Commit f44b62e

Browse files
authored
Merge pull request #15 from danielparks/bytes
Add support for operating on byte strings
2 parents aa2d6e3 + 0c786d4 commit f44b62e

File tree

4 files changed

+308
-127
lines changed

4 files changed

+308
-127
lines changed

CHANGELOG.md

+4
Original file line numberDiff line numberDiff line change
@@ -1,3 +1,7 @@
1+
# Next release
2+
3+
* Adds `bytes` module to support operating directly on byte strings.
4+
15
# 1.1.0
26

37
* Adds the `std` feature (enabled by default)

README.md

+3-2
Original file line numberDiff line numberDiff line change
@@ -16,8 +16,9 @@ You only get the default settings of shlex.split, which mimic the POSIX shell:
1616
This implementation also deviates from the Python version in not treating \r
1717
specially, which I believe is more compliant.
1818

19-
The algorithms in this crate are oblivious to UTF-8 high bytes, so they iterate
20-
over the bytes directly as a micro-optimization.
19+
This crate can be used on either normal Rust strings, or on byte strings with
20+
the `bytes` module. The algorithms used are oblivious to UTF-8 high bytes, so
21+
internally they all work on bytes directly as a micro-optimization.
2122

2223
Disabling the `std` feature (which is enabled by default) will allow the crate
2324
to work in `no_std` environments, where the `alloc` crate, and a global

src/bytes.rs

+268
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,268 @@
1+
// Copyright 2015 Nicholas Allegra (comex).
2+
// Licensed under the Apache License, Version 2.0 <https://www.apache.org/licenses/LICENSE-2.0> or
3+
// the MIT license <https://opensource.org/licenses/MIT>, at your option. This file may not be
4+
// copied, modified, or distributed except according to those terms.
5+
6+
//! [`Shlex`] and friends for byte strings.
7+
//!
8+
//! This is used internally by the [outer module](crate), and may be more
9+
//! convenient if you are working with byte slices (`[u8]`) or types that are
10+
//! wrappers around bytes, such as [`OsStr`](std::ffi::OsStr):
11+
//!
12+
//! ```rust
13+
//! #[cfg(unix)] {
14+
//! use shlex::bytes::quote;
15+
//! use std::ffi::OsStr;
16+
//! use std::os::unix::ffi::OsStrExt;
17+
//!
18+
//! // `\x80` is invalid in UTF-8.
19+
//! let os_str = OsStr::from_bytes(b"a\x80b c");
20+
//! assert_eq!(quote(os_str.as_bytes()), &b"\"a\x80b c\""[..]);
21+
//! }
22+
//! ```
23+
//!
24+
//! (On Windows, `OsStr` uses 16-bit wide characters, so this will not work.)
25+
26+
extern crate alloc;
27+
use alloc::vec::Vec;
28+
use alloc::borrow::Cow;
29+
#[cfg(test)]
30+
use alloc::vec;
31+
#[cfg(test)]
32+
use alloc::borrow::ToOwned;
33+
34+
/// An iterator that takes an input byte string and splits it into words using the same syntax as
/// the POSIX shell.
#[derive(Debug)]
pub struct Shlex<'a> {
    /// The input bytes that have not been consumed yet.
    in_iter: core::slice::Iter<'a, u8>,
    /// The number of newlines read so far, plus one.
    pub line_no: usize,
    /// An input string is erroneous if it ends while inside a quotation or right after an
    /// unescaped backslash. Since Iterator does not have a mechanism to return an error, if that
    /// happens, Shlex just throws out the last token, ends the iteration, and sets 'had_error' to
    /// true; best to check it after you're done iterating.
    pub had_error: bool,
}

impl<'a> Shlex<'a> {
    /// Creates a splitter over `in_bytes`, starting at line 1 with no error recorded.
    pub fn new(in_bytes: &'a [u8]) -> Self {
        Shlex {
            in_iter: in_bytes.iter(),
            line_no: 1,
            had_error: false,
        }
    }

    /// Parses one word whose first byte, `ch`, has already been read.
    ///
    /// Returns `None` and sets `had_error` if the input ends inside a quotation or right after
    /// a backslash.
    fn parse_word(&mut self, mut ch: u8) -> Option<Vec<u8>> {
        let mut result: Vec<u8> = Vec::new();
        loop {
            match ch {
                b'"' => {
                    if self.parse_double(&mut result).is_err() {
                        self.had_error = true;
                        return None;
                    }
                }
                b'\'' => {
                    if self.parse_single(&mut result).is_err() {
                        self.had_error = true;
                        return None;
                    }
                }
                b'\\' => match self.next_char() {
                    // \<newline> => nothing (line continuation)
                    Some(b'\n') => {}
                    Some(escaped) => result.push(escaped),
                    None => {
                        self.had_error = true;
                        return None;
                    }
                },
                // Unquoted whitespace terminates the word.
                b' ' | b'\t' | b'\n' => break,
                _ => result.push(ch),
            }
            match self.next_char() {
                Some(next) => ch = next,
                // End of input also terminates the word.
                None => break,
            }
        }
        Some(result)
    }

    /// Appends the contents of a double-quoted section to `result`; the opening quote has
    /// already been consumed. Returns `Err(())` if the input ends before the closing quote.
    fn parse_double(&mut self, result: &mut Vec<u8>) -> Result<(), ()> {
        loop {
            match self.next_char().ok_or(())? {
                b'\\' => {
                    let escaped = self.next_char().ok_or(())?;
                    match escaped {
                        // \$ => $
                        b'$' | b'`' | b'"' | b'\\' => result.push(escaped),
                        // \<newline> => nothing
                        b'\n' => {}
                        // \x => \x (the backslash is kept for any other byte)
                        _ => {
                            result.push(b'\\');
                            result.push(escaped);
                        }
                    }
                }
                b'"' => return Ok(()),
                other => result.push(other),
            }
        }
    }

    /// Appends the contents of a single-quoted section to `result` verbatim; the opening quote
    /// has already been consumed. Returns `Err(())` if the input ends before the closing quote.
    fn parse_single(&mut self, result: &mut Vec<u8>) -> Result<(), ()> {
        loop {
            match self.next_char().ok_or(())? {
                b'\'' => return Ok(()),
                other => result.push(other),
            }
        }
    }

    /// Reads the next input byte, bumping `line_no` whenever a newline is consumed.
    fn next_char(&mut self) -> Option<u8> {
        let res = self.in_iter.next().copied();
        if res == Some(b'\n') {
            self.line_no += 1;
        }
        res
    }
}

impl<'a> Iterator for Shlex<'a> {
    type Item = Vec<u8>;

    /// Returns the next word, or `None` at end of input or after a parse error (in which case
    /// `had_error` is set).
    fn next(&mut self) -> Option<Self::Item> {
        let mut ch = self.next_char()?;
        // Skip leading whitespace and `#` comments until a word starts.
        loop {
            match ch {
                b' ' | b'\t' | b'\n' => {}
                b'#' => {
                    // A comment runs up to and including the next newline.
                    while let Some(skipped) = self.next_char() {
                        if skipped == b'\n' {
                            break;
                        }
                    }
                }
                _ => break,
            }
            ch = self.next_char()?;
        }
        self.parse_word(ch)
    }
}
153+
154+
/// Convenience function that consumes the whole byte string at once. Returns None if the input was
155+
/// erroneous.
156+
pub fn split(in_bytes: &[u8]) -> Option<Vec<Vec<u8>>> {
157+
let mut shl = Shlex::new(in_bytes);
158+
let res = shl.by_ref().collect();
159+
if shl.had_error { None } else { Some(res) }
160+
}
161+
162+
/// Given a single word, return a byte string suitable to encode it as a shell argument.
///
/// The input is returned unchanged (borrowed) when it contains no byte that is special to the
/// shell. Otherwise it is wrapped in double quotes, with `$`, `` ` ``, `"` and `\` escaped by a
/// backslash. An empty input becomes `""`.
///
/// Braces (`{` and `}`) are treated as special: an unquoted word such as `{a,b}` would be
/// brace-expanded into several arguments by shells like bash and zsh.
///
/// If given valid UTF-8, this will never produce invalid UTF-8. This is because it only
/// ever inserts valid ASCII characters before or after existing ASCII characters (or
/// returns two double quotes if the input was an empty string). It will never modify a
/// multibyte UTF-8 character.
pub fn quote(in_bytes: &[u8]) -> Cow<[u8]> {
    if in_bytes.is_empty() {
        // An empty word has to be spelled out as `""` to survive as an argument.
        return b"\"\""[..].into();
    }

    let needs_quoting = in_bytes.iter().any(|&c| match c {
        b'|' | b'&' | b';' | b'<' | b'>' | b'(' | b')' | b'$' | b'`' | b'\\' | b'"' | b'\''
        | b' ' | b'\t' | b'\r' | b'\n' | b'*' | b'?' | b'[' | b'#' | b'~' | b'=' | b'%'
        | b'{' | b'}' => true,
        _ => false,
    });
    if !needs_quoting {
        return in_bytes.into();
    }

    // Two surrounding quotes plus the payload; escapes may grow the buffer further.
    let mut out: Vec<u8> = Vec::with_capacity(in_bytes.len() + 2);
    out.push(b'"');
    for &c in in_bytes {
        // These bytes keep their special meaning inside double quotes, so escape them.
        match c {
            b'$' | b'`' | b'"' | b'\\' => out.push(b'\\'),
            _ => (),
        }
        out.push(c);
    }
    out.push(b'"');
    out.into()
}
191+
192+
/// Convenience function that consumes an iterable of words and turns it into a single byte string,
193+
/// quoting words when necessary. Consecutive words will be separated by a single space.
194+
pub fn join<'a, I: core::iter::IntoIterator<Item = &'a [u8]>>(words: I) -> Vec<u8> {
195+
words.into_iter()
196+
.map(quote)
197+
.collect::<Vec<_>>()
198+
.join(&b' ')
199+
}
200+
201+
/// A byte string that is not valid UTF-8, shared by the tests below.
#[cfg(test)]
const INVALID_UTF8: &[u8] = b"\xa1";

#[test]
fn test_invalid_utf8() {
    // Guard against the fixture accidentally being valid UTF-8.
    let decoded = core::str::from_utf8(INVALID_UTF8);
    assert!(decoded.is_err());
}
209+
210+
/// Table of `(input, expected)` pairs for `split`; `None` marks input that must be rejected.
#[cfg(test)]
static SPLIT_TEST_ITEMS: &[(&[u8], Option<&[&[u8]]>)] = &[
    (b"foo$baz", Some(&[b"foo$baz"])),
    (b"foo baz", Some(&[b"foo", b"baz"])),
    (b"foo\"bar\"baz", Some(&[b"foobarbaz"])),
    (b"foo \"bar\"baz", Some(&[b"foo", b"barbaz"])),
    (b" foo \nbar", Some(&[b"foo", b"bar"])),
    (b"foo\\\nbar", Some(&[b"foobar"])),
    (b"\"foo\\\nbar\"", Some(&[b"foobar"])),
    (b"'baz\\$b'", Some(&[b"baz\\$b"])),
    (b"'baz\\\''", None),
    (b"\\", None),
    (b"\"\\", None),
    (b"'\\", None),
    (b"\"", None),
    (b"'", None),
    (b"foo #bar\nbaz", Some(&[b"foo", b"baz"])),
    (b"foo #bar", Some(&[b"foo"])),
    (b"foo#bar", Some(&[b"foo#bar"])),
    (b"foo\"#bar", None),
    (b"'\\n'", Some(&[b"\\n"])),
    (b"'\\\\n'", Some(&[b"\\\\n"])),
    (INVALID_UTF8, Some(&[INVALID_UTF8])),
];

#[test]
fn test_split() {
    for &(input, output) in SPLIT_TEST_ITEMS {
        let expected: Option<Vec<Vec<u8>>> =
            output.map(|words| words.iter().map(|&word| word.to_owned()).collect());
        // Name the offending input so a failure points at the right table entry.
        assert_eq!(split(input), expected, "unexpected result splitting {:?}", input);
    }
}
241+
242+
#[test]
fn test_lineno() {
    let mut sh = Shlex::new(b"\nfoo\nbar");
    // Track whether the checked word was produced, so the test cannot pass vacuously.
    let mut saw_bar = false;
    while let Some(word) = sh.next() {
        if word == b"bar" {
            saw_bar = true;
            assert_eq!(sh.line_no, 3);
        }
    }
    assert!(saw_bar, "expected the word `bar` to be produced");
    assert!(!sh.had_error);
}
251+
252+
#[test]
fn test_quote() {
    // Nothing special: passed through untouched.
    assert_eq!(&quote(b"foobar")[..], b"foobar");
    // Whitespace forces double quotes.
    assert_eq!(&quote(b"foo bar")[..], b"\"foo bar\"");
    // An embedded double quote is backslash-escaped.
    assert_eq!(&quote(b"\"")[..], b"\"\\\"\"");
    // The empty word is spelled as two double quotes.
    assert_eq!(&quote(b"")[..], b"\"\"");
    // Bytes that are not valid UTF-8 are left alone.
    assert_eq!(&quote(INVALID_UTF8)[..], INVALID_UTF8);
}
260+
261+
#[test]
fn test_join() {
    // No words at all yields an empty byte string.
    assert_eq!(&join(vec![])[..], b"");
    // A single empty word is still quoted.
    assert_eq!(&join(vec![&b""[..]])[..], b"\"\"");
    // Plain words are separated by one space.
    assert_eq!(&join(vec![&b"a"[..], &b"b"[..]])[..], b"a b");
    // Only the word that needs it gets quoted.
    assert_eq!(&join(vec![&b"foo bar"[..], &b"baz"[..]])[..], b"\"foo bar\" baz");
    // Bytes that are not valid UTF-8 pass through unchanged.
    assert_eq!(&join(vec![INVALID_UTF8])[..], INVALID_UTF8);
}

0 commit comments

Comments
 (0)