|
| 1 | +// Copyright 2015 Nicholas Allegra (comex). |
| 2 | +// Licensed under the Apache License, Version 2.0 <https://www.apache.org/licenses/LICENSE-2.0> or |
| 3 | +// the MIT license <https://opensource.org/licenses/MIT>, at your option. This file may not be |
| 4 | +// copied, modified, or distributed except according to those terms. |
| 5 | + |
| 6 | +//! [`Shlex`] and friends for byte strings. |
| 7 | +//! |
| 8 | +//! This is used internally by the [outer module](crate), and may be more |
| 9 | +//! convenient if you are working with byte slices (`[u8]`) or types that are |
| 10 | +//! wrappers around bytes, such as [`OsStr`](std::ffi::OsStr): |
| 11 | +//! |
| 12 | +//! ```rust |
| 13 | +//! #[cfg(unix)] { |
| 14 | +//! use shlex::bytes::quote; |
| 15 | +//! use std::ffi::OsStr; |
| 16 | +//! use std::os::unix::ffi::OsStrExt; |
| 17 | +//! |
| 18 | +//! // `\x80` is invalid in UTF-8. |
| 19 | +//! let os_str = OsStr::from_bytes(b"a\x80b c"); |
| 20 | +//! assert_eq!(quote(os_str.as_bytes()), &b"\"a\x80b c\""[..]); |
| 21 | +//! } |
| 22 | +//! ``` |
| 23 | +//! |
| 24 | +//! (On Windows, `OsStr` uses 16 bit wide characters so this will not work.) |
| 25 | +
|
| 26 | +extern crate alloc; |
| 27 | +use alloc::vec::Vec; |
| 28 | +use alloc::borrow::Cow; |
| 29 | +#[cfg(test)] |
| 30 | +use alloc::vec; |
| 31 | +#[cfg(test)] |
| 32 | +use alloc::borrow::ToOwned; |
| 33 | + |
/// An iterator that takes an input byte string and splits it into words using the same syntax
/// as the POSIX shell.
///
/// Each item is one word as an owned `Vec<u8>`. Quotes and backslash escapes are resolved while
/// the word is parsed, so the yielded bytes are the word's value, not its source spelling.
#[derive(Debug)]
pub struct Shlex<'a> {
    /// Cursor over the input bytes that have not been consumed yet.
    in_iter: core::slice::Iter<'a, u8>,
    /// The number of newlines read so far, plus one.
    pub line_no: usize,
    /// An input string is erroneous if it ends while inside a quotation or right after an
    /// unescaped backslash. Since `Iterator` does not have a mechanism to return an error, if
    /// that happens, `Shlex` just throws out the last token, ends the iteration, and sets
    /// `had_error` to true; best to check it after you're done iterating.
    pub had_error: bool,
}
| 46 | + |
| 47 | +impl<'a> Shlex<'a> { |
| 48 | + pub fn new(in_bytes: &'a [u8]) -> Self { |
| 49 | + Shlex { |
| 50 | + in_iter: in_bytes.iter(), |
| 51 | + line_no: 1, |
| 52 | + had_error: false, |
| 53 | + } |
| 54 | + } |
| 55 | + |
| 56 | + fn parse_word(&mut self, mut ch: u8) -> Option<Vec<u8>> { |
| 57 | + let mut result: Vec<u8> = Vec::new(); |
| 58 | + loop { |
| 59 | + match ch as char { |
| 60 | + '"' => if let Err(()) = self.parse_double(&mut result) { |
| 61 | + self.had_error = true; |
| 62 | + return None; |
| 63 | + }, |
| 64 | + '\'' => if let Err(()) = self.parse_single(&mut result) { |
| 65 | + self.had_error = true; |
| 66 | + return None; |
| 67 | + }, |
| 68 | + '\\' => if let Some(ch2) = self.next_char() { |
| 69 | + if ch2 != '\n' as u8 { result.push(ch2); } |
| 70 | + } else { |
| 71 | + self.had_error = true; |
| 72 | + return None; |
| 73 | + }, |
| 74 | + ' ' | '\t' | '\n' => { break; }, |
| 75 | + _ => { result.push(ch as u8); }, |
| 76 | + } |
| 77 | + if let Some(ch2) = self.next_char() { ch = ch2; } else { break; } |
| 78 | + } |
| 79 | + Some(result) |
| 80 | + } |
| 81 | + |
| 82 | + fn parse_double(&mut self, result: &mut Vec<u8>) -> Result<(), ()> { |
| 83 | + loop { |
| 84 | + if let Some(ch2) = self.next_char() { |
| 85 | + match ch2 as char { |
| 86 | + '\\' => { |
| 87 | + if let Some(ch3) = self.next_char() { |
| 88 | + match ch3 as char { |
| 89 | + // \$ => $ |
| 90 | + '$' | '`' | '"' | '\\' => { result.push(ch3); }, |
| 91 | + // \<newline> => nothing |
| 92 | + '\n' => {}, |
| 93 | + // \x => =x |
| 94 | + _ => { result.push('\\' as u8); result.push(ch3); } |
| 95 | + } |
| 96 | + } else { |
| 97 | + return Err(()); |
| 98 | + } |
| 99 | + }, |
| 100 | + '"' => { return Ok(()); }, |
| 101 | + _ => { result.push(ch2); }, |
| 102 | + } |
| 103 | + } else { |
| 104 | + return Err(()); |
| 105 | + } |
| 106 | + } |
| 107 | + } |
| 108 | + |
| 109 | + fn parse_single(&mut self, result: &mut Vec<u8>) -> Result<(), ()> { |
| 110 | + loop { |
| 111 | + if let Some(ch2) = self.next_char() { |
| 112 | + match ch2 as char { |
| 113 | + '\'' => { return Ok(()); }, |
| 114 | + _ => { result.push(ch2); }, |
| 115 | + } |
| 116 | + } else { |
| 117 | + return Err(()); |
| 118 | + } |
| 119 | + } |
| 120 | + } |
| 121 | + |
| 122 | + fn next_char(&mut self) -> Option<u8> { |
| 123 | + let res = self.in_iter.next().copied(); |
| 124 | + if res == Some(b'\n') { self.line_no += 1; } |
| 125 | + res |
| 126 | + } |
| 127 | +} |
| 128 | + |
| 129 | +impl<'a> Iterator for Shlex<'a> { |
| 130 | + type Item = Vec<u8>; |
| 131 | + fn next(&mut self) -> Option<Self::Item> { |
| 132 | + if let Some(mut ch) = self.next_char() { |
| 133 | + // skip initial whitespace |
| 134 | + loop { |
| 135 | + match ch as char { |
| 136 | + ' ' | '\t' | '\n' => {}, |
| 137 | + '#' => { |
| 138 | + while let Some(ch2) = self.next_char() { |
| 139 | + if ch2 as char == '\n' { break; } |
| 140 | + } |
| 141 | + }, |
| 142 | + _ => { break; } |
| 143 | + } |
| 144 | + if let Some(ch2) = self.next_char() { ch = ch2; } else { return None; } |
| 145 | + } |
| 146 | + self.parse_word(ch) |
| 147 | + } else { // no initial character |
| 148 | + None |
| 149 | + } |
| 150 | + } |
| 151 | + |
| 152 | +} |
| 153 | + |
| 154 | +/// Convenience function that consumes the whole byte string at once. Returns None if the input was |
| 155 | +/// erroneous. |
| 156 | +pub fn split(in_bytes: &[u8]) -> Option<Vec<Vec<u8>>> { |
| 157 | + let mut shl = Shlex::new(in_bytes); |
| 158 | + let res = shl.by_ref().collect(); |
| 159 | + if shl.had_error { None } else { Some(res) } |
| 160 | +} |
| 161 | + |
/// Given a single word, return a byte string suitable to encode it as a shell argument.
///
/// * An empty word becomes two double quotes (`""`).
/// * A word containing any byte that is special to the shell (operators, quotes, blanks,
///   glob characters, `#`, `~`, `=`, `%`, and the brace-expansion characters `{` and `}`) is
///   wrapped in double quotes, with `$`, `` ` ``, `"` and `\` backslash-escaped inside.
/// * Any other word is returned borrowed and unchanged.
///
/// If given valid UTF-8, this will never produce invalid UTF-8. This is because it only
/// ever inserts valid ASCII characters before or after existing ASCII characters (or
/// returns two double quotes if the input was an empty string). It will never modify a
/// multibyte UTF-8 character.
pub fn quote(in_bytes: &[u8]) -> Cow<[u8]> {
    // Bytes whose presence anywhere in the word forces quoting. `{` and `}` are included so
    // that words such as `{a,b}` are not subject to brace expansion in shells that have it.
    const NEEDS_QUOTING: &[u8] = b"|&;<>()$`\\\"' \t\r\n*?[#~=%{}";

    if in_bytes.is_empty() {
        return Cow::Borrowed(&b"\"\""[..]);
    }
    if !in_bytes.iter().any(|byte| NEEDS_QUOTING.contains(byte)) {
        return Cow::Borrowed(in_bytes);
    }

    // At least the two surrounding quotes are added; escapes may grow it further.
    let mut out: Vec<u8> = Vec::with_capacity(in_bytes.len() + 2);
    out.push(b'"');
    for &byte in in_bytes {
        // These four keep a special meaning inside double quotes and need a backslash.
        match byte {
            b'$' | b'`' | b'"' | b'\\' => out.push(b'\\'),
            _ => {}
        }
        out.push(byte);
    }
    out.push(b'"');
    Cow::Owned(out)
}
| 191 | + |
| 192 | +/// Convenience function that consumes an iterable of words and turns it into a single byte string, |
| 193 | +/// quoting words when necessary. Consecutive words will be separated by a single space. |
| 194 | +pub fn join<'a, I: core::iter::IntoIterator<Item = &'a [u8]>>(words: I) -> Vec<u8> { |
| 195 | + words.into_iter() |
| 196 | + .map(quote) |
| 197 | + .collect::<Vec<_>>() |
| 198 | + .join(&b' ') |
| 199 | +} |
| 200 | + |
/// A single byte that is not valid UTF-8 on its own; used to check that the byte-oriented
/// functions pass such input through.
#[cfg(test)]
const INVALID_UTF8: &[u8] = b"\xa1";

#[test]
fn test_invalid_utf8() {
    // Guard against the fixture accidentally being valid UTF-8.
    let decoded = core::str::from_utf8(INVALID_UTF8);
    assert!(decoded.is_err());
}
| 209 | + |
/// Table of `(input, expected)` pairs for `split`: `Some(words)` for well-formed input and
/// `None` for input that must be reported as erroneous.
#[cfg(test)]
static SPLIT_TEST_ITEMS: &[(&[u8], Option<&[&[u8]]>)] = &[
    // Plain words and blank separation.
    (b"foo$baz", Some(&[b"foo$baz"])),
    (b"foo baz", Some(&[b"foo", b"baz"])),
    // Quoted sections glue onto the surrounding word.
    (b"foo\"bar\"baz", Some(&[b"foobarbaz"])),
    (b"foo \"bar\"baz", Some(&[b"foo", b"barbaz"])),
    (b"   foo \nbar", Some(&[b"foo", b"bar"])),
    // Backslash-newline is removed, both bare and inside double quotes.
    (b"foo\\\nbar", Some(&[b"foobar"])),
    (b"\"foo\\\nbar\"", Some(&[b"foobar"])),
    // Backslashes are literal inside single quotes, so `\'` cannot escape the quote.
    (b"'baz\\$b'", Some(&[b"baz\\$b"])),
    (b"'baz\\\''", None),
    // Input ending after a backslash or inside a quotation is an error.
    (b"\\", None),
    (b"\"\\", None),
    (b"'\\", None),
    (b"\"", None),
    (b"'", None),
    // `#` starts a comment only at the beginning of a word.
    (b"foo #bar\nbaz", Some(&[b"foo", b"baz"])),
    (b"foo #bar", Some(&[b"foo"])),
    (b"foo#bar", Some(&[b"foo#bar"])),
    (b"foo\"#bar", None),
    (b"'\\n'", Some(&[b"\\n"])),
    (b"'\\\\n'", Some(&[b"\\\\n"])),
    // Bytes that are not valid UTF-8 pass through untouched.
    (INVALID_UTF8, Some(&[INVALID_UTF8])),
];
| 234 | + |
/// Runs `split` over every entry of `SPLIT_TEST_ITEMS` and compares against the expectation.
#[test]
fn test_split() {
    for &(input, output) in SPLIT_TEST_ITEMS {
        // Convert the borrowed expectation into the owned shape that `split` returns.
        let expected: Option<Vec<Vec<u8>>> =
            output.map(|words| words.iter().map(|&word| word.to_owned()).collect());
        assert_eq!(split(input), expected);
    }
}
| 241 | + |
/// Checks that `line_no` starts at 1 and has advanced to 3 by the time the word on the third
/// line has been returned.
#[test]
fn test_lineno() {
    let mut sh = Shlex::new(b"\nfoo\nbar");
    assert_eq!(sh.line_no, 1);

    // `while let` rather than `for`: `sh.line_no` is read between calls to `next`.
    let mut saw_bar = false;
    while let Some(word) = sh.next() {
        if word == b"bar" {
            saw_bar = true;
            assert_eq!(sh.line_no, 3);
        }
    }

    // Without this the line-number assertion above could be skipped silently.
    assert!(saw_bar, "the word `bar` was never produced");
    assert!(!sh.had_error);
}
| 251 | + |
/// Checks `quote` on a plain word, a word with a space, a lone double quote, the empty word,
/// and a byte that is not valid UTF-8.
#[test]
fn test_quote() {
    let cases: [(&[u8], &[u8]); 5] = [
        (&b"foobar"[..], &b"foobar"[..]),
        (&b"foo bar"[..], &b"\"foo bar\""[..]),
        (&b"\""[..], &b"\"\\\"\""[..]),
        (&b""[..], &b"\"\""[..]),
        (INVALID_UTF8, INVALID_UTF8),
    ];
    for &(input, expected) in cases.iter() {
        assert_eq!(quote(input), expected);
    }
}
| 260 | + |
/// Checks `join` on no words, a single empty word, plain words, a word that needs quoting,
/// and a byte that is not valid UTF-8.
#[test]
fn test_join() {
    let cases: Vec<(Vec<&[u8]>, &[u8])> = vec![
        (vec![], &b""[..]),
        (vec![&b""[..]], &b"\"\""[..]),
        (vec![&b"a"[..], &b"b"[..]], &b"a b"[..]),
        (vec![&b"foo bar"[..], &b"baz"[..]], &b"\"foo bar\" baz"[..]),
        (vec![INVALID_UTF8], INVALID_UTF8),
    ];
    for (words, expected) in cases {
        assert_eq!(join(words), expected);
    }
}
0 commit comments