Skip to content

Commit cd8ec38

Browse files
committed
grep-regex: add fast path for -w/--word-regexp
Previously, ripgrep would always defer to the regex engine's capturing matches in order to implement word matching. Namely, ripgrep would determine the correct match offsets via a capturing group, since the word regex is itself generated from the user supplied regex. Unfortunately, the regex engine's capturing mode is still fairly slow, so this commit adds a fast path to avoid capturing mode in the vast majority of cases. See comments in the code for details.
1 parent 6a0e014 commit cd8ec38

File tree

4 files changed

+101
-4
lines changed

4 files changed

+101
-4
lines changed

CHANGELOG.md

+2
Original file line number | Diff line number | Diff line change
@@ -12,6 +12,8 @@ Performance improvements:
1212
of ` `.
1313
* PERF:
1414
Improve literal detection when the `-w/--word-regexp` flag is used.
15+
* PERF:
16+
Improve overall performance of the `-w/--word-regexp` flag.
1517

1618
Feature enhancements:
1719

Cargo.lock

+1
Some generated files are not rendered by default. Learn more about customizing how changed files appear on GitHub.

grep-regex/Cargo.toml

+1
Original file line number | Diff line number | Diff line change
@@ -14,6 +14,7 @@ license = "Unlicense/MIT"
1414

1515
[dependencies]
1616
aho-corasick = "0.7.3"
17+
bstr = "0.2.10"
1718
grep-matcher = { version = "0.1.2", path = "../grep-matcher" }
1819
log = "0.4.5"
1920
regex = "1.1"

grep-regex/src/word.rs

+97-4
Original file line number | Diff line number | Diff line change
@@ -15,6 +15,9 @@ use matcher::RegexCaptures;
1515
pub struct WordMatcher {
1616
/// The regex which is roughly `(?:^|\W)(<original pattern>)(?:$|\W)`.
1717
regex: Regex,
18+
/// The original regex supplied by the user, which we use in a fast path
19+
/// to try and detect matches before deferring to slower engines.
20+
original: Regex,
1821
/// A map from capture group name to capture group index.
1922
names: HashMap<String, usize>,
2023
/// A reusable buffer for finding the match location of the inner group.
@@ -28,6 +31,7 @@ impl Clone for WordMatcher {
2831
// using `locs` to hit the fast path.
2932
WordMatcher {
3033
regex: self.regex.clone(),
34+
original: self.original.clone(),
3135
names: self.names.clone(),
3236
locs: Arc::new(CachedThreadLocal::new()),
3337
}
@@ -41,8 +45,13 @@ impl WordMatcher {
4145
/// The given options are used to construct the regular expression
4246
/// internally.
4347
pub fn new(expr: &ConfiguredHIR) -> Result<WordMatcher, Error> {
48+
let original = expr.with_pattern(|pat| {
49+
format!("^(?:{})$", pat)
50+
})?.regex()?;
4451
let word_expr = expr.with_pattern(|pat| {
45-
format!(r"(?:(?m:^)|\W)({})(?:(?m:$)|\W)", pat)
52+
let pat = format!(r"(?:(?-m:^)|\W)({})(?:(?-m:$)|\W)", pat);
53+
debug!("word regex: {:?}", pat);
54+
pat
4655
})?;
4756
let regex = word_expr.regex()?;
4857
let locs = Arc::new(CachedThreadLocal::new());
@@ -53,13 +62,65 @@ impl WordMatcher {
5362
names.insert(name.to_string(), i.checked_sub(1).unwrap());
5463
}
5564
}
56-
Ok(WordMatcher { regex, names, locs })
65+
Ok(WordMatcher { regex, original, names, locs })
5766
}
5867

5968
/// Return the underlying regex used by this matcher.
6069
pub fn regex(&self) -> &Regex {
6170
&self.regex
6271
}
72+
73+
/// Attempt to do a fast confirmation of a word match that covers a subset
74+
/// (but hopefully a big subset) of most cases. Ok(Some(..)) is returned
75+
/// when a match is found. Ok(None) is returned when there is definitively
76+
/// no match. Err(()) is returned when this routine could not detect
77+
/// whether there was a match or not.
78+
fn fast_find(
79+
&self,
80+
haystack: &[u8],
81+
at: usize,
82+
) -> Result<Option<Match>, ()> {
83+
// This is a bit hairy. The whole point here is to avoid running an
84+
// NFA simulation in the regex engine. Remember, our word regex looks
85+
// like this:
86+
//
87+
// (^|\W)(<original regex>)($|\W)
88+
// where ^ and $ have multiline mode DISABLED
89+
//
90+
// What we want are the match offsets of <original regex>. So in the
91+
// easy/common case, the original regex will be sandwiched between
92+
// two codepoints that are in the \W class. So our approach here is to
93+
// look for a match of the overall word regexp, strip the \W ends and
94+
// then check whether the original regex matches what's left. If so,
95+
// then we are guaranteed a correct match.
96+
//
97+
// This only works though if we know that the match is sandwiched
98+
// between two \W codepoints. This only occurs when neither ^ nor $
99+
// match. Conversely, ^ or $ can only match when the candidate is at the
100+
// beginning or end of the haystack. In either of those cases, we
101+
// declare defeat and defer to the slower implementation.
102+
//
103+
// The reason why we cannot handle the ^/$ cases here is because we
104+
// can't assume anything about the original pattern. (Try commenting
105+
// out the checks for ^/$ below and run the tests to see examples.)
106+
let mut cand = match self.regex.find_at(haystack, at) {
107+
None => return Ok(None),
108+
Some(m) => Match::new(m.start(), m.end()),
109+
};
110+
if cand.start() == 0 || cand.end() == haystack.len() {
111+
return Err(());
112+
}
113+
let (_, slen) = bstr::decode_utf8(&haystack[cand]);
114+
let (_, elen) = bstr::decode_last_utf8(&haystack[cand]);
115+
cand = cand
116+
.with_start(cand.start() + slen)
117+
.with_end(cand.end() - elen);
118+
if self.original.is_match(&haystack[cand]) {
119+
Ok(Some(cand))
120+
} else {
121+
Err(())
122+
}
123+
}
63124
}
64125

65126
impl Matcher for WordMatcher {
@@ -76,6 +137,16 @@ impl Matcher for WordMatcher {
76137
// of `0`. We *could* use `find_at` here and then trim the match after
77138
// the fact, but that's a bit harder to get right, and it's not clear
78139
// if it's worth it.
140+
//
141+
// OK, well, it turns out that it is worth it! But it is quite tricky.
142+
// See `fast_find` for details. Effectively, this lets us skip running
143+
// the NFA simulation in the regex engine in the vast majority of
144+
// cases. However, the NFA simulation is required for full correctness.
145+
match self.fast_find(haystack, at) {
146+
Ok(Some(m)) => return Ok(Some(m)),
147+
Ok(None) => return Ok(None),
148+
Err(()) => {}
149+
}
79150

80151
let cell = self.locs.get_or(|| {
81152
RefCell::new(self.regex.capture_locations())
@@ -152,9 +223,31 @@ mod tests {
152223

153224
assert_eq!(Some((0, 3)), find(r"foo", "foo☃"));
154225
assert_eq!(None, find(r"foo", "fooб"));
155-
// assert_eq!(Some((0, 3)), find(r"foo", "fooб"));
156226

157-
// See: https://github.com/BurntSushi/ripgrep/issues/389
227+
assert_eq!(Some((0, 4)), find(r"foo5", "foo5"));
228+
assert_eq!(None, find(r"foo", "foo5"));
229+
230+
assert_eq!(Some((1, 4)), find(r"foo", "!foo!"));
231+
assert_eq!(Some((1, 5)), find(r"foo!", "!foo!"));
232+
assert_eq!(Some((0, 5)), find(r"!foo!", "!foo!"));
233+
234+
assert_eq!(Some((0, 3)), find(r"foo", "foo\n"));
235+
assert_eq!(Some((1, 4)), find(r"foo", "!foo!\n"));
236+
assert_eq!(Some((1, 5)), find(r"foo!", "!foo!\n"));
237+
assert_eq!(Some((0, 5)), find(r"!foo!", "!foo!\n"));
238+
239+
assert_eq!(Some((1, 6)), find(r"!?foo!?", "!!foo!!"));
240+
assert_eq!(Some((0, 5)), find(r"!?foo!?", "!foo!"));
241+
assert_eq!(Some((2, 5)), find(r"!?foo!?", "a!foo!a"));
242+
243+
assert_eq!(Some((2, 7)), find(r"!?foo!?", "##!foo!\n"));
244+
assert_eq!(Some((3, 7)), find(r"f?oo!?", "##\nfoo!##"));
245+
assert_eq!(Some((2, 5)), find(r"(?-u)foo[^a]*", "#!foo☃aaa"));
246+
}
247+
248+
// See: https://github.com/BurntSushi/ripgrep/issues/389
249+
#[test]
250+
fn regression_dash() {
158251
assert_eq!(Some((0, 2)), find(r"-2", "-2"));
159252
}
160253

0 commit comments

Comments
 (0)