Skip to content

Commit 3a77ec8

Browse files
committed
grep-regex: fix inner literal detection
It seems the inner literal detector fails spectacularly in cases of concatenations that involve groups. The issue here is that if the prefix of a group inside a concatenation can match the empty string, then any literals generated to that point in the concatenation need to be cut such that they are never extended. The detector isn't really built to handle this case, so we just act conservatively and cut literals whenever we see a sub-group. This may make some regexes slower, but the inner literal detector already misses plenty of cases. Literal detection (including in the regex engine) is a key component that needs to be completely rethought at some point. Fixes #1064
1 parent f72c2df commit 3a77ec8

File tree

2 files changed

+30
-2
lines changed

2 files changed

+30
-2
lines changed

grep-regex/src/literal.rs

+24-2
Original file line numberDiff line numberDiff line change
@@ -166,10 +166,10 @@ fn union_required(expr: &Hir, lits: &mut Literals) {
166166
lits.cut();
167167
continue;
168168
}
169-
if lits2.contains_empty() {
169+
if lits2.contains_empty() || !is_simple(&e) {
170170
lits.cut();
171171
}
172-
if !lits.cross_product(&lits2) {
172+
if !lits.cross_product(&lits2) || !lits2.any_complete() {
173173
// If this expression couldn't yield any literal that
174174
// could be extended, then we need to quit. Since we're
175175
// short-circuiting, we also need to freeze every member.
@@ -250,6 +250,20 @@ fn alternate_literals<F: FnMut(&Hir, &mut Literals)>(
250250
}
251251
}
252252

253+
fn is_simple(expr: &Hir) -> bool {
254+
match *expr.kind() {
255+
HirKind::Empty
256+
| HirKind::Literal(_)
257+
| HirKind::Class(_)
258+
| HirKind::Repetition(_)
259+
| HirKind::Concat(_)
260+
| HirKind::Alternation(_) => true,
261+
HirKind::Anchor(_)
262+
| HirKind::WordBoundary(_)
263+
| HirKind::Group(_) => false,
264+
}
265+
}
266+
253267
/// Return the number of characters in the given class.
254268
fn count_unicode_class(cls: &hir::ClassUnicode) -> u32 {
255269
cls.iter().map(|r| 1 + (r.end() as u32 - r.start() as u32)).sum()
@@ -301,4 +315,12 @@ mod tests {
301315
// assert_eq!(one_regex(r"\w(foo|bar|baz)"), pat("foo|bar|baz"));
302316
// assert_eq!(one_regex(r"\w(foo|bar|baz)\w"), pat("foo|bar|baz"));
303317
}
318+
319+
#[test]
320+
fn regression_1064() {
321+
// Regression from:
322+
// https://github.com/BurntSushi/ripgrep/issues/1064
323+
// assert_eq!(one_regex(r"a.*c"), pat("a"));
324+
assert_eq!(one_regex(r"a(.*c)"), pat("a"));
325+
}
304326
}

tests/regression.rs

+6
Original file line numberDiff line numberDiff line change
@@ -562,3 +562,9 @@ rgtest!(r900, |dir: Dir, mut cmd: TestCommand| {
562562

563563
cmd.arg("-fpat").arg("sherlock").assert_err();
564564
});
565+
566+
// See: https://github.com/BurntSushi/ripgrep/issues/1064
567+
rgtest!(r1064, |dir: Dir, mut cmd: TestCommand| {
568+
dir.create("input", "abc");
569+
eqnice!("input:abc\n", cmd.arg("a(.*c)").stdout());
570+
});

0 commit comments

Comments
 (0)