@@ -15,6 +15,9 @@ use matcher::RegexCaptures;
15
15
pub struct WordMatcher {
16
16
/// The regex which is roughly `(?:^|\W)(<original pattern>)(?:$|\W)`.
17
17
regex : Regex ,
18
+ /// The original regex supplied by the user, which we use in a fast path
19
+ /// to try and detect matches before deferring to slower engines.
20
+ original : Regex ,
18
21
/// A map from capture group name to capture group index.
19
22
names : HashMap < String , usize > ,
20
23
/// A reusable buffer for finding the match location of the inner group.
@@ -28,6 +31,7 @@ impl Clone for WordMatcher {
28
31
// using `locs` to hit the fast path.
29
32
WordMatcher {
30
33
regex : self . regex . clone ( ) ,
34
+ original : self . original . clone ( ) ,
31
35
names : self . names . clone ( ) ,
32
36
locs : Arc :: new ( CachedThreadLocal :: new ( ) ) ,
33
37
}
@@ -41,8 +45,13 @@ impl WordMatcher {
41
45
/// The given options are used to construct the regular expression
42
46
/// internally.
43
47
pub fn new ( expr : & ConfiguredHIR ) -> Result < WordMatcher , Error > {
48
+ let original = expr. with_pattern ( |pat| {
49
+ format ! ( "^(?:{})$" , pat)
50
+ } ) ?. regex ( ) ?;
44
51
let word_expr = expr. with_pattern ( |pat| {
45
- format ! ( r"(?:(?m:^)|\W)({})(?:(?m:$)|\W)" , pat)
52
+ let pat = format ! ( r"(?:(?-m:^)|\W)({})(?:(?-m:$)|\W)" , pat) ;
53
+ debug ! ( "word regex: {:?}" , pat) ;
54
+ pat
46
55
} ) ?;
47
56
let regex = word_expr. regex ( ) ?;
48
57
let locs = Arc :: new ( CachedThreadLocal :: new ( ) ) ;
@@ -53,13 +62,65 @@ impl WordMatcher {
53
62
names. insert ( name. to_string ( ) , i. checked_sub ( 1 ) . unwrap ( ) ) ;
54
63
}
55
64
}
56
- Ok ( WordMatcher { regex, names, locs } )
65
+ Ok ( WordMatcher { regex, original , names, locs } )
57
66
}
58
67
59
68
/// Return the underlying regex used by this matcher.
60
69
pub fn regex ( & self ) -> & Regex {
61
70
& self . regex
62
71
}
72
+
73
+ /// Attempt to do a fast confirmation of a word match that covers a subset
74
+ /// (but hopefully a big subset) of most cases. Ok(Some(..)) is returned
75
+ /// when a match is found. Ok(None) is returned when there is definitively
76
+ /// no match. Err(()) is returned when this routine could not detect
77
+ /// whether there was a match or not.
78
+ fn fast_find (
79
+ & self ,
80
+ haystack : & [ u8 ] ,
81
+ at : usize ,
82
+ ) -> Result < Option < Match > , ( ) > {
83
+ // This is a bit hairy. The whole point here is to avoid running an
84
+ // NFA simulation in the regex engine. Remember, our word regex looks
85
+ // like this:
86
+ //
87
+ // (^|\W)(<original regex>)($|\W)
88
+ // where ^ and $ have multiline mode DISABLED
89
+ //
90
+ // What we want are the match offsets of <original regex>. So in the
91
+ // easy/common case, the original regex will be sandwiched between
92
+ // two codepoints that are in the \W class. So our approach here is to
93
+ // look for a match of the overall word regexp, strip the \W ends and
94
+ // then check whether the original regex matches what's left. If so,
95
+ // then we are guaranteed a correct match.
96
+ //
97
+ // This only works though if we know that the match is sandwiched
98
+ // between two \W codepoints. This only occurs when neither ^ nor $
99
+ // match. This in turn only occurs when the match is at either the
100
+ // beginning or end of the haystack. In either of those cases, we
101
+ // declare defeat and defer to the slower implementation.
102
+ //
103
+ // The reason why we cannot handle the ^/$ cases here is because we
104
+ // can't assume anything about the original pattern. (Try commenting
105
+ // out the checks for ^/$ below and run the tests to see examples.)
106
+ let mut cand = match self . regex . find_at ( haystack, at) {
107
+ None => return Ok ( None ) ,
108
+ Some ( m) => Match :: new ( m. start ( ) , m. end ( ) ) ,
109
+ } ;
110
+ if cand. start ( ) == 0 || cand. end ( ) == haystack. len ( ) {
111
+ return Err ( ( ) ) ;
112
+ }
113
+ let ( _, slen) = bstr:: decode_utf8 ( & haystack[ cand] ) ;
114
+ let ( _, elen) = bstr:: decode_last_utf8 ( & haystack[ cand] ) ;
115
+ cand = cand
116
+ . with_start ( cand. start ( ) + slen)
117
+ . with_end ( cand. end ( ) - elen) ;
118
+ if self . original . is_match ( & haystack[ cand] ) {
119
+ Ok ( Some ( cand) )
120
+ } else {
121
+ Err ( ( ) )
122
+ }
123
+ }
63
124
}
64
125
65
126
impl Matcher for WordMatcher {
@@ -76,6 +137,16 @@ impl Matcher for WordMatcher {
76
137
// of `0`. We *could* use `find_at` here and then trim the match after
77
138
// the fact, but that's a bit harder to get right, and it's not clear
78
139
// if it's worth it.
140
+ //
141
+ // OK, well, it turns out that it is worth it! But it is quite tricky.
142
+ // See `fast_find` for details. Effectively, this lets us skip running
143
+ // the NFA simulation in the regex engine in the vast majority of
144
+ // cases. However, the NFA simulation is required for full correctness.
145
+ match self . fast_find ( haystack, at) {
146
+ Ok ( Some ( m) ) => return Ok ( Some ( m) ) ,
147
+ Ok ( None ) => return Ok ( None ) ,
148
+ Err ( ( ) ) => { }
149
+ }
79
150
80
151
let cell = self . locs . get_or ( || {
81
152
RefCell :: new ( self . regex . capture_locations ( ) )
@@ -152,9 +223,31 @@ mod tests {
152
223
153
224
assert_eq ! ( Some ( ( 0 , 3 ) ) , find( r"foo" , "foo☃" ) ) ;
154
225
assert_eq ! ( None , find( r"foo" , "fooб" ) ) ;
155
- // assert_eq!(Some((0, 3)), find(r"foo", "fooб"));
156
226
157
- // See: https://github.com/BurntSushi/ripgrep/issues/389
227
+ assert_eq ! ( Some ( ( 0 , 4 ) ) , find( r"foo5" , "foo5" ) ) ;
228
+ assert_eq ! ( None , find( r"foo" , "foo5" ) ) ;
229
+
230
+ assert_eq ! ( Some ( ( 1 , 4 ) ) , find( r"foo" , "!foo!" ) ) ;
231
+ assert_eq ! ( Some ( ( 1 , 5 ) ) , find( r"foo!" , "!foo!" ) ) ;
232
+ assert_eq ! ( Some ( ( 0 , 5 ) ) , find( r"!foo!" , "!foo!" ) ) ;
233
+
234
+ assert_eq ! ( Some ( ( 0 , 3 ) ) , find( r"foo" , "foo\n " ) ) ;
235
+ assert_eq ! ( Some ( ( 1 , 4 ) ) , find( r"foo" , "!foo!\n " ) ) ;
236
+ assert_eq ! ( Some ( ( 1 , 5 ) ) , find( r"foo!" , "!foo!\n " ) ) ;
237
+ assert_eq ! ( Some ( ( 0 , 5 ) ) , find( r"!foo!" , "!foo!\n " ) ) ;
238
+
239
+ assert_eq ! ( Some ( ( 1 , 6 ) ) , find( r"!?foo!?" , "!!foo!!" ) ) ;
240
+ assert_eq ! ( Some ( ( 0 , 5 ) ) , find( r"!?foo!?" , "!foo!" ) ) ;
241
+ assert_eq ! ( Some ( ( 2 , 5 ) ) , find( r"!?foo!?" , "a!foo!a" ) ) ;
242
+
243
+ assert_eq ! ( Some ( ( 2 , 7 ) ) , find( r"!?foo!?" , "##!foo!\n " ) ) ;
244
+ assert_eq ! ( Some ( ( 3 , 7 ) ) , find( r"f?oo!?" , "##\n foo!##" ) ) ;
245
+ assert_eq ! ( Some ( ( 2 , 5 ) ) , find( r"(?-u)foo[^a]*" , "#!foo☃aaa" ) ) ;
246
+ }
247
+
248
// Regression test for a pattern that begins with a dash.
// See: https://github.com/BurntSushi/ripgrep/issues/389
#[test]
fn regression_dash() {
    let expected = Some((0, 2));
    let actual = find(r"-2", "-2");
    assert_eq!(expected, actual);
}
160
253
0 commit comments