Skip to content

Commit 793c5df

Browse files
author
Tom Lord
committed
Support for POSIX groups
Also some general improvements to character group parsing, and laid the foundations for the big CharGroup refactor
1 parent a9142be commit 793c5df

File tree

6 files changed

+95
-79
lines changed

6 files changed

+95
-79
lines changed

README.md

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -43,6 +43,7 @@ For more detail on this, see [configuration options](#configuration-options).
4343
* Escape sequences, e.g. `/\x42/`, `/\x5word/`, `/#{"\x80".force_encoding("ASCII-8BIT")}/`
4444
* Unicode characters, e.g. `/\u0123/`, `/\uabcd/`, `/\u{789}/`
4545
* Octal characters, e.g. `/\10/`, `/\177/`
46+
* POSIX bracket expressions (including negation), e.g. `/[[:alnum:]]/`, `/[[:^space:]]/`
4647
* **Arbitrarily complex combinations of all the above!**
4748

4849
* Regexp options can also be used:
@@ -54,14 +55,13 @@ For more detail on this, see [configuration options](#configuration-options).
5455
## Bugs and Not-Yet-Supported syntax
5556

5657
* Nested character classes, and the use of set intersection ([See here](http://www.ruby-doc.org/core-2.2.0/Regexp.html#class-Regexp-label-Character+Classes) for the official documentation on this.) For example:
57-
* `/[[abc]]/.examples` (which _should_ return `["a", "b", "c"]`)
58+
* `/[[abc]de]/.examples` (which _should_ return `["a", "b", "c", "d", "e"]`)
5859
* `/[[a-d]&&[c-f]]/.examples` (which _should_ return: `["c", "d"]`)
5960

6061
* Conditional capture groups, such as `/(group1) (?(1)yes|no)/`
6162

6263
Using any of the following will raise a RegexpExamples::UnsupportedSyntax exception (until such time as they are implemented!):
6364

64-
* POSIX bracket expressions, e.g. `/[[:alnum:]]/`, `/[[:space:]]/`
6565
* Named properties, e.g. `/\p{L}/` ("Letter"), `/\p{Arabic}/` ("Arabic character"), `/\p{^Ll}/` ("Not a lowercase letter")
6666
* Subexpression calls, e.g. `/(?<name> ... \g<name>* )/` (Note: These could get _really_ ugly to implement, and may even be impossible, so I highly doubt it's worth the effort!)
6767

Lines changed: 70 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,70 @@
1+
module RegexpExamples
  # Interprets the raw characters found between the square brackets of a
  # character set: a leading "^" (negation), backslash escapes and "a-z"
  # style ranges are all resolved into a flat list of characters.
  #
  # TODO: This needs a bit of a rewrite because it doesn't take into account
  # nested character groups, or set intersection. To achieve this, the
  # algorithm needs to be recursive, like the main Parser.
  class ChargroupParser
    # @param chars [Array<String>] the characters inside the brackets, in
    #   order, e.g. ["^", "a", "-", "z"]. The array is not modified.
    def initialize(chars)
      # Work on a copy: the parsing steps below shift/delete/splice elements,
      # and the caller's array must not change underneath it.
      @chars = chars.dup
      @negative = (@chars[0] == "^")
      @chars = @chars[1..-1] if @negative

      init_backslash_chars
      init_ranges
    end

    # @return [Array<String>] the characters matched by the set. For a negated
    #   set this is everything in CharSets::Any except the listed characters.
    def result
      @negative ? (CharSets::Any - @chars) : @chars
    end

    private

    # Resolves every backslash escape in @chars. A new array is built in a
    # single left-to-right pass, rather than deleting/splicing elements of the
    # array that is currently being iterated.
    def init_backslash_chars
      parsed = []
      position = 0
      while position < @chars.length
        char = @chars[position]
        if char == "\\"
          # Consume the backslash together with the character it escapes
          parsed.concat(unescape(@chars[position + 1]))
          position += 2
        else
          parsed << char
          position += 1
        end
      end
      @chars = parsed
    end

    # @param escaped [String, nil] the character following a backslash
    #   (nil when the backslash was the very last character)
    # @return [Array<String>] the character(s) which that escape stands for
    def unescape(escaped)
      if BackslashCharMap.key?(escaped)
        move_backslash_to_front(BackslashCharMap[escaped])
      elsif escaped == 'b'
        ["\b"] # Inside a character set, \b means "backspace"
      elsif escaped.nil?
        [] # Dangling backslash: nothing left to escape
      else
        [escaped] # Includes "\\\\", which collapses to a single backslash
      end
    end

    # Replaces all instances of e.g. ["a", "-", "z"] with ["a", "b", ..., "z"].
    # A hyphen that cannot be a range operator is kept as a literal "-", and
    # is placed at the front of the final list.
    def init_ranges
      literal_hyphen = false
      while (i = @chars.index("-"))
        lower = @chars[i - 1]
        upper = @chars[i + 1]
        # A hyphen is literal when it has no character on one of its sides
        # (e.g. both hyphens in [-a-]), or when it sits between "," and ".":
        # expanding ","-"." yields [",", "-", "."] again (ords 44, 45, 46),
        # which would otherwise loop forever.
        if i.zero? || upper.nil? || (lower == ',' && upper == '.')
          @chars.delete_at(i)
          literal_hyphen = true
        else
          @chars[i - 1..i + 1] = (lower..upper).to_a
        end
      end
      @chars.unshift("-") if literal_hyphen
    end

    # Returns a copy of +chars+ with its first backslash (if any) moved to
    # index 0. The given array is left untouched, because it is a value of the
    # shared BackslashCharMap constant. The reordering itself is kept only so
    # that results come out in the same order as before.
    def move_backslash_to_front(chars)
      index = chars.index('\\')
      return chars unless index

      [chars[index]] + chars[0...index] + chars[(index + 1)..-1]
    end
  end
end
70+

lib/regexp-examples/constants.rb

Lines changed: 12 additions & 12 deletions
Original file line numberDiff line numberDiff line change
@@ -32,17 +32,17 @@ def self.MaxGroupResults
3232
end
3333

3434
module CharSets
35-
Lower = Array('a'..'z')
36-
Upper = Array('A'..'Z')
37-
Digit = Array('0'..'9')
38-
# Chars in ranges: [33..47, 58..64, 91..96, 123..126]
39-
Punct = %w(] [ ! " # $ % & ' ( ) * + , . / : ; < = > ? @ \\ ^ _ ` { | } ~ -)
40-
Hex = Array('a'..'f') | Array('A'..'F') | Digit
41-
Word = Lower | Upper | Digit | ['_']
42-
Whitespace = [' ', "\t", "\n", "\r", "\v", "\f"]
43-
Control = (0..31).map(&:chr) | ["\x7f"]
44-
# Ensure that the "common" characters appear first in the array. Do not include "\n"!
45-
Any = Lower | Upper | Digit | Punct | (0..255).map(&:chr) - ["\n"]
35+
Lower = Array('a'..'z')
36+
Upper = Array('A'..'Z')
37+
Digit = Array('0'..'9')
38+
Punct = %w(! " # % & ' ( ) * , - . / : ; ? @ [ \\ ] _ { })
39+
Hex = Array('a'..'f') | Array('A'..'F') | Digit
40+
Word = Lower | Upper | Digit | ['_']
41+
Whitespace = [' ', "\t", "\n", "\r", "\v", "\f"]
42+
Control = (0..31).map(&:chr) | ["\x7f"]
43+
# Ensure that the "common" characters appear first in the array
44+
Any = Lower | Upper | Digit | Punct | (0..127).map(&:chr)
45+
AnyNoNewLine = Any - ["\n"]
4646
end.freeze
4747

4848
# Map of special regex characters, to their associated character sets
@@ -79,7 +79,7 @@ module CharSets
7979
'upper' => CharSets::Upper,
8080
'xdigit' => CharSets::Hex,
8181
'word' => CharSets::Word,
82-
'ascii' => CharSets::Any | ["\n"],
82+
'ascii' => CharSets::Any
8383
}.freeze
8484
end
8585

lib/regexp-examples/groups.rb

Lines changed: 2 additions & 58 deletions
Original file line numberDiff line numberDiff line change
@@ -63,69 +63,14 @@ class CharGroup
6363
def initialize(chars, ignorecase)
6464
@chars = chars
6565
@ignorecase = ignorecase
66-
if chars[0] == "^"
67-
@negative = true
68-
@chars = @chars[1..-1]
69-
else
70-
@negative = false
71-
end
72-
73-
init_backslash_chars
74-
init_ranges
75-
end
76-
77-
def init_ranges
78-
# save first and last "-" if present
79-
80-
first = nil
81-
last = nil
82-
first = @chars.shift if @chars.first == "-"
83-
last = @chars.pop if @chars.last == "-"
84-
# Replace all instances of e.g. ["a", "-", "z"] with ["a", "b", ..., "z"]
85-
while i = @chars.index("-")
86-
# Prevent infinite loops from expanding [",", "-", "."] to itself
87-
# (Since ",".ord = 44, "-".ord = 45, ".".ord = 46)
88-
if (@chars[i-1] == ',' && @chars[i+1] == '.')
89-
first = '-'
90-
@chars.delete_at(i)
91-
else
92-
@chars[i-1..i+1] = (@chars[i-1]..@chars[i+1]).to_a
93-
end
94-
end
95-
# restore them back
96-
@chars.unshift(first) if first
97-
@chars.push(last) if last
98-
end
99-
100-
def init_backslash_chars
101-
@chars.each_with_index do |char, i|
102-
if char == "\\"
103-
if BackslashCharMap.keys.include?(@chars[i+1])
104-
@chars[i..i+1] = move_backslash_to_front( BackslashCharMap[@chars[i+1]] )
105-
elsif @chars[i+1] == 'b'
106-
@chars[i..i+1] = "\b"
107-
elsif @chars[i+1] == "\\"
108-
@chars.delete_at(i+1)
109-
else
110-
@chars.delete_at(i)
111-
end
112-
end
113-
end
11466
end
11567

11668
def result
117-
(@negative ? (CharSets::Any - @chars) : @chars).map do |result|
69+
@chars.map do |result|
11870
GroupResult.new(result)
11971
end
12072
end
12173

122-
private
123-
def move_backslash_to_front(chars)
124-
if index = chars.index { |char| char == '\\' }
125-
chars.unshift chars.delete_at(index)
126-
end
127-
chars
128-
end
12974
end
13075

13176
class DotGroup
@@ -135,8 +80,7 @@ def initialize(multiline)
13580
end
13681

13782
def result
138-
chars = CharSets::Any
139-
chars = (["\n"] | chars) if multiline
83+
chars = multiline ? CharSets::Any : CharSets::AnyNoNewLine
14084
chars.map do |result|
14185
GroupResult.new(result)
14286
end

lib/regexp-examples/parser.rb

Lines changed: 7 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -218,8 +218,11 @@ def parse_multi_end_group
218218
end
219219

220220
def parse_char_group
221-
if rest_of_string =~ /\A\[\[:[^:]+:\]\]/
222-
raise UnsupportedSyntaxError, "POSIX bracket expressions are not yet implemented"
221+
# TODO: Extract all this logic into ChargroupParser
222+
if rest_of_string =~ /\A\[\[:(\^?)([^:]+):\]\]/
223+
@current_position += (6 + $1.length + $2.length)
224+
chars = $1.empty? ? POSIXCharMap[$2] : CharSets::Any - POSIXCharMap[$2]
225+
return CharGroup.new(chars, @ignorecase)
223226
end
224227
chars = []
225228
@current_position += 1
@@ -238,7 +241,8 @@ def parse_char_group
238241
chars << next_char
239242
@current_position += 1
240243
end
241-
CharGroup.new(chars, @ignorecase)
244+
parsed_chars = ChargroupParser.new(chars).result
245+
CharGroup.new(parsed_chars, @ignorecase)
242246
end
243247

244248
def parse_dot_group

spec/regexp-examples_spec.rb

Lines changed: 2 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -3,7 +3,7 @@ def self.examples_exist_and_match(*regexps)
33
regexps.each do |regexp|
44
it do
55
begin
6-
regexp_examples = regexp.examples
6+
regexp_examples = regexp.examples(max_group_results: 999)
77
rescue
88
# TODO: Find a nicer way to display this?
99
puts "Error generating examples for /#{regexp.source}/"
@@ -187,8 +187,7 @@ def self.examples_are_empty(*regexps)
187187
/\p{L}/,
188188
/\p{Arabic}/,
189189
/\p{^Ll}/,
190-
/(?<name> ... \g<name>*)/,
191-
/[[:space:]]/
190+
/(?<name> ... \g<name>*)/
192191
)
193192
end
194193

@@ -244,7 +243,6 @@ def self.examples_are_empty(*regexps)
244243
end
245244

246245
context "for POSIX groups" do
247-
before { pending "TODO: POSIX Groups" }
248246
examples_exist_and_match(
249247
/[[:alnum:]]/,
250248
/[[:alpha:]]/,

0 commit comments

Comments
 (0)