Skip to content

Commit bb35ead

Browse files
committed
Support optional 'Script=' prefix (from ES2018 syntax) for Unicode script tokens (#225)
1 parent 4860122 commit bb35ead

File tree

4 files changed

+74
-14
lines changed

4 files changed

+74
-14
lines changed

src/addons/unicode-base.js

Lines changed: 46 additions & 12 deletions
Original file line numberDiff line numberDiff line change
@@ -26,6 +26,7 @@ export default (XRegExp) => {
2626

2727
// Storage for Unicode data
2828
const unicode = {};
29+
const unicodeTypes = {};
2930

3031
// Reuse utils
3132
const dec = XRegExp._dec;
@@ -123,41 +124,56 @@ export default (XRegExp) => {
123124
*/
124125
XRegExp.addToken(
125126
// Use `*` instead of `+` to avoid capturing `^` as the token name in `\p{^}`
126-
/\\([pP])(?:{(\^?)([^}]*)}|([A-Za-z]))/,
127+
/\\([pP])(?:{(\^?)(?:(Script|sc)=)?([^}]*)}|([A-Za-z]))/,
127128
(match, scope, flags) => {
128129
const ERR_DOUBLE_NEG = 'Invalid double negation ';
129130
const ERR_UNKNOWN_NAME = 'Unknown Unicode token ';
130131
const ERR_UNKNOWN_REF = 'Unicode token missing data ';
131132
const ERR_ASTRAL_ONLY = 'Astral mode required for Unicode token ';
132133
const ERR_ASTRAL_IN_CLASS = 'Astral mode does not support Unicode tokens within character classes';
134+
const [
135+
fullToken,
136+
pPrefix,
137+
caretNegation,
138+
typePrefix,
139+
tokenName,
140+
tokenSingleCharName
141+
] = match;
133142
// Negated via \P{..} or \p{^..}
134-
let isNegated = match[1] === 'P' || !!match[2];
143+
let isNegated = pPrefix === 'P' || !!caretNegation;
135144
// Switch from BMP (0-FFFF) to astral (0-10FFFF) mode via flag A
136145
const isAstralMode = flags.includes('A');
137-
// Token lookup name. Check `[4]` first to avoid passing `undefined` via `\p{}`
138-
let slug = normalize(match[4] || match[3]);
146+
// Token lookup name. Check `tokenSingleCharName` first to avoid passing `undefined`
147+
// via `\p{}`
148+
let slug = normalize(tokenSingleCharName || tokenName);
139149
// Token data object
140150
let item = unicode[slug];
141151

142-
if (match[1] === 'P' && match[2]) {
143-
throw new SyntaxError(ERR_DOUBLE_NEG + match[0]);
152+
if (pPrefix === 'P' && caretNegation) {
153+
throw new SyntaxError(ERR_DOUBLE_NEG + fullToken);
144154
}
145155
if (!unicode.hasOwnProperty(slug)) {
146-
throw new SyntaxError(ERR_UNKNOWN_NAME + match[0]);
156+
throw new SyntaxError(ERR_UNKNOWN_NAME + fullToken);
157+
}
158+
159+
if (typePrefix) {
160+
if (!(unicodeTypes[typePrefix] && unicodeTypes[typePrefix][slug])) {
161+
throw new SyntaxError(ERR_UNKNOWN_NAME + fullToken);
162+
}
147163
}
148164

149165
// Switch to the negated form of the referenced Unicode token
150166
if (item.inverseOf) {
151167
slug = normalize(item.inverseOf);
152168
if (!unicode.hasOwnProperty(slug)) {
153-
throw new ReferenceError(`${ERR_UNKNOWN_REF + match[0]} -> ${item.inverseOf}`);
169+
throw new ReferenceError(`${ERR_UNKNOWN_REF + fullToken} -> ${item.inverseOf}`);
154170
}
155171
item = unicode[slug];
156172
isNegated = !isNegated;
157173
}
158174

159175
if (!(item.bmp || isAstralMode)) {
160-
throw new SyntaxError(ERR_ASTRAL_ONLY + match[0]);
176+
throw new SyntaxError(ERR_ASTRAL_ONLY + fullToken);
161177
}
162178
if (isAstralMode) {
163179
if (scope === 'class') {
@@ -196,6 +212,9 @@ export default (XRegExp) => {
196212
* character classes and alternation, and should use surrogate pairs to represent astral code
197213
* points. `inverseOf` can be used to avoid duplicating character data if a Unicode token is
198214
* defined as the exact inverse of another token.
215+
* @param {String} [typePrefix] Enables optionally using this type as a prefix for all of the
216+
* provided Unicode tokens, e.g. if given `'Type'`, then `\p{TokenName}` can also be written
217+
* as `\p{Type=TokenName}`.
199218
* @example
200219
*
201220
* // Basic use
@@ -206,20 +225,35 @@ export default (XRegExp) => {
206225
* }]);
207226
* XRegExp('\\p{XDigit}:\\p{Hexadecimal}+').test('0:3D'); // -> true
208227
*/
209-
XRegExp.addUnicodeData = (data) => {
228+
XRegExp.addUnicodeData = (data, typePrefix) => {
210229
const ERR_NO_NAME = 'Unicode token requires name';
211230
const ERR_NO_DATA = 'Unicode token has no character data ';
212231

232+
if (typePrefix) {
233+
// The type prefix is stored and looked up exactly as given (case sensitive), to match ES2018
234+
unicodeTypes[typePrefix] = {};
235+
}
236+
213237
for (const item of data) {
214238
if (!item.name) {
215239
throw new Error(ERR_NO_NAME);
216240
}
217241
if (!(item.inverseOf || item.bmp || item.astral)) {
218242
throw new Error(ERR_NO_DATA + item.name);
219243
}
220-
unicode[normalize(item.name)] = item;
244+
245+
const normalizedName = normalize(item.name);
246+
unicode[normalizedName] = item;
247+
if (typePrefix) {
248+
unicodeTypes[typePrefix][normalizedName] = true;
249+
}
250+
221251
if (item.alias) {
222-
unicode[normalize(item.alias)] = item;
252+
const normalizedAlias = normalize(item.alias);
253+
unicode[normalizedAlias] = item;
254+
if (typePrefix) {
255+
unicodeTypes[typePrefix][normalizedAlias] = true;
256+
}
223257
}
224258
}
225259

src/addons/unicode-scripts.js

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -22,5 +22,5 @@ export default (XRegExp) => {
2222
throw new ReferenceError('Unicode Base must be loaded before Unicode Scripts');
2323
}
2424

25-
XRegExp.addUnicodeData(scripts);
25+
XRegExp.addUnicodeData(scripts, 'Script');
2626
};

tests/spec/s-addons-unicode.js

Lines changed: 23 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -417,6 +417,10 @@ describe('Unicode Categories addon:', function() {
417417
expect(function() {XRegExp('\\p{IsP}');}).toThrowError(SyntaxError);
418418
});
419419

420+
it('should not allow the "Script=" prefix for category names', function() {
421+
expect(function() {XRegExp('\\p{Script=P}');}).toThrowError(SyntaxError);
422+
});
423+
420424
it('should handle \\p{Cn}', function() {
421425
testUnicodeToken('Cn', {
422426
invalid: ['\u20BA']
@@ -489,6 +493,10 @@ describe('Unicode Properties addon:', function() {
489493
expect(function() {XRegExp('\\p{IsASCII}');}).toThrowError(SyntaxError);
490494
});
491495

496+
it('should not allow the "Script=" prefix for property names', function() {
497+
expect(function() {XRegExp('\\p{Script=ASCII}');}).toThrowError(SyntaxError);
498+
});
499+
492500
it('should handle \\p{Alphabetic}', function() {
493501
testUnicodeToken('Alphabetic', {
494502
valid: ['A', 'a', 'Å', 'å', '日', 'ي'],
@@ -529,6 +537,21 @@ describe('Unicode Scripts addon:', function() {
529537
expect(function() {XRegExp('\\p{IsLatin}');}).toThrowError(SyntaxError);
530538
});
531539

540+
it('should allow the "Script=" prefix for script names', function() {
541+
expect(function() {XRegExp('\\p{Script=Latin}');}).not.toThrow();
542+
testUnicodeToken('Script=Latin', {
543+
valid: ['A', 'B', 'C'],
544+
invalid: ['カ', 'タ', 'ナ']
545+
});
546+
});
547+
548+
it('should handle \\p{Latin}', function() {
549+
testUnicodeToken('Latin', {
550+
valid: ['A', 'B', 'C'],
551+
invalid: ['カ', 'タ', 'ナ']
552+
});
553+
});
554+
532555
it('should handle \\p{Katakana}', function() {
533556
testUnicodeToken('Katakana', {
534557
valid: ['カ', 'タ', 'ナ'],

types/index.d.ts

Lines changed: 4 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -497,6 +497,9 @@ declare namespace XRegExp {
497497
* character classes and alternation, and should use surrogate pairs to represent astral code
498498
* points. `inverseOf` can be used to avoid duplicating character data if a Unicode token is
499499
* defined as the exact inverse of another token.
500+
* @param typePrefix - Enables optionally using this type as a prefix for all of the
501+
* provided Unicode tokens, e.g. if given `'Type'`, then `\p{TokenName}` can also be written
502+
* as `\p{Type=TokenName}`.
500503
* @example
501504
*
502505
* // Basic use
@@ -507,7 +510,7 @@ declare namespace XRegExp {
507510
* }]);
508511
* XRegExp('\\p{XDigit}:\\p{Hexadecimal}+').test('0:3D'); // -> true
509512
*/
510-
function addUnicodeData(data: UnicodeCharacterRange[]): void;
513+
function addUnicodeData(data: UnicodeCharacterRange[], typePrefix?: string): void;
511514

512515
/**
513516
* Builds regexes using named subpatterns, for readability and pattern reuse. Backreferences in

0 commit comments

Comments
 (0)