Skip to content

Commit 0a0f409

Browse files
committed
Review and clean up some code
Simplification, style consistency, dead code deletion, some bounds-check removal, etc.
1 parent 0abaabe commit 0a0f409

File tree

10 files changed

+410
-476
lines changed

10 files changed

+410
-476
lines changed

src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/RegexReplacement.cs

Lines changed: 0 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -5,7 +5,6 @@
55
using System.Collections.Generic;
66
using System.Diagnostics;
77
using System.Runtime.CompilerServices;
8-
using System.Runtime.InteropServices;
98

109
#pragma warning disable CS8500 // takes address of managed type
1110

Lines changed: 33 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -1,15 +1,39 @@
11
// Licensed to the .NET Foundation under one or more agreements.
22
// The .NET Foundation licenses this file to you under the MIT license.
33

4-
namespace System.Text.RegularExpressions.Symbolic;
4+
using System.Diagnostics;
55

6-
internal readonly struct MatchReversal<TSet>(
7-
MatchReversalKind kind,
8-
int fixedLength,
9-
MatchingState<TSet>? adjustedStartState = null)
10-
where TSet : IComparable<TSet>, IEquatable<TSet>
6+
namespace System.Text.RegularExpressions.Symbolic
117
{
12-
internal MatchReversalKind Kind { get; } = kind;
13-
internal int FixedLength { get; } = fixedLength;
14-
internal MatchingState<TSet>? AdjustedStartState { get; } = adjustedStartState;
8+
/// <summary>Provides details on how a match may be processed in reverse to find the beginning of a match once a match's existence has been confirmed.</summary>
9+
internal readonly struct MatchReversalInfo<TSet> where TSet : IComparable<TSet>, IEquatable<TSet>
10+
{
11+
/// <summary>Initializes the match reversal details.</summary>
12+
internal MatchReversalInfo(MatchReversalKind kind, int fixedLength, MatchingState<TSet>? adjustedStartState = null)
13+
{
14+
Debug.Assert(kind is MatchReversalKind.MatchStart or MatchReversalKind.FixedLength or MatchReversalKind.PartialFixedLength);
15+
Debug.Assert(fixedLength >= 0);
16+
Debug.Assert((adjustedStartState is not null) == (kind is MatchReversalKind.PartialFixedLength));
17+
18+
Kind = kind;
19+
FixedLength = fixedLength;
20+
AdjustedStartState = adjustedStartState;
21+
}
22+
23+
/// <summary>Gets the kind of the match reversal processing required.</summary>
24+
internal MatchReversalKind Kind { get; }
25+
26+
/// <summary>Gets the fixed length of the match, if one is known.</summary>
27+
/// <remarks>
28+
/// For <see cref="MatchReversalKind.MatchStart"/>, this is ignored.
29+
/// For <see cref="MatchReversalKind.FixedLength"/>, this is the full length of the match. The beginning may be found simply
30+
/// by subtracting this length from the end.
31+
/// For <see cref="MatchReversalKind.PartialFixedLength"/>, this is the length of fixed portion of the match.
32+
/// </remarks>
33+
internal int FixedLength { get; }
34+
35+
/// <summary>Gets the adjusted start state to use for partial fixed-length matches.</summary>
36+
/// <remarks>This will be non-null iff <see cref="Kind"/> is <see cref="MatchReversalKind.PartialFixedLength"/>.</remarks>
37+
internal MatchingState<TSet>? AdjustedStartState { get; }
38+
}
1539
}
Original file line numberDiff line numberDiff line change
@@ -1,14 +1,26 @@
11
// Licensed to the .NET Foundation under one or more agreements.
22
// The .NET Foundation licenses this file to you under the MIT license.
33

4-
namespace System.Text.RegularExpressions.Symbolic;
5-
6-
internal enum MatchReversalKind
4+
namespace System.Text.RegularExpressions.Symbolic
75
{
8-
/// <summary>The most generic option, run the regex backwards to find beginning of match</summary>
9-
MatchStart,
10-
/// <summary>Part of the reversal is fixed length and can be skipped</summary>
11-
PartialFixedLength,
12-
/// <summary>The entire pattern is fixed length, reversal not necessary</summary>
13-
FixedLength
6+
/// <summary>Specifies the kind of a <see cref="MatchReversalInfo{TSet}"/>.</summary>
7+
internal enum MatchReversalKind
8+
{
9+
/// <summary>The regex should be run in reverse to find beginning of the match.</summary>
10+
MatchStart,
11+
12+
/// <summary>The end of the pattern is of a fixed length and can be skipped as part of running a regex in reverse to find the beginning of the match.</summary>
13+
/// <remarks>
14+
/// Reverse execution is not necessary for a subset of the match.
15+
/// <see cref="MatchReversalInfo{TSet}.FixedLength"/> will contain the length of the fixed portion.
16+
/// </remarks>
17+
PartialFixedLength,
18+
19+
/// <summary>The entire pattern is of a fixed length.</summary>
20+
/// <remarks>
21+
/// Reverse execution is not necessary to find the beginning of the match.
22+
/// <see cref="MatchReversalInfo{TSet}.FixedLength"/> will contain the length of the match.
23+
/// </remarks>
24+
FixedLength
25+
}
1426
}

src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/Symbolic/MatchingState.cs

Lines changed: 25 additions & 19 deletions
Original file line numberDiff line numberDiff line change
@@ -17,8 +17,6 @@ internal MatchingState(SymbolicRegexNode<TSet> node, uint prevCharKind)
1717
NullabilityInfo = BuildNullabilityInfo();
1818
}
1919

20-
internal int NullabilityInfo { get; }
21-
2220
/// <summary>The regular expression that labels this state and gives it its semantics.</summary>
2321
internal SymbolicRegexNode<TSet> Node { get; }
2422

@@ -98,15 +96,31 @@ internal SymbolicRegexNode<TSet> Next(SymbolicRegexBuilder<TSet> builder, TSet m
9896
return Node.CreateNfaDerivativeWithEffects(builder, minterm, context);
9997
}
10098

101-
/// <summary>
102-
/// Cached nullability check with encoded bits
103-
/// </summary>
99+
/// <summary>Determines whether the node is nullable for the given context.</summary>
100+
/// <remarks>
101+
/// This is functionally equivalent to <see cref="SymbolicRegexNode{TSet}.IsNullableFor(uint)"/>, but using cached
102+
/// answers stored in <see cref="NullabilityInfo"/>.
103+
/// </remarks>
104104
[MethodImpl(MethodImplOptions.AggressiveInlining)]
105105
internal bool IsNullableFor(uint nextCharKind)
106106
{
107-
return ((1 << (int)nextCharKind) & NullabilityInfo) != 0;
107+
Debug.Assert(nextCharKind is >= 0 and < CharKind.CharKindCount);
108+
return (NullabilityInfo & (1 << (int)nextCharKind)) != 0;
108109
}
109110

111+
/// <summary>Gets the nullability info for the matching state.</summary>
112+
/// <remarks>
113+
/// <list>
114+
/// <item>00000 -> node cannot be nullable</item>
115+
/// <item>00001 -> nullable for General</item>
116+
/// <item>00010 -> nullable for BeginningEnd</item>
117+
/// <item>00100 -> nullable for NewLine</item>
118+
/// <item>01000 -> nullable for NewLineS</item>
119+
/// <item>10000 -> nullable for WordLetter</item>
120+
/// </list>
121+
/// </remarks>
122+
internal int NullabilityInfo { get; }
123+
110124
/// <summary>
111125
/// Builds a <see cref="StateFlags"/> with the relevant flags set.
112126
/// </summary>
@@ -138,24 +152,16 @@ internal StateFlags BuildStateFlags(bool isInitial)
138152
return info;
139153
}
140154

141-
/// <summary>
142-
/// Builds the nullability information for the matching state.
143-
/// Nullability for each context is encoded in a bit
144-
/// 0 means node cannot be nullable
145-
/// 00001 -> nullable for General
146-
/// 00010 -> nullable for BeginningEnd
147-
/// 00100 -> nullable for NewLine
148-
/// 01000 -> nullable for NewLineS
149-
/// 10000 -> nullable for WordLetter
150-
/// </summary>
151-
internal byte BuildNullabilityInfo()
155+
/// <summary>Builds the nullability information for the matching state.</summary>
156+
/// <remarks>Nullability for each context is encoded in a bit. See <see cref="NullabilityInfo"/>.</remarks>
157+
private byte BuildNullabilityInfo()
152158
{
153159
byte nullabilityInfo = 0;
154160
if (Node.CanBeNullable)
155161
{
156-
for (uint ck = 0; ck < CharKind.CharKindCount; ck++)
162+
for (uint charKind = 0; charKind < CharKind.CharKindCount; charKind++)
157163
{
158-
nullabilityInfo |= (byte)(Node.IsNullableFor(CharKind.Context(PrevCharKind, ck)) ? 1 << (int)ck : 0);
164+
nullabilityInfo |= (byte)(Node.IsNullableFor(CharKind.Context(PrevCharKind, charKind)) ? 1 << (int)charKind : 0);
159165
}
160166
}
161167

Lines changed: 42 additions & 40 deletions
Original file line numberDiff line numberDiff line change
@@ -1,7 +1,9 @@
11
// Licensed to the .NET Foundation under one or more agreements.
22
// The .NET Foundation licenses this file to you under the MIT license.
33

4+
using System.Buffers;
45
using System.Diagnostics;
6+
using System.Numerics;
57
using System.Runtime.CompilerServices;
68

79
namespace System.Text.RegularExpressions.Symbolic
@@ -20,12 +22,12 @@ namespace System.Text.RegularExpressions.Symbolic
2022
/// </remarks>
2123
internal sealed class MintermClassifier
2224
{
23-
/// <summary>An array used to map characters to minterms</summary>
25+
/// <summary>Mapping for characters to minterms, used in the vast majority case when there are less than 256 minterms.</summary>
26+
/// <remarks>_lookup[char] provides the minterm ID. If char &gt;= _lookup.Length, its minterm is 0.</remarks>
2427
private readonly byte[]? _lookup;
2528

26-
/// <summary>
27-
/// Fallback lookup if over 255 minterms. This is rarely used.
28-
/// </summary>
29+
/// <summary>Mapping for characters to minterms, used when there are at least 256 minterms. This is rarely used.</summary>
30+
/// <remarks>_intLookup[char] provides the minterm ID. If char &gt;= _intLookup.Length, its minterm is 0.</remarks>
2931
private readonly int[]? _intLookup;
3032

3133
/// <summary>Create a classifier that maps a character to the ID of its associated minterm.</summary>
@@ -37,61 +39,64 @@ public MintermClassifier(BDD[] minterms)
3739
if (minterms.Length == 1)
3840
{
3941
// With only a single minterm, the mapping is trivial: everything maps to it (ID 0).
40-
_lookup = Array.Empty<byte>();
42+
_lookup = [];
4143
return;
4244
}
4345

44-
int _maxChar = -1;
45-
// attempt to save memory in common cases by allocating only up to the highest char code
46+
// Compute all minterm ranges. We do this here in order to determine the maximum character value
47+
// in order to size the lookup array to minimize steady-state memory consumption of the potentially
48+
// large lookup array. We prefer to use the byte[] _lookup when possible, in order to keep memory
49+
// consumption to a minimum; doing so accomodates up to 255 minterms, which is the vast majority case.
50+
// However, when there are more than 255 minterms, we need to use int[] _intLookup.
51+
(uint, uint)[][] charRangesPerMinterm = ArrayPool<(uint, uint)[]>.Shared.Rent(minterms.Length);
52+
53+
int maxChar = -1;
4654
for (int mintermId = 1; mintermId < minterms.Length; mintermId++)
4755
{
48-
_maxChar = Math.Max(_maxChar, (int)BDDRangeConverter.ToRanges(minterms[mintermId])[^1].Item2);
56+
(uint, uint)[] ranges = BDDRangeConverter.ToRanges(minterms[mintermId]);
57+
charRangesPerMinterm[mintermId] = ranges;
58+
maxChar = Math.Max(maxChar, (int)ranges[^1].Item2);
4959
}
5060

51-
// It's incredibly rare for a regex to use more than a hundred or two minterms,
52-
// but we need a fallback just in case.
61+
// It's incredibly rare for a regex to use more than a couple hundred minterms,
62+
// but we need a fallback just in case. (Over 128 unique sets also means it's never ASCII only.)
5363
if (minterms.Length > 255)
5464
{
55-
// over 255 unique sets also means it's never ascii only
56-
int[] lookup = new int[_maxChar + 1];
57-
for (int mintermId = 1; mintermId < minterms.Length; mintermId++)
58-
{
59-
// precompute all assigned minterm categories
60-
(uint, uint)[] mintermRanges = BDDRangeConverter.ToRanges(minterms[mintermId]);
61-
foreach ((uint start, uint end) in mintermRanges)
62-
{
63-
// assign character ranges in bulk
64-
Span<int> slice = lookup.AsSpan((int)start, (int)(end + 1 - start));
65-
slice.Fill(mintermId);
66-
}
67-
}
68-
_intLookup = lookup;
65+
_intLookup = CreateLookup<int>(minterms, charRangesPerMinterm, maxChar);
6966
}
7067
else
7168
{
72-
byte[] lookup = new byte[_maxChar + 1];
69+
_lookup = CreateLookup<byte>(minterms, charRangesPerMinterm, maxChar);
70+
}
71+
72+
// Return the rented array. We clear it before returning it in order to avoid all the ranges arrays being kept alive.
73+
Array.Clear(charRangesPerMinterm, 0, minterms.Length);
74+
ArrayPool<(uint, uint)[]>.Shared.Return(charRangesPerMinterm);
75+
76+
// Creates the lookup array.
77+
static T[] CreateLookup<T>(BDD[] minterms, ReadOnlySpan<(uint, uint)[]> charRangesPerMinterm, int _maxChar) where T : IBinaryInteger<T>
78+
{
79+
T[] lookup = new T[_maxChar + 1];
7380
for (int mintermId = 1; mintermId < minterms.Length; mintermId++)
7481
{
75-
// precompute all assigned minterm categories
76-
(uint, uint)[] mintermRanges = BDDRangeConverter.ToRanges(minterms[mintermId]);
77-
foreach ((uint start, uint end) in mintermRanges)
82+
// Each minterm maps to a range of characters. Set each of the characters in those ranges to the corresponding minterm.
83+
foreach ((uint start, uint end) in charRangesPerMinterm[mintermId])
7884
{
79-
// assign character ranges in bulk
80-
Span<byte> slice = lookup.AsSpan((int)start, (int)(end + 1 - start));
81-
slice.Fill((byte)mintermId);
85+
lookup.AsSpan((int)start, (int)(end + 1 - start)).Fill(T.CreateTruncating(mintermId));
8286
}
8387
}
84-
_lookup = lookup;
88+
89+
return lookup;
8590
}
8691
}
8792

8893
/// <summary>Gets the ID of the minterm associated with the specified character. </summary>
8994
[MethodImpl(MethodImplOptions.AggressiveInlining)]
9095
public int GetMintermID(int c)
9196
{
92-
if (_intLookup is null)
97+
if (_lookup is not null)
9398
{
94-
byte[] lookup = _lookup!;
99+
byte[] lookup = _lookup;
95100
return (uint)c < (uint)lookup.Length ? lookup[c] : 0;
96101
}
97102
else
@@ -104,20 +109,17 @@ public int GetMintermID(int c)
104109
/// Gets a quick mapping from char to minterm for the common case when there are &lt;= 255 minterms.
105110
/// Null if there are greater than 255 minterms.
106111
/// </summary>
107-
[MethodImpl(MethodImplOptions.AggressiveInlining)]
108-
public byte[]? ByteLookup() => _lookup;
112+
public byte[]? ByteLookup => _lookup;
109113

110114
/// <summary>
111115
/// Gets a mapping from char to minterm for the rare case when there are &gt;= 255 minterms.
112116
/// Null in the common case where there are fewer than 255 minterms.
113117
/// </summary>
114-
[MethodImpl(MethodImplOptions.AggressiveInlining)]
115-
public int[]? IntLookup() => _intLookup;
118+
public int[]? IntLookup => _intLookup;
116119

117120
/// <summary>
118121
/// Maximum ordinal character for a non-0 minterm, used to conserve memory
119122
/// </summary>
120-
[MethodImpl(MethodImplOptions.AggressiveInlining)]
121-
public int MaxChar() => (_lookup?.Length ?? _intLookup!.Length) - 1;
123+
public int MaxChar => (_lookup?.Length ?? _intLookup!.Length) - 1;
122124
}
123125
}

0 commit comments

Comments
 (0)