Skip to content

Commit d586d36

Browse files
committed
Make Lexeme a trait, not a struct.
It is quite probable that in some parsing situations the way that `Lexeme`s are represented in memory will be performance critical. Previously we tried to satisfy everyone by having a fairly compact `Lexeme` representation, but that meant that we couldn't represent some reasonable sizes/lengths. I had hoped we could do something clever like expand the `Lexeme` struct depending on `StorageT`'s size, but that's not possible in any sensible way. This commit solves the problem in, I think, a more general way: it lets users provide their own lexeme struct provided it conforms to the (simple) `Lexeme` trait. This does mean lots of additional type parameters, but (after three attempts!) these don't leak out to the end user *too* much, though they do a bit. As a half-way house, the "original" lexeme struct is now called `StandardLexeme`. The plan is to move that out of lrpar and into lrlex, but that can be done in a subsequent commit. Interestingly, this commit paves the way for splitting lrlex and lrpar apart in a more satisfactory way: the `Lexeme` trait makes clear that any lexer can be used with lrpar. In other words, while lrlex only makes sense with lrpar, the `Lexeme` trait will help make it clearer that lrpar can be used without lrlex.
1 parent 01a8923 commit d586d36

File tree

11 files changed

+321
-216
lines changed

11 files changed

+321
-216
lines changed

lrlex/src/lib/lexer.rs

Lines changed: 23 additions & 13 deletions
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,6 @@
11
use std::{
22
collections::{HashMap, HashSet},
3+
fmt,
34
hash::Hash,
45
marker::PhantomData,
56
slice::Iter,
@@ -9,7 +10,7 @@ use num_traits::{PrimInt, Unsigned};
910
use regex::{self, Regex, RegexBuilder};
1011
use try_from::TryFrom;
1112

12-
use lrpar::{LexError, Lexeme, Lexer, NonStreamingLexer, Span};
13+
use lrpar::{LexError, Lexeme, Lexer, NonStreamingLexer, Span, StandardLexeme};
1314

1415
use crate::{parser::LexParser, LexBuildResult};
1516

@@ -192,7 +193,7 @@ impl<StorageT: Copy + Eq + Hash + PrimInt + TryFrom<usize> + Unsigned> LexerDef<
192193
}
193194
}
194195

195-
impl<StorageT: Copy + Eq + Hash + PrimInt + TryFrom<usize> + Unsigned>
196+
impl<StorageT: Copy + Eq + fmt::Debug + Hash + PrimInt + TryFrom<usize> + Unsigned>
196197
LRNonStreamingLexerDef<StorageT>
197198
{
198199
/// Return an [LRNonStreamingLexer] for the `String` `s` that will lex relative to this
@@ -208,17 +209,20 @@ impl<StorageT: Copy + Eq + Hash + PrimInt + TryFrom<usize> + Unsigned>
208209
/// An `LRNonStreamingLexer` holds a reference to a string and can lex it into [lrpar::Lexeme]s.
209210
/// Although the struct is tied to a single string, no guarantees are made about whether the
210211
/// lexemes are cached or not.
211-
pub struct LRNonStreamingLexer<'lexer, 'input: 'lexer, StorageT> {
212+
pub struct LRNonStreamingLexer<'lexer, 'input: 'lexer, StorageT: fmt::Debug> {
212213
s: &'input str,
213-
lexemes: Vec<Result<Lexeme<StorageT>, LexError>>,
214+
lexemes: Vec<Result<StandardLexeme<StorageT>, LexError>>,
214215
/// A sorted list of the byte index of the start of the following line. i.e. for the input
215216
/// string `" a\nb\n c d"` this will contain `[3, 5]`.
216217
newlines: Vec<usize>,
217218
phantom: PhantomData<&'lexer ()>,
218219
}
219220

220-
impl<'lexer, 'input: 'lexer, StorageT: Copy + Eq + Hash + PrimInt + TryFrom<usize> + Unsigned>
221-
LRNonStreamingLexer<'lexer, 'input, StorageT>
221+
impl<
222+
'lexer,
223+
'input: 'lexer,
224+
StorageT: Copy + Eq + fmt::Debug + Hash + PrimInt + TryFrom<usize> + Unsigned,
225+
> LRNonStreamingLexer<'lexer, 'input, StorageT>
222226
{
223227
fn new(
224228
lexerdef: &'lexer LRNonStreamingLexerDef<StorageT>,
@@ -278,16 +282,19 @@ impl<'lexer, 'input: 'lexer, StorageT: Copy + Eq + Hash + PrimInt + TryFrom<usiz
278282
}
279283
}
280284

281-
impl<'lexer, 'input: 'lexer, StorageT: Copy + Eq + Hash + PrimInt + Unsigned> Lexer<StorageT>
282-
for LRNonStreamingLexer<'lexer, 'input, StorageT>
285+
impl<'lexer, 'input: 'lexer, StorageT: Copy + fmt::Debug + Eq + Hash + PrimInt + Unsigned>
286+
Lexer<StandardLexeme<StorageT>, StorageT> for LRNonStreamingLexer<'lexer, 'input, StorageT>
283287
{
284-
fn iter<'a>(&'a self) -> Box<dyn Iterator<Item = Result<Lexeme<StorageT>, LexError>> + 'a> {
288+
fn iter<'a>(
289+
&'a self,
290+
) -> Box<dyn Iterator<Item = Result<StandardLexeme<StorageT>, LexError>> + 'a> {
285291
Box::new(self.lexemes.iter().cloned())
286292
}
287293
}
288294

289-
impl<'lexer, 'input: 'lexer, StorageT: Copy + Eq + Hash + PrimInt + Unsigned>
290-
NonStreamingLexer<'input, StorageT> for LRNonStreamingLexer<'lexer, 'input, StorageT>
295+
impl<'lexer, 'input: 'lexer, StorageT: Copy + Eq + fmt::Debug + Hash + PrimInt + Unsigned>
296+
NonStreamingLexer<'input, StandardLexeme<StorageT>, StorageT>
297+
for LRNonStreamingLexer<'lexer, 'input, StorageT>
291298
{
292299
fn span_str(&self, span: Span) -> &'input str {
293300
if span.end() > self.s.len() {
@@ -334,15 +341,18 @@ impl<'lexer, 'input: 'lexer, StorageT: Copy + Eq + Hash + PrimInt + Unsigned>
334341
}
335342

336343
/// Returns `(line byte offset, line index)`.
337-
fn lc_byte<StorageT>(lexer: &LRNonStreamingLexer<StorageT>, i: usize) -> (usize, usize) {
344+
fn lc_byte<StorageT: fmt::Debug>(
345+
lexer: &LRNonStreamingLexer<StorageT>,
346+
i: usize,
347+
) -> (usize, usize) {
338348
match lexer.newlines.binary_search(&i) {
339349
Ok(j) => (lexer.newlines[j], j + 2),
340350
Err(0) => (0, 1),
341351
Err(j) => (lexer.newlines[j - 1], j + 1),
342352
}
343353
}
344354

345-
fn lc_char<StorageT: Copy + Eq + Hash + PrimInt + Unsigned>(
355+
fn lc_char<StorageT: Copy + Eq + fmt::Debug + Hash + PrimInt + Unsigned>(
346356
lexer: &LRNonStreamingLexer<StorageT>,
347357
i: usize,
348358
s: &str,

lrlex/src/main.rs

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -8,7 +8,7 @@ use std::{
88
};
99

1010
use lrlex::{LRNonStreamingLexerDef, LexerDef};
11-
use lrpar::Lexer;
11+
use lrpar::{Lexeme, Lexer};
1212

1313
fn usage(prog: &str, msg: &str) {
1414
let path = Path::new(prog);

lrpar/cttests/src/lib.rs

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -3,7 +3,7 @@ use lrpar::lrpar_mod;
33
#[cfg(test)]
44
use lrpar::Span;
55
#[cfg(test)]
6-
use lrpar::{Lexer, NonStreamingLexer};
6+
use lrpar::{Lexeme, Lexer, NonStreamingLexer};
77

88
lrlex_mod!("calc_multitypes.l");
99
lrpar_mod!("calc_multitypes.y");

lrpar/examples/calc_ast/src/main.rs

Lines changed: 5 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -3,7 +3,7 @@
33
use std::io::{self, BufRead, Write};
44

55
use lrlex::lrlex_mod;
6-
use lrpar::{lrpar_mod, NonStreamingLexer, Span};
6+
use lrpar::{lrpar_mod, NonStreamingLexer, Span, StandardLexeme};
77

88
// Using `lrlex_mod!` brings the lexer for `calc.l` into scope. By default the module name will be
99
// `calc_l` (i.e. the file name, minus any extensions, with a suffix of `_l`).
@@ -54,7 +54,10 @@ fn main() {
5454
}
5555
}
5656

57-
fn eval(lexer: &dyn NonStreamingLexer<u32>, e: Expr) -> Result<u64, (Span, &'static str)> {
57+
fn eval(
58+
lexer: &dyn NonStreamingLexer<StandardLexeme<u32>, u32>,
59+
e: Expr,
60+
) -> Result<u64, (Span, &'static str)> {
5861
match e {
5962
Expr::Add { span, lhs, rhs } => eval(lexer, *lhs)?
6063
.checked_add(eval(lexer, *rhs)?)

lrpar/examples/calc_parsetree/build.rs

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,6 @@
11
use cfgrammar::yacc::{YaccKind, YaccOriginalActionKind};
22
use lrlex::CTLexerBuilder;
3-
use lrpar::CTParserBuilder;
3+
use lrpar::{CTParserBuilder, StandardLexeme};
44

55
fn main() -> Result<(), Box<dyn std::error::Error>> {
66
// First we create the parser, which returns a HashMap of all the tokens used, then we pass
@@ -9,7 +9,7 @@ fn main() -> Result<(), Box<dyn std::error::Error>> {
99
// Note that we specify the integer type (u8) we'll use for token IDs (this type *must* be big
1010
// enough to fit all IDs in) as well as the input file (which must end in ".y" for lrpar, and
1111
// ".l" for lrlex).
12-
let cp = CTParserBuilder::<u8>::new_with_storaget()
12+
let cp = CTParserBuilder::<StandardLexeme<u8>, _>::new_with_storaget()
1313
.yacckind(YaccKind::Original(YaccOriginalActionKind::GenericParseTree))
1414
.grammar_in_src_dir("calc.y")?
1515
.build()?;

lrpar/examples/calc_parsetree/src/main.rs

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -2,7 +2,7 @@ use std::io::{self, BufRead, Write};
22

33
use cfgrammar::RIdx;
44
use lrlex::lrlex_mod;
5-
use lrpar::{lrpar_mod, Node};
5+
use lrpar::{lrpar_mod, Lexeme, Node, StandardLexeme};
66

77
// Using `lrlex_mod!` brings the lexer for `calc.l` into scope. By default the module name will be
88
// `calc_l` (i.e. the file name, minus any extensions, with a suffix of `_l`).
@@ -49,7 +49,7 @@ impl<'a> Eval<'a> {
4949
Eval { s }
5050
}
5151

52-
fn eval(&self, n: &Node<u8>) -> i64 {
52+
fn eval(&self, n: &Node<StandardLexeme<u8>, u8>) -> i64 {
5353
match *n {
5454
Node::Nonterm {
5555
ridx: RIdx(ridx),

lrpar/src/lib/cpctplus.rs

Lines changed: 60 additions & 29 deletions
Original file line numberDiff line numberDiff line change
@@ -100,13 +100,25 @@ impl<StorageT: PrimInt + Unsigned> PartialEq for PathFNode<StorageT> {
100100

101101
impl<StorageT: PrimInt + Unsigned> Eq for PathFNode<StorageT> {}
102102

103-
struct CPCTPlus<'a, 'b: 'a, 'input: 'b, StorageT: 'static + Eq + Hash, ActionT: 'a> {
104-
parser: &'a Parser<'a, 'b, 'input, StorageT, ActionT>,
103+
struct CPCTPlus<
104+
'a,
105+
'b: 'a,
106+
'input: 'b,
107+
LexemeT: Lexeme<StorageT>,
108+
StorageT: 'static + Eq + Hash,
109+
ActionT: 'a,
110+
> {
111+
parser: &'a Parser<'a, 'b, 'input, LexemeT, StorageT, ActionT>,
105112
}
106113

107-
pub(super) fn recoverer<'a, StorageT: 'static + Debug + Hash + PrimInt + Unsigned, ActionT: 'a>(
108-
parser: &'a Parser<StorageT, ActionT>,
109-
) -> Box<dyn Recoverer<StorageT, ActionT> + 'a>
114+
pub(super) fn recoverer<
115+
'a,
116+
LexemeT: Lexeme<StorageT>,
117+
StorageT: 'static + Debug + Hash + PrimInt + Unsigned,
118+
ActionT: 'a,
119+
>(
120+
parser: &'a Parser<LexemeT, StorageT, ActionT>,
121+
) -> Box<dyn Recoverer<LexemeT, StorageT, ActionT> + 'a>
110122
where
111123
usize: AsPrimitive<StorageT>,
112124
{
@@ -117,21 +129,22 @@ impl<
117129
'a,
118130
'b: 'a,
119131
'input: 'b,
132+
LexemeT: Lexeme<StorageT>,
120133
StorageT: 'static + Debug + Hash + PrimInt + Unsigned,
121134
ActionT: 'a,
122-
> Recoverer<StorageT, ActionT> for CPCTPlus<'a, 'b, 'input, StorageT, ActionT>
135+
> Recoverer<LexemeT, StorageT, ActionT> for CPCTPlus<'a, 'b, 'input, LexemeT, StorageT, ActionT>
123136
where
124137
usize: AsPrimitive<StorageT>,
125138
{
126139
fn recover(
127140
&self,
128141
finish_by: Instant,
129-
parser: &Parser<StorageT, ActionT>,
142+
parser: &Parser<LexemeT, StorageT, ActionT>,
130143
in_laidx: usize,
131144
mut in_pstack: &mut Vec<StIdx>,
132-
mut astack: &mut Vec<AStackType<ActionT, StorageT>>,
145+
mut astack: &mut Vec<AStackType<LexemeT, ActionT>>,
133146
mut spans: &mut Vec<Span>,
134-
) -> (usize, Vec<Vec<ParseRepair<StorageT>>>) {
147+
) -> (usize, Vec<Vec<ParseRepair<LexemeT, StorageT>>>) {
135148
// This function implements a minor variant of the algorithm from "Repairing syntax errors
136149
// in LR parsers" by Rafael Corchuelo, Jose A. Perez, Antonio Ruiz, and Miguel Toro.
137150
//
@@ -249,9 +262,10 @@ impl<
249262
'a,
250263
'b: 'a,
251264
'input: 'b,
265+
LexemeT: Lexeme<StorageT>,
252266
StorageT: 'static + Debug + Hash + PrimInt + Unsigned,
253267
ActionT: 'a,
254-
> CPCTPlus<'a, 'b, 'input, StorageT, ActionT>
268+
> CPCTPlus<'a, 'b, 'input, LexemeT, StorageT, ActionT>
255269
where
256270
usize: AsPrimitive<StorageT>,
257271
{
@@ -357,7 +371,7 @@ where
357371
&self,
358372
in_laidx: usize,
359373
cnds: Vec<PathFNode<StorageT>>,
360-
) -> Vec<Vec<Vec<ParseRepair<StorageT>>>> {
374+
) -> Vec<Vec<Vec<ParseRepair<LexemeT, StorageT>>>> {
361375
fn traverse<StorageT: PrimInt>(
362376
rm: &Cactus<RepairMerge<StorageT>>,
363377
) -> Vec<Vec<Repair<StorageT>>> {
@@ -411,7 +425,7 @@ where
411425
&self,
412426
mut laidx: usize,
413427
from: &[Repair<StorageT>],
414-
) -> Vec<ParseRepair<StorageT>> {
428+
) -> Vec<ParseRepair<LexemeT, StorageT>> {
415429
from.iter()
416430
.map(|y| match *y {
417431
Repair::InsertTerm(token_idx) => ParseRepair::Insert(token_idx),
@@ -432,13 +446,18 @@ where
432446

433447
/// Apply the `repairs` to `pstack` starting at position `laidx`: return the resulting parse
434448
/// distance and a new pstack.
435-
fn apply_repairs<'a, StorageT: 'static + Debug + Hash + PrimInt + Unsigned, ActionT: 'a>(
436-
parser: &Parser<StorageT, ActionT>,
449+
fn apply_repairs<
450+
'a,
451+
LexemeT: Lexeme<StorageT>,
452+
StorageT: 'static + Debug + Hash + PrimInt + Unsigned,
453+
ActionT: 'a,
454+
>(
455+
parser: &Parser<LexemeT, StorageT, ActionT>,
437456
mut laidx: usize,
438457
mut pstack: &mut Vec<StIdx>,
439-
mut astack: &mut Option<&mut Vec<AStackType<ActionT, StorageT>>>,
458+
mut astack: &mut Option<&mut Vec<AStackType<LexemeT, ActionT>>>,
440459
mut spans: &mut Option<&mut Vec<Span>>,
441-
repairs: &[ParseRepair<StorageT>],
460+
repairs: &[ParseRepair<LexemeT, StorageT>],
442461
) -> usize
443462
where
444463
usize: AsPrimitive<StorageT>,
@@ -474,9 +493,13 @@ where
474493
}
475494

476495
/// Simplifies repair sequences, removes duplicates, and sorts them into order.
477-
fn simplify_repairs<StorageT: 'static + Hash + PrimInt + Unsigned, ActionT>(
478-
parser: &Parser<StorageT, ActionT>,
479-
all_rprs: &mut Vec<Vec<ParseRepair<StorageT>>>,
496+
fn simplify_repairs<
497+
LexemeT: Lexeme<StorageT>,
498+
StorageT: 'static + Hash + PrimInt + Unsigned,
499+
ActionT,
500+
>(
501+
parser: &Parser<LexemeT, StorageT, ActionT>,
502+
all_rprs: &mut Vec<Vec<ParseRepair<LexemeT, StorageT>>>,
480503
) where
481504
usize: AsPrimitive<StorageT>,
482505
{
@@ -494,13 +517,13 @@ fn simplify_repairs<StorageT: 'static + Hash + PrimInt + Unsigned, ActionT>(
494517
// Use a HashSet as a quick way of deduplicating repair sequences: occasionally we can end up
495518
// with hundreds of thousands (!), and we don't have a sensible ordering on ParseRepair to make
496519
// it plausible to do a sort and dedup.
497-
let mut hs: HashSet<Vec<ParseRepair<StorageT>>> = all_rprs.drain(..).collect();
520+
let mut hs: HashSet<Vec<ParseRepair<LexemeT, StorageT>>> = all_rprs.drain(..).collect();
498521
all_rprs.extend(hs.drain());
499522

500523
// Sort repair sequences:
501524
// 1) by whether they contain Inserts that are %insert_avoid
502525
// 2) by the number of repairs they contain
503-
let contains_avoid_insert = |rprs: &Vec<ParseRepair<StorageT>>| -> bool {
526+
let contains_avoid_insert = |rprs: &Vec<ParseRepair<LexemeT, StorageT>>| -> bool {
504527
for r in rprs.iter() {
505528
if let ParseRepair::Insert(tidx) = r {
506529
if parser.grm.avoid_insert(*tidx) {
@@ -528,13 +551,18 @@ fn simplify_repairs<StorageT: 'static + Hash + PrimInt + Unsigned, ActionT>(
528551
/// `ParseRepair`s allow the same distance of parsing, then the `ParseRepair` which requires
529552
/// repairs over the shortest distance is preferred. Amongst `ParseRepair`s of the same rank, the
530553
/// ordering is non-deterministic.
531-
fn rank_cnds<'a, StorageT: 'static + Debug + Hash + PrimInt + Unsigned, ActionT: 'a>(
532-
parser: &Parser<StorageT, ActionT>,
554+
fn rank_cnds<
555+
'a,
556+
LexemeT: Lexeme<StorageT>,
557+
StorageT: 'static + Debug + Hash + PrimInt + Unsigned,
558+
ActionT: 'a,
559+
>(
560+
parser: &Parser<LexemeT, StorageT, ActionT>,
533561
finish_by: Instant,
534562
in_laidx: usize,
535563
in_pstack: &[StIdx],
536-
in_cnds: Vec<Vec<Vec<ParseRepair<StorageT>>>>,
537-
) -> Vec<Vec<ParseRepair<StorageT>>>
564+
in_cnds: Vec<Vec<Vec<ParseRepair<LexemeT, StorageT>>>>,
565+
) -> Vec<Vec<ParseRepair<LexemeT, StorageT>>>
538566
where
539567
usize: AsPrimitive<StorageT>,
540568
{
@@ -603,9 +631,9 @@ mod test {
603631
parser::{test::do_parse, LexParseError, ParseRepair, RecoveryKind},
604632
};
605633

606-
fn pp_repairs<StorageT: 'static + Hash + PrimInt + Unsigned>(
634+
fn pp_repairs<LexemeT: Lexeme<StorageT>, StorageT: 'static + Hash + PrimInt + Unsigned>(
607635
grm: &YaccGrammar<StorageT>,
608-
repairs: &[ParseRepair<StorageT>],
636+
repairs: &[ParseRepair<LexemeT, StorageT>],
609637
) -> String
610638
where
611639
usize: AsPrimitive<StorageT>,
@@ -623,9 +651,12 @@ mod test {
623651
out.join(", ")
624652
}
625653

626-
fn check_all_repairs<StorageT: 'static + Debug + Hash + PrimInt + Unsigned>(
654+
fn check_all_repairs<
655+
LexemeT: Lexeme<StorageT>,
656+
StorageT: 'static + Debug + Hash + PrimInt + Unsigned,
657+
>(
627658
grm: &YaccGrammar<StorageT>,
628-
err: &LexParseError<StorageT>,
659+
err: &LexParseError<LexemeT, StorageT>,
629660
expected: &[&str],
630661
) where
631662
usize: AsPrimitive<StorageT>,

0 commit comments

Comments
 (0)