@@ -476,11 +476,6 @@ import edu.stanford.nlp.util.logging.Redwood;
476
476
return - 1 ;
477
477
}
478
478
479
- private Object getNext() {
480
- String txt = yytext();
481
- return getNext(txt, txt);
482
- }
483
-
484
479
/* * Make the next token.
485
480
* If the begin character offset exceeds what can be stored in 32 bits, it is
486
481
* entered as Integer.MAX_VALUE and an error is logged.
@@ -509,10 +504,12 @@ import edu.stanford.nlp.util.logging.Redwood;
509
504
}
510
505
}
511
506
507
+ /*
512
508
private void fixJFlex4SpaceAfterTokenBug() {
513
509
// try to work around an apparent jflex bug where it
514
510
// gets a space at the token end by getting
515
511
// wrong the length of the trailing context.
512
+ // cdm2022: This bug no longer seems to exist; tested on several megabytes of text
516
513
while (yylength() > 0) {
517
514
char last = yycharat(yylength()-1);
518
515
if (last == ' ' || last == '\t' || (last >= '\n' && last <= '\r' || last == '\u0085')) {
@@ -523,9 +520,10 @@ import edu.stanford.nlp.util.logging.Redwood;
523
520
}
524
521
}
525
522
}
523
+ */
526
524
527
525
private Object processAcronym() {
528
- fixJFlex4SpaceAfterTokenBug();
526
+ // fixJFlex4SpaceAfterTokenBug();
529
527
String s;
530
528
if (yylength() == 2 ) { // "I.", etc. Treat as "I" + "."
531
529
yypushback(1 ); // return a period next time;
@@ -543,7 +541,7 @@ import edu.stanford.nlp.util.logging.Redwood;
543
541
}
544
542
545
543
private Object processAbbrev3() {
546
- fixJFlex4SpaceAfterTokenBug();
544
+ // fixJFlex4SpaceAfterTokenBug();
547
545
String txt = yytext();
548
546
if (DEBUG ) { logger. info(" Used {ABBREV3} to recognize " + txt); }
549
547
return getNext(txt, txt);
@@ -595,6 +593,7 @@ DIGIT = [:digit:]|[\u07C0-\u07C9]
595
593
DATE = {DIGIT} {1,2} [ \- \u2012 \/ ] {DIGIT} {1,2} [ \- \u2012 \/ ] {DIGIT} {2,4}| {DIGIT} {4} [ \- \u2012 \/ ] {DIGIT} {1,2} [ \- \u2012 \/ ] {DIGIT} {1,2}
596
594
/* Note that NUM also includes times like 12:55. One can start with a . or , but not a : */
597
595
NUM = {DIGIT} *( [ .,\u066B\u066C ] {DIGIT} +)+| {DIGIT} +( [ .:,\u00AD\u066B\u066C\u2009\u202F ] {DIGIT} +)*
596
+ LEADING_NUM = {DIGIT} +( [ .,\u066B\u066C ] {DIGIT} +)+
598
597
/* Now don't allow bracketed negative numbers! They have too many uses (e.g.,
599
598
years or times in parentheses), and having them in tokens messes up
600
599
treebank parsing.
@@ -623,10 +622,14 @@ SEP_SUFFIX = ({SEP_CURRENCY}|{SEP_UNITS}|{SEP_OTHER})
623
622
LETTER = ([:letter:]| {SPLET} | [ \u00AD\u200C\u200D\u2060\u0237 - \u024F\u02C2 - \u02C5\u02D2 - \u02DF\u02E5 - \u02FF\u0300 - \u036F\u0370 - \u037D\u0384\u0385\u03CF\u03F6\u03FC - \u03FF\u0483 - \u0487\u04CF\u04F6 - \u04FF\u0510 - \u0525\u055A - \u055F\u0591 - \u05BD\u05BF\u05C1\u05C2\u05C4\u05C5\u05C7\u0615 - \u061A\u063B - \u063F\u064B - \u065E\u0670\u06D6 - \u06EF\u06FA - \u06FF\u070F\u0711\u0730 - \u074F\u0750 - \u077F\u07A6 - \u07B1\u07CA - \u07F5\u07FA\u0900 - \u0903\u093C\u093E - \u094E\u0951 - \u0955\u0962 - \u0963\u0981 - \u0983\u09BC - \u09C4\u09C7\u09C8\u09CB - \u09CD\u09D7\u09E2\u09E3\u0A01 - \u0A03\u0A3C\u0A3E - \u0A4F\u0A81 - \u0A83\u0ABC - \u0ACF\u0B82\u0BBE - \u0BC2\u0BC6 - \u0BC8\u0BCA - \u0BCD\u0C01 - \u0C03\u0C3E - \u0C56\u0D3E - \u0D44\u0D46 - \u0D48\u0E30 - \u0E3A\u0E47 - \u0E4E\u0EB1 - \u0EBC\u0EC8 - \u0ECD ] )
624
623
/* Allow in the zero-width (non-)joiner characters. Allow in Modifier non-spacing (= separated accent chars) */
625
624
WORD = {LETTER} ( {LETTER} | {DIGIT} | [\p{Mn}\p{Mc}] )*( [ .!?] {LETTER} ( {LETTER} | {DIGIT} | [\p{Mn}\p{Mc}] )*)*
625
+ /* Variant of WORD that can't end in a digit: it is either a single LETTER or ends in a LETTER. Seemed to be needed for use with trailing number context, though it is unclear why. */
626
+ WORD_LETTER = {LETTER} | {LETTER} ( {LETTER} | {DIGIT} | [\p{Mn}\p{Mc}] )*( [ .!?] {LETTER} ( {LETTER} | {DIGIT} | [\p{Mn}\p{Mc}] )*)* {LETTER}
626
627
/* THING: The $ was for things like New$;
627
628
WAS: only keep hyphens with short one side like co-ed. But (old) treebank just allows hyphenated things as words!
628
629
THING allows d'Avignon or NUMBER before HYPHEN and the same things after it. Only first number can be negative. */
629
630
THING = ( [ dDoOlL] {APOSETCETERA} [\p{Alpha}\p{Digit}] )?( [\p{Alpha}\p{Digit}] +| {NUMBER} )( {HYPHEN} ( [ dDoOlL] {APOSETCETERA} [\p{Alpha}\p{Digit}] )?( [\p{Alpha}\p{Digit}] +| {NUM} ))*
631
+ /* Variant of THING that must end in a letter (\p{Alpha}); used with trailing context because of the trailing context bug. */
632
+ THING_LETTER = ( [ dDoOlL] {APOSETCETERA} [\p{Alpha}\p{Digit}] )?( [\p{Alpha}\p{Digit}] +| {NUMBER} )( {HYPHEN} ( [ dDoOlL] {APOSETCETERA} [\p{Alpha}\p{Digit}] )?( [\p{Alpha}\p{Digit}] +| {NUM} ))*\p{Alpha}
630
633
THINGA = [ A- Z] +(( [ +&] | {SPAMP} ) [ A- Z] +)+
631
634
THING3 = [\p{Alpha}\p{Digit}] +( -[\p{Alpha}] +){0,2}( \\ ? \/ [\p{Alpha}\p{Digit}] +( -[\p{Alpha}] +){0,2}){1,2}
632
635
APOS = [ '\u0092\u2019 ´] | ' /* ASCII straight quote, single right curly quote in CP1252 (wrong) or Unicode or reversed quote or HTML SGML escape */
@@ -916,6 +919,16 @@ CP1252_MISC_SYMBOL = [\u0086\u0087\u0089\u0095\u0098\u0099]
916
919
if (DEBUG ) { logger. info(" Used {DIGIT}/{SEP_SUFFIX} to recognize " + txt); }
917
920
return getNext(txt, txt);
918
921
}
922
+ /* For a WORD in front of a decimal number or dotted number sequence, match only the word and leave the number alone. */
923
+ /* Sometimes this is for currencies like RM = Malaysian currency, DM = Deutschmark, SK = Swedish Kroner, etc. */
924
+ {WORD_LETTER} / {LEADING_NUM} { final String origTxt = yytext(); // LEADING_NUM is only trailing context (lookahead), so the matched text is the word part
925
+ String tok = LexerUtils . removeSoftHyphens(origTxt); // tok is the normalized form; origTxt is kept as matched
926
+ if (americanize) { // Americanize.americanize is applied only when the americanize flag is set
927
+ tok = Americanize . americanize(tok);
928
+ }
929
+ if (DEBUG ) { logger. info(" Used {WORD_LETTER} to recognize " + origTxt + " as " + tok); }
930
+ return getNext(tok, origTxt); // normalized text first, text as matched second
931
+ }
919
932
{WORD} { final String origTxt = yytext();
920
933
String tok = LexerUtils . removeSoftHyphens(origTxt);
921
934
if (americanize) {
@@ -982,11 +995,6 @@ CP1252_MISC_SYMBOL = [\u0086\u0087\u0089\u0095\u0098\u0099]
982
995
if (DEBUG ) { logger. info(" Used {DATE} to recognize " + origTxt + " as " + txt); }
983
996
return getNext(txt, origTxt);
984
997
}
985
- /* Malaysian currency */
986
- RM/ {NUM} { String txt = yytext();
987
- if (DEBUG ) { logger. info(" Used Malaysian currency to recognize " + txt); }
988
- return getNext(txt, txt);
989
- }
990
998
{NUMBER} { String txt = yytext();
991
999
handleHyphenatedNumber(txt);
992
1000
if (DEBUG ) { logger. info(" Used {NUMBER} to recognize " + yytext() + " as " + removeFromNumber(yytext())); }
@@ -1453,7 +1461,16 @@ RM/{NUM} { String txt = yytext();
1453
1461
" ; probablyLeft=" + false ); }
1454
1462
return getNext( norm, tok) ;
1455
1463
} */
1456
- {THING} { breakByHyphensSlashes(yytext()); // this was causing fail of attempted to pushback too much!
1464
+ {THING_LETTER} / {LEADING_NUM} { // THING_LETTER with LEADING_NUM as trailing context (lookahead) only
1465
+ breakByHyphensSlashes(yytext()); // this was causing a failure from attempting to push back too much!
1466
+ String tok = yytext(); // read again after breakByHyphensSlashes; NOTE(review): presumably that call can shorten the match via pushback, confirm
1467
+ /* A THING can contain a quote, as in O'Malley */
1468
+ String norm = LexerUtils . handleQuotes(tok, false , quoteStyle); // norm is the result of quote handling on tok for the configured quoteStyle
1469
+ if (DEBUG ) { logger. info(" Used {THING_LETTER} to recognize " + tok + " as " + norm +
1470
+ " ; probablyLeft=" + false ); } // the logged value is the literal false, the same value passed to handleQuotes above
1471
+ return getNext(norm, tok); // normalized text first, text as matched second
1472
+ }
1473
+ {THING} { breakByHyphensSlashes(yytext()); // this was causing fail of attempt to pushback too much!
1457
1474
String tok = yytext();
1458
1475
/* A THING can contain quote like O'Malley */
1459
1476
String norm = LexerUtils . handleQuotes(tok, false , quoteStyle);
@@ -1582,17 +1599,17 @@ RM/{NUM} { String txt = yytext();
1582
1599
this . seenUntokenizableCharacter = true ;
1583
1600
break ;
1584
1601
case NONE_KEEP :
1585
- return getNext();
1602
+ return getNext(str, str );
1586
1603
case FIRST_KEEP :
1587
1604
if ( ! this . seenUntokenizableCharacter) {
1588
1605
logger. warning(msg);
1589
1606
this . seenUntokenizableCharacter = true ;
1590
1607
}
1591
- return getNext();
1608
+ return getNext(str, str );
1592
1609
case ALL_KEEP :
1593
1610
logger. warning(msg);
1594
1611
this . seenUntokenizableCharacter = true ;
1595
- return getNext();
1612
+ return getNext(str, str );
1596
1613
}
1597
1614
}
1598
1615
<<EOF>> { if (invertible) {
0 commit comments