Skip to content

Commit d46fecd

Browse files
committed
Normalize all PTB-produced tokens, not just the German ones, using NFC
Testing on 0.1% of Wikipedia (from a few years ago), this slows down the English tokenizer by about 1.5%. The German umlaut unit test still works as well.
1 parent 58a2288 commit d46fecd

File tree

3 files changed

+4
-61
lines changed

3 files changed

+4
-61
lines changed

src/edu/stanford/nlp/international/german/process/GermanTokenizerPostProcessor.java

Lines changed: 0 additions & 61 deletions
Original file line numberDiff line numberDiff line change
@@ -45,64 +45,6 @@ public static void mergeTokens(CoreLabel token, CoreLabel nextToken) {
4545
token.setValue(token.word()+"-"+token.sentIndex());
4646
}
4747

48-
/**
49-
* Some people write umlauts as two characters instead of just one
50-
*<br>
51-
* German CoreNLP doesn't handle the two character versions correctly,
52-
* so here we condense it into the one character version
53-
*/
54-
public static void condenseUmlauts(CoreLabel token) {
55-
String value = token.value();
56-
String updatedValue = condenseUmlauts(value);
57-
if (updatedValue != null) {
58-
token.setValue(updatedValue);
59-
}
60-
61-
String word = token.word();
62-
String updatedWord = condenseUmlauts(word);
63-
if (updatedWord != null) {
64-
token.setWord(updatedWord);
65-
}
66-
}
67-
68-
public static String condenseUmlauts(String value) {
69-
StringBuilder ns = null;
70-
for (int i = 0; i < value.length(); ++i) {
71-
final char cur = value.charAt(i);
72-
if ((int) cur == 776) {
73-
// this is the umlaut character
74-
if (ns == null) {
75-
ns = new StringBuilder(value.length());
76-
ns.append(value.substring(0, i));
77-
}
78-
final char prev = ns.length() == 0 ? ' ' : ns.charAt(ns.length() - 1);
79-
if (prev == 'a') {
80-
ns.setCharAt(ns.length() - 1, 'ä');
81-
} else if (prev == 'A') {
82-
ns.setCharAt(ns.length() - 1, 'Ä');
83-
} else if (prev == 'o') {
84-
ns.setCharAt(ns.length() - 1, 'ö');
85-
} else if (prev == 'O') {
86-
ns.setCharAt(ns.length() - 1, 'Ö');
87-
} else if (prev == 'u') {
88-
ns.setCharAt(ns.length() - 1, 'ü');
89-
} else if (prev == 'U') {
90-
ns.setCharAt(ns.length() - 1, 'Ü');
91-
} else {
92-
ns.append(cur);
93-
}
94-
} else {
95-
if (ns != null) {
96-
ns.append(cur);
97-
}
98-
}
99-
}
100-
if (ns != null) {
101-
return ns.toString();
102-
}
103-
return null;
104-
}
105-
10648
@Override
10749
public List<CoreLabel> process(List<CoreLabel> tokens) {
10850
List<CoreLabel> processedTokens = new ArrayList<CoreLabel>();
@@ -134,9 +76,6 @@ public List<CoreLabel> process(List<CoreLabel> tokens) {
13476
}
13577
}
13678

137-
for (CoreLabel label : processedTokens) {
138-
condenseUmlauts(label);
139-
}
14079
return processedTokens;
14180
}
14281

src/edu/stanford/nlp/process/PTBLexer.flex

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -27,6 +27,7 @@ package edu.stanford.nlp.process;
2727

2828

2929
import java.io.Reader;
30+
import java.text.Normalizer;
3031
import java.util.Locale;
3132
import java.util.Map;
3233
import java.util.Properties;
@@ -488,6 +489,7 @@ import edu.stanford.nlp.util.logging.Redwood;
488489
* @param originalText The original String that got transformed into txt
489490
*/
490491
private Object getNext(String txt, String originalText) {
492+
txt = Normalizer.normalize(txt, Normalizer.Form.NFC);
491493
int begin = Math.toIntExact(yychar);
492494
if (invertible) {
493495
String str = prevWordAfter.toString();

src/edu/stanford/nlp/process/PTBLexer.java

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -31,6 +31,7 @@
3131

3232

3333
import java.io.Reader;
34+
import java.text.Normalizer;
3435
import java.util.Locale;
3536
import java.util.Map;
3637
import java.util.Properties;
@@ -88334,6 +88335,7 @@ private Object getNext() {
8833488335
* @param originalText The original String that got transformed into txt
8833588336
*/
8833688337
private Object getNext(String txt, String originalText) {
88338+
txt = Normalizer.normalize(txt, Normalizer.Form.NFC);
8833788339
int begin = Math.toIntExact(yychar);
8833888340
if (invertible) {
8833988341
String str = prevWordAfter.toString();

0 commit comments

Comments
 (0)