Normalize all PTB-produced tokens, not just the German ones, using NFC

AngledLuffa · AngledLuffa · commit d46fecd93c69 · 2022-04-20T13:40:17.000-07:00
When tested on 0.1% of Wikipedia (from a few years ago), this slows down the English tokenizer by about 1.5%.
The German umlaut unit test still passes as well.
diff --git a/src/edu/stanford/nlp/international/german/process/GermanTokenizerPostProcessor.java b/src/edu/stanford/nlp/international/german/process/GermanTokenizerPostProcessor.java
@@ -45,64 +45,6 @@ public static void mergeTokens(CoreLabel token, CoreLabel nextToken) {
     token.setValue(token.word()+"-"+token.sentIndex());
   }
 
-  /**
-   * Some people write umlauts as two characters instead of just one
-   *<br>
-   * German CoreNLP doesn't handle the two character versions correctly,
-   * so here we condense it into the one character version
-   */
-  public static void condenseUmlauts(CoreLabel token) {
-    String value = token.value();
-    String updatedValue = condenseUmlauts(value);
-    if (updatedValue != null) {
-      token.setValue(updatedValue);
-    }
-
-    String word = token.word();
-    String updatedWord = condenseUmlauts(word);
-    if (updatedWord != null) {
-      token.setWord(updatedWord);
-    }
-  }
-    
-  public static String condenseUmlauts(String value) {
-    StringBuilder ns = null;
-    for (int i = 0; i < value.length(); ++i) {
-      final char cur = value.charAt(i);
-      if ((int) cur == 776) {
-        // this is the umlaut character
-        if (ns == null) {
-          ns = new StringBuilder(value.length());
-          ns.append(value.substring(0, i));
-        }
-        final char prev = ns.length() == 0 ? ' ' : ns.charAt(ns.length() - 1);
-        if (prev == 'a') {
-          ns.setCharAt(ns.length() - 1, 'ä');
-        } else if (prev == 'A') {
-          ns.setCharAt(ns.length() - 1, 'Ä');
-        } else if (prev == 'o') {
-          ns.setCharAt(ns.length() - 1, 'ö');
-        } else if (prev == 'O') {
-          ns.setCharAt(ns.length() - 1, 'Ö');
-        } else if (prev == 'u') {
-          ns.setCharAt(ns.length() - 1, 'ü');
-        } else if (prev == 'U') {
-          ns.setCharAt(ns.length() - 1, 'Ü');
-        } else {
-          ns.append(cur);
-        }
-      } else {
-        if (ns != null) {
-          ns.append(cur);
-        }
-      }
-    }
-    if (ns != null) {
-      return ns.toString();
-    }
-    return null;
-  }
-
   @Override
   public List<CoreLabel> process(List<CoreLabel> tokens) {
     List<CoreLabel> processedTokens = new ArrayList<CoreLabel>();
@@ -134,9 +76,6 @@ public List<CoreLabel> process(List<CoreLabel> tokens) {
       }
     }
 
-    for (CoreLabel label : processedTokens) {
-      condenseUmlauts(label);
-    }
     return processedTokens;
   }
 
diff --git a/src/edu/stanford/nlp/process/PTBLexer.flex b/src/edu/stanford/nlp/process/PTBLexer.flex
@@ -27,6 +27,7 @@ package edu.stanford.nlp.process;
 
 
 import java.io.Reader;
+import java.text.Normalizer;
 import java.util.Locale;
 import java.util.Map;
 import java.util.Properties;
@@ -488,6 +489,7 @@ import edu.stanford.nlp.util.logging.Redwood;
    *  @param originalText The original String that got transformed into txt
    */
   private Object getNext(String txt, String originalText) {
+    txt = Normalizer.normalize(txt, Normalizer.Form.NFC);
     int begin = Math.toIntExact(yychar);
     if (invertible) {
       String str = prevWordAfter.toString();
diff --git a/src/edu/stanford/nlp/process/PTBLexer.java b/src/edu/stanford/nlp/process/PTBLexer.java
@@ -31,6 +31,7 @@
 
 
 import java.io.Reader;
+import java.text.Normalizer;
 import java.util.Locale;
 import java.util.Map;
 import java.util.Properties;
@@ -88334,6 +88335,7 @@ private Object getNext() {
    *  @param originalText The original String that got transformed into txt
    */
   private Object getNext(String txt, String originalText) {
+    txt = Normalizer.normalize(txt, Normalizer.Form.NFC);
     int begin = Math.toIntExact(yychar);
     if (invertible) {
       String str = prevWordAfter.toString();