Reset the start_char and end_char on single Word objects if the Token object has start_char and end_char.

AngledLuffa · AngledLuffa · commit 1a36efb53135 · 2024-11-27T00:18:40.000-08:00
This accommodates MWT Tokens that were detected by the tokenizer but not expanded by the MWT model, which can happen with typos such as `it"s` (see #1436).
diff --git a/stanza/models/common/doc.py b/stanza/models/common/doc.py
@@ -396,7 +396,7 @@ def set_mwt_expansions(self, expansions,
                     word.sent = sentence
                     word.parent = token
                     sentence.words.append(word)
-                if len(token.words) > 1 and token.start_char is not None and token.end_char is not None and "".join(word.text for word in token.words) == token.text:
+                if token.start_char is not None and token.end_char is not None and "".join(word.text for word in token.words) == token.text:
                     start_char = token.start_char
                     for word in token.words:
                         end_char = start_char + len(word.text)