Skip to content

Commit 3aeea46

Browse files
committed
Potentially lowercase the data in a lemmatizer if all of the training data (or a user flag) requested it
Testing additions: Add a basic unit test of the all_lowercase function. Add a test of the caseless lemmatizer in the Pipeline. Test that the Latin ITTB lemmatizer is marked as caseless. Check that the results for capitalized text are as expected. Addresses #1330
1 parent 4b7c6b4 commit 3aeea46

File tree

6 files changed

+118
-0
lines changed

6 files changed

+118
-0
lines changed

stanza/models/lemma/data.py

Lines changed: 7 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -117,6 +117,13 @@ def __iter__(self):
117117
def load_doc(self, doc):
    """
    Extract the TEXT, UPOS and LEMMA fields from doc for the lemmatizer.

    The extracted rows are passed through self.resolve_none and, when the
    'caseless' entry of self.args is set, through self.lowercase_data
    before being returned.
    """
    fields = doc.get([TEXT, UPOS, LEMMA])
    fields = self.resolve_none(fields)
    caseless = self.args.get('caseless', False)
    return self.lowercase_data(fields) if caseless else fields
123+
124+
def lowercase_data(self, data):
    """
    Lowercase the first field of every entry in data, in place.

    The entries themselves are mutated; the same list object that was
    passed in is returned for convenience.
    """
    for entry in data:
        first_field = entry[0]
        entry[0] = first_field.lower()
    return data
121128

122129
def resolve_none(self, data):

stanza/models/lemma/trainer.py

Lines changed: 9 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -45,6 +45,9 @@ def __init__(self, args=None, vocab=None, emb_matrix=None, model_file=None, devi
4545
# dict-based components
4646
self.word_dict = dict()
4747
self.composite_dict = dict()
48+
49+
self.caseless = self.args.get('caseless', False)
50+
4851
if not self.args['dict_only']:
4952
self.model = self.model.to(device)
5053
if self.args.get('edit', False):
@@ -164,6 +167,8 @@ def predict_dict(self, pairs):
164167
lemmas = []
165168
for p in pairs:
166169
w, pos = p
170+
if self.caseless:
171+
w = w.lower()
167172
if (w,pos) in self.composite_dict:
168173
lemmas += [self.composite_dict[(w,pos)]]
169174
elif w in self.word_dict:
@@ -178,6 +183,8 @@ def skip_seq2seq(self, pairs):
178183
skip = []
179184
for p in pairs:
180185
w, pos = p
186+
if self.caseless:
187+
w = w.lower()
181188
if (w,pos) in self.composite_dict:
182189
skip.append(True)
183190
elif w in self.word_dict:
@@ -192,6 +199,8 @@ def ensemble(self, pairs, other_preds):
192199
assert len(pairs) == len(other_preds)
193200
for p, pred in zip(pairs, other_preds):
194201
w, pos = p
202+
if self.caseless:
203+
w = w.lower()
195204
if (w,pos) in self.composite_dict:
196205
lemma = self.composite_dict[(w,pos)]
197206
elif w in self.word_dict:

stanza/models/lemmatizer.py

Lines changed: 13 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -78,6 +78,8 @@ def build_argparse():
7878
parser.add_argument('--save_dir', type=str, default='saved_models/lemma', help='Root dir for saving models.')
7979
parser.add_argument('--save_name', type=str, default="{shorthand}_{embedding}_lemmatizer.pt", help="File name to save the model")
8080

81+
parser.add_argument('--caseless', default=False, action='store_true', help='Lowercase everything first before processing. This will happen automatically if 100%% of the data is caseless')
82+
8183
parser.add_argument('--seed', type=int, default=1234)
8284
utils.add_device_args(parser)
8385

@@ -110,6 +112,13 @@ def main(args=None):
110112
else:
111113
evaluate(args)
112114

115+
def all_lowercase(doc):
    """
    Report whether lowercasing would leave every word of doc unchanged.

    Walks each word of each sentence in doc and returns False as soon as
    one word's text differs from its lower() form.  Returns True
    otherwise, including when doc has no words at all.
    """
    return all(word.text.lower() == word.text
               for sentence in doc.sentences
               for word in sentence.words)
121+
113122
def build_model_filename(args):
114123
embedding = "nocharlm"
115124
if args['charlm'] and args['charlm_forward_file']:
@@ -147,6 +156,10 @@ def train(args):
147156
logger.warning("[Skip training because no training data available...]")
148157
return
149158

159+
if not args['caseless'] and all_lowercase(train_doc):
160+
logger.info("Building a caseless model, as all of the training data is caseless")
161+
args['caseless'] = True
162+
150163
# start training
151164
# train a dictionary-based lemmatizer
152165
logger.info("Building lemmatizer in %s", model_file)

stanza/tests/lemma/test_lowercase.py

Lines changed: 57 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,57 @@
1+
import pytest
2+
3+
from stanza.models.lemmatizer import all_lowercase
4+
from stanza.utils.conll import CoNLL
5+
6+
LATIN_CONLLU = """
7+
# sent_id = train-s1
8+
# text = unde et philosophus dicit felicitatem esse operationem perfectam.
9+
# reference = ittb-scg-s4203
10+
1 unde unde ADV O4 AdvType=Loc|PronType=Rel 4 advmod:lmod _ _
11+
2 et et CCONJ O4 _ 3 advmod:emph _ _
12+
3 philosophus philosophus NOUN B1|grn1|casA|gen1 Case=Nom|Gender=Masc|InflClass=IndEurO|Number=Sing 4 nsubj _ _
13+
4 dicit dico VERB N3|modA|tem1|gen6 Aspect=Imp|InflClass=LatX|Mood=Ind|Number=Sing|Person=3|Tense=Pres|VerbForm=Fin|Voice=Act 0 root _ TraditionalMood=Indicativus|TraditionalTense=Praesens
14+
5 felicitatem felicitas NOUN C1|grn1|casD|gen2 Case=Acc|Gender=Fem|InflClass=IndEurX|Number=Sing 7 nsubj _ _
15+
6 esse sum AUX N3|modH|tem1 Aspect=Imp|Tense=Pres|VerbForm=Inf 7 cop _ _
16+
7 operationem operatio NOUN C1|grn1|casD|gen2|vgr1 Case=Acc|Gender=Fem|InflClass=IndEurX|Number=Sing 4 ccomp _ _
17+
8 perfectam perfectus ADJ A1|grn1|casD|gen2 Case=Acc|Gender=Fem|InflClass=IndEurA|Number=Sing 7 amod _ SpaceAfter=No
18+
9 . . PUNCT Punc _ 4 punct _ _
19+
20+
# sent_id = train-s2
21+
# text = perfectio autem operationis dependet ex quatuor.
22+
# reference = ittb-scg-s4204
23+
1 perfectio perfectio NOUN C1|grn1|casA|gen2 Case=Nom|Gender=Fem|InflClass=IndEurX|Number=Sing 4 nsubj _ _
24+
2 autem autem PART O4 _ 4 discourse _ _
25+
3 operationis operatio NOUN C1|grn1|casB|gen2|vgr1 Case=Gen|Gender=Fem|InflClass=IndEurX|Number=Sing 1 nmod _ _
26+
4 dependet dependeo VERB K3|modA|tem1|gen6 Aspect=Imp|InflClass=LatE|Mood=Ind|Number=Sing|Person=3|Tense=Pres|VerbForm=Fin|Voice=Act 0 root _ TraditionalMood=Indicativus|TraditionalTense=Praesens
27+
5 ex ex ADP S4|vgr2 _ 6 case _ _
28+
6 quatuor quattuor NUM G1|gen3|vgr1 NumForm=Word|NumType=Card 4 obl:arg _ SpaceAfter=No
29+
7 . . PUNCT Punc _ 4 punct _ _
30+
""".lstrip()
31+
32+
ENG_CONLLU = """
33+
# sent_id = weblog-juancole.com_juancole_20051126063000_ENG_20051126_063000-0007
34+
# text = You wonder if he was manipulating the market with his bombing targets.
35+
1 You you PRON PRP Case=Nom|Person=2|PronType=Prs 2 nsubj 2:nsubj _
36+
2 wonder wonder VERB VBP Mood=Ind|Number=Sing|Person=2|Tense=Pres|VerbForm=Fin 0 root 0:root _
37+
3 if if SCONJ IN _ 6 mark 6:mark _
38+
4 he he PRON PRP Case=Nom|Gender=Masc|Number=Sing|Person=3|PronType=Prs 6 nsubj 6:nsubj _
39+
5 was be AUX VBD Mood=Ind|Number=Sing|Person=3|Tense=Past|VerbForm=Fin 6 aux 6:aux _
40+
6 manipulating manipulate VERB VBG Tense=Pres|VerbForm=Part 2 ccomp 2:ccomp _
41+
7 the the DET DT Definite=Def|PronType=Art 8 det 8:det _
42+
8 market market NOUN NN Number=Sing 6 obj 6:obj _
43+
9 with with ADP IN _ 12 case 12:case _
44+
10 his his PRON PRP$ Case=Gen|Gender=Masc|Number=Sing|Person=3|Poss=Yes|PronType=Prs 12 nmod:poss 12:nmod:poss _
45+
11 bombing bombing NOUN NN Number=Sing 12 compound 12:compound _
46+
12 targets target NOUN NNS Number=Plur 6 obl 6:obl:with SpaceAfter=No
47+
13 . . PUNCT . _ 2 punct 2:punct _
48+
""".lstrip()
49+
50+
51+
def test_all_lowercase():
    """
    The Latin sample contains no capitalized words, so it should be reported as all lowercase
    """
    latin_doc = CoNLL.conll2doc(input_str=LATIN_CONLLU)
    is_lowercase = all_lowercase(latin_doc)
    assert is_lowercase
54+
55+
def test_not_all_lowercase():
    """
    The English sample starts with a capitalized word, so it should not be reported as all lowercase
    """
    english_doc = CoNLL.conll2doc(input_str=ENG_CONLLU)
    is_lowercase = all_lowercase(english_doc)
    assert not is_lowercase

stanza/tests/pipeline/test_lemmatizer.py

Lines changed: 30 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -104,3 +104,33 @@ def test_store_results():
104104
assert stuff == stuff2
105105

106106
assert az not in lemmatizer.word_dict
107+
108+
def test_caseless_lemmatizer():
    """
    Test that setting the lemmatizer as caseless at Pipeline time lowercases the text
    """
    text = "Jennifer has nice Antennae"
    pipeline_kwargs = dict(processors='tokenize,pos,lemma', model_dir=TEST_MODELS_DIR, download_method=None)

    # default (cased) pipeline: the capital letter should throw off the
    # lemmatizer so that it does not remove the plural, although the
    # current English model *does* still lowercase the A
    nlp = stanza.Pipeline('en', **pipeline_kwargs)
    last_word = nlp(text).sentences[0].words[-1]
    assert last_word.lemma == 'antennae'

    # caseless pipeline: the word is treated as if it had been written
    # 'antennae', so the plural is removed
    nlp = stanza.Pipeline('en', lemma_caseless=True, **pipeline_kwargs)
    last_word = nlp(text).sentences[0].words[-1]
    assert last_word.lemma == 'antenna'
122+
123+
def test_latin_caseless_lemmatizer():
    """
    Test the Latin caseless lemmatizer

    Checks that the ITTB lemma processor reports itself as caseless and
    that capitalized input still gets the expected lemmas.
    """
    nlp = stanza.Pipeline('la', package='ittb', processors='tokenize,pos,lemma', model_dir=TEST_MODELS_DIR, download_method=None)
    assert nlp.processors['lemma'].config['caseless']

    doc = nlp("Quod Erat Demonstrandum")
    sentences = doc.sentences
    assert len(sentences) == 1
    words = sentences[0].words
    assert len(words) == 3
    # lengths are pinned above, so a whole-list comparison checks every word
    assert [word.lemma for word in words] == ["qui", "sum", "demonstro"]

stanza/tests/setup.py

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -40,6 +40,8 @@
4040
stanza.download(lang='en', model_dir=models_dir, logging_level='info')
4141
stanza.download(lang="en", model_dir=models_dir, package=None, processors={"ner":"ncbi_disease"})
4242
stanza.download(lang='fr', model_dir=models_dir, logging_level='info')
43+
# Latin ITTB has no case information for the lemmatizer
44+
stanza.download(lang='la', model_dir=models_dir, package='ittb', logging_level='info')
4345
stanza.download(lang='zh', model_dir=models_dir, logging_level='info')
4446
# useful not just for verifying RtL, but because the default Arabic has a unique style of xpos tags
4547
stanza.download(lang='ar', model_dir=models_dir, logging_level='info')

0 commit comments

Comments
 (0)