Commit 99f7038

Add a batch of unknown FR verb lemmas to the combined FR model
1 parent 03cb6c7

File tree

1 file changed: +15 -0 lines


stanza/utils/datasets/prepare_tokenizer_treebank.py

Lines changed: 15 additions & 0 deletions
@@ -881,6 +881,20 @@ def add_english_sentence_final_punctuation(handparsed_sentences):
         new_sents.append(sent)
     return new_sents
 
+def build_extra_combined_french_dataset(paths, model_type, dataset):
+    """
+    Extra sentences we don't want augmented for French - currently, handparsed lemmas
+    """
+    handparsed_dir = paths["HANDPARSED_DIR"]
+    sents = []
+    if dataset == 'train':
+        if model_type is common.ModelType.LEMMA:
+            handparsed_path = os.path.join(handparsed_dir, "french-lemmas", "fr_lemmas.conllu")
+            handparsed_sentences = read_sentences_from_conllu(handparsed_path)
+            print("Loaded %d sentences from %s" % (len(handparsed_sentences), handparsed_path))
+            sents.extend(handparsed_sentences)
+    return sents
+
 def build_extra_combined_english_dataset(paths, model_type, dataset):
     """
@@ -1095,6 +1109,7 @@ def build_combined_hebrew_dataset(paths, model_type, dataset):
 # some extra data for the combined models without augmenting
 COMBINED_EXTRA_FNS = {
     "en_combined": build_extra_combined_english_dataset,
+    "fr_combined": build_extra_combined_french_dataset,
     "it_combined": build_extra_combined_italian_dataset,
 }
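The registration in COMBINED_EXTRA_FNS is a name-to-function dispatch: when a combined dataset is built, the table is consulted by the dataset's short name and the returned sentences are appended without augmentation, per the comment above the table. The diff does not show the caller, so the driver below is only a hedged sketch of that pattern; everything except the COMBINED_EXTRA_FNS name and the "fr_combined" key is invented for illustration.

import enum

class ModelType(enum.Enum):
    # stand-in for stanza.utils.datasets.common.ModelType
    LEMMA = "lemma"
    TOKENIZER = "tokenizer"

def fake_french_extra(paths, model_type, dataset):
    # stand-in for build_extra_combined_french_dataset: one pretend sentence for the lemma model
    if dataset == 'train' and model_type is ModelType.LEMMA:
        return [["1\tmot\tmot\tNOUN\t_\t_\t0\troot\t_\t_"]]
    return []

COMBINED_EXTRA_FNS = {
    "fr_combined": fake_french_extra,
}

def add_extra_combined_sentences(short_name, paths, model_type, dataset, sents):
    """Append the non-augmented extra sentences registered for this combined dataset, if any."""
    extra_fn = COMBINED_EXTRA_FNS.get(short_name)
    if extra_fn is None:
        return sents
    return sents + extra_fn(paths, model_type, dataset)

if __name__ == "__main__":
    train_sents = add_extra_combined_sentences("fr_combined", {}, ModelType.LEMMA, "train", [])
    print("%d extra sentences added" % len(train_sents))   # prints: 1 extra sentences added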
