Skip to content

Commit 88cd0df

Browse files
committed
Use an alias for se / sme, as per #1279
For any language with a default code of 3 letters (as per Universal Dependencies) and an alternate code of 2 letters, we can add that langcode to the resources file to make an alias for people who expect the 2-letter code. Currently that only applies to se / sme (that we know of, at least).
1 parent 5b3c8b3 commit 88cd0df

File tree

2 files changed

+26
-7
lines changed

2 files changed

+26
-7
lines changed

stanza/models/common/constant.py

Lines changed: 23 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -200,7 +200,6 @@
200200
("frr", "North_Frisian"),
201201
("nd", "North_Ndebele"),
202202
("sme", "North_Sami"),
203-
("se", "Northern_Sami"),
204203
("nso", "Northern_Sotho"),
205204
("nb", "Norwegian_Bokmaal"),
206205
("nn", "Norwegian_Nynorsk"),
@@ -346,20 +345,38 @@
346345
("xh", "xho"),
347346
("yo", "yor"),
348347
("zu", "zul"),
348+
349+
# this is a weird case where a 2 letter code was available,
350+
# but UD used the 3 letter code instead
351+
("se", "sme"),
349352
)
350353

351354
for two, three in two_to_three_letters_raw:
352-
assert two in lcode2lang
353-
assert three not in lcode2lang
354-
assert three not in lang2lcode
355-
lang2lcode[three] = two
356-
lcode2lang[three] = lcode2lang[two]
355+
if two in lcode2lang:
356+
assert two in lcode2lang
357+
assert three not in lcode2lang
358+
assert three not in lang2lcode
359+
lang2lcode[three] = two
360+
lcode2lang[three] = lcode2lang[two]
361+
elif three in lcode2lang:
362+
assert three in lcode2lang
363+
assert two not in lcode2lang
364+
assert two not in lang2lcode
365+
lang2lcode[two] = three
366+
lcode2lang[two] = lcode2lang[three]
367+
else:
368+
raise AssertionError("Found a proposed alias %s -> %s when neither code was already known" % (two, three))
357369

358370
two_to_three_letters = {
359371
two: three for two, three in two_to_three_letters_raw
360372
}
361373

374+
three_to_two_letters = {
375+
three: two for two, three in two_to_three_letters_raw
376+
}
377+
362378
assert len(two_to_three_letters) == len(two_to_three_letters_raw)
379+
assert len(three_to_two_letters) == len(two_to_three_letters_raw)
363380

364381
# additional useful code to language mapping
365382
# added after dict invert to avoid conflict

stanza/resources/prepare_resources.py

Lines changed: 3 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -18,7 +18,7 @@
1818
import zipfile
1919

2020
from stanza import __resources_version__
21-
from stanza.models.common.constant import lcode2lang, two_to_three_letters
21+
from stanza.models.common.constant import lcode2lang, two_to_three_letters, three_to_two_letters
2222
from stanza.resources.default_packages import default_treebanks, no_pretrain_languages, default_pretrains, pos_pretrains, depparse_pretrains, ner_pretrains, default_charlms, pos_charlms, depparse_charlms, ner_charlms, lemma_charlms, known_nicknames
2323

2424
def parse_args():
@@ -482,6 +482,8 @@ def process_lcode(args):
482482
resources_new[lang_name.lower()] = {'alias': lang.lower()}
483483
if lang.lower() in two_to_three_letters:
484484
resources_new[two_to_three_letters[lang.lower()]] = {'alias': lang.lower()}
485+
elif lang.lower() in three_to_two_letters:
486+
resources_new[three_to_two_letters[lang.lower()]] = {'alias': lang.lower()}
485487
print("Processed lcode aliases. Writing resources.json")
486488
json.dump(resources_new, open(os.path.join(args.output_dir, 'resources.json'), 'w'), indent=2)
487489

0 commit comments

Comments
 (0)