-
Notifications
You must be signed in to change notification settings - Fork 91
/
Copy pathopus1m+bt-2021-05-01.yml
182 lines (182 loc) · 5.67 KB
/
opus1m+bt-2021-05-01.yml
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
release: inc-eng/opus1m+bt-2021-05-01.zip
release-date: 2021-05-01
dataset-name: opus1m+bt
modeltype: transformer-align
vocabulary:
source: opus1m+bt.spm32k-spm32k.vocab.yml
target: opus1m+bt.spm32k-spm32k.vocab.yml
pre-processing: normalization + SentencePiece (spm32k,spm32k)
subwords:
source: spm32k
target: spm32k
subword-models:
source: source.spm
target: target.spm
source-languages:
- asm
- awa
- ben
- bho
- dty
- gbm
- gom
- guj
- hif
- hin
- mai
- mar
- nep
- npi
- ori
- pan
- pnb
- rmn
- rmy
- rom
- san
- sin
- snd
- urd
target-languages:
- eng
training-data:
asm-eng: Tatoeba-train (533376)
ben-eng: Tatoeba-train (1000000) wikibooks.aa.eng-ben (991280) wikinews.aa.eng-ben (457237) wikipedia.aa.eng-ben (981315) wikipedia.ab.eng-ben (981484) wikipedia.ac.eng-ben (981456) wikipedia.ad.eng-ben (981283) wikiquote.aa.eng-ben (996803)
ben_Cyrl-eng: Tatoeba-train (41)
ben_Deva-eng: Tatoeba-train (9)
ben_Gujr-eng: Tatoeba-train (6)
dty-eng: Tatoeba-train (34)
guj-eng: Tatoeba-train (1000000)
hin-eng: Tatoeba-train (1000000) wikibooks.aa (991421) wikinews.aa (457025) wikipedia.aa (980702) wikipedia.ab (980937) wikipedia.ac (980943) wikipedia.ad (980746) wikiquote.aa (996873)
mai-eng: Tatoeba-train (203954)
mar-eng: Tatoeba-train (369400) wikibooks.aa (991486) wikinews.aa (457181) wikipedia.aa (981666) wikipedia.ab (981851) wikipedia.ac (981816) wikipedia.ad (981629) wikiquote.aa (996859)
nep-eng: Tatoeba-train (1000000)
ori-eng: Tatoeba-train (55808)
pan-eng: Tatoeba-train (2)
pan_Guru-eng: Tatoeba-train (665989)
rmn-eng: Tatoeba-train (11228)
rmy-eng: Tatoeba-train (12)
rom-eng: Tatoeba-train (12359)
san-eng: Tatoeba-train (72)
san_Deva-eng: Tatoeba-train (2945)
sin-eng: Tatoeba-train (1000000)
snd_Arab-eng: Tatoeba-train (107499)
urd-eng: Tatoeba-train (1000000) wikibooks.aa (990223) wikinews.aa (456864) wikipedia.aa (979321) wikipedia.ab (979502) wikipedia.ac (979521) wikipedia.ad (979328) wikiquote.aa (996716)
validation-data:
asm-eng: Tatoeba-dev, 987
ben-eng: Tatoeba-dev, 2637
bho-eng: Tatoeba-dev, 90
eng-gom: Tatoeba-dev, 191
eng-guj: Tatoeba-dev, 987
eng-hif_Latn: Tatoeba-dev, 7
eng-hin: Tatoeba-dev, 5821
eng-mai: Tatoeba-dev, 941
eng-mar: Tatoeba-dev, 42834
eng-nep: Tatoeba-dev, 993
eng-ori: Tatoeba-dev, 968
eng-pan: Tatoeba-dev, 1000
eng-pan_Guru: Tatoeba-dev, 984
eng-pnb: Tatoeba-dev, 3
eng-pnb_Guru: Tatoeba-dev, 2
eng-rmn: Tatoeba-dev, 484
eng-rom: Tatoeba-dev, 516
eng-san: Tatoeba-dev, 17
eng-san_Deva: Tatoeba-dev, 776
eng-sin: Tatoeba-dev, 997
eng-snd_Arab: Tatoeba-dev, 996
eng-urd: Tatoeba-dev, 1000
total-size-shuffled: 13505
devset-selected: top 5000 lines of Tatoeba-dev.src.shuffled
test-data:
newsdev2014.hin-eng: 520/10406
newsdev2019-engu.guj-eng: 1998/41862
newstest2014-hien.hin-eng: 2507/55571
newstest2019-guen.guj-eng: 1016/17778
Tatoeba-test.asm-eng: 117/706
Tatoeba-test.awa-eng: 279/1335
Tatoeba-test.ben-eng: 2500/13978
Tatoeba-test.bho-eng: 42/283
Tatoeba-test.gbm-eng: 39/156
Tatoeba-test.guj-eng: 154/962
Tatoeba-test.hif-eng: 36/241
Tatoeba-test.hin-eng: 5000/33943
Tatoeba-test.kok-eng: 1/7
Tatoeba-test.lah-eng: 32/196
Tatoeba-test.mai-eng: 8/26
Tatoeba-test.mar-eng: 10000/64825
Tatoeba-test.nep-eng: 115/508
Tatoeba-test.multi-eng: 10000/64508
Tatoeba-test.ori-eng: 33/238
Tatoeba-test.pan-eng: 87/616
Tatoeba-test.rom-eng: 671/4457
Tatoeba-test.san-eng: 144/657
Tatoeba-test.sin-eng: 45/260
Tatoeba-test.snd-eng: 4/19
Tatoeba-test.urd-eng: 1663/12027
tico19-test.ben-eng: 2100/56848
tico19-test.hin-eng: 2100/56347
tico19-test.mar-eng: 2100/56339
tico19-test.nep-eng: 2100/56848
tico19-test.urd-eng: 2100/56339
BLEU-scores:
newsdev2014.hin-eng: 11.6
newsdev2019-engu.guj-eng: 13.4
newstest2014-hien.hin-eng: 17.6
newstest2019-guen.guj-eng: 8.6
Tatoeba-test.asm-eng: 19.2
Tatoeba-test.awa-eng: 14.8
Tatoeba-test.ben-eng: 47.2
Tatoeba-test.bho-eng: 26.6
Tatoeba-test.gbm-eng: 17.1
Tatoeba-test.guj-eng: 21.4
Tatoeba-test.hif-eng: 4.1
Tatoeba-test.hin-eng: 42.4
Tatoeba-test.kok-eng: 4.2
Tatoeba-test.lah-eng: 14.4
Tatoeba-test.mai-eng: 41.0
Tatoeba-test.mar-eng: 45.0
Tatoeba-test.nep-eng: 24.7
Tatoeba-test.multi-eng: 40.2
Tatoeba-test.ori-eng: 0.3
Tatoeba-test.pan-eng: 18.1
Tatoeba-test.rom-eng: 5.8
Tatoeba-test.san-eng: 2.7
Tatoeba-test.sin-eng: 30.6
Tatoeba-test.snd-eng: 28.1
Tatoeba-test.urd-eng: 27.7
tico19-test.ben-eng: 20.7
tico19-test.hin-eng: 27.9
tico19-test.mar-eng: 20.4
tico19-test.nep-eng: 24.6
tico19-test.urd-eng: 16.5
chr-F-scores:
newsdev2014.hin-eng: 0.403
newsdev2019-engu.guj-eng: 0.394
newstest2014-hien.hin-eng: 0.469
newstest2019-guen.guj-eng: 0.339
Tatoeba-test.asm-eng: 0.381
Tatoeba-test.awa-eng: 0.299
Tatoeba-test.ben-eng: 0.619
Tatoeba-test.bho-eng: 0.458
Tatoeba-test.gbm-eng: 0.312
Tatoeba-test.guj-eng: 0.389
Tatoeba-test.hif-eng: 0.285
Tatoeba-test.hin-eng: 0.601
Tatoeba-test.kok-eng: 0.254
Tatoeba-test.lah-eng: 0.291
Tatoeba-test.mai-eng: 0.650
Tatoeba-test.mar-eng: 0.640
Tatoeba-test.nep-eng: 0.430
Tatoeba-test.multi-eng: 0.582
Tatoeba-test.ori-eng: 0.138
Tatoeba-test.pan-eng: 0.378
Tatoeba-test.rom-eng: 0.229
Tatoeba-test.san-eng: 0.184
Tatoeba-test.sin-eng: 0.515
Tatoeba-test.snd-eng: 0.456
Tatoeba-test.urd-eng: 0.478
tico19-test.ben-eng: 0.480
tico19-test.hin-eng: 0.547
tico19-test.mar-eng: 0.502
tico19-test.nep-eng: 0.527
tico19-test.urd-eng: 0.425