-
Notifications
You must be signed in to change notification settings - Fork 91
/
Copy pathopus1m-2021-02-18.yml
229 lines (229 loc) · 6.24 KB
/
opus1m-2021-02-18.yml
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
release: zle-roa/opus1m-2021-02-18.zip
release-date: 2021-02-18
dataset-name: opus1m
modeltype: transformer
pre-processing: normalization + SentencePiece (spm32k,spm32k)
subwords:
source: spm32k
target: spm32k
subword-models:
source: source.spm
target: target.spm
use-target-labels:
- ">>ast<<"
- ">>cat<<"
- ">>fra<<"
- ">>gcf_Latn<<"
- ">>glg<<"
- ">>ind<<"
- ">>ita<<"
- ">>jak_Latn<<"
- ">>lad<<"
- ">>lad_Latn<<"
- ">>min<<"
- ">>mol<<"
- ">>msa_Latn<<"
- ">>oci<<"
- ">>pob<<"
- ">>por<<"
- ">>ron<<"
- ">>spa<<"
- ">>zlm<<"
- ">>zlm_Latn<<"
- ">>zsm_Latn<<"
source-languages:
- bel
- orv
- rue
- rus
- ukr
target-languages:
- ast
- cat
- fra
- gcf
- glg
- ind
- ita
- jak
- lad
- min
- mol
- msa
- oci
- pob
- por
- ron
- spa
- zlm
- zsm
training-data:
bel-fra: Tatoeba-train (68085)
bel-ind: Tatoeba-train (28578)
bel-ita: Tatoeba-train (66625)
bel-min: Tatoeba-train (1)
bel-msa_Latn: Tatoeba-train (91407)
bel-por: Tatoeba-train (102224)
bel-spa: Tatoeba-train (203839)
bel_Latn-fra: Tatoeba-train (570)
bel_Latn-ind: Tatoeba-train (97)
bel_Latn-ita: Tatoeba-train (582)
bel_Latn-msa_Latn: Tatoeba-train (575)
bel_Latn-por: Tatoeba-train (1014)
bel_Latn-spa: Tatoeba-train (708)
rus-ast: Tatoeba-train (11268)
rus-cat: Tatoeba-train (539814)
rus-fra: Tatoeba-train (48432451)
rus-glg: Tatoeba-train (186041)
rus-ind: Tatoeba-train (5420593)
rus-ita: Tatoeba-train (17343976)
rus-jak_Latn: Tatoeba-train (6456)
rus-mol: Tatoeba-train (3)
rus-msa_Latn: Tatoeba-train (1233479)
rus-oci: Tatoeba-train (20559)
rus-pob: Tatoeba-train (20542541)
rus-por: Tatoeba-train (15203597)
rus-ron: Tatoeba-train (19080347)
rus-spa: Tatoeba-train (50217824)
rus-zlm: Tatoeba-train (245)
rus-zlm_Latn: Tatoeba-train (74640)
ukr-cat: Tatoeba-train (272844)
ukr-fra: Tatoeba-train (1700175)
ukr-ind: Tatoeba-train (815214)
ukr-ita: Tatoeba-train (1682300)
ukr-jak_Latn: Tatoeba-train (6417)
ukr-msa_Latn: Tatoeba-train (162908)
ukr-pob: Tatoeba-train (454584)
ukr-por: Tatoeba-train (1893558)
ukr-spa: Tatoeba-train (1890039)
ukr-zlm: Tatoeba-train (228)
ukr-zlm_Latn: Tatoeba-train (76372)
validation-data:
bel-fra: Tatoeba-dev, 994
bel-ita: Tatoeba-dev, 991
bel-msa: Tatoeba-dev, 1000
bel-por: Tatoeba-dev, 990
bel-spa: Tatoeba-dev, 995
ast-rus: Tatoeba-dev, 975
cat-rus: Tatoeba-dev, 999
fra-rus: Tatoeba-dev, 179567
glg-rus: Tatoeba-dev, 996
ita-rus: Tatoeba-dev, 66049
msa-rus: Tatoeba-dev, 1000
oci-rus: Tatoeba-dev, 975
por-rus: Tatoeba-dev, 9489
ron-rus: Tatoeba-dev, 999
rus-spa: Tatoeba-dev, 86620
cat-ukr: Tatoeba-dev, 996
fra-ukr: Tatoeba-dev, 18306
ita-ukr: Tatoeba-dev, 8658
msa-ukr: Tatoeba-dev, 1000
por-ukr: Tatoeba-dev, 1000
spa-ukr: Tatoeba-dev, 12924
total-size-shuffled: 394361
devset-selected: top 5000 lines of opus-dev.src.shuffled!
test-data:
newstest2012.rus-fra: 3003/78011
newstest2012.rus-spa: 3003/79006
newstest2013.rus-fra: 3000/70037
newstest2013.rus-spa: 3000/70528
Tatoeba-test.bel-fra: 283/2005
Tatoeba-test.bel-ita: 264/1681
Tatoeba-test.bel-lad: 2/14
Tatoeba-test.bel-msa: 3/43
Tatoeba-test.bel-por: 3/21
Tatoeba-test.bel-spa: 205/1412
Tatoeba-test.multi-multi: 10000/66633
Tatoeba-test.orv-fra: 37/290
Tatoeba-test.orv-ita: 8/53
Tatoeba-test.orv-spa: 33/171
Tatoeba-test.rue-spa: 97/469
Tatoeba-test.rus-ast: 1/5
Tatoeba-test.rus-cat: 185/1342
Tatoeba-test.rus-fra: 10000/70132
Tatoeba-test.rus-gcf: 1/3
Tatoeba-test.rus-glg: 37/228
Tatoeba-test.rus-ita: 10000/71254
Tatoeba-test.rus-lad: 18/100
Tatoeba-test.rus-msa: 88/634
Tatoeba-test.rus-oci: 84/571
Tatoeba-test.rus-por: 10000/74713
Tatoeba-test.rus-ron: 782/4768
Tatoeba-test.rus-spa: 10000/71496
Tatoeba-test.ukr-cat: 455/2670
Tatoeba-test.ukr-fra: 10000/62877
Tatoeba-test.ukr-ita: 5000/27846
Tatoeba-test.ukr-lad: 20/108
Tatoeba-test.ukr-msa: 9/79
Tatoeba-test.ukr-por: 3372/21315
Tatoeba-test.ukr-spa: 10000/58486
BLEU-scores:
newstest2012.rus-fra: 21.5
newstest2012.rus-spa: 25.6
newstest2013.rus-fra: 24.9
newstest2013.rus-spa: 27.5
Tatoeba-test.bel-fra: 38.8
Tatoeba-test.bel-ita: 39.0
Tatoeba-test.bel-lad: 6.3
Tatoeba-test.bel-msa: 1.1
Tatoeba-test.bel-por: 19.3
Tatoeba-test.bel-spa: 39.8
Tatoeba-test.multi-multi: 44.9
Tatoeba-test.orv-fra: 8.0
Tatoeba-test.orv-ita: 4.4
Tatoeba-test.orv-spa: 7.4
Tatoeba-test.rue-spa: 28.2
Tatoeba-test.rus-ast: 23.6
Tatoeba-test.rus-cat: 36.0
Tatoeba-test.rus-fra: 49.7
Tatoeba-test.rus-gcf: 10.7
Tatoeba-test.rus-glg: 31.8
Tatoeba-test.rus-ita: 38.9
Tatoeba-test.rus-lad: 14.9
Tatoeba-test.rus-msa: 17.7
Tatoeba-test.rus-oci: 2.5
Tatoeba-test.rus-por: 37.7
Tatoeba-test.rus-ron: 35.9
Tatoeba-test.rus-spa: 48.3
Tatoeba-test.ukr-cat: 39.6
Tatoeba-test.ukr-fra: 47.3
Tatoeba-test.ukr-ita: 46.4
Tatoeba-test.ukr-lad: 12.6
Tatoeba-test.ukr-msa: 14.8
Tatoeba-test.ukr-por: 39.8
Tatoeba-test.ukr-spa: 47.6
chr-F-scores:
newstest2012.rus-fra: 0.509
newstest2012.rus-spa: 0.525
newstest2013.rus-fra: 0.528
newstest2013.rus-spa: 0.535
Tatoeba-test.bel-fra: 0.577
Tatoeba-test.bel-ita: 0.580
Tatoeba-test.bel-lad: 0.186
Tatoeba-test.bel-msa: 0.156
Tatoeba-test.bel-por: 0.444
Tatoeba-test.bel-spa: 0.604
Tatoeba-test.multi-multi: 0.642
Tatoeba-test.orv-fra: 0.232
Tatoeba-test.orv-ita: 0.180
Tatoeba-test.orv-spa: 0.289
Tatoeba-test.rue-spa: 0.441
Tatoeba-test.rus-ast: 0.703
Tatoeba-test.rus-cat: 0.587
Tatoeba-test.rus-fra: 0.660
Tatoeba-test.rus-gcf: 0.128
Tatoeba-test.rus-glg: 0.560
Tatoeba-test.rus-ita: 0.612
Tatoeba-test.rus-lad: 0.399
Tatoeba-test.rus-msa: 0.399
Tatoeba-test.rus-oci: 0.226
Tatoeba-test.rus-por: 0.600
Tatoeba-test.rus-ron: 0.595
Tatoeba-test.rus-spa: 0.671
Tatoeba-test.ukr-cat: 0.598
Tatoeba-test.ukr-fra: 0.644
Tatoeba-test.ukr-ita: 0.671
Tatoeba-test.ukr-lad: 0.320
Tatoeba-test.ukr-msa: 0.366
Tatoeba-test.ukr-por: 0.612
Tatoeba-test.ukr-spa: 0.662