@@ -133,7 +133,7 @@ def tensorize(self, batch):
133
133
"""Tensorizer knows how to pad and tensorize a batch of it's own output."""
134
134
return batch
135
135
136
- def initialize (self ):
136
+ def initialize (self , from_scratch = True ):
137
137
"""
138
138
The initialize function is carefully designed to allow us to read through the
139
139
training dataset only once, and not store it in memory. As such, it can't itself
@@ -264,9 +264,9 @@ def _lookup_tokens(self, text=None, pre_tokenized=None):
264
264
def _reverse_lookup (self , token_ids ):
265
265
return [self .vocab [id ] for id in token_ids ]
266
266
267
- def initialize (self , vocab_builder = None ):
267
+ def initialize (self , vocab_builder = None , from_scratch = True ):
268
268
"""Build vocabulary based on training corpus."""
269
- if self .vocab :
269
+ if self .vocab and from_scratch :
270
270
if self .vocab_config .build_from_data or self .vocab_config .vocab_files :
271
271
print (
272
272
f"`{ self .text_column } ` column: vocab already provided, skipping "
@@ -279,10 +279,12 @@ def initialize(self, vocab_builder=None):
279
279
f"To create token tensorizer for '{ self .text_column } ', either "
280
280
f"`build_from_data` or `vocab_files` must be set."
281
281
)
282
-
283
- self .vocab_builder = vocab_builder or VocabBuilder ()
284
- self .vocab_builder .use_bos = self .add_bos_token
285
- self .vocab_builder .use_eos = self .add_eos_token
282
+ if not self .vocab_builder :
283
+ # Otherwise we are resuming (not initializing from scratch), so
284
+ # self.vocab_builder was already set by a previous initialization.
285
+ self .vocab_builder = vocab_builder or VocabBuilder ()
286
+ self .vocab_builder .use_bos = self .add_bos_token
287
+ self .vocab_builder .use_eos = self .add_eos_token
286
288
if not self .vocab_config .build_from_data :
287
289
self ._add_vocab_from_files ()
288
290
self .vocab = self .vocab_builder .make_vocab ()
@@ -561,11 +563,11 @@ def __init__(
561
563
def column_schema (self ):
562
564
return [(self .label_column , str )]
563
565
564
- def initialize (self ):
566
+ def initialize (self , from_scratch = True ):
565
567
"""
566
568
Look through the dataset for all labels and create a vocab map for them.
567
569
"""
568
- if self .vocab :
570
+ if self .vocab and from_scratch :
569
571
return
570
572
try :
571
573
while True :
@@ -652,11 +654,11 @@ def _get_row_value_as_str(self, row) -> str:
652
654
row_value = str (row_value .item ())
653
655
return row_value
654
656
655
- def initialize (self ):
657
+ def initialize (self , from_scratch = True ):
656
658
"""
657
659
Look through the dataset for all uids and create a vocab map for them.
658
660
"""
659
- if self .vocab :
661
+ if self .vocab and from_scratch :
660
662
return
661
663
try :
662
664
while True :
@@ -881,24 +883,27 @@ def __init__(
881
883
self .allow_unknown = allow_unknown
882
884
self .tokenizer = tokenizer or Tokenizer ()
883
885
self .pad_idx = Padding .DEFAULT_LABEL_PAD_IDX
886
+ self .vocab_builder = VocabBuilder ()
887
+ self .vocab_builder .add (NO_LABEL )
888
+ self .vocab_builder .use_pad = False
889
+ self .vocab_builder .use_unk = self .allow_unknown
890
+ self .vocab = None
884
891
885
892
@property
886
893
def column_schema (self ):
887
894
return [(self .text_column , str ), (self .slot_column , List [Slot ])]
888
895
889
def initialize(self, from_scratch=True):
    """Look through the dataset for all labels and create a vocab map for them.

    Generator-based initializer: the caller primes it with ``send(None)``,
    then sends one row dict at a time; on ``close()`` (GeneratorExit) the
    accumulated builder is frozen into ``self.vocab``.

    Args:
        from_scratch: when True and ``self.vocab`` already exists, skip
            initialization entirely (vocab was already built/provided).
            When False, keep extending the existing ``self.vocab_builder``
            so training can resume incrementally.
    """
    if self.vocab and from_scratch:
        # Vocab already built and we were asked for a fresh build: nothing to do.
        return
    try:
        while True:
            row = yield
            slots = row[self.slot_column]
            # NOTE(review): assumes each slot has a `.label` attribute — holds
            # for the project's Slot type per the column_schema above.
            self.vocab_builder.add_all(s.label for s in slots)
    except GeneratorExit:
        # Stream exhausted: finalize the vocabulary.
        self.vocab = self.vocab_builder.make_vocab()
902
907
903
908
def numberize (self , row ):
904
909
"""
@@ -993,23 +998,26 @@ def __init__(
993
998
self .text_column = text_column
994
999
self .dict_column = dict_column
995
1000
self .tokenizer = tokenizer or Tokenizer ()
1001
+ self .vocab_builder = VocabBuilder ()
1002
+ self .vocab = None
996
1003
997
1004
@property
998
1005
def column_schema (self ):
999
1006
return [(self .text_column , str ), (self .dict_column , Gazetteer )]
1000
1007
1001
def initialize(self, from_scratch=True):
    """Look through the dataset for all dict features to create vocab.

    Generator-based initializer: prime with ``send(None)``, send row dicts,
    and on ``close()`` (GeneratorExit) the accumulated builder is frozen
    into ``self.vocab``.

    Args:
        from_scratch: when True and ``self.vocab`` already exists, skip
            initialization (vocab was already built/provided). When False,
            keep extending the existing ``self.vocab_builder`` so training
            can resume incrementally.
    """
    if self.vocab and from_scratch:
        # Vocab already built and we were asked for a fresh build: nothing to do.
        return
    try:
        while True:
            row = yield
            for token_dict in row[self.dict_column]:
                # NOTE(review): assumes each gazetteer entry is a dict with a
                # "features" list — matches the Gazetteer column_schema above.
                self.vocab_builder.add_all(token_dict["features"])
    except GeneratorExit:
        # Stream exhausted: finalize the vocabulary.
        self.vocab = self.vocab_builder.make_vocab()
1013
1021
1014
1022
def numberize (self , row ):
1015
1023
"""
@@ -1169,6 +1177,7 @@ def __init__(
1169
1177
self .column = column
1170
1178
self .tokenizer = tokenizer or Tokenizer ()
1171
1179
self .vocab = vocab
1180
+ self .vocab_builder = None
1172
1181
self .add_bos_token = add_bos_token
1173
1182
self .add_eos_token = add_eos_token
1174
1183
self .use_eos_token_for_bos = use_eos_token_for_bos
@@ -1181,24 +1190,25 @@ def __init__(
1181
1190
def column_schema (self ):
1182
1191
return [(self .column , List [str ])]
1183
1192
1184
- def initialize (self , vocab_builder = None ):
1193
+ def initialize (self , vocab_builder = None , from_scratch = True ):
1185
1194
"""Build vocabulary based on training corpus."""
1186
- if self .vocab :
1195
+ if self .vocab and from_scratch :
1187
1196
return
1188
- vocab_builder = vocab_builder or VocabBuilder ()
1189
- vocab_builder .use_bos = self .add_bos_token
1190
- vocab_builder .use_eos = self .add_eos_token
1191
- vocab_builder .use_bol = self .add_bol_token
1192
- vocab_builder .use_eol = self .add_eol_token
1197
+ if not self .vocab_builder :
1198
+ self .vocab_builder = vocab_builder or VocabBuilder ()
1199
+ self .vocab_builder .use_bos = self .add_bos_token
1200
+ self .vocab_builder .use_eos = self .add_eos_token
1201
+ self .vocab_builder .use_bol = self .add_bol_token
1202
+ self .vocab_builder .use_eol = self .add_eol_token
1193
1203
1194
1204
try :
1195
1205
while True :
1196
1206
row = yield
1197
1207
for raw_text in row [self .column ]:
1198
1208
tokenized = self .tokenizer .tokenize (raw_text )
1199
- vocab_builder .add_all ([t .value for t in tokenized ])
1209
+ self . vocab_builder .add_all ([t .value for t in tokenized ])
1200
1210
except GeneratorExit :
1201
- self .vocab = vocab_builder .make_vocab ()
1211
+ self .vocab = self . vocab_builder .make_vocab ()
1202
1212
1203
1213
_lookup_tokens = TokenTensorizer ._lookup_tokens
1204
1214
_tokenize = TokenTensorizer ._tokenize
@@ -1274,27 +1284,29 @@ def from_config(cls, config: Config):
1274
1284
def __init__ (self , column : str = Config .column , vocab = None ):
1275
1285
self .column = column
1276
1286
self .vocab = vocab
1287
+ self .vocab_builder = None
1277
1288
1278
1289
@property
1279
1290
def column_schema (self ):
1280
1291
return [(self .column , List [str ])]
1281
1292
1282
- def initialize (self , vocab_builder = None ):
1293
+ def initialize (self , vocab_builder = None , from_scratch = True ):
1283
1294
"""Build vocabulary based on training corpus."""
1284
- if self .vocab :
1295
+ if self .vocab and from_scratch :
1285
1296
return
1286
- vocab_builder = vocab_builder or VocabBuilder ()
1287
- vocab_builder .use_unk = False
1288
- vocab_builder .use_pad = False
1297
+ if not self .vocab_builder :
1298
+ self .vocab_builder = vocab_builder or VocabBuilder ()
1299
+ self .vocab_builder .use_unk = False
1300
+ self .vocab_builder .use_pad = False
1289
1301
1290
1302
try :
1291
1303
while True :
1292
1304
row = yield
1293
1305
annotation = Annotation (row [self .column ])
1294
1306
actions = annotation .tree .to_actions ()
1295
- vocab_builder .add_all (actions )
1307
+ self . vocab_builder .add_all (actions )
1296
1308
except GeneratorExit :
1297
- self .vocab = vocab_builder .make_vocab ()
1309
+ self .vocab = self . vocab_builder .make_vocab ()
1298
1310
self .shift_idx = self .vocab .idx [SHIFT ]
1299
1311
self .reduce_idx = self .vocab .idx [REDUCE ]
1300
1312
@@ -1378,11 +1390,16 @@ def tensorize(self, batch):
1378
1390
return cuda .tensor (batch , torch .float )
1379
1391
1380
1392
1381
- def initialize_tensorizers (tensorizers , data_source ):
1393
+ def initialize_tensorizers (tensorizers , data_source , from_scratch = True ):
1382
1394
"""A utility function to stream a data source to the initialize functions
1383
1395
of a dict of tensorizers."""
1384
1396
initializers = []
1385
- for init in [tensorizer .initialize () for tensorizer in tensorizers .values ()]:
1397
+ for init in [
1398
+ tensorizer .initialize (from_scratch = from_scratch )
1399
+ if hasattr (tensorizer , "vocab" )
1400
+ else tensorizer .initialize ()
1401
+ for tensorizer in tensorizers .values ()
1402
+ ]:
1386
1403
try :
1387
1404
init .send (None ) # kick
1388
1405
initializers .append (init )
0 commit comments