@@ -105,14 +105,19 @@ namespace {
  * 6 ''  aa  b__ccc__
  * ```
  *
- * @tparam Tokenizer provides unique functions for split/rsplit.
- * @param strings_column The strings to split
+ * @tparam Tokenizer provides unique functions for split/rsplit
+ * @tparam DelimiterFn Functor for locating delimiters
+ * @param input The strings to split
  * @param tokenizer Tokenizer for counting and producing tokens
+ * @param delimiter_fn Functor called on each byte to check for delimiters
+ * @param stream CUDA stream used for device memory operations and kernel launches
+ * @param mr Device memory resource used to allocate the returned objects' device memory
  * @return table of columns for the output of the split
  */
-template <typename Tokenizer>
+template <typename Tokenizer, typename DelimiterFn>
 std::unique_ptr<table> split_fn(strings_column_view const& input,
                                 Tokenizer tokenizer,
+                                DelimiterFn delimiter_fn,
                                 rmm::cuda_stream_view stream,
                                 rmm::device_async_resource_ref mr)
 {
@@ -123,7 +128,7 @@ std::unique_ptr<table> split_fn(strings_column_view const& input,
   }

   // builds the offsets and the vector of all tokens
-  auto [offsets, tokens] = split_helper(input, tokenizer, stream, mr);
+  auto [offsets, tokens] = split_helper(input, tokenizer, delimiter_fn, stream, mr);
   auto const d_offsets = cudf::detail::offsetalator_factory::make_input_iterator(offsets->view());
   auto const d_tokens  = tokens.data();

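Not part of the diff: the new `delimiter_fn` argument is documented above as a functor called on each byte to check for delimiters. Below is a minimal host-side sketch of that idea; the struct name, members, and `operator()` signature are illustrative assumptions, not cudf's actual device functor, which is a `__device__` callable operating on device memory.

```cpp
#include <cassert>
#include <cstring>
#include <string>

// Hypothetical delimiter functor: holds the delimiter bytes and reports whether
// the delimiter starts at a given byte offset within a string. The matching
// logic is the same shape the tokenizer needs when producing tokens.
struct string_delimiter_fn_sketch {
  char const* delimiter;  // delimiter bytes
  int delimiter_size;     // number of bytes in the delimiter

  bool operator()(char const* str, int str_size, int byte_index) const
  {
    if (byte_index + delimiter_size > str_size) { return false; }
    return std::memcmp(str + byte_index, delimiter, delimiter_size) == 0;
  }
};

int main()
{
  std::string const s = "aa_bb_cc";
  string_delimiter_fn_sketch is_delim{"_", 1};
  assert(is_delim(s.data(), static_cast<int>(s.size()), 2));   // '_' after "aa"
  assert(!is_delim(s.data(), static_cast<int>(s.size()), 0));  // 'a' is not the delimiter
  return 0;
}
```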
@@ -386,7 +391,7 @@ std::unique_ptr<table> whitespace_split_fn(size_type strings_count,

 }  // namespace

-std::unique_ptr<table> split(strings_column_view const& strings_column,
+std::unique_ptr<table> split(strings_column_view const& input,
                              string_scalar const& delimiter,
                              size_type maxsplit,
                              rmm::cuda_stream_view stream,
@@ -396,20 +401,18 @@ std::unique_ptr<table> split(strings_column_view const& strings_column,

   size_type max_tokens = maxsplit > 0 ? maxsplit + 1 : std::numeric_limits<size_type>::max();

-  auto strings_device_view = column_device_view::create(strings_column.parent(), stream);
+  auto d_strings = column_device_view::create(input.parent(), stream);
   if (delimiter.size() == 0) {
-    return whitespace_split_fn(strings_column.size(),
-                               whitespace_split_tokenizer_fn{*strings_device_view, max_tokens},
-                               stream,
-                               mr);
+    return whitespace_split_fn(
+      input.size(), whitespace_split_tokenizer_fn{*d_strings, max_tokens}, stream, mr);
   }

-  string_view d_delimiter(delimiter.data(), delimiter.size());
-  return split_fn(
-    strings_column, split_tokenizer_fn{*strings_device_view, d_delimiter, max_tokens}, stream, mr);
+  auto tokenizer    = split_tokenizer_fn{*d_strings, delimiter.size(), max_tokens};
+  auto delimiter_fn = string_delimiter_fn{delimiter.value(stream)};
+  return split_fn(input, tokenizer, delimiter_fn, stream, mr);
 }

-std::unique_ptr<table> rsplit(strings_column_view const& strings_column,
+std::unique_ptr<table> rsplit(strings_column_view const& input,
                               string_scalar const& delimiter,
                               size_type maxsplit,
                               rmm::cuda_stream_view stream,
@@ -419,17 +422,15 @@ std::unique_ptr<table> rsplit(strings_column_view const& strings_column,

   size_type max_tokens = maxsplit > 0 ? maxsplit + 1 : std::numeric_limits<size_type>::max();

-  auto strings_device_view = column_device_view::create(strings_column.parent(), stream);
+  auto d_strings = column_device_view::create(input.parent(), stream);
   if (delimiter.size() == 0) {
-    return whitespace_split_fn(strings_column.size(),
-                               whitespace_rsplit_tokenizer_fn{*strings_device_view, max_tokens},
-                               stream,
-                               mr);
+    return whitespace_split_fn(
+      input.size(), whitespace_rsplit_tokenizer_fn{*d_strings, max_tokens}, stream, mr);
   }

-  string_view d_delimiter(delimiter.data(), delimiter.size());
-  return split_fn(
-    strings_column, rsplit_tokenizer_fn{*strings_device_view, d_delimiter, max_tokens}, stream, mr);
+  auto tokenizer    = rsplit_tokenizer_fn{*d_strings, delimiter.size(), max_tokens};
+  auto delimiter_fn = string_delimiter_fn{delimiter.value(stream)};
+  return split_fn(input, tokenizer, delimiter_fn, stream, mr);
 }

 }  // namespace detail
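For context beyond the diff, a usage sketch of the public API these detail functions back. It assumes the public header `<cudf/strings/split/split.hpp>` and the `cudf::test::strings_column_wrapper` helper from `cudf_test`; an empty delimiter takes the whitespace path, a non-empty one the delimiter path shown above.

```cpp
#include <cudf/scalar/scalar.hpp>
#include <cudf/strings/split/split.hpp>
#include <cudf/strings/strings_column_view.hpp>
#include <cudf/table/table.hpp>

#include <cudf_test/column_wrapper.hpp>

int main()
{
  // Three rows; splitting on '_' yields at most three token columns.
  cudf::test::strings_column_wrapper input({"a_b_c", "d_e", "f"});

  // Non-empty delimiter: exercises the split_tokenizer_fn + string_delimiter_fn path.
  auto by_delim =
    cudf::strings::split(cudf::strings_column_view{input}, cudf::string_scalar{"_"});

  // Empty (default) delimiter: exercises the whitespace tokenizer path.
  auto by_whitespace = cudf::strings::split(cudf::strings_column_view{input});

  return (by_delim->num_columns() == 3 && by_whitespace->num_columns() == 1) ? 0 : 1;
}
```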