Skip to content

Commit 4c104f0

Browse files
authored
Custom parsers / null literals should use pre-legalized names (issue #190) (#223)
1 parent 4cc4156 commit 4c104f0

File tree

3 files changed

+80
-10
lines changed

3 files changed

+80
-10
lines changed

src/main/java/io/deephaven/csv/CsvSpecs.java

+6-3
Original file line numberDiff line numberDiff line change
@@ -105,9 +105,12 @@ public interface Builder {
105105
Builder customTimeZoneParser(Tokenizer.CustomTimeZoneParser customTimeZoneParser);
106106

107107
/**
108-
* An optional legalizer for column headers. The legalizer is a function that takes column names (as a
109-
* {@code String[]}) and returns legal column names (as a {@code String[]}). The legalizer function is
110-
* permitted to reuse its input data structure. Defaults to {@code Function#identity()}.
108+
* An optional legalizer for column headers. The legalizer is a function that takes the column names from the
109+
* input (as a {@code String[]}) and returns "legal" column names (as a {@code String[]}). What constitutes
110+
* "legal" is entirely up to the caller. For example, some applications cannot tolerate punctuation characters
111+
* in column names and need to remove them. The CSV library itself has no limitations with regard to column
112+
* names. The legalizer function is permitted to return the input array (perhaps with some elements modified) as
113+
* the return value. Defaults to {@code Function#identity()}.
111114
*/
112115
Builder headerLegalizer(Function<String[], String[]> headerLegalizer);
113116

src/main/java/io/deephaven/csv/reading/CsvReader.java

+9-7
Original file line numberDiff line numberDiff line change
@@ -95,9 +95,7 @@ private static Result delimitedReadLogic(
9595
headersTemp2 = headersTemp;
9696
}
9797
final int numOutputCols = headersTemp2.length;
98-
final String[] headersToUse = canonicalizeHeaders(specs, headersTemp2);
99-
100-
return commonReadLogic(specs, grabber, firstDataRow, numInputCols, numOutputCols, headersToUse, sinkFactory);
98+
return commonReadLogic(specs, grabber, firstDataRow, numInputCols, numOutputCols, headersTemp2, sinkFactory);
10199
}
102100

103101
private static Result fixedReadLogic(
@@ -113,14 +111,17 @@ private static Result fixedReadLogic(
113111

114112
private static Result commonReadLogic(final CsvSpecs specs, CellGrabber grabber, byte[][] optionalFirstDataRow,
115113
int numInputCols, int numOutputCols,
116-
String[] headersToUse, final SinkFactory sinkFactory)
114+
String[] headersBeforeLegalization, final SinkFactory sinkFactory)
117115
throws CsvReaderException {
116+
118117
final String[][] nullValueLiteralsToUse = new String[numOutputCols][];
119118
for (int ii = 0; ii < numOutputCols; ++ii) {
120119
nullValueLiteralsToUse[ii] =
121-
calcNullValueLiteralsToUse(specs, headersToUse[ii], ii).toArray(new String[0]);
120+
calcNullValueLiteralsToUse(specs, headersBeforeLegalization[ii], ii).toArray(new String[0]);
122121
}
123122

123+
final String[] headersToUse = canonicalizeHeaders(specs, headersBeforeLegalization);
124+
124125
// Create a DenseStorageWriter for each column. The arrays are sized to "numInputCols" but only populated up to
125126
// "numOutputCols". The remaining (numInputCols - numOutputCols) are set to null. The code in
126127
// parseInputToDenseStorge knows that having a null DenseStorageWriter means that the column is all-empty and
@@ -156,7 +157,7 @@ private static Result commonReadLogic(final CsvSpecs specs, CellGrabber grabber,
156157
final ArrayList<Future<Object>> sinkFutures = new ArrayList<>();
157158
try {
158159
for (int ii = 0; ii < numOutputCols; ++ii) {
159-
final List<Parser<?>> parsersToUse = calcParsersToUse(specs, headersToUse[ii], ii);
160+
final List<Parser<?>> parsersToUse = calcParsersToUse(specs, headersBeforeLegalization[ii], ii);
160161

161162
final int iiCopy = ii;
162163
final Future<Object> fcb = ecs.submit(
@@ -230,7 +231,8 @@ private static List<String> calcNullValueLiteralsToUse(final CsvSpecs specs, fin
230231
}
231232

232233
private static String[] canonicalizeHeaders(CsvSpecs specs, final String[] headers) throws CsvReaderException {
233-
final String[] legalized = specs.headerLegalizer().apply(headers);
234+
// legalizer is allowed to mutate the input in-place, so we clone it before passing it.
235+
final String[] legalized = specs.headerLegalizer().apply(headers.clone());
234236
final Set<String> unique = new HashSet<>();
235237
final List<String> repeats = new ArrayList<>();
236238
final List<String> invalidNames = new ArrayList<>();

src/test/java/io/deephaven/csv/CsvReaderTest.java

+65
Original file line numberDiff line numberDiff line change
@@ -41,6 +41,7 @@
4141
import java.util.Collections;
4242
import java.util.List;
4343
import java.util.function.BiFunction;
44+
import java.util.function.Function;
4445
import java.util.stream.Collectors;
4546
import java.util.stream.Stream;
4647

@@ -206,6 +207,70 @@ public void bug162() throws CsvReaderException {
206207
invokeTest(defaultCsvBuilder().parsers(Parsers.DEFAULT).build(), input, expected);
207208
}
208209

210+
/**
211+
* Reported in <a href="https://github.com/deephaven/deephaven-csv/issues/190">Deephaven CSV Issue #190</a>. That
212+
* issue report misidentifies the root cause as having to do with reserved keywords. This is not correct because the
213+
* library doesn't care whether a column header is a reserved keyword. The actual root cause is an interaction
214+
* between the user-supplied "legalizer" and user-specified parsers or null literals that are specified by column
215+
* names. Specifically the question is whether column names mentioned in {@link CsvSpecs.Builder#putParserForName}
216+
* and {@link CsvSpecs.Builder#putNullValueLiteralsForName} should refer to the name that the column had *before* it
217+
* was transformed by the legalizer, or *after*. The expected behavior is "before", but prior to this fix the
218+
* library was doing the "after" behavior. This is a parameterized test that invokes the behavior for {delimited,
219+
* fixed columns} x {without and with a legalizer}.
220+
*/
221+
@ParameterizedTest
222+
@CsvSource({"false,false", "false,true", "true,false", "true,true"})
223+
public void bug190(boolean hasFixedWidthColumns, boolean invokeLegalizer) throws CsvReaderException {
224+
// +++ is the null value literal for Col1
225+
// *** is the null value literal for Col2
226+
// ??? is the null value literal for Col3
227+
228+
final String input;
229+
230+
if (!hasFixedWidthColumns) {
231+
input = "Col1,Col2,Col3\n" +
232+
"+++,20,30\n" +
233+
"100,***,300\n" +
234+
"1000,2000,???\n";
235+
} else {
236+
input = "Col1 Col2 Col3\n" +
237+
"+++ 20 30\n" +
238+
"100 *** 300\n" +
239+
"1000 2000 ???\n";
240+
}
241+
242+
final String[] expectedColumnNames = !invokeLegalizer ? new String[] {"Col1", "Col2", "Col3"}
243+
: new String[] {"xyzCol1", "xyzCol2", "xyzCol3"};
244+
245+
final ColumnSet expected =
246+
ColumnSet.of(
247+
Column.ofValues(expectedColumnNames[0], Sentinels.NULL_LONG, (long) 100, (long) 1000),
248+
Column.ofValues(expectedColumnNames[1], (double) 20, Sentinels.NULL_DOUBLE, (double) 2000),
249+
Column.ofRefs(expectedColumnNames[2], "30", "300", null));
250+
251+
Function<String[], String[]> legalizer = in -> {
252+
for (int i = 0; i != in.length; ++i) {
253+
// e.g. transform Col1 to xyzCol1
254+
in[i] = "xyz" + in[i];
255+
}
256+
return in;
257+
};
258+
259+
CsvSpecs.Builder specsBase =
260+
defaultCsvBuilder().hasFixedWidthColumns(hasFixedWidthColumns).parsers(Parsers.DEFAULT)
261+
.putParserForName("Col1", Parsers.LONG).putParserForName("Col2", Parsers.DOUBLE)
262+
.putParserForName("Col3", Parsers.STRING)
263+
.putNullValueLiteralsForName("Col1", Collections.singletonList("+++"))
264+
.putNullValueLiteralsForName("Col2", Collections.singletonList("***"))
265+
.putNullValueLiteralsForName("Col3", Collections.singletonList("???"));
266+
267+
if (invokeLegalizer) {
268+
specsBase = specsBase.headerLegalizer(legalizer);
269+
}
270+
271+
invokeTest(specsBase.build(), input, expected);
272+
}
273+
209274
@Test
210275
public void validates() {
211276
final String lengthyMessage = "CsvSpecs failed validation for the following reasons: "

0 commit comments

Comments
 (0)