Skip to content

Commit 4c104f0

Browse files
authored
Custom parsers / null literals should use pre-legalized names (issue #190) (#223)
1 parent 4cc4156 commit 4c104f0

File tree

3 files changed

+80
-10
lines changed

3 files changed

+80
-10
lines changed

src/main/java/io/deephaven/csv/CsvSpecs.java

+6-3
Original file line numberDiff line numberDiff line change
@@ -105,9 +105,12 @@ public interface Builder {
105105
Builder customTimeZoneParser(Tokenizer.CustomTimeZoneParser customTimeZoneParser);
106106

107107
/**
108-
* An optional legalizer for column headers. The legalizer is a function that takes column names (as a
109-
* {@code String[]}) and returns legal column names (as a {@code String[]}). The legalizer function is
110-
* permitted to reuse its input data structure. Defaults to {@code Function#identity()}.
108+
* An optional legalizer for column headers. The legalizer is a function that takes the column names from the
109+
* input (as a {@code String[]}) and returns "legal" column names (as a {@code String[]}). What constitutes
110+
* "legal" is entirely up to the caller. For example, some applications cannot tolerate punctuation characters
111+
* in column names and need to remove them. The CSV library itself has no limitations with regard to column
112+
* names. The legalizer function is permitted to return the input array (perhaps with some elements modified) as
113+
* the return value. Defaults to {@code Function#identity()}.
111114
*/
112115
Builder headerLegalizer(Function<String[], String[]> headerLegalizer);
113116

src/main/java/io/deephaven/csv/reading/CsvReader.java

+9-7
Original file line numberDiff line numberDiff line change
@@ -95,9 +95,7 @@ private static Result delimitedReadLogic(
9595
headersTemp2 = headersTemp;
9696
}
9797
final int numOutputCols = headersTemp2.length;
98-
final String[] headersToUse = canonicalizeHeaders(specs, headersTemp2);
99-
100-
return commonReadLogic(specs, grabber, firstDataRow, numInputCols, numOutputCols, headersToUse, sinkFactory);
98+
return commonReadLogic(specs, grabber, firstDataRow, numInputCols, numOutputCols, headersTemp2, sinkFactory);
10199
}
102100

103101
private static Result fixedReadLogic(
@@ -113,14 +111,17 @@ private static Result fixedReadLogic(
113111

114112
private static Result commonReadLogic(final CsvSpecs specs, CellGrabber grabber, byte[][] optionalFirstDataRow,
115113
int numInputCols, int numOutputCols,
116-
String[] headersToUse, final SinkFactory sinkFactory)
114+
String[] headersBeforeLegalization, final SinkFactory sinkFactory)
117115
throws CsvReaderException {
116+
118117
final String[][] nullValueLiteralsToUse = new String[numOutputCols][];
119118
for (int ii = 0; ii < numOutputCols; ++ii) {
120119
nullValueLiteralsToUse[ii] =
121-
calcNullValueLiteralsToUse(specs, headersToUse[ii], ii).toArray(new String[0]);
120+
calcNullValueLiteralsToUse(specs, headersBeforeLegalization[ii], ii).toArray(new String[0]);
122121
}
123122

123+
final String[] headersToUse = canonicalizeHeaders(specs, headersBeforeLegalization);
124+
124125
// Create a DenseStorageWriter for each column. The arrays are sized to "numInputCols" but only populated up to
125126
// "numOutputCols". The remaining (numInputCols - numOutputCols) are set to null. The code in
126127
// parseInputToDenseStorge knows that having a null DenseStorageWriter means that the column is all-empty and
@@ -156,7 +157,7 @@ private static Result commonReadLogic(final CsvSpecs specs, CellGrabber grabber,
156157
final ArrayList<Future<Object>> sinkFutures = new ArrayList<>();
157158
try {
158159
for (int ii = 0; ii < numOutputCols; ++ii) {
159-
final List<Parser<?>> parsersToUse = calcParsersToUse(specs, headersToUse[ii], ii);
160+
final List<Parser<?>> parsersToUse = calcParsersToUse(specs, headersBeforeLegalization[ii], ii);
160161

161162
final int iiCopy = ii;
162163
final Future<Object> fcb = ecs.submit(
@@ -230,7 +231,8 @@ private static List<String> calcNullValueLiteralsToUse(final CsvSpecs specs, fin
230231
}
231232

232233
private static String[] canonicalizeHeaders(CsvSpecs specs, final String[] headers) throws CsvReaderException {
233-
final String[] legalized = specs.headerLegalizer().apply(headers);
234+
// legalizer is allowed to mutate the input in-place, so we clone it before passing it.
235+
final String[] legalized = specs.headerLegalizer().apply(headers.clone());
234236
final Set<String> unique = new HashSet<>();
235237
final List<String> repeats = new ArrayList<>();
236238
final List<String> invalidNames = new ArrayList<>();

src/test/java/io/deephaven/csv/CsvReaderTest.java

+65
Original file line numberDiff line numberDiff line change
@@ -41,6 +41,7 @@
4141
import java.util.Collections;
4242
import java.util.List;
4343
import java.util.function.BiFunction;
44+
import java.util.function.Function;
4445
import java.util.stream.Collectors;
4546
import java.util.stream.Stream;
4647

@@ -206,6 +207,70 @@ public void bug162() throws CsvReaderException {
206207
invokeTest(defaultCsvBuilder().parsers(Parsers.DEFAULT).build(), input, expected);
207208
}
208209

210+
/**
211+
* Reported in <a href="https://github.com/deephaven/deephaven-csv/issues/190">Deephaven CSV Issue #190</a>. That
212+
* issue report misidentifies the root cause as having to do with reserved keywords. This is not correct because the
213+
* library doesn't care whether a column header is a reserved keyword. The actual root cause is an interaction
214+
* between the user-supplied "legalizer" and user-specified parsers or null literals that are specified by column
215+
* names. Specifically the question is whether column names mentioned in {@link CsvSpecs.Builder#putParserForName}
216+
* and {@link CsvSpecs.Builder#putNullValueLiteralsForName} should refer to the name that the column had *before* it
217+
* was transformed by the legalizer, or *after*. The expected behavior is "before", but prior to this fix the
218+
* library was doing the "after" behavior. This is a parameterized test that invokes the behavior for {delimited,
219+
* fixed columns} x {without and with a legalizer}.
220+
*/
221+
@ParameterizedTest
222+
@CsvSource({"false,false", "false,true", "true,false", "true,true"})
223+
public void bug190(boolean hasFixedWidthColumns, boolean invokeLegalizer) throws CsvReaderException {
224+
// +++ is the null value literal for Col1
225+
// *** is the null value literal for Col2
226+
// ??? is the null value literal for Col3
227+
228+
final String input;
229+
230+
if (!hasFixedWidthColumns) {
231+
input = "Col1,Col2,Col3\n" +
232+
"+++,20,30\n" +
233+
"100,***,300\n" +
234+
"1000,2000,???\n";
235+
} else {
236+
input = "Col1 Col2 Col3\n" +
237+
"+++ 20 30\n" +
238+
"100 *** 300\n" +
239+
"1000 2000 ???\n";
240+
}
241+
242+
final String[] expectedColumnNames = !invokeLegalizer ? new String[] {"Col1", "Col2", "Col3"}
243+
: new String[] {"xyzCol1", "xyzCol2", "xyzCol3"};
244+
245+
final ColumnSet expected =
246+
ColumnSet.of(
247+
Column.ofValues(expectedColumnNames[0], Sentinels.NULL_LONG, (long) 100, (long) 1000),
248+
Column.ofValues(expectedColumnNames[1], (double) 20, Sentinels.NULL_DOUBLE, (double) 2000),
249+
Column.ofRefs(expectedColumnNames[2], "30", "300", null));
250+
251+
Function<String[], String[]> legalizer = in -> {
252+
for (int i = 0; i != in.length; ++i) {
253+
// e.g. transform Col1 to xyzCol1
254+
in[i] = "xyz" + in[i];
255+
}
256+
return in;
257+
};
258+
259+
CsvSpecs.Builder specsBase =
260+
defaultCsvBuilder().hasFixedWidthColumns(hasFixedWidthColumns).parsers(Parsers.DEFAULT)
261+
.putParserForName("Col1", Parsers.LONG).putParserForName("Col2", Parsers.DOUBLE)
262+
.putParserForName("Col3", Parsers.STRING)
263+
.putNullValueLiteralsForName("Col1", Collections.singletonList("+++"))
264+
.putNullValueLiteralsForName("Col2", Collections.singletonList("***"))
265+
.putNullValueLiteralsForName("Col3", Collections.singletonList("???"));
266+
267+
if (invokeLegalizer) {
268+
specsBase = specsBase.headerLegalizer(legalizer);
269+
}
270+
271+
invokeTest(specsBase.build(), input, expected);
272+
}
273+
209274
@Test
210275
public void validates() {
211276
final String lengthyMessage = "CsvSpecs failed validation for the following reasons: "

0 commit comments

Comments
 (0)