Skip to content

Commit f9293f1

Browse files
authored
Add fixed-width column support (#220)
1 parent 1c52d59 commit f9293f1

File tree

5 files changed

+948
-13
lines changed

5 files changed

+948
-13
lines changed

src/main/java/io/deephaven/csv/CsvSpecs.java

+102-6
Original file line numberDiff line numberDiff line change
@@ -24,7 +24,7 @@
2424
public abstract class CsvSpecs {
2525
public interface Builder {
2626
/**
27-
* Copy all of the parameters from {@code specs} into {@code this} builder.
27+
* Copy all the parameters from {@code specs} into {@code this} builder.
2828
*/
2929
Builder from(CsvSpecs specs);
3030

@@ -117,6 +117,34 @@ public interface Builder {
117117
*/
118118
Builder headerValidator(Predicate<String> headerValidator);
119119

120+
/**
121+
* True if the input is organized into fixed width columns rather than delimited by a delimiter.
122+
*/
123+
Builder hasFixedWidthColumns(boolean hasFixedWidthColumns);
124+
125+
/**
126+
* When {@link #hasFixedWidthColumns} is set, the library either determines the column widths from the header
127+
* row (provided {@link #hasHeaderRow} is set), or the column widths can be specified explicitly by the caller.
128+
* If the caller wants to specify them explicitly, they can use this method. It is an error to set this
129+
* parameter if {@link #hasFixedWidthColumns} is false. Note that because the library is tolerant of the last
130+
* cell being shorter or wider than expected, the value specified here for the width of the last column is
131+
* simply a placeholder; its value is ignored.
132+
*/
133+
Builder fixedColumnWidths(Iterable<Integer> fixedColumnWidths);
134+
135+
/**
136+
* This setting controls what units fixed width columns are measured in. When true, fixed width columns are
137+
* measured in Unicode code points. When false, fixed width columns are measured in UTF-16 units (aka Java
138+
* chars). The difference arises when encountering characters outside the Unicode Basic Multilingual Plane. For
139+
* example, the Unicode code point 💔 (U+1F494) is one Unicode code point, but takes two Java chars to
140+
* represent. Along these lines, the string 💔💔💔 would fit in a column of width 3 when utf32CountingMode is
141+
* true, but would require a column width of at least 6 when utf32CountingMode is false. The default setting of
142+
* true is arguably more natural for users (the number of characters they see matches the visual width of the
143+
* column). But some programs may want the value of false because they are counting Java chars. It is an error
144+
* to set this parameter if {@link #hasFixedWidthColumns} is false.
145+
*/
146+
Builder useUtf32CountingConvention(boolean useUtf32CountingConvention);
147+
120148
/**
121149
* Number of data rows to skip before processing data. This is useful when you want to parse data in chunks.
122150
* Typically used together with {@link Builder#numRows}. Defaults to 0.
@@ -160,7 +188,7 @@ public interface Builder {
160188

161189
/**
162190
* The field delimiter character (the character that separates one column from the next). Must be 7-bit ASCII.
163-
* Defaults to {code ','}.
191+
* Defaults to {@code ','}. It is an error to set this parameter if {@link #hasFixedWidthColumns} is true.
164192
*/
165193
Builder delimiter(char delimiter);
166194

@@ -179,6 +207,8 @@ public interface Builder {
179207
* <li>hello, there
180208
* <li>456
181209
* </ul>
210+
*
211+
* It is an error to set this parameter if {@link #hasFixedWidthColumns} is true.
182212
*/
183213
Builder quote(char quote);
184214

@@ -188,7 +218,8 @@ public interface Builder {
188218
Builder ignoreSurroundingSpaces(boolean ignoreSurroundingSpaces);
189219

190220
/**
191-
* Whether to trim leading and trailing blanks from inside quoted values. Defaults to {@code false}.
221+
* Whether to trim leading and trailing blanks from inside quoted values. Defaults to {@code false}. It is an
222+
* error to set this parameter if {@link #hasFixedWidthColumns} is true.
192223
*/
193224
Builder trim(boolean trim);
194225

@@ -224,6 +255,38 @@ void check() {
224255
if (!hasHeaderRow() && skipHeaderRows() > 0) {
225256
problems.add("skipHeaderRows != 0 but hasHeaderRow is not set");
226257
}
258+
259+
for (final Integer colWidth : fixedColumnWidths()) {
260+
if (colWidth < 1) {
261+
problems.add(String.format("Fixed column width %d is invalid", colWidth));
262+
}
263+
}
264+
265+
// Certain items must not be set in fixed-width column mode. Other items must not be set in delimited column
266+
// mode.
267+
if (hasFixedWidthColumns()) {
268+
final String format = "Incompatible parameters: can't set %s when hasFixedWidthColumns is true";
269+
if (quote() != defaultQuote) {
270+
problems.add(String.format(format, "quote"));
271+
}
272+
273+
if (delimiter() != defaultDelimiter) {
274+
problems.add(String.format(format, "delimiter"));
275+
}
276+
277+
if (trim() != defaultTrim) {
278+
problems.add(String.format(format, "trim"));
279+
}
280+
} else {
281+
final String format = "Incompatible parameters: can't set %s when hasFixedWidthColumns is false";
282+
if (fixedColumnWidths().size() != 0) {
283+
problems.add(String.format(format, "fixedColumnWidths"));
284+
}
285+
286+
if (useUtf32CountingConvention() != defaultUtf32CountingConvention) {
287+
problems.add(String.format(format, "useUtf32CountingConvention"));
288+
}
289+
}
227290
if (problems.isEmpty()) {
228291
return;
229292
}
@@ -340,6 +403,32 @@ public Predicate<String> headerValidator() {
340403
return c -> true;
341404
}
342405

406+
/**
407+
* See {@link Builder#hasFixedWidthColumns}.
408+
*/
409+
@Default
410+
public boolean hasFixedWidthColumns() {
411+
return false;
412+
}
413+
414+
/**
415+
* See {@link Builder#fixedColumnWidths}.
416+
*/
417+
@Default
418+
public List<Integer> fixedColumnWidths() {
419+
return Collections.emptyList();
420+
}
421+
422+
private static final boolean defaultUtf32CountingConvention = true;
423+
424+
/**
425+
* See {@link Builder#useUtf32CountingConvention}.
426+
*/
427+
@Default
428+
public boolean useUtf32CountingConvention() {
429+
return defaultUtf32CountingConvention;
430+
}
431+
343432
/**
344433
* See {@link Builder#skipRows}.
345434
*/
@@ -396,20 +485,25 @@ public long skipHeaderRows() {
396485
return 0;
397486
}
398487

488+
// Default field delimiter. check() compares delimiter() against this value to detect a caller-supplied delimiter.
// Declared static (like defaultQuote) so it is a class-level constant rather than a per-instance field.
private static final char defaultDelimiter = ',';
489+
399490
/**
400491
* See {@link Builder#delimiter}.
401492
*/
402493
@Default
403494
public char delimiter() {
404-
return ',';
495+
return defaultDelimiter;
405496
}
406497

498+
499+
private static final char defaultQuote = '"';
500+
407501
/**
408502
* See {@link Builder#quote}.
409503
*/
410504
@Default
411505
public char quote() {
412-
return '"';
506+
return defaultQuote;
413507
}
414508

415509
/**
@@ -420,12 +514,14 @@ public boolean ignoreSurroundingSpaces() {
420514
return true;
421515
}
422516

517+
// Default value of trim(). check() compares trim() against this value to detect a caller-supplied setting.
// Declared final so the shared default cannot be reassigned at runtime.
private static final boolean defaultTrim = false;
518+
423519
/**
424520
* See {@link Builder#trim}.
425521
*/
426522
@Default
427523
public boolean trim() {
428-
return false;
524+
return defaultTrim;
429525
}
430526

431527
/**

src/main/java/io/deephaven/csv/reading/CsvReader.java

+14-1
Original file line numberDiff line numberDiff line change
@@ -7,7 +7,9 @@
77
import io.deephaven.csv.parsers.Parser;
88
import io.deephaven.csv.reading.cells.CellGrabber;
99
import io.deephaven.csv.reading.cells.DelimitedCellGrabber;
10+
import io.deephaven.csv.reading.cells.FixedCellGrabber;
1011
import io.deephaven.csv.reading.headers.DelimitedHeaderFinder;
12+
import io.deephaven.csv.reading.headers.FixedHeaderFinder;
1113
import io.deephaven.csv.sinks.Sink;
1214
import io.deephaven.csv.sinks.SinkFactory;
1315
import io.deephaven.csv.util.*;
@@ -63,7 +65,8 @@ private CsvReader() {}
6365
*/
6466
public static Result read(final CsvSpecs specs, final InputStream stream, final SinkFactory sinkFactory)
6567
throws CsvReaderException {
66-
return delimitedReadLogic(specs, stream, sinkFactory);
68+
return specs.hasFixedWidthColumns() ? fixedReadLogic(specs, stream, sinkFactory)
69+
: delimitedReadLogic(specs, stream, sinkFactory);
6770
}
6871

6972
private static Result delimitedReadLogic(
@@ -97,6 +100,16 @@ private static Result delimitedReadLogic(
97100
return commonReadLogic(specs, grabber, firstDataRow, numInputCols, numOutputCols, headersToUse, sinkFactory);
98101
}
99102

103+
private static Result fixedReadLogic(
104+
final CsvSpecs specs, final InputStream stream, final SinkFactory sinkFactory) throws CsvReaderException {
105+
final CellGrabber lineGrabber = FixedCellGrabber.makeLineGrabber(stream);
106+
MutableObject<int[]> columnWidths = new MutableObject<>();
107+
final String[] headers = FixedHeaderFinder.determineHeadersToUse(specs, lineGrabber, columnWidths);
108+
final int numCols = headers.length;
109+
final CellGrabber grabber = new FixedCellGrabber(lineGrabber, columnWidths.getValue(),
110+
specs.ignoreSurroundingSpaces(), specs.useUtf32CountingConvention());
111+
return commonReadLogic(specs, grabber, null, numCols, numCols, headers, sinkFactory);
112+
}
100113

101114
private static Result commonReadLogic(final CsvSpecs specs, CellGrabber grabber, byte[][] optionalFirstDataRow,
102115
int numInputCols, int numOutputCols,
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,113 @@
1+
package io.deephaven.csv.reading.cells;
2+
3+
import io.deephaven.csv.containers.ByteSlice;
4+
import io.deephaven.csv.reading.ReaderUtil;
5+
import io.deephaven.csv.util.CsvReaderException;
6+
import io.deephaven.csv.util.MutableBoolean;
7+
import io.deephaven.csv.util.MutableInt;
8+
9+
import java.io.InputStream;
10+
11+
/**
12+
* This class uses an underlying DelimitedCellGrabber to grab whole lines at a time from the input stream, and then it
13+
* breaks them into fixed-sized cells to return to the caller.
14+
*/
15+
public class FixedCellGrabber implements CellGrabber {
16+
/**
17+
* Makes a degenerate CellGrabber that has no delimiters or quotes and therefore returns whole lines. This is a
18+
* somewhat quick-and-dirty way to reuse the buffering and newline logic in DelimitedCellGrabber without rewriting
19+
* it.
20+
*
21+
* @param stream The underlying stream.
22+
* @return The "line grabber"
23+
*/
24+
public static CellGrabber makeLineGrabber(InputStream stream) {
25+
final byte IllegalUtf8 = (byte) 0xff;
26+
return new DelimitedCellGrabber(stream, IllegalUtf8, IllegalUtf8, true, false);
27+
}
28+
29+
private final CellGrabber lineGrabber;
30+
private final int[] columnWidths;
31+
private final boolean ignoreSurroundingSpaces;
32+
private final boolean utf32CountingMode;
33+
private final ByteSlice rowText;
34+
private boolean needsUnderlyingRefresh;
35+
private int colIndex;
36+
private final MutableBoolean dummy1;
37+
private final MutableInt dummy2;
38+
39+
/** Constructor. */
40+
public FixedCellGrabber(final CellGrabber lineGrabber, final int[] columnWidths, boolean ignoreSurroundingSpaces,
41+
boolean utf32CountingMode) {
42+
this.lineGrabber = lineGrabber;
43+
this.columnWidths = columnWidths;
44+
this.ignoreSurroundingSpaces = ignoreSurroundingSpaces;
45+
this.utf32CountingMode = utf32CountingMode;
46+
this.rowText = new ByteSlice();
47+
this.needsUnderlyingRefresh = true;
48+
this.colIndex = 0;
49+
this.dummy1 = new MutableBoolean();
50+
this.dummy2 = new MutableInt();
51+
}
52+
53+
@Override
54+
public void grabNext(ByteSlice dest, MutableBoolean lastInRow, MutableBoolean endOfInput)
55+
throws CsvReaderException {
56+
if (needsUnderlyingRefresh) {
57+
// Underlying row used up, and all columns provided. Ask underlying CellGrabber for the next line.
58+
lineGrabber.grabNext(rowText, dummy1, endOfInput);
59+
60+
if (endOfInput.booleanValue()) {
61+
// Set dest to the empty string, and leave 'endOfInput' set to true.
62+
dest.reset(rowText.data(), rowText.end(), rowText.end());
63+
return;
64+
}
65+
66+
needsUnderlyingRefresh = false;
67+
colIndex = 0;
68+
}
69+
70+
// There is data to return. Count off N characters. The final column gets all remaining characters.
71+
final boolean lastCol = colIndex == columnWidths.length - 1;
72+
final int numCharsToTake = lastCol ? Integer.MAX_VALUE : columnWidths[colIndex];
73+
takeNCharactersInCharset(rowText, dest, numCharsToTake, utf32CountingMode, dummy2);
74+
++colIndex;
75+
needsUnderlyingRefresh = lastCol || dest.size() == 0;
76+
lastInRow.setValue(needsUnderlyingRefresh);
77+
endOfInput.setValue(false);
78+
79+
if (ignoreSurroundingSpaces) {
80+
ReaderUtil.trimSpacesAndTabs(dest);
81+
}
82+
}
83+
84+
private static void takeNCharactersInCharset(ByteSlice src, ByteSlice dest, int numCharsToTake,
85+
boolean utf32CountingMode, MutableInt tempInt) {
86+
final byte[] data = src.data();
87+
final int cellBegin = src.begin();
88+
int current = cellBegin;
89+
while (numCharsToTake > 0) {
90+
if (current == src.end()) {
91+
break;
92+
}
93+
final int utf8Length = ReaderUtil.getUtf8LengthAndCharLength(data[current], src.end() - current,
94+
utf32CountingMode, tempInt);
95+
if (numCharsToTake < tempInt.intValue()) {
96+
// There is not enough space left in the field to store this character.
97+
// This can happen if CsvSpecs is set for the UTF16 counting convention,
98+
// there is one unit left in the field, and we encounter a character outside
99+
// the Basic Multilingual Plane, which would require two units.
100+
break;
101+
}
102+
numCharsToTake -= tempInt.intValue();
103+
current += utf8Length;
104+
}
105+
dest.reset(src.data(), cellBegin, current);
106+
src.reset(src.data(), current, src.end());
107+
}
108+
109+
@Override
110+
public int physicalRowNum() {
111+
return lineGrabber.physicalRowNum();
112+
}
113+
}

0 commit comments

Comments
 (0)