suvodeep-pyne commented on code in PR #13913: URL: https://github.com/apache/pinot/pull/13913#discussion_r1742547048
########## pinot-plugins/pinot-input-format/pinot-csv/src/main/java/org/apache/pinot/plugin/inputformat/csv/CSVRecordReader.java: ########## @@ -60,102 +61,119 @@ public class CSVRecordReader implements RecordReader { private Map<String, Integer> _headerMap = new HashMap<>(); private boolean _isHeaderProvided = false; - // line iterator specific variables - private boolean _useLineIterator = false; - private boolean _skipHeaderRecord = false; - private long _skippedLinesCount; private BufferedReader _bufferedReader; - private String _nextLine; - private GenericRow _nextRecord; + private CSVRecordReaderConfig _config = null; public CSVRecordReader() { } - @Override - public void init(File dataFile, @Nullable Set<String> fieldsToRead, @Nullable RecordReaderConfig recordReaderConfig) - throws IOException { - _dataFile = dataFile; - CSVRecordReaderConfig config = (CSVRecordReaderConfig) recordReaderConfig; - Character multiValueDelimiter = null; - if (config == null) { - _format = CSVFormat.DEFAULT.builder().setDelimiter(CSVRecordReaderConfig.DEFAULT_DELIMITER).setHeader().build(); - multiValueDelimiter = CSVRecordReaderConfig.DEFAULT_MULTI_VALUE_DELIMITER; - } else { - CSVFormat format; - String formatString = config.getFileFormat(); - if (formatString == null) { - format = CSVFormat.DEFAULT; - } else { - switch (formatString.toUpperCase()) { - case "EXCEL": - format = CSVFormat.EXCEL; - break; - case "MYSQL": - format = CSVFormat.MYSQL; - break; - case "RFC4180": - format = CSVFormat.RFC4180; - break; - case "TDF": - format = CSVFormat.TDF; - break; - default: - format = CSVFormat.DEFAULT; - break; - } - } - char delimiter = config.getDelimiter(); - format = format.builder().setDelimiter(delimiter).build(); + private static CSVFormat baseCsvFormat(CSVRecordReaderConfig config) { + if (config.getFileFormat() == null) { + return CSVFormat.DEFAULT; + } + switch (config.getFileFormat().toUpperCase()) { + case "EXCEL": + return CSVFormat.EXCEL; + case "MYSQL": + return CSVFormat.MYSQL; + case "RFC4180": + return CSVFormat.RFC4180; + case "TDF": + return CSVFormat.TDF; + default: + return CSVFormat.DEFAULT; + } + } - if (config.isSkipUnParseableLines()) { - _useLineIterator = true; - } + private static CSVFormat defaultFormat() { + return CSVFormat.DEFAULT.builder().setDelimiter(CSVRecordReaderConfig.DEFAULT_DELIMITER).setHeader().build(); + } - _isHeaderProvided = config.getHeader() != null; - _skipHeaderRecord = config.isSkipHeader(); - _format = format.builder() - .setHeader() - .setSkipHeaderRecord(config.isSkipHeader()) - .setCommentMarker(config.getCommentMarker()) - .setEscape(config.getEscapeCharacter()) - .setIgnoreEmptyLines(config.isIgnoreEmptyLines()) - .setIgnoreSurroundingSpaces(config.isIgnoreSurroundingSpaces()) - .setQuote(config.getQuoteCharacter()) - .build(); - - if (config.getQuoteMode() != null) { - _format = _format.builder().setQuoteMode(QuoteMode.valueOf(config.getQuoteMode())).build(); - } + private static <T> Optional<T> optional(T value) { + return Optional.ofNullable(value); + } - if (config.getRecordSeparator() != null) { - _format = _format.builder().setRecordSeparator(config.getRecordSeparator()).build(); - } + private static CSVFormat.Builder formatBuilder(CSVRecordReaderConfig config) { + final CSVFormat.Builder builder = baseCsvFormat(config).builder().setDelimiter(config.getDelimiter()).setHeader() + .setSkipHeaderRecord(config.isSkipHeader()).setCommentMarker(config.getCommentMarker()) + .setEscape(config.getEscapeCharacter()).setIgnoreEmptyLines(config.isIgnoreEmptyLines()) + .setIgnoreSurroundingSpaces(config.isIgnoreSurroundingSpaces()).setQuote(config.getQuoteCharacter()); + + optional(config.getQuoteMode()).map(QuoteMode::valueOf).ifPresent(builder::setQuoteMode); + optional(config.getRecordSeparator()).ifPresent(builder::setRecordSeparator); + optional(config.getNullStringValue()).ifPresent(builder::setNullString); + return builder; + } - String nullString = config.getNullStringValue(); - if (nullString != null) { - _format = _format.builder().setNullString(nullString).build(); + private static Map<String, Integer> parseLineAsHeader(CSVFormat format, String line) + throws IOException { + try (StringReader stringReader = new StringReader(line)) { + try (CSVParser parser = format.parse(stringReader)) { + return parser.getHeaderMap(); } + } + } + + private static Character getMultiValueDelimiter(CSVRecordReaderConfig config) { + if (config == null) { + return CSVRecordReaderConfig.DEFAULT_MULTI_VALUE_DELIMITER; + } else if (config.isMultiValueDelimiterEnabled()) { + return config.getMultiValueDelimiter(); + } + return null; + } + private static boolean useLineIterator(CSVRecordReaderConfig config) { + return config != null && config.isSkipUnParseableLines(); + } + + @Override + public void init(File dataFile, @Nullable Set<String> fieldsToRead, @Nullable RecordReaderConfig recordReaderConfig) + throws IOException { + _dataFile = dataFile; + _config = (CSVRecordReaderConfig) recordReaderConfig; + if (_config == null) { + _format = defaultFormat(); + } else { + _isHeaderProvided = _config.getHeader() != null; + final CSVFormat.Builder builder = formatBuilder(_config); if (_isHeaderProvided) { - _headerMap = parseLineAsHeader(config.getHeader()); - _format = _format.builder().setHeader(_headerMap.keySet().toArray(new String[0])).build(); - if (!_useLineIterator) { - validateHeaderForDelimiter(delimiter, config.getHeader(), _format); - } + // use an intermediate format to parse the header line. It still needs to be updated later + _headerMap = parseLineAsHeader(builder.build(), _config.getHeader()); + builder.setHeader(_headerMap.keySet().toArray(new String[0])); } + _format = builder.build(); - if (config.isMultiValueDelimiterEnabled()) { - multiValueDelimiter = config.getMultiValueDelimiter(); + if (_isHeaderProvided) { Review Comment: yeah. I didn't really see a lot of benefit maintain a cached value of the config unless it is being iterated on or accessed frequently. The `CSVRecordExtractor` uses this to capture multi value fields. `CSVRecordReader` doesn't use it. -- This is an automated message from the Apache Git Service. To respond to the message, please log on to GitHub and use the URL above to go to the specific comment. To unsubscribe, e-mail: commits-unsubscr...@pinot.apache.org For queries about this service, please contact Infrastructure at: us...@infra.apache.org --------------------------------------------------------------------- To unsubscribe, e-mail: commits-unsubscr...@pinot.apache.org For additional commands, e-mail: commits-h...@pinot.apache.org