suvodeep-pyne commented on code in PR #13913: URL: https://github.com/apache/pinot/pull/13913#discussion_r1742543940
########## pinot-plugins/pinot-input-format/pinot-csv/src/main/java/org/apache/pinot/plugin/inputformat/csv/CSVRecordReader.java: ########## @@ -60,102 +61,119 @@ public class CSVRecordReader implements RecordReader { private Map<String, Integer> _headerMap = new HashMap<>(); private boolean _isHeaderProvided = false; - // line iterator specific variables - private boolean _useLineIterator = false; - private boolean _skipHeaderRecord = false; - private long _skippedLinesCount; private BufferedReader _bufferedReader; - private String _nextLine; - private GenericRow _nextRecord; + private CSVRecordReaderConfig _config = null; public CSVRecordReader() { } - @Override - public void init(File dataFile, @Nullable Set<String> fieldsToRead, @Nullable RecordReaderConfig recordReaderConfig) - throws IOException { - _dataFile = dataFile; - CSVRecordReaderConfig config = (CSVRecordReaderConfig) recordReaderConfig; - Character multiValueDelimiter = null; - if (config == null) { - _format = CSVFormat.DEFAULT.builder().setDelimiter(CSVRecordReaderConfig.DEFAULT_DELIMITER).setHeader().build(); - multiValueDelimiter = CSVRecordReaderConfig.DEFAULT_MULTI_VALUE_DELIMITER; - } else { - CSVFormat format; - String formatString = config.getFileFormat(); - if (formatString == null) { - format = CSVFormat.DEFAULT; - } else { - switch (formatString.toUpperCase()) { - case "EXCEL": - format = CSVFormat.EXCEL; - break; - case "MYSQL": - format = CSVFormat.MYSQL; - break; - case "RFC4180": - format = CSVFormat.RFC4180; - break; - case "TDF": - format = CSVFormat.TDF; - break; - default: - format = CSVFormat.DEFAULT; - break; - } - } - char delimiter = config.getDelimiter(); - format = format.builder().setDelimiter(delimiter).build(); + private static CSVFormat baseCsvFormat(CSVRecordReaderConfig config) { + if (config.getFileFormat() == null) { + return CSVFormat.DEFAULT; + } + switch (config.getFileFormat().toUpperCase()) { + case "EXCEL": + return CSVFormat.EXCEL; + case "MYSQL": + return CSVFormat.MYSQL; + case "RFC4180": + return CSVFormat.RFC4180; + case "TDF": + return CSVFormat.TDF; + default: + return CSVFormat.DEFAULT; + } + } - if (config.isSkipUnParseableLines()) { - _useLineIterator = true; - } + private static CSVFormat defaultFormat() { + return CSVFormat.DEFAULT.builder().setDelimiter(CSVRecordReaderConfig.DEFAULT_DELIMITER).setHeader().build(); + } - _isHeaderProvided = config.getHeader() != null; - _skipHeaderRecord = config.isSkipHeader(); - _format = format.builder() - .setHeader() - .setSkipHeaderRecord(config.isSkipHeader()) - .setCommentMarker(config.getCommentMarker()) - .setEscape(config.getEscapeCharacter()) - .setIgnoreEmptyLines(config.isIgnoreEmptyLines()) - .setIgnoreSurroundingSpaces(config.isIgnoreSurroundingSpaces()) - .setQuote(config.getQuoteCharacter()) - .build(); - - if (config.getQuoteMode() != null) { - _format = _format.builder().setQuoteMode(QuoteMode.valueOf(config.getQuoteMode())).build(); - } + private static <T> Optional<T> optional(T value) { + return Optional.ofNullable(value); + } - if (config.getRecordSeparator() != null) { - _format = _format.builder().setRecordSeparator(config.getRecordSeparator()).build(); - } + private static CSVFormat.Builder formatBuilder(CSVRecordReaderConfig config) { + final CSVFormat.Builder builder = baseCsvFormat(config).builder().setDelimiter(config.getDelimiter()).setHeader() + .setSkipHeaderRecord(config.isSkipHeader()).setCommentMarker(config.getCommentMarker()) + .setEscape(config.getEscapeCharacter()).setIgnoreEmptyLines(config.isIgnoreEmptyLines()) + .setIgnoreSurroundingSpaces(config.isIgnoreSurroundingSpaces()).setQuote(config.getQuoteCharacter()); + + optional(config.getQuoteMode()).map(QuoteMode::valueOf).ifPresent(builder::setQuoteMode); + optional(config.getRecordSeparator()).ifPresent(builder::setRecordSeparator); + optional(config.getNullStringValue()).ifPresent(builder::setNullString); + return builder; + } - String nullString = config.getNullStringValue(); - if (nullString != null) { - _format = _format.builder().setNullString(nullString).build(); + private static Map<String, Integer> parseLineAsHeader(CSVFormat format, String line) + throws IOException { + try (StringReader stringReader = new StringReader(line)) { + try (CSVParser parser = format.parse(stringReader)) { + return parser.getHeaderMap(); } + } + } + + private static Character getMultiValueDelimiter(CSVRecordReaderConfig config) { + if (config == null) { + return CSVRecordReaderConfig.DEFAULT_MULTI_VALUE_DELIMITER; + } else if (config.isMultiValueDelimiterEnabled()) { + return config.getMultiValueDelimiter(); + } + return null; Review Comment: `null` means multi value is disabled. by default it is on and is also customizable. -- This is an automated message from the Apache Git Service. To respond to the message, please log on to GitHub and use the URL above to go to the specific comment. To unsubscribe, e-mail: commits-unsubscr...@pinot.apache.org For queries about this service, please contact Infrastructure at: us...@infra.apache.org --------------------------------------------------------------------- To unsubscribe, e-mail: commits-unsubscr...@pinot.apache.org For additional commands, e-mail: commits-h...@pinot.apache.org