suvodeep-pyne commented on code in PR #13913:
URL: https://github.com/apache/pinot/pull/13913#discussion_r1742543940


##########
pinot-plugins/pinot-input-format/pinot-csv/src/main/java/org/apache/pinot/plugin/inputformat/csv/CSVRecordReader.java:
##########
@@ -60,102 +61,119 @@ public class CSVRecordReader implements RecordReader {
   private Map<String, Integer> _headerMap = new HashMap<>();
   private boolean _isHeaderProvided = false;
 
-  // line iterator specific variables
-  private boolean _useLineIterator = false;
-  private boolean _skipHeaderRecord = false;
-  private long _skippedLinesCount;
   private BufferedReader _bufferedReader;
-  private String _nextLine;
-  private GenericRow _nextRecord;
+  private CSVRecordReaderConfig _config = null;
 
   public CSVRecordReader() {
   }
 
-  @Override
-  public void init(File dataFile, @Nullable Set<String> fieldsToRead, 
@Nullable RecordReaderConfig recordReaderConfig)
-      throws IOException {
-    _dataFile = dataFile;
-    CSVRecordReaderConfig config = (CSVRecordReaderConfig) recordReaderConfig;
-    Character multiValueDelimiter = null;
-    if (config == null) {
-      _format = 
CSVFormat.DEFAULT.builder().setDelimiter(CSVRecordReaderConfig.DEFAULT_DELIMITER).setHeader().build();
-      multiValueDelimiter = 
CSVRecordReaderConfig.DEFAULT_MULTI_VALUE_DELIMITER;
-    } else {
-      CSVFormat format;
-      String formatString = config.getFileFormat();
-      if (formatString == null) {
-        format = CSVFormat.DEFAULT;
-      } else {
-        switch (formatString.toUpperCase()) {
-          case "EXCEL":
-            format = CSVFormat.EXCEL;
-            break;
-          case "MYSQL":
-            format = CSVFormat.MYSQL;
-            break;
-          case "RFC4180":
-            format = CSVFormat.RFC4180;
-            break;
-          case "TDF":
-            format = CSVFormat.TDF;
-            break;
-          default:
-            format = CSVFormat.DEFAULT;
-            break;
-        }
-      }
-      char delimiter = config.getDelimiter();
-      format = format.builder().setDelimiter(delimiter).build();
+  private static CSVFormat baseCsvFormat(CSVRecordReaderConfig config) {
+    if (config.getFileFormat() == null) {
+      return CSVFormat.DEFAULT;
+    }
+    switch (config.getFileFormat().toUpperCase()) {
+      case "EXCEL":
+        return CSVFormat.EXCEL;
+      case "MYSQL":
+        return CSVFormat.MYSQL;
+      case "RFC4180":
+        return CSVFormat.RFC4180;
+      case "TDF":
+        return CSVFormat.TDF;
+      default:
+        return CSVFormat.DEFAULT;
+    }
+  }
 
-      if (config.isSkipUnParseableLines()) {
-        _useLineIterator = true;
-      }
+  private static CSVFormat defaultFormat() {
+    return 
CSVFormat.DEFAULT.builder().setDelimiter(CSVRecordReaderConfig.DEFAULT_DELIMITER).setHeader().build();
+  }
 
-      _isHeaderProvided = config.getHeader() != null;
-      _skipHeaderRecord = config.isSkipHeader();
-      _format = format.builder()
-          .setHeader()
-          .setSkipHeaderRecord(config.isSkipHeader())
-          .setCommentMarker(config.getCommentMarker())
-          .setEscape(config.getEscapeCharacter())
-          .setIgnoreEmptyLines(config.isIgnoreEmptyLines())
-          .setIgnoreSurroundingSpaces(config.isIgnoreSurroundingSpaces())
-          .setQuote(config.getQuoteCharacter())
-          .build();
-
-      if (config.getQuoteMode() != null) {
-        _format = 
_format.builder().setQuoteMode(QuoteMode.valueOf(config.getQuoteMode())).build();
-      }
+  private static <T> Optional<T> optional(T value) {
+    return Optional.ofNullable(value);
+  }
 
-      if (config.getRecordSeparator() != null) {
-        _format = 
_format.builder().setRecordSeparator(config.getRecordSeparator()).build();
-      }
+  private static CSVFormat.Builder formatBuilder(CSVRecordReaderConfig config) 
{
+    final CSVFormat.Builder builder = 
baseCsvFormat(config).builder().setDelimiter(config.getDelimiter()).setHeader()
+        
.setSkipHeaderRecord(config.isSkipHeader()).setCommentMarker(config.getCommentMarker())
+        
.setEscape(config.getEscapeCharacter()).setIgnoreEmptyLines(config.isIgnoreEmptyLines())
+        
.setIgnoreSurroundingSpaces(config.isIgnoreSurroundingSpaces()).setQuote(config.getQuoteCharacter());
+
+    
optional(config.getQuoteMode()).map(QuoteMode::valueOf).ifPresent(builder::setQuoteMode);
+    
optional(config.getRecordSeparator()).ifPresent(builder::setRecordSeparator);
+    optional(config.getNullStringValue()).ifPresent(builder::setNullString);
+    return builder;
+  }
 
-      String nullString = config.getNullStringValue();
-      if (nullString != null) {
-        _format = _format.builder().setNullString(nullString).build();
+  private static Map<String, Integer> parseLineAsHeader(CSVFormat format, 
String line)
+      throws IOException {
+    try (StringReader stringReader = new StringReader(line)) {
+      try (CSVParser parser = format.parse(stringReader)) {
+        return parser.getHeaderMap();
       }
+    }
+  }
+
+  private static Character getMultiValueDelimiter(CSVRecordReaderConfig 
config) {
+    if (config == null) {
+      return CSVRecordReaderConfig.DEFAULT_MULTI_VALUE_DELIMITER;
+    } else if (config.isMultiValueDelimiterEnabled()) {
+      return config.getMultiValueDelimiter();
+    }
+    return null;

Review Comment:
   `null` means multi value is disabled. by default it is on and is also 
customizable.



-- 
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

To unsubscribe, e-mail: commits-unsubscr...@pinot.apache.org

For queries about this service, please contact Infrastructure at:
us...@infra.apache.org


---------------------------------------------------------------------
To unsubscribe, e-mail: commits-unsubscr...@pinot.apache.org
For additional commands, e-mail: commits-h...@pinot.apache.org

Reply via email to