[GitHub] [incubator-pinot] mqliang commented on a change in pull request #6710: DataTable V3 implementation and measure data table serialization cost on server

GitBox Sun, 28 Mar 2021 22:52:02 -0700


mqliang commented on a change in pull request #6710:
URL: https://github.com/apache/incubator-pinot/pull/6710#discussion_r603028236




##########
File path: 
pinot-core/src/main/java/org/apache/pinot/core/common/datatable/DataTableImplV3.java
##########
@@ -0,0 +1,702 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *   http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied.  See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+package org.apache.pinot.core.common.datatable;
+
+import com.google.common.primitives.Ints;
+import com.google.common.primitives.Longs;
+import java.io.ByteArrayInputStream;
+import java.io.ByteArrayOutputStream;
+import java.io.DataInputStream;
+import java.io.DataOutputStream;
+import java.io.IOException;
+import java.nio.ByteBuffer;
+import java.util.HashMap;
+import java.util.Map;
+import java.util.TreeMap;
+import org.apache.pinot.common.response.ProcessingException;
+import org.apache.pinot.common.utils.DataSchema;
+import org.apache.pinot.common.utils.DataTable;
+import org.apache.pinot.common.utils.StringUtil;
+import org.apache.pinot.core.common.ObjectSerDeUtils;
+import org.apache.pinot.core.query.request.context.ThreadTimer;
+import org.apache.pinot.spi.utils.ByteArray;
+import org.apache.pinot.spi.utils.BytesUtils;
+
+import static org.apache.pinot.core.common.datatable.DataTableUtils.*;
+
+
+public class DataTableImplV3 implements DataTable {
+  private static final int VERSION = 3;
+
+  // VERSION
+  // NUM_ROWS
+  // NUM_COLUMNS
+  // EXCEPTIONS (START|SIZE)
+  // DICTIONARY_MAP (START|SIZE)
+  // DATA_SCHEMA (START|SIZE)
+  // FIXED_SIZE_DATA (START|SIZE)
+  // VARIABLE_SIZE_DATA (START|SIZE)
+  // TRAILER (START|SIZE)
+  private static final int HEADER_SIZE = Integer.BYTES * 15;
+
+  private final int _numRows;
+  private final int _numColumns;
+  private final DataSchema _dataSchema;
+  private final int[] _columnOffsets;
+  private final int _rowSizeInBytes;
+  private final Map<String, Map<Integer, String>> _dictionaryMap;
+  private final byte[] _fixedSizeDataBytes;
+  private final ByteBuffer _fixedSizeData;
+  private final byte[] _variableSizeDataBytes;
+  private final ByteBuffer _variableSizeData;
+  // _exceptions stores exceptions as a map of errorCode->errorMessage
+  private final Map<Integer, String> _exceptions;
+  /**
+   * _metadata stores KV pairs for metadata. Metadata is actually a part of 
_trailer in V3 when serialize DataTable
+   * into bytes. When deserialize, we extract metadata from _trailer into this 
_metadata map to provide the same
+   * interface with V2. There are many code use
+   * datatable.getMetadata().get("key")/datatable.getMetadata().put("key", 
"value") to get/set metadata.
+   * TODO(@mqliang): revise this if we decide to get/set metadata by
+   *  datable.getTailerData(key)/datable.setTailer(key, value).
+   */
+  private final Map<String, String> _metadata;
+  private Map<TrailerKeys, String> _trailer;
+
+  private long _responseSerializationCpuTimeNs;
+  private int _responseSerializationCpuTimeNsValueOffset;
+
+  /**
+   * Construct data table with results. (Server side)
+   */
+  public DataTableImplV3(int numRows, DataSchema dataSchema, Map<String, 
Map<Integer, String>> dictionaryMap,
+      byte[] fixedSizeDataBytes, byte[] variableSizeDataBytes) {
+    _numRows = numRows;
+    _numColumns = dataSchema.size();
+    _dataSchema = dataSchema;
+    _columnOffsets = new int[_numColumns];
+    _rowSizeInBytes = DataTableUtils.computeColumnOffsets(dataSchema, 
_columnOffsets);
+    _dictionaryMap = dictionaryMap;
+    _fixedSizeDataBytes = fixedSizeDataBytes;
+    _fixedSizeData = ByteBuffer.wrap(fixedSizeDataBytes);
+    _variableSizeDataBytes = variableSizeDataBytes;
+    _variableSizeData = ByteBuffer.wrap(variableSizeDataBytes);
+    _exceptions = new HashMap<>();
+    _metadata = new HashMap<>();
+    _trailer = new TreeMap<>();
+  }
+
+  /**
+   * Construct empty data table. (Server side)
+   */
+  public DataTableImplV3() {
+    _numRows = 0;
+    _numColumns = 0;
+    _dataSchema = null;
+    _columnOffsets = null;
+    _rowSizeInBytes = 0;
+    _dictionaryMap = null;
+    _fixedSizeDataBytes = null;
+    _fixedSizeData = null;
+    _variableSizeDataBytes = null;
+    _variableSizeData = null;
+    _exceptions = new HashMap<>();
+    _metadata = new HashMap<>();
+    _trailer = new TreeMap<>();
+  }
+
+  @Override
+  public void addException(ProcessingException processingException) {
+    _exceptions.put(processingException.getErrorCode(), 
processingException.getMessage());
+  }
+
+  @Override
+  public Map<Integer, String> getExceptions() {
+    return _exceptions;
+  }
+
+  @Override
+  public byte[] toBytes()
+      throws IOException {
+    _trailer.put(TrailerKeys.RESPONSE_SERIALIZATION_CPU_TIME_NS_METADATA_KEY, 
String.valueOf(-1));
+    ThreadTimer threadTimer = new ThreadTimer();

Review comment:
       > Instead of starting threadtimer here you can start and end the timer 
inside the toBytesInternal() method. We don't need to capture the timer for 
serializing the trailer. Instead, just save the value in a local variable, and 
add it to the trailer. That way, you can avoid having the variable 
_responseSerializationCpuTimeNsValueOffset
   
   That approach is not feasible: we need to write the start offset and length 
of the trailer section (or metadata, as suggested by @Jackie-Jiang, whatever we 
name it) into the header, so the trailer section has to be serialized before we 
write the actual data bytes (exception bytes, data schema bytes, fixed-size 
data bytes, variable-size data bytes) into the data output stream. If we 
implemented it that way, only the time spent serializing each section would be 
accounted for; the time spent writing the data bytes into the data output 
stream would be ignored.
   
   Instead, my current implementation works as follows: when writing the 
trailer section's start offset and length into the header, it increases the 
length by the size of the serialization_cost KV pair, and then appends the 
bytes of the serialization_cost KV pair to the end of the data output stream.




-- 
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

For queries about this service, please contact Infrastructure at:
us...@infra.apache.org



---------------------------------------------------------------------
To unsubscribe, e-mail: commits-unsubscr...@pinot.apache.org
For additional commands, e-mail: commits-h...@pinot.apache.org

[GitHub] [incubator-pinot] mqliang commented on a change in pull request #6710: DataTable V3 implementation and measure data table serialization cost on server

Reply via email to