This is an automated email from the ASF dual-hosted git repository.

jiaguo pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/pinot.git


The following commit(s) were added to refs/heads/master by this push:
     new d1827c5a17 Support madvise for MmapMemory (#13721)
d1827c5a17 is described below

commit d1827c5a17be77d539affe54cf4310e9fe33844c
Author: Dino Occhialini <dino.occhial...@gmail.com>
AuthorDate: Thu Aug 15 13:58:50 2024 -0700

    Support madvise for MmapMemory (#13721)
    
    * Support madvise for MmapMemory
    
    * Remove default madvise and keep current behavior
    
    * Add instance-level config for default madvise for mmap buffers
    
    * Fix checkstyle
    
    * Log errors from posix_madvise() calls
    
    * Warn when JNR cannot load libC for madvise
    
    * Fix checkstyle for fallthrough of switch
---
 .../segment/spi/memory/unsafe/MmapMemory.java      | 66 ++++++++++++++++++
 .../spi/memory/unsafe/MmapMemoryConfig.java        | 81 ++++++++++++++++++++++
 .../server/starter/helix/BaseServerStarter.java    |  7 ++
 .../apache/pinot/spi/utils/CommonConstants.java    |  1 +
 4 files changed, 155 insertions(+)

diff --git 
a/pinot-segment-spi/src/main/java/org/apache/pinot/segment/spi/memory/unsafe/MmapMemory.java
 
b/pinot-segment-spi/src/main/java/org/apache/pinot/segment/spi/memory/unsafe/MmapMemory.java
index ce91c255fa..a3e0661f6c 100644
--- 
a/pinot-segment-spi/src/main/java/org/apache/pinot/segment/spi/memory/unsafe/MmapMemory.java
+++ 
b/pinot-segment-spi/src/main/java/org/apache/pinot/segment/spi/memory/unsafe/MmapMemory.java
@@ -28,6 +28,8 @@ import java.lang.reflect.Method;
 import java.nio.channels.FileChannel;
 import java.util.List;
 import java.util.function.BiConsumer;
+import jnr.ffi.LibraryLoader;
+import jnr.ffi.types.size_t;
 import net.openhft.chronicle.core.Jvm;
 import net.openhft.chronicle.core.OS;
 import net.openhft.posix.MSyncFlag;
@@ -124,6 +126,16 @@ public class MmapMemory implements Memory {
     });
     private final long _address;
     private final UnmapFun _unmapFun;
+    private static final LibC LIB_C;
+    static {
+      LibC libC = null;
+      try {
+        libC = LibraryLoader.create(LibC.class).failImmediately().load("c");
+      } catch (Throwable ignored) {
+        LOGGER.warn("Could not load JNR C Library, madvise will not be used 
for mmap memory.");
+      }
+      LIB_C = libC;
+    }
 
     public MapSection(long address, UnmapFun unmapFun) {
       _address = address;
@@ -137,6 +149,46 @@ public class MmapMemory implements Memory {
     public UnmapFun getUnmapFun() {
       return _unmapFun;
     }
+
+    /**
+     * Call posix_madvise (if available) at the address aligned start of 
MapSection for size bytes
+     *
+     * Internally posix_madvise operates on pages, so unaligned size would 
affect all remaining bytes on the page.
+     * _address is expected to be page aligned.
+     * In the future, it may be helpful to expose this to upstream consumers 
as a "hint()" abstraction in cases where
+     * advice other than MADV_RANDOM perform better (particularly sequential 
reads on slow filesystems).
+     *
+     * Errors during advise are ignored since this is considered a 
nice-to-have step.
+     *
+     * @param size Size of the region to set advice for.
+     * @param advice Specific advice to apply (see the LibC interface for 
options)
+     */
+    protected void madvise(long size, int advice) {
+      if (LIB_C != null) {
+        int errno = LIB_C.posix_madvise(_address, size, advice);
+        switch (errno) {
+          case 0:
+            // 0 indicates a successful call
+            break;
+          case 22:
+            LOGGER.warn("posix_madvise failed with EINVAL, either addr is not 
aligned or advice was invalid");
+            break;
+          case 12:
+            LOGGER.warn("posix_madvise failed with ENOMEM, indicating a bad 
address or size");
+            break;
+          default:
+            LOGGER.warn("posix_madvise returned an unknown error code: {}", 
errno);
+            break;
+        }
+      }
+    }
+
+    protected void madvise(long size) {
+      int defaultAdvice = MmapMemoryConfig.getDefaultAdvice();
+      if (defaultAdvice >= 0) {
+        madvise(size, defaultAdvice);
+      }
+    }
   }
 
   /**
@@ -218,6 +270,7 @@ public class MmapMemory implements Memory {
         long mapSize = size + pagePosition;
 
         MapSection map0Section = map0(fc, readOnly, mapPosition, mapSize);
+        map0Section.madvise(mapSize);
         return new MapSection(map0Section.getAddress() + pagePosition, 
map0Section.getUnmapFun());
       } catch (InvocationTargetException | IllegalAccessException e) {
         throw new RuntimeException("Cannot map file " + file + " from address 
" + offset + " with size " + size, e);
@@ -369,4 +422,17 @@ public class MmapMemory implements Memory {
   private interface Finder<C> {
     C tryFind() throws NoSuchMethodException, ClassNotFoundException;
   }
+
+  // CHECKSTYLE:OFF
+  protected interface LibC {
+    public static final int POSIX_MADV_NORMAL = 0; /* No further special 
treatment */
+    public static final int POSIX_MADV_RANDOM = 1; /* Expect random page 
references */
+    public static final int POSIX_MADV_SEQUENTIAL = 2; /* Expect sequential 
page references */
+    public static final int POSIX_MADV_WILLNEED = 3; /* Will need these pages 
*/
+    public static final int POSIX_MADV_DONTNEED = 4; /* Don't need these pages 
*/
+
+    @SuppressWarnings({"UnusedReturnValue"})
+    int posix_madvise(@size_t long address, @size_t long size, int advice);
+  }
+  // CHECKSTYLE:ON
 }
diff --git 
a/pinot-segment-spi/src/main/java/org/apache/pinot/segment/spi/memory/unsafe/MmapMemoryConfig.java
 
b/pinot-segment-spi/src/main/java/org/apache/pinot/segment/spi/memory/unsafe/MmapMemoryConfig.java
new file mode 100644
index 0000000000..fc6b5b6b48
--- /dev/null
+++ 
b/pinot-segment-spi/src/main/java/org/apache/pinot/segment/spi/memory/unsafe/MmapMemoryConfig.java
@@ -0,0 +1,81 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *   http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied.  See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+package org.apache.pinot.segment.spi.memory.unsafe;
+
+import com.google.common.base.Preconditions;
+import org.apache.commons.lang3.EnumUtils;
+
+/**
+ * Simple singleton config for managing advanced mmap configurations
+ */
+public class MmapMemoryConfig {
+  private MmapMemoryConfig() { }
+  private static final MmapMemoryConfig INSTANCE = new MmapMemoryConfig();
+
+  enum Advice {
+    NORMAL(MmapMemory.LibC.POSIX_MADV_NORMAL),
+    RANDOM(MmapMemory.LibC.POSIX_MADV_RANDOM),
+    SEQUENTIAL(MmapMemory.LibC.POSIX_MADV_SEQUENTIAL),
+    WILL_NEED(MmapMemory.LibC.POSIX_MADV_WILLNEED),
+    DONT_NEED(MmapMemory.LibC.POSIX_MADV_DONTNEED);
+
+    private final int _advice;
+
+    Advice(int advice) {
+      _advice = advice;
+    }
+
+    /**
+     *  Get posix-compatible advice integer
+     */
+    public int getAdvice() {
+      return _advice;
+    }
+  }
+
+  /**
+   * Advice to use by default after calling mmap on a region of a file.
+   * Notably this is expected to be an integer corresponding to advice
+   * supported by posix_madvise()
+   */
+  private int _defaultAdvice = -1;
+
+  public static int getDefaultAdvice() {
+    return INSTANCE._defaultAdvice;
+  }
+
+  public static void setDefaultAdvice(int advice) {
+    Preconditions.checkArgument(
+        advice >= 0 && advice <= 4,
+        "Default advice for mmap buffers must be posix_madvise compatible 
(0-4): %d",
+        advice
+    );
+    INSTANCE._defaultAdvice = advice;
+  }
+
+  public static void setDefaultAdvice(String adviceString) {
+    Preconditions.checkArgument(
+      EnumUtils.isValidEnum(Advice.class, adviceString),
+      "Default advice for mmap buffers must match a posix_madvise compatible 
option: %s",
+      adviceString
+    );
+
+    setDefaultAdvice(Advice.valueOf(adviceString).getAdvice());
+  }
+}
diff --git 
a/pinot-server/src/main/java/org/apache/pinot/server/starter/helix/BaseServerStarter.java
 
b/pinot-server/src/main/java/org/apache/pinot/server/starter/helix/BaseServerStarter.java
index 54e118634f..dc4100eebc 100644
--- 
a/pinot-server/src/main/java/org/apache/pinot/server/starter/helix/BaseServerStarter.java
+++ 
b/pinot-server/src/main/java/org/apache/pinot/server/starter/helix/BaseServerStarter.java
@@ -75,6 +75,7 @@ import org.apache.pinot.core.util.ListenerConfigUtil;
 import 
org.apache.pinot.segment.local.realtime.impl.invertedindex.RealtimeLuceneIndexRefreshManager;
 import 
org.apache.pinot.segment.local.realtime.impl.invertedindex.RealtimeLuceneTextIndexSearcherPool;
 import org.apache.pinot.segment.spi.memory.PinotDataBuffer;
+import org.apache.pinot.segment.spi.memory.unsafe.MmapMemoryConfig;
 import org.apache.pinot.server.access.AccessControlFactory;
 import org.apache.pinot.server.api.AdminApiApplication;
 import org.apache.pinot.server.conf.ServerConf;
@@ -202,6 +203,12 @@ public abstract class BaseServerStarter implements 
ServiceStartable {
     // Initialize Pinot Environment Provider
     _pinotEnvironmentProvider = initializePinotEnvironmentProvider();
 
+    // Set instance-level mmap advice defaults
+    String defaultMmapAdvice = 
_serverConf.getProperty(Server.CONFIG_OF_MMAP_DEFAULT_ADVICE);
+    if (defaultMmapAdvice != null) {
+      MmapMemoryConfig.setDefaultAdvice(defaultMmapAdvice);
+    }
+
     // Initialize the data buffer factory
     PinotDataBuffer.loadDefaultFactory(serverConf);
 
diff --git 
a/pinot-spi/src/main/java/org/apache/pinot/spi/utils/CommonConstants.java 
b/pinot-spi/src/main/java/org/apache/pinot/spi/utils/CommonConstants.java
index 6769ae1894..2bfc61e7f4 100644
--- a/pinot-spi/src/main/java/org/apache/pinot/spi/utils/CommonConstants.java
+++ b/pinot-spi/src/main/java/org/apache/pinot/spi/utils/CommonConstants.java
@@ -662,6 +662,7 @@ public class CommonConstants {
     public static final double DEFAULT_SERVER_CONSUMPTION_RATE_LIMIT = 0.0;
 
     public static final String DEFAULT_READ_MODE = "mmap";
+    public static final String CONFIG_OF_MMAP_DEFAULT_ADVICE = 
"pinot.server.mmap.advice.default";
     // Whether to reload consuming segment on scheme update
     public static final boolean DEFAULT_RELOAD_CONSUMING_SEGMENT = true;
     public static final String DEFAULT_INSTANCE_BASE_DIR =


---------------------------------------------------------------------
To unsubscribe, e-mail: commits-unsubscr...@pinot.apache.org
For additional commands, e-mail: commits-h...@pinot.apache.org

Reply via email to