This is an automated email from the ASF dual-hosted git repository. jiaguo pushed a commit to branch master in repository https://gitbox.apache.org/repos/asf/pinot.git
The following commit(s) were added to refs/heads/master by this push: new d1827c5a17 Support madvise for MmapMemory (#13721) d1827c5a17 is described below commit d1827c5a17be77d539affe54cf4310e9fe33844c Author: Dino Occhialini <dino.occhial...@gmail.com> AuthorDate: Thu Aug 15 13:58:50 2024 -0700 Support madvise for MmapMemory (#13721) * Support madvise for MmapMemory * Remove default madvise and keep current behavior * Add instance-level config for default madvise for mmap buffers * Fix checkstyle * Log errors from posix_madvise() calls * Warn when JNR cannot load libC for madvise * Fix checkstyle for fallthrough of switch --- .../segment/spi/memory/unsafe/MmapMemory.java | 66 ++++++++++++++++++ .../spi/memory/unsafe/MmapMemoryConfig.java | 81 ++++++++++++++++++++++ .../server/starter/helix/BaseServerStarter.java | 7 ++ .../apache/pinot/spi/utils/CommonConstants.java | 1 + 4 files changed, 155 insertions(+) diff --git a/pinot-segment-spi/src/main/java/org/apache/pinot/segment/spi/memory/unsafe/MmapMemory.java b/pinot-segment-spi/src/main/java/org/apache/pinot/segment/spi/memory/unsafe/MmapMemory.java index ce91c255fa..a3e0661f6c 100644 --- a/pinot-segment-spi/src/main/java/org/apache/pinot/segment/spi/memory/unsafe/MmapMemory.java +++ b/pinot-segment-spi/src/main/java/org/apache/pinot/segment/spi/memory/unsafe/MmapMemory.java @@ -28,6 +28,8 @@ import java.lang.reflect.Method; import java.nio.channels.FileChannel; import java.util.List; import java.util.function.BiConsumer; +import jnr.ffi.LibraryLoader; +import jnr.ffi.types.size_t; import net.openhft.chronicle.core.Jvm; import net.openhft.chronicle.core.OS; import net.openhft.posix.MSyncFlag; @@ -124,6 +126,16 @@ public class MmapMemory implements Memory { }); private final long _address; private final UnmapFun _unmapFun; + private static final LibC LIB_C; + static { + LibC libC = null; + try { + libC = LibraryLoader.create(LibC.class).failImmediately().load("c"); + } catch (Throwable ignored) { + LOGGER.warn("Could not load JNR C Library, madvise will not be used for mmap memory."); + } + LIB_C = libC; + } public MapSection(long address, UnmapFun unmapFun) { _address = address; @@ -137,6 +149,46 @@ public class MmapMemory implements Memory { public UnmapFun getUnmapFun() { return _unmapFun; } + + /** + * Call posix_madvise (if available) at the address aligned start of MapSection for size bytes + * + * Internally posix_madvise operates on pages, so unaligned size would affect all remaining bytes on the page. + * _address is expected to be page aligned. + * In the future, it may be helpful to expose this to upstream consumers as a "hint()" abstraction in cases where + * advice other than MADV_RANDOM perform better (particularly sequential reads on slow filesystems). + * + * Errors during advise are ignored since this is considered a nice-to-have step. + * + * @param size Size of the region to set advice for. + * @param advice Specific advice to apply (see the LibC interface for options) + */ + protected void madvise(long size, int advice) { + if (LIB_C != null) { + int errno = LIB_C.posix_madvise(_address, size, advice); + switch (errno) { + case 0: + // 0 indicates a successful call + break; + case 22: + LOGGER.warn("posix_madvise failed with EINVAL, either addr is not aligned or advice was invalid"); + break; + case 12: + LOGGER.warn("posix_madvise failed with ENOMEM, indicating a bad address or size"); + break; + default: + LOGGER.warn("posix_madvise returned an unknown error code: {}", errno); + break; + } + } + } + + protected void madvise(long size) { + int defaultAdvice = MmapMemoryConfig.getDefaultAdvice(); + if (defaultAdvice >= 0) { + madvise(size, defaultAdvice); + } + } } /** @@ -218,6 +270,7 @@ public class MmapMemory implements Memory { long mapSize = size + pagePosition; MapSection map0Section = map0(fc, readOnly, mapPosition, mapSize); + map0Section.madvise(mapSize); return new MapSection(map0Section.getAddress() + pagePosition, map0Section.getUnmapFun()); } catch (InvocationTargetException | IllegalAccessException e) { throw new RuntimeException("Cannot map file " + file + " from address " + offset + " with size " + size, e); @@ -369,4 +422,17 @@ public class MmapMemory implements Memory { private interface Finder<C> { C tryFind() throws NoSuchMethodException, ClassNotFoundException; } + + // CHECKSTYLE:OFF + protected interface LibC { + public static final int POSIX_MADV_NORMAL = 0; /* No further special treatment */ + public static final int POSIX_MADV_RANDOM = 1; /* Expect random page references */ + public static final int POSIX_MADV_SEQUENTIAL = 2; /* Expect sequential page references */ + public static final int POSIX_MADV_WILLNEED = 3; /* Will need these pages */ + public static final int POSIX_MADV_DONTNEED = 4; /* Don't need these pages */ + + @SuppressWarnings({"UnusedReturnValue"}) + int posix_madvise(@size_t long address, @size_t long size, int advice); + } + // CHECKSTYLE:ON } diff --git a/pinot-segment-spi/src/main/java/org/apache/pinot/segment/spi/memory/unsafe/MmapMemoryConfig.java b/pinot-segment-spi/src/main/java/org/apache/pinot/segment/spi/memory/unsafe/MmapMemoryConfig.java new file mode 100644 index 0000000000..fc6b5b6b48 --- /dev/null +++ b/pinot-segment-spi/src/main/java/org/apache/pinot/segment/spi/memory/unsafe/MmapMemoryConfig.java @@ -0,0 +1,81 @@ +/** + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ +package org.apache.pinot.segment.spi.memory.unsafe; + +import com.google.common.base.Preconditions; +import org.apache.commons.lang3.EnumUtils; + +/** + * Simple singleton config for managing advanced mmap configurations + */ +public class MmapMemoryConfig { + private MmapMemoryConfig() { } + private static final MmapMemoryConfig INSTANCE = new MmapMemoryConfig(); + + enum Advice { + NORMAL(MmapMemory.LibC.POSIX_MADV_NORMAL), + RANDOM(MmapMemory.LibC.POSIX_MADV_RANDOM), + SEQUENTIAL(MmapMemory.LibC.POSIX_MADV_SEQUENTIAL), + WILL_NEED(MmapMemory.LibC.POSIX_MADV_WILLNEED), + DONT_NEED(MmapMemory.LibC.POSIX_MADV_DONTNEED); + + private final int _advice; + + Advice(int advice) { + _advice = advice; + } + + /** + * Get posix-compatible advice integer + */ + public int getAdvice() { + return _advice; + } + } + + /** + * Advice to use by default after calling mmap on a region of a file. + * Notably this is expected to be an integer corresponding to advice + * supported by posix_madvise() + */ + private int _defaultAdvice = -1; + + public static int getDefaultAdvice() { + return INSTANCE._defaultAdvice; + } + + public static void setDefaultAdvice(int advice) { + Preconditions.checkArgument( + advice >= 0 && advice <= 4, + "Default advice for mmap buffers must be posix_madvise compatible (0-4): %d", + advice + ); + INSTANCE._defaultAdvice = advice; + } + + public static void setDefaultAdvice(String adviceString) { + Preconditions.checkArgument( + EnumUtils.isValidEnum(Advice.class, adviceString), + "Default advice for mmap buffers must match a posix_madvise compatible option: %s", + adviceString + ); + + setDefaultAdvice(Advice.valueOf(adviceString).getAdvice()); + } +} diff --git a/pinot-server/src/main/java/org/apache/pinot/server/starter/helix/BaseServerStarter.java b/pinot-server/src/main/java/org/apache/pinot/server/starter/helix/BaseServerStarter.java index 54e118634f..dc4100eebc 100644 --- a/pinot-server/src/main/java/org/apache/pinot/server/starter/helix/BaseServerStarter.java +++ b/pinot-server/src/main/java/org/apache/pinot/server/starter/helix/BaseServerStarter.java @@ -75,6 +75,7 @@ import org.apache.pinot.core.util.ListenerConfigUtil; import org.apache.pinot.segment.local.realtime.impl.invertedindex.RealtimeLuceneIndexRefreshManager; import org.apache.pinot.segment.local.realtime.impl.invertedindex.RealtimeLuceneTextIndexSearcherPool; import org.apache.pinot.segment.spi.memory.PinotDataBuffer; +import org.apache.pinot.segment.spi.memory.unsafe.MmapMemoryConfig; import org.apache.pinot.server.access.AccessControlFactory; import org.apache.pinot.server.api.AdminApiApplication; import org.apache.pinot.server.conf.ServerConf; @@ -202,6 +203,12 @@ public abstract class BaseServerStarter implements ServiceStartable { // Initialize Pinot Environment Provider _pinotEnvironmentProvider = initializePinotEnvironmentProvider(); + // Set instance-level mmap advice defaults + String defaultMmapAdvice = _serverConf.getProperty(Server.CONFIG_OF_MMAP_DEFAULT_ADVICE); + if (defaultMmapAdvice != null) { + MmapMemoryConfig.setDefaultAdvice(defaultMmapAdvice); + } + // Initialize the data buffer factory PinotDataBuffer.loadDefaultFactory(serverConf); diff --git a/pinot-spi/src/main/java/org/apache/pinot/spi/utils/CommonConstants.java b/pinot-spi/src/main/java/org/apache/pinot/spi/utils/CommonConstants.java index 6769ae1894..2bfc61e7f4 100644 --- a/pinot-spi/src/main/java/org/apache/pinot/spi/utils/CommonConstants.java +++ b/pinot-spi/src/main/java/org/apache/pinot/spi/utils/CommonConstants.java @@ -662,6 +662,7 @@ public class CommonConstants { public static final double DEFAULT_SERVER_CONSUMPTION_RATE_LIMIT = 0.0; public static final String DEFAULT_READ_MODE = "mmap"; + public static final String CONFIG_OF_MMAP_DEFAULT_ADVICE = "pinot.server.mmap.advice.default"; // Whether to reload consuming segment on scheme update public static final boolean DEFAULT_RELOAD_CONSUMING_SEGMENT = true; public static final String DEFAULT_INSTANCE_BASE_DIR = --------------------------------------------------------------------- To unsubscribe, e-mail: commits-unsubscr...@pinot.apache.org For additional commands, e-mail: commits-h...@pinot.apache.org