npawar commented on code in PR #10528: URL: https://github.com/apache/pinot/pull/10528#discussion_r1212496361
########## pinot-segment-spi/src/main/java/org/apache/pinot/segment/spi/memory/unsafe/MmapMemory.java: ########## @@ -0,0 +1,348 @@ +/** + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ +package org.apache.pinot.segment.spi.memory.unsafe; + +import com.google.common.collect.Lists; +import java.io.File; +import java.io.IOException; +import java.io.RandomAccessFile; +import java.io.UncheckedIOException; +import java.lang.reflect.InvocationTargetException; +import java.lang.reflect.Method; +import java.nio.channels.FileChannel; +import java.util.List; +import java.util.function.BiConsumer; +import net.openhft.chronicle.core.Jvm; +import net.openhft.chronicle.core.OS; +import net.openhft.posix.MSyncFlag; +import net.openhft.posix.PosixAPI; +import org.apache.pinot.segment.spi.utils.JavaVersion; + + +public class MmapMemory implements Memory { + + private static final MapFun MAP_FUN; + + /** + * The address actually mapped. It has to be page aligned. + * + * {@code _address = _offset - offset % pageSize} + */ + private final long _address; + /** + * The offset requested to map. 
+ * + * {@code _address = _offset - offset % pageSize} + */ + private final long _offset; + /** + * How many bytes have been requested to be mapped. + * The actual mapped size may be larger (up to the next page), but the actual mapped size + * is stored by {@link #_section}. + */ + private final long _size; + private final MapSection _section; + + static { + try { + Jvm.init(); + MAP_FUN = MapFun.find(); + } catch (ClassNotFoundException | NoSuchMethodException e) { + throw new RuntimeException(e); + } + } + + public MmapMemory(File file, boolean readOnly, long offset, long size) { + _size = size; + _offset = offset; + + try { + _section = MAP_FUN.map(file, readOnly, offset, size); + } catch (IOException e) { + throw new RuntimeException(e); + } + _address = _section.getAddress(); + } + + @Override + public long getAddress() { + return _address; + } + + @Override + public long getSize() { + return _size; + } + + @Override + public void flush() { + MSyncFlag mode = MSyncFlag.MS_SYNC; + PosixAPI.posix().msync(_offset, _size, mode); + } + + @Override + public void close() + throws IOException { + try { + _section._unmapFun.unmap(); + } catch (InvocationTargetException | IllegalAccessException e) { + throw new RuntimeException("Error while calling unmap", e); + } + } + + private static class MapSection { + public static final MapSection EMPTY = new MapSection(0, () -> { + }); + private final long _address; + private final UnmapFun _unmapFun; + + public MapSection(long address, UnmapFun unmapFun) { + _address = address; + _unmapFun = unmapFun; + } + + public long getAddress() { + return _address; + } + + public UnmapFun getUnmapFun() { + return _unmapFun; + } + } + + interface MapFun { + + /** + * @param file The file to be mapped. If its length is lower than offset + size and the mode is not read only, + * the file will be resized to that size. + * @param offset The offset in the file. Any positive value is valid, even if it is larger than the file size. 
+ * @param size How many bytes to map. + * @throws IOException in several situations. For example, if the offset + size is larger than file length and the + * mode is read only or if the process doesn't have permission to read or modify the file. + */ + MapSection map(File file, boolean readOnly, long offset, long size) throws IOException; + + static MapFun find() + throws ClassNotFoundException, NoSuchMethodException { + List<Finder<? extends MapFun>> candidates = Lists.newArrayList( + new Map0Fun.ChronicleCore(), + new Map0Fun.Java11(), + new Map0Fun.Java17(), + new Java20() + ); + + for (Finder<? extends MapFun> candidate : candidates) { + try { + return candidate.tryFind(); + } catch (NoSuchMethodException | ClassNotFoundException | AssertionError e) { + // IGNORE + } + } + throw new NoSuchMethodException("Cannot find how to create memory map files in Java " + JavaVersion.VERSION); + } + + class Java20 implements Finder<MapFun> { + @Override + public MapFun tryFind() + throws NoSuchMethodException, ClassNotFoundException { + Class<?> fileChannelImpl = MmapMemory.class.getClassLoader().loadClass("sun.nio.ch.FileChannelImpl"); + + Method mapMethod = fileChannelImpl.getDeclaredMethod("mapInternal", FileChannel.MapMode.class, long.class, + long.class, int.class, boolean.class); + mapMethod.setAccessible(true); + + Class<?> unmapperClass = MmapMemory.class.getClassLoader().loadClass("sun.nio.ch.FileChannelImpl$Unmapper"); + Method unmapMethod = unmapperClass.getDeclaredMethod("unmap"); + unmapMethod.setAccessible(true); + Method addressMethod = unmapperClass.getDeclaredMethod("address"); + addressMethod.setAccessible(true); + + return (file, readOnly, offset, size) -> { + FileChannel.MapMode mapMode = readOnly ? FileChannel.MapMode.READ_ONLY : FileChannel.MapMode.READ_WRITE; + // see https://github.com/openjdk/jdk/blob/cc9f7ad9ce33dc44d335fb7fb5483795c62ba936/src/java.base/share/ + // classes/sun/nio/ch/FileChannelImpl.java#L1223 + int prot = readOnly ? 
0 : 1; + + String mode = readOnly ? "r" : "rw"; + try (RandomAccessFile raf = new RandomAccessFile(file, mode); FileChannel fc = raf.getChannel()) { + Object unmapper = mapMethod.invoke(fc, mapMode, offset, size, prot, false); + long address; + UnmapFun unmapFun; + if (unmapper == null) { + // unmapper may be null if the size is 0 or if the file descriptor is closed while mapInternal was called + address = 0; + unmapFun = () -> { + }; + } else { + address = (long) addressMethod.invoke(unmapper);; + unmapFun = () -> unmapMethod.invoke(unmapper); + } + + return new MapSection(address, unmapFun); + } catch (InvocationTargetException | IllegalAccessException e) { + throw new RuntimeException(e); + } + }; + } + } + } + + /** + * A {@link MapFun} that actually delegates into a map0 native method included in pre 19 Java releases. + * + * Unlike normal map methods, map0 actually has some low level requirements. For example, the offset must be page + * aligned. + */ + interface Map0Fun extends MapFun { + + /** + * @param offset It has to be a positive value that is page aligned. + */ + MapSection map0(FileChannel fc, boolean readOnly, long offset, long size) + throws InvocationTargetException, IllegalAccessException, IOException; + + default MapSection map(File file, boolean readOnly, long offset, long size) throws IOException { + String mode = readOnly ? 
"r" : "rw"; + try (RandomAccessFile raf = new RandomAccessFile(file, mode); FileChannel fc = raf.getChannel()) { + if (size == 0) { + return MapSection.EMPTY; + } + + long allocationGranule = Unsafer.UNSAFE.pageSize(); + int pagePosition = (int) (offset % allocationGranule); + + // Compute mmap address + if (!fc.isOpen()) { + throw new IOException("closed " + file.getPath()); + } + + long fileSize = fc.size(); + if (fileSize < offset + size) { + // If file size is smaller than the specified size, extend the file size + raf.seek(offset + size - 1); + raf.write(0); + //logger.trace(s"extend file size to ${fc.size}") + } + long mapPosition = offset - pagePosition; + long mapSize = size + pagePosition; + // A workaround for the error when calling fc.map(MapMode.READ_WRITE, offset, size) with size more than 2GB + + MapSection map0Section = map0(fc, readOnly, mapPosition, mapSize); + return new MapSection(map0Section.getAddress() + pagePosition, map0Section.getUnmapFun()); + } catch (InvocationTargetException | IllegalAccessException e) { + throw new RuntimeException("Cannot map file " + file + " from address " + offset + " with size " + size, e); + } + } + + static BiConsumer<Long, Long> tryFindUnmapper() + throws NoSuchMethodException, ClassNotFoundException { + Class<?> fileChannelImpl = MmapMemory.class.getClassLoader().loadClass("sun.nio.ch.FileChannelImpl"); + Method unmapMethod = fileChannelImpl.getDeclaredMethod("unmap0", long.class, long.class); + unmapMethod.setAccessible(true); + return (address, size) -> { + try { + unmapMethod.invoke(null, address, size); + } catch (IllegalAccessException | InvocationTargetException e) { + throw new RuntimeException(e); + } + }; + } + + class ChronicleCore implements Finder<Map0Fun> { Review Comment: previous experiments / attempts by you had concluded that Chronicle Bytes has several blocking bugs and was not suitable for use by us. 
What changed since then that we are back to using it here and also preferring it to be the default option? I thought that option 4 was the obvious choice, but it seems like I'm missing something. -- This is an automated message from the Apache Git Service. To respond to the message, please log on to GitHub and use the URL above to go to the specific comment. To unsubscribe, e-mail: commits-unsubscr...@pinot.apache.org For queries about this service, please contact Infrastructure at: us...@infra.apache.org --------------------------------------------------------------------- To unsubscribe, e-mail: commits-unsubscr...@pinot.apache.org For additional commands, e-mail: commits-h...@pinot.apache.org