(couchdb) 10/38: Implement parallel preads

vatamane Sun, 16 Mar 2025 22:12:21 -0700

This is an automated email from the ASF dual-hosted git repository.

vatamane pushed a commit to branch merge-3.4.3
in repository https://gitbox.apache.org/repos/asf/couchdb.git


commit 83ee0e3d0b702b57f92c7e4d061c83ef23855766
Author: Nick Vatamaniuc <[email protected]>
AuthorDate: Mon Jan 13 12:12:28 2025 -0500

    Implement parallel preads
    
    Let clients issue concurrent pread calls without blocking each other or 
having
    to wait for all the writes and fsync calls.
    
    Even though at the POSIX level pread calls are thread-safe [1], Erlang OTP 
file
    backend forces a single controlling process for raw file handles. So, all 
our
    reads were always funnelled through the couch_file gen_server, having to 
queue
    up behind potentially slower writes. In particular this is problematic with
    remote file systems, where fsyncs and writes may take a lot longer while 
preads
    can hit the cache and return quicker.
    
    Parallel pread calls are implemented via a NIF which copies some of the file
    functions from OTP's prim_file NIF [2]. The original OTP handle is dup-ed, and 
then
    closed, then our NIF takes control of the new duplicated file descriptor. 
This
    is necessary in order to allow multiple reader access via reader/writer 
locks,
    and also to carefully manage the closing state.
    
    In order to keep things simple the new handles created by couch_cfile 
implement
    the `#file_descriptor{module = $Module, data = $Data}` protocol, such that 
once
    opened, the regular `file` module in OTP will "know" how to dispatch calls 
with
    this handle to our couch_cfile.erl functions. In this way most of the
    `couch_file` stays the same, with all the same `file:` calls in the main 
data
    path.
    
    `couch_cfile` bypass is also opportunistic: if it is not available (on
    Windows) or not enabled, things proceed as before.
    
    The reason we need a new dup()-ed file descriptor is to manage closing very
    carefully. Since on POSIX systems file descriptors are just integers, it's 
very
    easy to accidentally read from an already closed and re-opened (by something
    else) file descriptor. That's why there are locks and a whole new file
    descriptor which our NIF controls. But as long as we control the file
    descriptor with our resource "handle", we can be sure it will stay open and
    won't be re-used by any other process.
    
    To gain confidence that the new couch_cfile behaves the same way as the
    Erlang/OTP one there is a property test which asserts that for any pair of
    {Raw, CFile} handle any supported file operations return exactly the same
    results. It was validated by modifying some of the couch_cfile.c arguments,
    after which the property tests started to fail.
    
    Since none of the three compatible IOQ systems currently knows how to call a
    simple MFA, and instead they only send a `$gen_call` message to a gen_server,
    parallel cfile reads are only available if we bypass the IOQ. By default if 
the
    requests are already configured to bypass the IOQ, then they will use the
    parallel preads. To enable parallel preads for all requests, toggle the
    `[couchdb] cfile_skip_ioq` setting to `true`.
    
    A simple sequential benchmark was run initially to show that even in the most
    unfavorable case, all sequential operations, we haven't gotten worse:
    ```
    > fabric_bench:go(#{q=>1, n=>1, doc_size=>small, docs=>100000}).
     *** Parameters
     * batch_size       : 1000
     * doc_size         : small
     * docs             : 100000
     * individual_docs  : 1000
     * n                : 1
     * q                : 1
    
     *** Environment
     * Nodes        : 1
     * Bench ver.   : 1
     * N            : 1
     * Q            : 1
     * OS           : unix/linux
    ```
    
    Each case was run 5 times and the best rate in ops/sec was picked, so higher
    is better:
    
    ```
                                                    Default  CFile
    
    * Add 100000 docs, ok:100/accepted:0     (Hz):   16000    16000
    * Get random doc 100000X                 (Hz):    4900     5800
    * All docs                               (Hz):  120000   140000
    * All docs w/ include_docs               (Hz):   24000    31000
    * Changes                                (Hz):   49000    51000
    * Single doc updates 1000X               (Hz):     380      410
    ```
    
    [1] https://www.man7.org/linux/man-pages/man2/pread.2.html
    [2] 
https://github.com/erlang/otp/blob/maint-25/erts/emulator/nifs/unix/unix_prim_file.c
    [3] https://github.com/saleyn/emmap
    [4] https://www.man7.org/linux/man-pages/man2/dup.2.html
---
 LICENSE                                         |  21 +-
 rel/overlay/etc/default.ini                     |  16 +
 src/couch/.gitignore                            |   1 +
 src/couch/priv/couch_cfile/couch_cfile.c        | 721 ++++++++++++++++++++++++
 src/couch/rebar.config.script                   |   8 +-
 src/couch/src/couch_cfile.erl                   | 285 ++++++++++
 src/couch/src/couch_file.erl                    | 156 ++++-
 src/couch/test/eunit/couch_cfile_prop_tests.erl | 156 +++++
 src/couch/test/eunit/couch_cfile_tests.erl      | 534 ++++++++++++++++++
 src/couch/test/eunit/couch_file_tests.erl       |  62 +-
 src/ioq/src/ioq.erl                             |   9 +-
 11 files changed, 1930 insertions(+), 39 deletions(-)

diff --git a/LICENSE b/LICENSE
index 1579cdf53..1fbc82f30 100644
--- a/LICENSE
+++ b/LICENSE
@@ -2385,4 +2385,23 @@ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN 
NO EVENT SHALL THE
 AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
 OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
-THE SOFTWARE.
\ No newline at end of file
+THE SOFTWARE.
+
+couch_cfile
+
+couch_cfile.c NIF has parts from Erlang/OTP's prim_file NIF to ensure we
+have the exact pread behavior as Erlang/OTP
+
+ Copyright Ericsson 2017-2022. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
diff --git a/rel/overlay/etc/default.ini b/rel/overlay/etc/default.ini
index f62ff2a31..1a0f318bf 100644
--- a/rel/overlay/etc/default.ini
+++ b/rel/overlay/etc/default.ini
@@ -103,6 +103,22 @@ view_index_dir = {{view_index_dir}}
 ; Javascript engine. The choices are: spidermonkey and quickjs
 ;js_engine = spidermonkey
 
+; Use cfile. This is a C-based file I/O module that can execute parallel file
+; read calls. The regular Erlang VM file module, at least as of OTP 28 forces
+; all file operations to go through a single controlling process which can
+; become a bottleneck sometimes. cfile is enabled by default on supported
+; systems (currently Linux, MacOS and FreeBSD). However, it is a new feature,
+; so if there are any issues with it, it is possible to disable it by setting
+; the value to "false".
+;use_cfile = true
+
+; When enabled, use cfile parallel reads for all the requests. By default the
+; setting is "false", so only requests which are configured to bypass the IOQ
+; would use the cfile parallel reads. If there is enough RAM available for a
+; large file cache and the disks have enough IO bandwidth, consider enabling
+; this setting.
+;cfile_skip_ioq = false
+
 [purge]
 ; Allowed maximum number of documents in one purge request
 ;max_document_id_number = 100
diff --git a/src/couch/.gitignore b/src/couch/.gitignore
index 861974adb..dd39c7ddf 100644
--- a/src/couch/.gitignore
+++ b/src/couch/.gitignore
@@ -5,6 +5,7 @@ ebin/
 priv/couch_js/config.h
 priv/couchjs
 priv/couchspawnkillable
+priv/couch_cfile/*.d
 priv/*.exp
 priv/*.lib
 priv/*.dll
diff --git a/src/couch/priv/couch_cfile/couch_cfile.c 
b/src/couch/priv/couch_cfile/couch_cfile.c
new file mode 100644
index 000000000..36ac0c673
--- /dev/null
+++ b/src/couch/priv/couch_cfile/couch_cfile.c
@@ -0,0 +1,721 @@
+// Licensed under the Apache License, Version 2.0 (the "License"); you may not
+// use this file except in compliance with the License. You may obtain a copy 
of
+// the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
+// WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
+// License for the specific language governing permissions and limitations 
under
+// the License.
+
+
+#ifndef _WIN32
+
+#include <errno.h>
+#include <limits.h>
+#include <sys/types.h>
+#include <fcntl.h>
+#include <sys/stat.h>
+#include <sys/uio.h>
+#include <unistd.h>
+
+// erl_driver.h is for erl_errno_id()
+#include "erl_driver.h"
+#include "erl_nif.h"
+
+#endif
+
+static ErlNifResourceType* HANDLE_T;
+static ErlNifPid JANITOR_PID;
+
+static ERL_NIF_TERM ATOM_BOF;
+static ERL_NIF_TERM ATOM_EOF;
+static ERL_NIF_TERM ATOM_ERROR;
+static ERL_NIF_TERM ATOM_BADARG;
+static ERL_NIF_TERM ATOM_OK;
+static ERL_NIF_TERM ATOM_CLOSE;
+static ERL_NIF_TERM ATOM_CONTINUE;
+
+typedef int posix_errno_t;
+
+typedef struct {
+    // This read-write lock is for the closing state, not for the
+    // operations themselves (pread vs write). The owner process in the
+    // #file_descriptor{} record handle in Erlang controls who can read vs
+    // write to the file.
+    ErlNifRWLock *lock;
+    // Monitor for the owner process. If the owner exits we'll get notified
+    // in the handle_down callback
+    ErlNifMonitor monitor;
+    // Our file descriptor. If set to -1 it means it's closed
+    int fd;
+    // This is for the "belt and suspenders" in the sanity_checker function.
+    // After we dup a handle we verify that the dup-ed old_fd matches the
+    // original fd and we still have both open (they were not somehow closed in
+    // the meantime).
+    int old_fd;
+} handle_t;
+
+#define LOCK        enif_rwlock_rwlock(hdl->lock)
+#define UNLOCK      enif_rwlock_rwunlock(hdl->lock)
+#define READ_LOCK   enif_rwlock_rlock(hdl->lock)
+#define READ_UNLOCK enif_rwlock_runlock(hdl->lock)
+
+// The fallback, min and iov defines are for the writev function from unix_prim_file.c
+#define FALLBACK_RW_LENGTH ((1ull << 31) - 1)
+#ifndef MIN
+    #define MIN(A, B) ((A) < (B) ? (A) : (B))
+#endif
+#if !defined(IOV_MAX) && defined(UIO_MAXIOV)
+    #define IOV_MAX UIO_MAXIOV
+#elif !defined(IOV_MAX)
+    #define IOV_MAX 16
+#endif
+
+// Extract our handle resource from a term. Non-zero on success, 0 if the
+// term is not a HANDLE_T resource.
+static int get_handle(ErlNifEnv *env, ERL_NIF_TERM arg, handle_t** h) {
+    void **out = (void**) h;
+
+    return enif_get_resource(env, arg, HANDLE_T, out);
+}
+
+// Build {ok, Res}
+static ERL_NIF_TERM ok_tup(ErlNifEnv* env, ERL_NIF_TERM res) {
+    ERL_NIF_TERM tagged;
+
+    tagged = enif_make_tuple2(env, ATOM_OK, res);
+    return tagged;
+}
+
+static ERL_NIF_TERM err_tup(ErlNifEnv *env, posix_errno_t posix_errno) {
+    ERL_NIF_TERM error = enif_make_atom(env, erl_errno_id(posix_errno));
+    return enif_make_tuple2(env, ATOM_ERROR, error);
+}
+
+static ERL_NIF_TERM badarg(ErlNifEnv *env) {
+    return enif_make_tuple2(env, ATOM_ERROR, ATOM_BADARG);
+}
+
+// Advance an iovec array past `shift` already-transferred bytes, updating
+// *iov and *iovlen in place. Copied from OTP; if it changes there, consider
+// updating this as well so both keep the same behavior.
+static void shift_iov(SysIOVec **iov, int *iovlen, ssize_t shift) {
+    SysIOVec *head_vec = (*iov);
+
+    while(shift > 0) {
+
+        if(shift < head_vec->iov_len) { // shift ends inside this entry: trim its front
+            head_vec->iov_base = (char*)head_vec->iov_base + shift;
+            head_vec->iov_len -= shift;
+            break;
+        } else { // entry fully consumed: skip over it
+            shift -= head_vec->iov_len;
+            head_vec++;
+        }
+    }
+
+    (*iovlen) -= head_vec - (*iov); // drop the skipped entries from the count
+    (*iov) = head_vec;
+}
+
+// Copied from OTP to keep the same logic. Some differences:
+//   - Pass file descriptor as int and errno result as a separate arg
+//   - Assume preadv exists so skip the check + fallback code
+// *res_errno always receives the errno value seen when the loop ends.
+static long efile_preadv(int fd, long offset, SysIOVec *iov, int iovlen, 
posix_errno_t* res_errno) {
+    unsigned long bytes_read;
+    long result;
+
+    bytes_read = 0;
+
+    do {
+        if(iovlen < 1) { // every iovec entry has been filled
+            result = 0;
+            break;
+        }
+        result = preadv(fd, (const struct iovec*)iov, MIN(IOV_MAX, iovlen), 
offset);
+        if(result > 0) {
+            shift_iov(&iov, &iovlen, result);
+            bytes_read += result;
+            offset += result;
+        }
+    } while(result > 0 || (result < 0 && errno == EINTR)); // continue on progress, retry on EINTR
+
+    *res_errno = errno;
+
+    if(result == 0 && bytes_read > 0) { // stopped cleanly after reading something: report the total
+        return bytes_read;
+    }
+
+    return result;
+}
+
+// Copied from OTP just like efile_preadv. Differences are:
+//  - Pass file descriptor as int and errno result as a separate arg
+//  - Assume writev exists
+// *res_errno always receives the errno value seen when the loop ends.
+static long efile_writev(int fd, SysIOVec *iov, int iovlen, posix_errno_t* 
res_errno) {
+    long bytes_written;
+    ssize_t result;
+
+    bytes_written = 0;
+
+    do {
+        int use_fallback = 0;
+
+        if(iovlen < 1) { // every iovec entry has been written
+            result = 0;
+            break;
+        }
+
+        result = writev(fd, (const struct iovec *)iov, MIN(IOV_MAX, iovlen));
+
+        /* Fall back to using write(2) if writev(2) reports that the combined
+         * size of iov is greater than SSIZE_T_MAX. */
+        use_fallback = (result < 0 && errno == EINVAL);
+
+        if(use_fallback) {
+            result = write(fd, iov->iov_base, iov->iov_len);
+
+            /* Some OSs (e.g. macOS) do not allow writes greater than 2 GB,
+               so if we get EINVAL in the fallback, we try with a smaller length */
+            if (result < 0 && errno == EINVAL && iov->iov_len > 
FALLBACK_RW_LENGTH)
+                result = write(fd, iov->iov_base, FALLBACK_RW_LENGTH);
+        }
+
+        if(result > 0) {
+            shift_iov(&iov, &iovlen, result);
+            bytes_written += result;
+        }
+    } while(result > 0 || (result < 0 && errno == EINTR)); // continue on progress, retry on EINTR
+
+    *res_errno = errno;
+
+    if(result == 0 && bytes_written > 0) { // stopped cleanly after writing something: report the total
+        return bytes_written;
+    }
+
+    return result;
+}
+
+// Copied from OTP. Differences are:
+//    - File descriptor and return error passed in as separate args
+//    - This is for datasync only so don't pass that extra argument in
+//
+// Flushes the data of fd to storage. Returns 1 on success; on failure returns
+// 0 and stores errno in *res_errno (which is left untouched on success).
+//
+// Exactly one sync primitive is compiled in, so no unreachable second sync
+// block is left behind the return on non-Darwin builds:
+//   - not Darwin:                  fdatasync()
+//   - Darwin with F_BARRIERFSYNC:  fcntl(F_BARRIERFSYNC)
+//   - Darwin with F_FULLFSYNC:     fcntl(F_FULLFSYNC)
+//   - Darwin otherwise:            fsync()
+//
+int efile_datasync(int fd, posix_errno_t* res_errno) {
+    int rc;
+
+#if !defined(__DARWIN__)
+    rc = fdatasync(fd);
+#elif defined(F_BARRIERFSYNC)
+    rc = fcntl(fd, F_BARRIERFSYNC);
+#elif defined(F_FULLFSYNC)
+    rc = fcntl(fd, F_FULLFSYNC);
+#else
+    rc = fsync(fd);
+#endif
+
+    if(rc < 0) {
+        *res_errno = errno;
+        return 0;
+    }
+    return 1;
+}
+
+// Duplicate a file descriptor and wrap the copy in a resource handle that
+// monitors the calling process. dup() doesn't open a new "file description"
+// entry, it just creates an extra light-weight file "descriptor" id for it:
+// position, buffers and permission bits are all shared with the original.
+// See https://www.man7.org/linux/man-pages/man2/dup.2.html for details
+static ERL_NIF_TERM dup_nif(ErlNifEnv *env, int argc, const ERL_NIF_TERM 
argv[])
+{
+#ifndef _WIN32
+   int fd, newfd;
+   handle_t* hdl;
+   ErlNifRWLock *lock;
+   ErlNifPid pid;
+   ERL_NIF_TERM res;
+
+
+   if (argc != 1 || !enif_is_number(env, argv[0])) {
+       return badarg(env);
+   }
+   if (!enif_get_int(env, argv[0], &fd) || fd < 0) {
+       return err_tup(env, EINVAL);
+   }
+
+   if(!enif_self(env, &pid)){
+       // Calling from not a process-bound environment? (highly unlikely)
+       return err_tup(env, EINVAL);
+   }
+   newfd = dup(fd);
+
+   if (newfd < 0) {
+       return err_tup(env, errno);
+   }
+   // From here on close the new dup-ed file descriptors on any failure.
+
+   lock = enif_rwlock_create("couch_cfile:rwlock");
+   if(!lock) {
+       close(newfd);
+       return err_tup(env, ENOMEM);
+   }
+
+   hdl = (handle_t*) enif_alloc_resource(HANDLE_T, sizeof(handle_t));
+   if (!hdl) {
+       close(newfd);
+       enif_rwlock_destroy(lock);
+       return err_tup(env, ENOMEM);
+   }
+   // From here on, once we release our resource the destructor will be called.
+   // On failures below, we'll let the destructor deallocate the lock, but
+   // we'll close the new handle here manually. The descriptors will be set to
+   // -1 so the state in the handle will be "closed" until we know everything
+   // is good to go
+
+   hdl->lock = lock;
+   hdl->fd = -1;
+   hdl->old_fd = -1;
+
+   if (enif_monitor_process(env, hdl, &pid, &hdl->monitor) != 0) { // could not monitor the caller
+       close(newfd);
+       enif_release_resource(hdl);
+       return err_tup(env, EINVAL);
+   }
+
+   // Everything is good to go. Return the new handle
+   hdl->fd = newfd;
+   hdl->old_fd = fd;
+
+   res = enif_make_resource(env, hdl);
+   enif_release_resource(hdl); // drop our reference; the term made above keeps its own
+   return ok_tup(env, res);
+#else
+   return err_tup(env, EINVAL);
+#endif
+}
+
+// Close the file descriptor owned by the handle. Takes the write lock so it
+// cannot overlap with calls holding the read lock. Returns ok,
+// {error, einval} if the handle is already closed, or {error, Posix} if
+// close() itself failed.
+//
+static ERL_NIF_TERM close_nif(ErlNifEnv *env, int argc, const ERL_NIF_TERM argv[])
+{
+#ifndef _WIN32
+    handle_t* hdl;
+    int rc;
+    posix_errno_t res_errno = 0;
+
+    if (argc != 1 || !get_handle(env, argv[0], &hdl)) {
+        return badarg(env);
+    }
+
+    // ------ Critical section start ------
+    LOCK;
+    if (hdl->fd < 0) {
+         UNLOCK;
+         return err_tup(env, EINVAL);
+    }
+    enif_demonitor_process(env, hdl, &hdl->monitor);
+    rc = close(hdl->fd);
+    if (rc < 0) {
+        // Save errno right away: the unlock call below is not guaranteed to
+        // leave it untouched
+        res_errno = errno;
+    }
+    // Mark the handle closed whether or not close() failed, so the
+    // descriptor is never closed a second time
+    hdl->fd = -1;
+    UNLOCK;
+    // ------ Critical section end ------
+
+    if (rc < 0) {
+        return err_tup(env, res_errno);
+    }
+    return ATOM_OK;
+#else
+   return err_tup(env, EINVAL);
+#endif
+}
+
+// Close a raw integer file descriptor; no handle and no locking is involved.
+// !!! This should be called from the janitor process only !!!
+//
+static ERL_NIF_TERM close_fd_nif(ErlNifEnv *env, int argc, const ERL_NIF_TERM argv[])
+{
+#ifndef _WIN32
+    int raw_fd;
+    int rc;
+
+    if (argc != 1) {
+        return badarg(env);
+    }
+    if (!enif_is_number(env, argv[0])) {
+        return badarg(env);
+    }
+
+    // Numbers which enif_get_int rejects, and negative ones, are einval
+    if (!enif_get_int(env, argv[0], &raw_fd)) {
+        return err_tup(env, EINVAL);
+    }
+    if (raw_fd < 0) {
+        return err_tup(env, EINVAL);
+    }
+
+    rc = close(raw_fd);
+    return rc < 0 ? err_tup(env, errno) : ATOM_OK;
+#else
+    return err_tup(env, EINVAL);
+#endif
+}
+
+
+// Follows pread_nif_impl from prim_file_nif.c. Reads up to block_size bytes at
+// offset; returns {ok, Binary}, eof when nothing was read, or {error, Reason}.
+static ERL_NIF_TERM pread_nif(ErlNifEnv* env, int argc, const ERL_NIF_TERM 
argv[])
+{
+#ifndef _WIN32
+   handle_t* hdl;
+   long offset, block_size, bytes_read;
+   SysIOVec io_vec[1];
+   posix_errno_t res_errno = 0;
+   ErlNifBinary result;
+
+   if (argc != 3
+       || !get_handle(env, argv[0], &hdl)
+       || !enif_is_number(env, argv[1])
+       || !enif_is_number(env, argv[2])
+   ) {
+     return badarg(env);
+   }
+
+   if (!enif_get_int64(env, argv[1], &offset)
+       || !enif_get_int64(env, argv[2], &block_size)
+       || offset < 0
+       || block_size < 0
+   ) {
+        return err_tup(env, EINVAL);
+   }
+
+   if (!enif_alloc_binary((size_t) block_size, &result)) {
+       return err_tup(env, ENOMEM);
+   }
+
+   io_vec[0].iov_base = (char *)result.data;
+   io_vec[0].iov_len = result.size;
+
+   // ------ Critical section start ------
+   READ_LOCK; // read lock only: other read-lock holders may run at the same time
+   if (hdl->fd < 0) {
+       READ_UNLOCK;
+       enif_release_binary(&result);
+       return err_tup(env, EINVAL);
+   }
+   bytes_read = efile_preadv(hdl->fd, offset, io_vec, 1, &res_errno);
+   READ_UNLOCK;
+   // ------ Critical section end ------
+
+   if (bytes_read < 0) {
+       enif_release_binary(&result);
+       return err_tup(env, res_errno);
+   }
+   if (bytes_read == 0) {
+       enif_release_binary(&result);
+       return ATOM_EOF;
+   }
+   if (bytes_read < block_size && !enif_realloc_binary(&result, bytes_read)) { // short read: shrink to what was read
+       enif_release_binary(&result);
+       return err_tup(env, ENOMEM);
+   }
+   return ok_tup(env, enif_make_binary(env, &result));
+#else
+   return err_tup(env, EINVAL);
+#endif
+}
+
+// Follows implementation from prim_file_nif.c
+//
+// Writes iodata at the current file position. At most 64 iovec entries are
+// inspected per call; when input remains, {continue, Tail} is returned with
+// the unprocessed rest. Otherwise returns ok, or {error, Reason}.
+//
+// Only the read lock is taken: the lock guards the closing state of the
+// handle, not the operations themselves.
+//
+static ERL_NIF_TERM write_nif(ErlNifEnv* env, int argc, const ERL_NIF_TERM argv[]) {
+#ifndef _WIN32
+    handle_t* hdl;
+    ErlNifIOVec vec, *input = &vec;
+    posix_errno_t  res_errno = 0;
+    long bytes_written;
+    ERL_NIF_TERM tail;
+
+    if (argc != 2
+       || !get_handle(env, argv[0], &hdl)
+       || !enif_inspect_iovec(env, 64, argv[1], &tail, &input)
+    ) {
+     return badarg(env);
+    }
+
+    // ------ Critical section start ------
+    READ_LOCK;
+    if (hdl->fd < 0) {
+       READ_UNLOCK;
+       return err_tup(env, EINVAL);
+    }
+    bytes_written = efile_writev(hdl->fd, input->iov, input->iovcnt, &res_errno);
+    READ_UNLOCK;
+    // ------- Critical section end ------
+
+    if(bytes_written < 0) {
+        return err_tup(env, res_errno);
+    }
+
+    if(!enif_is_empty_list(env, tail)) {
+        return enif_make_tuple2(env, ATOM_CONTINUE, tail);
+    }
+
+    return ATOM_OK;
+#else
+    // The terminating semicolon was missing here, a syntax error whenever
+    // _WIN32 is defined
+    return err_tup(env, EINVAL);
+#endif
+}
+
+// Reposition the file offset of the handle. Only the `bof` and `eof` origins
+// with a non-negative offset are accepted. Returns {ok, NewPosition} or
+// {error, Reason}.
+//
+static ERL_NIF_TERM seek_nif(ErlNifEnv *env, int argc, const ERL_NIF_TERM argv[]) {
+#ifndef _WIN32
+    handle_t* hdl;
+    long result, offset;
+    int whence;
+    posix_errno_t res_errno = 0;
+
+    if (argc != 3
+       || !get_handle(env, argv[0], &hdl)
+       || !enif_is_atom(env, argv[1])
+       || !enif_is_number(env, argv[2])
+    ) {
+     return badarg(env);
+    }
+
+    if (enif_is_identical(argv[1], ATOM_BOF)) {
+        whence = SEEK_SET;
+    } else if (enif_is_identical(argv[1], ATOM_EOF)) {
+        whence = SEEK_END;
+    } else {
+        return badarg(env);
+    }
+
+    if(!enif_get_int64(env, argv[2], &offset) || offset < 0){
+        return err_tup(env, EINVAL);
+    }
+
+    // ------ Critical section start ------
+    READ_LOCK;
+    if (hdl->fd < 0) {
+        READ_UNLOCK;
+        return err_tup(env, EINVAL);
+    }
+    result = lseek(hdl->fd, offset, whence);
+    // Save errno before unlocking: the unlock call is not guaranteed to
+    // leave it untouched
+    res_errno = errno;
+    READ_UNLOCK;
+    // ------ Critical section end ------
+
+    if(result < 0) {
+        // Follow OTP special case here: result < 0 with errno = 0 is einval
+        return err_tup(env, res_errno == 0 ? EINVAL : res_errno);
+    }
+    return ok_tup(env, enif_make_uint64(env, result));
+#else
+    // The terminating semicolon was missing here, a syntax error whenever
+    // _WIN32 is defined
+    return err_tup(env, EINVAL);
+#endif
+}
+
+// Flush the file data of the handle to storage via efile_datasync. Returns
+// ok, {error, einval} if the handle is closed, or {error, Posix} on failure.
+//
+static ERL_NIF_TERM datasync_nif(ErlNifEnv *env, int argc, const ERL_NIF_TERM argv[]) {
+#ifndef _WIN32
+    handle_t* hdl;
+    posix_errno_t  res_errno = 0;
+
+    if (argc != 1 || !get_handle(env, argv[0], &hdl)) {
+        return badarg(env);
+    }
+
+    // ------ Critical section start ------
+    READ_LOCK;
+    if (hdl->fd < 0) {
+        READ_UNLOCK;
+        return err_tup(env, EINVAL);
+    }
+    if(!efile_datasync(hdl->fd, &res_errno)) {
+       READ_UNLOCK;
+       return err_tup(env, res_errno);
+    }
+    READ_UNLOCK;
+    // ------ Critical section end ------
+
+    return ATOM_OK;
+#else
+    // The terminating semicolon was missing here, a syntax error whenever
+    // _WIN32 is defined
+    return err_tup(env, EINVAL);
+#endif
+}
+
+// Truncate the file at its current position (as reported by lseek with
+// SEEK_CUR). Returns ok, {error, einval} if the handle is closed, or
+// {error, Posix} if lseek or ftruncate failed.
+//
+static ERL_NIF_TERM truncate_nif(ErlNifEnv *env, int argc, const ERL_NIF_TERM argv[]) {
+#ifndef _WIN32
+    handle_t* hdl;
+    off_t offset;
+    posix_errno_t res_errno = 0;
+
+    if (argc != 1 || !get_handle(env, argv[0], &hdl)) {
+     return badarg(env);
+    }
+
+    // ------ Critical section start ------
+    READ_LOCK;
+    if (hdl->fd < 0) {
+        READ_UNLOCK;
+        return err_tup(env, EINVAL);
+    }
+    offset = lseek(hdl->fd, 0, SEEK_CUR);
+    if (offset < 0) {
+        // Save errno before unlocking: the unlock call is not guaranteed to
+        // leave it untouched
+        res_errno = errno;
+        READ_UNLOCK;
+        return err_tup(env, res_errno);
+    }
+    if (ftruncate(hdl->fd, offset) < 0){
+        res_errno = errno;
+        READ_UNLOCK;
+        return err_tup(env, res_errno);
+    }
+    READ_UNLOCK;
+    // ------ Critical section end ------
+
+    return ATOM_OK;
+#else
+    // The terminating semicolon was missing here, a syntax error whenever
+    // _WIN32 is defined
+    return err_tup(env, EINVAL);
+#endif
+}
+
+// Return {ok, {Fd, OldFd}} with info about the handle. The fields are:
+//   fd : file descriptor (int)
+//   old_fd : file descriptor we dup()-ed from (int)
+// Returns {error, einval} if the handle is already closed.
+static ERL_NIF_TERM info_nif(ErlNifEnv* env, int argc, const ERL_NIF_TERM 
argv[])
+{
+#ifndef _WIN32
+    handle_t* hdl;
+    int fd, old_fd;
+
+    if (argc != 1 || !get_handle(env, argv[0], &hdl)) {
+        return badarg(env);
+    }
+
+    // ------ Critical section start ------
+    READ_LOCK;
+    if (hdl->fd < 0) {
+        READ_UNLOCK;
+        return err_tup(env, EINVAL);
+    }
+    fd = hdl->fd; // copy both values out while the lock is held
+    old_fd = hdl->old_fd;
+    READ_UNLOCK;
+    // ------ Critical section end ------
+
+    return ok_tup(env, enif_make_tuple2(env,
+        enif_make_int(env, fd),
+        enif_make_int(env, old_fd)
+    ));
+#else
+    return err_tup(env, EINVAL);
+#endif
+}
+
+// Return the eof marker (the file size, taken from fstat). This is a
+// read-only call and can be called by any reader process not just the owner.
+// Returns {ok, Size}, {error, einval} if the handle is closed, or
+// {error, Posix} if fstat failed.
+//
+static ERL_NIF_TERM eof_nif(ErlNifEnv* env, int argc, const ERL_NIF_TERM argv[])
+{
+#ifndef _WIN32
+    handle_t* hdl;
+    struct stat data;
+    posix_errno_t res_errno = 0;
+
+    if (argc != 1 || !get_handle(env, argv[0], &hdl)) {
+        return badarg(env);
+    }
+
+    // ------ Critical section start ------
+    READ_LOCK;
+    if (hdl->fd < 0) {
+        READ_UNLOCK;
+        return err_tup(env, EINVAL);
+    }
+    if (fstat(hdl->fd, &data) < 0) {
+        // Save errno before unlocking: the unlock call is not guaranteed to
+        // leave it untouched
+        res_errno = errno;
+        READ_UNLOCK;
+        return err_tup(env, res_errno);
+    }
+    READ_UNLOCK;
+    // ------ Critical section end ------
+
+    return ok_tup(env, enif_make_int64(env, data.st_size));
+#else
+    return err_tup(env, EINVAL);
+#endif
+}
+
+// Hand a file descriptor over to the janitor process by sending it a
+// {close, Fd} message, instead of closing it here.
+//
+// We cannot block the main scheduler in GC to close fds. NFS remote files
+// could be delayed indefinitely so we, at least, want to make sure it
+// happens on a dirty scheduler. See OTP prim_file_nif as example of this
+// pattern.
+//
+static void send_delay_close(ErlNifEnv *env, int fd)
+{
+    ERL_NIF_TERM fd_term = enif_make_int(env, fd);
+
+    enif_send(env, &JANITOR_PID, NULL, enif_make_tuple2(env, ATOM_CLOSE, fd_term));
+}
+
+// Resource destructor. If the descriptor is still open, mark the handle
+// closed and pass the descriptor to the janitor for a delayed close; then
+// destroy the lock.
+//
+static void handle_dtor(ErlNifEnv* env, void *obj)
+{
+    handle_t* hdl = obj;
+    int pending_fd;
+
+    // ------ Critical section start ------
+    LOCK;
+    pending_fd = hdl->fd;
+    if (pending_fd >= 0) {
+        hdl->fd = -1;
+        send_delay_close(env, pending_fd);
+    }
+    UNLOCK;
+    // ------ Critical section end ------
+
+    enif_rwlock_destroy(hdl->lock);
+}
+
+// Monitor callback for the handle's monitored process. If the descriptor is
+// still open, mark the handle closed and pass the descriptor to the janitor
+// for a delayed close. The lock itself is left alone here.
+//
+static void handle_down(ErlNifEnv* env, void* obj, ErlNifPid* pid, ErlNifMonitor* mon)
+{
+    handle_t* hdl = obj;
+    int pending_fd;
+
+    // ------ Critical section start ------
+    LOCK;
+    pending_fd = hdl->fd;
+    if (pending_fd >= 0) {
+        hdl->fd = -1;
+        send_delay_close(env, pending_fd);
+    }
+    UNLOCK;
+    // ------ Critical section end ------
+}
+
+
+// NIF load callback. The load info term must be the local pid of the janitor
+// process. Registers the handle resource type and creates the atoms used by
+// the NIFs. Returns 0 on success, -1 on failure.
+//
+static int load(ErlNifEnv* env, void** priv_data, ERL_NIF_TERM pid)
+{
+    ErlNifResourceTypeInit callbacks = {
+        .dtor    = handle_dtor,
+        .stop    = NULL,
+        .down    = handle_down,
+        .members = 4,
+        .dyncall = NULL
+    };
+
+    if (!enif_get_local_pid(env, pid, &JANITOR_PID)) {
+        return -1;
+    }
+
+    HANDLE_T = enif_init_resource_type(env, "couch_cfile:hdl", &callbacks, ERL_NIF_RT_CREATE, NULL);
+    if (HANDLE_T == NULL) {
+        return -1;
+    }
+
+    ATOM_BOF      = enif_make_atom(env, "bof");
+    ATOM_EOF      = enif_make_atom(env, "eof");
+    ATOM_ERROR    = enif_make_atom(env, "error");
+    ATOM_OK       = enif_make_atom(env, "ok");
+    ATOM_CLOSE    = enif_make_atom(env, "close");
+    ATOM_BADARG   = enif_make_atom(env, "badarg");
+    ATOM_CONTINUE = enif_make_atom(env, "continue");
+
+    *priv_data = NULL;
+    return 0;
+}
+
+static ErlNifFunc funcs[] = { // {name, arity, C function, flags}
+    {"dup_nif",      1, dup_nif,      ERL_NIF_DIRTY_JOB_IO_BOUND},
+    {"close_nif",    1, close_nif,    ERL_NIF_DIRTY_JOB_IO_BOUND},
+    {"close_fd_nif", 1, close_fd_nif, ERL_NIF_DIRTY_JOB_IO_BOUND},
+    {"pread_nif",    3, pread_nif,    ERL_NIF_DIRTY_JOB_IO_BOUND},
+    {"eof_nif",      1, eof_nif,      ERL_NIF_DIRTY_JOB_IO_BOUND},
+    {"seek_nif",     3, seek_nif,     ERL_NIF_DIRTY_JOB_IO_BOUND},
+    {"write_nif",    2, write_nif,    ERL_NIF_DIRTY_JOB_IO_BOUND},
+    {"datasync_nif", 1, datasync_nif, ERL_NIF_DIRTY_JOB_IO_BOUND},
+    {"truncate_nif", 1, truncate_nif, ERL_NIF_DIRTY_JOB_IO_BOUND},
+    {"info_nif",     1, info_nif} // flags omitted: the only entry without the dirty I/O flag
+};
+
+ERL_NIF_INIT(couch_cfile, funcs, load, NULL, NULL, NULL); // only a load callback is supplied
diff --git a/src/couch/rebar.config.script b/src/couch/rebar.config.script
index e26b6b608..c4e0b7a7e 100644
--- a/src/couch/rebar.config.script
+++ b/src/couch/rebar.config.script
@@ -295,6 +295,10 @@ IcuWinEnv = [{"CFLAGS", "$DRV_CFLAGS /DXP_WIN"},
 ComparePath = "priv/couch_ejson_compare.so".
 CompareSrc = ["priv/couch_ejson_compare/*.c"].
 
+CouchCFileEnv = [{"CFLAGS", "$CFLAGS -Wall -Werror -DNDEBUG -O3"}].
+CouchCFilePath = "priv/couch_cfile.so".
+CouchCFileSrc = ["priv/couch_cfile/*.c"].
+
 SpidermonkeySpecs = case WithSpidermonkey of
     true -> [{".*", CouchJSPath, CouchJSSrc, [{env, CouchJSEnv}]}];
     false -> []
@@ -305,7 +309,9 @@ PortSpecs = SpidermonkeySpecs ++ [
         {"darwin", ComparePath, CompareSrc, [{env, IcuEnv ++ IcuDarwinEnv}]},
         {"linux",  ComparePath, CompareSrc, [{env, IcuEnv}]},
         {"bsd",   ComparePath, CompareSrc, [{env, IcuEnv ++ IcuBsdEnv}]},
-        {"win32",  ComparePath, CompareSrc, [{env, IcuWinEnv}]}
+        {"win32",  ComparePath, CompareSrc, [{env, IcuWinEnv}]},
+        {"(linux|bsd|darwin)", CouchCFilePath, CouchCFileSrc, [{env, 
CouchCFileEnv}]},
+        {"win32",  CouchCFilePath, CouchCFileSrc, []}
 ].
 
 %% hack required until switch to enc/rebar3
diff --git a/src/couch/src/couch_cfile.erl b/src/couch/src/couch_cfile.erl
new file mode 100644
index 000000000..e4d2835fc
--- /dev/null
+++ b/src/couch/src/couch_cfile.erl
@@ -0,0 +1,285 @@
+% Licensed under the Apache License, Version 2.0 (the "License"); you may not
+% use this file except in compliance with the License. You may obtain a copy of
+% the License at
+%
+%   http://www.apache.org/licenses/LICENSE-2.0
+%
+% Unless required by applicable law or agreed to in writing, software
+% distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
+% WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
+% License for the specific language governing permissions and limitations under
+% the License.
+
+% This module can dup(licate) raw file handles to create file handles which
+% allow other processes to issue pread calls. This lets clients completely
+% bypass the couch_file gen_server message queue to do reads.
+%
+% At the POSIX API level pread() functions are thread-safe so calls can be
+% issued in parallel by multiple threads. See these links to find out more
+% about dup() and pread():
+%
+%  - https://www.man7.org/linux/man-pages/man2/dup.2.html
+%  - https://www.man7.org/linux/man-pages/man2/pread.2.html
+
+-module(couch_cfile).
+
+-export([
+    dup/1,
+    pread/2,
+    pread/3,
+    close/1,
+    position/2,
+    datasync/1,
+    write/2,
+    truncate/1,
+    fd/1,
+    advise/4
+]).
+
+% Internal exports
+%
+-export([
+    janitor/0
+]).
+
+-on_load(init/0).
+
+-nifs([
+    dup_nif/1,
+    close_nif/1,
+    close_fd_nif/1,
+    pread_nif/3,
+    info_nif/1,
+    eof_nif/1,
+    seek_nif/3,
+    write_nif/2,
+    datasync_nif/1,
+    truncate_nif/1
+]).
+
+-include_lib("kernel/include/file.hrl").
+
+% Duplicate an open raw file handle. The dup-ed handle references the same
+% "file description" as the prim_file raw handle, so the original prim_file
+% handle can be closed after duplicating.
+%
+% Handles returned from dup/1 follow the standard Erlang/OTP
+% #file_descriptor{} "protocol", so the regular `file` module can use them
+% transparently for pread, write, truncate and position calls.
+%
+% Anything that is not a prim_file handle is rejected with {error, einval}.
+%
+dup(#file_descriptor{module = prim_file} = RawFd) ->
+    dup_int_fd(RawFd, fd(RawFd));
+dup(_) ->
+    {error, einval}.
+
+% Second half of dup/1: takes the result of fd/1 on the raw handle and either
+% dups the integer descriptor and wraps it, or passes the error through.
+%
+dup_int_fd(RawFd, {ok, FdInt}) ->
+    case dup_nif(FdInt) of
+        {ok, Ref} -> make_handle(RawFd, Ref);
+        {error, _} = DupError -> DupError
+    end;
+dup_int_fd(_RawFd, {error, _} = FdError) ->
+    FdError.
+
+% Close a cfile handle. The handle is obtained through owner_handle/1, which
+% raises not_on_controlling_process when called from a non-owner process.
+%
+close(#file_descriptor{module = ?MODULE} = CFd) ->
+    OwnerRef = owner_handle(CFd),
+    close_nif(OwnerRef);
+close(_) ->
+    {error, einval}.
+
+% Read Len bytes starting at Pos. The handle comes from handle/1, which does
+% not check ownership, so any process holding the descriptor may call this.
+%
+pread(#file_descriptor{module = ?MODULE} = CFd, Pos, Len) ->
+    Ref = handle(CFd),
+    pread_nif(Ref, Pos, Len);
+pread(_, _, _) ->
+    {error, einval}.
+
+% Read a list of {Pos, Len} locations in order. Like pread/3 this does not
+% require the caller to be the owner of the handle.
+%
+pread(#file_descriptor{module = ?MODULE} = CFd, LocNums) ->
+    Ref = handle(CFd),
+    pread_list(Ref, LocNums, []);
+pread(_, _) ->
+    {error, einval}.
+
+% Only position(Fd, eof) and position(Fd, Pos) with a non-negative integer Pos
+% are supported; everything else returns {error, einval}.
+%
+% The eof variant can be used by any process: the owner goes through the seek
+% NIF, while readers (non-owners) get the file size from the eof NIF instead.
+% Setting an absolute position is restricted to the owner.
+%
+position(#file_descriptor{module = ?MODULE, data = Data} = Fd, eof) ->
+    #{owner := OwnerPid} = Data,
+    if
+        OwnerPid =:= self() -> seek_nif(owner_handle(Fd), eof, 0);
+        true -> eof_nif(handle(Fd))
+    end;
+position(#file_descriptor{module = ?MODULE} = Fd, Pos) when
+    is_integer(Pos), Pos >= 0
+->
+    OwnerRef = owner_handle(Fd),
+    seek_nif(OwnerRef, bof, Pos);
+position(_, _) ->
+    {error, einval}.
+
+% Sync file data via the datasync NIF. Owner-only: owner_handle/1 raises
+% not_on_controlling_process for other processes.
+%
+datasync(#file_descriptor{module = ?MODULE} = CFd) ->
+    OwnerRef = owner_handle(CFd),
+    datasync_nif(OwnerRef);
+datasync(_) ->
+    {error, einval}.
+
+% Write IOData through the owner's handle. Only the owning process may write;
+% owner_handle/1 raises not_on_controlling_process for anyone else, and that
+% check still runs before the data is inspected.
+%
+% Malformed iodata makes erlang:iolist_to_iovec/1 raise badarg. That is
+% converted to {error, badarg} here so a bad argument is reported the same way
+% whether `file:write/2` is handed a raw handle or a cfile handle.
+% NOTE(review): this mirrors what OTP's prim_file:write/2 is understood to do
+% for bad data - confirm against the OTP version in use.
+%
+write(#file_descriptor{module = ?MODULE} = Fd, IOData) ->
+    Ref = owner_handle(Fd),
+    try erlang:iolist_to_iovec(IOData) of
+        IOVec -> write_1(Ref, IOVec)
+    catch
+        error:badarg -> {error, badarg}
+    end.
+
+% Truncate the file via the truncate NIF. Owner-only, enforced by
+% owner_handle/1.
+%
+truncate(#file_descriptor{module = ?MODULE} = CFd) ->
+    OwnerRef = owner_handle(CFd),
+    truncate_nif(OwnerRef).
+
+% Return the raw (integer) file descriptor behind a handle as {ok, FdInt}.
+% Handy for debugging. Works for both prim_file handles and handles created
+% by this module; anything else yields {error, einval}.
+%
+fd(#file_descriptor{module = prim_file} = RawFd) ->
+    RawHandle = prim_file:get_handle(RawFd),
+    case RawHandle of
+        <<FdInt:32/signed-native-integer>> -> {ok, FdInt};
+        _ -> {error, einval}
+    end;
+fd(#file_descriptor{module = ?MODULE, data = FdData}) ->
+    #{handle := Ref} = FdData,
+    case info_nif(Ref) of
+        {error, _} = InfoError -> InfoError;
+        {ok, {FdInt, _}} -> {ok, FdInt}
+    end;
+fd(_) ->
+    {error, einval}.
+
+% advise/4 is optional to implement, so for now it is a no-op that only
+% validates its arguments and returns ok.
+%
+advise(#file_descriptor{module = ?MODULE} = CFd, Offset, Length, Advice) when
+    is_integer(Offset),
+    Offset >= 0,
+    is_integer(Length),
+    Length >= 0,
+    is_atom(Advice)
+->
+    % Still enforce the owner check: if/when this gets a real implementation
+    % only the owner will be allowed to call it.
+    _ = owner_handle(CFd),
+    ok;
+advise(_, _, _, _) ->
+    {error, einval}.
+
+% Internal helpers
+
+% Wrap a NIF reference into a #file_descriptor{} owned by the calling process.
+% The new handle is only returned if it passes sanity_check/2 against the
+% original; otherwise the NIF handle is closed, the failure is logged and
+% {error, einval} is returned.
+%
+make_handle(#file_descriptor{module = prim_file} = Orig, Ref) ->
+    Dup = #file_descriptor{
+        module = ?MODULE,
+        data = #{handle => Ref, owner => self()}
+    },
+    case sanity_check(Orig, Dup) of
+        false ->
+            close_nif(Ref),
+            LogArgs = [?MODULE, Orig, Dup],
+            couch_log:error("~p : sanity check failed fd:~p dup:~p", LogArgs),
+            {error, einval};
+        true ->
+            {ok, Dup}
+    end.
+
+% Returns true only if the original handle's integer fd and the origin fd
+% saved in the dup-ed handle are both valid (non-negative integers) and equal.
+%
+% This has to run after the dup call. It is not obvious how it could fail
+% (perhaps the raw fd was closed and re-opened by someone else right before
+% dup-ing?), but better safe than sorry. Re-fetching both descriptors also
+% implicitly asserts that neither has been closed in the meantime.
+%
+sanity_check(#file_descriptor{} = Orig, #file_descriptor{} = Dup) ->
+    case fd(Orig) of
+        {ok, OrigFd} when is_integer(OrigFd), OrigFd >= 0 ->
+            is_same_origin_fd(OrigFd, info_nif(owner_handle(Dup)));
+        _ ->
+            false
+    end.
+
+% Compare the original integer fd with the second element of the info_nif/1
+% result for the dup-ed handle.
+%
+is_same_origin_fd(OrigFd, {ok, {_, SavedFd}}) when
+    is_integer(SavedFd), SavedFd >= 0
+->
+    OrigFd =:= SavedFd;
+is_same_origin_fd(_OrigFd, {ok, {_, _}}) ->
+    false;
+is_same_origin_fd(_OrigFd, {error, _}) ->
+    false.
+
+% Extract the NIF reference from a cfile descriptor without any owner check.
+%
+handle(#file_descriptor{module = ?MODULE, data = #{} = FdData}) ->
+    #{handle := NifRef} = FdData,
+    NifRef.
+
+% Extract the NIF reference from a cfile descriptor, but only for the process
+% recorded as its owner. Any other caller gets a not_on_controlling_process
+% error raised.
+%
+owner_handle(#file_descriptor{module = ?MODULE, data = #{} = FdData}) ->
+    #{handle := NifRef, owner := OwnerPid} = FdData,
+    if
+        OwnerPid =:= self() -> NifRef;
+        true -> error(not_on_controlling_process)
+    end.
+
+% pread_list/3 and write_1/2 are copied from the OTP pread/write logic.
+%
+% Read each {Pos, Len} location in turn, collecting either the data or the
+% atom eof per location. The first error aborts the whole read and is
+% returned as is.
+%
+pread_list(_Ref, [], Acc) ->
+    {ok, lists:reverse(Acc)};
+pread_list(Ref, [{Pos, Len} | Rest], Acc) ->
+    case pread_nif(Ref, Pos, Len) of
+        {error, _} = Error -> Error;
+        eof -> pread_list(Ref, Rest, [eof | Acc]);
+        {ok, Data} -> pread_list(Ref, Rest, [Data | Acc])
+    end.
+
+% Keep calling the write NIF until it stops handing back a remainder.
+%
+write_1(Ref, IOVec) ->
+    case write_nif(Ref, IOVec) of
+        ok ->
+            ok;
+        {error, Reason} ->
+            {error, Reason};
+        {continue, Remainder} ->
+            write_1(Ref, Remainder)
+    end.
+
+% on_load hook: locate the priv directory and load the couch_cfile NIF,
+% passing the janitor pid as the load info.
+%
+init() ->
+    NifPath = filename:join(nif_priv_dir(), "couch_cfile"),
+    JanitorPid = spawn_janitor(),
+    erlang:load_nif(NifPath, JanitorPid).
+
+% Find the priv directory. If code:priv_dir/1 fails, derive it from the
+% location of this module's beam file (<app>/ebin/.. -> <app>/priv).
+%
+% NOTE(review): code:priv_dir/1 expects an application name while ?MODULE is
+% couch_cfile; presumably the fallback branch is the one normally taken -
+% confirm.
+%
+nif_priv_dir() ->
+    case code:priv_dir(?MODULE) of
+        {error, _} ->
+            BeamPath = code:which(?MODULE),
+            AppDir = filename:dirname(filename:dirname(BeamPath)),
+            filename:join(AppDir, "priv");
+        Dir ->
+            Dir
+    end.
+
+% Return the pid of the janitor process, starting and registering it under
+% the module name if it is not registered yet. The janitor runs all the
+% delayed close calls on the dirty IO schedulers. This is the pattern OTP
+% uses, so we stick with it rather than re-inventing the wheel.
+%
+spawn_janitor() ->
+    spawn_janitor(whereis(?MODULE)).
+
+spawn_janitor(Registered) when is_pid(Registered) ->
+    Registered;
+spawn_janitor(_) ->
+    NewPid = spawn(?MODULE, janitor, []),
+    register(?MODULE, NewPid),
+    NewPid.
+
+% Entry point of the janitor process.
+%
+janitor() ->
+    % Link to init so that the node crashes if this process dies. This
+    % approximates the behavior of erts_internal:spawn_system_process/3.
+    InitPid = whereis(init),
+    link(InitPid),
+    loop().
+
+% Janitor receive loop: close the integer fd of every {close, Fd} message and
+% silently drop anything else so the mailbox never accumulates.
+%
+loop() ->
+    receive
+        {close, Fd} when is_integer(Fd) ->
+            close_fd_nif(Fd),
+            loop();
+        _Other ->
+            loop()
+    end.
+
+% NIF stubs. Each of these raises nif_not_loaded via erlang:nif_error/1 and
+% is listed in the -nifs attribute of this module; the native versions are
+% loaded by init/0.
+%
+dup_nif(_) ->
+    erlang:nif_error(nif_not_loaded).
+
+close_nif(_) ->
+    erlang:nif_error(nif_not_loaded).
+
+close_fd_nif(_) ->
+    erlang:nif_error(nif_not_loaded).
+
+pread_nif(_, _, _) ->
+    erlang:nif_error(nif_not_loaded).
+
+eof_nif(_) ->
+    erlang:nif_error(nif_not_loaded).
+
+info_nif(_) ->
+    erlang:nif_error(nif_not_loaded).
+
+seek_nif(_, _, _) ->
+    erlang:nif_error(nif_not_loaded).
+
+write_nif(_, _) ->
+    erlang:nif_error(nif_not_loaded).
+
+datasync_nif(_) ->
+    erlang:nif_error(nif_not_loaded).
+
+truncate_nif(_) ->
+    erlang:nif_error(nif_not_loaded).
diff --git a/src/couch/src/couch_file.erl b/src/couch/src/couch_file.erl
index 9c2a7a1d8..8f15bd8cd 100644
--- a/src/couch/src/couch_file.erl
+++ b/src/couch/src/couch_file.erl
@@ -23,6 +23,10 @@
 -define(DEFAULT_READ_COUNT, 1024).
 -define(WRITE_XXHASH_CHECKSUMS_DEFAULT, false).
 
+-define(USE_CFILE_DEFAULT, true).
+-define(CFILE_SKIP_IOQ_DEFAULT, false).
+-define(CFILE_HANDLE, cfile_handle).
+
 -type block_id() :: non_neg_integer().
 -type location() :: non_neg_integer().
 -type header_size() :: non_neg_integer().
@@ -31,7 +35,8 @@
     fd,
     is_sys,
     eof = 0,
-    db_monitor
+    db_monitor,
+    filepath
 }).
 
 % public API
@@ -186,11 +191,49 @@ pread_binaries(Fd, PosList) ->
     ZipFun = fun(Pos, {IoList, Checksum}) ->
         verify_checksum(Fd, Pos, iolist_to_binary(IoList), Checksum, false)
     end,
-    case ioq:call(Fd, {pread_iolists, PosList}, erlang:get(io_priority)) of
+    case pread_iolists(Fd, PosList) of
         {ok, DataAndChecksums} -> {ok, lists:zipwith(ZipFun, PosList, 
DataAndChecksums)};
         Error -> Error
     end.
 
+% Read the terms at PosList, choosing between the couch_file gen_server
+% (through the IOQ) and a direct parallel pread on a cfile handle.
+%
+pread_iolists(Fd, PosList) ->
+    IoqPriority = erlang:get(io_priority),
+    IoqMsg = {pread_iolists, PosList},
+    MaybeCFile = get_cfile(Fd),
+    SkipIoq = cfile_skip_ioq(),
+    case MaybeCFile of
+        undefined ->
+            % No cfile, that's fine, do what we always did
+            ioq:call(Fd, IoqMsg, IoqPriority);
+        #file{} = CFile ->
+            % With a cfile handle there are two ways to end up doing a
+            % parallel pread:
+            %
+            %  - The IOQ is skipped by configuration. Use this option on a
+            %    system with enough RAM for the page cache and plenty of IO
+            %    bandwidth.
+            %
+            %  - The IOQ would bypass this request anyway. All three
+            %    compatible ioqs (the two from the apache/couchdb-ioq, and the
+            %    source default one) currently do not know how to call a
+            %    function, they all expect to send a '$gen_call' message to a
+            %    gen_server. Until we figure out how to teach the IOQ(s) to
+            %    call an MFA, we let them keep calling the main file
+            %    descriptor gen_server.
+            case SkipIoq orelse ioq:bypass(IoqMsg, IoqPriority) of
+                true -> parallel_pread(CFile, PosList);
+                false -> ioq:call(Fd, IoqMsg, IoqPriority)
+            end
+    end.
+
+% Run pread/2 directly in the calling process. pread/2 may throw a
+% {stop, Error, _, _} tuple meant as an early return from the gen_server;
+% since the gen_server is bypassed here, unwrap it and return the Error term.
+%
+parallel_pread(#file{} = CFile, PosList) when is_list(PosList) ->
+    try pread(CFile, PosList) of
+        Result -> Result
+    catch
+        throw:{stop, Error, _, _} -> Error
+    end.
+
+% Read the [couchdb] cfile_skip_ioq boolean setting.
+%
+cfile_skip_ioq() ->
+    Default = ?CFILE_SKIP_IOQ_DEFAULT,
+    config:get_boolean("couchdb", "cfile_skip_ioq", Default).
+
 append_terms(Fd, Terms) ->
     append_terms(Fd, Terms, []).
 
@@ -217,9 +260,12 @@ append_binaries(Fd, Bins) when is_list(Bins) ->
 %%  or {error, Reason}.
 %%----------------------------------------------------------------------
 
-% length in bytes
+% length in bytes.
 bytes(Fd) ->
-    gen_server:call(Fd, bytes, infinity).
+    case get_cfile(Fd) of
+        undefined -> gen_server:call(Fd, bytes, infinity);
+        #file{} = CFile -> eof(CFile)
+    end.
 
 %%----------------------------------------------------------------------
 %% Purpose: Truncate a file to the number of bytes.
@@ -404,6 +450,7 @@ init_status_error(ReturnPid, Ref, Error) ->
 init({Filepath, Options, ReturnPid, Ref}) ->
     OpenOptions = file_open_options(Options),
     IsSys = lists:member(sys_db, Options),
+    File = #file{filepath = Filepath, is_sys = IsSys},
     case lists:member(create, Options) of
         true ->
             filelib:ensure_dir(Filepath),
@@ -424,7 +471,7 @@ init({Filepath, Options, ReturnPid, Ref}) ->
                                     ok = fsync(Fd),
                                     maybe_track_open_os_files(Options),
                                     erlang:send_after(?INITIAL_WAIT, self(), 
maybe_close),
-                                    {ok, #file{fd = Fd, is_sys = IsSys}};
+                                    {ok, dup(File#file{fd = Fd})};
                                 false ->
                                     ok = file:close(Fd),
                                     init_status_error(ReturnPid, Ref, {error, 
eexist})
@@ -432,7 +479,7 @@ init({Filepath, Options, ReturnPid, Ref}) ->
                         false ->
                             maybe_track_open_os_files(Options),
                             erlang:send_after(?INITIAL_WAIT, self(), 
maybe_close),
-                            {ok, #file{fd = Fd, is_sys = IsSys}}
+                            {ok, dup(File#file{fd = Fd})}
                     end;
                 Error ->
                     init_status_error(ReturnPid, Ref, Error)
@@ -449,7 +496,7 @@ init({Filepath, Options, ReturnPid, Ref}) ->
                             maybe_track_open_os_files(Options),
                             {ok, Eof} = file:position(Fd, eof),
                             erlang:send_after(?INITIAL_WAIT, self(), 
maybe_close),
-                            {ok, #file{fd = Fd, eof = Eof, is_sys = IsSys}};
+                            {ok, dup(File#file{fd = Fd, eof = Eof})};
                         Error ->
                             init_status_error(ReturnPid, Ref, Error)
                     end;
@@ -483,27 +530,9 @@ terminate(_Reason, #file{fd = Fd}) ->
 handle_call(close, _From, #file{fd = Fd} = File) ->
     {stop, normal, file:close(Fd), File#file{fd = nil}};
 handle_call({pread_iolists, PosL}, _From, File) ->
-    LocNums1 = [{Pos, 4} || Pos <- PosL],
-    DataSizes = read_multi_raw_iolists_int(File, LocNums1),
-    MapFun = fun({LenIoList, NextPos}) ->
-        case iolist_to_binary(LenIoList) of
-            % a checksum-prefixed term
-            <<1:1/integer, Len:31/integer>> -> {NextPos, Len + 16};
-            <<0:1/integer, Len:31/integer>> -> {NextPos, Len}
-        end
-    end,
-    LocNums2 = lists:map(MapFun, DataSizes),
-    Resps = read_multi_raw_iolists_int(File, LocNums2),
-    ZipFun = fun({LenIoList, _}, {FullIoList, _}) ->
-        case iolist_to_binary(LenIoList) of
-            <<1:1/integer, _:31/integer>> -> extract_checksum(FullIoList);
-            <<0:1/integer, _:31/integer>> -> {FullIoList, <<>>}
-        end
-    end,
-    Extracted = lists:zipwith(ZipFun, DataSizes, Resps),
-    {reply, {ok, Extracted}, File};
-handle_call(bytes, _From, #file{fd = Fd} = File) ->
-    {reply, file:position(Fd, eof), File};
+    {reply, pread(File, PosL), File};
+handle_call(bytes, _From, #file{} = File) ->
+    {reply, eof(File), File};
 handle_call({set_db_pid, Pid}, _From, #file{db_monitor = OldRef} = File) ->
     case is_reference(OldRef) of
         true -> demonitor(OldRef, [flush]);
@@ -586,6 +615,30 @@ format_status(_Opt, [PDict, #file{} = File]) ->
     {_Fd, FilePath} = couch_util:get_value(couch_file_fd, PDict),
     [{data, [{"State", File}, {"InitialFilePath", FilePath}]}].
 
+% Ask the underlying file handle for its end-of-file position.
+%
+eof(#file{} = File) ->
+    file:position(File#file.fd, eof).
+
+% Read the terms stored at each position in PosL and return
+% {ok, [{IoList, Checksum}]}, where Checksum is <<>> for terms stored without
+% one.
+%
+% This takes two rounds of raw reads: first the 4 byte prefix of each term
+% (1 flag bit + 31 bit length), then the term bodies themselves.
+%
+pread(#file{} = File, PosL) ->
+    PrefixLocs = [{Pos, 4} || Pos <- PosL],
+    Prefixes = read_multi_raw_iolists_int(File, PrefixLocs),
+    BodyLoc = fun({PrefixIoList, BodyPos}) ->
+        case iolist_to_binary(PrefixIoList) of
+            <<0:1/integer, Len:31/integer>> -> {BodyPos, Len};
+            % flag bit set: a checksum-prefixed term, read 16 bytes more
+            <<1:1/integer, Len:31/integer>> -> {BodyPos, Len + 16}
+        end
+    end,
+    BodyLocs = [BodyLoc(Prefix) || Prefix <- Prefixes],
+    Bodies = read_multi_raw_iolists_int(File, BodyLocs),
+    SplitChecksum = fun({PrefixIoList, _}, {BodyIoList, _}) ->
+        case iolist_to_binary(PrefixIoList) of
+            <<0:1/integer, _:31/integer>> -> {BodyIoList, <<>>};
+            <<1:1/integer, _:31/integer>> -> extract_checksum(BodyIoList)
+        end
+    end,
+    {ok, lists:zipwith(SplitChecksum, Prefixes, Bodies)}.
+
 fsync(Fd) ->
     T0 = erlang:monotonic_time(),
     % We do not rely on mtime/atime for our safety/consitency so we can use
@@ -694,7 +747,7 @@ read_multi_raw_iolists_int(#file{fd = Fd, eof = Eof} = 
File, PosLens) ->
             false ->
                 couch_stats:increment_counter([pread, exceed_eof]),
                 {ok, CurEof} = file:position(File#file.fd, eof),
-                {_Fd, Filepath} = get(couch_file_fd),
+                Filepath = File#file.filepath,
                 throw_stop({read_beyond_eof, Filepath, Pos, TotalBytes, Eof, 
CurEof}, File)
         end
     end,
@@ -702,7 +755,7 @@ read_multi_raw_iolists_int(#file{fd = Fd, eof = Eof} = 
File, PosLens) ->
         {ok, Bins} ->
             lists:zipwith(ZipFun, LocNums, Bins);
         {error, Error} ->
-            {_Fd, Filepath} = get(couch_file_fd),
+            Filepath = File#file.filepath,
             throw_stop({pread, Filepath, Error, hd(LocNums)}, File)
     end.
 
@@ -713,7 +766,7 @@ get_pread_locnum(#file{eof = Eof} = File, Pos, Len) ->
         Size when Size > Eof ->
             couch_stats:increment_counter([pread, exceed_eof]),
             {ok, CurEof} = file:position(File#file.fd, eof),
-            {_Fd, Filepath} = get(couch_file_fd),
+            Filepath = File#file.filepath,
             throw_stop({read_beyond_eof, Filepath, Pos, TotalBytes, Eof, 
CurEof}, File);
         _ ->
             {Pos, TotalBytes}
@@ -884,6 +937,47 @@ generate_xxhash_checksums() ->
     Default = ?WRITE_XXHASH_CHECKSUMS_DEFAULT,
     config:get_boolean("couchdb", "write_xxhash_checksums", Default).
 
+% couch_cfile handling
+%
+
+% Return the cfile #file{} record belonging to the couch_file process Pid, or
+% undefined if there is none. The record is cached in the caller's process
+% dictionary, keyed by Pid; the cache entry is erased once Pid is dead.
+%
+get_cfile(Pid) when is_pid(Pid) ->
+    case is_process_alive(Pid) of
+        false ->
+            erase(?CFILE_HANDLE),
+            undefined;
+        true ->
+            case get(?CFILE_HANDLE) of
+                {Pid, #file{} = Cached} ->
+                    Cached;
+                _ ->
+                    % Cache miss: fetch from the other process' dictionary.
+                    % Maybe use an ets table as this is still a signal
+                    % send/recv
+                    case couch_util:process_dict_get(Pid, ?CFILE_HANDLE) of
+                        undefined ->
+                            undefined;
+                        #file{} = Fetched ->
+                            put(?CFILE_HANDLE, {Pid, Fetched}),
+                            Fetched
+                    end
+            end
+    end.
+
+% Swap the raw fd in File for a couch_cfile handle when the [couchdb]
+% use_cfile setting allows it and the dup succeeds. In every other case File
+% is returned unchanged.
+%
+dup(#file{fd = RawFd, filepath = Filepath} = File) ->
+    UseCFile = config:get_boolean("couchdb", "use_cfile", ?USE_CFILE_DEFAULT),
+    case UseCFile of
+        false ->
+            File;
+        true ->
+            case couch_cfile:dup(RawFd) of
+                {error, _} ->
+                    File;
+                {ok, CFd} ->
+                    % Successfully opened a couch_cfile handle, close
+                    % the original one to free the fd
+                    ok = file:close(RawFd),
+                    put(couch_file_fd, {CFd, Filepath}),
+                    % The copy kept in the process dictionary uses an
+                    % effective infinity for the eof max limit for now
+                    put(?CFILE_HANDLE, File#file{fd = CFd, eof = 1 bsl 60}),
+                    File#file{fd = CFd}
+            end
+    end.
+
 -ifdef(TEST).
 -include_lib("couch/include/couch_eunit.hrl").
 
diff --git a/src/couch/test/eunit/couch_cfile_prop_tests.erl 
b/src/couch/test/eunit/couch_cfile_prop_tests.erl
new file mode 100644
index 000000000..a2d954e5c
--- /dev/null
+++ b/src/couch/test/eunit/couch_cfile_prop_tests.erl
@@ -0,0 +1,156 @@
+% Licensed under the Apache License, Version 2.0 (the "License"); you may not
+% use this file except in compliance with the License. You may obtain a copy of
+% the License at
+%
+%   http://www.apache.org/licenses/LICENSE-2.0
+%
+% Unless required by applicable law or agreed to in writing, software
+% distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
+% WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
+% License for the specific language governing permissions and limitations under
+% the License.
+
+-module(couch_cfile_prop_tests).
+
+-ifdef(WITH_PROPER).
+
+-include_lib("couch/include/couch_eunit_proper.hrl").
+-include_lib("couch/include/couch_eunit.hrl").
+-include_lib("kernel/include/file.hrl").
+
+% EUnit generator that runs the properties in this module through the
+% ?EUNIT_QUICKCHECK macro.
+% NOTE(review): the two arguments are presumably a timeout and a number of
+% test runs - confirm against couch_eunit_proper.hrl.
+property_test_() ->
+    ?EUNIT_QUICKCHECK(60, 25000).
+
+-define(SIZE_LIMIT, 5000).
+-define(CFILE_FD, cfile_fd).
+-define(RAW_FD, raw_fd).
+-define(RAW_PATH, raw_path).
+-define(CFILE_PATH, cfile_path).
+
+% Check that any combination of file operations yields the same results on
+% the raw handle and on the cfile handle.
+%
+% Both handles, and the paths they were opened from, are kept in the process
+% dictionary so that apply_op/3 can replace them when a `reopen` is generated.
+%
+prop_file_ops_results_match_raw_file() ->
+    case os:type() of
+        {win32, _} ->
+            % Dummy test that passes on Windows. We want to return a valid
+            % property to avoid upsetting PropEr.
+            ?FORALL(N, number(), is_number(N));
+        {_, _} ->
+            % Setup is a bit awkward but this is the PropEr pattern (pun
+            % intended). The general idea is that ?SETUP takes a function
+            % which returns another, teardown, function. ?SETUPs can be
+            % nested: instead of the property as the second argument, use
+            % another ?SETUP and so on.
+            ?SETUP(
+                % Outer setup: the raw (prim_file) handle and its file
+                fun() ->
+                    Path = ?tempfile(),
+                    put(?RAW_FD, open_raw(Path)),
+                    put(?RAW_PATH, Path),
+                    fun() ->
+                        ok = file:close(get(?RAW_FD)),
+                        ok = file:delete(Path),
+                        erase(?RAW_FD),
+                        erase(?RAW_PATH),
+                        ok
+                    end
+                end,
+                ?SETUP(
+                    % Inner setup: the cfile handle, on its own separate file
+                    fun() ->
+                        Path = ?tempfile(),
+                        put(?CFILE_FD, open_cfile(Path)),
+                        put(?CFILE_PATH, Path),
+                        fun() ->
+                            ok = file:close(get(?CFILE_FD)),
+                            ok = file:delete(Path),
+                            erase(?CFILE_FD),
+                            erase(?CFILE_PATH),
+                            ok
+                        end
+                    end,
+                    ?FORALL(
+                        {Cmd, Args},
+                        g_file_ops(),
+                        begin
+                            % Apply the same operations to the raw file handle
+                            % and to the cfile one, and assert that they return
+                            % the same results.
+                            RawResult = apply_op(get(?RAW_FD), Cmd, Args),
+                            CFileResult = apply_op(get(?CFILE_FD), Cmd, Args),
+                            RawResult == CFileResult
+                        end
+                    )
+                )
+            )
+    end.
+
+% Open Path as a raw binary file in append + read mode and return the handle.
+% Crashes (badmatch) if the file cannot be opened.
+%
+open_raw(Path) ->
+    OpenModes = [append, read, binary, raw],
+    {ok, RawFd} = file:open(Path, OpenModes),
+    RawFd.
+
+% Open Path as a raw file, dup it into a cfile handle, close the raw handle
+% and return the cfile one.
+%
+open_cfile(Path) ->
+    RawFd = open_raw(Path),
+    {ok, CFd} = couch_cfile:dup(RawFd),
+    ok = file:close(RawFd),
+    CFd.
+
+% Apply one generated operation to a file handle and return its result.
+%
+%  - reopen: close the handle, open a fresh one of the same flavour from the
+%    saved path, store it in the process dictionary and read the whole file
+%    back.
+%  - truncate_pos: position + truncate back to back.
+%  - anything else is called on the `file` module with the handle prepended
+%    to the arguments.
+%
+apply_op(#file_descriptor{module = prim_file} = Fd, reopen, []) ->
+    ok = file:close(Fd),
+    NewFd = open_raw(get(?RAW_PATH)),
+    put(?RAW_FD, NewFd),
+    read_whole_file(NewFd);
+apply_op(#file_descriptor{module = couch_cfile} = Fd, reopen, []) ->
+    ok = file:close(Fd),
+    NewFd = open_cfile(get(?CFILE_PATH)),
+    put(?CFILE_FD, NewFd),
+    read_whole_file(NewFd);
+apply_op(Fd, truncate_pos, [Pos]) ->
+    % Position + truncate immediately after like in couch_file, otherwise
+    % position will be reset to the end of the file on next write, and we
+    % might not test this combination as often
+    PositionResult = file:position(Fd, Pos),
+    TruncateResult = file:truncate(Fd),
+    {PositionResult, TruncateResult};
+apply_op(Fd, Cmd, Args) ->
+    erlang:apply(file, Cmd, [Fd | Args]).
+
+% Read everything from the start of the file up to its current eof.
+%
+read_whole_file(Fd) ->
+    {ok, Len} = file:position(Fd, eof),
+    file:pread(Fd, 0, Len).
+
+% Generator for a single {Cmd, Args} file operation. The integer paired with
+% each sub-generator is passed to frequency/1 (presumably PropEr's weighted
+% choice, so writes and truncate_pos would be the most common operations).
+g_file_ops() ->
+    frequency([
+        {15, g_write()},
+        {15, g_truncate_pos()},
+        {10, g_pread()},
+        {5, g_datasync()},
+        {3, g_truncate()},
+        {3, g_position()},
+        {2, g_reopen()}
+    ]).
+
+% Individual operation generators. Each returns {Cmd, Args}, where Args is
+% the argument list that follows the file handle (see apply_op/3).
+
+% Close and re-open the handle; takes no arguments.
+g_reopen() ->
+    {reopen, []}.
+
+% pread at a position and with a length, each in -1..?SIZE_LIMIT.
+% NOTE(review): -1 is presumably included to exercise invalid arguments.
+g_pread() ->
+    {pread, [range(-1, ?SIZE_LIMIT), range(-1, ?SIZE_LIMIT)]}.
+
+% position to an absolute offset in 0..?SIZE_LIMIT or, less often, to eof.
+g_position() ->
+    {position, [
+        frequency([
+            {5, range(0, ?SIZE_LIMIT)},
+            {1, eof}
+        ])
+    ]}.
+
+% write a generated binary.
+g_write() ->
+    {write, [binary()]}.
+
+% datasync; takes no arguments.
+g_datasync() ->
+    {datasync, []}.
+
+% truncate at the current position; takes no arguments.
+g_truncate() ->
+    {truncate, []}.
+
+% position to an offset in 0..?SIZE_LIMIT and truncate right after.
+g_truncate_pos() ->
+    {truncate_pos, [range(0, ?SIZE_LIMIT)]}.
+
+-endif.
diff --git a/src/couch/test/eunit/couch_cfile_tests.erl 
b/src/couch/test/eunit/couch_cfile_tests.erl
new file mode 100644
index 000000000..87c15310c
--- /dev/null
+++ b/src/couch/test/eunit/couch_cfile_tests.erl
@@ -0,0 +1,534 @@
+% Licensed under the Apache License, Version 2.0 (the "License"); you may not
+% use this file except in compliance with the License. You may obtain a copy of
+% the License at
+%
+%   http://www.apache.org/licenses/LICENSE-2.0
+%
+% Unless required by applicable law or agreed to in writing, software
+% distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
+% WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
+% License for the specific language governing permissions and limitations under
+% the License.
+
+-module(couch_cfile_tests).
+
+-include_lib("couch/include/couch_eunit.hrl").
+
+-define(CONCURRENT_READER_JITTER_MSEC, 5).
+
+% EUnit generator. Every test runs inside its own setup/0 + teardown/1
+% fixture. On Windows only the "unsupported" check is run; on every other OS
+% type the full list of tests is used.
+couch_cfile_test_() ->
+    Tests =
+        case os:type() of
+            {win32, _} ->
+                [?TDEF_FE(t_unsupported)];
+            {_, _} ->
+                [
+                    ?TDEF_FE(t_basics),
+                    ?TDEF_FE(t_pread_and_position),
+                    ?TDEF_FE(t_pread_from_other_procesess),
+                    ?TDEF_FE(t_write),
+                    ?TDEF_FE(t_datasync),
+                    ?TDEF_FE(t_position_and_truncate),
+                    ?TDEF_FE(t_advise),
+                    ?TDEF_FE(t_invalid_fd),
+                    ?TDEF_FE(t_fd),
+                    ?TDEF_FE(t_cannot_dup_cfile_handle),
+                    ?TDEF_FE(t_gc_is_closing_file_handles),
+                    ?TDEF_FE(t_monitor_is_closing_file_handles),
+                    ?TDEF_FE(t_janitor_proc_is_up),
+                    ?TDEF_FE(t_concurrent_reads_512b),
+                    ?TDEF_FE(t_concurrent_reads_4kb),
+                    ?TDEF_FE(t_concurrent_reads_1mb)
+                ]
+        end,
+    {foreach, fun setup/0, fun teardown/1, Tests}.
+
+% Fixture setup: the value of ?tempfile() is what every test (and teardown/1)
+% receives as its argument.
+setup() ->
+    Path = ?tempfile(),
+    Path.
+
+% Fixture cleanup: delete the file at Path. `catch` turns any exception
+% raised by the delete into a plain return value instead of crashing the
+% teardown.
+teardown(Path) ->
+    catch file:delete(Path).
+
+% Open Path as a raw file, append the four bytes "abcd" to it and return the
+% handle.
+open_raw(Path) ->
+    % Same open modes as couch_file uses
+    Modes = [binary, append, raw, read],
+    {ok, RawFd} = file:open(Path, Modes),
+    ok = file:write(RawFd, <<"abcd">>),
+    RawFd.
+
+% Smoke test for the core couch_cfile API: dup a raw handle, pread through
+% the dup-ed handle (both pread/3 and pread/2), fetch the integer fd, then
+% close the handle and check that further reads are rejected.
+t_basics(Path) ->
+    Fd = open_raw(Path),
+    Res = couch_cfile:dup(Fd),
+    ?assertMatch({ok, _}, Res),
+    {ok, CFd} = Res,
+    ?assertEqual({ok, <<"ab">>}, couch_cfile:pread(CFd, 0, 2)),
+    % Assert the close result, as the other tests do, so a failure to close
+    % the raw handle is not silently ignored
+    ok = file:close(Fd),
+    % Check dup-ing a closed raw file descriptor
+    ?assertEqual({error, einval}, couch_cfile:dup(Fd)),
+    % The dup-ed handle is still readable after the raw one was closed
+    ?assertEqual({ok, [<<"ab">>]}, couch_cfile:pread(CFd, [{0, 2}])),
+    ?assertMatch({ok, Int} when is_integer(Int), couch_cfile:fd(CFd)),
+    ?assertEqual(ok, couch_cfile:close(CFd)),
+    ?assertEqual({error, einval}, couch_cfile:pread(CFd, [{0, 2}])).
+
+% Compare pread and position results between a raw handle and the cfile
+% handle dup-ed from it: on the initial contents, after a write, after
+% truncation through either handle, and after each handle is closed.
+t_pread_and_position(Path) ->
+    % Note: we'll be reading using 'file' module functions even for cfile
+    % handles. We're specifically acting like an OTP file layer to ensure we
+    % don't have to duplicate our couch_file code
+
+    Fd = open_raw(Path),
+    {ok, CFd} = couch_cfile:dup(Fd),
+
+    % Check positions
+    {ok, Eof} = file:position(Fd, eof),
+    {ok, CFdEof} = file:position(CFd, eof),
+    ?assertEqual(Eof, CFdEof),
+
+    % Basic preads
+    ?assertEqual({ok, []}, file:pread(CFd, [])),
+    ?assertEqual({ok, [eof]}, file:pread(CFd, [{0, 0}])),
+    ?assertEqual(eof, file:pread(CFd, 0, 0)),
+    ?assertEqual({ok, [<<"a">>]}, file:pread(CFd, [{0, 1}])),
+    ?assertEqual({ok, <<"a">>}, file:pread(CFd, 0, 1)),
+    ?assertEqual({ok, <<"d">>}, file:pread(CFd, 3, 1)),
+    ?assertEqual({ok, <<"d">>}, file:pread(CFd, 3, 2)),
+    ?assertEqual({ok, [<<"a">>, eof]}, file:pread(CFd, [{0, 1}, {4, 1}])),
+    ?assertEqual({error, einval}, couch_cfile:pread(junk, [{0, 1}])),
+    ?assertEqual({error, badarg}, file:pread(CFd, junk)),
+
+    % Most of all we care that the cfile handle gives the same result as the
+    % raw handle does through file:pread/3 for any combination of
+    % valid/invalid/eof ranges
+    [
+        ?assertEqual(file:pread(Fd, P, L), file:pread(CFd, P, L))
+     || P <- lists:seq(-1, Eof + 1) ++ [1 bsl 42],
+        L <- lists:seq(-1, Eof + 1) ++ [1 bsl 42]
+    ],
+
+    % Positions and preads are updated after a write
+    ok = file:write(Fd, <<"ef">>),
+
+    % Check new positions
+    {ok, Eof1} = file:position(Fd, eof),
+    {ok, CFdEof1} = file:position(CFd, eof),
+    ?assertEqual(Eof1, CFdEof1),
+
+    [
+        ?assertEqual(file:pread(Fd, P, L), file:pread(CFd, P, L))
+     || P <- lists:seq(-1, Eof1 + 1), L <- lists:seq(-1, Eof1 + 1)
+    ],
+
+    % File truncation also is reflected in position and preads
+    {ok, 4} = file:position(Fd, 4),
+    ok = file:truncate(Fd),
+
+    ?assertEqual({ok, 4}, file:position(Fd, eof)),
+    ?assertEqual({ok, 4}, file:position(CFd, eof)),
+
+    {ok, 3} = file:position(Fd, 3),
+    {ok, 3} = file:position(CFd, 3),
+
+    ok = file:truncate(CFd),
+
+    ?assertEqual({ok, 3}, file:position(Fd, eof)),
+    ?assertEqual({ok, 3}, file:position(CFd, eof)),
+
+    % Same comparison through the list form, file:pread/2
+    [
+        ?assertEqual(file:pread(Fd, [{P, L}]), file:pread(CFd, [{P, L}]))
+     || P <- lists:seq(-1, 4), L <- lists:seq(-1, 4)
+    ],
+
+    % Test closing behavior
+    ok = file:close(Fd),
+    ?assertEqual({error, einval}, file:pread(Fd, 0, 1)),
+    ?assertEqual({error, einval}, file:position(Fd, eof)),
+
+    % Can still read from our dup-ed handle
+    ?assertEqual({ok, <<"a">>}, file:pread(CFd, 0, 1)),
+    ?assertEqual({ok, 3}, file:position(CFd, eof)),
+
+    ok = file:close(CFd),
+    ?assertEqual({error, einval}, file:pread(CFd, 0, 1)),
+    ?assertEqual({error, einval}, file:position(CFd, eof)).
+
+% A cfile handle can be read (position eof, pread, fd) from a process other
+% than the one that created it, stays usable after the original raw handle
+% is closed, and is rejected everywhere once it has been closed itself.
+t_pread_from_other_procesess(Path) ->
+    % Note: we'll be reading using 'file' module functions even for cfile
+    % handles. We're specifically acting like an OTP file layer to ensure we
+    % don't have to duplicate our couch_file code
+
+    Fd = open_raw(Path),
+    {ok, CFd} = couch_cfile:dup(Fd),
+
+    Proc = spawn_proc(),
+
+    {ok, Eof} = file:position(Fd, eof),
+
+    ?assertEqual({ok, Eof}, file:position(CFd, eof)),
+    ?assertEqual({ok, Eof}, proc_run(Proc, file, position, [CFd, eof])),
+
+    % Closing original raw fd should still keep ours open and we should still
+    % be able to read from it from this or other processes
+    ok = file:close(Fd),
+
+    ?assertEqual({ok, [<<"a">>]}, file:pread(CFd, [{0, 1}])),
+    ?assertEqual({ok, [<<"a">>]}, proc_run(Proc, file, pread, [CFd, [{0, 1}]])),
+
+    % Fd works from other process, just for completeness
+    {ok, FdInt} = couch_cfile:fd(CFd),
+    ?assertEqual({ok, FdInt}, proc_run(Proc, couch_cfile, fd, [CFd])),
+
+    ok = file:close(CFd),
+    ?assertEqual({error, einval}, file:pread(CFd, [{0, 1}])),
+    ?assertEqual({error, einval}, proc_run(Proc, file, pread, [CFd, [{0, 1}]])),
+
+    kill_proc(Proc).
+
+% datasync through a cfile handle: it succeeds before and after small and
+% large writes and when repeated, raises not_on_controlling_process when
+% called from another process, and returns einval once the handle is closed.
+t_datasync(Path) ->
+    Fd = open_raw(Path),
+    {ok, CFd} = couch_cfile:dup(Fd),
+    ok = file:close(Fd),
+
+    {ok, Pos} = file:position(CFd, eof),
+    ?assertEqual(ok, file:datasync(CFd)),
+    ?assertEqual(ok, file:write(CFd, <<"x">>)),
+    ?assertEqual(ok, file:datasync(CFd)),
+    {ok, Pos1} = file:position(CFd, eof),
+    ?assertEqual(Pos + 1, Pos1),
+    ?assertEqual({ok, <<"x">>}, file:pread(CFd, Pos, 1)),
+
+    % Try something larger. binary:copy/2 builds the 2MB payload directly
+    % rather than via a 2M element list and a binary comprehension
+    TwoMBs = binary:copy(<<"y">>, 1 bsl 21),
+    ?assertEqual(ok, file:write(CFd, TwoMBs)),
+    ?assertEqual(ok, file:datasync(CFd)),
+    {ok, Pos2} = file:position(CFd, eof),
+    ?assertEqual(Pos1 + (1 bsl 21), Pos2),
+
+    % 10 in a row
+    lists:foreach(
+        fun(_) ->
+            ?assertEqual(ok, file:datasync(CFd))
+        end,
+        lists:seq(1, 10)
+    ),
+
+    % Others can't datasync
+    Proc = spawn_proc(),
+    Expect = {exc, error, not_on_controlling_process},
+    ?assertEqual(Expect, proc_run(Proc, file, datasync, [CFd])),
+    kill_proc(Proc),
+
+    % Can't datasync after closing
+    ok = file:close(CFd),
+    ?assertEqual({error, einval}, file:datasync(CFd)).
+
+% Writes through a cfile handle: a small and a 2MB write both advance eof by
+% the written size and read back intact, writing from another process raises
+% not_on_controlling_process, and writing after close returns einval.
+t_write(Path) ->
+    Fd = open_raw(Path),
+    {ok, CFd} = couch_cfile:dup(Fd),
+    ok = file:close(Fd),
+
+    {ok, Pos} = file:position(CFd, eof),
+
+    ?assertEqual(ok, file:write(CFd, <<"x">>)),
+    {ok, Pos1} = file:position(CFd, eof),
+    ?assertEqual(Pos + 1, Pos1),
+    ?assertEqual({ok, <<"x">>}, file:pread(CFd, Pos, 1)),
+    % binary:copy/2 builds the 2MB payload directly rather than via a 2M
+    % element list and a binary comprehension
+    TwoMBs = binary:copy(<<"y">>, 1 bsl 21),
+    ?assertEqual(ok, file:write(CFd, TwoMBs)),
+    {ok, Pos2} = file:position(CFd, eof),
+    ?assertEqual(Pos1 + (1 bsl 21), Pos2),
+
+    {ok, ReadTwoMBs} = file:pread(CFd, Pos1, 1 bsl 21),
+    % Size is compared first so a short read gives a compact failure message
+    ?assertEqual(byte_size(TwoMBs), byte_size(ReadTwoMBs)),
+    ?assertEqual(TwoMBs, ReadTwoMBs),
+
+    % Others can't write
+    Proc = spawn_proc(),
+    Expect = {exc, error, not_on_controlling_process},
+    ?assertEqual(Expect, proc_run(Proc, file, write, [CFd, <<"y">>])),
+    kill_proc(Proc),
+
+    % Can't write after closing
+    ?assertEqual(ok, file:close(CFd)),
+    ?assertEqual({error, einval}, file:write(CFd, <<"z">>)).
+
+% Absolute positioning and truncation through a cfile handle. The owning
+% process can position and truncate. Another process can only query the size
+% via position(Fd, eof). After close both calls return einval.
+t_position_and_truncate(Path) ->
+    Fd = open_raw(Path),
+    {ok, CFd} = couch_cfile:dup(Fd),
+    ok = file:close(Fd),
+
+    {ok, Pos} = file:position(CFd, eof),
+    ?assert(Pos > 0),
+    ?assertEqual({ok, 0}, file:position(CFd, 0)),
+    ?assertEqual(ok, file:truncate(CFd)),
+    {ok, Pos1} = file:position(CFd, eof),
+    ?assertEqual(eof, file:pread(CFd, 0, 1)),
+    % Expected value goes first so a failure is reported the right way round
+    ?assertEqual(0, Pos1),
+
+    ok = file:write(CFd, <<"abc">>),
+    ?assertEqual({ok, 1}, file:position(CFd, 1)),
+    ?assertEqual(ok, file:truncate(CFd)),
+    ?assertEqual({ok, <<"a">>}, file:pread(CFd, 0, 10)),
+
+    Proc = spawn_proc(),
+
+    % Others can't do absolute position changes or truncate
+    Expect = {exc, error, not_on_controlling_process},
+    ?assertEqual(Expect, proc_run(Proc, file, position, [CFd, 1])),
+    ?assertEqual(Expect, proc_run(Proc, file, truncate, [CFd])),
+
+    % Others can call position(Fd, eof) to get the file size
+    ?assertEqual({ok, 1}, proc_run(Proc, file, position, [CFd, eof])),
+
+    kill_proc(Proc),
+
+    % After closing, can't truncate or position
+    ok = file:close(CFd),
+    ?assertEqual({error, einval}, file:position(CFd, 42)),
+    ?assertEqual({error, einval}, file:truncate(CFd)).
+
+% file:advise/4 on a cfile handle returns ok for the owning process and
+% raises not_on_controlling_process when called from another process.
+t_advise(Path) ->
+    % This one is optional, so it is not implemented as a NIF;
+    % we just check that it behaves reasonably
+    Fd = open_raw(Path),
+    {ok, CFd} = couch_cfile:dup(Fd),
+    ok = file:close(Fd),
+
+    ?assertEqual(ok, file:advise(CFd, 42, 42, dont_need)),
+
+    % Others can't call it
+    Proc = spawn_proc(),
+    Expect = {exc, error, not_on_controlling_process},
+    Args = [CFd, 42, 42, dont_need],
+    ?assertEqual(Expect, proc_run(Proc, file, advise, Args)),
+    kill_proc(Proc),
+
+    % Close the handle explicitly, like the other tests, instead of leaving
+    % it open when the test returns
+    ok = file:close(CFd).
+
+% Every couch_cfile entry point rejects a term that is not a file handle
+% with {error, einval}.
+t_invalid_fd(_Path) ->
+    Einval = {error, einval},
+    ?assertEqual(Einval, couch_cfile:dup(junk)),
+    ?assertEqual(Einval, couch_cfile:pread(junk, 1, 1)),
+    ?assertEqual(Einval, couch_cfile:close(junk)),
+    ?assertEqual(Einval, couch_cfile:fd(junk)),
+    ?assertEqual(Einval, couch_cfile:position(junk, eof)).
+
+% couch_cfile:fd/1 returns a non-negative integer for both the raw handle
+% and the handle dup-ed from it, the two integers differ while both handles
+% are open, and the call returns einval for junk and for closed handles.
+t_fd(Path) ->
+    Fd = open_raw(Path),
+    {ok, CFd} = couch_cfile:dup(Fd),
+
+    {ok, FdInt} = couch_cfile:fd(Fd),
+    {ok, CFdInt} = couch_cfile:fd(CFd),
+    ?assert(is_integer(FdInt) andalso FdInt > -1),
+    ?assert(is_integer(CFdInt) andalso CFdInt > -1),
+
+    ?assertEqual({error, einval}, couch_cfile:fd(potato)),
+
+    % We can't say a whole lot more just that both are
+    % not equal since they are both open and one is dup-ed
+    % from the other
+    ?assertNotEqual(FdInt, CFdInt),
+
+    ok = file:close(Fd),
+    ok = file:close(CFd),
+
+    % Here we check our sanity-checker: after handles are closed we cannot get
+    % any access to them. In the sanity checker we access the int fds after
+    % dup-ing in order to assert that we still have access to the same file
+    % handles we started with.
+    ?assertEqual({error, einval}, couch_cfile:fd(Fd)),
+    ?assertEqual({error, einval}, couch_cfile:fd(CFd)).
+
+% After a dup/close cycle the process registered as couch_cfile must exist
+% and be alive.
+t_janitor_proc_is_up(Path) ->
+    Fd = open_raw(Path),
+    {ok, CFd} = couch_cfile:dup(Fd),
+    ok = couch_cfile:close(CFd),
+    ok = file:close(Fd),
+    Janitor = whereis(couch_cfile),
+    % Check registration first: is_process_alive(undefined) would raise a
+    % bare badarg instead of a readable assertion failure
+    ?assert(is_pid(Janitor)),
+    ?assertEqual(true, is_process_alive(Janitor)).
+
+% Run only on Windows (see couch_cfile_test_/0). The argument is the fixture
+% value from setup/0, not an open handle, and every couch_cfile call on it
+% is expected to return {error, einval}.
+t_unsupported(Path) ->
+    Einval = {error, einval},
+    ?assertEqual(Einval, couch_cfile:dup(Path)),
+    ?assertEqual(Einval, couch_cfile:pread(Path, 1, 1)),
+    ?assertEqual(Einval, couch_cfile:close(Path)),
+    ?assertEqual(Einval, couch_cfile:fd(Path)),
+    ?assertEqual(Einval, couch_cfile:position(Path, eof)).
+
+% Only raw handles can be dup-ed: dup-ing a handle that itself came from
+% couch_cfile:dup/1 returns {error, einval}.
+t_cannot_dup_cfile_handle(Path) ->
+    RawFd = open_raw(Path),
+    {ok, CFd} = couch_cfile:dup(RawFd),
+    ok = file:close(RawFd),
+    ?assertEqual({error, einval}, couch_cfile:dup(CFd)).
+
+% Dup a raw handle Cnt times inside a short-lived process without closing
+% any of the dups, wait for that process to exit, then use the integer fd of
+% one more dup to check that those descriptors did not stay allocated.
+t_gc_is_closing_file_handles(Path) ->
+    Fd = open_raw(Path),
+    {ok, FdInt} = couch_cfile:fd(Fd),
+    % Since we'll be checking the janitor, send it some junk message
+    % it should cope with them by dropping them (like the OTP one)
+    whereis(couch_cfile) ! {some_junk, message},
+    Cnt = 750,
+    {_, Ref} = spawn_monitor(fun() ->
+        Fd1 = open_raw(Path),
+        lists:foreach(fun(_) -> couch_cfile:dup(Fd1) end, lists:seq(1, Cnt))
+    end),
+    receive
+        {'DOWN', Ref, _, _, _} -> ok
+    end,
+    % According to the dup() docs:
+    %
+    % "The new file descriptor number is guaranteed to be the lowest-numbered
+    % file descriptor that was unused in the calling process."
+    %
+    % Unless during the test something else opened another Cnt descriptors, if
+    % we open another one we should get something no higher than FdInt + Cnt
+    {ok, Fd2} = couch_cfile:dup(Fd),
+    {ok, FdInt1} = couch_cfile:fd(Fd2),
+    ?assert(FdInt1 =< FdInt + Cnt),
+    ok = file:close(Fd),
+    ok = file:close(Fd2).
+
+% A raw handle opened by a helper process cannot be dup-ed from this process
+% (not_on_controlling_process), but the helper can dup it. The resulting
+% cfile handle is readable from here until the helper is killed, after which
+% reads return einval.
+t_monitor_is_closing_file_handles(Path) ->
+    Proc = spawn_proc(),
+    {ok, Fd} = proc_run(Proc, file, open, [Path, [binary, append, raw, read]]),
+    ?assertError(not_on_controlling_process, couch_cfile:dup(Fd)),
+    {ok, CFd} = proc_run(Proc, couch_cfile, dup, [Fd]),
+    % Nothing has been written to the file here, so a read at 0 yields eof
+    ?assertEqual(eof, file:pread(CFd, 0, 1)),
+    kill_proc(Proc),
+    ?assertEqual({error, einval}, file:pread(CFd, 0, 1)).
+
+% Concurrent read tests. Each one differs only in the file size, the number
+% of reader processes and the number of reads that must be exceeded.
+
+t_concurrent_reads_512b(Path) ->
+    concurrent_reads(Path, 512, 20, 1000).
+
+t_concurrent_reads_4kb(Path) ->
+    concurrent_reads(Path, 4096, 10, 100).
+
+t_concurrent_reads_1mb(Path) ->
+    concurrent_reads(Path, 1048576, 2, 10).
+
+% Shared body of the concurrent read tests: write Size bytes of the known
+% pattern to a fresh cfile handle, let Readers processes pread and verify
+% random ranges for one second, stop them and assert that together they
+% completed more than MinCount reads.
+concurrent_reads(Path, Size, Readers, MinCount) ->
+    Fd = cfile(Path),
+    Eof = write(Fd, 0, Size),
+    ReadersPidRefs = spawn_readers(Readers, Fd, Eof),
+    % The sleep is the measurement window during which the readers run
+    timer:sleep(1000),
+    [Pid ! stop_reading || {Pid, _} <- ReadersPidRefs],
+    Count = gather_read_results(ReadersPidRefs, 0),
+    ?assert(is_integer(Count) andalso Count > MinCount).
+
+% Start a monitored helper process running proc_loop/0. Returns the
+% {Pid, MonitorRef} pair that proc_run/4 and kill_proc/1 take.
+spawn_proc() ->
+    spawn_monitor(fun proc_loop/0).
+
+% Ask the helper process to evaluate apply(M, F, A) and return what it sends
+% back: the call's result, or {exc, Class, Reason} if the call raised.
+%
+% If the helper is dead (or dies before replying) its monitor 'DOWN' message
+% is used to fail with error({proc_died, Pid, Reason}) instead of blocking
+% forever on a reply that will never come.
+proc_run({Pid, Ref}, M, F, A) ->
+    Pid ! {do, self(), {M, F, A}},
+    receive
+        {did, Res} ->
+            Res;
+        {'DOWN', Ref, process, Pid, Reason} ->
+            error({proc_died, Pid, Reason})
+    end.
+
+% Kill the helper process and block until its monitor reports it down.
+kill_proc({Pid, MonRef}) ->
+    true = exit(Pid, kill),
+    receive
+        {'DOWN', MonRef, _Type, _Object, _Info} -> ok
+    end.
+
+% Helper process loop: for each {do, Caller, {Mod, Fun, Args}} request,
+% evaluate the call and reply {did, Result}. An exception in the call is
+% reported as {exc, Class, Reason} rather than killing the helper.
+proc_loop() ->
+    receive
+        {do, Caller, {Mod, Fun, Args}} ->
+            Reply =
+                try apply(Mod, Fun, Args) of
+                    Value -> Value
+                catch
+                    Class:Reason -> {exc, Class, Reason}
+                end,
+            Caller ! {did, Reply},
+            proc_loop()
+    end.
+
+% Concurrent reader helpers
+
+% Open Path (without writing anything to it) and return a couch_cfile handle
+% for it. The intermediate raw handle is closed before returning.
+cfile(Path) ->
+    Modes = [binary, append, raw, read],
+    {ok, Raw} = file:open(Path, Modes),
+    {ok, CFd} = couch_cfile:dup(Raw),
+    ok = file:close(Raw),
+    CFd.
+
+% Start N monitored reader processes on Fd, each starting with a read count
+% of 0. Returns the list of their {Pid, MonitorRef} pairs.
+spawn_readers(N, Fd, Eof) ->
+    spawn_readers(N, Fd, Eof, []).
+
+spawn_readers(0, _Fd, _Eof, Readers) ->
+    Readers;
+spawn_readers(Remaining, Fd, Eof, Readers) ->
+    Reader = spawn_monitor(fun() -> reader(Fd, Eof, 0) end),
+    spawn_readers(Remaining - 1, Fd, Eof, [Reader | Readers]).
+
+% Reader process loop. Each round picks a random wait below
+% ?CONCURRENT_READER_JITTER_MSEC. For longer waits the loop blocks for that
+% long and is where a stop_reading message is picked up, making the process
+% exit with {shutdown, {read_results, Count}}. Otherwise it does one verified
+% pread and loops with the count incremented.
+reader(Fd, Eof, Count) ->
+    case rand:uniform(?CONCURRENT_READER_JITTER_MSEC) - 1 of
+        Wait when Wait > 2 ->
+            receive
+                stop_reading ->
+                    exit({shutdown, {read_results, Count}})
+            after Wait ->
+                pread_and_verify(Fd, Eof),
+                reader(Fd, Eof, Count + 1)
+            end;
+        _TooShort ->
+            % Not worth a timer for such a short wait, just yield
+            erlang:yield(),
+            pread_and_verify(Fd, Eof),
+            reader(Fd, Eof, Count + 1)
+    end.
+
+% Wait for every reader in the list to go down and add its reported read
+% count to Acc. A reader that exits with any reason other than
+% {shutdown, {read_results, Count}} makes this fail with
+% error({preader_crashed, Pid, Reason}).
+gather_read_results(PidRefs, Acc) ->
+    lists:foldl(
+        fun({Pid, Ref}, Total) ->
+            receive
+                {'DOWN', Ref, _, _, {shutdown, {read_results, Count}}} ->
+                    Total + Count;
+                {'DOWN', Ref, _, _, Other} ->
+                    error({preader_crashed, Pid, Other})
+            end
+        end,
+        Acc,
+        PidRefs
+    ).
+
+% Use a simple scheme: the byte at position X should have the value X rem 256
+%
+
+% Write Len pattern bytes to Fd, where the byte meant for file offset X has
+% the value X rem 256 and offsets start at Pos. Asserts that the end of the
+% file is at Pos + Len afterwards and returns that value.
+write(Fd, Pos, Len) ->
+    Offsets = lists:seq(Pos, Pos + Len - 1),
+    Bin = <<<<(Offset rem 256)>> || Offset <- Offsets>>,
+    % Sanity check on the generated payload
+    ?assertEqual(Len, byte_size(Bin)),
+    ok = file:write(Fd, Bin),
+    NewEof = Pos + Len,
+    ?assertEqual({ok, NewEof}, file:position(Fd, eof)),
+    NewEof.
+
+% Do one pread at a random position and length (each between 0 and Eof) and
+% check the outcome: eof when starting at or past the end or when asking for
+% zero bytes, the full length when the range fits in the file, and a read
+% cut short at Eof otherwise. Returned bytes are checked against the pattern.
+pread_and_verify(Fd, Eof) ->
+    Pos = rand:uniform(Eof + 1) - 1,
+    Len = rand:uniform(Eof + 1) - 1,
+    if
+        Pos >= Eof orelse Len =:= 0 ->
+            ?assertEqual(eof, file:pread(Fd, Pos, Len));
+        Pos + Len =< Eof ->
+            {ok, Bin} = file:pread(Fd, Pos, Len),
+            ?assert(is_binary(Bin)),
+            ?assertEqual(Len, byte_size(Bin)),
+            verify_binary(Pos, Bin);
+        true ->
+            % The range runs past the end: only the bytes up to Eof come back
+            {ok, Bin} = file:pread(Fd, Pos, Len),
+            ?assertEqual(Eof - Pos, byte_size(Bin)),
+            verify_binary(Pos, Bin)
+    end.
+
+% Assert that a binary read from file offset Pos follows the pattern: each
+% byte equals its own file offset rem 256.
+verify_binary(_Pos, <<>>) ->
+    ok;
+verify_binary(Pos, <<Byte, Tail/binary>>) ->
+    Expected = Pos rem 256,
+    ?assertEqual(Expected, Byte),
+    verify_binary(Pos + 1, Tail).
diff --git a/src/couch/test/eunit/couch_file_tests.erl b/src/couch/test/eunit/couch_file_tests.erl
index df4caa714..4397adf0c 100644
--- a/src/couch/test/eunit/couch_file_tests.erl
+++ b/src/couch/test/eunit/couch_file_tests.erl
@@ -67,6 +67,62 @@ should_close_file_properly() ->
 should_create_empty_new_files(Fd) ->
     ?_assertMatch({ok, 0}, couch_file:bytes(Fd)).
 
+% Fixture setup for the cfile enable/disable tests: start couch and hand the
+% resulting context to the test and to cfile_teardown/1.
+cfile_setup() ->
+    Ctx = test_util:start_couch(),
+    Ctx.
+
+% Fixture cleanup: drop the io_priority entry from the process dictionary,
+% delete every config key the tests may have set, then stop couch.
+cfile_teardown(Ctx) ->
+    erase(io_priority),
+    TouchedKeys = [
+        {"couchdb", "use_cfile"},
+        {"couchdb", "cfile_skip_ioq"},
+        {"ioq.bypass", "read"}
+    ],
+    lists:foreach(
+        fun({Section, Key}) -> config:delete(Section, Key, false) end,
+        TouchedKeys
+    ),
+    test_util:stop_couch(Ctx).
+
+% EUnit generator: run the open/append/read round trip under each
+% combination of cfile and ioq bypass settings, each in its own
+% cfile_setup/0 + cfile_teardown/1 fixture.
+cfile_enable_disable_test_() ->
+    Tests = [
+        ?TDEF_FE(t_cfile_default),
+        ?TDEF_FE(t_cfile_enabled),
+        ?TDEF_FE(t_cfile_disabled),
+        ?TDEF_FE(t_cfile_with_ioq_bypass),
+        ?TDEF_FE(t_cfile_without_ioq_bypass)
+    ],
+    {foreach, fun cfile_setup/0, fun cfile_teardown/1, Tests}.
+
+% Round trip with no cfile related config set at all.
+t_cfile_default(_Ctx) ->
+    t_can_open_read_and_write().
+
+% Round trip with use_cfile explicitly set to "true".
+t_cfile_enabled(_Ctx) ->
+    % "true" is already the default; here it is set explicitly to cover
+    % that path as well
+    config:set("couchdb", "use_cfile", "true", false),
+    t_can_open_read_and_write().
+
+% Round trip with use_cfile set to "false".
+t_cfile_disabled(_Ctx) ->
+    config:set("couchdb", "use_cfile", "false", false),
+    t_can_open_read_and_write().
+
+% Round trip with cfile enabled, cfile_skip_ioq set to "true" and the ioq
+% read bypass set to "true".
+t_cfile_with_ioq_bypass(_Ctx) ->
+    Settings = [
+        {"couchdb", "use_cfile", "true"},
+        {"couchdb", "cfile_skip_ioq", "true"},
+        {"ioq.bypass", "read", "true"}
+    ],
+    lists:foreach(
+        fun({Section, Key, Value}) -> config:set(Section, Key, Value, false) end,
+        Settings
+    ),
+    t_can_open_read_and_write().
+
+% Round trip with cfile enabled and the ioq read bypass set to "true", but
+% cfile_skip_ioq set to "false".
+t_cfile_without_ioq_bypass(_Ctx) ->
+    Settings = [
+        {"couchdb", "use_cfile", "true"},
+        {"couchdb", "cfile_skip_ioq", "false"},
+        {"ioq.bypass", "read", "true"}
+    ],
+    lists:foreach(
+        fun({Section, Key, Value}) -> config:set(Section, Key, Value, false) end,
+        Settings
+    ),
+    t_can_open_read_and_write().
+
+% Shared round trip used by the t_cfile_* tests: create a couch_file, append
+% the term foo at offset 0, read it back from that offset, and close.
+t_can_open_read_and_write() ->
+    Path = ?tempfile(),
+    {ok, Fd} = couch_file:open(Path, [create, overwrite]),
+    ioq:set_io_priority({interactive, <<"somedb">>}),
+    AppendRes = couch_file:append_term(Fd, foo),
+    ?assertMatch({ok, 0, _}, AppendRes),
+    ReadRes = couch_file:pread_term(Fd, 0),
+    ?assertEqual({ok, foo}, ReadRes),
+    ok = couch_file:close(Fd).
+
 read_write_test_() ->
     {
         "Common file read/write tests",
@@ -260,8 +316,10 @@ should_apply_overwrite_create_option(Fd) ->
     ?assertEqual(ok, couch_file:close(Fd)),
     {ok, Fd1} = couch_file:open(Path, [create, overwrite]),
     unlink(Fd1),
-    ExpectError = {error, {read_beyond_eof, Path, Pos, 5, 0, 0}},
-    ?assertEqual(ExpectError, couch_file:pread_term(Fd1, Pos)).
+    ?assertMatch(
+        {error, {read_beyond_eof, Path, Pos, _, _, _}},
+        couch_file:pread_term(Fd1, Pos)
+    ).
 
 should_error_on_creation_if_exists(Fd) ->
     {_, Path} = couch_file:process_info(Fd),
diff --git a/src/ioq/src/ioq.erl b/src/ioq/src/ioq.erl
index 586a75084..8e38c2a00 100644
--- a/src/ioq/src/ioq.erl
+++ b/src/ioq/src/ioq.erl
@@ -17,6 +17,7 @@
 -export([start_link/0, call/3, call_search/3]).
 -export([get_queue_lengths/0]).
 -export([get_io_priority/0, set_io_priority/1, maybe_set_io_priority/1]).
+-export([bypass/2]).
 -export([init/1, handle_call/3, handle_cast/2, handle_info/2]).
 
 % config_listener api
@@ -59,18 +60,18 @@ call_search(Fd, Msg, Metadata) ->
     call(Fd, Msg, Metadata).
 
 call(Fd, Msg, Metadata) ->
-    Priority = io_class(Msg, Metadata),
-    case bypass(Priority) of
+    case bypass(Msg, Metadata) of
         true ->
             gen_server:call(Fd, Msg, infinity);
         false ->
-            queued_call(Fd, Msg, Priority)
+            queued_call(Fd, Msg, io_class(Msg, Metadata))
     end.
 
 get_queue_lengths() ->
     gen_server:call(?MODULE, get_queue_lengths).
 
-bypass(Priority) ->
+bypass(Msg, Metadata) ->
+    Priority = io_class(Msg, Metadata),
     case Priority of
         os_process -> config:get_boolean("ioq.bypass", "os_process", true);
         read -> config:get_boolean("ioq.bypass", "read", true);

(couchdb) 10/38: Implement parallel preads

Reply via email to