uvm_coredump_walkmap() is the API the ELF code uses to get information 
about the process's memory image so that it can write it out to the core 
file.  Currently, uvm_coredump_walkmap() just does a single pass across 
the process's VA, invoking a callback on each range.  The ELF code calls 
it twice: once to get a count of the ranges, then it allocates memory to 
hold the segment information, and then it calls it again to fill in that 
array.
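
For reference, the calling pattern in the current ELF code is roughly 
this (condensed from the code the diff below removes from coredump_elf()):

        /* Pass 1: count the entries. */
        cs.npsections = 0;
        error = uvm_coredump_walkmap(p, NULL, coredump_countsegs_elf, &cs);
        if (error)
                goto out;

        /* Count the PT_NOTE section, size the notes, write the ELF header. */
        cs.npsections++;
        psections = mallocarray(cs.npsections, sizeof(Elf_Phdr),
            M_TEMP, M_WAITOK|M_ZERO);

        /* Pass 2: now write the P-section headers. */
        ws.secoff = secstart;
        ws.psections = psections;
        error = uvm_coredump_walkmap(p, cookie, coredump_writeseghdrs_elf, &ws);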

The catch is that the two calls must return the same number of ranges so 
that a consistent ELF header can be generated; if they don't, a kernel 
assert will trigger.  That's easy right now, when every vm_map_entry in 
the process's VA results in a single range, but it would be nice if the 
coredump could be more precise about what is included and, for example, 
not try to write out pages which were never faulted into existence.  In 
that case it may be necessary to hold locks or references between the 
first and second walk of the VA, which means the logic of the multiple 
walks should live in the UVM code and not in the ELF code.

So, the diff below changes uvm_coredump_walkmap() to instead take two 
callbacks.  To quote a comment in the diff:
+/*
+ * Walk the VA space for a process to identify what to write to
+ * a coredump.  First the number of contiguous ranges is counted,
+ * then the 'setup' callback is invoked to prepare for actually
+ * recording the ranges, then the VA is walked again, invoking
+ * the 'walk' callback for each range.  The number of ranges walked
+ * is guaranteed to match the count seen by the 'setup' callback.
+ */
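
A consumer now supplies both callbacks up front and makes a single call.  
As a purely illustrative sketch of the resulting usage (the names 
dumpstate, mysetup, and mywalk are hypothetical; the real implementations 
are coredump_setup_elf() and coredump_walk_elf() in the exec_elf.c part 
of the diff):

        struct dumpstate {
                struct range { vaddr_t va; vsize_t len; } *ranges;
                int nranges;
        };

        static int
        mysetup(int nsegment, void *cookie)
        {
                struct dumpstate *ds = cookie;

                /* Called exactly once, with the guaranteed segment count. */
                ds->ranges = mallocarray(nsegment, sizeof(*ds->ranges),
                    M_TEMP, M_WAITOK|M_ZERO);
                ds->nranges = nsegment;
                return (0);
        }

        static int
        mywalk(vaddr_t start, vaddr_t realend, vaddr_t end, vm_prot_t prot,
            int nsegment, void *cookie)
        {
                struct dumpstate *ds = cookie;

                /* Called once per range; nsegment is this range's index. */
                ds->ranges[nsegment].va = start;
                ds->ranges[nsegment].len = end - start;
                return (0);
        }

        /* ...and in the caller: */
        struct dumpstate ds = { NULL, 0 };
        error = uvm_coredump_walkmap(p, mysetup, mywalk, &ds);

In the walk callback, realend vs. end lets the consumer distinguish how 
much of the range is actually written out (p_filesz) from its full memory 
size (p_memsz), as coredump_walk_elf() does.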

The ELF code of course changes to match, so that what was previously 
done between the two calls is now done in the coredump_setup_elf() 
callback.


I have a followup diff that then alters uvm_unix.c to treat unshared 
amaps (anonymous maps) as multiple ranges, so that pages that were never 
faulted don't get written to the coredump: writing them out is useless 
(they're just zeros) and the dump will fail if faulting in those pages 
would hit the process's memory limit.  That diff takes advantage of the 
API change in the present diff to hold extra references on some amaps 
between the two walks of the VA.

(This all originally arose from a report to bugs@ from semarie@ about 
kernel log messages generated when firefox coredumps.)

Comments, concerns, oks?


Philip Guenther

Index: uvm/uvm_extern.h
===================================================================
RCS file: /data/src/openbsd/src/sys/uvm/uvm_extern.h,v
retrieving revision 1.140
diff -u -p -r1.140 uvm_extern.h
--- uvm/uvm_extern.h    12 Feb 2017 04:55:08 -0000      1.140
+++ uvm/uvm_extern.h    16 Feb 2017 04:40:44 -0000
@@ -240,20 +240,6 @@ extern struct uvm_constraint_range *uvm_
 extern struct pool *uvm_aiobuf_pool;
 
 /*
- * used to keep state while iterating over the map for a core dump.
- */
-struct uvm_coredump_state {
-       void *cookie;           /* opaque for the caller */
-       vaddr_t start;          /* start of region */
-       vaddr_t realend;        /* real end of region */
-       vaddr_t end;            /* virtual end of region */
-       vm_prot_t prot;         /* protection of region */
-       int flags;              /* flags; see below */
-};
-
-#define        UVM_COREDUMP_STACK      0x01    /* region is user stack */
-
-/*
  * the various kernel maps, owned by MD code
  */
 extern struct vm_map *exec_map;
@@ -456,9 +442,13 @@ int                        uvm_pglistalloc(psize_t, paddr_t, 
 void                   uvm_pglistfree(struct pglist *);
 void                   uvm_pmr_use_inc(paddr_t, paddr_t);
 void                   uvm_swap_init(void);
-int                    uvm_coredump_walkmap(struct proc *,
-                           void *, int (*)(struct proc *, void *,
-                           struct uvm_coredump_state *), void *);
+typedef int            uvm_coredump_setup_cb(int _nsegment, void *_cookie);
+typedef int            uvm_coredump_walk_cb(vaddr_t _start, vaddr_t _realend,
+                           vaddr_t _end, vm_prot_t _prot, int _nsegment,
+                           void *_cookie);
+int                    uvm_coredump_walkmap(struct proc *_p,
+                           uvm_coredump_setup_cb *_setup,
+                           uvm_coredump_walk_cb *_walk, void *_cookie);
 void                   uvm_grow(struct proc *, vaddr_t);
 void                   uvm_deallocate(vm_map_t, vaddr_t, vsize_t);
 struct uvm_object      *uvn_attach(struct vnode *, vm_prot_t);
Index: uvm/uvm_unix.c
===================================================================
RCS file: /data/src/openbsd/src/sys/uvm/uvm_unix.c,v
retrieving revision 1.61
diff -u -p -r1.61 uvm_unix.c
--- uvm/uvm_unix.c      2 Feb 2017 06:23:58 -0000       1.61
+++ uvm/uvm_unix.c      16 Feb 2017 06:31:50 -0000
@@ -135,90 +135,101 @@ uvm_grow(struct proc *p, vaddr_t sp)
 #ifndef SMALL_KERNEL
 
 /*
- * Walk the VA space for a process, invoking 'func' on each present range
- * that should be included in a coredump.
+ * Common logic for whether a map entry should be included in a coredump
  */
+static inline int
+uvm_should_coredump(struct proc *p, struct vm_map_entry *entry)
+{
+       if (!(entry->protection & PROT_WRITE) &&
+           entry->aref.ar_amap == NULL &&
+           entry->start != p->p_p->ps_sigcode)
+               return 0;
+
+       /*
+        * Skip ranges marked as unreadable, as uiomove(UIO_USERSPACE)
+        * will fail on them.  Maybe this really should be a test of
+        * entry->max_protection, but doing
+        *      uvm_map_extract(UVM_EXTRACT_FIXPROT)
+        * on each such page would suck.
+        */
+       if ((entry->protection & PROT_READ) == 0)
+               return 0;
+
+       /* Don't dump mmaped devices. */
+       if (entry->object.uvm_obj != NULL &&
+           UVM_OBJ_IS_DEVICE(entry->object.uvm_obj))
+               return 0;
+
+       if (entry->start >= VM_MAXUSER_ADDRESS)
+               return 0;
+
+       return 1;
+}
+
+/*
+ * Walk the VA space for a process to identify what to write to
+ * a coredump.  First the number of contiguous ranges is counted,
+ * then the 'setup' callback is invoked to prepare for actually
+ * recording the ranges, then the VA is walked again, invoking
+ * the 'walk' callback for each range.  The number of ranges walked
+ * is guaranteed to match the count seen by the 'setup' callback.
+ */
+
 int
-uvm_coredump_walkmap(struct proc *p, void *iocookie,
-    int (*func)(struct proc *, void *, struct uvm_coredump_state *),
-    void *cookie)
+uvm_coredump_walkmap(struct proc *p, uvm_coredump_setup_cb *setup,
+    uvm_coredump_walk_cb *walk, void *cookie)
 {
-       struct uvm_coredump_state state;
        struct vmspace *vm = p->p_vmspace;
        struct vm_map *map = &vm->vm_map;
        struct vm_map_entry *entry;
-       vaddr_t top;
-       int error;
+       vaddr_t end;
+       int nsegment, error;
 
+       /*
+        * Walk the map once to count the segments.
+        */
+       nsegment = 0;
        RBT_FOREACH(entry, uvm_map_addr, &map->addr) {
-               state.cookie = cookie;
-               state.prot = entry->protection;
-               state.flags = 0;
-
                /* should never happen for a user process */
                if (UVM_ET_ISSUBMAP(entry)) {
                        panic("%s: user process with submap?", __func__);
                }
 
-               if (!(entry->protection & PROT_WRITE) &&
-                   entry->aref.ar_amap == NULL &&
-                   entry->start != p->p_p->ps_sigcode)
-                       continue;
-
-               /*
-                * Skip pages marked as unreadable, as uiomove(UIO_USERSPACE)
-                * will fail on them.  Maybe this really should be a test of
-                * entry->max_protection, but doing
-                *      uvm_map_extract(UVM_EXTRACT_FIXPROT)
-                * when dumping such a mapping would suck.
-                */
-               if ((entry->protection & PROT_READ) == 0)
+               if (! uvm_should_coredump(p, entry))
                        continue;
 
-               /* Don't dump mmaped devices. */
-               if (entry->object.uvm_obj != NULL &&
-                   UVM_OBJ_IS_DEVICE(entry->object.uvm_obj))
-                       continue;
-
-               state.start = entry->start;
-               state.realend = entry->end;
-               state.end = entry->end;
+               nsegment++;
+       }
 
-               if (state.start >= VM_MAXUSER_ADDRESS)
+       /*
+        * Okay, we have a count in nsegment; invoke the setup callback.
+        */
+       error = (*setup)(nsegment, cookie);
+       if (error)
+               goto cleanup;
+
+       /*
+        * Setup went okay, so do the second walk, invoking the walk
+        * callback on the counted segments.
+        */
+       nsegment = 0;
+       RBT_FOREACH(entry, uvm_map_addr, &map->addr) {
+               if (! uvm_should_coredump(p, entry))
                        continue;
 
-               if (state.end > VM_MAXUSER_ADDRESS)
-                       state.end = VM_MAXUSER_ADDRESS;
-
-#ifdef MACHINE_STACK_GROWS_UP
-               if ((vaddr_t)vm->vm_maxsaddr <= state.start &&
-                   state.start < ((vaddr_t)vm->vm_maxsaddr + MAXSSIZ)) {
-                       top = round_page((vaddr_t)vm->vm_maxsaddr +
-                           ptoa(vm->vm_ssize));
-                       if (state.end > top)
-                               state.end = top;
-
-                       if (state.start >= state.end)
-                               continue;
-#else
-               if (state.start >= (vaddr_t)vm->vm_maxsaddr) {
-                       top = trunc_page((vaddr_t)vm->vm_minsaddr -
-                           ptoa(vm->vm_ssize));
-                       if (state.start < top)
-                               state.start = top;
-
-                       if (state.start >= state.end)
-                               continue;
-#endif
-                       state.flags |= UVM_COREDUMP_STACK;
-               }
+               end = entry->end;
+               if (end > VM_MAXUSER_ADDRESS)
+                       end = VM_MAXUSER_ADDRESS;
 
-               error = (*func)(p, iocookie, &state);
+               error = (*walk)(entry->start, end, end, entry->protection,
+                   nsegment, cookie);
                if (error)
-                       return (error);
+                       break;
+               nsegment++;
        }
 
-       return (0);
+cleanup:
+       return error;
 }
 
 #endif /* !SMALL_KERNEL */
Index: kern/exec_elf.c
===================================================================
RCS file: /data/src/openbsd/src/sys/kern/exec_elf.c,v
retrieving revision 1.138
diff -u -p -r1.138 exec_elf.c
--- kern/exec_elf.c     11 Feb 2017 06:07:03 -0000      1.138
+++ kern/exec_elf.c     16 Feb 2017 06:52:11 -0000
@@ -933,21 +933,20 @@ coredump_elf(struct proc *p, void *cooki
 }
 #else /* !SMALL_KERNEL */
 
-
-struct countsegs_state {
-       int     npsections;
-};
-
-int    coredump_countsegs_elf(struct proc *, void *,
-           struct uvm_coredump_state *);
-
 struct writesegs_state {
-       Elf_Phdr *psections;
+       off_t   notestart;
+       off_t   secstart;
        off_t   secoff;
+       struct  proc *p;
+       void    *iocookie;
+       Elf_Phdr *psections;
+       size_t  psectionslen;
+       size_t  notesize;
+       int     npsections;
 };
 
-int    coredump_writeseghdrs_elf(struct proc *, void *,
-           struct uvm_coredump_state *);
+uvm_coredump_setup_cb  coredump_setup_elf;
+uvm_coredump_walk_cb   coredump_walk_elf;
 
 int    coredump_notes_elf(struct proc *, void *, size_t *);
 int    coredump_note_elf(struct proc *, void *, size_t *);
@@ -960,184 +959,178 @@ int     coredump_writenote_elf(struct proc *
 int
 coredump_elf(struct proc *p, void *cookie)
 {
-       Elf_Ehdr ehdr;
-       Elf_Phdr *psections = NULL;
-       struct countsegs_state cs;
+#ifdef DIAGNOSTIC
+       off_t offset;
+#endif
        struct writesegs_state ws;
-       off_t notestart, secstart, offset;
-       size_t notesize, psectionslen;
+       size_t notesize;
        int error, i;
 
+       ws.p = p;
+       ws.iocookie = cookie;
+       ws.psections = NULL;
+
        /*
-        * We have to make a total of 3 passes across the map:
-        *
-        *      1. Count the number of map entries (the number of
-        *         PT_LOAD sections).
-        *
-        *      2. Write the P-section headers.
-        *
-        *      3. Write the P-sections.
+        * Walk the map to get all the segment offsets and lengths,
+        * write out the ELF header.
         */
-
-       /* Pass 1: count the entries. */
-       cs.npsections = 0;
-       error = uvm_coredump_walkmap(p, NULL, coredump_countsegs_elf, &cs);
-       if (error)
-               goto out;
-
-       /* Count the PT_NOTE section. */
-       cs.npsections++;
-
-       /* Get the size of the notes. */
-       error = coredump_notes_elf(p, NULL, &notesize);
-       if (error)
-               goto out;
-
-       memset(&ehdr, 0, sizeof(ehdr));
-       memcpy(ehdr.e_ident, ELFMAG, SELFMAG);
-       ehdr.e_ident[EI_CLASS] = ELF_TARG_CLASS;
-       ehdr.e_ident[EI_DATA] = ELF_TARG_DATA;
-       ehdr.e_ident[EI_VERSION] = EV_CURRENT;
-       /* XXX Should be the OSABI/ABI version of the executable. */
-       ehdr.e_ident[EI_OSABI] = ELFOSABI_SYSV;
-       ehdr.e_ident[EI_ABIVERSION] = 0;
-       ehdr.e_type = ET_CORE;
-       /* XXX This should be the e_machine of the executable. */
-       ehdr.e_machine = ELF_TARG_MACH;
-       ehdr.e_version = EV_CURRENT;
-       ehdr.e_entry = 0;
-       ehdr.e_phoff = sizeof(ehdr);
-       ehdr.e_shoff = 0;
-       ehdr.e_flags = 0;
-       ehdr.e_ehsize = sizeof(ehdr);
-       ehdr.e_phentsize = sizeof(Elf_Phdr);
-       ehdr.e_phnum = cs.npsections;
-       ehdr.e_shentsize = 0;
-       ehdr.e_shnum = 0;
-       ehdr.e_shstrndx = 0;
-
-       /* Write out the ELF header. */
-       error = coredump_write(cookie, UIO_SYSSPACE, &ehdr, sizeof(ehdr));
-       if (error)
-               goto out;
-
-       psections = mallocarray(cs.npsections, sizeof(Elf_Phdr),
-           M_TEMP, M_WAITOK|M_ZERO);
-       psectionslen = cs.npsections * sizeof(Elf_Phdr);
-
-       offset = sizeof(ehdr);
-       notestart = offset + psectionslen;
-       secstart = notestart + notesize;
-
-       /* Pass 2: now write the P-section headers. */
-       ws.secoff = secstart;
-       ws.psections = psections;
-       error = uvm_coredump_walkmap(p, cookie, coredump_writeseghdrs_elf, &ws);
+       error = uvm_coredump_walkmap(p, coredump_setup_elf,
+           coredump_walk_elf, &ws);
        if (error)
                goto out;
 
-       /* Write out the PT_NOTE header. */
-       ws.psections->p_type = PT_NOTE;
-       ws.psections->p_offset = notestart;
-       ws.psections->p_vaddr = 0;
-       ws.psections->p_paddr = 0;
-       ws.psections->p_filesz = notesize;
-       ws.psections->p_memsz = 0;
-       ws.psections->p_flags = PF_R;
-       ws.psections->p_align = ELFROUNDSIZE;
-
-       error = coredump_write(cookie, UIO_SYSSPACE, psections, psectionslen);
+       error = coredump_write(cookie, UIO_SYSSPACE, ws.psections,
+           ws.psectionslen);
        if (error)
                goto out;
 
-#ifdef DIAGNOSTIC
-       offset += psectionslen;
-       if (offset != notestart)
-               panic("coredump: offset %lld != notestart %lld",
-                   (long long) offset, (long long) notestart);
-#endif
-
        /* Write out the notes. */
        error = coredump_notes_elf(p, cookie, &notesize);
        if (error)
                goto out;
 
 #ifdef DIAGNOSTIC
-       offset += notesize;
-       if (offset != secstart)
+       if (notesize != ws.notesize)
+               panic("coredump: notesize changed: %zu != %zu",
+                   ws.notesize, notesize);
+       offset = ws.notestart + notesize;
+       if (offset != ws.secstart)
                panic("coredump: offset %lld != secstart %lld",
-                   (long long) offset, (long long) secstart);
+                   (long long) offset, (long long) ws.secstart);
 #endif
 
        /* Pass 3: finally, write the sections themselves. */
-       for (i = 0; i < cs.npsections - 1; i++) {
-               if (psections[i].p_filesz == 0)
+       for (i = 0; i < ws.npsections - 1; i++) {
+               Elf_Phdr *pent = &ws.psections[i];
+               if (pent->p_filesz == 0)
                        continue;
 
 #ifdef DIAGNOSTIC
-               if (offset != psections[i].p_offset)
+               if (offset != pent->p_offset)
                        panic("coredump: offset %lld != p_offset[%d] %lld",
                            (long long) offset, i,
-                           (long long) psections[i].p_filesz);
+                           (long long) pent->p_filesz);
 #endif
 
                error = coredump_write(cookie, UIO_USERSPACE,
-                   (void *)(vaddr_t)psections[i].p_vaddr,
-                   psections[i].p_filesz);
+                   (void *)(vaddr_t)pent->p_vaddr, pent->p_filesz);
                if (error)
                        goto out;
 
-               coredump_unmap(cookie, (vaddr_t)psections[i].p_vaddr,
-                   (vaddr_t)psections[i].p_vaddr + psections[i].p_filesz);
+               coredump_unmap(cookie, (vaddr_t)pent->p_vaddr,
+                   (vaddr_t)pent->p_vaddr + pent->p_filesz);
 
 #ifdef DIAGNOSTIC
-               offset += psections[i].p_filesz;
+               offset += ws.psections[i].p_filesz;
 #endif
        }
 
 out:
-       free(psections, M_TEMP, psectionslen);
+       free(ws.psections, M_TEMP, ws.psectionslen);
        return (error);
 }
 
+
+
 int
-coredump_countsegs_elf(struct proc *p, void *iocookie,
-    struct uvm_coredump_state *us)
+coredump_setup_elf(int segment_count, void *cookie)
 {
-       struct countsegs_state *cs = us->cookie;
+       Elf_Ehdr ehdr;
+       struct writesegs_state *ws = cookie;
+       Elf_Phdr *note;
+       int error;
+
+       /* Get the count of segments, plus one for the PT_NOTE */
+       ws->npsections = segment_count + 1;
+
+       /* Get the size of the notes. */
+       error = coredump_notes_elf(ws->p, NULL, &ws->notesize);
+       if (error)
+               return error;
+
+       /* Setup the ELF header */
+       memset(&ehdr, 0, sizeof(ehdr));
+       memcpy(ehdr.e_ident, ELFMAG, SELFMAG);
+       ehdr.e_ident[EI_CLASS] = ELF_TARG_CLASS;
+       ehdr.e_ident[EI_DATA] = ELF_TARG_DATA;
+       ehdr.e_ident[EI_VERSION] = EV_CURRENT;
+       /* XXX Should be the OSABI/ABI version of the executable. */
+       ehdr.e_ident[EI_OSABI] = ELFOSABI_SYSV;
+       ehdr.e_ident[EI_ABIVERSION] = 0;
+       ehdr.e_type = ET_CORE;
+       /* XXX This should be the e_machine of the executable. */
+       ehdr.e_machine = ELF_TARG_MACH;
+       ehdr.e_version = EV_CURRENT;
+       ehdr.e_entry = 0;
+       ehdr.e_phoff = sizeof(ehdr);
+       ehdr.e_shoff = 0;
+       ehdr.e_flags = 0;
+       ehdr.e_ehsize = sizeof(ehdr);
+       ehdr.e_phentsize = sizeof(Elf_Phdr);
+       ehdr.e_phnum = ws->npsections;
+       ehdr.e_shentsize = 0;
+       ehdr.e_shnum = 0;
+       ehdr.e_shstrndx = 0;
+
+       /* Write out the ELF header. */
+       error = coredump_write(ws->iocookie, UIO_SYSSPACE, &ehdr, sizeof(ehdr));
+       if (error)
+               return error;
+
+       /*
+        * Allocate the segment header array and setup to collect
+        * the section sizes and offsets
+        */
+       ws->psections = mallocarray(ws->npsections, sizeof(Elf_Phdr),
+           M_TEMP, M_WAITOK|M_ZERO);
+       ws->psectionslen = ws->npsections * sizeof(Elf_Phdr);
+
+       ws->notestart = sizeof(ehdr) + ws->psectionslen;
+       ws->secstart = ws->notestart + ws->notesize;
+       ws->secoff = ws->secstart;
+
+       /* Fill in the PT_NOTE segment header in the last slot */
+       note = &ws->psections[ws->npsections - 1];
+       note->p_type = PT_NOTE;
+       note->p_offset = ws->notestart;
+       note->p_vaddr = 0;
+       note->p_paddr = 0;
+       note->p_filesz = ws->notesize;
+       note->p_memsz = 0;
+       note->p_flags = PF_R;
+       note->p_align = ELFROUNDSIZE;
 
-       cs->npsections++;
        return (0);
 }
 
 int
-coredump_writeseghdrs_elf(struct proc *p, void *iocookie,
-    struct uvm_coredump_state *us)
+coredump_walk_elf(vaddr_t start, vaddr_t realend, vaddr_t end, vm_prot_t prot,
+    int nsegment, void *cookie)
 {
-       struct writesegs_state *ws = us->cookie;
+       struct writesegs_state *ws = cookie;
        Elf_Phdr phdr;
        vsize_t size, realsize;
 
-       size = us->end - us->start;
-       realsize = us->realend - us->start;
+       size = end - start;
+       realsize = realend - start;
 
        phdr.p_type = PT_LOAD;
        phdr.p_offset = ws->secoff;
-       phdr.p_vaddr = us->start;
+       phdr.p_vaddr = start;
        phdr.p_paddr = 0;
        phdr.p_filesz = realsize;
        phdr.p_memsz = size;
        phdr.p_flags = 0;
-       if (us->prot & PROT_READ)
+       if (prot & PROT_READ)
                phdr.p_flags |= PF_R;
-       if (us->prot & PROT_WRITE)
+       if (prot & PROT_WRITE)
                phdr.p_flags |= PF_W;
-       if (us->prot & PROT_EXEC)
+       if (prot & PROT_EXEC)
                phdr.p_flags |= PF_X;
        phdr.p_align = PAGE_SIZE;
 
        ws->secoff += phdr.p_filesz;
-       *ws->psections++ = phdr;
+       ws->psections[nsegment] = phdr;
 
        return (0);
 }
