On Sat, 17 Jun 2017, Tetsuo Handa wrote:

> diff --git a/mm/oom_kill.c b/mm/oom_kill.c
> index 04c9143..cf1d331 100644
> --- a/mm/oom_kill.c
> +++ b/mm/oom_kill.c
> @@ -470,38 +470,9 @@ static bool __oom_reap_task_mm(struct task_struct *tsk, struct mm_struct *mm)
>  {
>       struct mmu_gather tlb;
>       struct vm_area_struct *vma;
> -     bool ret = true;
> -
> -     /*
> -      * We have to make sure to not race with the victim exit path
> -      * and cause premature new oom victim selection:
> -      * __oom_reap_task_mm           exit_mm
> -      *   mmget_not_zero
> -      *                                mmput
> -      *                                  atomic_dec_and_test
> -      *                                exit_oom_victim
> -      *                              [...]
> -      *                              out_of_memory
> -      *                                select_bad_process
> -      *                                  # no TIF_MEMDIE task selects new victim
> -      *  unmap_page_range # frees some memory
> -      */
> -     mutex_lock(&oom_lock);
> -
> -     if (!down_read_trylock(&mm->mmap_sem)) {
> -             ret = false;
> -             goto unlock_oom;
> -     }
>  
> -     /*
> -      * increase mm_users only after we know we will reap something so
> -      * that the mmput_async is called only when we have reaped something
> -      * and delayed __mmput doesn't matter that much
> -      */
> -     if (!mmget_not_zero(mm)) {
> -             up_read(&mm->mmap_sem);
> -             goto unlock_oom;
> -     }
> +     if (!down_read_trylock(&mm->mmap_sem))
> +             return false;
>  
>       /*
>        * Tell all users of get_user/copy_from_user etc... that the content
> @@ -537,16 +508,7 @@ static bool __oom_reap_task_mm(struct task_struct *tsk, struct mm_struct *mm)
>                       K(get_mm_counter(mm, MM_FILEPAGES)),
>                       K(get_mm_counter(mm, MM_SHMEMPAGES)));
>       up_read(&mm->mmap_sem);
> -
> -     /*
> -      * Drop our reference but make sure the mmput slow path is called from a
> -      * different context because we shouldn't risk we get stuck there and
> -      * put the oom_reaper out of the way.
> -      */
> -     mmput_async(mm);
> -unlock_oom:
> -     mutex_unlock(&oom_lock);
> -     return ret;
> +     return true;
>  }
>  
>  #define MAX_OOM_REAP_RETRIES 10
> @@ -569,12 +531,31 @@ static void oom_reap_task(struct task_struct *tsk)
>  
>  done:
>       tsk->oom_reaper_list = NULL;
> +     /*
> +      * Drop a mm_users reference taken by mark_oom_victim().
> +      * A mm_count reference taken by mark_oom_victim() remains.
> +      */
> +     mmput_async(mm);

This doesn't prevent serial oom killing by either the system oom killer 
or the memcg oom killer.

The oom killer cannot detect tsk_is_oom_victim() if the task has either 
been removed from the tasklist or has already done cgroup_exit().  For 
memcg oom killings in particular, cgroup_exit() is usually called very 
shortly after the oom killer has sent the SIGKILL.  If the oom reaper has 
not finished (for example, because it failed to grab mm->mmap_sem) before 
another memcg charge occurs after cgroup_exit(victim), additional 
processes are killed because the task iteration never sees the victim.
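
The check that is supposed to prevent this lives in oom_evaluate_task(): 
an existing victim that has not yet been reaped (no MMF_OOM_SKIP) aborts 
the scan.  Roughly (paraphrased from mm/oom_kill.c, details trimmed):

	static int oom_evaluate_task(struct task_struct *task, void *arg)
	{
		struct oom_control *oc = arg;
		...
		/*
		 * An unreaped victim (MMF_OOM_SKIP not yet set) aborts the
		 * scan so that no second victim is selected.  This only helps
		 * if the victim is still visible to the iteration: once it
		 * has done cgroup_exit(), it is never passed to this callback
		 * at all.
		 */
		if (!is_sysrq_oom(oc) && tsk_is_oom_victim(task)) {
			if (test_bit(MMF_OOM_SKIP, &task->signal->oom_mm->flags))
				goto next;
			goto abort;
		}
		...
	}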

This can easily kill all processes attached to the memcg without any 
memory having been freed by any victim.
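
That is because mem_cgroup_scan_tasks() only walks tasks still attached 
to the cgroup; roughly (paraphrased from mm/memcontrol.c):

	int mem_cgroup_scan_tasks(struct mem_cgroup *memcg,
				  int (*fn)(struct task_struct *, void *), void *arg)
	{
		struct mem_cgroup *iter;
		int ret = 0;

		for_each_mem_cgroup_tree(iter, memcg) {
			struct css_task_iter it;
			struct task_struct *task;

			/*
			 * A task that has done cgroup_exit() is no longer on
			 * the css_set lists, so the iterator skips it and
			 * fn() == oom_evaluate_task() never aborts on it.
			 */
			css_task_iter_start(&iter->css, &it);
			while (!ret && (task = css_task_iter_next(&it)))
				ret = fn(task, arg);
			css_task_iter_end(&it);
			...
		}
		return ret;
	}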
