read barriers

Peter Hurley Thu, 16 Apr 2015 05:00:02 -0400

On 04/16/2015 02:39 AM, Mario Kleiner wrote:
> On 04/16/2015 03:29 AM, Peter Hurley wrote:
>> On 04/15/2015 05:26 PM, Mario Kleiner wrote:
>>> A couple of questions to educate me and one review comment.
>>>
>>> On 04/15/2015 07:34 PM, Daniel Vetter wrote:
>>>> This was a bit too much cargo-culted, so lets make it solid:
>>>> - vblank->count doesn't need to be an atomic, writes are always done
>>>>     under the protection of dev->vblank_time_lock. Switch to an unsigned
>>>>     long instead and update comments. Note that atomic_read is just a
>>>>     normal read of a volatile variable, so no need to audit all the
>>>>     read-side access specifically.
>>>>
>>>> - The barriers for the vblank counter seqlock weren't complete: The
>>>>     read-side was missing the first barrier between the counter read and
>>>>     the timestamp read, it only had a barrier between the ts and the
>>>>     counter read. We need both.
>>>>
>>>> - Barriers weren't properly documented. Since barriers only work if
>>>>     you have them on boths sides of the transaction it's prudent to
>>>>     reference where the other side is. To avoid duplicating the
>>>>     write-side comment 3 times extract a little store_vblank() helper.
>>>>     In that helper also assert that we do indeed hold
>>>>     dev->vblank_time_lock, since in some cases the lock is acquired a
>>>>     few functions up in the callchain.
>>>>
>>>> Spotted while reviewing a patch from Chris Wilson to add a fastpath to
>>>> the vblank_wait ioctl.
>>>>
>>>> v2: Add comment to better explain how store_vblank works, suggested by
>>>> Chris.
>>>>
>>>> v3: Peter noticed that as-is the 2nd smp_wmb is redundant with the
>>>> implicit barrier in the spin_unlock. But that can only be proven by
>>>> auditing all callers and my point in extracting this little helper was
>>>> to localize all the locking into just one place. Hence I think that
>>>> additional optimization is too risky.
>>>>
>>>> Cc: Chris Wilson <chris at chris-wilson.co.uk>
>>>> Cc: Mario Kleiner <mario.kleiner.de at gmail.com>
>>>> Cc: Ville SyrjÃ¤lÃ¤ <ville.syrjala at linux.intel.com>
>>>> Cc: Michel DÃ¤nzer <michel at daenzer.net>
>>>> Cc: Peter Hurley <peter at hurleysoftware.com>
>>>> Signed-off-by: Daniel Vetter <daniel.vetter at intel.com>
>>>> ---
>>>>    drivers/gpu/drm/drm_irq.c | 95 
>>>> +++++++++++++++++++++++++----------------------
>>>>    include/drm/drmP.h        |  8 +++-
>>>>    2 files changed, 57 insertions(+), 46 deletions(-)
>>>>
>>>> diff --git a/drivers/gpu/drm/drm_irq.c b/drivers/gpu/drm/drm_irq.c
>>>> index c8a34476570a..8694b77d0002 100644
>>>> --- a/drivers/gpu/drm/drm_irq.c
>>>> +++ b/drivers/gpu/drm/drm_irq.c
>>>> @@ -74,6 +74,36 @@ module_param_named(vblankoffdelay, drm_vblank_offdelay, 
>>>> int, 0600);
>>>>    module_param_named(timestamp_precision_usec, drm_timestamp_precision, 
>>>> int, 0600);
>>>>    module_param_named(timestamp_monotonic, drm_timestamp_monotonic, int, 
>>>> 0600);
>>>>
>>>> +static void store_vblank(struct drm_device *dev, int crtc,
>>>> +             unsigned vblank_count_inc,
>>>> +             struct timeval *t_vblank)
>>>> +{
>>>> +    struct drm_vblank_crtc *vblank = &dev->vblank[crtc];
>>>> +    u32 tslot;
>>>> +
>>>> +    assert_spin_locked(&dev->vblank_time_lock);
>>>> +
>>>> +    if (t_vblank) {
>>>> +        /* All writers hold the spinlock, but readers are serialized by
>>>> +         * the latching of vblank->count below.
>>>> +         */
>>>> +        tslot = vblank->count + vblank_count_inc;
>>>> +        vblanktimestamp(dev, crtc, tslot) = *t_vblank;
>>>> +    }
>>>> +
>>>> +    /*
>>>> +     * vblank timestamp updates are protected on the write side with
>>>> +     * vblank_time_lock, but on the read side done locklessly using a
>>>> +     * sequence-lock on the vblank counter. Ensure correct ordering using
>>>> +     * memory barrriers. We need the barrier both before and also after 
>>>> the
>>>> +     * counter update to synchronize with the next timestamp write.
>>>> +     * The read-side barriers for this are in drm_vblank_count_and_time.
>>>> +     */
>>>> +    smp_wmb();
>>>> +    vblank->count += vblank_count_inc;
>>>> +    smp_wmb();
>>>> +}
>>>> +
>>>>    /**
>>>>     * drm_update_vblank_count - update the master vblank counter
>>>>     * @dev: DRM device
>>>> @@ -93,7 +123,7 @@ module_param_named(timestamp_monotonic, 
>>>> drm_timestamp_monotonic, int, 0600);
>>>>    static void drm_update_vblank_count(struct drm_device *dev, int crtc)
>>>>    {
>>>>        struct drm_vblank_crtc *vblank = &dev->vblank[crtc];
>>>> -    u32 cur_vblank, diff, tslot;
>>>> +    u32 cur_vblank, diff;
>>>>        bool rc;
>>>>        struct timeval t_vblank;
>>>>
>>>> @@ -129,18 +159,12 @@ static void drm_update_vblank_count(struct 
>>>> drm_device *dev, int crtc)
>>>>        if (diff == 0)
>>>>            return;
>>>>
>>>> -    /* Reinitialize corresponding vblank timestamp if high-precision query
>>>> -     * available. Skip this step if query unsupported or failed. Will
>>>> -     * reinitialize delayed at next vblank interrupt in that case.
>>>> +    /*
>>>> +     * Only reinitialize corresponding vblank timestamp if high-precision 
>>>> query
>>>> +     * available and didn't fail. Will reinitialize delayed at next vblank
>>>> +     * interrupt in that case.
>>>>         */
>>>> -    if (rc) {
>>>> -        tslot = atomic_read(&vblank->count) + diff;
>>>> -        vblanktimestamp(dev, crtc, tslot) = t_vblank;
>>>> -    }
>>>> -
>>>> -    smp_mb__before_atomic();
>>>> -    atomic_add(diff, &vblank->count);
>>>> -    smp_mb__after_atomic();
>>>> +    store_vblank(dev, crtc, diff, rc ? &t_vblank : NULL);
>>>>    }
>>>>
>>>>    /*
>>>> @@ -218,7 +242,7 @@ static void vblank_disable_and_save(struct drm_device 
>>>> *dev, int crtc)
>>>>        /* Compute time difference to stored timestamp of last vblank
>>>>         * as updated by last invocation of drm_handle_vblank() in vblank 
>>>> irq.
>>>>         */
>>>> -    vblcount = atomic_read(&vblank->count);
>>>> +    vblcount = vblank->count;
>>>>        diff_ns = timeval_to_ns(&tvblank) -
>>>>              timeval_to_ns(&vblanktimestamp(dev, crtc, vblcount));
>>>>
>>>> @@ -234,17 +258,8 @@ static void vblank_disable_and_save(struct drm_device 
>>>> *dev, int crtc)
>>>>         * available. In that case we can't account for this and just
>>>>         * hope for the best.
>>>>         */
>>>> -    if (vblrc && (abs64(diff_ns) > 1000000)) {
>>>> -        /* Store new timestamp in ringbuffer. */
>>>> -        vblanktimestamp(dev, crtc, vblcount + 1) = tvblank;
>>>> -
>>>> -        /* Increment cooked vblank count. This also atomically commits
>>>> -         * the timestamp computed above.
>>>> -         */
>>>> -        smp_mb__before_atomic();
>>>> -        atomic_inc(&vblank->count);
>>>> -        smp_mb__after_atomic();
>>>> -    }
>>>> +    if (vblrc && (abs64(diff_ns) > 1000000))
>>>> +        store_vblank(dev, crtc, 1, &tvblank);
>>>>
>>>>        spin_unlock_irqrestore(&dev->vblank_time_lock, irqflags);
>>>>    }
>>>> @@ -852,7 +867,7 @@ u32 drm_vblank_count(struct drm_device *dev, int crtc)
>>>>
>>>>        if (WARN_ON(crtc >= dev->num_crtcs))
>>>>            return 0;
>>>> -    return atomic_read(&vblank->count);
>>>> +    return vblank->count;
>>>
>>> I wrongly assumed atomic_read would guarantee more than it actually does, 
>>> so please help me to learn something. Why don't we need some smp_rmb() here 
>>> before returning vblank->count? What guarantees that drm_vblank_count() 
>>> does return the latest value assigned to vblank->count in store_vblank()? 
>>> In store_vblank() there is a smp_wmb(), but why don't we need a matching 
>>> smp_rmb() here to benefit from it?
>>
>> Well, you could but the vblank counter resolution is very low (60HZ),
>> so practically speaking, what would be the point?
>>
> 
> Thanks for the explanations Peter.
> 
> Depends on the latency induced. A few microseconds don't matter, a 
> millisecond or more would matter for some applications.
> 
>> The trailing write barrier in store_vblank() is only necessary to
>> force the ordering of the timestamp wrt to vblank count *in case there
>> was no write barrier executed between to calls to store_vblank()*,
>> not to ensure the cpu forces the write to main memory immediately.
>>
> 
> That part i understand, the possibly slightly earlier write of some store 
> buffers to main memory is just a possible nice side effect. Bits and pieces 
> of my memory about how cache coherency etc. work slowly come back to my own 
> memory...
> 
>> Because the time scales for these events don't require that level of
>> resolution; consider how much code has to get executed between a
>> hardware vblank irq triggering and the vblank counter being updated.
>>
>> Realistically, the only relevant requirement is that the timestamp
>> match the counter.
>>
> 
> Yes that is the really important part. A msec delay would possibly matter for 
> some timing sensitive apps like mine - some more exotic displays run at 200 
> Hz, and some apps need to synchronize to the vblank not strictly for 
> graphics. But i assume potential delays here are more on the order of a few 
> microseconds if some pending loads from the cache would get reordered for 
> overall efficiency?
> 
>>>
>>>>    }
>>>>    EXPORT_SYMBOL(drm_vblank_count);
>>>>
>>>> @@ -897,16 +912,17 @@ u32 drm_vblank_count_and_time(struct drm_device 
>>>> *dev, int crtc,
>>>>        if (WARN_ON(crtc >= dev->num_crtcs))
>>>>            return 0;
>>>>
>>>> -    /* Read timestamp from slot of _vblank_time ringbuffer
>>>> -     * that corresponds to current vblank count. Retry if
>>>> -     * count has incremented during readout. This works like
>>>> -     * a seqlock.
>>>> +    /*
>>>> +     * Vblank timestamps are read lockless. To ensure consistency the 
>>>> vblank
>>>> +     * counter is rechecked and ordering is ensured using memory barriers.
>>>> +     * This works like a seqlock. The write-side barriers are in 
>>>> store_vblank.
>>>>         */
>>>>        do {
>>>> -        cur_vblank = atomic_read(&vblank->count);
>>>> +        cur_vblank = vblank->count;
>>>> +        smp_rmb();
>>>>            *vblanktime = vblanktimestamp(dev, crtc, cur_vblank);
>>>>            smp_rmb();
>>>> -    } while (cur_vblank != atomic_read(&vblank->count));
>>>> +    } while (cur_vblank != vblank->count);
>>>>
>>>
>>> Similar question as above. We have a new smp_rmb() after the cur_vblank 
>>> assignment and then after *vblanktime assignment. My original wrong 
>>> assumption was that the first smp_rmb() wouldn't be needed because 
>>> atomic_read() would imply that, and that the compiler/cpu couldn't reorder 
>>> anything here because the *vblanktime assignment depends on cur_vblank from 
>>> the preceeding atomic_read.
>>>
>>> But why can we now do the comparison while(cur_vblank != vblank->count) 
>>> without needing something like
>>>
>>>      new_vblank = vblank->count;
>>>      smp_rmb();
>>>     } while (cur_vblank != new_vblank);
>>>
>>> to make sure the value from the 2nd vblank->count read isn't stale wrt. to 
>>> potential updates from store_vblank()?
>>
>> Similar response here.
>>
>> What matters is that the timestamp read is consistent with the
>> vblank count; not that you "just missed" new values.
>>
>>> Another question is why the drm_vblank_count_and_time() code ever worked 
>>> without triggering any of my own tests and consistency checks in my 
>>> software, or any of your igt tests? I run my tests very often, but only on 
>>> Intel architecture cpus. I assume the same is true for the igt tests? Is 
>>> there anything specific about Intel cpu's that makes this still work or 
>>> very unlikely to break? Or are the tests insufficient to catch this? Or 
>>> just luck?
>>
>> Intel x86 cpus are strongly-ordered, so smp_rmb() and smp_wmb() are only
>> compiler barriers on x86 arch.
>>
> 
> Ok, that makes sense as part of the explanation why stuff still worked, at 
> least on the tested x86 arch.
> 
>> The compiler could have hoisted the vblanktimestamp read in front of
>> the vblank count read, but chose not to.
>>
> 
> This one still confuses me. I know why the smp_rmb after the vblanktimestamp 
> read is there - i placed one there in the current code myself. But why could 
> the compiler - in absence of the new smp_rmb compiler barrier in this patch  
> - reorder those two loads without violating the semantic of the code?
> 
> Why could it have turned this
> 
> cur_vblank = vblank->count;
> // smp_rmb();
> vblanktime = vblanktimestamp(dev, crtc, cur_vblank);
> 
> into this instead
> 
> *vblanktime = vblanktimestamp(dev, crtc, cur_vblank);
> // smp_rmb();
> cur_vblank = vblank->count;
> 
> when the first load would need the value of cur_vblank - and thereby the 
> result of the 2nd load - as input to calculate its address for the 
> *vblanktime = ... load?
> 
> I think i'm still not getting something about why the compiler would be 
> allowed to reorder in this way in absence of the additional smp_rmb? Or is 
> that barrier required for other archs which are less strongly ordered?


Apologies for the confusion; I missed that it was data-dependent load.

Regards,
Peter Hurley


> thanks,
> -mario
> 
>>> I looked through kernels back to 3.16 and most uses of the function would 
>>> be safe from races due to the locking around it, holding of vblank 
>>> refcounts, or the place and order of execution, e.g., from within 
>>> drm_handle_vblank(). But in some tight test loop just calling the 
>>> drmWaitVblank ioctl to query current values i'd expect it to at least 
>>> occassionally return corrupted timestamps, e.g., time jumping forward or 
>>> backward, etc.?
>>
>> As a test, reverse the load order of vblanktimestamp wrt vblank count
>> and see if your tests trip; if not, you know the tests are insufficient.
>>
>> Regards,
>> Peter Hurley
>>
>>>
>>>>        return cur_vblank;
>>>>    }
>>>> @@ -1715,7 +1731,7 @@ bool drm_handle_vblank(struct drm_device *dev, int 
>>>> crtc)
>>>>         */
>>>>
>>>>        /* Get current timestamp and count. */
>>>> -    vblcount = atomic_read(&vblank->count);
>>>> +    vblcount = vblank->count;
>>>>        drm_get_last_vbltimestamp(dev, crtc, &tvblank, 
>>>> DRM_CALLED_FROM_VBLIRQ);
>>>>
>>>>        /* Compute time difference to timestamp of last vblank */
>>>> @@ -1731,20 +1747,11 @@ bool drm_handle_vblank(struct drm_device *dev, int 
>>>> crtc)
>>>>         * e.g., due to spurious vblank interrupts. We need to
>>>>         * ignore those for accounting.
>>>>         */
>>>> -    if (abs64(diff_ns) > DRM_REDUNDANT_VBLIRQ_THRESH_NS) {
>>>> -        /* Store new timestamp in ringbuffer. */
>>>> -        vblanktimestamp(dev, crtc, vblcount + 1) = tvblank;
>>>> -
>>>> -        /* Increment cooked vblank count. This also atomically commits
>>>> -         * the timestamp computed above.
>>>> -         */
>>>> -        smp_mb__before_atomic();
>>>> -        atomic_inc(&vblank->count);
>>>> -        smp_mb__after_atomic();
>>>> -    } else {
>>>> +    if (abs64(diff_ns) > DRM_REDUNDANT_VBLIRQ_THRESH_NS)
>>>> +        store_vblank(dev, crtc, 1, &tvblank);
>>>> +    else
>>>>            DRM_DEBUG("crtc %d: Redundant vblirq ignored. diff_ns = %d\n",
>>>>                  crtc, (int) diff_ns);
>>>> -    }
>>>>
>>>>        spin_unlock(&dev->vblank_time_lock);
>>>>
>>>> diff --git a/include/drm/drmP.h b/include/drm/drmP.h
>>>> index 62c40777c009..4c31a2cc5a33 100644
>>>> --- a/include/drm/drmP.h
>>>> +++ b/include/drm/drmP.h
>>>> @@ -686,9 +686,13 @@ struct drm_pending_vblank_event {
>>>>    struct drm_vblank_crtc {
>>>>        struct drm_device *dev;        /* pointer to the drm_device */
>>>>        wait_queue_head_t queue;    /**< VBLANK wait queue */
>>>> -    struct timeval time[DRM_VBLANKTIME_RBSIZE];    /**< timestamp of 
>>>> current count */
>>>>        struct timer_list disable_timer;        /* delayed disable timer */
>>>> -    atomic_t count;            /**< number of VBLANK interrupts */
>>>> +
>>>> +    /* vblank counter, protected by dev->vblank_time_lock for writes */
>>>> +    unsigned long count;
>>>
>>> Why is count an unsigned long (= 64 bit on 64-bit kernels) instead of u32 
>>> when all users of count are u32? Is this intentional?
>>>
>>>
>>>> +    /* vblank timestamps, protected by dev->vblank_time_lock for writes */
>>>> +    struct timeval time[DRM_VBLANKTIME_RBSIZE];
>>>> +
>>>>        atomic_t refcount;        /* number of users of vblank 
>>>> interruptsper crtc */
>>>>        u32 last;            /* protected by dev->vbl_lock, used */
>>>>                        /* for wraparound handling */
>>>>
>>>
>>> Thanks,
>>> -mario
>>

[PATCH] drm/vblank: Fixup and document timestamp update/read barriers

Reply via email to