On 04/16/2015 03:29 AM, Peter Hurley wrote:
> On 04/15/2015 05:26 PM, Mario Kleiner wrote:
>> A couple of questions to educate me and one review comment.
>>
>> On 04/15/2015 07:34 PM, Daniel Vetter wrote:
>>> This was a bit too much cargo-culted, so lets make it solid:
>>> - vblank->count doesn't need to be an atomic, writes are always done
>>> under the protection of dev->vblank_time_lock. Switch to an unsigned
>>> long instead and update comments. Note that atomic_read is just a
>>> normal read of a volatile variable, so no need to audit all the
>>> read-side access specifically.
>>>
>>> - The barriers for the vblank counter seqlock weren't complete: The
>>> read-side was missing the first barrier between the counter read and
>>> the timestamp read, it only had a barrier between the ts and the
>>> counter read. We need both.
>>>
>>> - Barriers weren't properly documented. Since barriers only work if
>>> you have them on boths sides of the transaction it's prudent to
>>> reference where the other side is. To avoid duplicating the
>>> write-side comment 3 times extract a little store_vblank() helper.
>>> In that helper also assert that we do indeed hold
>>> dev->vblank_time_lock, since in some cases the lock is acquired a
>>> few functions up in the callchain.
>>>
>>> Spotted while reviewing a patch from Chris Wilson to add a fastpath to
>>> the vblank_wait ioctl.
>>>
>>> v2: Add comment to better explain how store_vblank works, suggested by
>>> Chris.
>>>
>>> v3: Peter noticed that as-is the 2nd smp_wmb is redundant with the
>>> implicit barrier in the spin_unlock. But that can only be proven by
>>> auditing all callers and my point in extracting this little helper was
>>> to localize all the locking into just one place. Hence I think that
>>> additional optimization is too risky.
>>>
>>> Cc: Chris Wilson <chris at chris-wilson.co.uk>
>>> Cc: Mario Kleiner <mario.kleiner.de at gmail.com>
>>> Cc: Ville Syrjälä <ville.syrjala at linux.intel.com>
>>> Cc: Michel Dänzer <michel at daenzer.net>
>>> Cc: Peter Hurley <peter at hurleysoftware.com>
>>> Signed-off-by: Daniel Vetter <daniel.vetter at intel.com>
>>> ---
>>> drivers/gpu/drm/drm_irq.c | 95
>>> +++++++++++++++++++++++++----------------------
>>> include/drm/drmP.h | 8 +++-
>>> 2 files changed, 57 insertions(+), 46 deletions(-)
>>>
>>> diff --git a/drivers/gpu/drm/drm_irq.c b/drivers/gpu/drm/drm_irq.c
>>> index c8a34476570a..8694b77d0002 100644
>>> --- a/drivers/gpu/drm/drm_irq.c
>>> +++ b/drivers/gpu/drm/drm_irq.c
>>> @@ -74,6 +74,36 @@ module_param_named(vblankoffdelay, drm_vblank_offdelay,
>>> int, 0600);
>>> module_param_named(timestamp_precision_usec, drm_timestamp_precision,
>>> int, 0600);
>>> module_param_named(timestamp_monotonic, drm_timestamp_monotonic, int,
>>> 0600);
>>>
>>> +static void store_vblank(struct drm_device *dev, int crtc,
>>> + unsigned vblank_count_inc,
>>> + struct timeval *t_vblank)
>>> +{
>>> + struct drm_vblank_crtc *vblank = &dev->vblank[crtc];
>>> + u32 tslot;
>>> +
>>> + assert_spin_locked(&dev->vblank_time_lock);
>>> +
>>> + if (t_vblank) {
>>> + /* All writers hold the spinlock, but readers are serialized by
>>> + * the latching of vblank->count below.
>>> + */
>>> + tslot = vblank->count + vblank_count_inc;
>>> + vblanktimestamp(dev, crtc, tslot) = *t_vblank;
>>> + }
>>> +
>>> + /*
>>> + * vblank timestamp updates are protected on the write side with
>>> + * vblank_time_lock, but on the read side done locklessly using a
>>> + * sequence-lock on the vblank counter. Ensure correct ordering using
>>> + * memory barriers. We need the barrier both before and also after the
>>> + * counter update to synchronize with the next timestamp write.
>>> + * The read-side barriers for this are in drm_vblank_count_and_time.
>>> + */
>>> + smp_wmb();
>>> + vblank->count += vblank_count_inc;
>>> + smp_wmb();
>>> +}
>>> +
>>> /**
>>> * drm_update_vblank_count - update the master vblank counter
>>> * @dev: DRM device
>>> @@ -93,7 +123,7 @@ module_param_named(timestamp_monotonic,
>>> drm_timestamp_monotonic, int, 0600);
>>> static void drm_update_vblank_count(struct drm_device *dev, int crtc)
>>> {
>>> struct drm_vblank_crtc *vblank = &dev->vblank[crtc];
>>> - u32 cur_vblank, diff, tslot;
>>> + u32 cur_vblank, diff;
>>> bool rc;
>>> struct timeval t_vblank;
>>>
>>> @@ -129,18 +159,12 @@ static void drm_update_vblank_count(struct drm_device
>>> *dev, int crtc)
>>> if (diff == 0)
>>> return;
>>>
>>> - /* Reinitialize corresponding vblank timestamp if high-precision query
>>> - * available. Skip this step if query unsupported or failed. Will
>>> - * reinitialize delayed at next vblank interrupt in that case.
>>> + /*
>>> + * Only reinitialize corresponding vblank timestamp if high-precision
>>> query
>>> + * available and didn't fail. Will reinitialize delayed at next vblank
>>> + * interrupt in that case.
>>> */
>>> - if (rc) {
>>> - tslot = atomic_read(&vblank->count) + diff;
>>> - vblanktimestamp(dev, crtc, tslot) = t_vblank;
>>> - }
>>> -
>>> - smp_mb__before_atomic();
>>> - atomic_add(diff, &vblank->count);
>>> - smp_mb__after_atomic();
>>> + store_vblank(dev, crtc, diff, rc ? &t_vblank : NULL);
>>> }
>>>
>>> /*
>>> @@ -218,7 +242,7 @@ static void vblank_disable_and_save(struct drm_device
>>> *dev, int crtc)
>>> /* Compute time difference to stored timestamp of last vblank
>>> * as updated by last invocation of drm_handle_vblank() in vblank
>>> irq.
>>> */
>>> - vblcount = atomic_read(&vblank->count);
>>> + vblcount = vblank->count;
>>> diff_ns = timeval_to_ns(&tvblank) -
>>> timeval_to_ns(&vblanktimestamp(dev, crtc, vblcount));
>>>
>>> @@ -234,17 +258,8 @@ static void vblank_disable_and_save(struct drm_device
>>> *dev, int crtc)
>>> * available. In that case we can't account for this and just
>>> * hope for the best.
>>> */
>>> - if (vblrc && (abs64(diff_ns) > 1000000)) {
>>> - /* Store new timestamp in ringbuffer. */
>>> - vblanktimestamp(dev, crtc, vblcount + 1) = tvblank;
>>> -
>>> - /* Increment cooked vblank count. This also atomically commits
>>> - * the timestamp computed above.
>>> - */
>>> - smp_mb__before_atomic();
>>> - atomic_inc(&vblank->count);
>>> - smp_mb__after_atomic();
>>> - }
>>> + if (vblrc && (abs64(diff_ns) > 1000000))
>>> + store_vblank(dev, crtc, 1, &tvblank);
>>>
>>> spin_unlock_irqrestore(&dev->vblank_time_lock, irqflags);
>>> }
>>> @@ -852,7 +867,7 @@ u32 drm_vblank_count(struct drm_device *dev, int crtc)
>>>
>>> if (WARN_ON(crtc >= dev->num_crtcs))
>>> return 0;
>>> - return atomic_read(&vblank->count);
>>> + return vblank->count;
>>
>> I wrongly assumed atomic_read would guarantee more than it actually does, so
>> please help me to learn something. Why don't we need some smp_rmb() here
>> before returning vblank->count? What guarantees that drm_vblank_count() does
>> return the latest value assigned to vblank->count in store_vblank()? In
>> store_vblank() there is a smp_wmb(), but why don't we need a matching
>> smp_rmb() here to benefit from it?
>
> Well, you could but the vblank counter resolution is very low (60HZ),
> so practically speaking, what would be the point?
>
Thanks for the explanations Peter.
It depends on the latency induced. A few microseconds don't matter; a
millisecond or more would matter for some applications.
> The trailing write barrier in store_vblank() is only necessary to
> force the ordering of the timestamp wrt the vblank count *in case there
> was no write barrier executed between two calls to store_vblank()*,
> not to ensure the cpu forces the write to main memory immediately.
>
That part I understand: the possibly slightly earlier write of some
store buffers to main memory is just a possible nice side effect. Bits
and pieces of my memory about how cache coherency etc. work are slowly
coming back to me...
> Because the time scales for these events don't require that level of
> resolution; consider how much code has to get executed between a
> hardware vblank irq triggering and the vblank counter being updated.
>
> Realistically, the only relevant requirement is that the timestamp
> match the counter.
>
Yes, that is the really important part. A msec delay would possibly
matter for some timing-sensitive apps like mine - some more exotic
displays run at 200 Hz, and some apps need to synchronize to the vblank
not strictly for graphics. But I assume potential delays here are more
on the order of a few microseconds if some pending loads from the cache
get reordered for overall efficiency?
>>
>>> }
>>> EXPORT_SYMBOL(drm_vblank_count);
>>>
>>> @@ -897,16 +912,17 @@ u32 drm_vblank_count_and_time(struct drm_device *dev,
>>> int crtc,
>>> if (WARN_ON(crtc >= dev->num_crtcs))
>>> return 0;
>>>
>>> - /* Read timestamp from slot of _vblank_time ringbuffer
>>> - * that corresponds to current vblank count. Retry if
>>> - * count has incremented during readout. This works like
>>> - * a seqlock.
>>> + /*
>>> + * Vblank timestamps are read lockless. To ensure consistency the
>>> vblank
>>> + * counter is rechecked and ordering is ensured using memory barriers.
>>> + * This works like a seqlock. The write-side barriers are in
>>> store_vblank.
>>> */
>>> do {
>>> - cur_vblank = atomic_read(&vblank->count);
>>> + cur_vblank = vblank->count;
>>> + smp_rmb();
>>> *vblanktime = vblanktimestamp(dev, crtc, cur_vblank);
>>> smp_rmb();
>>> - } while (cur_vblank != atomic_read(&vblank->count));
>>> + } while (cur_vblank != vblank->count);
>>>
>>
>> Similar question as above. We have a new smp_rmb() after the cur_vblank
>> assignment and then after *vblanktime assignment. My original wrong
>> assumption was that the first smp_rmb() wouldn't be needed because
>> atomic_read() would imply that, and that the compiler/cpu couldn't reorder
>> anything here because the *vblanktime assignment depends on cur_vblank from
>> the preceeding atomic_read.
>>
>> But why can we now do the comparison while(cur_vblank != vblank->count)
>> without needing something like
>>
>> new_vblank = vblank->count;
>> smp_rmb();
>> } while (cur_vblank != new_vblank);
>>
>> to make sure the value from the 2nd vblank->count read isn't stale wrt. to
>> potential updates from store_vblank()?
>
> Similar response here.
>
> What matters is that the timestamp read is consistent with the
> vblank count; not that you "just missed" new values.
>
>> Another question is why the drm_vblank_count_and_time() code ever worked
>> without triggering any of my own tests and consistency checks in my
>> software, or any of your igt tests? I run my tests very often, but only on
>> Intel architecture cpus. I assume the same is true for the igt tests? Is
>> there anything specific about Intel cpu's that makes this still work or very
>> unlikely to break? Or are the tests insufficient to catch this? Or just luck?
>
> Intel x86 cpus are strongly-ordered, so smp_rmb() and smp_wmb() are only
> compiler barriers on x86 arch.
>
OK, that makes sense as part of the explanation of why stuff still
worked, at least on the tested x86 arch.
> The compiler could have hoisted the vblanktimestamp read in front of
> the vblank count read, but chose not to.
>
This one still confuses me. I know why the smp_rmb after the
vblanktimestamp read is there - I placed one there in the current code
myself. But why could the compiler - in the absence of the new smp_rmb
compiler barrier in this patch - reorder those two loads without
violating the semantics of the code?
Why could it have turned this
cur_vblank = vblank->count;
// smp_rmb();
*vblanktime = vblanktimestamp(dev, crtc, cur_vblank);
into this instead
*vblanktime = vblanktimestamp(dev, crtc, cur_vblank);
// smp_rmb();
cur_vblank = vblank->count;
when the first load would need the value of cur_vblank - and thereby the
result of the 2nd load - as input to calculate its address for the
*vblanktime = ... load?
I think I'm still not getting something about why the compiler would be
allowed to reorder in this way in the absence of the additional smp_rmb.
Or is that barrier required for other archs which are less strongly ordered?
thanks,
-mario
>> I looked through kernels back to 3.16 and most uses of the function would be
>> safe from races due to the locking around it, holding of vblank refcounts,
>> or the place and order of execution, e.g., from within drm_handle_vblank().
>> But in some tight test loop just calling the drmWaitVblank ioctl to query
>> current values i'd expect it to at least occassionally return corrupted
>> timestamps, e.g., time jumping forward or backward, etc.?
>
> As a test, reverse the load order of vblanktimestamp wrt vblank count
> and see if your tests trip; if not, you know the tests are insufficient.
>
> Regards,
> Peter Hurley
>
>>
>>> return cur_vblank;
>>> }
>>> @@ -1715,7 +1731,7 @@ bool drm_handle_vblank(struct drm_device *dev, int
>>> crtc)
>>> */
>>>
>>> /* Get current timestamp and count. */
>>> - vblcount = atomic_read(&vblank->count);
>>> + vblcount = vblank->count;
>>> drm_get_last_vbltimestamp(dev, crtc, &tvblank,
>>> DRM_CALLED_FROM_VBLIRQ);
>>>
>>> /* Compute time difference to timestamp of last vblank */
>>> @@ -1731,20 +1747,11 @@ bool drm_handle_vblank(struct drm_device *dev, int
>>> crtc)
>>> * e.g., due to spurious vblank interrupts. We need to
>>> * ignore those for accounting.
>>> */
>>> - if (abs64(diff_ns) > DRM_REDUNDANT_VBLIRQ_THRESH_NS) {
>>> - /* Store new timestamp in ringbuffer. */
>>> - vblanktimestamp(dev, crtc, vblcount + 1) = tvblank;
>>> -
>>> - /* Increment cooked vblank count. This also atomically commits
>>> - * the timestamp computed above.
>>> - */
>>> - smp_mb__before_atomic();
>>> - atomic_inc(&vblank->count);
>>> - smp_mb__after_atomic();
>>> - } else {
>>> + if (abs64(diff_ns) > DRM_REDUNDANT_VBLIRQ_THRESH_NS)
>>> + store_vblank(dev, crtc, 1, &tvblank);
>>> + else
>>> DRM_DEBUG("crtc %d: Redundant vblirq ignored. diff_ns = %d\n",
>>> crtc, (int) diff_ns);
>>> - }
>>>
>>> spin_unlock(&dev->vblank_time_lock);
>>>
>>> diff --git a/include/drm/drmP.h b/include/drm/drmP.h
>>> index 62c40777c009..4c31a2cc5a33 100644
>>> --- a/include/drm/drmP.h
>>> +++ b/include/drm/drmP.h
>>> @@ -686,9 +686,13 @@ struct drm_pending_vblank_event {
>>> struct drm_vblank_crtc {
>>> struct drm_device *dev; /* pointer to the drm_device */
>>> wait_queue_head_t queue; /**< VBLANK wait queue */
>>> - struct timeval time[DRM_VBLANKTIME_RBSIZE]; /**< timestamp of
>>> current count */
>>> struct timer_list disable_timer; /* delayed disable timer */
>>> - atomic_t count; /**< number of VBLANK interrupts */
>>> +
>>> + /* vblank counter, protected by dev->vblank_time_lock for writes */
>>> + unsigned long count;
>>
>> Why is count an unsigned long (= 64 bit on 64-bit kernels) instead of u32
>> when all users of count are u32? Is this intentional?
>>
>>
>>> + /* vblank timestamps, protected by dev->vblank_time_lock for writes */
>>> + struct timeval time[DRM_VBLANKTIME_RBSIZE];
>>> +
>>> atomic_t refcount; /* number of users of vblank interruptsper
>>> crtc */
>>> u32 last; /* protected by dev->vbl_lock, used */
>>> /* for wraparound handling */
>>>
>>
>> Thanks,
>> -mario
>