This is an automated email from the ASF dual-hosted git repository.

cmcfarlen pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/trafficserver.git


The following commit(s) were added to refs/heads/master by this push:
     new 53adece861 Event loop sub-stats (#11761)
53adece861 is described below

commit 53adece861b975b85b53e6197d1d6b297225b593
Author: Nathan Wang <[email protected]>
AuthorDate: Tue Oct 1 11:40:53 2024 -0700

    Event loop sub-stats (#11761)
    
    * introduce sub-event loop stats
    
    * add documentation
    
    ---------
    
    Co-authored-by: Nathan Wang <[email protected]>
---
 .../monitoring/statistics/core/eventloop.en.rst    | 45 +++++++++++++
 include/iocore/eventsystem/EThread.h               | 78 ++++++++++++++++++----
 src/iocore/eventsystem/UnixEThread.cc              | 31 ++++++---
 src/iocore/eventsystem/UnixEventProcessor.cc       |  3 +
 src/iocore/net/NetHandler.cc                       |  9 ++-
 5 files changed, 142 insertions(+), 24 deletions(-)

diff --git a/doc/admin-guide/monitoring/statistics/core/eventloop.en.rst 
b/doc/admin-guide/monitoring/statistics/core/eventloop.en.rst
index e8946ee189..9d2acb7e73 100644
--- a/doc/admin-guide/monitoring/statistics/core/eventloop.en.rst
+++ b/doc/admin-guide/monitoring/statistics/core/eventloop.en.rst
@@ -80,6 +80,21 @@ high then it is likely transactions are experiencing 
significant latency.
 
     The maximum amount of time spent in a single loop in the last 10 seconds.
 
+.. ts:stat:: global proxy.process.eventloop.drain.queue.max.10s integer
+   :units: nanoseconds
+
+    The maximum amount of time spent draining the event queue in a single loop 
in the last 10 seconds.
+
+.. ts:stat:: global proxy.process.eventloop.io.wait.max.10s integer
+   :units: nanoseconds
+
+    The maximum amount of time spent waiting for network IO in a single loop 
in the last 10 seconds.
+
+.. ts:stat:: global proxy.process.eventloop.io.work.max.10s integer
+   :units: nanoseconds
+
+    The maximum amount of time spent processing network IO in a single loop in 
the last 10 seconds.
+
 .. rubric:: 100 Second Metrics
 
 .. ts:stat:: global proxy.process.eventloop.count.100s integer
@@ -113,6 +128,21 @@ high then it is likely transactions are experiencing 
significant latency.
 
     The maximum amount of time spent in a single loop in the last 100 seconds.
 
+.. ts:stat:: global proxy.process.eventloop.drain.queue.max.100s integer
+   :units: nanoseconds
+
+    The maximum amount of time spent draining the event queue in a single loop 
in the last 100 seconds.
+
+.. ts:stat:: global proxy.process.eventloop.io.wait.max.100s integer
+   :units: nanoseconds
+
+    The maximum amount of time spent waiting for network IO in a single loop 
in the last 100 seconds.
+
+.. ts:stat:: global proxy.process.eventloop.io.work.max.100s integer
+   :units: nanoseconds
+
+    The maximum amount of time spent processing network IO in a single loop in 
the last 100 seconds.
+
 .. rubric:: 1000 Second Metrics
 
 .. ts:stat:: global proxy.process.eventloop.count.1000s integer
@@ -146,6 +176,21 @@ high then it is likely transactions are experiencing 
significant latency.
 
     The maximum amount of time spent in a single loop in the last 1000 seconds.
 
+.. ts:stat:: global proxy.process.eventloop.drain.queue.max.1000s integer
+   :units: nanoseconds
+
+    The maximum amount of time spent draining the event queue in a single loop 
in the last 1000 seconds.
+
+.. ts:stat:: global proxy.process.eventloop.io.wait.max.1000s integer
+   :units: nanoseconds
+
+    The maximum amount of time spent waiting for network IO in a single loop 
in the last 1000 seconds.
+
+.. ts:stat:: global proxy.process.eventloop.io.work.max.1000s integer
+   :units: nanoseconds
+
+    The maximum amount of time spent processing network IO in a single loop in 
the last 1000 seconds.
+
 .. rubric:: Histogram Metrics
 
 .. ts:stat:: global proxy.process.eventloop.time.*ms integer
diff --git a/include/iocore/eventsystem/EThread.h 
b/include/iocore/eventsystem/EThread.h
index 8e404e7540..9b6f64bfed 100644
--- a/include/iocore/eventsystem/EThread.h
+++ b/include/iocore/eventsystem/EThread.h
@@ -418,10 +418,15 @@ public:
 
       /// Data for timing of the loop.
       struct Duration {
-        ink_hrtime _start = 0;         ///< The time of the first loop for 
this sample. Used to mark valid entries.
-        ink_hrtime _min   = INT64_MAX; ///< Shortest loop time.
-        ink_hrtime _max   = 0;         ///< Longest loop time.
-        Duration()        = default;
+        ink_hrtime _start           = 0;         ///< The time of the first 
loop for this sample. Used to mark valid entries.
+        ink_hrtime _min             = INT64_MAX; ///< Shortest loop time.
+        ink_hrtime _max             = 0;         ///< Longest loop time.
+        ink_hrtime _max_io_wait     = 0;         ///< Longest time spent 
waiting for IO activity
+        ink_hrtime _max_io_work     = 0;         ///< Longest time spent 
processing IO ready list
+        ink_hrtime _max_drain_queue = 0;         ///< Longest time spent 
draining the event queue
+        ink_hrtime _last_io_work    = 0;         ///< time spent processing IO 
this loop, needed for ratio
+        ink_hrtime _io_work_at_max  = 0;         ///< the time spent 
processing IO within the longest loop time
+        Duration()                  = default;
       } _duration;
 
       /// Events in the slice.
@@ -456,6 +461,21 @@ public:
        */
       self_type &record_event_count(int count);
 
+      /** Record duration of loop spent processing event queue.
+       *
+       * @param time Time spent processing the queue
+       * @return @a this.
+       */
+      self_type &record_drain_queue(ink_hrtime time);
+
+      /** Record time spent doing IO, both poll and processing time.
+       *
+       * @param io_wait Time spent polling for IO activity
+       * @param io_work Time spent processing the ready list
+       * @return @a this.
+       */
+      self_type &record_io_stats(ink_hrtime io_wait, ink_hrtime io_work);
+
       /// Add @a that to @a this data.
       /// This embodies the custom logic per member concerning whether each is 
a sum, min, or max.
       Slice &operator+=(Slice const &that);
@@ -465,16 +485,19 @@ public:
           More than one part of the code depends on this exact order. Be 
careful and thorough when changing.
       */
       enum class STAT_ID {
-        LOOP_COUNT,      ///< # of event loops executed.
-        LOOP_EVENTS,     ///< # of events
-        LOOP_EVENTS_MIN, ///< min # of events dispatched in a loop
-        LOOP_EVENTS_MAX, ///< max # of events dispatched in a loop
-        LOOP_WAIT,       ///< # of loops that did a conditional wait.
-        LOOP_TIME_MIN,   ///< Shortest time spent in loop.
-        LOOP_TIME_MAX,   ///< Longest time spent in loop.
+        LOOP_COUNT,           ///< # of event loops executed.
+        LOOP_EVENTS,          ///< # of events
+        LOOP_EVENTS_MIN,      ///< min # of events dispatched in a loop
+        LOOP_EVENTS_MAX,      ///< max # of events dispatched in a loop
+        LOOP_WAIT,            ///< # of loops that did a conditional wait.
+        LOOP_TIME_MIN,        ///< Shortest time spent in loop.
+        LOOP_TIME_MAX,        ///< Longest time spent in loop.
+        LOOP_DRAIN_QUEUE_MAX, ///< Max time Draining the event queue
+        LOOP_IO_WAIT_MAX,     ///< Max time spent IO waiting
+        LOOP_IO_WORK_MAX,     ///< Max time spent IO processing
       };
       /// Number of statistics for a slice.
-      static constexpr unsigned N_STAT_ID = unsigned(STAT_ID::LOOP_TIME_MAX) + 
1;
+      static constexpr unsigned N_STAT_ID = 
unsigned(STAT_ID::LOOP_IO_WORK_MAX) + 1;
 
       /// Statistic name stems.
       /// These will be qualified by time scale.
@@ -578,7 +601,8 @@ inline auto
 EThread::Metrics::Slice::record_loop_duration(ink_hrtime delta) -> self_type &
 {
   if (delta > _duration._max) {
-    _duration._max = delta;
+    _duration._max            = delta;
+    _duration._io_work_at_max = _duration._last_io_work;
   }
   if (delta < _duration._min) {
     _duration._min = delta;
@@ -599,6 +623,34 @@ EThread::Metrics::Slice::record_event_count(int count) -> 
self_type &
   return *this;
 }
 
+inline auto
+EThread::Metrics::Slice::record_drain_queue(ink_hrtime drain_queue) -> 
self_type &
+{
+  if (drain_queue > 0) {
+    if (drain_queue > _duration._max_drain_queue) {
+      _duration._max_drain_queue = drain_queue;
+    }
+  }
+  return *this;
+}
+
+inline auto
+EThread::Metrics::Slice::record_io_stats(ink_hrtime io_wait, ink_hrtime 
io_work) -> self_type &
+{
+  if (io_wait > 0) {
+    if (io_wait > _duration._max_io_wait) {
+      _duration._max_io_wait = io_wait;
+    }
+  }
+  if (io_work > 0) {
+    _duration._last_io_work = io_work;
+    if (io_work > _duration._max_io_work) {
+      _duration._max_io_work = io_work;
+    }
+  }
+  return *this;
+}
+
 inline EThread::Metrics::Slice *
 EThread::Metrics::prev_slice(EThread::Metrics::Slice *current)
 {
diff --git a/src/iocore/eventsystem/UnixEThread.cc 
b/src/iocore/eventsystem/UnixEThread.cc
index b2e1f37560..cfcd889fbe 100644
--- a/src/iocore/eventsystem/UnixEThread.cc
+++ b/src/iocore/eventsystem/UnixEThread.cc
@@ -45,9 +45,12 @@ struct AIOCallback;
 
 // !! THIS MUST BE IN THE ENUM ORDER !!
 char const *const EThread::Metrics::Slice::STAT_NAME[] = {
-  "proxy.process.eventloop.count",      "proxy.process.eventloop.events", 
"proxy.process.eventloop.events.min",
-  "proxy.process.eventloop.events.max", "proxy.process.eventloop.wait",   
"proxy.process.eventloop.time.min",
-  "proxy.process.eventloop.time.max"};
+  "proxy.process.eventloop.count",       "proxy.process.eventloop.events",
+  "proxy.process.eventloop.events.min",  "proxy.process.eventloop.events.max",
+  "proxy.process.eventloop.wait",        "proxy.process.eventloop.time.min",
+  "proxy.process.eventloop.time.max",    
"proxy.process.eventloop.drain.queue.max",
+  "proxy.process.eventloop.io.wait.max", "proxy.process.eventloop.io.work.max",
+};
 
 int thread_max_heartbeat_mseconds = THREAD_MAX_HEARTBEAT_MSECONDS;
 
@@ -297,6 +300,10 @@ EThread::execute_regular()
       sleep_time = 0;
     }
 
+    // drained the queue by this point
+    ink_hrtime post_drain  = ink_get_hrtime();
+    ink_hrtime drain_queue = post_drain - loop_start_time;
+
     tail_cb->waitForActivity(sleep_time);
 
     // loop cleanup
@@ -309,6 +316,7 @@ EThread::execute_regular()
     metrics.decay();
     metrics.record_loop_time(delta);
     current_slice->record_event_count(ev_count);
+    current_slice->record_drain_queue(drain_queue);
   }
 }
 
@@ -364,13 +372,16 @@ EThread::execute()
 EThread::Metrics::Slice &
 EThread::Metrics::Slice::operator+=(Slice const &that)
 {
-  this->_events._max    = std::max(this->_events._max, that._events._max);
-  this->_events._min    = std::min(this->_events._min, that._events._min);
-  this->_events._total += that._events._total;
-  this->_duration._min  = std::min(this->_duration._min, that._duration._min);
-  this->_duration._max  = std::max(this->_duration._max, that._duration._max);
-  this->_count         += that._count;
-  this->_wait          += that._wait;
+  this->_events._max                = std::max(this->_events._max, 
that._events._max);
+  this->_events._min                = std::min(this->_events._min, 
that._events._min);
+  this->_events._total             += that._events._total;
+  this->_duration._min              = std::min(this->_duration._min, 
that._duration._min);
+  this->_duration._max              = std::max(this->_duration._max, 
that._duration._max);
+  this->_count                     += that._count;
+  this->_wait                      += that._wait;
+  this->_duration._max_drain_queue  = 
std::max(this->_duration._max_drain_queue, that._duration._max_drain_queue);
+  this->_duration._max_io_wait      = std::max(this->_duration._max_io_wait, 
that._duration._max_io_wait);
+  this->_duration._max_io_work      = std::max(this->_duration._max_io_work, 
that._duration._max_io_work);
   return *this;
 }
 
diff --git a/src/iocore/eventsystem/UnixEventProcessor.cc 
b/src/iocore/eventsystem/UnixEventProcessor.cc
index aae8e76f4c..27f620d69c 100644
--- a/src/iocore/eventsystem/UnixEventProcessor.cc
+++ b/src/iocore/eventsystem/UnixEventProcessor.cc
@@ -118,6 +118,9 @@ EventMetricStatSync(const char *, RecDataT, RecData *, 
RecRawStatBlock *rsb, int
     slice_stat_update(ID::LOOP_EVENTS, id, slice->_events._total);
     slice_stat_update(ID::LOOP_EVENTS_MIN, id, slice->_events._min);
     slice_stat_update(ID::LOOP_EVENTS_MAX, id, slice->_events._max);
+    slice_stat_update(ID::LOOP_DRAIN_QUEUE_MAX, id, 
slice->_duration._max_drain_queue);
+    slice_stat_update(ID::LOOP_IO_WAIT_MAX, id, slice->_duration._max_io_wait);
+    slice_stat_update(ID::LOOP_IO_WORK_MAX, id, slice->_duration._max_io_work);
   }
 
   // Next are the event loop histogram buckets.
diff --git a/src/iocore/net/NetHandler.cc b/src/iocore/net/NetHandler.cc
index 1b36780051..208728636a 100644
--- a/src/iocore/net/NetHandler.cc
+++ b/src/iocore/net/NetHandler.cc
@@ -357,8 +357,11 @@ NetHandler::waitForActivity(ink_hrtime timeout)
 #endif
 
   // Polling event by PollCont
-  PollCont *p = get_PollCont(this->thread);
+  PollCont  *p        = get_PollCont(this->thread);
+  ink_hrtime pre_poll = ink_get_hrtime();
   p->do_poll(timeout);
+  ink_hrtime post_poll = ink_get_hrtime();
+  ink_hrtime poll_time = post_poll - pre_poll;
 
   // Get & Process polling result
   PollDescriptor *pd = get_PollDescriptor(this->thread);
@@ -372,6 +375,10 @@ NetHandler::waitForActivity(ink_hrtime timeout)
   pd->result = 0;
 
   process_ready_list();
+  ink_hrtime post_process = ink_get_hrtime();
+  ink_hrtime process_time = post_process - post_poll;
+  
this->thread->metrics.current_slice.load(std::memory_order_acquire)->record_io_stats(poll_time,
 process_time);
+
 #if TS_USE_LINUX_IO_URING
   ur->service();
 #endif

Reply via email to