Re: hardware packet timestamping for mbufs, then sockets and bpf

Scott Cheloha Fri, 07 Jun 2019 17:37:50 -0700

On Fri, Jun 07, 2019 at 02:34:20PM +1000, David Gwynne wrote:
> nics are starting to offer the ability to timestamp packets when
> they're received. other systems (eg linux and freebsd) have support
> for recording that timestamp on mbufs and then using it as the
> backend for at least the SO_TIMESTAMP socket option instead of a
> call to microtime().
> 
> this implements the above, and additionally supports using the hw
> timestamp in bpf too. other systems may do the bpf thing too, but i
> didn't look closely enough to find out.
> 
> timestamps are recorded as the uptime of the system in nanoseconds
> in the ph_timestamp field in mbufs. this mirrors the use of
> ph_timestamp in the fq_codel code to store the uptime in nanoseconds.
> im using another bit in m_pkthdr.csum_flags to say whether the timestamp
> is valid or not (M_TIMESTAMP). im arguing that it's another offloading
> feature and therefore appropriate for the csum offload flags field.
> 
> this adds some inline functions to time.h for turning ns into a timeval
> and timespec, which are ns_to_microtime and ns_to_nanotime respectively.
> i originally wanted ns_to_timeval and ns_to_timespec, but the linux
> compat stuff in drm already uses those names and ruined the idea.
> especially since they return the time{val,spec}s as values.


If we're going to add more conversion stuff to sys/time.h I want it to
resemble the stuff that's already there.  I think NSEC_TO_TIMESPEC and
NSEC_TO_TIMEVAL would be consistent names, so in sys/time.h I'd strongly
prefer the following:

static __inline void
NSEC_TO_TIMEVAL(uint64_t ns, struct timeval *tv)
{
        tv->tv_sec = ns / 1000000000LL;
        tv->tv_usec = (ns % 1000000000L) / 1000L;
}

static __inline void
NSEC_TO_TIMESPEC(uint64_t ns, struct timespec *ts)
{
        ts->tv_sec = ns / 1000000000LL;
        ts->tv_nsec = ns % 1000000000L;
}

> the ipv4 SO_TIMESTAMP and bpf code looks at whether M_TIMESTAMP is set,
> and if so turns ph_timestamp into a timeval before adding it to boottime
> (which is the wall clock time that uptime starts at), before using it
> instead of microtime().
> 
> the mcx changes are based on what freebsd did to their driver, but
> simplified a bit.
> 
> i want this because we're being asked to look at recording network
> traffic for possible audit use. part of that is having accurate
> timestamps on received packets, and hopefully it will mitigate against
> chunks of packets getting reordered or delayed significantly when the
> box is busy.
> 
> thoughts? ok?

Thoughts inline.

> Index: sys/mbuf.h
> ===================================================================
> RCS file: /cvs/src/sys/sys/mbuf.h,v
> retrieving revision 1.242
> diff -u -p -r1.242 mbuf.h
> --- sys/mbuf.h        11 Feb 2019 00:25:33 -0000      1.242
> +++ sys/mbuf.h        7 Jun 2019 03:27:41 -0000
> @@ -226,13 +226,14 @@ struct mbuf {
>  #define      M_ICMP_CSUM_IN_OK       0x0400  /* ICMP/ICMPv6 checksum 
> verified */
>  #define      M_ICMP_CSUM_IN_BAD      0x0800  /* ICMP/ICMPv6 checksum bad */
>  #define      M_IPV6_DF_OUT           0x1000  /* don't fragment outgoing IPv6 
> */
> +#define      M_TIMESTAMP             0x2000  /* ph_timestamp is set */
>  
>  #ifdef _KERNEL
>  #define MCS_BITS \
>      ("\20\1IPV4_CSUM_OUT\2TCP_CSUM_OUT\3UDP_CSUM_OUT\4IPV4_CSUM_IN_OK" \
>      "\5IPV4_CSUM_IN_BAD\6TCP_CSUM_IN_OK\7TCP_CSUM_IN_BAD\10UDP_CSUM_IN_OK" \
>      
> "\11UDP_CSUM_IN_BAD\12ICMP_CSUM_OUT\13ICMP_CSUM_IN_OK\14ICMP_CSUM_IN_BAD" \
> -    "\15IPV6_NODF_OUT")
> +    "\15IPV6_NODF_OUT" "\16TIMESTAMP")
>  #endif
>  
>  /* mbuf types */
> Index: sys/time.h
> ===================================================================
> RCS file: /cvs/src/sys/sys/time.h,v
> retrieving revision 1.41
> diff -u -p -r1.41 time.h
> --- sys/time.h        3 Jun 2019 01:27:30 -0000       1.41
> +++ sys/time.h        7 Jun 2019 03:27:41 -0000
> @@ -333,6 +333,20 @@ void clock_secs_to_ymdhms(time_t, struct
>  /* Traditional POSIX base year */
>  #define POSIX_BASE_YEAR 1970
>  
> +static __inline void
> +ns_to_microtime(struct timeval *tv, uint64_t ns)
> +{
> +     tv->tv_sec = ns / 1000000000L;
> +     tv->tv_usec = (ns % 1000000000L) / 1000;
> +}
> +
> +static __inline void
> +ns_to_nanotime(struct timespec *tv, uint64_t ns)
> +{
> +     tv->tv_sec = ns / 1000000000L;
> +     tv->tv_nsec = ns % 1000000000L;
> +}
> +

Same thoughts from before about NSEC_TO_TIMEVAL etc.

Also, unless I've missed something, I don't see you use ns_to_nanotime
in this diff.

However I see people doing this conversion elsewhere, so maybe
we could use it anyway.

>  #else /* !_KERNEL */
>  #include <time.h>
>  
> Index: net/bpf.c
> ===================================================================
> RCS file: /cvs/src/sys/net/bpf.c,v
> retrieving revision 1.175
> diff -u -p -r1.175 bpf.c
> --- net/bpf.c 18 May 2019 12:59:32 -0000      1.175
> +++ net/bpf.c 7 Jun 2019 03:27:41 -0000
> @@ -1284,13 +1284,25 @@ _bpf_mtap(caddr_t arg, const struct mbuf
>                       fcode = bps->bps_bf.bf_insns;
>               slen = bpf_mfilter(fcode, m, pktlen);
>  
> -             if (slen == 0)
> +             if (slen == 0)
>                       continue;
>               if (d->bd_fildrop != BPF_FILDROP_PASS)
>                       drop = 1;
>               if (d->bd_fildrop != BPF_FILDROP_DROP) {
> -                     if (!gottime++)
> -                             microtime(&tv);
> +                     if (!gottime) {
> +                             if (ISSET(m->m_flags, M_PKTHDR) &&
> +                                 ISSET(m->m_pkthdr.csum_flags,
> +                                  M_TIMESTAMP)) {
> +                                     struct timeval btv;
> +                                     ns_to_microtime(&tv,
> +                                         m->m_pkthdr.ph_timestamp);
> +                                     microboottime(&btv);
> +                                     timeradd(&tv, &btv, &tv);

You could probably put this dance into a function.

void
pkthdr_microtime(const struct pkthdr *hdr, struct timeval *tv)
{
        struct timeval boottime, uptime;

        microboottime(&boottime);
        NSEC_TO_TIMEVAL(hdr->ph_timestamp, &uptime);
        timeradd(&boottime, &uptime, &tv);
}

> +                             } else
> +                                     microtime(&tv);
> +
> +                             gottime = 1;
> +                     }
>  
>                       mtx_enter(&d->bd_mtx);
>                       bpf_catchpacket(d, (u_char *)m, pktlen, slen, cpfn,
> Index: netinet/ip_input.c
> ===================================================================
> RCS file: /cvs/src/sys/netinet/ip_input.c,v
> retrieving revision 1.342
> diff -u -p -r1.342 ip_input.c
> --- netinet/ip_input.c        13 Oct 2018 18:36:01 -0000      1.342
> +++ netinet/ip_input.c        7 Jun 2019 03:27:41 -0000
> @@ -1712,7 +1712,14 @@ ip_savecontrol(struct inpcb *inp, struct
>       if (inp->inp_socket->so_options & SO_TIMESTAMP) {
>               struct timeval tv;
>  
> -             microtime(&tv);
> +             if (ISSET(m->m_pkthdr.csum_flags, M_TIMESTAMP)) {
> +                     struct timeval btv;
> +                     ns_to_microtime(&tv, m->m_pkthdr.ph_timestamp);
> +                     microboottime(&btv);
> +                     timeradd(&tv, &btv, &tv);

Yeah see, you do it here too.  Dunno where to actually put that
function though.

> +             } else
> +                     microtime(&tv);
> +
>               *mp = sbcreatecontrol((caddr_t) &tv, sizeof(tv),
>                   SCM_TIMESTAMP, SOL_SOCKET);
>               if (*mp)
> Index: dev/pci/if_mcx.c
> ===================================================================
> RCS file: /cvs/src/sys/dev/pci/if_mcx.c,v
> retrieving revision 1.16
> diff -u -p -r1.16 if_mcx.c
> --- dev/pci/if_mcx.c  4 Jun 2019 05:29:30 -0000       1.16
> +++ dev/pci/if_mcx.c  7 Jun 2019 03:27:41 -0000
> @@ -1243,20 +1245,22 @@ struct mcx_cmd_destroy_cq_out {
>  } __packed __aligned(4);
>  
>  struct mcx_cq_entry {
> -     uint32_t                cq_reserved1;
> +     uint32_t                __reserved__;
>       uint32_t                cq_lro;
>       uint32_t                cq_lro_ack_seq_num;
>       uint32_t                cq_rx_hash;
> -     uint32_t                cq_rx_hash_type;
> +     uint8_t                 cq_rx_hash_type;
> +     uint8_t                 cq_ml_path;
> +     uint16_t                __reserved__;
>       uint32_t                cq_checksum;
> -     uint32_t                cq_reserved2;
> +     uint32_t                __reserved__;
>       uint32_t                cq_flags;
>       uint32_t                cq_lro_srqn;
> -     uint32_t                cq_reserved3[2];
> +     uint32_t                __reserved__[2];
>       uint32_t                cq_byte_cnt;
> -     uint32_t                cq_lro_ts_value;
> -     uint32_t                cq_lro_ts_echo;
> -     uint32_t                cq_flow_tag;
> +     uint64_t                cq_timestamp;
> +#define MCX_CQ_ENTRY_TIMESTAMP_PTP   (1 << 63)
> +     uint32_t                cq_rx_drops;
>       uint16_t                cq_wqe_count;
>       uint8_t                 cq_signature;
>       uint8_t                 cq_opcode_owner;
> @@ -1895,6 +1899,18 @@ struct mcx_cq {
>       int                      cq_count;
>  };
>  
> +struct mcx_calibration {
> +     uint64_t                 c_timestamp;   /* previous mcx chip time */
> +     uint64_t                 c_uptime;      /* previous kernel nanouptime */
> +     uint64_t                 c_tbase;       /* mcx chip time */
> +     uint64_t                 c_ubase;       /* kernel nanouptime */
> +     uint64_t                 c_tdiff;
> +     uint64_t                 c_udiff;
> +};
> +
> +#define MCX_CALIBRATE_FIRST  2
> +#define MCX_CALIBRATE_NORMAL 30
> +
>  struct mcx_softc {
>       struct device            sc_dev;
>       struct arpcom            sc_ac;
> @@ -1948,6 +1964,11 @@ struct mcx_softc {
>       int                      sc_extra_mcast;
>       uint8_t                  
> sc_mcast_flows[MCX_NUM_MCAST_FLOWS][ETHER_ADDR_LEN];
>  
> +     struct mcx_calibration   sc_calibration[2];
> +     unsigned int             sc_calibration_gen;
> +     uint32_t                 sc_freq;
> +     struct timeout           sc_calibrate;
> +
>       struct mcx_cq            sc_cq[MCX_MAX_CQS];
>       int                      sc_num_cq;
>  
> @@ -2039,8 +2060,8 @@ static void     mcx_cmdq_dump(const struct m
>  static void  mcx_cmdq_mbox_dump(struct mcx_dmamem *, int);
>  */
>  static void  mcx_refill(void *);
> -static void  mcx_process_rx(struct mcx_softc *, struct mcx_cq_entry *,
> -                 struct mbuf_list *, int *);
> +static int   mcx_process_rx(struct mcx_softc *, struct mcx_cq_entry *,
> +                 struct mbuf_list *, const struct mcx_calibration *);
>  static void  mcx_process_txeof(struct mcx_softc *, struct mcx_cq_entry *,
>                   int *);
>  static void  mcx_process_cq(struct mcx_softc *, struct mcx_cq *);
> @@ -2060,6 +2081,9 @@ static void     mcx_media_status(struct ifne
>  static int   mcx_media_change(struct ifnet *);
>  static int   mcx_get_sffpage(struct ifnet *, struct if_sffpage *);
>  
> +static void  mcx_calibrate_first(struct mcx_softc *);
> +static void  mcx_calibrate(void *);
> +
>  static inline uint32_t
>               mcx_rd(struct mcx_softc *, bus_size_t);
>  static inline void
> @@ -2067,6 +2091,8 @@ static inline void
>  static inline void
>               mcx_bar(struct mcx_softc *, bus_size_t, bus_size_t, int);
>  
> +static uint64_t      mcx_timer(struct mcx_softc *);
> +
>  static int   mcx_dmamem_alloc(struct mcx_softc *, struct mcx_dmamem *,
>                   bus_size_t, u_int align);
>  static void  mcx_dmamem_zero(struct mcx_dmamem *);
> @@ -2338,6 +2364,7 @@ mcx_attach(struct device *parent, struct
>       ether_ifattach(ifp);
>  
>       timeout_set(&sc->sc_rx_refill, mcx_refill, sc);
> +     timeout_set(&sc->sc_calibrate, mcx_calibrate, sc);
>  
>       sc->sc_flow_table_id = -1;
>       for (i = 0; i < MCX_NUM_FLOW_GROUPS; i++) {
> @@ -5557,9 +5590,65 @@ mcx_process_txeof(struct mcx_softc *sc, 
>       ms->ms_m = NULL;
>  }
>  
> -void
> +static uint64_t
> +mcx_uptime(void)
> +{
> +     struct timespec ts;
> +
> +     nanouptime(&ts);
> +
> +     return ((uint64_t)ts.tv_sec * 1000000000 + (uint64_t)ts.tv_nsec);
> +}
> 
> +
> +static void
> +mcx_calibrate_first(struct mcx_softc *sc)
> +{
> +     struct mcx_calibration *c = &sc->sc_calibration[0];
> +
> +     sc->sc_calibration_gen = 0;
> +
> +     c->c_ubase = mcx_uptime();
> +     c->c_tbase = mcx_timer(sc);
> +     c->c_tdiff = 0;
> +
> +     timeout_add_sec(&sc->sc_calibrate, MCX_CALIBRATE_FIRST);

should this be MCX_CALIBRATE_NORMAL?

Also, maybe "mcx_start_calibration"?

> +}
> +
> +#define MCX_TIMESTAMP_SHIFT 10
> +
> +static void
> +mcx_calibrate(void *arg)
> +{
> +     struct mcx_softc *sc = arg;
> +     struct mcx_calibration *nc, *pc;
> +     unsigned int gen;
> +
> +     if (!ISSET(sc->sc_ac.ac_if.if_flags, IFF_RUNNING))
> +             return;
> +
> +     timeout_add_sec(&sc->sc_calibrate, MCX_CALIBRATE_NORMAL);
> +
> +     gen = sc->sc_calibration_gen;
> +     pc = &sc->sc_calibration[gen % nitems(sc->sc_calibration)];
> +     gen++;
> +     nc = &sc->sc_calibration[gen % nitems(sc->sc_calibration)];
> +
> +     nc->c_uptime = pc->c_ubase;
> +     nc->c_timestamp = pc->c_tbase;
> +
> +     nc->c_ubase = mcx_uptime();
> +     nc->c_tbase = mcx_timer(sc);
> +
> +     nc->c_udiff = (nc->c_ubase - nc->c_uptime) >> MCX_TIMESTAMP_SHIFT;
> +     nc->c_tdiff = (nc->c_tbase - nc->c_timestamp) >> MCX_TIMESTAMP_SHIFT;
> +
> +     membar_producer();
> +     sc->sc_calibration_gen = gen;
> +}
> +
> +static int
>  mcx_process_rx(struct mcx_softc *sc, struct mcx_cq_entry *cqe,
> -    struct mbuf_list *ml, int *slots)
> +    struct mbuf_list *ml, const struct mcx_calibration *c)

Would you ever use a mcx_calibration struct in this function that
wasn't in the mcx_softc?

>  {
>       struct mcx_slot *ms;
>       struct mbuf *m;
> @@ -5574,10 +5663,26 @@ mcx_process_rx(struct mcx_softc *sc, str
>  
>       m = ms->ms_m;
>       ms->ms_m = NULL;
> -     m->m_pkthdr.len = m->m_len = betoh32(cqe->cq_byte_cnt);
> -     (*slots)++;
> +
> +     m->m_pkthdr.len = m->m_len = bemtoh32(&cqe->cq_byte_cnt);
> +
> +     if (cqe->cq_rx_hash_type) {
> +             m->m_pkthdr.ph_flowid = M_FLOWID_VALID |
> +                 bemtoh32(&cqe->cq_rx_hash);
> +     }
> +
> +     if (c->c_tdiff) {
> +             uint64_t t = bemtoh64(&cqe->cq_timestamp) - c->c_timestamp;
> +             t *= c->c_udiff;
> +             t /= c->c_tdiff;
> +
> +             m->m_pkthdr.ph_timestamp = c->c_uptime + t;
> +             SET(m->m_pkthdr.csum_flags, M_TIMESTAMP);
> +     }
>  
>       ml_enqueue(ml, m);
> +
> +     return (1);
>  }
>  
>  static struct mcx_cq_entry *
> @@ -5624,11 +5729,17 @@ void
>  mcx_process_cq(struct mcx_softc *sc, struct mcx_cq *cq)
>  {
>       struct ifnet *ifp = &sc->sc_ac.ac_if;
> +     const struct mcx_calibration *c;
> +     unsigned int gen;
>       struct mcx_cq_entry *cqe;
>       uint8_t *cqp;
>       struct mbuf_list ml = MBUF_LIST_INITIALIZER();
>       int rxfree, txfree;
>  
> +     gen = sc->sc_calibration_gen;
> +     membar_consumer();
> +     c = &sc->sc_calibration[gen % nitems(sc->sc_calibration)];
> +

What IPL does this run at?  Couldn't the calibration struct be modified
out from under you?

>       rxfree = 0;
>       txfree = 0;
>       while ((cqe = mcx_next_cq_entry(sc, cq))) {
> @@ -5639,7 +5750,7 @@ mcx_process_cq(struct mcx_softc *sc, str
>                       mcx_process_txeof(sc, cqe, &txfree);
>                       break;
>               case MCX_CQ_ENTRY_OPCODE_SEND:
> -                     mcx_process_rx(sc, cqe, &ml, &rxfree);
> +                     rxfree += mcx_process_rx(sc, cqe, &ml, c);
>                       break;
>               case MCX_CQ_ENTRY_OPCODE_REQ_ERR:
>               case MCX_CQ_ENTRY_OPCODE_SEND_ERR:
> @@ -5882,6 +5993,8 @@ mcx_up(struct mcx_softc *sc)
>       sc->sc_rx_prod = 0;
>       mcx_rx_fill(sc);
>  
> +     mcx_calibrate_first(sc);
> +
>       SET(ifp->if_flags, IFF_RUNNING);
>  
>       sc->sc_tx_cons = 0;
> @@ -5922,6 +6035,8 @@ mcx_down(struct mcx_softc *sc)
>       intr_barrier(&sc->sc_ih);
>       ifq_barrier(&ifp->if_snd);
>  
> +     timeout_del_barrier(&sc->sc_calibrate);
> +
>       for (group = 0; group < MCX_NUM_FLOW_GROUPS; group++) {
>               if (sc->sc_flow_group_id[group] != -1)
>                       mcx_destroy_flow_group(sc,
> @@ -6435,6 +6550,26 @@ static inline void
>  mcx_bar(struct mcx_softc *sc, bus_size_t r, bus_size_t l, int f)
>  {
>       bus_space_barrier(sc->sc_memt, sc->sc_memh, r, l, f);
> +}
> +
> +static uint64_t
> +mcx_timer(struct mcx_softc *sc)
> +{
> +     uint32_t hi, lo, ni;
> +
> +     hi = mcx_rd(sc, MCX_INTERNAL_TIMER_H);
> +     for (;;) {
> +             lo = mcx_rd(sc, MCX_INTERNAL_TIMER_L);
> +             mcx_bar(sc, MCX_INTERNAL_TIMER_L, 8, BUS_SPACE_BARRIER_READ);
> +             ni = mcx_rd(sc, MCX_INTERNAL_TIMER_H);
> +
> +             if (ni == hi)
> +                     break;
> +
> +             hi = ni;
> +     }
> +
> +     return (((uint64_t)hi << 32) | (uint64_t)lo);
>  }

You could actually truncate this loop:

        for (ni = hi + 1; ni != hi; hi = ni) {
                lo = mcx_rd(sc, MCX_INTERNAL_TIMER_L);
                mcx_bar(sc, MCX_INTERNAL_TIMER_L, 8, BUS_SPACE_BARRIER_READ);
                ni = mcx_rd(sc, MCX_INTERNAL_TIMER_H);
        }

but maybe that's too clever :)

Re: hardware packet timestamping for mbufs, then sockets and bpf

Reply via email to