On Fri, Jun 07, 2019 at 02:34:20PM +1000, David Gwynne wrote: > nics are starting to offer the ability to timestamp packets when > they're received. other systems (eg linux and freebsd) have support > for recording that timestamp on mbufs and then using it as the > backend for at least the SO_TIMESTAMP socket option instead of a > call to microtime(). > > this implements the above, and additionally supports using the hw > timestamp in bpf too. other systems may do the bpf thing too, but i > didn't look closely enough to find out. > > timestamps are recorded as the uptime of the system in nanoseconds > in the ph_timestamp field in mbufs. this mirrors the use of > ph_timestamp in the fq_codel code to store the uptime in nanoseconds. > im using another bit in m_pkthdr.csum_flags to say whether the timestamp > is valid or not (M_TIMESTAMP). im arguing that it's another offloading > feature and therefore appropriate for the csum offload flags field. > > this adds some inline functions to time.h for turning ns into a timeval > and timespec, which are ns_to_microtime and ns_to_nanotime respectively. > i originally wanted ns_to_timeval and ns_to_timespec, but the linux > compat stuff in drm already uses those names and ruined the idea. > especially since they return the time{val,spec}s as values.
If we're going to add more conversion stuff to sys/time.h I want it to resemble the stuff that's already there. I think NSEC_TO_TIMESPEC and NSEC_TO_TIMEVAL would be consistent names, so in sys/time.h I'd strongly prefer the following: static __inline void NSEC_TO_TIMEVAL(uint64_t ns, struct timeval *tv) { tv->tv_sec = ns / 1000000000LL; tv->tv_usec = (ns % 1000000000L) / 1000L; } static __inline void NSEC_TO_TIMESPEC(uint64_t ns, struct timespec *ts) { ts->tv_sec = ns / 1000000000LL; ts->tv_nsec = ns % 1000000000L; } > the ipv4 SO_TIMESTAMP and bpf code looks at whether M_TIMESTAMP is set, > and if so turns ph_timestamp into a timeval before adding it to boottime > (which is the wall clock time that uptime starts at), before using it > instead of microtime(). > > the mcx changes are based on what freebsd did to their driver, but > simplified a bit. > > i want this because we're being asked to look at recording network > traffic for possible audit use. part of that is having accurate > timestamps on received packets, and hopefully it will mitigate against > chunks of packets getting reordered or delayed significantly when the > box is busy. > > thoughts? ok? Thoughts inline. > Index: sys/mbuf.h > =================================================================== > RCS file: /cvs/src/sys/sys/mbuf.h,v > retrieving revision 1.242 > diff -u -p -r1.242 mbuf.h > --- sys/mbuf.h 11 Feb 2019 00:25:33 -0000 1.242 > +++ sys/mbuf.h 7 Jun 2019 03:27:41 -0000 > @@ -226,13 +226,14 @@ struct mbuf { > #define M_ICMP_CSUM_IN_OK 0x0400 /* ICMP/ICMPv6 checksum > verified */ > #define M_ICMP_CSUM_IN_BAD 0x0800 /* ICMP/ICMPv6 checksum bad */ > #define M_IPV6_DF_OUT 0x1000 /* don't fragment outgoing IPv6 > */ > +#define M_TIMESTAMP 0x2000 /* ph_timestamp is set */ > > #ifdef _KERNEL > #define MCS_BITS \ > ("\20\1IPV4_CSUM_OUT\2TCP_CSUM_OUT\3UDP_CSUM_OUT\4IPV4_CSUM_IN_OK" \ > "\5IPV4_CSUM_IN_BAD\6TCP_CSUM_IN_OK\7TCP_CSUM_IN_BAD\10UDP_CSUM_IN_OK" \ > > "\11UDP_CSUM_IN_BAD\12ICMP_CSUM_OUT\13ICMP_CSUM_IN_OK\14ICMP_CSUM_IN_BAD" \ > - "\15IPV6_NODF_OUT") > + "\15IPV6_NODF_OUT" "\16TIMESTAMP") > #endif > > /* mbuf types */ > Index: sys/time.h > =================================================================== > RCS file: /cvs/src/sys/sys/time.h,v > retrieving revision 1.41 > diff -u -p -r1.41 time.h > --- sys/time.h 3 Jun 2019 01:27:30 -0000 1.41 > +++ sys/time.h 7 Jun 2019 03:27:41 -0000 > @@ -333,6 +333,20 @@ void clock_secs_to_ymdhms(time_t, struct > /* Traditional POSIX base year */ > #define POSIX_BASE_YEAR 1970 > > +static __inline void > +ns_to_microtime(struct timeval *tv, uint64_t ns) > +{ > + tv->tv_sec = ns / 1000000000L; > + tv->tv_usec = (ns % 1000000000L) / 1000; > +} > + > +static __inline void > +ns_to_nanotime(struct timespec *tv, uint64_t ns) > +{ > + tv->tv_sec = ns / 1000000000L; > + tv->tv_nsec = ns % 1000000000L; > +} > + Same thoughts from before about NSEC_TO_TIMEVAL etc. Also, unless I've missed something, I don't see you use ns_to_nanotime in this diff. However I see people doing this conversion elsewhere, so maybe we could use it anyway. > #else /* !_KERNEL */ > #include <time.h> > > Index: net/bpf.c > =================================================================== > RCS file: /cvs/src/sys/net/bpf.c,v > retrieving revision 1.175 > diff -u -p -r1.175 bpf.c > --- net/bpf.c 18 May 2019 12:59:32 -0000 1.175 > +++ net/bpf.c 7 Jun 2019 03:27:41 -0000 > @@ -1284,13 +1284,25 @@ _bpf_mtap(caddr_t arg, const struct mbuf > fcode = bps->bps_bf.bf_insns; > slen = bpf_mfilter(fcode, m, pktlen); > > - if (slen == 0) > + if (slen == 0) > continue; > if (d->bd_fildrop != BPF_FILDROP_PASS) > drop = 1; > if (d->bd_fildrop != BPF_FILDROP_DROP) { > - if (!gottime++) > - microtime(&tv); > + if (!gottime) { > + if (ISSET(m->m_flags, M_PKTHDR) && > + ISSET(m->m_pkthdr.csum_flags, > + M_TIMESTAMP)) { > + struct timeval btv; > + ns_to_microtime(&tv, > + m->m_pkthdr.ph_timestamp); > + microboottime(&btv); > + timeradd(&tv, &btv, &tv); You could probably put this dance into a function. void pkthdr_microtime(const struct pkthdr *hdr, struct timeval *tv) { struct timeval boottime, uptime; microboottime(&boottime); NSEC_TO_TIMEVAL(hdr->ph_timestamp, &uptime); timeradd(&boottime, &uptime, &tv); } > + } else > + microtime(&tv); > + > + gottime = 1; > + } > > mtx_enter(&d->bd_mtx); > bpf_catchpacket(d, (u_char *)m, pktlen, slen, cpfn, > Index: netinet/ip_input.c > =================================================================== > RCS file: /cvs/src/sys/netinet/ip_input.c,v > retrieving revision 1.342 > diff -u -p -r1.342 ip_input.c > --- netinet/ip_input.c 13 Oct 2018 18:36:01 -0000 1.342 > +++ netinet/ip_input.c 7 Jun 2019 03:27:41 -0000 > @@ -1712,7 +1712,14 @@ ip_savecontrol(struct inpcb *inp, struct > if (inp->inp_socket->so_options & SO_TIMESTAMP) { > struct timeval tv; > > - microtime(&tv); > + if (ISSET(m->m_pkthdr.csum_flags, M_TIMESTAMP)) { > + struct timeval btv; > + ns_to_microtime(&tv, m->m_pkthdr.ph_timestamp); > + microboottime(&btv); > + timeradd(&tv, &btv, &tv); Yeah see, you do it here too. Dunno where to actually put that function though. > + } else > + microtime(&tv); > + > *mp = sbcreatecontrol((caddr_t) &tv, sizeof(tv), > SCM_TIMESTAMP, SOL_SOCKET); > if (*mp) > Index: dev/pci/if_mcx.c > =================================================================== > RCS file: /cvs/src/sys/dev/pci/if_mcx.c,v > retrieving revision 1.16 > diff -u -p -r1.16 if_mcx.c > --- dev/pci/if_mcx.c 4 Jun 2019 05:29:30 -0000 1.16 > +++ dev/pci/if_mcx.c 7 Jun 2019 03:27:41 -0000 > @@ -1243,20 +1245,22 @@ struct mcx_cmd_destroy_cq_out { > } __packed __aligned(4); > > struct mcx_cq_entry { > - uint32_t cq_reserved1; > + uint32_t __reserved__; > uint32_t cq_lro; > uint32_t cq_lro_ack_seq_num; > uint32_t cq_rx_hash; > - uint32_t cq_rx_hash_type; > + uint8_t cq_rx_hash_type; > + uint8_t cq_ml_path; > + uint16_t __reserved__; > uint32_t cq_checksum; > - uint32_t cq_reserved2; > + uint32_t __reserved__; > uint32_t cq_flags; > uint32_t cq_lro_srqn; > - uint32_t cq_reserved3[2]; > + uint32_t __reserved__[2]; > uint32_t cq_byte_cnt; > - uint32_t cq_lro_ts_value; > - uint32_t cq_lro_ts_echo; > - uint32_t cq_flow_tag; > + uint64_t cq_timestamp; > +#define MCX_CQ_ENTRY_TIMESTAMP_PTP (1 << 63) > + uint32_t cq_rx_drops; > uint16_t cq_wqe_count; > uint8_t cq_signature; > uint8_t cq_opcode_owner; > @@ -1895,6 +1899,18 @@ struct mcx_cq { > int cq_count; > }; > > +struct mcx_calibration { > + uint64_t c_timestamp; /* previous mcx chip time */ > + uint64_t c_uptime; /* previous kernel nanouptime */ > + uint64_t c_tbase; /* mcx chip time */ > + uint64_t c_ubase; /* kernel nanouptime */ > + uint64_t c_tdiff; > + uint64_t c_udiff; > +}; > + > +#define MCX_CALIBRATE_FIRST 2 > +#define MCX_CALIBRATE_NORMAL 30 > + > struct mcx_softc { > struct device sc_dev; > struct arpcom sc_ac; > @@ -1948,6 +1964,11 @@ struct mcx_softc { > int sc_extra_mcast; > uint8_t > sc_mcast_flows[MCX_NUM_MCAST_FLOWS][ETHER_ADDR_LEN]; > > + struct mcx_calibration sc_calibration[2]; > + unsigned int sc_calibration_gen; > + uint32_t sc_freq; > + struct timeout sc_calibrate; > + > struct mcx_cq sc_cq[MCX_MAX_CQS]; > int sc_num_cq; > > @@ -2039,8 +2060,8 @@ static void mcx_cmdq_dump(const struct m > static void mcx_cmdq_mbox_dump(struct mcx_dmamem *, int); > */ > static void mcx_refill(void *); > -static void mcx_process_rx(struct mcx_softc *, struct mcx_cq_entry *, > - struct mbuf_list *, int *); > +static int mcx_process_rx(struct mcx_softc *, struct mcx_cq_entry *, > + struct mbuf_list *, const struct mcx_calibration *); > static void mcx_process_txeof(struct mcx_softc *, struct mcx_cq_entry *, > int *); > static void mcx_process_cq(struct mcx_softc *, struct mcx_cq *); > @@ -2060,6 +2081,9 @@ static void mcx_media_status(struct ifne > static int mcx_media_change(struct ifnet *); > static int mcx_get_sffpage(struct ifnet *, struct if_sffpage *); > > +static void mcx_calibrate_first(struct mcx_softc *); > +static void mcx_calibrate(void *); > + > static inline uint32_t > mcx_rd(struct mcx_softc *, bus_size_t); > static inline void > @@ -2067,6 +2091,8 @@ static inline void > static inline void > mcx_bar(struct mcx_softc *, bus_size_t, bus_size_t, int); > > +static uint64_t mcx_timer(struct mcx_softc *); > + > static int mcx_dmamem_alloc(struct mcx_softc *, struct mcx_dmamem *, > bus_size_t, u_int align); > static void mcx_dmamem_zero(struct mcx_dmamem *); > @@ -2338,6 +2364,7 @@ mcx_attach(struct device *parent, struct > ether_ifattach(ifp); > > timeout_set(&sc->sc_rx_refill, mcx_refill, sc); > + timeout_set(&sc->sc_calibrate, mcx_calibrate, sc); > > sc->sc_flow_table_id = -1; > for (i = 0; i < MCX_NUM_FLOW_GROUPS; i++) { > @@ -5557,9 +5590,65 @@ mcx_process_txeof(struct mcx_softc *sc, > ms->ms_m = NULL; > } > > -void > +static uint64_t > +mcx_uptime(void) > +{ > + struct timespec ts; > + > + nanouptime(&ts); > + > + return ((uint64_t)ts.tv_sec * 1000000000 + (uint64_t)ts.tv_nsec); > +} > > + > +static void > +mcx_calibrate_first(struct mcx_softc *sc) > +{ > + struct mcx_calibration *c = &sc->sc_calibration[0]; > + > + sc->sc_calibration_gen = 0; > + > + c->c_ubase = mcx_uptime(); > + c->c_tbase = mcx_timer(sc); > + c->c_tdiff = 0; > + > + timeout_add_sec(&sc->sc_calibrate, MCX_CALIBRATE_FIRST); should this be MCX_CALIBRATE_NORMAL? Also, maybe "mcx_start_calibration"? > +} > + > +#define MCX_TIMESTAMP_SHIFT 10 > + > +static void > +mcx_calibrate(void *arg) > +{ > + struct mcx_softc *sc = arg; > + struct mcx_calibration *nc, *pc; > + unsigned int gen; > + > + if (!ISSET(sc->sc_ac.ac_if.if_flags, IFF_RUNNING)) > + return; > + > + timeout_add_sec(&sc->sc_calibrate, MCX_CALIBRATE_NORMAL); > + > + gen = sc->sc_calibration_gen; > + pc = &sc->sc_calibration[gen % nitems(sc->sc_calibration)]; > + gen++; > + nc = &sc->sc_calibration[gen % nitems(sc->sc_calibration)]; > + > + nc->c_uptime = pc->c_ubase; > + nc->c_timestamp = pc->c_tbase; > + > + nc->c_ubase = mcx_uptime(); > + nc->c_tbase = mcx_timer(sc); > + > + nc->c_udiff = (nc->c_ubase - nc->c_uptime) >> MCX_TIMESTAMP_SHIFT; > + nc->c_tdiff = (nc->c_tbase - nc->c_timestamp) >> MCX_TIMESTAMP_SHIFT; > + > + membar_producer(); > + sc->sc_calibration_gen = gen; > +} > + > +static int > mcx_process_rx(struct mcx_softc *sc, struct mcx_cq_entry *cqe, > - struct mbuf_list *ml, int *slots) > + struct mbuf_list *ml, const struct mcx_calibration *c) Would you ever use a mcx_calibration struct in this function that wasn't in the mcx_softc? > { > struct mcx_slot *ms; > struct mbuf *m; > @@ -5574,10 +5663,26 @@ mcx_process_rx(struct mcx_softc *sc, str > > m = ms->ms_m; > ms->ms_m = NULL; > - m->m_pkthdr.len = m->m_len = betoh32(cqe->cq_byte_cnt); > - (*slots)++; > + > + m->m_pkthdr.len = m->m_len = bemtoh32(&cqe->cq_byte_cnt); > + > + if (cqe->cq_rx_hash_type) { > + m->m_pkthdr.ph_flowid = M_FLOWID_VALID | > + bemtoh32(&cqe->cq_rx_hash); > + } > + > + if (c->c_tdiff) { > + uint64_t t = bemtoh64(&cqe->cq_timestamp) - c->c_timestamp; > + t *= c->c_udiff; > + t /= c->c_tdiff; > + > + m->m_pkthdr.ph_timestamp = c->c_uptime + t; > + SET(m->m_pkthdr.csum_flags, M_TIMESTAMP); > + } > > ml_enqueue(ml, m); > + > + return (1); > } > > static struct mcx_cq_entry * > @@ -5624,11 +5729,17 @@ void > mcx_process_cq(struct mcx_softc *sc, struct mcx_cq *cq) > { > struct ifnet *ifp = &sc->sc_ac.ac_if; > + const struct mcx_calibration *c; > + unsigned int gen; > struct mcx_cq_entry *cqe; > uint8_t *cqp; > struct mbuf_list ml = MBUF_LIST_INITIALIZER(); > int rxfree, txfree; > > + gen = sc->sc_calibration_gen; > + membar_consumer(); > + c = &sc->sc_calibration[gen % nitems(sc->sc_calibration)]; > + What IPL does this run at? Couldn't the calibration struct be modified out from under you? > rxfree = 0; > txfree = 0; > while ((cqe = mcx_next_cq_entry(sc, cq))) { > @@ -5639,7 +5750,7 @@ mcx_process_cq(struct mcx_softc *sc, str > mcx_process_txeof(sc, cqe, &txfree); > break; > case MCX_CQ_ENTRY_OPCODE_SEND: > - mcx_process_rx(sc, cqe, &ml, &rxfree); > + rxfree += mcx_process_rx(sc, cqe, &ml, c); > break; > case MCX_CQ_ENTRY_OPCODE_REQ_ERR: > case MCX_CQ_ENTRY_OPCODE_SEND_ERR: > @@ -5882,6 +5993,8 @@ mcx_up(struct mcx_softc *sc) > sc->sc_rx_prod = 0; > mcx_rx_fill(sc); > > + mcx_calibrate_first(sc); > + > SET(ifp->if_flags, IFF_RUNNING); > > sc->sc_tx_cons = 0; > @@ -5922,6 +6035,8 @@ mcx_down(struct mcx_softc *sc) > intr_barrier(&sc->sc_ih); > ifq_barrier(&ifp->if_snd); > > + timeout_del_barrier(&sc->sc_calibrate); > + > for (group = 0; group < MCX_NUM_FLOW_GROUPS; group++) { > if (sc->sc_flow_group_id[group] != -1) > mcx_destroy_flow_group(sc, > @@ -6435,6 +6550,26 @@ static inline void > mcx_bar(struct mcx_softc *sc, bus_size_t r, bus_size_t l, int f) > { > bus_space_barrier(sc->sc_memt, sc->sc_memh, r, l, f); > +} > + > +static uint64_t > +mcx_timer(struct mcx_softc *sc) > +{ > + uint32_t hi, lo, ni; > + > + hi = mcx_rd(sc, MCX_INTERNAL_TIMER_H); > + for (;;) { > + lo = mcx_rd(sc, MCX_INTERNAL_TIMER_L); > + mcx_bar(sc, MCX_INTERNAL_TIMER_L, 8, BUS_SPACE_BARRIER_READ); > + ni = mcx_rd(sc, MCX_INTERNAL_TIMER_H); > + > + if (ni == hi) > + break; > + > + hi = ni; > + } > + > + return (((uint64_t)hi << 32) | (uint64_t)lo); > } You could actually truncate this loop: for (ni = hi + 1; ni != hi; hi = ni) { lo = mcx_rd(sc, MCX_INTERNAL_TIMER_L); mcx_bar(sc, MCX_INTERNAL_TIMER_L, 8, BUS_SPACE_BARRIER_READ); ni = mcx_rd(sc, MCX_INTERNAL_TIMER_H); } but maybe that's too clever :)