Merge branch 'af_packet-timestamp'
Daniel Borkmann says:
====================
This is a joint effort with Willem to bring optional i) tx hw/sw
timestamping into PF_PACKET, that was reported by Paul Chavent,
and ii) to expose the type of timestamp to the user, which is in
the current situation not possible to distinguish with the RX_RING
and TX_RING API (but distinguishable through the normal timestamping
API), reported by Richard Cochran. This set is based on top of
``packet: account statistics only in tpacket_stats_u''. Related
discussion can be found in: http://patchwork.ozlabs.org/patch/238125/
====================
Signed-off-by: David S. Miller <davem@davemloft.net>
diff --git a/Documentation/networking/packet_mmap.txt b/Documentation/networking/packet_mmap.txt
index 65efb85..23dd80e 100644
--- a/Documentation/networking/packet_mmap.txt
+++ b/Documentation/networking/packet_mmap.txt
@@ -1016,10 +1016,11 @@
-------------------------------------------------------------------------------
The PACKET_TIMESTAMP setting determines the source of the timestamp in
-the packet meta information. If your NIC is capable of timestamping
-packets in hardware, you can request those hardware timestamps to used.
-Note: you may need to enable the generation of hardware timestamps with
-SIOCSHWTSTAMP.
+the packet meta information for mmap(2)ed RX_RING and TX_RINGs. If your
+NIC is capable of timestamping packets in hardware, you can request those
+hardware timestamps to be used. Note: you may need to enable the generation
+of hardware timestamps with SIOCSHWTSTAMP (see related information from
+Documentation/networking/timestamping.txt).
PACKET_TIMESTAMP accepts the same integer bit field as
SO_TIMESTAMPING. However, only the SOF_TIMESTAMPING_SYS_HARDWARE
@@ -1031,8 +1032,36 @@
req |= SOF_TIMESTAMPING_SYS_HARDWARE;
setsockopt(fd, SOL_PACKET, PACKET_TIMESTAMP, (void *) &req, sizeof(req))
-If PACKET_TIMESTAMP is not set, a software timestamp generated inside
-the networking stack is used (the behavior before this setting was added).
+For the mmap(2)ed ring buffers, such timestamps are stored in the
+tpacket{,2,3}_hdr structure's tp_sec and tp_{n,u}sec members. To determine
+what kind of timestamp has been reported, the tp_status field is binary |'ed
+with the following possible bits ...
+
+ TP_STATUS_TS_SYS_HARDWARE
+ TP_STATUS_TS_RAW_HARDWARE
+ TP_STATUS_TS_SOFTWARE
+
+... that are equivalent to its SOF_TIMESTAMPING_* counterparts. For the
+RX_RING, if none of those 3 are set (i.e. PACKET_TIMESTAMP is not set),
+then this means that a software fallback was invoked *within* PF_PACKET's
+processing code (less precise).
+
+Getting timestamps for the TX_RING works as follows: i) fill the ring frames,
+ii) call sendto() e.g. in blocking mode, iii) wait for status of relevant
+frames to be updated resp. the frame handed over to the application, iv) walk
+through the frames to pick up the individual hw/sw timestamps.
+
+Only (!) if transmit timestamping is enabled, then these bits are combined
+with binary | with TP_STATUS_AVAILABLE, so you must check for that in your
+application (e.g. !(tp_status & (TP_STATUS_SEND_REQUEST | TP_STATUS_SENDING))
+in a first step to see if the frame belongs to the application, and then
+one can extract the type of timestamp in a second step from tp_status)!
+
+If you don't care about them, thus having it disabled, checking for
+TP_STATUS_AVAILABLE resp. TP_STATUS_WRONG_FORMAT is sufficient. If in the
+TX_RING part only TP_STATUS_AVAILABLE is set, then the tp_sec and tp_{n,u}sec
+members do not contain a valid value. For TX_RINGs, by default no timestamp
+is generated!
See include/linux/net_tstamp.h and Documentation/networking/timestamping
for more information on hardware timestamps.
diff --git a/include/uapi/linux/if_packet.h b/include/uapi/linux/if_packet.h
index 8136658..b950c02 100644
--- a/include/uapi/linux/if_packet.h
+++ b/include/uapi/linux/if_packet.h
@@ -86,19 +86,24 @@
};
/* Rx ring - header status */
-#define TP_STATUS_KERNEL 0x0
-#define TP_STATUS_USER 0x1
-#define TP_STATUS_COPY 0x2
-#define TP_STATUS_LOSING 0x4
-#define TP_STATUS_CSUMNOTREADY 0x8
-#define TP_STATUS_VLAN_VALID 0x10 /* auxdata has valid tp_vlan_tci */
-#define TP_STATUS_BLK_TMO 0x20
+#define TP_STATUS_KERNEL 0
+#define TP_STATUS_USER (1 << 0)
+#define TP_STATUS_COPY (1 << 1)
+#define TP_STATUS_LOSING (1 << 2)
+#define TP_STATUS_CSUMNOTREADY (1 << 3)
+#define TP_STATUS_VLAN_VALID (1 << 4) /* auxdata has valid tp_vlan_tci */
+#define TP_STATUS_BLK_TMO (1 << 5)
/* Tx ring - header status */
-#define TP_STATUS_AVAILABLE 0x0
-#define TP_STATUS_SEND_REQUEST 0x1
-#define TP_STATUS_SENDING 0x2
-#define TP_STATUS_WRONG_FORMAT 0x4
+#define TP_STATUS_AVAILABLE 0
+#define TP_STATUS_SEND_REQUEST (1 << 0)
+#define TP_STATUS_SENDING (1 << 1)
+#define TP_STATUS_WRONG_FORMAT (1 << 2)
+
+/* Rx and Tx ring - header status */
+#define TP_STATUS_TS_SOFTWARE (1 << 29)
+#define TP_STATUS_TS_SYS_HARDWARE (1 << 30)
+#define TP_STATUS_TS_RAW_HARDWARE (1 << 31)
/* Rx ring - feature request bits */
#define TP_FT_REQ_FILL_RXHASH 0x1
diff --git a/net/core/skbuff.c b/net/core/skbuff.c
index 898cf5c..af9185d 100644
--- a/net/core/skbuff.c
+++ b/net/core/skbuff.c
@@ -3327,12 +3327,8 @@
if (!sk)
return;
- skb = skb_clone(orig_skb, GFP_ATOMIC);
- if (!skb)
- return;
-
if (hwtstamps) {
- *skb_hwtstamps(skb) =
+ *skb_hwtstamps(orig_skb) =
*hwtstamps;
} else {
/*
@@ -3340,9 +3336,13 @@
* so keep the shared tx_flags and only
* store software time stamp
*/
- skb->tstamp = ktime_get_real();
+ orig_skb->tstamp = ktime_get_real();
}
+ skb = skb_clone(orig_skb, GFP_ATOMIC);
+ if (!skb)
+ return;
+
serr = SKB_EXT_ERR(skb);
memset(serr, 0, sizeof(*serr));
serr->ee.ee_errno = ENOMSG;
diff --git a/net/packet/af_packet.c b/net/packet/af_packet.c
index 7e387ff..ba8309a 100644
--- a/net/packet/af_packet.c
+++ b/net/packet/af_packet.c
@@ -339,6 +339,59 @@
}
}
+static __u32 tpacket_get_timestamp(struct sk_buff *skb, struct timespec *ts,
+ unsigned int flags)
+{
+ struct skb_shared_hwtstamps *shhwtstamps = skb_hwtstamps(skb);
+
+ if (shhwtstamps) {
+ if ((flags & SOF_TIMESTAMPING_SYS_HARDWARE) &&
+ ktime_to_timespec_cond(shhwtstamps->syststamp, ts))
+ return TP_STATUS_TS_SYS_HARDWARE;
+ if ((flags & SOF_TIMESTAMPING_RAW_HARDWARE) &&
+ ktime_to_timespec_cond(shhwtstamps->hwtstamp, ts))
+ return TP_STATUS_TS_RAW_HARDWARE;
+ }
+
+ if (ktime_to_timespec_cond(skb->tstamp, ts))
+ return TP_STATUS_TS_SOFTWARE;
+
+ return 0;
+}
+
+static __u32 __packet_set_timestamp(struct packet_sock *po, void *frame,
+ struct sk_buff *skb)
+{
+ union tpacket_uhdr h;
+ struct timespec ts;
+ __u32 ts_status;
+
+ if (!(ts_status = tpacket_get_timestamp(skb, &ts, po->tp_tstamp)))
+ return 0;
+
+ h.raw = frame;
+ switch (po->tp_version) {
+ case TPACKET_V1:
+ h.h1->tp_sec = ts.tv_sec;
+ h.h1->tp_usec = ts.tv_nsec / NSEC_PER_USEC;
+ break;
+ case TPACKET_V2:
+ h.h2->tp_sec = ts.tv_sec;
+ h.h2->tp_nsec = ts.tv_nsec;
+ break;
+ case TPACKET_V3:
+ default:
+ WARN(1, "TPACKET version not supported.\n");
+ BUG();
+ }
+
+ /* one flush is safe, as both fields always lie on the same cacheline */
+ flush_dcache_page(pgv_to_page(&h.h1->tp_sec));
+ smp_wmb();
+
+ return ts_status;
+}
+
static void *packet_lookup_frame(struct packet_sock *po,
struct packet_ring_buffer *rb,
unsigned int position,
@@ -1657,26 +1710,6 @@
return 0;
}
-static void tpacket_get_timestamp(struct sk_buff *skb, struct timespec *ts,
- unsigned int flags)
-{
- struct skb_shared_hwtstamps *shhwtstamps = skb_hwtstamps(skb);
-
- if (shhwtstamps) {
- if ((flags & SOF_TIMESTAMPING_SYS_HARDWARE) &&
- ktime_to_timespec_cond(shhwtstamps->syststamp, ts))
- return;
- if ((flags & SOF_TIMESTAMPING_RAW_HARDWARE) &&
- ktime_to_timespec_cond(shhwtstamps->hwtstamp, ts))
- return;
- }
-
- if (ktime_to_timespec_cond(skb->tstamp, ts))
- return;
-
- getnstimeofday(ts);
-}
-
static int tpacket_rcv(struct sk_buff *skb, struct net_device *dev,
struct packet_type *pt, struct net_device *orig_dev)
{
@@ -1691,6 +1724,7 @@
unsigned short macoff, netoff, hdrlen;
struct sk_buff *copy_skb = NULL;
struct timespec ts;
+ __u32 ts_status;
if (skb->pkt_type == PACKET_LOOPBACK)
goto drop;
@@ -1773,7 +1807,11 @@
spin_unlock(&sk->sk_receive_queue.lock);
skb_copy_bits(skb, 0, h.raw + macoff, snaplen);
- tpacket_get_timestamp(skb, &ts, po->tp_tstamp);
+
+ if (!(ts_status = tpacket_get_timestamp(skb, &ts, po->tp_tstamp)))
+ getnstimeofday(&ts);
+
+ status |= ts_status;
switch (po->tp_version) {
case TPACKET_V1:
@@ -1874,10 +1912,14 @@
void *ph;
if (likely(po->tx_ring.pg_vec)) {
+ __u32 ts;
+
ph = skb_shinfo(skb)->destructor_arg;
BUG_ON(atomic_read(&po->tx_ring.pending) == 0);
atomic_dec(&po->tx_ring.pending);
- __packet_set_status(po, ph, TP_STATUS_AVAILABLE);
+
+ ts = __packet_set_timestamp(po, ph, skb);
+ __packet_set_status(po, ph, TP_STATUS_AVAILABLE | ts);
}
sock_wfree(skb);
@@ -1900,6 +1942,7 @@
skb->dev = dev;
skb->priority = po->sk.sk_priority;
skb->mark = po->sk.sk_mark;
+ sock_tx_timestamp(&po->sk, &skb_shinfo(skb)->tx_flags);
skb_shinfo(skb)->destructor_arg = ph.raw;
switch (po->tp_version) {