Merge branch 'af_packet-timestamp'

Daniel Borkmann says:

====================
This is a joint effort with Willem to bring optional i) tx hw/sw
timestamping into PF_PACKET, that was reported by Paul Chavent,
and ii) to expose the type of timestamp to the user, which is in
the current situation not possible to distinguish with the RX_RING
and TX_RING API (but distinguishable through the normal timestamping
API), reported by Richard Cochran. This set is based on top of
``packet: account statistics only in tpacket_stats_u''. Related
discussion can be found in: http://patchwork.ozlabs.org/patch/238125/
====================

Signed-off-by: David S. Miller <davem@davemloft.net>
diff --git a/Documentation/networking/packet_mmap.txt b/Documentation/networking/packet_mmap.txt
index 65efb85..23dd80e 100644
--- a/Documentation/networking/packet_mmap.txt
+++ b/Documentation/networking/packet_mmap.txt
@@ -1016,10 +1016,11 @@
 -------------------------------------------------------------------------------
 
 The PACKET_TIMESTAMP setting determines the source of the timestamp in
-the packet meta information.  If your NIC is capable of timestamping
-packets in hardware, you can request those hardware timestamps to used.
-Note: you may need to enable the generation of hardware timestamps with
-SIOCSHWTSTAMP.
+the packet meta information for mmap(2)ed RX_RING and TX_RINGs.  If your
+NIC is capable of timestamping packets in hardware, you can request those
+hardware timestamps to be used. Note: you may need to enable the generation
+of hardware timestamps with SIOCSHWTSTAMP (see related information from
+Documentation/networking/timestamping.txt).
 
 PACKET_TIMESTAMP accepts the same integer bit field as
 SO_TIMESTAMPING.  However, only the SOF_TIMESTAMPING_SYS_HARDWARE
@@ -1031,8 +1032,36 @@
     req |= SOF_TIMESTAMPING_SYS_HARDWARE;
     setsockopt(fd, SOL_PACKET, PACKET_TIMESTAMP, (void *) &req, sizeof(req))
 
-If PACKET_TIMESTAMP is not set, a software timestamp generated inside
-the networking stack is used (the behavior before this setting was added).
+For the mmap(2)ed ring buffers, such timestamps are stored in the
+tpacket{,2,3}_hdr structure's tp_sec and tp_{n,u}sec members. To determine
+what kind of timestamp has been reported, the tp_status field is binary |'ed
+with the following possible bits ...
+
+    TP_STATUS_TS_SYS_HARDWARE
+    TP_STATUS_TS_RAW_HARDWARE
+    TP_STATUS_TS_SOFTWARE
+
+... that are equivalent to its SOF_TIMESTAMPING_* counterparts. For the
+RX_RING, if none of those 3 are set (i.e. PACKET_TIMESTAMP is not set),
+then this means that a software fallback was invoked *within* PF_PACKET's
+processing code (less precise).
+
+Getting timestamps for the TX_RING works as follows: i) fill the ring frames,
+ii) call sendto() e.g. in blocking mode, iii) wait for status of relevant
+frames to be updated resp. the frame handed over to the application, iv) walk
+through the frames to pick up the individual hw/sw timestamps.
+
+Only (!) if transmit timestamping is enabled, then these bits are combined
+with binary | with TP_STATUS_AVAILABLE, so you must check for that in your
+application (e.g. !(tp_status & (TP_STATUS_SEND_REQUEST | TP_STATUS_SENDING))
+in a first step to see if the frame belongs to the application, and then
+one can extract the type of timestamp in a second step from tp_status)!
+
+If you don't care about them, thus having it disabled, checking for
+TP_STATUS_AVAILABLE resp. TP_STATUS_WRONG_FORMAT is sufficient. If in the
+TX_RING part only TP_STATUS_AVAILABLE is set, then the tp_sec and tp_{n,u}sec
+members do not contain a valid value. For TX_RINGs, by default no timestamp
+is generated!
 
 See include/linux/net_tstamp.h and Documentation/networking/timestamping
 for more information on hardware timestamps.
diff --git a/include/uapi/linux/if_packet.h b/include/uapi/linux/if_packet.h
index 8136658..b950c02 100644
--- a/include/uapi/linux/if_packet.h
+++ b/include/uapi/linux/if_packet.h
@@ -86,19 +86,24 @@
 };
 
 /* Rx ring - header status */
-#define TP_STATUS_KERNEL	0x0
-#define TP_STATUS_USER		0x1
-#define TP_STATUS_COPY		0x2
-#define TP_STATUS_LOSING	0x4
-#define TP_STATUS_CSUMNOTREADY	0x8
-#define TP_STATUS_VLAN_VALID   0x10 /* auxdata has valid tp_vlan_tci */
-#define TP_STATUS_BLK_TMO	0x20
+#define TP_STATUS_KERNEL	      0
+#define TP_STATUS_USER		(1 << 0)
+#define TP_STATUS_COPY		(1 << 1)
+#define TP_STATUS_LOSING	(1 << 2)
+#define TP_STATUS_CSUMNOTREADY	(1 << 3)
+#define TP_STATUS_VLAN_VALID	(1 << 4) /* auxdata has valid tp_vlan_tci */
+#define TP_STATUS_BLK_TMO	(1 << 5)
 
 /* Tx ring - header status */
-#define TP_STATUS_AVAILABLE	0x0
-#define TP_STATUS_SEND_REQUEST	0x1
-#define TP_STATUS_SENDING	0x2
-#define TP_STATUS_WRONG_FORMAT	0x4
+#define TP_STATUS_AVAILABLE	      0
+#define TP_STATUS_SEND_REQUEST	(1 << 0)
+#define TP_STATUS_SENDING	(1 << 1)
+#define TP_STATUS_WRONG_FORMAT	(1 << 2)
+
+/* Rx and Tx ring - header status */
+#define TP_STATUS_TS_SOFTWARE		(1 << 29)
+#define TP_STATUS_TS_SYS_HARDWARE	(1 << 30)
+#define TP_STATUS_TS_RAW_HARDWARE	(1 << 31)
 
 /* Rx ring - feature request bits */
 #define TP_FT_REQ_FILL_RXHASH	0x1
diff --git a/net/core/skbuff.c b/net/core/skbuff.c
index 898cf5c..af9185d 100644
--- a/net/core/skbuff.c
+++ b/net/core/skbuff.c
@@ -3327,12 +3327,8 @@
 	if (!sk)
 		return;
 
-	skb = skb_clone(orig_skb, GFP_ATOMIC);
-	if (!skb)
-		return;
-
 	if (hwtstamps) {
-		*skb_hwtstamps(skb) =
+		*skb_hwtstamps(orig_skb) =
 			*hwtstamps;
 	} else {
 		/*
@@ -3340,9 +3336,13 @@
 		 * so keep the shared tx_flags and only
 		 * store software time stamp
 		 */
-		skb->tstamp = ktime_get_real();
+		orig_skb->tstamp = ktime_get_real();
 	}
 
+	skb = skb_clone(orig_skb, GFP_ATOMIC);
+	if (!skb)
+		return;
+
 	serr = SKB_EXT_ERR(skb);
 	memset(serr, 0, sizeof(*serr));
 	serr->ee.ee_errno = ENOMSG;
diff --git a/net/packet/af_packet.c b/net/packet/af_packet.c
index 7e387ff..ba8309a 100644
--- a/net/packet/af_packet.c
+++ b/net/packet/af_packet.c
@@ -339,6 +339,59 @@
 	}
 }
 
+static __u32 tpacket_get_timestamp(struct sk_buff *skb, struct timespec *ts,
+				   unsigned int flags)
+{
+	struct skb_shared_hwtstamps *shhwtstamps = skb_hwtstamps(skb);
+
+	if (shhwtstamps) {
+		if ((flags & SOF_TIMESTAMPING_SYS_HARDWARE) &&
+		    ktime_to_timespec_cond(shhwtstamps->syststamp, ts))
+			return TP_STATUS_TS_SYS_HARDWARE;
+		if ((flags & SOF_TIMESTAMPING_RAW_HARDWARE) &&
+		    ktime_to_timespec_cond(shhwtstamps->hwtstamp, ts))
+			return TP_STATUS_TS_RAW_HARDWARE;
+	}
+
+	if (ktime_to_timespec_cond(skb->tstamp, ts))
+		return TP_STATUS_TS_SOFTWARE;
+
+	return 0;
+}
+
+static __u32 __packet_set_timestamp(struct packet_sock *po, void *frame,
+				    struct sk_buff *skb)
+{
+	union tpacket_uhdr h;
+	struct timespec ts;
+	__u32 ts_status;
+
+	if (!(ts_status = tpacket_get_timestamp(skb, &ts, po->tp_tstamp)))
+		return 0;
+
+	h.raw = frame;
+	switch (po->tp_version) {
+	case TPACKET_V1:
+		h.h1->tp_sec = ts.tv_sec;
+		h.h1->tp_usec = ts.tv_nsec / NSEC_PER_USEC;
+		break;
+	case TPACKET_V2:
+		h.h2->tp_sec = ts.tv_sec;
+		h.h2->tp_nsec = ts.tv_nsec;
+		break;
+	case TPACKET_V3:
+	default:
+		WARN(1, "TPACKET version not supported.\n");
+		BUG();
+	}
+
+	/* one flush is safe, as both fields always lie on the same cacheline */
+	flush_dcache_page(pgv_to_page(&h.h1->tp_sec));
+	smp_wmb();
+
+	return ts_status;
+}
+
 static void *packet_lookup_frame(struct packet_sock *po,
 		struct packet_ring_buffer *rb,
 		unsigned int position,
@@ -1657,26 +1710,6 @@
 	return 0;
 }
 
-static void tpacket_get_timestamp(struct sk_buff *skb, struct timespec *ts,
-				  unsigned int flags)
-{
-	struct skb_shared_hwtstamps *shhwtstamps = skb_hwtstamps(skb);
-
-	if (shhwtstamps) {
-		if ((flags & SOF_TIMESTAMPING_SYS_HARDWARE) &&
-		    ktime_to_timespec_cond(shhwtstamps->syststamp, ts))
-			return;
-		if ((flags & SOF_TIMESTAMPING_RAW_HARDWARE) &&
-		    ktime_to_timespec_cond(shhwtstamps->hwtstamp, ts))
-			return;
-	}
-
-	if (ktime_to_timespec_cond(skb->tstamp, ts))
-		return;
-
-	getnstimeofday(ts);
-}
-
 static int tpacket_rcv(struct sk_buff *skb, struct net_device *dev,
 		       struct packet_type *pt, struct net_device *orig_dev)
 {
@@ -1691,6 +1724,7 @@
 	unsigned short macoff, netoff, hdrlen;
 	struct sk_buff *copy_skb = NULL;
 	struct timespec ts;
+	__u32 ts_status;
 
 	if (skb->pkt_type == PACKET_LOOPBACK)
 		goto drop;
@@ -1773,7 +1807,11 @@
 	spin_unlock(&sk->sk_receive_queue.lock);
 
 	skb_copy_bits(skb, 0, h.raw + macoff, snaplen);
-	tpacket_get_timestamp(skb, &ts, po->tp_tstamp);
+
+	if (!(ts_status = tpacket_get_timestamp(skb, &ts, po->tp_tstamp)))
+		getnstimeofday(&ts);
+
+	status |= ts_status;
 
 	switch (po->tp_version) {
 	case TPACKET_V1:
@@ -1874,10 +1912,14 @@
 	void *ph;
 
 	if (likely(po->tx_ring.pg_vec)) {
+		__u32 ts;
+
 		ph = skb_shinfo(skb)->destructor_arg;
 		BUG_ON(atomic_read(&po->tx_ring.pending) == 0);
 		atomic_dec(&po->tx_ring.pending);
-		__packet_set_status(po, ph, TP_STATUS_AVAILABLE);
+
+		ts = __packet_set_timestamp(po, ph, skb);
+		__packet_set_status(po, ph, TP_STATUS_AVAILABLE | ts);
 	}
 
 	sock_wfree(skb);
@@ -1900,6 +1942,7 @@
 	skb->dev = dev;
 	skb->priority = po->sk.sk_priority;
 	skb->mark = po->sk.sk_mark;
+	sock_tx_timestamp(&po->sk, &skb_shinfo(skb)->tx_flags);
 	skb_shinfo(skb)->destructor_arg = ph.raw;
 
 	switch (po->tp_version) {