From 590ce401c207fd944827eb5aa5e87d834eddb149 Mon Sep 17 00:00:00 2001
From: Sergio Paracuellos <sergio.paracuellos@gmail.com>
Date: Sun, 13 Jan 2019 09:56:48 +0100
Subject: dt-bindings: net: dsa: ksz9477: fix indentation for switch spi
 bindings

Switch bindings for spi managed mode are using spaces instead of tabs.
Fix them to get a file with a proper kernel indentation style.

Reviewed-by: Florian Fainelli <f.fainelli@gmail.com>
Signed-off-by: Sergio Paracuellos <sergio.paracuellos@gmail.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 Documentation/devicetree/bindings/net/dsa/ksz.txt | 102 +++++++++++-----------
 1 file changed, 51 insertions(+), 51 deletions(-)

(limited to 'Documentation')

diff --git a/Documentation/devicetree/bindings/net/dsa/ksz.txt b/Documentation/devicetree/bindings/net/dsa/ksz.txt
index 0f407fb371ce..8d58c2a7de39 100644
--- a/Documentation/devicetree/bindings/net/dsa/ksz.txt
+++ b/Documentation/devicetree/bindings/net/dsa/ksz.txt
@@ -19,58 +19,58 @@ Examples:
 
 Ethernet switch connected via SPI to the host, CPU port wired to eth0:
 
-                             eth0: ethernet@10001000 {
-                                             fixed-link {
-                                                             speed = <1000>;
-                                                             full-duplex;
-                                             };
-                             };
+	eth0: ethernet@10001000 {
+		fixed-link {
+			speed = <1000>;
+			full-duplex;
+		};
+	};
 
-                             spi1: spi@f8008000 {
-                                             pinctrl-0 = <&pinctrl_spi_ksz>;
-                                             cs-gpios = <&pioC 25 0>;
-                                             id = <1>;
+	spi1: spi@f8008000 {
+		pinctrl-0 = <&pinctrl_spi_ksz>;
+		cs-gpios = <&pioC 25 0>;
+		id = <1>;
 
-                                             ksz9477: ksz9477@0 {
-                                                             compatible = "microchip,ksz9477";
-                                                             reg = <0>;
+		ksz9477: ksz9477@0 {
+			compatible = "microchip,ksz9477";
+			reg = <0>;
 
-                                                             spi-max-frequency = <44000000>;
-                                                             spi-cpha;
-                                                             spi-cpol;
+			spi-max-frequency = <44000000>;
+			spi-cpha;
+			spi-cpol;
 
-                                                             ports {
-                                                                             #address-cells = <1>;
-                                                                             #size-cells = <0>;
-                                                                             port@0 {
-                                                                                             reg = <0>;
-                                                                                             label = "lan1";
-                                                                             };
-                                                                             port@1 {
-                                                                                             reg = <1>;
-                                                                                             label = "lan2";
-                                                                             };
-                                                                             port@2 {
-                                                                                             reg = <2>;
-                                                                                             label = "lan3";
-                                                                             };
-                                                                             port@3 {
-                                                                                             reg = <3>;
-                                                                                             label = "lan4";
-                                                                             };
-                                                                             port@4 {
-                                                                                             reg = <4>;
-                                                                                             label = "lan5";
-                                                                             };
-                                                                             port@5 {
-                                                                                             reg = <5>;
-                                                                                             label = "cpu";
-                                                                                             ethernet = <&eth0>;
-                                                                                             fixed-link {
-                                                                                                             speed = <1000>;
-                                                                                                             full-duplex;
-                                                                                             };
-                                                                             };
-                                                             };
-                                             };
-                             };
+			ports {
+				#address-cells = <1>;
+				#size-cells = <0>;
+				port@0 {
+					reg = <0>;
+					label = "lan1";
+				};
+				port@1 {
+					reg = <1>;
+					label = "lan2";
+				};
+				port@2 {
+					reg = <2>;
+					label = "lan3";
+				};
+				port@3 {
+					reg = <3>;
+					label = "lan4";
+				};
+				port@4 {
+					reg = <4>;
+					label = "lan5";
+				};
+				port@5 {
+					reg = <5>;
+					label = "cpu";
+					ethernet = <&eth0>;
+					fixed-link {
+						speed = <1000>;
+						full-duplex;
+					};
+				};
+			};
+		};
+	};
-- 
cgit v1.2.3


From ae5220c672180765615458ae54dbcff9abe6a01d Mon Sep 17 00:00:00 2001
From: Randy Dunlap <rdunlap@infradead.org>
Date: Sun, 13 Jan 2019 20:17:41 -0800
Subject: networking: Documentation: fix snmp_counters.rst Sphinx warnings

Fix over 100 documentation warnings in snmp_counter.rst by
extending the underline string lengths and inserting a blank line
after bullet items.

Examples:

Documentation/networking/snmp_counter.rst:1: WARNING: Title overline too short.
Documentation/networking/snmp_counter.rst:14: WARNING: Bullet list ends without a blank line; unexpected unindent.

Fixes: 2b96547223e3 ("add document for TCP OFO, PAWS and skip ACK counters")
Fixes: 8e2ea53a83df ("add snmp counters document")
Fixes: 712ee16c230f ("add documents for snmp counters")
Fixes: 80cc49507ba4 ("net: Add part of TCP counts explanations in snmp_counters.rst")
Fixes: b08794a922c4 ("documentation of some IP/ICMP snmp counters")

Signed-off-by: Randy Dunlap <rdunlap@infradead.org>
Cc: yupeng <yupeng0921@gmail.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 Documentation/networking/snmp_counter.rst | 113 +++++++++++++++++++++++-------
 1 file changed, 86 insertions(+), 27 deletions(-)

(limited to 'Documentation')

diff --git a/Documentation/networking/snmp_counter.rst b/Documentation/networking/snmp_counter.rst
index b0dfdaaca512..486ab33acc3a 100644
--- a/Documentation/networking/snmp_counter.rst
+++ b/Documentation/networking/snmp_counter.rst
@@ -1,16 +1,17 @@
-===========
+============
 SNMP counter
-===========
+============
 
 This document explains the meaning of SNMP counters.
 
 General IPv4 counters
-====================
+=====================
 All layer 4 packets and ICMP packets will change these counters, but
 these counters won't be changed by layer 2 packets (such as STP) or
 ARP packets.
 
 * IpInReceives
+
 Defined in `RFC1213 ipInReceives`_
 
 .. _RFC1213 ipInReceives: https://tools.ietf.org/html/rfc1213#page-26
@@ -23,6 +24,7 @@ and so on).  It indicates the number of aggregated segments after
 GRO/LRO.
 
 * IpInDelivers
+
 Defined in `RFC1213 ipInDelivers`_
 
 .. _RFC1213 ipInDelivers: https://tools.ietf.org/html/rfc1213#page-28
@@ -33,6 +35,7 @@ supported protocols will be delivered, if someone listens on the raw
 socket, all valid IP packets will be delivered.
 
 * IpOutRequests
+
 Defined in `RFC1213 ipOutRequests`_
 
 .. _RFC1213 ipOutRequests: https://tools.ietf.org/html/rfc1213#page-28
@@ -42,6 +45,7 @@ multicast packets, and would always be updated together with
 IpExtOutOctets.
 
 * IpExtInOctets and IpExtOutOctets
+
 They are Linux kernel extensions, no RFC definitions. Please note,
 RFC1213 indeed defines ifInOctets  and ifOutOctets, but they
 are different things. The ifInOctets and ifOutOctets include the MAC
@@ -49,6 +53,7 @@ layer header size but IpExtInOctets and IpExtOutOctets don't, they
 only include the IP layer header and the IP layer data.
 
 * IpExtInNoECTPkts, IpExtInECT1Pkts, IpExtInECT0Pkts, IpExtInCEPkts
+
 They indicate the number of four kinds of ECN IP packets, please refer
 `Explicit Congestion Notification`_ for more details.
 
@@ -60,6 +65,7 @@ for the same packet, you might find that IpInReceives count 1, but
 IpExtInNoECTPkts counts 2 or more.
 
 * IpInHdrErrors
+
 Defined in `RFC1213 ipInHdrErrors`_. It indicates the packet is
 dropped due to the IP header error. It might happen in both IP input
 and IP forward paths.
@@ -67,6 +73,7 @@ and IP forward paths.
 .. _RFC1213 ipInHdrErrors: https://tools.ietf.org/html/rfc1213#page-27
 
 * IpInAddrErrors
+
 Defined in `RFC1213 ipInAddrErrors`_. It will be increased in two
 scenarios: (1) The IP address is invalid. (2) The destination IP
 address is not a local address and IP forwarding is not enabled
@@ -74,6 +81,7 @@ address is not a local address and IP forwarding is not enabled
 .. _RFC1213 ipInAddrErrors: https://tools.ietf.org/html/rfc1213#page-27
 
 * IpExtInNoRoutes
+
 This counter means the packet is dropped when the IP stack receives a
 packet and can't find a route for it from the route table. It might
 happen when IP forwarding is enabled and the destination IP address is
@@ -81,6 +89,7 @@ not a local address and there is no route for the destination IP
 address.
 
 * IpInUnknownProtos
+
 Defined in `RFC1213 ipInUnknownProtos`_. It will be increased if the
 layer 4 protocol is unsupported by kernel. If an application is using
 raw socket, kernel will always deliver the packet to the raw socket
@@ -89,10 +98,12 @@ and this counter won't be increased.
 .. _RFC1213 ipInUnknownProtos: https://tools.ietf.org/html/rfc1213#page-27
 
 * IpExtInTruncatedPkts
+
 For IPv4 packet, it means the actual data size is smaller than the
 "Total Length" field in the IPv4 header.
 
 * IpInDiscards
+
 Defined in `RFC1213 ipInDiscards`_. It indicates the packet is dropped
 in the IP receiving path and due to kernel internal reasons (e.g. no
 enough memory).
@@ -100,20 +111,23 @@ enough memory).
 .. _RFC1213 ipInDiscards: https://tools.ietf.org/html/rfc1213#page-28
 
 * IpOutDiscards
+
 Defined in `RFC1213 ipOutDiscards`_. It indicates the packet is
 dropped in the IP sending path and due to kernel internal reasons.
 
 .. _RFC1213 ipOutDiscards: https://tools.ietf.org/html/rfc1213#page-28
 
 * IpOutNoRoutes
+
 Defined in `RFC1213 ipOutNoRoutes`_. It indicates the packet is
 dropped in the IP sending path and no route is found for it.
 
 .. _RFC1213 ipOutNoRoutes: https://tools.ietf.org/html/rfc1213#page-29
 
 ICMP counters
-============
+=============
 * IcmpInMsgs and IcmpOutMsgs
+
 Defined by `RFC1213 icmpInMsgs`_ and `RFC1213 icmpOutMsgs`_
 
 .. _RFC1213 icmpInMsgs: https://tools.ietf.org/html/rfc1213#page-41
@@ -126,6 +140,7 @@ IcmpOutMsgs would still be updated if the IP header is constructed by
 a userspace program.
 
 * ICMP named types
+
 | These counters include most of common ICMP types, they are:
 | IcmpInDestUnreachs: `RFC1213 icmpInDestUnreachs`_
 | IcmpInTimeExcds: `RFC1213 icmpInTimeExcds`_
@@ -180,6 +195,7 @@ straightforward. The 'In' counter means kernel receives such a packet
 and the 'Out' counter means kernel sends such a packet.
 
 * ICMP numeric types
+
 They are IcmpMsgInType[N] and IcmpMsgOutType[N], the [N] indicates the
 ICMP type number. These counters track all kinds of ICMP packets. The
 ICMP type number definition could be found in the `ICMP parameters`_
@@ -192,12 +208,14 @@ IcmpMsgOutType8 would increase 1. And if kernel gets an ICMP Echo Reply
 packet, IcmpMsgInType0 would increase 1.
 
 * IcmpInCsumErrors
+
 This counter indicates the checksum of the ICMP packet is
 wrong. Kernel verifies the checksum after updating the IcmpInMsgs and
 before updating IcmpMsgInType[N]. If a packet has bad checksum, the
 IcmpInMsgs would be updated but none of IcmpMsgInType[N] would be updated.
 
 * IcmpInErrors and IcmpOutErrors
+
 Defined by `RFC1213 icmpInErrors`_ and `RFC1213 icmpOutErrors`_
 
 .. _RFC1213 icmpInErrors: https://tools.ietf.org/html/rfc1213#page-41
@@ -209,7 +227,7 @@ and the sending packet path use IcmpOutErrors. When IcmpInCsumErrors
 is increased, IcmpInErrors would always be increased too.
 
 relationship of the ICMP counters
--------------------------------
+---------------------------------
 The sum of IcmpMsgOutType[N] is always equal to IcmpOutMsgs, as they
 are updated at the same time. The sum of IcmpMsgInType[N] plus
 IcmpInErrors should be equal or larger than IcmpInMsgs. When kernel
@@ -229,8 +247,9 @@ IcmpInMsgs should be less than the sum of IcmpMsgOutType[N] plus
 IcmpInErrors.
 
 General TCP counters
-==================
+====================
 * TcpInSegs
+
 Defined in `RFC1213 tcpInSegs`_
 
 .. _RFC1213 tcpInSegs: https://tools.ietf.org/html/rfc1213#page-48
@@ -247,6 +266,7 @@ isn't aware of GRO. So if two packets are merged by GRO, the TcpInSegs
 counter would only increase 1.
 
 * TcpOutSegs
+
 Defined in `RFC1213 tcpOutSegs`_
 
 .. _RFC1213 tcpOutSegs: https://tools.ietf.org/html/rfc1213#page-48
@@ -258,6 +278,7 @@ GSO, so if a packet would be split to 2 by GSO, TcpOutSegs will
 increase 2.
 
 * TcpActiveOpens
+
 Defined in `RFC1213 tcpActiveOpens`_
 
 .. _RFC1213 tcpActiveOpens: https://tools.ietf.org/html/rfc1213#page-47
@@ -267,6 +288,7 @@ state. Every time TcpActiveOpens increases 1, TcpOutSegs should always
 increase 1.
 
 * TcpPassiveOpens
+
 Defined in `RFC1213 tcpPassiveOpens`_
 
 .. _RFC1213 tcpPassiveOpens: https://tools.ietf.org/html/rfc1213#page-47
@@ -275,6 +297,7 @@ It means the TCP layer receives a SYN, replies a SYN+ACK, come into
 the SYN-RCVD state.
 
 * TcpExtTCPRcvCoalesce
+
 When packets are received by the TCP layer and are not be read by the
 application, the TCP layer will try to merge them. This counter
 indicate how many packets are merged in such situation. If GRO is
@@ -282,12 +305,14 @@ enabled, lots of packets would be merged by GRO, these packets
 wouldn't be counted to TcpExtTCPRcvCoalesce.
 
 * TcpExtTCPAutoCorking
+
 When sending packets, the TCP layer will try to merge small packets to
 a bigger one. This counter increase 1 for every packet merged in such
 situation. Please refer to the LWN article for more details:
 https://lwn.net/Articles/576263/
 
 * TcpExtTCPOrigDataSent
+
 This counter is explained by `kernel commit f19c29e3e391`_, I pasted the
 explaination below::
 
@@ -297,6 +322,7 @@ explaination below::
   more useful to track the TCP retransmission rate.
 
 * TCPSynRetrans
+
 This counter is explained by `kernel commit f19c29e3e391`_, I pasted the
 explaination below::
 
@@ -304,6 +330,7 @@ explaination below::
   retransmissions into SYN, fast-retransmits, timeout retransmits, etc.
 
 * TCPFastOpenActiveFail
+
 This counter is explained by `kernel commit f19c29e3e391`_, I pasted the
 explaination below::
 
@@ -313,6 +340,7 @@ explaination below::
 .. _kernel commit f19c29e3e391: https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/commit/?id=f19c29e3e391a66a273e9afebaf01917245148cd
 
 * TcpExtListenOverflows and TcpExtListenDrops
+
 When kernel receives a SYN from a client, and if the TCP accept queue
 is full, kernel will drop the SYN and add 1 to TcpExtListenOverflows.
 At the same time kernel will also add 1 to TcpExtListenDrops. When a
@@ -337,7 +365,7 @@ to the accept queue.
 
 
 TCP Fast Open
-============
+=============
 When kernel receives a TCP packet, it has two paths to handler the
 packet, one is fast path, another is slow path. The comment in kernel
 code provides a good explanation of them, I pasted them below::
@@ -370,22 +398,24 @@ will disable the fast path at first, and try to enable it after kernel
 receives packets.
 
 * TcpExtTCPPureAcks and TcpExtTCPHPAcks
+
 If a packet set ACK flag and has no data, it is a pure ACK packet, if
 kernel handles it in the fast path, TcpExtTCPHPAcks will increase 1,
 if kernel handles it in the slow path, TcpExtTCPPureAcks will
 increase 1.
 
 * TcpExtTCPHPHits
+
 If a TCP packet has data (which means it is not a pure ACK packet),
 and this packet is handled in the fast path, TcpExtTCPHPHits will
 increase 1.
 
 
 TCP abort
-========
-
+=========
 
 * TcpExtTCPAbortOnData
+
 It means TCP layer has data in flight, but need to close the
 connection. So TCP layer sends a RST to the other side, indicate the
 connection is not closed very graceful. An easy way to increase this
@@ -404,11 +434,13 @@ when the application closes a connection, kernel will send a RST
 immediately and increase the TcpExtTCPAbortOnData counter.
 
 * TcpExtTCPAbortOnClose
+
 This counter means the application has unread data in the TCP layer when
 the application wants to close the TCP connection. In such a situation,
 kernel will send a RST to the other side of the TCP connection.
 
 * TcpExtTCPAbortOnMemory
+
 When an application closes a TCP connection, kernel still need to track
 the connection, let it complete the TCP disconnect process. E.g. an
 app calls the close method of a socket, kernel sends fin to the other
@@ -430,10 +462,12 @@ the tcp_mem. Please refer the tcp_mem section in the `TCP man page`_:
 
 
 * TcpExtTCPAbortOnTimeout
+
 This counter will increase when any of the TCP timers expire. In such
 situation, kernel won't send RST, just give up the connection.
 
 * TcpExtTCPAbortOnLinger
+
 When a TCP connection comes into FIN_WAIT_2 state, instead of waiting
 for the fin packet from the other side, kernel could send a RST and
 delete the socket immediately. This is not the default behavior of
@@ -441,6 +475,7 @@ Linux kernel TCP stack. By configuring the TCP_LINGER2 socket option,
 you could let kernel follow this behavior.
 
 * TcpExtTCPAbortFailed
+
 The kernel TCP layer will send RST if the `RFC2525 2.17 section`_ is
 satisfied. If an internal error occurs during this process,
 TcpExtTCPAbortFailed will be increased.
@@ -448,7 +483,7 @@ TcpExtTCPAbortFailed will be increased.
 .. _RFC2525 2.17 section: https://tools.ietf.org/html/rfc2525#page-50
 
 TCP Hybrid Slow Start
-====================
+=====================
 The Hybrid Slow Start algorithm is an enhancement of the traditional
 TCP congestion window Slow Start algorithm. It uses two pieces of
 information to detect whether the max bandwidth of the TCP path is
@@ -464,23 +499,27 @@ relate with the Hybrid Slow Start algorithm.
 .. _Hybrid Slow Start paper: https://pdfs.semanticscholar.org/25e9/ef3f03315782c7f1cbcd31b587857adae7d1.pdf
 
 * TcpExtTCPHystartTrainDetect
+
 How many times the ACK train length threshold is detected
 
 * TcpExtTCPHystartTrainCwnd
+
 The sum of CWND detected by ACK train length. Dividing this value by
 TcpExtTCPHystartTrainDetect is the average CWND which detected by the
 ACK train length.
 
 * TcpExtTCPHystartDelayDetect
+
 How many times the packet delay threshold is detected.
 
 * TcpExtTCPHystartDelayCwnd
+
 The sum of CWND detected by packet delay. Dividing this value by
 TcpExtTCPHystartDelayDetect is the average CWND which detected by the
 packet delay.
 
 TCP retransmission and congestion control
-======================================
+=========================================
 The TCP protocol has two retransmission mechanisms: SACK and fast
 recovery. They are exclusive with each other. When SACK is enabled,
 the kernel TCP stack would use SACK, or kernel would use fast
@@ -499,12 +538,14 @@ https://pdfs.semanticscholar.org/0e9c/968d09ab2e53e24c4dca5b2d67c7f7140f8e.pdf
 .. _RFC6582: https://tools.ietf.org/html/rfc6582
 
 * TcpExtTCPRenoRecovery and TcpExtTCPSackRecovery
+
 When the congestion control comes into Recovery state, if sack is
 used, TcpExtTCPSackRecovery increases 1, if sack is not used,
 TcpExtTCPRenoRecovery increases 1. These two counters mean the TCP
 stack begins to retransmit the lost packets.
 
 * TcpExtTCPSACKReneging
+
 A packet was acknowledged by SACK, but the receiver has dropped this
 packet, so the sender needs to retransmit this packet. In this
 situation, the sender adds 1 to TcpExtTCPSACKReneging. A receiver
@@ -515,6 +556,7 @@ the RTO expires for this packet, then the sender assumes this packet
 has been dropped by the receiver.
 
 * TcpExtTCPRenoReorder
+
 The reorder packet is detected by fast recovery. It would only be used
 if SACK is disabled. The fast recovery algorithm detects recorder by
 the duplicate ACK number. E.g., if retransmission is triggered, and
@@ -525,6 +567,7 @@ order packet. Thus the sender would find more ACks than its
 expectation, and the sender knows out of order occurs.
 
 * TcpExtTCPTSReorder
+
 The reorder packet is detected when a hole is filled. E.g., assume the
 sender sends packet 1,2,3,4,5, and the receiving order is
 1,2,4,5,3. When the sender receives the ACK of packet 3 (which will
@@ -534,6 +577,7 @@ fill the hole), two conditions will let TcpExtTCPTSReorder increase
 than the retransmission timestamp.
 
 * TcpExtTCPSACKReorder
+
 The reorder packet detected by SACK. The SACK has two methods to
 detect reorder: (1) DSACK is received by the sender. It means the
 sender sends the same packet more than one times. And the only reason
@@ -558,39 +602,46 @@ sender side.
 .. _RFC2883 : https://tools.ietf.org/html/rfc2883
 
 * TcpExtTCPDSACKOldSent
+
 The TCP stack receives a duplicate packet which has been acked, so it
 sends a DSACK to the sender.
 
 * TcpExtTCPDSACKOfoSent
+
 The TCP stack receives an out of order duplicate packet, so it sends a
 DSACK to the sender.
 
 * TcpExtTCPDSACKRecv
+
 The TCP stack receives a DSACK, which indicate an acknowledged
 duplicate packet is received.
 
 * TcpExtTCPDSACKOfoRecv
+
 The TCP stack receives a DSACK, which indicate an out of order
 duplicate packet is received.
 
 TCP out of order
-===============
+================
 * TcpExtTCPOFOQueue
+
 The TCP layer receives an out of order packet and has enough memory
 to queue it.
 
 * TcpExtTCPOFODrop
+
 The TCP layer receives an out of order packet but doesn't have enough
 memory, so drops it. Such packets won't be counted into
 TcpExtTCPOFOQueue.
 
 * TcpExtTCPOFOMerge
+
 The received out of order packet has an overlay with the previous
 packet. the overlay part will be dropped. All of TcpExtTCPOFOMerge
 packets will also be counted into TcpExtTCPOFOQueue.
 
 TCP PAWS
-=======
+========
 PAWS (Protection Against Wrapped Sequence numbers) is an algorithm
 which is used to drop old packets. It depends on the TCP
 timestamps. For detail information, please refer the `timestamp wiki`_
@@ -600,13 +651,15 @@ and the `RFC of PAWS`_.
 .. _timestamp wiki: https://en.wikipedia.org/wiki/Transmission_Control_Protocol#TCP_timestamps
 
 * TcpExtPAWSActive
+
 Packets are dropped by PAWS in Syn-Sent status.
 
 * TcpExtPAWSEstab
+
 Packets are dropped by PAWS in any status other than Syn-Sent.
 
 TCP ACK skip
-===========
+============
 In some scenarios, kernel would avoid sending duplicate ACKs too
 frequently. Please find more details in the tcp_invalid_ratelimit
 section of the `sysctl document`_. When kernel decides to skip an ACK
@@ -618,6 +671,7 @@ it has no data.
 .. _sysctl document: https://www.kernel.org/doc/Documentation/networking/ip-sysctl.txt
 
 * TcpExtTCPACKSkippedSynRecv
+
 The ACK is skipped in Syn-Recv status. The Syn-Recv status means the
 TCP stack receives a SYN and replies SYN+ACK. Now the TCP stack is
 waiting for an ACK. Generally, the TCP stack doesn't need to send ACK
@@ -631,6 +685,7 @@ increase TcpExtTCPACKSkippedSynRecv.
 
 
 * TcpExtTCPACKSkippedPAWS
+
 The ACK is skipped due to PAWS (Protect Against Wrapped Sequence
 numbers) check fails. If the PAWS check fails in Syn-Recv, Fin-Wait-2
 or Time-Wait statuses, the skipped ACK would be counted to
@@ -639,18 +694,22 @@ TcpExtTCPACKSkippedTimeWait. In all other statuses, the skipped ACK
 would be counted to TcpExtTCPACKSkippedPAWS.
 
 * TcpExtTCPACKSkippedSeq
+
 The sequence number is out of window and the timestamp passes the PAWS
 check and the TCP status is not Syn-Recv, Fin-Wait-2, and Time-Wait.
 
 * TcpExtTCPACKSkippedFinWait2
+
 The ACK is skipped in Fin-Wait-2 status, the reason would be either
 PAWS check fails or the received sequence number is out of window.
 
 * TcpExtTCPACKSkippedTimeWait
+
 Tha ACK is skipped in Time-Wait status, the reason would be either
 PAWS check failed or the received sequence number is out of window.
 
 * TcpExtTCPACKSkippedChallenge
+
 The ACK is skipped if the ACK is a challenge ACK. The RFC 5961 defines
 3 kind of challenge ACK, please refer `RFC 5961 section 3.2`_,
 `RFC 5961 section 4.2`_ and `RFC 5961 section 5.2`_. Besides these
@@ -664,10 +723,10 @@ unacknowledged number (more strict than `RFC 5961 section 5.2`_).
 
 
 examples
-=======
+========
 
 ping test
---------
+---------
 Run the ping command against the public dns server 8.8.8.8::
 
   nstatuser@nstat-a:~$ ping 8.8.8.8 -c 1
@@ -711,7 +770,7 @@ and its corresponding Echo Reply packet are constructed by:
 So the IpExtInOctets and IpExtOutOctets are 20+16+48=84.
 
 tcp 3-way handshake
-------------------
+-------------------
 On server side, we run::
 
   nstatuser@nstat-b:~$ nc -lknv 0.0.0.0 9000
@@ -753,7 +812,7 @@ ACK, so client sent 2 packets, received 1 packet, TcpInSegs increased
 1, TcpOutSegs increased 2.
 
 TCP normal traffic
------------------
+------------------
 Run nc on server::
 
   nstatuser@nstat-b:~$ nc -lkv 0.0.0.0 9000
@@ -876,7 +935,7 @@ and the packet received from client qualified for fast path, so it
 was counted into 'TcpExtTCPHPHits'.
 
 TcpExtTCPAbortOnClose
---------------------
+---------------------
 On the server side, we run below python script::
 
   import socket
@@ -910,7 +969,7 @@ If we run tcpdump on the server side, we could find the server sent a
 RST after we type Ctrl-C.
 
 TcpExtTCPAbortOnMemory and TcpExtTCPAbortOnTimeout
------------------------------------------------
+---------------------------------------------------
 Below is an example which let the orphan socket count be higher than
 net.ipv4.tcp_max_orphans.
 Change tcp_max_orphans to a smaller value on client::
@@ -1032,7 +1091,7 @@ FIN_WAIT_1 state finally. So we wait for a few minutes, we could find
   TcpExtTCPAbortOnTimeout         10                 0.0
 
 TcpExtTCPAbortOnLinger
----------------------
+----------------------
 The server side code::
 
   nstatuser@nstat-b:~$ cat server_linger.py
@@ -1077,7 +1136,7 @@ After run client_linger.py, check the output of nstat::
   TcpExtTCPAbortOnLinger          1                  0.0
 
 TcpExtTCPRcvCoalesce
--------------------
+--------------------
 On the server, we run a program which listen on TCP port 9000, but
 doesn't read any data::
 
@@ -1137,7 +1196,7 @@ the receiving queue. So the TCP layer merged the two packets, and we
 could find the TcpExtTCPRcvCoalesce increased 1.
 
 TcpExtListenOverflows and TcpExtListenDrops
-----------------------------------------
+-------------------------------------------
 On server, run the nc command, listen on port 9000::
 
   nstatuser@nstat-b:~$ nc -lkv 0.0.0.0 9000
@@ -1185,7 +1244,7 @@ TcpExtListenOverflows and TcpExtListenDrops would be larger, because
 the SYN of the 4th nc was dropped, the client was retrying.
 
 IpInAddrErrors, IpExtInNoRoutes and IpOutNoRoutes
-----------------------------------------------
+-------------------------------------------------
 server A IP address: 192.168.122.250
 server B IP address: 192.168.122.251
 Prepare on server A, add a route to server B::
@@ -1280,7 +1339,7 @@ a route for the 8.8.8.8 IP address, so server B increased
 IpOutNoRoutes.
 
 TcpExtTCPACKSkippedSynRecv
-------------------------
+--------------------------
 In this test, we send 3 same SYN packets from client to server. The
 first SYN will let server create a socket, set it to Syn-Recv status,
 and reply a SYN/ACK. The second SYN will let server reply the SYN/ACK
@@ -1328,7 +1387,7 @@ Check snmp cunter on nstat-b::
 As we expected, TcpExtTCPACKSkippedSynRecv is 1.
 
 TcpExtTCPACKSkippedPAWS
-----------------------
+-----------------------
 To trigger PAWS, we could send an old SYN.
 
 On nstat-b, let nc listen on port 9000::
@@ -1365,7 +1424,7 @@ failed, the nstat-b replied an ACK for the first SYN, skipped the ACK
 for the second SYN, and updated TcpExtTCPACKSkippedPAWS.
 
 TcpExtTCPACKSkippedSeq
---------------------
+----------------------
 To trigger TcpExtTCPACKSkippedSeq, we send packets which have valid
 timestamp (to pass PAWS check) but the sequence number is out of
 window. The linux TCP stack would avoid to skip if the packet has
-- 
cgit v1.2.3


From 44543f1dd2a39d56c9afdc3778aa050b5a4725b4 Mon Sep 17 00:00:00 2001
From: Florian Fainelli <f.fainelli@gmail.com>
Date: Tue, 15 Jan 2019 14:35:02 -0800
Subject: Documentation: networking: dsa: Update documentation

Since 83c0afaec7b7 ("net: dsa: Add new binding implementation"), DSA is
no longer a platform device exclusively and can support registering DSA
switches from other bus drivers (PCI, USB, I2C, etc.).

Signed-off-by: Florian Fainelli <f.fainelli@gmail.com>
Reviewed-by: Andrew Lunn <andrew@lunn.ch>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 Documentation/networking/dsa/dsa.txt | 13 -------------
 1 file changed, 13 deletions(-)

(limited to 'Documentation')

diff --git a/Documentation/networking/dsa/dsa.txt b/Documentation/networking/dsa/dsa.txt
index 25170ad7d25b..1000b821681c 100644
--- a/Documentation/networking/dsa/dsa.txt
+++ b/Documentation/networking/dsa/dsa.txt
@@ -236,19 +236,6 @@ description.
 Design limitations
 ==================
 
-DSA is a platform device driver
--------------------------------
-
-DSA is implemented as a DSA platform device driver which is convenient because
-it will register the entire DSA switch tree attached to a master network device
-in one-shot, facilitating the device creation and simplifying the device driver
-model a bit, this comes however with a number of limitations:
-
-- building DSA and its switch drivers as modules is currently not working
-- the device driver parenting does not necessarily reflect the original
-  bus/device the switch can be created from
-- supporting non-MDIO and non-MMIO (platform) switches is not possible
-
 Limits on the number of devices and ports
 -----------------------------------------
 
-- 
cgit v1.2.3


From 6685987c29582afc79b7fa3998dfbf36b4295791 Mon Sep 17 00:00:00 2001
From: Petr Machata <petrm@mellanox.com>
Date: Wed, 16 Jan 2019 23:06:56 +0000
Subject: switchdev: Add extack argument to call_switchdev_notifiers()

A follow-up patch will enable vetoing of FDB entries. Make it possible
to communicate details of why an FDB entry is not acceptable back to the
user.

Signed-off-by: Petr Machata <petrm@mellanox.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 Documentation/networking/switchdev.txt | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'Documentation')

diff --git a/Documentation/networking/switchdev.txt b/Documentation/networking/switchdev.txt
index 82236a17b5e6..f3244d87512a 100644
--- a/Documentation/networking/switchdev.txt
+++ b/Documentation/networking/switchdev.txt
@@ -196,7 +196,7 @@ The switch device will learn/forget source MAC address/VLAN on ingress packets
 and notify the switch driver of the mac/vlan/port tuples.  The switch driver,
 in turn, will notify the bridge driver using the switchdev notifier call:
 
-	err = call_switchdev_notifiers(val, dev, info);
+	err = call_switchdev_notifiers(val, dev, info, extack);
 
 Where val is SWITCHDEV_FDB_ADD when learning and SWITCHDEV_FDB_DEL when
 forgetting, and info points to a struct switchdev_notifier_fdb_info.  On
-- 
cgit v1.2.3


From b8c45a033acc607201588f7665ba84207e5149e0 Mon Sep 17 00:00:00 2001
From: Aya Levin <ayal@mellanox.com>
Date: Thu, 17 Jan 2019 23:59:20 +0200
Subject: devlink: Add Documentation/networking/devlink-health.txt

This patch adds a new file to add information about devlink health
mechanism.

Signed-off-by: Aya Levin <ayal@mellanox.com>
Signed-off-by: Eran Ben Elisha <eranbe@mellanox.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 Documentation/networking/devlink-health.txt | 86 +++++++++++++++++++++++++++++
 1 file changed, 86 insertions(+)
 create mode 100644 Documentation/networking/devlink-health.txt

(limited to 'Documentation')

diff --git a/Documentation/networking/devlink-health.txt b/Documentation/networking/devlink-health.txt
new file mode 100644
index 000000000000..1db3fbea0831
--- /dev/null
+++ b/Documentation/networking/devlink-health.txt
@@ -0,0 +1,86 @@
+The health mechanism is targeted for Real Time Alerting, in order to know when
+something bad had happened to a PCI device
+- Provide alert debug information
+- Self healing
+- If problem needs vendor support, provide a way to gather all needed debugging
+  information.
+
+The main idea is to unify and centralize driver health reports in the
+generic devlink instance and allow the user to set different
+attributes of the health reporting and recovery procedures.
+
+The devlink health reporter:
+Device driver creates a "health reporter" per each error/health type.
+Error/Health type can be a known/generic (eg pci error, fw error, rx/tx error)
+or unknown (driver specific).
+For each registered health reporter a driver can issue error/health reports
+asynchronously. All health reports handling is done by devlink.
+Device driver can provide specific callbacks for each "health reporter", e.g.
+ - Recovery procedures
+ - Diagnostics and object dump procedures
+ - OOB initial parameters
+Different parts of the driver can register different types of health reporters
+with different handlers.
+
+Once an error is reported, devlink health will do the following actions:
+  * A log is being send to the kernel trace events buffer
+  * Health status and statistics are being updated for the reporter instance
+  * Object dump is being taken and saved at the reporter instance (as long as
+    there is no other dump which is already stored)
+  * Auto recovery attempt is being done. Depends on:
+    - Auto-recovery configuration
+    - Grace period vs. time passed since last recover
+
+The user interface:
+User can access/change each reporter's parameters and driver specific callbacks
+via devlink, e.g per error type (per health reporter)
+ - Configure reporter's generic parameters (like: disable/enable auto recovery)
+ - Invoke recovery procedure
+ - Run diagnostics
+ - Object dump
+
+The devlink health interface (via netlink):
+DEVLINK_CMD_HEALTH_REPORTER_GET
+  Retrieves status and configuration info per DEV and reporter.
+DEVLINK_CMD_HEALTH_REPORTER_SET
+  Allows reporter-related configuration setting.
+DEVLINK_CMD_HEALTH_REPORTER_RECOVER
+  Triggers a reporter's recovery procedure.
+DEVLINK_CMD_HEALTH_REPORTER_DIAGNOSE
+  Retrieves diagnostics data from a reporter on a device.
+DEVLINK_CMD_HEALTH_REPORTER_DUMP_GET
+  Retrieves the last stored dump. Devlink health
+  saves a single dump. If an dump is not already stored by the devlink
+  for this reporter, devlink generates a new dump.
+  dump output is defined by the reporter.
+DEVLINK_CMD_HEALTH_REPORTER_DUMP_CLEAR
+  Clears the last saved dump file for the specified reporter.
+
+
+                                               netlink
+                                      +--------------------------+
+                                      |                          |
+                                      |            +             |
+                                      |            |             |
+                                      +--------------------------+
+                                                   |request for ops
+                                                   |(diagnose,
+ mlx5_core                             devlink     |recover,
+                                                   |dump)
++--------+                            +--------------------------+
+|        |                            |    reporter|             |
+|        |                            |  +---------v----------+  |
+|        |   ops execution            |  |                    |  |
+|     <----------------------------------+                    |  |
+|        |                            |  |                    |  |
+|        |                            |  + ^------------------+  |
+|        |                            |    | request for ops     |
+|        |                            |    | (recover, dump)     |
+|        |                            |    |                     |
+|        |                            |  +-+------------------+  |
+|        |     health report          |  | health handler     |  |
+|        +------------------------------->                    |  |
+|        |                            |  +--------------------+  |
+|        |     health reporter create |                          |
+|        +---------------------------->                          |
++--------+                            +--------------------------+
-- 
cgit v1.2.3


From 856c395cfa63b94a1d8215182f0243c222f6f927 Mon Sep 17 00:00:00 2001
From: Cong Wang <xiyou.wangcong@gmail.com>
Date: Thu, 17 Jan 2019 23:27:11 -0800
Subject: net: introduce a knob to control whether to inherit devconf config

There have been many people complaining about the inconsistent
behaviors of IPv4 and IPv6 devconf when creating new network
namespaces.  Currently, for IPv4, we inherit all current settings
from init_net, but for IPv6 we reset all setting to default.

This patch introduces a new /proc file
/proc/sys/net/core/devconf_inherit_init_net to control the
behavior of whether to inhert sysctl current settings from init_net.
This file itself is only available in init_net.

As demonstrated below:

Initial setup in init_net:
 # cat /proc/sys/net/ipv4/conf/all/rp_filter
 2
 # cat /proc/sys/net/ipv6/conf/all/accept_dad
 1

Default value 0 (current behavior):
 # ip netns del test
 # ip netns add test
 # ip netns exec test cat /proc/sys/net/ipv4/conf/all/rp_filter
 2
 # ip netns exec test cat /proc/sys/net/ipv6/conf/all/accept_dad
 0

Set to 1 (inherit from init_net):
 # echo 1 > /proc/sys/net/core/devconf_inherit_init_net
 # ip netns del test
 # ip netns add test
 # ip netns exec test cat /proc/sys/net/ipv4/conf/all/rp_filter
 2
 # ip netns exec test cat /proc/sys/net/ipv6/conf/all/accept_dad
 1

Set to 2 (reset to default):
 # echo 2 > /proc/sys/net/core/devconf_inherit_init_net
 # ip netns del test
 # ip netns add test
 # ip netns exec test cat /proc/sys/net/ipv4/conf/all/rp_filter
 0
 # ip netns exec test cat /proc/sys/net/ipv6/conf/all/accept_dad
 0

Set to a value out of range (invalid):
 # echo 3 > /proc/sys/net/core/devconf_inherit_init_net
 -bash: echo: write error: Invalid argument
 # echo -1 > /proc/sys/net/core/devconf_inherit_init_net
 -bash: echo: write error: Invalid argument

Reported-by: Zhu Yanjun <Yanjun.Zhu@windriver.com>
Reported-by: Tonghao Zhang <xiangxia.m.yue@gmail.com>
Cc: Nicolas Dichtel <nicolas.dichtel@6wind.com>
Signed-off-by: Cong Wang <xiyou.wangcong@gmail.com>
Acked-by: Nicolas Dichtel <nicolas.dichtel@6wind.com>
Acked-by: Tonghao Zhang <xiangxia.m.yue@gmail.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 Documentation/sysctl/net.txt | 14 ++++++++++++++
 1 file changed, 14 insertions(+)

(limited to 'Documentation')

diff --git a/Documentation/sysctl/net.txt b/Documentation/sysctl/net.txt
index 2793d4eac55f..bc0680706870 100644
--- a/Documentation/sysctl/net.txt
+++ b/Documentation/sysctl/net.txt
@@ -291,6 +291,20 @@ user space is responsible for creating them if needed.
 
 Default : 0  (for compatibility reasons)
 
+devconf_inherit_init_net
+----------------------------
+
+Controls if a new network namespace should inherit all current
+settings under /proc/sys/net/{ipv4,ipv6}/conf/{all,default}/. By
+default, we keep the current behavior: for IPv4 we inherit all current
+settings from init_net and for IPv6 we reset all settings to default.
+
+If set to 1, both IPv4 and IPv6 settings are forced to inherit from
+current ones in init_net. If set to 2, both IPv4 and IPv6 settings are
+forced to reset to their default values.
+
+Default : 0  (for compatibility reasons)
+
 2. /proc/sys/net/unix - Parameters for Unix domain sockets
 -------------------------------------------------------
 
-- 
cgit v1.2.3


From 00f1ee5361c3f644133ef2d19d2c340d2a730f1d Mon Sep 17 00:00:00 2001
From: Vinod Koul <vkoul@kernel.org>
Date: Mon, 21 Jan 2019 14:43:14 +0530
Subject: dt-bindings: net: Add Qualcomm ethqos binding

Add support for Qualcomm ethqos found in some SoCs like QCS404.

Signed-off-by: Vinod Koul <vkoul@kernel.org>
Reviewed-by: Rob Herring <robh@kernel.org>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 .../devicetree/bindings/net/qcom,ethqos.txt        | 64 ++++++++++++++++++++++
 1 file changed, 64 insertions(+)
 create mode 100644 Documentation/devicetree/bindings/net/qcom,ethqos.txt

(limited to 'Documentation')

diff --git a/Documentation/devicetree/bindings/net/qcom,ethqos.txt b/Documentation/devicetree/bindings/net/qcom,ethqos.txt
new file mode 100644
index 000000000000..fcf5035810b5
--- /dev/null
+++ b/Documentation/devicetree/bindings/net/qcom,ethqos.txt
@@ -0,0 +1,64 @@
+Qualcomm Ethernet ETHQOS device
+
+This documents dwmmac based ethernet device which supports Gigabit
+ethernet for version v2.3.0 onwards.
+
+This device has following properties:
+
+Required properties:
+
+- compatible: Should be qcom,qcs404-ethqos"
+
+- reg: Address and length of the register set for the device
+
+- reg-names: Should contain register names "stmmaceth", "rgmii"
+
+- clocks: Should contain phandle to clocks
+
+- clock-names: Should contain clock names "stmmaceth", "pclk",
+		"ptp_ref", "rgmii"
+
+- interrupts: Should contain phandle to interrupts
+
+- interrupt-names: Should contain interrupt names "macirq", "eth_lpi"
+
+Rest of the properties are defined in stmmac.txt file in same directory
+
+
+Example:
+
+ethernet: ethernet@7a80000 {
+	compatible = "qcom,qcs404-ethqos";
+	reg = <0x07a80000 0x10000>,
+		<0x07a96000 0x100>;
+	reg-names = "stmmaceth", "rgmii";
+	clock-names = "stmmaceth", "pclk", "ptp_ref", "rgmii";
+	clocks = <&gcc GCC_ETH_AXI_CLK>,
+		<&gcc GCC_ETH_SLAVE_AHB_CLK>,
+		<&gcc GCC_ETH_PTP_CLK>,
+		<&gcc GCC_ETH_RGMII_CLK>;
+	interrupts = <GIC_SPI 56 IRQ_TYPE_LEVEL_HIGH>,
+			<GIC_SPI 55 IRQ_TYPE_LEVEL_HIGH>;
+	interrupt-names = "macirq", "eth_lpi";
+	snps,reset-gpio = <&tlmm 60 GPIO_ACTIVE_LOW>;
+	snps,reset-active-low;
+
+	snps,txpbl = <8>;
+	snps,rxpbl = <2>;
+	snps,aal;
+	snps,tso;
+
+	phy-handle = <&phy1>;
+	phy-mode = "rgmii";
+
+	mdio {
+		#address-cells = <0x1>;
+		#size-cells = <0x0>;
+		compatible = "snps,dwmac-mdio";
+		phy1: phy@4 {
+			device_type = "ethernet-phy";
+			reg = <0x4>;
+		};
+	};
+
+};
-- 
cgit v1.2.3


From 5ff2698b3301c37246f1f79dc9bdcd378b000dbe Mon Sep 17 00:00:00 2001
From: Yangbo Lu <yangbo.lu@nxp.com>
Date: Mon, 21 Jan 2019 18:41:40 +0800
Subject: dt-binding: ptp_qoriq: document "fsl,extts-fifo" property

Documented "fsl,extts-fifo" property.

Signed-off-by: Yangbo Lu <yangbo.lu@nxp.com>
Reviewed-by: Rob Herring <robh@kernel.org>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 Documentation/devicetree/bindings/ptp/ptp-qoriq.txt | 2 ++
 1 file changed, 2 insertions(+)

(limited to 'Documentation')

diff --git a/Documentation/devicetree/bindings/ptp/ptp-qoriq.txt b/Documentation/devicetree/bindings/ptp/ptp-qoriq.txt
index c5d0e7998e2b..8e7f8551d190 100644
--- a/Documentation/devicetree/bindings/ptp/ptp-qoriq.txt
+++ b/Documentation/devicetree/bindings/ptp/ptp-qoriq.txt
@@ -17,6 +17,8 @@ Clock Properties:
   - fsl,tmr-fiper1   Fixed interval period pulse generator.
   - fsl,tmr-fiper2   Fixed interval period pulse generator.
   - fsl,max-adj      Maximum frequency adjustment in parts per billion.
+  - fsl,extts-fifo   The presence of this property indicates hardware
+		     support for the external trigger stamp FIFO.
 
   These properties set the operational parameters for the PTP
   clock. You must choose these carefully for the clock to work right.
-- 
cgit v1.2.3


From ffcf7ce9332723cab5ae55575f3a55d1ce559bf3 Mon Sep 17 00:00:00 2001
From: Yonghong Song <yhs@fb.com>
Date: Fri, 18 Jan 2019 13:56:49 -0800
Subject: bpf: btf: add btf documentation

This patch added documentation for BTF (BPF Debug Format).
The document is placed under linux:Documentation/bpf directory.

Signed-off-by: Yonghong Song <yhs@fb.com>
Signed-off-by: Alexei Starovoitov <ast@kernel.org>
---
 Documentation/bpf/btf.rst   | 870 ++++++++++++++++++++++++++++++++++++++++++++
 Documentation/bpf/index.rst |   7 +
 2 files changed, 877 insertions(+)
 create mode 100644 Documentation/bpf/btf.rst

(limited to 'Documentation')

diff --git a/Documentation/bpf/btf.rst b/Documentation/bpf/btf.rst
new file mode 100644
index 000000000000..1d434c3a268d
--- /dev/null
+++ b/Documentation/bpf/btf.rst
@@ -0,0 +1,870 @@
+=====================
+BPF Type Format (BTF)
+=====================
+
+1. Introduction
+***************
+
+BTF (BPF Type Format) is the meta data format which
+encodes the debug info related to BPF program/map.
+The name BTF was used initially to describe
+data types. The BTF was later extended to include
+function info for defined subroutines, and line info
+for source/line information.
+
+The debug info is used for map pretty print, function
+signature, etc. The function signature enables better
+bpf program/function kernel symbol.
+The line info helps generate
+source annotated translated byte code, jited code
+and verifier log.
+
+The BTF specification contains two parts,
+  * BTF kernel API
+  * BTF ELF file format
+
+The kernel API is the contract between
+user space and kernel. The kernel verifies
+the BTF info before using it.
+The ELF file format is a user space contract
+between ELF file and libbpf loader.
+
+The type and string sections are part of the
+BTF kernel API, describing the debug info
+(mostly types related) referenced by the bpf program.
+These two sections are discussed in
+details in :ref:`BTF_Type_String`.
+
+.. _BTF_Type_String:
+
+2. BTF Type and String Encoding
+*******************************
+
+The file ``include/uapi/linux/btf.h`` provides high
+level definition on how types/strings are encoded.
+
+The beginning of data blob must be::
+
+    struct btf_header {
+        __u16   magic;
+        __u8    version;
+        __u8    flags;
+        __u32   hdr_len;
+
+        /* All offsets are in bytes relative to the end of this header */
+        __u32   type_off;       /* offset of type section       */
+        __u32   type_len;       /* length of type section       */
+        __u32   str_off;        /* offset of string section     */
+        __u32   str_len;        /* length of string section     */
+    };
+
+The magic is ``0xeB9F``, which has different encoding for big and little
+endian system, and can be used to test whether BTF is generated for
+big or little endian target.
+The btf_header is designed to be extensible with hdr_len equal to
+``sizeof(struct btf_header)`` when the data blob is generated.
+
+2.1 String Encoding
+===================
+
+The first string in the string section must be a null string.
+The rest of string table is a concatenation of other null-treminated
+strings.
+
+2.2 Type Encoding
+=================
+
+The type id ``0`` is reserved for ``void`` type.
+The type section is parsed sequentially and the type id is assigned to
+each recognized type starting from id ``1``.
+Currently, the following types are supported::
+
+    #define BTF_KIND_INT            1       /* Integer      */
+    #define BTF_KIND_PTR            2       /* Pointer      */
+    #define BTF_KIND_ARRAY          3       /* Array        */
+    #define BTF_KIND_STRUCT         4       /* Struct       */
+    #define BTF_KIND_UNION          5       /* Union        */
+    #define BTF_KIND_ENUM           6       /* Enumeration  */
+    #define BTF_KIND_FWD            7       /* Forward      */
+    #define BTF_KIND_TYPEDEF        8       /* Typedef      */
+    #define BTF_KIND_VOLATILE       9       /* Volatile     */
+    #define BTF_KIND_CONST          10      /* Const        */
+    #define BTF_KIND_RESTRICT       11      /* Restrict     */
+    #define BTF_KIND_FUNC           12      /* Function     */
+    #define BTF_KIND_FUNC_PROTO     13      /* Function Proto       */
+
+Note that the type section encodes debug info, not just pure types.
+``BTF_KIND_FUNC`` is not a type, and it represents a defined subprogram.
+
+Each type contains the following common data::
+
+    struct btf_type {
+        __u32 name_off;
+        /* "info" bits arrangement
+         * bits  0-15: vlen (e.g. # of struct's members)
+         * bits 16-23: unused
+         * bits 24-27: kind (e.g. int, ptr, array...etc)
+         * bits 28-30: unused
+         * bit     31: kind_flag, currently used by
+         *             struct, union and fwd
+         */
+        __u32 info;
+        /* "size" is used by INT, ENUM, STRUCT and UNION.
+         * "size" tells the size of the type it is describing.
+         *
+         * "type" is used by PTR, TYPEDEF, VOLATILE, CONST, RESTRICT,
+         * FUNC and FUNC_PROTO.
+         * "type" is a type_id referring to another type.
+         */
+        union {
+                __u32 size;
+                __u32 type;
+        };
+    };
+
+For certain kinds, the common data are followed by kind specific data.
+The ``name_off`` in ``struct btf_type`` specifies the offset in the string table.
+The following details encoding of each kind.
+
+2.2.1 BTF_KIND_INT
+~~~~~~~~~~~~~~~~~~
+
+``struct btf_type`` encoding requirement:
+ * ``name_off``: any valid offset
+ * ``info.kind_flag``: 0
+ * ``info.kind``: BTF_KIND_INT
+ * ``info.vlen``: 0
+ * ``size``: the size of the int type in bytes.
+
+``btf_type`` is followed by a ``u32`` with following bits arrangement::
+
+  #define BTF_INT_ENCODING(VAL)   (((VAL) & 0x0f000000) >> 24)
+  #define BTF_INT_OFFSET(VAL)     (((VAL  & 0x00ff0000)) >> 16)
+  #define BTF_INT_BITS(VAL)       ((VAL)  & 0x000000ff)
+
+The ``BTF_INT_ENCODING`` has the following attributes::
+
+  #define BTF_INT_SIGNED  (1 << 0)
+  #define BTF_INT_CHAR    (1 << 1)
+  #define BTF_INT_BOOL    (1 << 2)
+
+The ``BTF_INT_ENCODING()`` provides extra information, signness,
+char, or bool, for the int type. The char and bool encoding
+are mostly useful for pretty print. At most one encoding can
+be specified for the int type.
+
+The ``BTF_INT_BITS()`` specifies the number of actual bits held by
+this int type. For example, a 4-bit bitfield encodes
+``BTF_INT_BITS()`` equals to 4. The ``btf_type.size * 8``
+must be equal to or greater than ``BTF_INT_BITS()`` for the type.
+The maximum value of ``BTF_INT_BITS()`` is 128.
+
+The ``BTF_INT_OFFSET()`` specifies the starting bit offset to
+calculate values for this int. For example, a bitfield struct
+member has
+
+ * btf member bit offset 100 from the start of the structure,
+ * btf member pointing to an int type,
+ * the int type has ``BTF_INT_OFFSET() = 2`` and ``BTF_INT_BITS() = 4``
+
+Then in the struct memory layout, this member will occupy
+``4`` bits starting from bits ``100 + 2 = 102``.
+
+Alternatively, the bitfield struct member can be the following to
+access the same bits as the above:
+
+ * btf member bit offset 102,
+ * btf member pointing to an int type,
+ * the int type has ``BTF_INT_OFFSET() = 0`` and ``BTF_INT_BITS() = 4``
+
+The original intention of ``BTF_INT_OFFSET()`` is to provide
+flexibility of bitfield encoding.
+Currently, both llvm and pahole generates ``BTF_INT_OFFSET() = 0``
+for all int types.
+
+2.2.2 BTF_KIND_PTR
+~~~~~~~~~~~~~~~~~~
+
+``struct btf_type`` encoding requirement:
+  * ``name_off``: 0
+  * ``info.kind_flag``: 0
+  * ``info.kind``: BTF_KIND_PTR
+  * ``info.vlen``: 0
+  * ``type``: the pointee type of the pointer
+
+No additional type data follow ``btf_type``.
+
+2.2.3 BTF_KIND_ARRAY
+~~~~~~~~~~~~~~~~~~~~
+
+``struct btf_type`` encoding requirement:
+  * ``name_off``: 0
+  * ``info.kind_flag``: 0
+  * ``info.kind``: BTF_KIND_ARRAY
+  * ``info.vlen``: 0
+  * ``size/type``: 0, not used
+
+btf_type is followed by one "struct btf_array"::
+
+    struct btf_array {
+        __u32   type;
+        __u32   index_type;
+        __u32   nelems;
+    };
+
+The ``struct btf_array`` encoding:
+  * ``type``: the element type
+  * ``index_type``: the index type
+  * ``nelems``: the number of elements for this array (``0`` is also allowed).
+
+The ``index_type`` can be any regular int types
+(u8, u16, u32, u64, unsigned __int128).
+The original design of including ``index_type`` follows dwarf
+which has a ``index_type`` for its array type.
+Currently in BTF, beyond type verification, the ``index_type`` is not used.
+
+The ``struct btf_array`` allows chaining through element type to represent
+multiple dimensional arrays. For example, ``int a[5][6]``, the following
+type system illustrates the chaining:
+
+  * [1]: int
+  * [2]: array, ``btf_array.type = [1]``, ``btf_array.nelems = 6``
+  * [3]: array, ``btf_array.type = [2]``, ``btf_array.nelems = 5``
+
+Currently, both pahole and llvm collapse multiple dimensional array
+into one dimensional array, e.g., ``a[5][6]``, the btf_array.nelems
+equal to ``30``. This is because the original use case is map pretty
+print where the whole array is dumped out so one dimensional array
+is enough. As more BTF usage is explored, pahole and llvm can be
+changed to generate proper chained representation for
+multiple dimensional arrays.
+
+2.2.4 BTF_KIND_STRUCT
+~~~~~~~~~~~~~~~~~~~~~
+2.2.5 BTF_KIND_UNION
+~~~~~~~~~~~~~~~~~~~~
+
+``struct btf_type`` encoding requirement:
+  * ``name_off``: 0 or offset to a valid C identifier
+  * ``info.kind_flag``: 0 or 1
+  * ``info.kind``: BTF_KIND_STRUCT or BTF_KIND_UNION
+  * ``info.vlen``: the number of struct/union members
+  * ``info.size``: the size of the struct/union in bytes
+
+``btf_type`` is followed by ``info.vlen`` number of ``struct btf_member``.::
+
+    struct btf_member {
+        __u32   name_off;
+        __u32   type;
+        __u32   offset;
+    };
+
+``struct btf_member`` encoding:
+  * ``name_off``: offset to a valid C identifier
+  * ``type``: the member type
+  * ``offset``: <see below>
+
+If the type info ``kind_flag`` is not set, the offset contains
+only bit offset of the member. Note that the base type of the
+bitfield can only be int or enum type. If the bitfield size
+is 32, the base type can be either int or enum type.
+If the bitfield size is not 32, the base type must be int,
+and int type ``BTF_INT_BITS()`` encodes the bitfield size.
+
+If the ``kind_flag`` is set, the ``btf_member.offset``
+contains both member bitfield size and bit offset. The
+bitfield size and bit offset are calculated as below.::
+
+  #define BTF_MEMBER_BITFIELD_SIZE(val)   ((val) >> 24)
+  #define BTF_MEMBER_BIT_OFFSET(val)      ((val) & 0xffffff)
+
+In this case, if the base type is an int type, it must
+be a regular int type:
+
+  * ``BTF_INT_OFFSET()`` must be 0.
+  * ``BTF_INT_BITS()`` must be equal to ``{1,2,4,8,16} * 8``.
+
+The following kernel patch introduced ``kind_flag`` and
+explained why both modes exist:
+
+  https://github.com/torvalds/linux/commit/9d5f9f701b1891466fb3dbb1806ad97716f95cc3#diff-fa650a64fdd3968396883d2fe8215ff3
+
+2.2.6 BTF_KIND_ENUM
+~~~~~~~~~~~~~~~~~~~
+
+``struct btf_type`` encoding requirement:
+  * ``name_off``: 0 or offset to a valid C identifier
+  * ``info.kind_flag``: 0
+  * ``info.kind``: BTF_KIND_ENUM
+  * ``info.vlen``: number of enum values
+  * ``size``: 4
+
+``btf_type`` is followed by ``info.vlen`` number of ``struct btf_enum``.::
+
+    struct btf_enum {
+        __u32   name_off;
+        __s32   val;
+    };
+
+The ``btf_enum`` encoding:
+  * ``name_off``: offset to a valid C identifier
+  * ``val``: any value
+
+2.2.7 BTF_KIND_FWD
+~~~~~~~~~~~~~~~~~~
+
+``struct btf_type`` encoding requirement:
+  * ``name_off``: offset to a valid C identifier
+  * ``info.kind_flag``: 0 for struct, 1 for union
+  * ``info.kind``: BTF_KIND_FWD
+  * ``info.vlen``: 0
+  * ``type``: 0
+
+No additional type data follow ``btf_type``.
+
+2.2.8 BTF_KIND_TYPEDEF
+~~~~~~~~~~~~~~~~~~~~~~
+
+``struct btf_type`` encoding requirement:
+  * ``name_off``: offset to a valid C identifier
+  * ``info.kind_flag``: 0
+  * ``info.kind``: BTF_KIND_TYPEDEF
+  * ``info.vlen``: 0
+  * ``type``: the type which can be referred by name at ``name_off``
+
+No additional type data follow ``btf_type``.
+
+2.2.9 BTF_KIND_VOLATILE
+~~~~~~~~~~~~~~~~~~~~~~~
+
+``struct btf_type`` encoding requirement:
+  * ``name_off``: 0
+  * ``info.kind_flag``: 0
+  * ``info.kind``: BTF_KIND_VOLATILE
+  * ``info.vlen``: 0
+  * ``type``: the type with ``volatile`` qualifier
+
+No additional type data follow ``btf_type``.
+
+2.2.10 BTF_KIND_CONST
+~~~~~~~~~~~~~~~~~~~~~
+
+``struct btf_type`` encoding requirement:
+  * ``name_off``: 0
+  * ``info.kind_flag``: 0
+  * ``info.kind``: BTF_KIND_CONST
+  * ``info.vlen``: 0
+  * ``type``: the type with ``const`` qualifier
+
+No additional type data follow ``btf_type``.
+
+2.2.11 BTF_KIND_RESTRICT
+~~~~~~~~~~~~~~~~~~~~~~~~
+
+``struct btf_type`` encoding requirement:
+  * ``name_off``: 0
+  * ``info.kind_flag``: 0
+  * ``info.kind``: BTF_KIND_RESTRICT
+  * ``info.vlen``: 0
+  * ``type``: the type with ``restrict`` qualifier
+
+No additional type data follow ``btf_type``.
+
+2.2.12 BTF_KIND_FUNC
+~~~~~~~~~~~~~~~~~~~~
+
+``struct btf_type`` encoding requirement:
+  * ``name_off``: offset to a valid C identifier
+  * ``info.kind_flag``: 0
+  * ``info.kind``: BTF_KIND_FUNC
+  * ``info.vlen``: 0
+  * ``type``: a BTF_KIND_FUNC_PROTO type
+
+No additional type data follow ``btf_type``.
+
+A BTF_KIND_FUNC defines, not a type, but a subprogram (function) whose
+signature is defined by ``type``. The subprogram is thus an instance of
+that type. The BTF_KIND_FUNC may in turn be referenced by a func_info in
+the :ref:`BTF_Ext_Section` (ELF) or in the arguments to
+:ref:`BPF_Prog_Load` (ABI).
+
+2.2.13 BTF_KIND_FUNC_PROTO
+~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+``struct btf_type`` encoding requirement:
+  * ``name_off``: 0
+  * ``info.kind_flag``: 0
+  * ``info.kind``: BTF_KIND_FUNC_PROTO
+  * ``info.vlen``: # of parameters
+  * ``type``: the return type
+
+``btf_type`` is followed by ``info.vlen`` number of ``struct btf_param``.::
+
+    struct btf_param {
+        __u32   name_off;
+        __u32   type;
+    };
+
+If a BTF_KIND_FUNC_PROTO type is referred by a BTF_KIND_FUNC type,
+then ``btf_param.name_off`` must point to a valid C identifier
+except for the possible last argument representing the variable
+argument. The btf_param.type refers to parameter type.
+
+If the function has variable arguments, the last parameter
+is encoded with ``name_off = 0`` and ``type = 0``.
+
+3. BTF Kernel API
+*****************
+
+The following bpf syscall command involves BTF:
+   * BPF_BTF_LOAD: load a blob of BTF data into kernel
+   * BPF_MAP_CREATE: map creation with btf key and value type info.
+   * BPF_PROG_LOAD: prog load with btf function and line info.
+   * BPF_BTF_GET_FD_BY_ID: get a btf fd
+   * BPF_OBJ_GET_INFO_BY_FD: btf, func_info, line_info
+     and other btf related info are returned.
+
+The workflow typically looks like:
+::
+
+  Application:
+      BPF_BTF_LOAD
+          |
+          v
+      BPF_MAP_CREATE and BPF_PROG_LOAD
+          |
+          V
+      ......
+
+  Introspection tool:
+      ......
+      BPF_{PROG,MAP}_GET_NEXT_ID (get prog/map id's)
+          |
+          V
+      BPF_{PROG,MAP}_GET_FD_BY_ID (get a prog/map fd)
+          |
+          V
+      BPF_OBJ_GET_INFO_BY_FD (get bpf_prog_info/bpf_map_info with btf_id)
+          |                                     |
+          V                                     |
+      BPF_BTF_GET_FD_BY_ID (get btf_fd)         |
+          |                                     |
+          V                                     |
+      BPF_OBJ_GET_INFO_BY_FD (get btf)          |
+          |                                     |
+          V                                     V
+      pretty print types, dump func signatures and line info, etc.
+
+
+3.1 BPF_BTF_LOAD
+================
+
+Load a blob of BTF data into kernel. A blob of data
+described in :ref:`BTF_Type_String`
+can be directly loaded into the kernel.
+A ``btf_fd`` returns to userspace.
+
+3.2 BPF_MAP_CREATE
+==================
+
+A map can be created with ``btf_fd`` and specified key/value type id.::
+
+    __u32   btf_fd;         /* fd pointing to a BTF type data */
+    __u32   btf_key_type_id;        /* BTF type_id of the key */
+    __u32   btf_value_type_id;      /* BTF type_id of the value */
+
+In libbpf, the map can be defined with extra annotation like below:
+::
+
+    struct bpf_map_def SEC("maps") btf_map = {
+        .type = BPF_MAP_TYPE_ARRAY,
+        .key_size = sizeof(int),
+        .value_size = sizeof(struct ipv_counts),
+        .max_entries = 4,
+    };
+    BPF_ANNOTATE_KV_PAIR(btf_map, int, struct ipv_counts);
+
+Here, the parameters for macro BPF_ANNOTATE_KV_PAIR are map name,
+key and value types for the map.
+During ELF parsing, libbpf is able to extract key/value type_id's
+and assigned them to BPF_MAP_CREATE attributes automatically.
+
+.. _BPF_Prog_Load:
+
+3.3 BPF_PROG_LOAD
+=================
+
+During prog_load, func_info and line_info can be passed to kernel with
+proper values for the following attributes:
+::
+
+    __u32           insn_cnt;
+    __aligned_u64   insns;
+    ......
+    __u32           prog_btf_fd;    /* fd pointing to BTF type data */
+    __u32           func_info_rec_size;     /* userspace bpf_func_info size */
+    __aligned_u64   func_info;      /* func info */
+    __u32           func_info_cnt;  /* number of bpf_func_info records */
+    __u32           line_info_rec_size;     /* userspace bpf_line_info size */
+    __aligned_u64   line_info;      /* line info */
+    __u32           line_info_cnt;  /* number of bpf_line_info records */
+
+The func_info and line_info are an array of below, respectively.::
+
+    struct bpf_func_info {
+        __u32   insn_off; /* [0, insn_cnt - 1] */
+        __u32   type_id;  /* pointing to a BTF_KIND_FUNC type */
+    };
+    struct bpf_line_info {
+        __u32   insn_off; /* [0, insn_cnt - 1] */
+        __u32   file_name_off; /* offset to string table for the filename */
+        __u32   line_off; /* offset to string table for the source line */
+        __u32   line_col; /* line number and column number */
+    };
+
+func_info_rec_size is the size of each func_info record, and line_info_rec_size
+is the size of each line_info record. Passing the record size to kernel make
+it possible to extend the record itself in the future.
+
+Below are requirements for func_info:
+  * func_info[0].insn_off must be 0.
+  * the func_info insn_off is in strictly increasing order and matches
+    bpf func boundaries.
+
+Below are requirements for line_info:
+  * the first insn in each func must points to a line_info record.
+  * the line_info insn_off is in strictly increasing order.
+
+For line_info, the line number and column number are defined as below:
+::
+
+    #define BPF_LINE_INFO_LINE_NUM(line_col)        ((line_col) >> 10)
+    #define BPF_LINE_INFO_LINE_COL(line_col)        ((line_col) & 0x3ff)
+
+3.4 BPF_{PROG,MAP}_GET_NEXT_ID
+
+In kernel, every loaded program, map or btf has a unique id.
+The id won't change during the life time of the program, map or btf.
+
+The bpf syscall command BPF_{PROG,MAP}_GET_NEXT_ID
+returns all id's, one for each command, to user space, for bpf
+program or maps,
+so the inspection tool can inspect all programs and maps.
+
+3.5 BPF_{PROG,MAP}_GET_FD_BY_ID
+
+The introspection tool cannot use id to get details about program or maps.
+A file descriptor needs to be obtained first for reference countin