From 590ce401c207fd944827eb5aa5e87d834eddb149 Mon Sep 17 00:00:00 2001 From: Sergio Paracuellos Date: Sun, 13 Jan 2019 09:56:48 +0100 Subject: dt-bindings: net: dsa: ksz9477: fix indentation for switch spi bindings Switch bindings for spi managed mode are using spaces instead of tabs. Fix them to get a file with a proper kernel indentation style. Reviewed-by: Florian Fainelli Signed-off-by: Sergio Paracuellos Signed-off-by: David S. Miller --- Documentation/devicetree/bindings/net/dsa/ksz.txt | 102 +++++++++++----------- 1 file changed, 51 insertions(+), 51 deletions(-) (limited to 'Documentation') diff --git a/Documentation/devicetree/bindings/net/dsa/ksz.txt b/Documentation/devicetree/bindings/net/dsa/ksz.txt index 0f407fb371ce..8d58c2a7de39 100644 --- a/Documentation/devicetree/bindings/net/dsa/ksz.txt +++ b/Documentation/devicetree/bindings/net/dsa/ksz.txt @@ -19,58 +19,58 @@ Examples: Ethernet switch connected via SPI to the host, CPU port wired to eth0: - eth0: ethernet@10001000 { - fixed-link { - speed = <1000>; - full-duplex; - }; - }; + eth0: ethernet@10001000 { + fixed-link { + speed = <1000>; + full-duplex; + }; + }; - spi1: spi@f8008000 { - pinctrl-0 = <&pinctrl_spi_ksz>; - cs-gpios = <&pioC 25 0>; - id = <1>; + spi1: spi@f8008000 { + pinctrl-0 = <&pinctrl_spi_ksz>; + cs-gpios = <&pioC 25 0>; + id = <1>; - ksz9477: ksz9477@0 { - compatible = "microchip,ksz9477"; - reg = <0>; + ksz9477: ksz9477@0 { + compatible = "microchip,ksz9477"; + reg = <0>; - spi-max-frequency = <44000000>; - spi-cpha; - spi-cpol; + spi-max-frequency = <44000000>; + spi-cpha; + spi-cpol; - ports { - #address-cells = <1>; - #size-cells = <0>; - port@0 { - reg = <0>; - label = "lan1"; - }; - port@1 { - reg = <1>; - label = "lan2"; - }; - port@2 { - reg = <2>; - label = "lan3"; - }; - port@3 { - reg = <3>; - label = "lan4"; - }; - port@4 { - reg = <4>; - label = "lan5"; - }; - port@5 { - reg = <5>; - label = "cpu"; - ethernet = <ð0>; - fixed-link { - speed = <1000>; - full-duplex; - }; - }; - }; - }; - }; + ports { + #address-cells = <1>; + #size-cells = <0>; + port@0 { + reg = <0>; + label = "lan1"; + }; + port@1 { + reg = <1>; + label = "lan2"; + }; + port@2 { + reg = <2>; + label = "lan3"; + }; + port@3 { + reg = <3>; + label = "lan4"; + }; + port@4 { + reg = <4>; + label = "lan5"; + }; + port@5 { + reg = <5>; + label = "cpu"; + ethernet = <ð0>; + fixed-link { + speed = <1000>; + full-duplex; + }; + }; + }; + }; + }; -- cgit v1.2.3 From ae5220c672180765615458ae54dbcff9abe6a01d Mon Sep 17 00:00:00 2001 From: Randy Dunlap Date: Sun, 13 Jan 2019 20:17:41 -0800 Subject: networking: Documentation: fix snmp_counters.rst Sphinx warnings Fix over 100 documentation warnings in snmp_counter.rst by extending the underline string lengths and inserting a blank line after bullet items. Examples: Documentation/networking/snmp_counter.rst:1: WARNING: Title overline too short. Documentation/networking/snmp_counter.rst:14: WARNING: Bullet list ends without a blank line; unexpected unindent. Fixes: 2b96547223e3 ("add document for TCP OFO, PAWS and skip ACK counters") Fixes: 8e2ea53a83df ("add snmp counters document") Fixes: 712ee16c230f ("add documents for snmp counters") Fixes: 80cc49507ba4 ("net: Add part of TCP counts explanations in snmp_counters.rst") Fixes: b08794a922c4 ("documentation of some IP/ICMP snmp counters") Signed-off-by: Randy Dunlap Cc: yupeng Signed-off-by: David S. Miller --- Documentation/networking/snmp_counter.rst | 113 +++++++++++++++++++++++------- 1 file changed, 86 insertions(+), 27 deletions(-) (limited to 'Documentation') diff --git a/Documentation/networking/snmp_counter.rst b/Documentation/networking/snmp_counter.rst index b0dfdaaca512..486ab33acc3a 100644 --- a/Documentation/networking/snmp_counter.rst +++ b/Documentation/networking/snmp_counter.rst @@ -1,16 +1,17 @@ -=========== +============ SNMP counter -=========== +============ This document explains the meaning of SNMP counters. General IPv4 counters -==================== +===================== All layer 4 packets and ICMP packets will change these counters, but these counters won't be changed by layer 2 packets (such as STP) or ARP packets. * IpInReceives + Defined in `RFC1213 ipInReceives`_ .. _RFC1213 ipInReceives: https://tools.ietf.org/html/rfc1213#page-26 @@ -23,6 +24,7 @@ and so on). It indicates the number of aggregated segments after GRO/LRO. * IpInDelivers + Defined in `RFC1213 ipInDelivers`_ .. _RFC1213 ipInDelivers: https://tools.ietf.org/html/rfc1213#page-28 @@ -33,6 +35,7 @@ supported protocols will be delivered, if someone listens on the raw socket, all valid IP packets will be delivered. * IpOutRequests + Defined in `RFC1213 ipOutRequests`_ .. _RFC1213 ipOutRequests: https://tools.ietf.org/html/rfc1213#page-28 @@ -42,6 +45,7 @@ multicast packets, and would always be updated together with IpExtOutOctets. * IpExtInOctets and IpExtOutOctets + They are Linux kernel extensions, no RFC definitions. Please note, RFC1213 indeed defines ifInOctets and ifOutOctets, but they are different things. The ifInOctets and ifOutOctets include the MAC @@ -49,6 +53,7 @@ layer header size but IpExtInOctets and IpExtOutOctets don't, they only include the IP layer header and the IP layer data. * IpExtInNoECTPkts, IpExtInECT1Pkts, IpExtInECT0Pkts, IpExtInCEPkts + They indicate the number of four kinds of ECN IP packets, please refer `Explicit Congestion Notification`_ for more details. @@ -60,6 +65,7 @@ for the same packet, you might find that IpInReceives count 1, but IpExtInNoECTPkts counts 2 or more. * IpInHdrErrors + Defined in `RFC1213 ipInHdrErrors`_. It indicates the packet is dropped due to the IP header error. It might happen in both IP input and IP forward paths. @@ -67,6 +73,7 @@ and IP forward paths. .. _RFC1213 ipInHdrErrors: https://tools.ietf.org/html/rfc1213#page-27 * IpInAddrErrors + Defined in `RFC1213 ipInAddrErrors`_. It will be increased in two scenarios: (1) The IP address is invalid. (2) The destination IP address is not a local address and IP forwarding is not enabled @@ -74,6 +81,7 @@ address is not a local address and IP forwarding is not enabled .. _RFC1213 ipInAddrErrors: https://tools.ietf.org/html/rfc1213#page-27 * IpExtInNoRoutes + This counter means the packet is dropped when the IP stack receives a packet and can't find a route for it from the route table. It might happen when IP forwarding is enabled and the destination IP address is @@ -81,6 +89,7 @@ not a local address and there is no route for the destination IP address. * IpInUnknownProtos + Defined in `RFC1213 ipInUnknownProtos`_. It will be increased if the layer 4 protocol is unsupported by kernel. If an application is using raw socket, kernel will always deliver the packet to the raw socket @@ -89,10 +98,12 @@ and this counter won't be increased. .. _RFC1213 ipInUnknownProtos: https://tools.ietf.org/html/rfc1213#page-27 * IpExtInTruncatedPkts + For IPv4 packet, it means the actual data size is smaller than the "Total Length" field in the IPv4 header. * IpInDiscards + Defined in `RFC1213 ipInDiscards`_. It indicates the packet is dropped in the IP receiving path and due to kernel internal reasons (e.g. no enough memory). @@ -100,20 +111,23 @@ enough memory). .. _RFC1213 ipInDiscards: https://tools.ietf.org/html/rfc1213#page-28 * IpOutDiscards + Defined in `RFC1213 ipOutDiscards`_. It indicates the packet is dropped in the IP sending path and due to kernel internal reasons. .. _RFC1213 ipOutDiscards: https://tools.ietf.org/html/rfc1213#page-28 * IpOutNoRoutes + Defined in `RFC1213 ipOutNoRoutes`_. It indicates the packet is dropped in the IP sending path and no route is found for it. .. _RFC1213 ipOutNoRoutes: https://tools.ietf.org/html/rfc1213#page-29 ICMP counters -============ +============= * IcmpInMsgs and IcmpOutMsgs + Defined by `RFC1213 icmpInMsgs`_ and `RFC1213 icmpOutMsgs`_ .. _RFC1213 icmpInMsgs: https://tools.ietf.org/html/rfc1213#page-41 @@ -126,6 +140,7 @@ IcmpOutMsgs would still be updated if the IP header is constructed by a userspace program. * ICMP named types + | These counters include most of common ICMP types, they are: | IcmpInDestUnreachs: `RFC1213 icmpInDestUnreachs`_ | IcmpInTimeExcds: `RFC1213 icmpInTimeExcds`_ @@ -180,6 +195,7 @@ straightforward. The 'In' counter means kernel receives such a packet and the 'Out' counter means kernel sends such a packet. * ICMP numeric types + They are IcmpMsgInType[N] and IcmpMsgOutType[N], the [N] indicates the ICMP type number. These counters track all kinds of ICMP packets. The ICMP type number definition could be found in the `ICMP parameters`_ @@ -192,12 +208,14 @@ IcmpMsgOutType8 would increase 1. And if kernel gets an ICMP Echo Reply packet, IcmpMsgInType0 would increase 1. * IcmpInCsumErrors + This counter indicates the checksum of the ICMP packet is wrong. Kernel verifies the checksum after updating the IcmpInMsgs and before updating IcmpMsgInType[N]. If a packet has bad checksum, the IcmpInMsgs would be updated but none of IcmpMsgInType[N] would be updated. * IcmpInErrors and IcmpOutErrors + Defined by `RFC1213 icmpInErrors`_ and `RFC1213 icmpOutErrors`_ .. _RFC1213 icmpInErrors: https://tools.ietf.org/html/rfc1213#page-41 @@ -209,7 +227,7 @@ and the sending packet path use IcmpOutErrors. When IcmpInCsumErrors is increased, IcmpInErrors would always be increased too. relationship of the ICMP counters -------------------------------- +--------------------------------- The sum of IcmpMsgOutType[N] is always equal to IcmpOutMsgs, as they are updated at the same time. The sum of IcmpMsgInType[N] plus IcmpInErrors should be equal or larger than IcmpInMsgs. When kernel @@ -229,8 +247,9 @@ IcmpInMsgs should be less than the sum of IcmpMsgOutType[N] plus IcmpInErrors. General TCP counters -================== +==================== * TcpInSegs + Defined in `RFC1213 tcpInSegs`_ .. _RFC1213 tcpInSegs: https://tools.ietf.org/html/rfc1213#page-48 @@ -247,6 +266,7 @@ isn't aware of GRO. So if two packets are merged by GRO, the TcpInSegs counter would only increase 1. * TcpOutSegs + Defined in `RFC1213 tcpOutSegs`_ .. _RFC1213 tcpOutSegs: https://tools.ietf.org/html/rfc1213#page-48 @@ -258,6 +278,7 @@ GSO, so if a packet would be split to 2 by GSO, TcpOutSegs will increase 2. * TcpActiveOpens + Defined in `RFC1213 tcpActiveOpens`_ .. _RFC1213 tcpActiveOpens: https://tools.ietf.org/html/rfc1213#page-47 @@ -267,6 +288,7 @@ state. Every time TcpActiveOpens increases 1, TcpOutSegs should always increase 1. * TcpPassiveOpens + Defined in `RFC1213 tcpPassiveOpens`_ .. _RFC1213 tcpPassiveOpens: https://tools.ietf.org/html/rfc1213#page-47 @@ -275,6 +297,7 @@ It means the TCP layer receives a SYN, replies a SYN+ACK, come into the SYN-RCVD state. * TcpExtTCPRcvCoalesce + When packets are received by the TCP layer and are not be read by the application, the TCP layer will try to merge them. This counter indicate how many packets are merged in such situation. If GRO is @@ -282,12 +305,14 @@ enabled, lots of packets would be merged by GRO, these packets wouldn't be counted to TcpExtTCPRcvCoalesce. * TcpExtTCPAutoCorking + When sending packets, the TCP layer will try to merge small packets to a bigger one. This counter increase 1 for every packet merged in such situation. Please refer to the LWN article for more details: https://lwn.net/Articles/576263/ * TcpExtTCPOrigDataSent + This counter is explained by `kernel commit f19c29e3e391`_, I pasted the explaination below:: @@ -297,6 +322,7 @@ explaination below:: more useful to track the TCP retransmission rate. * TCPSynRetrans + This counter is explained by `kernel commit f19c29e3e391`_, I pasted the explaination below:: @@ -304,6 +330,7 @@ explaination below:: retransmissions into SYN, fast-retransmits, timeout retransmits, etc. * TCPFastOpenActiveFail + This counter is explained by `kernel commit f19c29e3e391`_, I pasted the explaination below:: @@ -313,6 +340,7 @@ explaination below:: .. _kernel commit f19c29e3e391: https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/commit/?id=f19c29e3e391a66a273e9afebaf01917245148cd * TcpExtListenOverflows and TcpExtListenDrops + When kernel receives a SYN from a client, and if the TCP accept queue is full, kernel will drop the SYN and add 1 to TcpExtListenOverflows. At the same time kernel will also add 1 to TcpExtListenDrops. When a @@ -337,7 +365,7 @@ to the accept queue. TCP Fast Open -============ +============= When kernel receives a TCP packet, it has two paths to handler the packet, one is fast path, another is slow path. The comment in kernel code provides a good explanation of them, I pasted them below:: @@ -370,22 +398,24 @@ will disable the fast path at first, and try to enable it after kernel receives packets. * TcpExtTCPPureAcks and TcpExtTCPHPAcks + If a packet set ACK flag and has no data, it is a pure ACK packet, if kernel handles it in the fast path, TcpExtTCPHPAcks will increase 1, if kernel handles it in the slow path, TcpExtTCPPureAcks will increase 1. * TcpExtTCPHPHits + If a TCP packet has data (which means it is not a pure ACK packet), and this packet is handled in the fast path, TcpExtTCPHPHits will increase 1. TCP abort -======== - +========= * TcpExtTCPAbortOnData + It means TCP layer has data in flight, but need to close the connection. So TCP layer sends a RST to the other side, indicate the connection is not closed very graceful. An easy way to increase this @@ -404,11 +434,13 @@ when the application closes a connection, kernel will send a RST immediately and increase the TcpExtTCPAbortOnData counter. * TcpExtTCPAbortOnClose + This counter means the application has unread data in the TCP layer when the application wants to close the TCP connection. In such a situation, kernel will send a RST to the other side of the TCP connection. * TcpExtTCPAbortOnMemory + When an application closes a TCP connection, kernel still need to track the connection, let it complete the TCP disconnect process. E.g. an app calls the close method of a socket, kernel sends fin to the other @@ -430,10 +462,12 @@ the tcp_mem. Please refer the tcp_mem section in the `TCP man page`_: * TcpExtTCPAbortOnTimeout + This counter will increase when any of the TCP timers expire. In such situation, kernel won't send RST, just give up the connection. * TcpExtTCPAbortOnLinger + When a TCP connection comes into FIN_WAIT_2 state, instead of waiting for the fin packet from the other side, kernel could send a RST and delete the socket immediately. This is not the default behavior of @@ -441,6 +475,7 @@ Linux kernel TCP stack. By configuring the TCP_LINGER2 socket option, you could let kernel follow this behavior. * TcpExtTCPAbortFailed + The kernel TCP layer will send RST if the `RFC2525 2.17 section`_ is satisfied. If an internal error occurs during this process, TcpExtTCPAbortFailed will be increased. @@ -448,7 +483,7 @@ TcpExtTCPAbortFailed will be increased. .. _RFC2525 2.17 section: https://tools.ietf.org/html/rfc2525#page-50 TCP Hybrid Slow Start -==================== +===================== The Hybrid Slow Start algorithm is an enhancement of the traditional TCP congestion window Slow Start algorithm. It uses two pieces of information to detect whether the max bandwidth of the TCP path is @@ -464,23 +499,27 @@ relate with the Hybrid Slow Start algorithm. .. _Hybrid Slow Start paper: https://pdfs.semanticscholar.org/25e9/ef3f03315782c7f1cbcd31b587857adae7d1.pdf * TcpExtTCPHystartTrainDetect + How many times the ACK train length threshold is detected * TcpExtTCPHystartTrainCwnd + The sum of CWND detected by ACK train length. Dividing this value by TcpExtTCPHystartTrainDetect is the average CWND which detected by the ACK train length. * TcpExtTCPHystartDelayDetect + How many times the packet delay threshold is detected. * TcpExtTCPHystartDelayCwnd + The sum of CWND detected by packet delay. Dividing this value by TcpExtTCPHystartDelayDetect is the average CWND which detected by the packet delay. TCP retransmission and congestion control -====================================== +========================================= The TCP protocol has two retransmission mechanisms: SACK and fast recovery. They are exclusive with each other. When SACK is enabled, the kernel TCP stack would use SACK, or kernel would use fast @@ -499,12 +538,14 @@ https://pdfs.semanticscholar.org/0e9c/968d09ab2e53e24c4dca5b2d67c7f7140f8e.pdf .. _RFC6582: https://tools.ietf.org/html/rfc6582 * TcpExtTCPRenoRecovery and TcpExtTCPSackRecovery + When the congestion control comes into Recovery state, if sack is used, TcpExtTCPSackRecovery increases 1, if sack is not used, TcpExtTCPRenoRecovery increases 1. These two counters mean the TCP stack begins to retransmit the lost packets. * TcpExtTCPSACKReneging + A packet was acknowledged by SACK, but the receiver has dropped this packet, so the sender needs to retransmit this packet. In this situation, the sender adds 1 to TcpExtTCPSACKReneging. A receiver @@ -515,6 +556,7 @@ the RTO expires for this packet, then the sender assumes this packet has been dropped by the receiver. * TcpExtTCPRenoReorder + The reorder packet is detected by fast recovery. It would only be used if SACK is disabled. The fast recovery algorithm detects recorder by the duplicate ACK number. E.g., if retransmission is triggered, and @@ -525,6 +567,7 @@ order packet. Thus the sender would find more ACks than its expectation, and the sender knows out of order occurs. * TcpExtTCPTSReorder + The reorder packet is detected when a hole is filled. E.g., assume the sender sends packet 1,2,3,4,5, and the receiving order is 1,2,4,5,3. When the sender receives the ACK of packet 3 (which will @@ -534,6 +577,7 @@ fill the hole), two conditions will let TcpExtTCPTSReorder increase than the retransmission timestamp. * TcpExtTCPSACKReorder + The reorder packet detected by SACK. The SACK has two methods to detect reorder: (1) DSACK is received by the sender. It means the sender sends the same packet more than one times. And the only reason @@ -558,39 +602,46 @@ sender side. .. _RFC2883 : https://tools.ietf.org/html/rfc2883 * TcpExtTCPDSACKOldSent + The TCP stack receives a duplicate packet which has been acked, so it sends a DSACK to the sender. * TcpExtTCPDSACKOfoSent + The TCP stack receives an out of order duplicate packet, so it sends a DSACK to the sender. * TcpExtTCPDSACKRecv + The TCP stack receives a DSACK, which indicate an acknowledged duplicate packet is received. * TcpExtTCPDSACKOfoRecv + The TCP stack receives a DSACK, which indicate an out of order duplicate packet is received. TCP out of order -=============== +================ * TcpExtTCPOFOQueue + The TCP layer receives an out of order packet and has enough memory to queue it. * TcpExtTCPOFODrop + The TCP layer receives an out of order packet but doesn't have enough memory, so drops it. Such packets won't be counted into TcpExtTCPOFOQueue. * TcpExtTCPOFOMerge + The received out of order packet has an overlay with the previous packet. the overlay part will be dropped. All of TcpExtTCPOFOMerge packets will also be counted into TcpExtTCPOFOQueue. TCP PAWS -======= +======== PAWS (Protection Against Wrapped Sequence numbers) is an algorithm which is used to drop old packets. It depends on the TCP timestamps. For detail information, please refer the `timestamp wiki`_ @@ -600,13 +651,15 @@ and the `RFC of PAWS`_. .. _timestamp wiki: https://en.wikipedia.org/wiki/Transmission_Control_Protocol#TCP_timestamps * TcpExtPAWSActive + Packets are dropped by PAWS in Syn-Sent status. * TcpExtPAWSEstab + Packets are dropped by PAWS in any status other than Syn-Sent. TCP ACK skip -=========== +============ In some scenarios, kernel would avoid sending duplicate ACKs too frequently. Please find more details in the tcp_invalid_ratelimit section of the `sysctl document`_. When kernel decides to skip an ACK @@ -618,6 +671,7 @@ it has no data. .. _sysctl document: https://www.kernel.org/doc/Documentation/networking/ip-sysctl.txt * TcpExtTCPACKSkippedSynRecv + The ACK is skipped in Syn-Recv status. The Syn-Recv status means the TCP stack receives a SYN and replies SYN+ACK. Now the TCP stack is waiting for an ACK. Generally, the TCP stack doesn't need to send ACK @@ -631,6 +685,7 @@ increase TcpExtTCPACKSkippedSynRecv. * TcpExtTCPACKSkippedPAWS + The ACK is skipped due to PAWS (Protect Against Wrapped Sequence numbers) check fails. If the PAWS check fails in Syn-Recv, Fin-Wait-2 or Time-Wait statuses, the skipped ACK would be counted to @@ -639,18 +694,22 @@ TcpExtTCPACKSkippedTimeWait. In all other statuses, the skipped ACK would be counted to TcpExtTCPACKSkippedPAWS. * TcpExtTCPACKSkippedSeq + The sequence number is out of window and the timestamp passes the PAWS check and the TCP status is not Syn-Recv, Fin-Wait-2, and Time-Wait. * TcpExtTCPACKSkippedFinWait2 + The ACK is skipped in Fin-Wait-2 status, the reason would be either PAWS check fails or the received sequence number is out of window. * TcpExtTCPACKSkippedTimeWait + Tha ACK is skipped in Time-Wait status, the reason would be either PAWS check failed or the received sequence number is out of window. * TcpExtTCPACKSkippedChallenge + The ACK is skipped if the ACK is a challenge ACK. The RFC 5961 defines 3 kind of challenge ACK, please refer `RFC 5961 section 3.2`_, `RFC 5961 section 4.2`_ and `RFC 5961 section 5.2`_. Besides these @@ -664,10 +723,10 @@ unacknowledged number (more strict than `RFC 5961 section 5.2`_). examples -======= +======== ping test --------- +--------- Run the ping command against the public dns server 8.8.8.8:: nstatuser@nstat-a:~$ ping 8.8.8.8 -c 1 @@ -711,7 +770,7 @@ and its corresponding Echo Reply packet are constructed by: So the IpExtInOctets and IpExtOutOctets are 20+16+48=84. tcp 3-way handshake ------------------- +------------------- On server side, we run:: nstatuser@nstat-b:~$ nc -lknv 0.0.0.0 9000 @@ -753,7 +812,7 @@ ACK, so client sent 2 packets, received 1 packet, TcpInSegs increased 1, TcpOutSegs increased 2. TCP normal traffic ------------------ +------------------ Run nc on server:: nstatuser@nstat-b:~$ nc -lkv 0.0.0.0 9000 @@ -876,7 +935,7 @@ and the packet received from client qualified for fast path, so it was counted into 'TcpExtTCPHPHits'. TcpExtTCPAbortOnClose --------------------- +--------------------- On the server side, we run below python script:: import socket @@ -910,7 +969,7 @@ If we run tcpdump on the server side, we could find the server sent a RST after we type Ctrl-C. TcpExtTCPAbortOnMemory and TcpExtTCPAbortOnTimeout ------------------------------------------------ +--------------------------------------------------- Below is an example which let the orphan socket count be higher than net.ipv4.tcp_max_orphans. Change tcp_max_orphans to a smaller value on client:: @@ -1032,7 +1091,7 @@ FIN_WAIT_1 state finally. So we wait for a few minutes, we could find TcpExtTCPAbortOnTimeout 10 0.0 TcpExtTCPAbortOnLinger ---------------------- +---------------------- The server side code:: nstatuser@nstat-b:~$ cat server_linger.py @@ -1077,7 +1136,7 @@ After run client_linger.py, check the output of nstat:: TcpExtTCPAbortOnLinger 1 0.0 TcpExtTCPRcvCoalesce -------------------- +-------------------- On the server, we run a program which listen on TCP port 9000, but doesn't read any data:: @@ -1137,7 +1196,7 @@ the receiving queue. So the TCP layer merged the two packets, and we could find the TcpExtTCPRcvCoalesce increased 1. TcpExtListenOverflows and TcpExtListenDrops ----------------------------------------- +------------------------------------------- On server, run the nc command, listen on port 9000:: nstatuser@nstat-b:~$ nc -lkv 0.0.0.0 9000 @@ -1185,7 +1244,7 @@ TcpExtListenOverflows and TcpExtListenDrops would be larger, because the SYN of the 4th nc was dropped, the client was retrying. IpInAddrErrors, IpExtInNoRoutes and IpOutNoRoutes ----------------------------------------------- +------------------------------------------------- server A IP address: 192.168.122.250 server B IP address: 192.168.122.251 Prepare on server A, add a route to server B:: @@ -1280,7 +1339,7 @@ a route for the 8.8.8.8 IP address, so server B increased IpOutNoRoutes. TcpExtTCPACKSkippedSynRecv ------------------------- +-------------------------- In this test, we send 3 same SYN packets from client to server. The first SYN will let server create a socket, set it to Syn-Recv status, and reply a SYN/ACK. The second SYN will let server reply the SYN/ACK @@ -1328,7 +1387,7 @@ Check snmp cunter on nstat-b:: As we expected, TcpExtTCPACKSkippedSynRecv is 1. TcpExtTCPACKSkippedPAWS ----------------------- +----------------------- To trigger PAWS, we could send an old SYN. On nstat-b, let nc listen on port 9000:: @@ -1365,7 +1424,7 @@ failed, the nstat-b replied an ACK for the first SYN, skipped the ACK for the second SYN, and updated TcpExtTCPACKSkippedPAWS. TcpExtTCPACKSkippedSeq --------------------- +---------------------- To trigger TcpExtTCPACKSkippedSeq, we send packets which have valid timestamp (to pass PAWS check) but the sequence number is out of window. The linux TCP stack would avoid to skip if the packet has -- cgit v1.2.3 From 44543f1dd2a39d56c9afdc3778aa050b5a4725b4 Mon Sep 17 00:00:00 2001 From: Florian Fainelli Date: Tue, 15 Jan 2019 14:35:02 -0800 Subject: Documentation: networking: dsa: Update documentation Since 83c0afaec7b7 ("net: dsa: Add new binding implementation"), DSA is no longer a platform device exclusively and can support registering DSA switches from other bus drivers (PCI, USB, I2C, etc.). Signed-off-by: Florian Fainelli Reviewed-by: Andrew Lunn Signed-off-by: David S. Miller --- Documentation/networking/dsa/dsa.txt | 13 ------------- 1 file changed, 13 deletions(-) (limited to 'Documentation') diff --git a/Documentation/networking/dsa/dsa.txt b/Documentation/networking/dsa/dsa.txt index 25170ad7d25b..1000b821681c 100644 --- a/Documentation/networking/dsa/dsa.txt +++ b/Documentation/networking/dsa/dsa.txt @@ -236,19 +236,6 @@ description. Design limitations ================== -DSA is a platform device driver -------------------------------- - -DSA is implemented as a DSA platform device driver which is convenient because -it will register the entire DSA switch tree attached to a master network device -in one-shot, facilitating the device creation and simplifying the device driver -model a bit, this comes however with a number of limitations: - -- building DSA and its switch drivers as modules is currently not working -- the device driver parenting does not necessarily reflect the original - bus/device the switch can be created from -- supporting non-MDIO and non-MMIO (platform) switches is not possible - Limits on the number of devices and ports ----------------------------------------- -- cgit v1.2.3 From 6685987c29582afc79b7fa3998dfbf36b4295791 Mon Sep 17 00:00:00 2001 From: Petr Machata Date: Wed, 16 Jan 2019 23:06:56 +0000 Subject: switchdev: Add extack argument to call_switchdev_notifiers() A follow-up patch will enable vetoing of FDB entries. Make it possible to communicate details of why an FDB entry is not acceptable back to the user. Signed-off-by: Petr Machata Signed-off-by: David S. Miller --- Documentation/networking/switchdev.txt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'Documentation') diff --git a/Documentation/networking/switchdev.txt b/Documentation/networking/switchdev.txt index 82236a17b5e6..f3244d87512a 100644 --- a/Documentation/networking/switchdev.txt +++ b/Documentation/networking/switchdev.txt @@ -196,7 +196,7 @@ The switch device will learn/forget source MAC address/VLAN on ingress packets and notify the switch driver of the mac/vlan/port tuples. The switch driver, in turn, will notify the bridge driver using the switchdev notifier call: - err = call_switchdev_notifiers(val, dev, info); + err = call_switchdev_notifiers(val, dev, info, extack); Where val is SWITCHDEV_FDB_ADD when learning and SWITCHDEV_FDB_DEL when forgetting, and info points to a struct switchdev_notifier_fdb_info. On -- cgit v1.2.3 From b8c45a033acc607201588f7665ba84207e5149e0 Mon Sep 17 00:00:00 2001 From: Aya Levin Date: Thu, 17 Jan 2019 23:59:20 +0200 Subject: devlink: Add Documentation/networking/devlink-health.txt This patch adds a new file to add information about devlink health mechanism. Signed-off-by: Aya Levin Signed-off-by: Eran Ben Elisha Signed-off-by: David S. Miller --- Documentation/networking/devlink-health.txt | 86 +++++++++++++++++++++++++++++ 1 file changed, 86 insertions(+) create mode 100644 Documentation/networking/devlink-health.txt (limited to 'Documentation') diff --git a/Documentation/networking/devlink-health.txt b/Documentation/networking/devlink-health.txt new file mode 100644 index 000000000000..1db3fbea0831 --- /dev/null +++ b/Documentation/networking/devlink-health.txt @@ -0,0 +1,86 @@ +The health mechanism is targeted for Real Time Alerting, in order to know when +something bad had happened to a PCI device +- Provide alert debug information +- Self healing +- If problem needs vendor support, provide a way to gather all needed debugging + information. + +The main idea is to unify and centralize driver health reports in the +generic devlink instance and allow the user to set different +attributes of the health reporting and recovery procedures. + +The devlink health reporter: +Device driver creates a "health reporter" per each error/health type. +Error/Health type can be a known/generic (eg pci error, fw error, rx/tx error) +or unknown (driver specific). +For each registered health reporter a driver can issue error/health reports +asynchronously. All health reports handling is done by devlink. +Device driver can provide specific callbacks for each "health reporter", e.g. + - Recovery procedures + - Diagnostics and object dump procedures + - OOB initial parameters +Different parts of the driver can register different types of health reporters +with different handlers. + +Once an error is reported, devlink health will do the following actions: + * A log is being send to the kernel trace events buffer + * Health status and statistics are being updated for the reporter instance + * Object dump is being taken and saved at the reporter instance (as long as + there is no other dump which is already stored) + * Auto recovery attempt is being done. Depends on: + - Auto-recovery configuration + - Grace period vs. time passed since last recover + +The user interface: +User can access/change each reporter's parameters and driver specific callbacks +via devlink, e.g per error type (per health reporter) + - Configure reporter's generic parameters (like: disable/enable auto recovery) + - Invoke recovery procedure + - Run diagnostics + - Object dump + +The devlink health interface (via netlink): +DEVLINK_CMD_HEALTH_REPORTER_GET + Retrieves status and configuration info per DEV and reporter. +DEVLINK_CMD_HEALTH_REPORTER_SET + Allows reporter-related configuration setting. +DEVLINK_CMD_HEALTH_REPORTER_RECOVER + Triggers a reporter's recovery procedure. +DEVLINK_CMD_HEALTH_REPORTER_DIAGNOSE + Retrieves diagnostics data from a reporter on a device. +DEVLINK_CMD_HEALTH_REPORTER_DUMP_GET + Retrieves the last stored dump. Devlink health + saves a single dump. If an dump is not already stored by the devlink + for this reporter, devlink generates a new dump. + dump output is defined by the reporter. +DEVLINK_CMD_HEALTH_REPORTER_DUMP_CLEAR + Clears the last saved dump file for the specified reporter. + + + netlink + +--------------------------+ + | | + | + | + | | | + +--------------------------+ + |request for ops + |(diagnose, + mlx5_core devlink |recover, + |dump) ++--------+ +--------------------------+ +| | | reporter| | +| | | +---------v----------+ | +| | ops execution | | | | +| <----------------------------------+ | | +| | | | | | +| | | + ^------------------+ | +| | | | request for ops | +| | | | (recover, dump) | +| | | | | +| | | +-+------------------+ | +| | health report | | health handler | | +| +-------------------------------> | | +| | | +--------------------+ | +| | health reporter create | | +| +----------------------------> | ++--------+ +--------------------------+ -- cgit v1.2.3 From 856c395cfa63b94a1d8215182f0243c222f6f927 Mon Sep 17 00:00:00 2001 From: Cong Wang Date: Thu, 17 Jan 2019 23:27:11 -0800 Subject: net: introduce a knob to control whether to inherit devconf config There have been many people complaining about the inconsistent behaviors of IPv4 and IPv6 devconf when creating new network namespaces. Currently, for IPv4, we inherit all current settings from init_net, but for IPv6 we reset all setting to default. This patch introduces a new /proc file /proc/sys/net/core/devconf_inherit_init_net to control the behavior of whether to inhert sysctl current settings from init_net. This file itself is only available in init_net. As demonstrated below: Initial setup in init_net: # cat /proc/sys/net/ipv4/conf/all/rp_filter 2 # cat /proc/sys/net/ipv6/conf/all/accept_dad 1 Default value 0 (current behavior): # ip netns del test # ip netns add test # ip netns exec test cat /proc/sys/net/ipv4/conf/all/rp_filter 2 # ip netns exec test cat /proc/sys/net/ipv6/conf/all/accept_dad 0 Set to 1 (inherit from init_net): # echo 1 > /proc/sys/net/core/devconf_inherit_init_net # ip netns del test # ip netns add test # ip netns exec test cat /proc/sys/net/ipv4/conf/all/rp_filter 2 # ip netns exec test cat /proc/sys/net/ipv6/conf/all/accept_dad 1 Set to 2 (reset to default): # echo 2 > /proc/sys/net/core/devconf_inherit_init_net # ip netns del test # ip netns add test # ip netns exec test cat /proc/sys/net/ipv4/conf/all/rp_filter 0 # ip netns exec test cat /proc/sys/net/ipv6/conf/all/accept_dad 0 Set to a value out of range (invalid): # echo 3 > /proc/sys/net/core/devconf_inherit_init_net -bash: echo: write error: Invalid argument # echo -1 > /proc/sys/net/core/devconf_inherit_init_net -bash: echo: write error: Invalid argument Reported-by: Zhu Yanjun Reported-by: Tonghao Zhang Cc: Nicolas Dichtel Signed-off-by: Cong Wang Acked-by: Nicolas Dichtel Acked-by: Tonghao Zhang Signed-off-by: David S. Miller --- Documentation/sysctl/net.txt | 14 ++++++++++++++ 1 file changed, 14 insertions(+) (limited to 'Documentation') diff --git a/Documentation/sysctl/net.txt b/Documentation/sysctl/net.txt index 2793d4eac55f..bc0680706870 100644 --- a/Documentation/sysctl/net.txt +++ b/Documentation/sysctl/net.txt @@ -291,6 +291,20 @@ user space is responsible for creating them if needed. Default : 0 (for compatibility reasons) +devconf_inherit_init_net +---------------------------- + +Controls if a new network namespace should inherit all current +settings under /proc/sys/net/{ipv4,ipv6}/conf/{all,default}/. By +default, we keep the current behavior: for IPv4 we inherit all current +settings from init_net and for IPv6 we reset all settings to default. + +If set to 1, both IPv4 and IPv6 settings are forced to inherit from +current ones in init_net. If set to 2, both IPv4 and IPv6 settings are +forced to reset to their default values. + +Default : 0 (for compatibility reasons) + 2. /proc/sys/net/unix - Parameters for Unix domain sockets ------------------------------------------------------- -- cgit v1.2.3 From 00f1ee5361c3f644133ef2d19d2c340d2a730f1d Mon Sep 17 00:00:00 2001 From: Vinod Koul Date: Mon, 21 Jan 2019 14:43:14 +0530 Subject: dt-bindings: net: Add Qualcomm ethqos binding Add support for Qualcomm ethqos found in some SoCs like QCS404. Signed-off-by: Vinod Koul Reviewed-by: Rob Herring Signed-off-by: David S. Miller --- .../devicetree/bindings/net/qcom,ethqos.txt | 64 ++++++++++++++++++++++ 1 file changed, 64 insertions(+) create mode 100644 Documentation/devicetree/bindings/net/qcom,ethqos.txt (limited to 'Documentation') diff --git a/Documentation/devicetree/bindings/net/qcom,ethqos.txt b/Documentation/devicetree/bindings/net/qcom,ethqos.txt new file mode 100644 index 000000000000..fcf5035810b5 --- /dev/null +++ b/Documentation/devicetree/bindings/net/qcom,ethqos.txt @@ -0,0 +1,64 @@ +Qualcomm Ethernet ETHQOS device + +This documents dwmmac based ethernet device which supports Gigabit +ethernet for version v2.3.0 onwards. + +This device has following properties: + +Required properties: + +- compatible: Should be qcom,qcs404-ethqos" + +- reg: Address and length of the register set for the device + +- reg-names: Should contain register names "stmmaceth", "rgmii" + +- clocks: Should contain phandle to clocks + +- clock-names: Should contain clock names "stmmaceth", "pclk", + "ptp_ref", "rgmii" + +- interrupts: Should contain phandle to interrupts + +- interrupt-names: Should contain interrupt names "macirq", "eth_lpi" + +Rest of the properties are defined in stmmac.txt file in same directory + + +Example: + +ethernet: ethernet@7a80000 { + compatible = "qcom,qcs404-ethqos"; + reg = <0x07a80000 0x10000>, + <0x07a96000 0x100>; + reg-names = "stmmaceth", "rgmii"; + clock-names = "stmmaceth", "pclk", "ptp_ref", "rgmii"; + clocks = <&gcc GCC_ETH_AXI_CLK>, + <&gcc GCC_ETH_SLAVE_AHB_CLK>, + <&gcc GCC_ETH_PTP_CLK>, + <&gcc GCC_ETH_RGMII_CLK>; + interrupts = , + ; + interrupt-names = "macirq", "eth_lpi"; + snps,reset-gpio = <&tlmm 60 GPIO_ACTIVE_LOW>; + snps,reset-active-low; + + snps,txpbl = <8>; + snps,rxpbl = <2>; + snps,aal; + snps,tso; + + phy-handle = <&phy1>; + phy-mode = "rgmii"; + + mdio { + #address-cells = <0x1>; + #size-cells = <0x0>; + compatible = "snps,dwmac-mdio"; + phy1: phy@4 { + device_type = "ethernet-phy"; + reg = <0x4>; + }; + }; + +}; -- cgit v1.2.3 From 5ff2698b3301c37246f1f79dc9bdcd378b000dbe Mon Sep 17 00:00:00 2001 From: Yangbo Lu Date: Mon, 21 Jan 2019 18:41:40 +0800 Subject: dt-binding: ptp_qoriq: document "fsl,extts-fifo" property Documented "fsl,extts-fifo" property. Signed-off-by: Yangbo Lu Reviewed-by: Rob Herring Signed-off-by: David S. Miller --- Documentation/devicetree/bindings/ptp/ptp-qoriq.txt | 2 ++ 1 file changed, 2 insertions(+) (limited to 'Documentation') diff --git a/Documentation/devicetree/bindings/ptp/ptp-qoriq.txt b/Documentation/devicetree/bindings/ptp/ptp-qoriq.txt index c5d0e7998e2b..8e7f8551d190 100644 --- a/Documentation/devicetree/bindings/ptp/ptp-qoriq.txt +++ b/Documentation/devicetree/bindings/ptp/ptp-qoriq.txt @@ -17,6 +17,8 @@ Clock Properties: - fsl,tmr-fiper1 Fixed interval period pulse generator. - fsl,tmr-fiper2 Fixed interval period pulse generator. - fsl,max-adj Maximum frequency adjustment in parts per billion. + - fsl,extts-fifo The presence of this property indicates hardware + support for the external trigger stamp FIFO. These properties set the operational parameters for the PTP clock. You must choose these carefully for the clock to work right. -- cgit v1.2.3 From ffcf7ce9332723cab5ae55575f3a55d1ce559bf3 Mon Sep 17 00:00:00 2001 From: Yonghong Song Date: Fri, 18 Jan 2019 13:56:49 -0800 Subject: bpf: btf: add btf documentation This patch added documentation for BTF (BPF Debug Format). The document is placed under linux:Documentation/bpf directory. Signed-off-by: Yonghong Song Signed-off-by: Alexei Starovoitov --- Documentation/bpf/btf.rst | 870 ++++++++++++++++++++++++++++++++++++++++++++ Documentation/bpf/index.rst | 7 + 2 files changed, 877 insertions(+) create mode 100644 Documentation/bpf/btf.rst (limited to 'Documentation') diff --git a/Documentation/bpf/btf.rst b/Documentation/bpf/btf.rst new file mode 100644 index 000000000000..1d434c3a268d --- /dev/null +++ b/Documentation/bpf/btf.rst @@ -0,0 +1,870 @@ +===================== +BPF Type Format (BTF) +===================== + +1. Introduction +*************** + +BTF (BPF Type Format) is the meta data format which +encodes the debug info related to BPF program/map. +The name BTF was used initially to describe +data types. The BTF was later extended to include +function info for defined subroutines, and line info +for source/line information. + +The debug info is used for map pretty print, function +signature, etc. The function signature enables better +bpf program/function kernel symbol. +The line info helps generate +source annotated translated byte code, jited code +and verifier log. + +The BTF specification contains two parts, + * BTF kernel API + * BTF ELF file format + +The kernel API is the contract between +user space and kernel. The kernel verifies +the BTF info before using it. +The ELF file format is a user space contract +between ELF file and libbpf loader. + +The type and string sections are part of the +BTF kernel API, describing the debug info +(mostly types related) referenced by the bpf program. +These two sections are discussed in +details in :ref:`BTF_Type_String`. + +.. _BTF_Type_String: + +2. BTF Type and String Encoding +******************************* + +The file ``include/uapi/linux/btf.h`` provides high +level definition on how types/strings are encoded. + +The beginning of data blob must be:: + + struct btf_header { + __u16 magic; + __u8 version; + __u8 flags; + __u32 hdr_len; + + /* All offsets are in bytes relative to the end of this header */ + __u32 type_off; /* offset of type section */ + __u32 type_len; /* length of type section */ + __u32 str_off; /* offset of string section */ + __u32 str_len; /* length of string section */ + }; + +The magic is ``0xeB9F``, which has different encoding for big and little +endian system, and can be used to test whether BTF is generated for +big or little endian target. +The btf_header is designed to be extensible with hdr_len equal to +``sizeof(struct btf_header)`` when the data blob is generated. + +2.1 String Encoding +=================== + +The first string in the string section must be a null string. +The rest of string table is a concatenation of other null-treminated +strings. + +2.2 Type Encoding +================= + +The type id ``0`` is reserved for ``void`` type. +The type section is parsed sequentially and the type id is assigned to +each recognized type starting from id ``1``. +Currently, the following types are supported:: + + #define BTF_KIND_INT 1 /* Integer */ + #define BTF_KIND_PTR 2 /* Pointer */ + #define BTF_KIND_ARRAY 3 /* Array */ + #define BTF_KIND_STRUCT 4 /* Struct */ + #define BTF_KIND_UNION 5 /* Union */ + #define BTF_KIND_ENUM 6 /* Enumeration */ + #define BTF_KIND_FWD 7 /* Forward */ + #define BTF_KIND_TYPEDEF 8 /* Typedef */ + #define BTF_KIND_VOLATILE 9 /* Volatile */ + #define BTF_KIND_CONST 10 /* Const */ + #define BTF_KIND_RESTRICT 11 /* Restrict */ + #define BTF_KIND_FUNC 12 /* Function */ + #define BTF_KIND_FUNC_PROTO 13 /* Function Proto */ + +Note that the type section encodes debug info, not just pure types. +``BTF_KIND_FUNC`` is not a type, and it represents a defined subprogram. + +Each type contains the following common data:: + + struct btf_type { + __u32 name_off; + /* "info" bits arrangement + * bits 0-15: vlen (e.g. # of struct's members) + * bits 16-23: unused + * bits 24-27: kind (e.g. int, ptr, array...etc) + * bits 28-30: unused + * bit 31: kind_flag, currently used by + * struct, union and fwd + */ + __u32 info; + /* "size" is used by INT, ENUM, STRUCT and UNION. + * "size" tells the size of the type it is describing. + * + * "type" is used by PTR, TYPEDEF, VOLATILE, CONST, RESTRICT, + * FUNC and FUNC_PROTO. + * "type" is a type_id referring to another type. + */ + union { + __u32 size; + __u32 type; + }; + }; + +For certain kinds, the common data are followed by kind specific data. +The ``name_off`` in ``struct btf_type`` specifies the offset in the string table. +The following details encoding of each kind. + +2.2.1 BTF_KIND_INT +~~~~~~~~~~~~~~~~~~ + +``struct btf_type`` encoding requirement: + * ``name_off``: any valid offset + * ``info.kind_flag``: 0 + * ``info.kind``: BTF_KIND_INT + * ``info.vlen``: 0 + * ``size``: the size of the int type in bytes. + +``btf_type`` is followed by a ``u32`` with following bits arrangement:: + + #define BTF_INT_ENCODING(VAL) (((VAL) & 0x0f000000) >> 24) + #define BTF_INT_OFFSET(VAL) (((VAL & 0x00ff0000)) >> 16) + #define BTF_INT_BITS(VAL) ((VAL) & 0x000000ff) + +The ``BTF_INT_ENCODING`` has the following attributes:: + + #define BTF_INT_SIGNED (1 << 0) + #define BTF_INT_CHAR (1 << 1) + #define BTF_INT_BOOL (1 << 2) + +The ``BTF_INT_ENCODING()`` provides extra information, signness, +char, or bool, for the int type. The char and bool encoding +are mostly useful for pretty print. At most one encoding can +be specified for the int type. + +The ``BTF_INT_BITS()`` specifies the number of actual bits held by +this int type. For example, a 4-bit bitfield encodes +``BTF_INT_BITS()`` equals to 4. The ``btf_type.size * 8`` +must be equal to or greater than ``BTF_INT_BITS()`` for the type. +The maximum value of ``BTF_INT_BITS()`` is 128. + +The ``BTF_INT_OFFSET()`` specifies the starting bit offset to +calculate values for this int. For example, a bitfield struct +member has + + * btf member bit offset 100 from the start of the structure, + * btf member pointing to an int type, + * the int type has ``BTF_INT_OFFSET() = 2`` and ``BTF_INT_BITS() = 4`` + +Then in the struct memory layout, this member will occupy +``4`` bits starting from bits ``100 + 2 = 102``. + +Alternatively, the bitfield struct member can be the following to +access the same bits as the above: + + * btf member bit offset 102, + * btf member pointing to an int type, + * the int type has ``BTF_INT_OFFSET() = 0`` and ``BTF_INT_BITS() = 4`` + +The original intention of ``BTF_INT_OFFSET()`` is to provide +flexibility of bitfield encoding. +Currently, both llvm and pahole generates ``BTF_INT_OFFSET() = 0`` +for all int types. + +2.2.2 BTF_KIND_PTR +~~~~~~~~~~~~~~~~~~ + +``struct btf_type`` encoding requirement: + * ``name_off``: 0 + * ``info.kind_flag``: 0 + * ``info.kind``: BTF_KIND_PTR + * ``info.vlen``: 0 + * ``type``: the pointee type of the pointer + +No additional type data follow ``btf_type``. + +2.2.3 BTF_KIND_ARRAY +~~~~~~~~~~~~~~~~~~~~ + +``struct btf_type`` encoding requirement: + * ``name_off``: 0 + * ``info.kind_flag``: 0 + * ``info.kind``: BTF_KIND_ARRAY + * ``info.vlen``: 0 + * ``size/type``: 0, not used + +btf_type is followed by one "struct btf_array":: + + struct btf_array { + __u32 type; + __u32 index_type; + __u32 nelems; + }; + +The ``struct btf_array`` encoding: + * ``type``: the element type + * ``index_type``: the index type + * ``nelems``: the number of elements for this array (``0`` is also allowed). + +The ``index_type`` can be any regular int types +(u8, u16, u32, u64, unsigned __int128). +The original design of including ``index_type`` follows dwarf +which has a ``index_type`` for its array type. +Currently in BTF, beyond type verification, the ``index_type`` is not used. + +The ``struct btf_array`` allows chaining through element type to represent +multiple dimensional arrays. For example, ``int a[5][6]``, the following +type system illustrates the chaining: + + * [1]: int + * [2]: array, ``btf_array.type = [1]``, ``btf_array.nelems = 6`` + * [3]: array, ``btf_array.type = [2]``, ``btf_array.nelems = 5`` + +Currently, both pahole and llvm collapse multiple dimensional array +into one dimensional array, e.g., ``a[5][6]``, the btf_array.nelems +equal to ``30``. This is because the original use case is map pretty +print where the whole array is dumped out so one dimensional array +is enough. As more BTF usage is explored, pahole and llvm can be +changed to generate proper chained representation for +multiple dimensional arrays. + +2.2.4 BTF_KIND_STRUCT +~~~~~~~~~~~~~~~~~~~~~ +2.2.5 BTF_KIND_UNION +~~~~~~~~~~~~~~~~~~~~ + +``struct btf_type`` encoding requirement: + * ``name_off``: 0 or offset to a valid C identifier + * ``info.kind_flag``: 0 or 1 + * ``info.kind``: BTF_KIND_STRUCT or BTF_KIND_UNION + * ``info.vlen``: the number of struct/union members + * ``info.size``: the size of the struct/union in bytes + +``btf_type`` is followed by ``info.vlen`` number of ``struct btf_member``.:: + + struct btf_member { + __u32 name_off; + __u32 type; + __u32 offset; + }; + +``struct btf_member`` encoding: + * ``name_off``: offset to a valid C identifier + * ``type``: the member type + * ``offset``: + +If the type info ``kind_flag`` is not set, the offset contains +only bit offset of the member. Note that the base type of the +bitfield can only be int or enum type. If the bitfield size +is 32, the base type can be either int or enum type. +If the bitfield size is not 32, the base type must be int, +and int type ``BTF_INT_BITS()`` encodes the bitfield size. + +If the ``kind_flag`` is set, the ``btf_member.offset`` +contains both member bitfield size and bit offset. The +bitfield size and bit offset are calculated as below.:: + + #define BTF_MEMBER_BITFIELD_SIZE(val) ((val) >> 24) + #define BTF_MEMBER_BIT_OFFSET(val) ((val) & 0xffffff) + +In this case, if the base type is an int type, it must +be a regular int type: + + * ``BTF_INT_OFFSET()`` must be 0. + * ``BTF_INT_BITS()`` must be equal to ``{1,2,4,8,16} * 8``. + +The following kernel patch introduced ``kind_flag`` and +explained why both modes exist: + + https://github.com/torvalds/linux/commit/9d5f9f701b1891466fb3dbb1806ad97716f95cc3#diff-fa650a64fdd3968396883d2fe8215ff3 + +2.2.6 BTF_KIND_ENUM +~~~~~~~~~~~~~~~~~~~ + +``struct btf_type`` encoding requirement: + * ``name_off``: 0 or offset to a valid C identifier + * ``info.kind_flag``: 0 + * ``info.kind``: BTF_KIND_ENUM + * ``info.vlen``: number of enum values + * ``size``: 4 + +``btf_type`` is followed by ``info.vlen`` number of ``struct btf_enum``.:: + + struct btf_enum { + __u32 name_off; + __s32 val; + }; + +The ``btf_enum`` encoding: + * ``name_off``: offset to a valid C identifier + * ``val``: any value + +2.2.7 BTF_KIND_FWD +~~~~~~~~~~~~~~~~~~ + +``struct btf_type`` encoding requirement: + * ``name_off``: offset to a valid C identifier + * ``info.kind_flag``: 0 for struct, 1 for union + * ``info.kind``: BTF_KIND_FWD + * ``info.vlen``: 0 + * ``type``: 0 + +No additional type data follow ``btf_type``. + +2.2.8 BTF_KIND_TYPEDEF +~~~~~~~~~~~~~~~~~~~~~~ + +``struct btf_type`` encoding requirement: + * ``name_off``: offset to a valid C identifier + * ``info.kind_flag``: 0 + * ``info.kind``: BTF_KIND_TYPEDEF + * ``info.vlen``: 0 + * ``type``: the type which can be referred by name at ``name_off`` + +No additional type data follow ``btf_type``. + +2.2.9 BTF_KIND_VOLATILE +~~~~~~~~~~~~~~~~~~~~~~~ + +``struct btf_type`` encoding requirement: + * ``name_off``: 0 + * ``info.kind_flag``: 0 + * ``info.kind``: BTF_KIND_VOLATILE + * ``info.vlen``: 0 + * ``type``: the type with ``volatile`` qualifier + +No additional type data follow ``btf_type``. + +2.2.10 BTF_KIND_CONST +~~~~~~~~~~~~~~~~~~~~~ + +``struct btf_type`` encoding requirement: + * ``name_off``: 0 + * ``info.kind_flag``: 0 + * ``info.kind``: BTF_KIND_CONST + * ``info.vlen``: 0 + * ``type``: the type with ``const`` qualifier + +No additional type data follow ``btf_type``. + +2.2.11 BTF_KIND_RESTRICT +~~~~~~~~~~~~~~~~~~~~~~~~ + +``struct btf_type`` encoding requirement: + * ``name_off``: 0 + * ``info.kind_flag``: 0 + * ``info.kind``: BTF_KIND_RESTRICT + * ``info.vlen``: 0 + * ``type``: the type with ``restrict`` qualifier + +No additional type data follow ``btf_type``. + +2.2.12 BTF_KIND_FUNC +~~~~~~~~~~~~~~~~~~~~ + +``struct btf_type`` encoding requirement: + * ``name_off``: offset to a valid C identifier + * ``info.kind_flag``: 0 + * ``info.kind``: BTF_KIND_FUNC + * ``info.vlen``: 0 + * ``type``: a BTF_KIND_FUNC_PROTO type + +No additional type data follow ``btf_type``. + +A BTF_KIND_FUNC defines, not a type, but a subprogram (function) whose +signature is defined by ``type``. The subprogram is thus an instance of +that type. The BTF_KIND_FUNC may in turn be referenced by a func_info in +the :ref:`BTF_Ext_Section` (ELF) or in the arguments to +:ref:`BPF_Prog_Load` (ABI). + +2.2.13 BTF_KIND_FUNC_PROTO +~~~~~~~~~~~~~~~~~~~~~~~~~~ + +``struct btf_type`` encoding requirement: + * ``name_off``: 0 + * ``info.kind_flag``: 0 + * ``info.kind``: BTF_KIND_FUNC_PROTO + * ``info.vlen``: # of parameters + * ``type``: the return type + +``btf_type`` is followed by ``info.vlen`` number of ``struct btf_param``.:: + + struct btf_param { + __u32 name_off; + __u32 type; + }; + +If a BTF_KIND_FUNC_PROTO type is referred by a BTF_KIND_FUNC type, +then ``btf_param.name_off`` must point to a valid C identifier +except for the possible last argument representing the variable +argument. The btf_param.type refers to parameter type. + +If the function has variable arguments, the last parameter +is encoded with ``name_off = 0`` and ``type = 0``. + +3. BTF Kernel API +***************** + +The following bpf syscall command involves BTF: + * BPF_BTF_LOAD: load a blob of BTF data into kernel + * BPF_MAP_CREATE: map creation with btf key and value type info. + * BPF_PROG_LOAD: prog load with btf function and line info. + * BPF_BTF_GET_FD_BY_ID: get a btf fd + * BPF_OBJ_GET_INFO_BY_FD: btf, func_info, line_info + and other btf related info are returned. + +The workflow typically looks like: +:: + + Application: + BPF_BTF_LOAD + | + v + BPF_MAP_CREATE and BPF_PROG_LOAD + | + V + ...... + + Introspection tool: + ...... + BPF_{PROG,MAP}_GET_NEXT_ID (get prog/map id's) + | + V + BPF_{PROG,MAP}_GET_FD_BY_ID (get a prog/map fd) + | + V + BPF_OBJ_GET_INFO_BY_FD (get bpf_prog_info/bpf_map_info with btf_id) + | | + V | + BPF_BTF_GET_FD_BY_ID (get btf_fd) | + | | + V | + BPF_OBJ_GET_INFO_BY_FD (get btf) | + | | + V V + pretty print types, dump func signatures and line info, etc. + + +3.1 BPF_BTF_LOAD +================ + +Load a blob of BTF data into kernel. A blob of data +described in :ref:`BTF_Type_String` +can be directly loaded into the kernel. +A ``btf_fd`` returns to userspace. + +3.2 BPF_MAP_CREATE +================== + +A map can be created with ``btf_fd`` and specified key/value type id.:: + + __u32 btf_fd; /* fd pointing to a BTF type data */ + __u32 btf_key_type_id; /* BTF type_id of the key */ + __u32 btf_value_type_id; /* BTF type_id of the value */ + +In libbpf, the map can be defined with extra annotation like below: +:: + + struct bpf_map_def SEC("maps") btf_map = { + .type = BPF_MAP_TYPE_ARRAY, + .key_size = sizeof(int), + .value_size = sizeof(struct ipv_counts), + .max_entries = 4, + }; + BPF_ANNOTATE_KV_PAIR(btf_map, int, struct ipv_counts); + +Here, the parameters for macro BPF_ANNOTATE_KV_PAIR are map name, +key and value types for the map. +During ELF parsing, libbpf is able to extract key/value type_id's +and assigned them to BPF_MAP_CREATE attributes automatically. + +.. _BPF_Prog_Load: + +3.3 BPF_PROG_LOAD +================= + +During prog_load, func_info and line_info can be passed to kernel with +proper values for the following attributes: +:: + + __u32 insn_cnt; + __aligned_u64 insns; + ...... + __u32 prog_btf_fd; /* fd pointing to BTF type data */ + __u32 func_info_rec_size; /* userspace bpf_func_info size */ + __aligned_u64 func_info; /* func info */ + __u32 func_info_cnt; /* number of bpf_func_info records */ + __u32 line_info_rec_size; /* userspace bpf_line_info size */ + __aligned_u64 line_info; /* line info */ + __u32 line_info_cnt; /* number of bpf_line_info records */ + +The func_info and line_info are an array of below, respectively.:: + + struct bpf_func_info { + __u32 insn_off; /* [0, insn_cnt - 1] */ + __u32 type_id; /* pointing to a BTF_KIND_FUNC type */ + }; + struct bpf_line_info { + __u32 insn_off; /* [0, insn_cnt - 1] */ + __u32 file_name_off; /* offset to string table for the filename */ + __u32 line_off; /* offset to string table for the source line */ + __u32 line_col; /* line number and column number */ + }; + +func_info_rec_size is the size of each func_info record, and line_info_rec_size +is the size of each line_info record. Passing the record size to kernel make +it possible to extend the record itself in the future. + +Below are requirements for func_info: + * func_info[0].insn_off must be 0. + * the func_info insn_off is in strictly increasing order and matches + bpf func boundaries. + +Below are requirements for line_info: + * the first insn in each func must points to a line_info record. + * the line_info insn_off is in strictly increasing order. + +For line_info, the line number and column number are defined as below: +:: + + #define BPF_LINE_INFO_LINE_NUM(line_col) ((line_col) >> 10) + #define BPF_LINE_INFO_LINE_COL(line_col) ((line_col) & 0x3ff) + +3.4 BPF_{PROG,MAP}_GET_NEXT_ID + +In kernel, every loaded program, map or btf has a unique id. +The id won't change during the life time of the program, map or btf. + +The bpf syscall command BPF_{PROG,MAP}_GET_NEXT_ID +returns all id's, one for each command, to user space, for bpf +program or maps, +so the inspection tool can inspect all programs and maps. + +3.5 BPF_{PROG,MAP}_GET_FD_BY_ID + +The introspection tool cannot use id to get details about program or maps. +A file descriptor needs to be obtained first for reference countin