// SPDX-License-Identifier: GPL-2.0-only
/* PIPAPO: PIle PAcket POlicies: AVX2 packet lookup routines
*
* Copyright (c) 2019-2020 Red Hat GmbH
*
* Author: Stefano Brivio <sbrivio@redhat.com>
*/
#include <linux/kernel.h>
#include <linux/init.h>
#include <linux/module.h>
#include <linux/netlink.h>
#include <linux/netfilter.h>
#include <linux/netfilter/nf_tables.h>
#include <net/netfilter/nf_tables_core.h>
#include <uapi/linux/netfilter/nf_tables.h>
#include <linux/bitmap.h>
#include <linux/bitops.h>
#include <linux/compiler.h>
#include <asm/fpu/api.h>
#include "nft_set_pipapo_avx2.h"
#include "nft_set_pipapo.h"
/* Number of longs making up one 256-bit vector, as the name says: this is how
* far, in longs, bitmap pointers advance per YMM-sized step.
*
* NOTE(review): this relies on XSAVE_YMM_SIZE, defined outside this file,
* evaluating to 256 -- confirm against the header providing it.
*/
#define NFT_PIPAPO_LONGS_PER_M256 (XSAVE_YMM_SIZE / BITS_PER_LONG)
/* Load from memory into YMM register with non-temporal hint ("stream load"),
* that is, don't fetch lines from memory into the cache. This avoids pushing
* precious packet data out of the cache hierarchy, and is appropriate when:
*
* - loading buckets from lookup tables, as they are not going to be used
* again before packets are entirely classified
*
* - loading the result bitmap from the previous field, as it's never used
* again
*
* @reg: number of the destination YMM register, pasted as-is into the
* instruction text
* @loc: lvalue to load from, handed to the compiler as an "m" (memory) input
*
* The destination register is named only inside the instruction string: it is
* not listed as an output or clobber, so the compiler is not told about it.
*/
#define NFT_PIPAPO_AVX2_LOAD(reg, loc) \
asm volatile("vmovntdqa %0, %%ymm" #reg : : "m" (loc))
/* Stream a single lookup table bucket into YMM register given lookup table,
* group index, value of packet bits, bucket size.
*
* Variant using NFT_PIPAPO_BUCKETS(4) as the number of buckets per group: the
* bucket is read from element (@group * NFT_PIPAPO_BUCKETS(4) + @v) * @bsize
* of @lt, so @bsize is counted in elements of @lt.
*
* @reg: number of the destination YMM register
* @lt: lookup table, indexed as an array
* @group: group index
* @v: value of the packet bits for this group, selecting the bucket
* @bsize: bucket size, in elements of @lt
*/
#define NFT_PIPAPO_AVX2_BUCKET_LOAD4(reg, lt, group, v, bsize) \
NFT_PIPAPO_AVX2_LOAD(reg, \
lt[((group) * NFT_PIPAPO_BUCKETS(4) + \
(v)) * (bsize)])
/* Stream a single lookup table bucket into YMM register, variant using
* NFT_PIPAPO_BUCKETS(8) as the number of buckets per group: the bucket is read
* from element (@group * NFT_PIPAPO_BUCKETS(8) + @v) * @bsize of @lt.
*
* @reg: number of the destination YMM register
* @lt: lookup table, indexed as an array
* @group: group index
* @v: value of the packet bits for this group, selecting the bucket
* @bsize: bucket size, in elements of @lt
*/
#define NFT_PIPAPO_AVX2_BUCKET_LOAD8(reg, lt, group, v, bsize) \
NFT_PIPAPO_AVX2_LOAD(reg, \
lt[((group) * NFT_PIPAPO_BUCKETS(8) + \
(v)) * (bsize)])
/* Bitwise AND: the staple operation of this algorithm
*
* @dst: number of the YMM register receiving the result
* @a: number of the first source YMM register
* @b: number of the second source YMM register
*
* Register numbers are pasted into the instruction text; no operands or
* clobbers are declared to the compiler for this statement.
*/
#define NFT_PIPAPO_AVX2_AND(dst, a, b) \
asm volatile("vpand %ymm" #a ", %ymm" #b ", %ymm" #dst)
/* Jump to label if @reg is zero
*
* @reg: number of the YMM register to test
* @label: C label to branch to, listed as goto label of the asm statement
*
* @reg is tested against itself with vptest, and je takes the branch to
* @label on a zero result; otherwise execution falls through to the statement
* following the macro.
*/
#define NFT_PIPAPO_AVX2_NOMATCH_GOTO(reg, label) \
asm_volatile_goto("vptest %%ymm" #reg ", %%ymm" #reg ";" \
"je %l[" #label "]" : : : : label)
/* Store 256 bits from YMM register into memory. Contrary to bucket load
* operation, we don't bypass the cache here, as stored matching results
* are always used shortly after.
*
* @loc: lvalue to store to, handed to the compiler as an "=m" (memory) output
* @reg: number of the source YMM register, pasted into the instruction text
*
* NOTE(review): vmovdqa is the aligned form of the move, so @loc is presumably
* expected to be 32-byte aligned -- confirm against callers.
*/
#define NFT_PIPAPO_AVX2_STORE(loc, reg) \
asm volatile("vmovdqa %%ymm" #reg ", %0" : "=m" (loc))
/* Zero out a complete YMM register, @reg, by XORing it with itself
*
* @reg: number of the YMM register to clear, pasted into the instruction text
*/
#define NFT_PIPAPO_AVX2_ZERO(reg) \
asm volatile("vpxor %ymm" #reg ", %ymm" #reg ", %ymm" #reg)
/* Current working bitmap index, toggled between field matches. There is one
* instance of this boolean per CPU.
*/
static DEFINE_PER_CPU(bool, nft_pipapo_avx2_scratch_index);
/**
* nft_pipapo_avx2_prepare() - Prepare before main algorithm body
*
* This zeroes out ymm15, which is later used whenever we need to clear a
* memory location, by storing its content into memory.
*
* Takes no arguments and returns nothing: the only effect is on ymm15.
*
* NOTE(review): presumably callers must already own the FPU/vector state
* (e.g. between kernel_fpu_begin() and kernel_fpu_end()) -- confirm at the
* call sites.
*/
static void nft_pipapo_avx2_prepare(void)
{
NFT_PIPAPO_AVX2_ZERO(15);
}
/**
 * nft_pipapo_avx2_fill() - Set a run of consecutive bits in a bitmap
 * @data:	Start of the bitmap, as array of longs
 * @start:	Index of the first bit to set
 * @len:	Number of bits to set
 *
 * Functionally a bitmap_set() lookalike, as used e.g. by pipapo_refill(), but
 * laid out for the workload expected here: usually just a few bits are set,
 * they rarely straddle a word boundary, and the branches are annotated and
 * ordered so that those common cases are the predicted ones.
 *
 * Plain C only: no AVX2 instruction is used by this function.
 */
static void nft_pipapo_avx2_fill(unsigned long *data, int start, int len)
{
	unsigned long *word = data + start / BITS_PER_LONG;
	int shift = start % BITS_PER_LONG;
	int tail;

	/* Single bit: annotated as the expected request */
	if (likely(len == 1)) {
		*word |= BIT(shift);
		return;
	}

	/* Run shorter than a word, or not starting on a word boundary */
	if (likely(len < BITS_PER_LONG || shift)) {
		int room = BITS_PER_LONG - shift;

		/* The whole run fits in what's left of the first word */
		if (likely(len <= room)) {
			*word |= GENMASK(shift + len - 1, shift);
			return;
		}

		/* Fill the first word up to its top bit, then move on */
		*word |= ~0UL << shift;
		word++;
		len -= room;

		/* What remains ends within the second word */
		if (len <= BITS_PER_LONG) {
			*word |= ~0UL >> (BITS_PER_LONG - len);
			return;
		}
	}

	/* From here on the run is word-aligned: set whole bytes in one go,
	 * then patch up the bits left over in the last, partial word, if any.
	 */
	memset(word, 0xff, len / BITS_PER_BYTE);
	word += len / BITS_PER_LONG;

	tail = len % BITS_PER_LONG;
	if (tail)
		*word |= ~0UL >> (BITS_PER_LONG - tail);
}
/**
* nft_pipapo_avx2_refill() - Scan bitmap, select mapping table item, set bits
* @offset: Start from given bitmap (equivalent to bucket) offset, in longs
* @map: Bitmap to be scanned for set bits
* @dst: Destination bitmap
* @mt: Mapping table containing bit set specifiers
* @len: Length of bitmap in longs
* @last: Return index of first set bit, if this is the last field
*
* Thi