From efa773fe913ff188c554ccaac577d0a1bb1dc2f4 Mon Sep 17 00:00:00 2001 From: Alim Akhtar Date: Mon, 24 Aug 2015 18:01:15 +0530 Subject: arm64: defconfig: Enable samsung serial and mmc This patch update defconfig, adds samsung serial and Synopsys Designware MMC configs related to exynos SoC Signed-off-by: Alim Akhtar Signed-off-by: Catalin Marinas --- arch/arm64/configs/defconfig | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/arch/arm64/configs/defconfig b/arch/arm64/configs/defconfig index 34d71dd86781..c8ccda16e079 100644 --- a/arch/arm64/configs/defconfig +++ b/arch/arm64/configs/defconfig @@ -109,6 +109,10 @@ CONFIG_SERIAL_8250_DW=y CONFIG_SERIAL_8250_MT6577=y CONFIG_SERIAL_AMBA_PL011=y CONFIG_SERIAL_AMBA_PL011_CONSOLE=y +CONFIG_SERIAL_SAMSUNG=y +CONFIG_SERIAL_SAMSUNG_UARTS_4=y +CONFIG_SERIAL_SAMSUNG_UARTS=4 +CONFIG_SERIAL_SAMSUNG_CONSOLE=y CONFIG_SERIAL_MSM=y CONFIG_SERIAL_MSM_CONSOLE=y CONFIG_SERIAL_OF_PLATFORM=y @@ -145,6 +149,10 @@ CONFIG_MMC_ARMMMCI=y CONFIG_MMC_SDHCI=y CONFIG_MMC_SDHCI_PLTFM=y CONFIG_MMC_SPI=y +CONFIG_MMC_DW=y +CONFIG_MMC_DW_IDMAC=y +CONFIG_MMC_DW_PLTFM=y +CONFIG_MMC_DW_EXYNOS=y CONFIG_NEW_LEDS=y CONFIG_LEDS_CLASS=y CONFIG_LEDS_SYSCON=y -- cgit v1.2.3 From e5c88e3f2fb35dca5f3e46d65095bf5d008595b7 Mon Sep 17 00:00:00 2001 From: Feng Kan Date: Wed, 23 Sep 2015 11:55:38 -0700 Subject: arm64: Change memcpy in kernel to use the copy template file This converts the memcpy.S to use the copy template file. The copy template file was based originally on the memcpy.S Signed-off-by: Feng Kan Signed-off-by: Balamurugan Shanmugam [catalin.marinas@arm.com: removed tmp3(w) .req statements as they are not used] Signed-off-by: Catalin Marinas --- arch/arm64/lib/copy_template.S | 193 +++++++++++++++++++++++++++++++++++++++++ arch/arm64/lib/memcpy.S | 179 ++++++-------------------------------- 2 files changed, 219 insertions(+), 153 deletions(-) create mode 100644 arch/arm64/lib/copy_template.S diff --git a/arch/arm64/lib/copy_template.S b/arch/arm64/lib/copy_template.S new file mode 100644 index 000000000000..410fbdb8163f --- /dev/null +++ b/arch/arm64/lib/copy_template.S @@ -0,0 +1,193 @@ +/* + * Copyright (C) 2013 ARM Ltd. + * Copyright (C) 2013 Linaro. + * + * This code is based on glibc cortex strings work originally authored by Linaro + * and re-licensed under GPLv2 for the Linux kernel. The original code can + * be found @ + * + * http://bazaar.launchpad.net/~linaro-toolchain-dev/cortex-strings/trunk/ + * files/head:/src/aarch64/ + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 as + * published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program. If not, see . + */ + + +/* + * Copy a buffer from src to dest (alignment handled by the hardware) + * + * Parameters: + * x0 - dest + * x1 - src + * x2 - n + * Returns: + * x0 - dest + */ +dstin .req x0 +src .req x1 +count .req x2 +tmp1 .req x3 +tmp1w .req w3 +tmp2 .req x4 +tmp2w .req w4 +dst .req x6 + +A_l .req x7 +A_h .req x8 +B_l .req x9 +B_h .req x10 +C_l .req x11 +C_h .req x12 +D_l .req x13 +D_h .req x14 + + mov dst, dstin + cmp count, #16 + /*When memory length is less than 16, the accessed are not aligned.*/ + b.lo .Ltiny15 + + neg tmp2, src + ands tmp2, tmp2, #15/* Bytes to reach alignment. */ + b.eq .LSrcAligned + sub count, count, tmp2 + /* + * Copy the leading memory data from src to dst in an increasing + * address order.By this way,the risk of overwritting the source + * memory data is eliminated when the distance between src and + * dst is less than 16. The memory accesses here are alignment. + */ + tbz tmp2, #0, 1f + ldrb1 tmp1w, src, #1 + strb1 tmp1w, dst, #1 +1: + tbz tmp2, #1, 2f + ldrh1 tmp1w, src, #2 + strh1 tmp1w, dst, #2 +2: + tbz tmp2, #2, 3f + ldr1 tmp1w, src, #4 + str1 tmp1w, dst, #4 +3: + tbz tmp2, #3, .LSrcAligned + ldr1 tmp1, src, #8 + str1 tmp1, dst, #8 + +.LSrcAligned: + cmp count, #64 + b.ge .Lcpy_over64 + /* + * Deal with small copies quickly by dropping straight into the + * exit block. + */ +.Ltail63: + /* + * Copy up to 48 bytes of data. At this point we only need the + * bottom 6 bits of count to be accurate. + */ + ands tmp1, count, #0x30 + b.eq .Ltiny15 + cmp tmp1w, #0x20 + b.eq 1f + b.lt 2f + ldp1 A_l, A_h, src, #16 + stp1 A_l, A_h, dst, #16 +1: + ldp1 A_l, A_h, src, #16 + stp1 A_l, A_h, dst, #16 +2: + ldp1 A_l, A_h, src, #16 + stp1 A_l, A_h, dst, #16 +.Ltiny15: + /* + * Prefer to break one ldp/stp into several load/store to access + * memory in an increasing address order,rather than to load/store 16 + * bytes from (src-16) to (dst-16) and to backward the src to aligned + * address,which way is used in original cortex memcpy. If keeping + * the original memcpy process here, memmove need to satisfy the + * precondition that src address is at least 16 bytes bigger than dst + * address,otherwise some source data will be overwritten when memove + * call memcpy directly. To make memmove simpler and decouple the + * memcpy's dependency on memmove, withdrew the original process. + */ + tbz count, #3, 1f + ldr1 tmp1, src, #8 + str1 tmp1, dst, #8 +1: + tbz count, #2, 2f + ldr1 tmp1w, src, #4 + str1 tmp1w, dst, #4 +2: + tbz count, #1, 3f + ldrh1 tmp1w, src, #2 + strh1 tmp1w, dst, #2 +3: + tbz count, #0, .Lexitfunc + ldrb1 tmp1w, src, #1 + strb1 tmp1w, dst, #1 + + b .Lexitfunc + +.Lcpy_over64: + subs count, count, #128 + b.ge .Lcpy_body_large + /* + * Less than 128 bytes to copy, so handle 64 here and then jump + * to the tail. + */ + ldp1 A_l, A_h, src, #16 + stp1 A_l, A_h, dst, #16 + ldp1 B_l, B_h, src, #16 + ldp1 C_l, C_h, src, #16 + stp1 B_l, B_h, dst, #16 + stp1 C_l, C_h, dst, #16 + ldp1 D_l, D_h, src, #16 + stp1 D_l, D_h, dst, #16 + + tst count, #0x3f + b.ne .Ltail63 + b .Lexitfunc + + /* + * Critical loop. Start at a new cache line boundary. Assuming + * 64 bytes per line this ensures the entire loop is in one line. + */ + .p2align L1_CACHE_SHIFT +.Lcpy_body_large: + /* pre-get 64 bytes data. */ + ldp1 A_l, A_h, src, #16 + ldp1 B_l, B_h, src, #16 + ldp1 C_l, C_h, src, #16 + ldp1 D_l, D_h, src, #16 +1: + /* + * interlace the load of next 64 bytes data block with store of the last + * loaded 64 bytes data. + */ + stp1 A_l, A_h, dst, #16 + ldp1 A_l, A_h, src, #16 + stp1 B_l, B_h, dst, #16 + ldp1 B_l, B_h, src, #16 + stp1 C_l, C_h, dst, #16 + ldp1 C_l, C_h, src, #16 + stp1 D_l, D_h, dst, #16 + ldp1 D_l, D_h, src, #16 + subs count, count, #64 + b.ge 1b + stp1 A_l, A_h, dst, #16 + stp1 B_l, B_h, dst, #16 + stp1 C_l, C_h, dst, #16 + stp1 D_l, D_h, dst, #16 + + tst count, #0x3f + b.ne .Ltail63 +.Lexitfunc: diff --git a/arch/arm64/lib/memcpy.S b/arch/arm64/lib/memcpy.S index 8a9a96d3ddae..173a1aace9bb 100644 --- a/arch/arm64/lib/memcpy.S +++ b/arch/arm64/lib/memcpy.S @@ -36,166 +36,39 @@ * Returns: * x0 - dest */ -dstin .req x0 -src .req x1 -count .req x2 -tmp1 .req x3 -tmp1w .req w3 -tmp2 .req x4 -tmp2w .req w4 -tmp3 .req x5 -tmp3w .req w5 -dst .req x6 + .macro ldrb1 ptr, regB, val + ldrb \ptr, [\regB], \val + .endm -A_l .req x7 -A_h .req x8 -B_l .req x9 -B_h .req x10 -C_l .req x11 -C_h .req x12 -D_l .req x13 -D_h .req x14 + .macro strb1 ptr, regB, val + strb \ptr, [\regB], \val + .endm -ENTRY(memcpy) - mov dst, dstin - cmp count, #16 - /*When memory length is less than 16, the accessed are not aligned.*/ - b.lo .Ltiny15 + .macro ldrh1 ptr, regB, val + ldrh \ptr, [\regB], \val + .endm - neg tmp2, src - ands tmp2, tmp2, #15/* Bytes to reach alignment. */ - b.eq .LSrcAligned - sub count, count, tmp2 - /* - * Copy the leading memory data from src to dst in an increasing - * address order.By this way,the risk of overwritting the source - * memory data is eliminated when the distance between src and - * dst is less than 16. The memory accesses here are alignment. - */ - tbz tmp2, #0, 1f - ldrb tmp1w, [src], #1 - strb tmp1w, [dst], #1 -1: - tbz tmp2, #1, 2f - ldrh tmp1w, [src], #2 - strh tmp1w, [dst], #2 -2: - tbz tmp2, #2, 3f - ldr tmp1w, [src], #4 - str tmp1w, [dst], #4 -3: - tbz tmp2, #3, .LSrcAligned - ldr tmp1, [src],#8 - str tmp1, [dst],#8 + .macro strh1 ptr, regB, val + strh \ptr, [\regB], \val + .endm -.LSrcAligned: - cmp count, #64 - b.ge .Lcpy_over64 - /* - * Deal with small copies quickly by dropping straight into the - * exit block. - */ -.Ltail63: - /* - * Copy up to 48 bytes of data. At this point we only need the - * bottom 6 bits of count to be accurate. - */ - ands tmp1, count, #0x30 - b.eq .Ltiny15 - cmp tmp1w, #0x20 - b.eq 1f - b.lt 2f - ldp A_l, A_h, [src], #16 - stp A_l, A_h, [dst], #16 -1: - ldp A_l, A_h, [src], #16 - stp A_l, A_h, [dst], #16 -2: - ldp A_l, A_h, [src], #16 - stp A_l, A_h, [dst], #16 -.Ltiny15: - /* - * Prefer to break one ldp/stp into several load/store to access - * memory in an increasing address order,rather than to load/store 16 - * bytes from (src-16) to (dst-16) and to backward the src to aligned - * address,which way is used in original cortex memcpy. If keeping - * the original memcpy process here, memmove need to satisfy the - * precondition that src address is at least 16 bytes bigger than dst - * address,otherwise some source data will be overwritten when memove - * call memcpy directly. To make memmove simpler and decouple the - * memcpy's dependency on memmove, withdrew the original process. - */ - tbz count, #3, 1f - ldr tmp1, [src], #8 - str tmp1, [dst], #8 -1: - tbz count, #2, 2f - ldr tmp1w, [src], #4 - str tmp1w, [dst], #4 -2: - tbz count, #1, 3f - ldrh tmp1w, [src], #2 - strh tmp1w, [dst], #2 -3: - tbz count, #0, .Lexitfunc - ldrb tmp1w, [src] - strb tmp1w, [dst] + .macro ldr1 ptr, regB, val + ldr \ptr, [\regB], \val + .endm -.Lexitfunc: - ret + .macro str1 ptr, regB, val + str \ptr, [\regB], \val + .endm -.Lcpy_over64: - subs count, count, #128 - b.ge .Lcpy_body_large - /* - * Less than 128 bytes to copy, so handle 64 here and then jump - * to the tail. - */ - ldp A_l, A_h, [src],#16 - stp A_l, A_h, [dst],#16 - ldp B_l, B_h, [src],#16 - ldp C_l, C_h, [src],#16 - stp B_l, B_h, [dst],#16 - stp C_l, C_h, [dst],#16 - ldp D_l, D_h, [src],#16 - stp D_l, D_h, [dst],#16 + .macro ldp1 ptr, regB, regC, val + ldp \ptr, \regB, [\regC], \val + .endm - tst count, #0x3f - b.ne .Ltail63 - ret + .macro stp1 ptr, regB, regC, val + stp \ptr, \regB, [\regC], \val + .endm - /* - * Critical loop. Start at a new cache line boundary. Assuming - * 64 bytes per line this ensures the entire loop is in one line. - */ - .p2align L1_CACHE_SHIFT -.Lcpy_body_large: - /* pre-get 64 bytes data. */ - ldp A_l, A_h, [src],#16 - ldp B_l, B_h, [src],#16 - ldp C_l, C_h, [src],#16 - ldp D_l, D_h, [src],#16 -1: - /* - * interlace the load of next 64 bytes data block with store of the last - * loaded 64 bytes data. - */ - stp A_l, A_h, [dst],#16 - ldp A_l, A_h, [src],#16 - stp B_l, B_h, [dst],#16 - ldp B_l, B_h, [src],#16 - stp C_l, C_h, [dst],#16 - ldp C_l, C_h, [src],#16 - stp D_l, D_h, [dst],#16 - ldp D_l, D_h, [src],#16 - subs count, count, #64 - b.ge 1b - stp A_l, A_h, [dst],#16 - stp B_l, B_h, [dst],#16 - stp C_l, C_h, [dst],#16 - stp D_l, D_h, [dst],#16 - - tst count, #0x3f - b.ne .Ltail63 +ENTRY(memcpy) +#include "copy_template.S" ret ENDPROC(memcpy) -- cgit v1.2.3 From 404268828c74ba06b1f21584b26edafb381c9d7d Mon Sep 17 00:00:00 2001 From: Feng Kan Date: Wed, 23 Sep 2015 11:55:39 -0700 Subject: arm64: copy_to-from-in_user optimization using copy template This patch optimize copy_to-from-in_user for arm 64bit architecture. The copy template is used as template file for all the copy*.S files. Minor change was made to it to accommodate the copy to/from/in user files. Signed-off-by: Feng Kan Signed-off-by: Balamurugan Shanmugam Signed-off-by: Catalin Marinas --- arch/arm64/lib/copy_from_user.S | 78 +++++++++++++++++++++++------------------ arch/arm64/lib/copy_in_user.S | 67 ++++++++++++++++++++--------------- arch/arm64/lib/copy_to_user.S | 67 ++++++++++++++++++++--------------- 3 files changed, 120 insertions(+), 92 deletions(-) diff --git a/arch/arm64/lib/copy_from_user.S b/arch/arm64/lib/copy_from_user.S index 1be9ef27be97..4699cd74f87e 100644 --- a/arch/arm64/lib/copy_from_user.S +++ b/arch/arm64/lib/copy_from_user.S @@ -18,6 +18,7 @@ #include #include +#include #include #include @@ -31,49 +32,58 @@ * Returns: * x0 - bytes not copied */ + + .macro ldrb1 ptr, regB, val + USER(9998f, ldrb \ptr, [\regB], \val) + .endm + + .macro strb1 ptr, regB, val + strb \ptr, [\regB], \val + .endm + + .macro ldrh1 ptr, regB, val + USER(9998f, ldrh \ptr, [\regB], \val) + .endm + + .macro strh1 ptr, regB, val + strh \ptr, [\regB], \val + .endm + + .macro ldr1 ptr, regB, val + USER(9998f, ldr \ptr, [\regB], \val) + .endm + + .macro str1 ptr, regB, val + str \ptr, [\regB], \val + .endm + + .macro ldp1 ptr, regB, regC, val + USER(9998f, ldp \ptr, \regB, [\regC], \val) + .endm + + .macro stp1 ptr, regB, regC, val + stp \ptr, \regB, [\regC], \val + .endm + +end .req x5 ENTRY(__copy_from_user) ALTERNATIVE("nop", __stringify(SET_PSTATE_PAN(0)), ARM64_HAS_PAN, \ CONFIG_ARM64_PAN) - add x5, x1, x2 // upper user buffer boundary - subs x2, x2, #16 - b.mi 1f -0: -USER(9f, ldp x3, x4, [x1], #16) - subs x2, x2, #16 - stp x3, x4, [x0], #16 - b.pl 0b -1: adds x2, x2, #8 - b.mi 2f -USER(9f, ldr x3, [x1], #8 ) - sub x2, x2, #8 - str x3, [x0], #8 -2: adds x2, x2, #4 - b.mi 3f -USER(9f, ldr w3, [x1], #4 ) - sub x2, x2, #4 - str w3, [x0], #4 -3: adds x2, x2, #2 - b.mi 4f -USER(9f, ldrh w3, [x1], #2 ) - sub x2, x2, #2 - strh w3, [x0], #2 -4: adds x2, x2, #1 - b.mi 5f -USER(9f, ldrb w3, [x1] ) - strb w3, [x0] -5: mov x0, #0 + add end, x0, x2 +#include "copy_template.S" ALTERNATIVE("nop", __stringify(SET_PSTATE_PAN(1)), ARM64_HAS_PAN, \ CONFIG_ARM64_PAN) + mov x0, #0 // Nothing to copy ret ENDPROC(__copy_from_user) .section .fixup,"ax" .align 2 -9: sub x2, x5, x1 - mov x3, x2 -10: strb wzr, [x0], #1 // zero remaining buffer space - subs x3, x3, #1 - b.ne 10b - mov x0, x2 // bytes not copied +9998: + sub x0, end, dst +9999: + strb wzr, [dst], #1 // zero remaining buffer space + cmp dst, end + b.lo 9999b ret .previous diff --git a/arch/arm64/lib/copy_in_user.S b/arch/arm64/lib/copy_in_user.S index 1b94661e22b3..81c8fc93c100 100644 --- a/arch/arm64/lib/copy_in_user.S +++ b/arch/arm64/lib/copy_in_user.S @@ -20,6 +20,7 @@ #include #include +#include #include #include @@ -33,44 +34,52 @@ * Returns: * x0 - bytes not copied */ + .macro ldrb1 ptr, regB, val + USER(9998f, ldrb \ptr, [\regB], \val) + .endm + + .macro strb1 ptr, regB, val + USER(9998f, strb \ptr, [\regB], \val) + .endm + + .macro ldrh1 ptr, regB, val + USER(9998f, ldrh \ptr, [\regB], \val) + .endm + + .macro strh1 ptr, regB, val + USER(9998f, strh \ptr, [\regB], \val) + .endm + + .macro ldr1 ptr, regB, val + USER(9998f, ldr \ptr, [\regB], \val) + .endm + + .macro str1 ptr, regB, val + USER(9998f, str \ptr, [\regB], \val) + .endm + + .macro ldp1 ptr, regB, regC, val + USER(9998f, ldp \ptr, \regB, [\regC], \val) + .endm + + .macro stp1 ptr, regB, regC, val + USER(9998f, stp \ptr, \regB, [\regC], \val) + .endm + +end .req x5 ENTRY(__copy_in_user) ALTERNATIVE("nop", __stringify(SET_PSTATE_PAN(0)), ARM64_HAS_PAN, \ CONFIG_ARM64_PAN) - add x5, x0, x2 // upper user buffer boundary - subs x2, x2, #16 - b.mi 1f -0: -USER(9f, ldp x3, x4, [x1], #16) - subs x2, x2, #16 -USER(9f, stp x3, x4, [x0], #16) - b.pl 0b -1: adds x2, x2, #8 - b.mi 2f -USER(9f, ldr x3, [x1], #8 ) - sub x2, x2, #8 -USER(9f, str x3, [x0], #8 ) -2: adds x2, x2, #4 - b.mi 3f -USER(9f, ldr w3, [x1], #4 ) - sub x2, x2, #4 -USER(9f, str w3, [x0], #4 ) -3: adds x2, x2, #2 - b.mi 4f -USER(9f, ldrh w3, [x1], #2 ) - sub x2, x2, #2 -USER(9f, strh w3, [x0], #2 ) -4: adds x2, x2, #1 - b.mi 5f -USER(9f, ldrb w3, [x1] ) -USER(9f, strb w3, [x0] ) -5: mov x0, #0 + add end, x0, x2 +#include "copy_template.S" ALTERNATIVE("nop", __stringify(SET_PSTATE_PAN(1)), ARM64_HAS_PAN, \ CONFIG_ARM64_PAN) + mov x0, #0 ret ENDPROC(__copy_in_user) .section .fixup,"ax" .align 2 -9: sub x0, x5, x0 // bytes not copied +9998: sub x0, end, dst // bytes not copied ret .previous diff --git a/arch/arm64/lib/copy_to_user.S b/arch/arm64/lib/copy_to_user.S index a257b47e2dc4..7512bbbc07ac 100644 --- a/arch/arm64/lib/copy_to_user.S +++ b/arch/arm64/lib/copy_to_user.S @@ -18,6 +18,7 @@ #include #include +#include #include #include @@ -31,44 +32,52 @@ * Returns: * x0 - bytes not copied */ + .macro ldrb1 ptr, regB, val + ldrb \ptr, [\regB], \val + .endm + + .macro strb1 ptr, regB, val + USER(9998f, strb \ptr, [\regB], \val) + .endm + + .macro ldrh1 ptr, regB, val + ldrh \ptr, [\regB], \val + .endm + + .macro strh1 ptr, regB, val + USER(9998f, strh \ptr, [\regB], \val) + .endm + + .macro ldr1 ptr, regB, val + ldr \ptr, [\regB], \val + .endm + + .macro str1 ptr, regB, val + USER(9998f, str \ptr, [\regB], \val) + .endm + + .macro ldp1 ptr, regB, regC, val + ldp \ptr, \regB, [\regC], \val + .endm + + .macro stp1 ptr, regB, regC, val + USER(9998f, stp \ptr, \regB, [\regC], \val) + .endm + +end .req x5 ENTRY(__copy_to_user) ALTERNATIVE("nop", __stringify(SET_PSTATE_PAN(0)), ARM64_HAS_PAN, \ CONFIG_ARM64_PAN) - add x5, x0, x2 // upper user buffer boundary - subs x2, x2, #16 - b.mi 1f -0: - ldp x3, x4, [x1], #16 - subs x2, x2, #16 -USER(9f, stp x3, x4, [x0], #16) - b.pl 0b -1: adds x2, x2, #8 - b.mi 2f - ldr x3, [x1], #8 - sub x2, x2, #8 -USER(9f, str x3, [x0], #8 ) -2: adds x2, x2, #4 - b.mi 3f - ldr w3, [x1], #4 - sub x2, x2, #4 -USER(9f, str w3, [x0], #4 ) -3: adds x2, x2, #2 - b.mi 4f - ldrh w3, [x1], #2 - sub x2, x2, #2 -USER(9f, strh w3, [x0], #2 ) -4: adds x2, x2, #1 - b.mi 5f - ldrb w3, [x1] -USER(9f, strb w3, [x0] ) -5: mov x0, #0 + add end, x0, x2 +#include "copy_template.S" ALTERNATIVE("nop", __stringify(SET_PSTATE_PAN(1)), ARM64_HAS_PAN, \ CONFIG_ARM64_PAN) + mov x0, #0 ret ENDPROC(__copy_to_user) .section .fixup,"ax" .align 2 -9: sub x0, x5, x0 // bytes not copied +9998: sub x0, end, dst // bytes not copied ret .previous -- cgit v1.2.3 From 127db024a7baee9874014dac33628253f438b4da Mon Sep 17 00:00:00 2001 From: Andrey Ryabinin Date: Thu, 17 Sep 2015 12:38:07 +0300 Subject: arm64: introduce VA_START macro - the first kernel virtual address. In order to not use lengthy (UL(0xffffffffffffffff) << VA_BITS) everywhere, replace it with VA_START. Signed-off-by: Andrey Ryabinin Reviewed-by: Catalin Marinas Signed-off-by: Catalin Marinas --- arch/arm64/include/asm/memory.h | 2 ++ arch/arm64/include/asm/pgtable.h | 2 +- 2 files changed, 3 insertions(+), 1 deletion(-) diff --git a/arch/arm64/include/asm/memory.h b/arch/arm64/include/asm/memory.h index 6b4c3ad75a2a..11ccf6c09533 100644 --- a/arch/arm64/include/asm/memory.h +++ b/arch/arm64/include/asm/memory.h @@ -42,12 +42,14 @@ * PAGE_OFFSET - the virtual address of the start of the kernel image (top * (VA_BITS - 1)) * VA_BITS - the maximum number of bits for virtual addresses. + * VA_START - the first kernel virtual address. * TASK_SIZE - the maximum size of a user space task. * TASK_UNMAPPED_BASE - the lower boundary of the mmap VM area. * The module space lives between the addresses given by TASK_SIZE * and PAGE_OFFSET - it must be within 128MB of the kernel text. */ #define VA_BITS (CONFIG_ARM64_VA_BITS) +#define VA_START (UL(0xffffffffffffffff) << VA_BITS) #define PAGE_OFFSET (UL(0xffffffffffffffff) << (VA_BITS - 1)) #define MODULES_END (PAGE_OFFSET) #define MODULES_VADDR (MODULES_END - SZ_64M) diff --git a/arch/arm64/include/asm/pgtable.h b/arch/arm64/include/asm/pgtable.h index 26b066690593..581d1406ba28 100644 --- a/arch/arm64/include/asm/pgtable.h +++ b/arch/arm64/include/asm/pgtable.h @@ -41,7 +41,7 @@ * fixed mappings and modules */ #define VMEMMAP_SIZE ALIGN((1UL << (VA_BITS - PAGE_SHIFT)) * sizeof(struct page), PUD_SIZE) -#define VMALLOC_START (UL(0xffffffffffffffff) << VA_BITS) +#define VMALLOC_START (VA_START) #define VMALLOC_END (PAGE_OFFSET - PUD_SIZE - VMEMMAP_SIZE - SZ_64K) #define vmemmap ((struct page *)(VMALLOC_END + SZ_64K)) -- cgit v1.2.3 From c51e97d89e526368eb697f87cd4d391b9e19f369 Mon Sep 17 00:00:00 2001 From: Will Deacon Date: Tue, 6 Oct 2015 18:46:21 +0100 Subject: arm64: mm: remove unused cpu_set_idmap_tcr_t0sz function With commit b08d4640a3dc ("arm64: remove dead code"), cpu_set_idmap_tcr_t0sz is no longer called and can therefore be removed from the kernel. This patch removes the function and effectively inlines the helper function __cpu_set_tcr_t0sz into cpu_set_default_tcr_t0sz. Reviewed-by: Catalin Marinas Acked-by: Ard Biesheuvel Signed-off-by: Will Deacon Signed-off-by: Catalin Marinas --- arch/arm64/include/asm/mmu_context.h | 35 ++++++++++++----------------------- 1 file changed, 12 insertions(+), 23 deletions(-) diff --git a/arch/arm64/include/asm/mmu_context.h b/arch/arm64/include/asm/mmu_context.h index 8ec41e5f56f0..549b89554ce8 100644 --- a/arch/arm64/include/asm/mmu_context.h +++ b/arch/arm64/include/asm/mmu_context.h @@ -77,34 +77,23 @@ static inline bool __cpu_uses_extended_idmap(void) unlikely(idmap_t0sz != TCR_T0SZ(VA_BITS))); } -static inline void __cpu_set_tcr_t0sz(u64 t0sz) -{ - unsigned long tcr; - - if (__cpu_uses_extended_idmap()) - asm volatile ( - " mrs %0, tcr_el1 ;" - " bfi %0, %1, %2, %3 ;" - " msr tcr_el1, %0 ;" - " isb" - : "=&r" (tcr) - : "r"(t0sz), "I"(TCR_T0SZ_OFFSET), "I"(TCR_TxSZ_WIDTH)); -} - -/* - * Set TCR.T0SZ to the value appropriate for activating the identity map. - */ -static inline void cpu_set_idmap_tcr_t0sz(void) -{ - __cpu_set_tcr_t0sz(idmap_t0sz); -} - /* * Set TCR.T0SZ to its default value (based on VA_BITS) */ static inline void cpu_set_default_tcr_t0sz(void) { - __cpu_set_tcr_t0sz(TCR_T0SZ(VA_BITS)); + unsigned long tcr; + + if (!__cpu_uses_extended_idmap()) + return; + + asm volatile ( + " mrs %0, tcr_el1 ;" + " bfi %0, %1, %2, %3 ;" + " msr tcr_el1, %0 ;" + " isb" + : "=&r" (tcr) + : "r"(TCR_T0SZ(VA_BITS)), "I"(TCR_T0SZ_OFFSET), "I"(TCR_TxSZ_WIDTH)); } static inline void switch_new_context(struct mm_struct *mm) -- cgit v1.2.3 From fa7aae8a4257e6be7051420dac1f150c1eef721b Mon Sep 17 00:00:00 2001 From: Will Deacon Date: Tue, 6 Oct 2015 18:46:22 +0100 Subject: arm64: proc: de-scope TLBI operation during cold boot When cold-booting a CPU, we must invalidate any junk entries from the local TLB prior to enabling the MMU. This doesn't require broadcasting within the inner-shareable domain, so de-scope the operation to apply only to the local CPU. Acked-by: Catalin Marinas Reviewed-by: Catalin Marinas Tested-by: Mark Rutland Signed-off-by: Will Deacon Signed-off-by: Catalin Marinas --- arch/arm64/mm/proc.S | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/arch/arm64/mm/proc.S b/arch/arm64/mm/proc.S index e4ee7bd8830a..bbde13d77da5 100644 --- a/arch/arm64/mm/proc.S +++ b/arch/arm64/mm/proc.S @@ -146,8 +146,8 @@ ENDPROC(cpu_do_switch_mm) * value of the SCTLR_EL1 register. */ ENTRY(__cpu_setup) - tlbi vmalle1is // invalidate I + D TLBs - dsb ish + tlbi vmalle1 // Invalidate local TLB + dsb nsh mov x0, #3 << 20 msr cpacr_el1, x0 // Enable FP/ASIMD -- cgit v1.2.3 From 8e63d38876691756f9bc6930850f1fb77809be1b Mon Sep 17 00:00:00 2001 From: Will Deacon Date: Tue, 6 Oct 2015 18:46:23 +0100 Subject: arm64: flush: use local TLB and I-cache invalidation There are a number of places where a single CPU is running with a private page-table and we need to perform maintenance on the TLB and I-cache in order to ensure correctness, but do not require the operation to be broadcast to other CPUs. This patch adds local variants of tlb_flush_all and __flush_icache_all to support these use-cases and updates the callers respectively. __local_flush_icache_all also implies an isb, since it is intended to be used synchronously. Reviewed-by: Catalin Marinas Acked-by: David Daney Acked-by: Ard Biesheuvel Signed-off-by: Will Deacon Signed-off-by: Catalin Marinas --- arch/arm64/include/asm/cacheflush.h | 7 +++++++ arch/arm64/include/asm/tlbflush.h | 8 ++++++++ arch/arm64/kernel/efi.c | 4 ++-- arch/arm64/kernel/smp.c | 2 +- arch/arm64/kernel/suspend.c | 2 +- arch/arm64/mm/context.c | 4 ++-- arch/arm64/mm/mmu.c | 2 +- 7 files changed, 22 insertions(+), 7 deletions(-) diff --git a/arch/arm64/include/asm/cacheflush.h b/arch/arm64/include/asm/cacheflush.h index c75b8d027eb1..54efedaf331f 100644 --- a/arch/arm64/include/asm/cacheflush.h +++ b/arch/arm64/include/asm/cacheflush.h @@ -115,6 +115,13 @@ extern void copy_to_user_page(struct vm_area_struct *, struct page *, #define ARCH_IMPLEMENTS_FLUSH_DCACHE_PAGE 1 extern void flush_dcache_page(struct page *); +static inline void __local_flush_icache_all(void) +{ + asm("ic iallu"); + dsb(nsh); + isb(); +} + static inline void __flush_icache_all(void) { asm("ic ialluis"); diff --git a/arch/arm64/include/asm/tlbflush.h b/arch/arm64/include/asm/tlbflush.h index 7bd2da021658..96f944e75dc4 100644 --- a/arch/arm64/include/asm/tlbflush.h +++ b/arch/arm64/include/asm/tlbflush.h @@ -63,6 +63,14 @@ * only require the D-TLB to be invalidated. * - kaddr - Kernel virtual memory address */ +static inline void local_flush_tlb_all(void) +{ + dsb(nshst); + asm("tlbi vmalle1"); + dsb(nsh); + isb(); +} + static inline void flush_tlb_all(void) { dsb(ishst); diff --git a/arch/arm64/kernel/efi.c b/arch/arm64/kernel/efi.c index 13671a9cf016..4d12926ea40d 100644 --- a/arch/arm64/kernel/efi.c +++ b/arch/arm64/kernel/efi.c @@ -344,9 +344,9 @@ static void efi_set_pgd(struct mm_struct *mm) else cpu_switch_mm(mm->pgd, mm); - flush_tlb_all(); + local_flush_tlb_all(); if (icache_is_aivivt()) - __flush_icache_all(); + __local_flush_icache_all(); } void efi_virtmap_load(void) diff --git a/arch/arm64/kernel/smp.c b/arch/arm64/kernel/smp.c index dbdaacddd9a5..fdd4d4dbd64f 100644 --- a/arch/arm64/kernel/smp.c +++ b/arch/arm64/kernel/smp.c @@ -152,7 +152,7 @@ asmlinkage void secondary_start_kernel(void) * point to zero page to avoid speculatively fetching new entries. */ cpu_set_reserved_ttbr0(); - flush_tlb_all(); + local_flush_tlb_all(); cpu_set_default_tcr_t0sz(); preempt_disable(); diff --git a/arch/arm64/kernel/suspend.c b/arch/arm64/kernel/suspend.c index 8297d502217e..3c5e4e6dcf68 100644 --- a/arch/arm64/kernel/suspend.c +++ b/arch/arm64/kernel/suspend.c @@ -90,7 +90,7 @@ int cpu_suspend(unsigned long arg, int (*fn)(unsigned long)) else cpu_switch_mm(mm->pgd, mm); - flush_tlb_all(); + local_flush_tlb_all(); /* * Restore per-cpu offset before any kernel diff --git a/arch/arm64/mm/context.c b/arch/arm64/mm/context.c index d70ff14dbdbd..48b53fb381af 100644 --- a/arch/arm64/mm/context.c +++ b/arch/arm64/mm/context.c @@ -48,9 +48,9 @@ static void flush_context(void) { /* set the reserved TTBR0 before flushing the TLB */ cpu_set_reserved_ttbr0(); - flush_tlb_all(); + local_flush_tlb_all(); if (icache_is_aivivt()) - __flush_icache_all(); + __local_flush_icache_all(); } static void set_mm_context(struct mm_struct *mm, unsigned int asid) diff --git a/arch/arm64/mm/mmu.c b/arch/arm64/mm/mmu.c index 9211b8527f25..71a310478c9e 100644 --- a/arch/arm64/mm/mmu.c +++ b/arch/arm64/mm/mmu.c @@ -456,7 +456,7 @@ void __init paging_init(void) * point to zero page to avoid speculatively fetching new entries. */ cpu_set_reserved_ttbr0(); - flush_tlb_all(); + local_flush_tlb_all(); cpu_set_default_tcr_t0sz(); } -- cgit v1.2.3 From 5aec715d7d3122f77cabaa7578d9d25a0c1ed20e Mon Sep 17 00:00:00 2001 From: Will Deacon Date: Tue, 6 Oct 2015 18:46:24 +0100 Subject: arm64: mm: rewrite ASID allocator and MM context-switching code Our current switch_mm implementation suffers from a number of problems: (1) The ASID allocator relies on IPIs to synchronise the CPUs on a rollover event (2) Because of (1), we cannot allocate ASIDs with interrupts disabled and therefore make use of a TIF_SWITCH_MM flag to postpone the actual switch to finish_arch_post_lock_switch (3) We run context switch with a reserved (invalid) TTBR0 value, even though the ASID and pgd are updated atomically (4) We take a global spinlock (cpu_asid_lock) during context-switch (5) We use h/w broadcast TLB operations when they are not required (e.g. in flush_context) This patch addresses these problems by rewriting the ASID algorithm to match the bitmap-based arch/arm/ implementation more closely. This in turn allows us to remove much of the complications surrounding switch_mm, including the ugly thread flag. Reviewed-by: Catalin Marinas Signed-off-by: Will Deacon Signed-off-by: Catalin Marinas --- arch/arm64/include/asm/mmu.h | 15 +-- arch/arm64/include/asm/mmu_context.h | 76 ++--------- arch/arm64/include/asm/thread_info.h | 1 - arch/arm64/kernel/asm-offsets.c | 2 +- arch/arm64/kernel/efi.c | 1 - arch/arm64/mm/context.c | 238 +++++++++++++++++++++-------------- arch/arm64/mm/proc.S | 2 +- 7 files changed, 166 insertions(+), 169 deletions(-) diff --git a/arch/arm64/include/asm/mmu.h b/arch/arm64/include/asm/mmu.h index 030208767185..990124a67eeb 100644 --- a/arch/arm64/include/asm/mmu.h +++ b/arch/arm64/include/asm/mmu.h @@ -17,15 +17,16 @@ #define __ASM_MMU_H typedef struct { - unsigned int id; - raw_spinlock_t id_lock; - void *vdso; + atomic64_t id; + void *vdso; } mm_context_t; -#define INIT_MM_CONTEXT(name) \ - .context.id_lock = __RAW_SPIN_LOCK_UNLOCKED(name.context.id_lock), - -#define ASID(mm) ((mm)->context.id & 0xffff) +/* + * This macro is only used by the TLBI code, which cannot race with an + * ASID change and therefore doesn't need to reload the counter using + * atomic64_read. + */ +#define ASID(mm) ((mm)->context.id.counter & 0xffff) extern void paging_init(void); extern void __iomem *early_io_map(phys_addr_t phys, unsigned long virt); diff --git a/arch/arm64/include/asm/mmu_context.h b/arch/arm64/include/asm/mmu_context.h index 549b89554ce8..f4c74a951b6c 100644 --- a/arch/arm64/include/asm/mmu_context.h +++ b/arch/arm64/include/asm/mmu_context.h @@ -28,13 +28,6 @@ #include #include -#define MAX_ASID_BITS 16 - -extern unsigned int cpu_last_asid; - -void __init_new_context(struct task_struct *tsk, struct mm_struct *mm); -void __new_context(struct mm_struct *mm); - #ifdef CONFIG_PID_IN_CONTEXTIDR static inline void contextidr_thread_switch(struct task_struct *next) { @@ -96,66 +89,19 @@ static inline void cpu_set_default_tcr_t0sz(void) : "r"(TCR_T0SZ(VA_BITS)), "I"(TCR_T0SZ_OFFSET), "I"(TCR_TxSZ_WIDTH)); } -static inline void switch_new_context(struct mm_struct *mm) -{ - unsigned long flags; - - __new_context(mm); - - local_irq_save(flags); - cpu_switch_mm(mm->pgd, mm); - local_irq_restore(flags); -} - -static inline void check_and_switch_context(struct mm_struct *mm, - struct task_struct *tsk) -{ - /* - * Required during context switch to avoid speculative page table - * walking with the wrong TTBR. - */ - cpu_set_reserved_ttbr0(); - - if (!((mm->context.id ^ cpu_last_asid) >> MAX_ASID_BITS)) - /* - * The ASID is from the current generation, just switch to the - * new pgd. This condition is only true for calls from - * context_switch() and interrupts are already disabled. - */ - cpu_switch_mm(mm->pgd, mm); - else if (irqs_disabled()) - /* - * Defer the new ASID allocation until after the context - * switch critical region since __new_context() cannot be - * called with interrupts disabled. - */ - set_ti_thread_flag(task_thread_info(tsk), TIF_SWITCH_MM); - else - /* - * That is a direct call to switch_mm() or activate_mm() with - * interrupts enabled and a new context. - */ - switch_new_context(mm); -} - -#define init_new_context(tsk,mm) (__init_new_context(tsk,mm),0) +/* + * It would be nice to return ASIDs back to the allocator, but unfortunately + * that introduces a race with a generation rollover where we could erroneously + * free an ASID allocated in a future generation. We could workaround this by + * freeing the ASID from the context of the dying mm (e.g. in arch_exit_mmap), + * but we'd then need to make sure that we didn't dirty any TLBs afterwards. + * Setting a reserved TTBR0 or EPD0 would work, but it all gets ugly when you + * take CPU migration into account. + */ #define destroy_context(mm) do { } while(0) +void check_and_switch_context(struct mm_struct *mm, unsigned int cpu); -#define finish_arch_post_lock_switch \ - finish_arch_post_lock_switch -static inline void finish_arch_post_lock_switch(void) -{ - if (test_and_clear_thread_flag(TIF_SWITCH_MM)) { - struct mm_struct *mm = current->mm; - unsigned long flags; - - __new_context(mm); - - local_irq_save(flags); - cpu_switch_mm(mm->pgd, mm); - local_irq_restore(flags); - } -} +#define init_new_context(tsk,mm) ({ atomic64_set(&mm->context.id, 0); 0; }) /* * This is called when "tsk" is about to enter lazy TLB mode. diff --git a/arch/arm64/include/asm/thread_info.h b/arch/arm64/include/asm/thread_info.h index dcd06d18a42a..555c6dec5ef2 100644 --- a/arch/arm64/include/asm/thread_info.h +++ b/arch/arm64/include/asm/thread_info.h @@ -111,7 +111,6 @@ static inline struct thread_info *current_thread_info(void) #define TIF_RESTORE_SIGMASK 20 #define TIF_SINGLESTEP 21 #define TIF_32BIT 22 /* 32bit process */ -#define TIF_SWITCH_MM 23 /* deferred switch_mm */ #define _TIF_SIGPENDING (1 << TIF_SIGPENDING) #define _TIF_NEED_RESCHED (1 << TIF_NEED_RESCHED) diff --git a/arch/arm64/kernel/asm-offsets.c b/arch/arm64/kernel/asm-offsets.c index 8d89cf8dae55..25de8b244961 100644 --- a/arch/arm64/kernel/asm-offsets.c +++ b/arch/arm64/kernel/asm-offsets.c @@ -60,7 +60,7 @@ int main(void) DEFINE(S_SYSCALLNO, offsetof(struct pt_regs, syscallno)); DEFINE(S_FRAME_SIZE, sizeof(struct pt_regs)); BLANK(); - DEFINE(MM_CONTEXT_ID, offsetof(struct mm_struct, context.id)); + DEFINE(MM_CONTEXT_ID, offsetof(struct mm_struct, context.id.counter)); BLANK(); DEFINE(VMA_VM_MM, offsetof(struct vm_area_struct, vm_mm)); DEFINE(VMA_VM_FLAGS, offsetof(struct vm_area_struct, vm_flags)); diff --git a/arch/arm64/kernel/efi.c b/arch/arm64/kernel/efi.c index 4d12926ea40d..a48d1f477b2e 100644 --- a/arch/arm64/kernel/efi.c +++ b/arch/arm64/kernel/efi.c @@ -48,7 +48,6 @@ static struct mm_struct efi_mm = { .mmap_sem = __RWSEM_INITIALIZER(efi_mm.mmap_sem), .page_table_lock = __SPIN_LOCK_UNLOCKED(efi_mm.page_table_lock), .mmlist = LIST_HEAD_INIT(efi_mm.mmlist), - INIT_MM_CONTEXT(efi_mm) }; static int uefi_debug __initdata; diff --git a/arch/arm64/mm/context.c b/arch/arm64/mm/context.c index 48b53fb381af..e902229b1a3d 100644 --- a/arch/arm64/mm/context.c +++ b/arch/arm64/mm/context.c @@ -17,135 +17,187 @@ * along with this program. If not, see . */ -#include +#include #include +#include #include -#include -#include +#include #include #include -#include -#define asid_bits(reg) \ - (((read_cpuid(ID_AA64MMFR0_EL1) & 0xf0) >> 2) + 8) +static u32 asid_bits; +static DEFINE_RAW_SPINLOCK(cpu_asid_lock); -#define ASID_FIRST_VERSION (1 << MAX_ASID_BITS) +static atomic64_t asid_generation; +static unsigned long *asid_map; -static DEFINE_RAW_SPINLOCK(cpu_asid_lock); -unsigned int cpu_last_asid = ASID_FIRST_VERSION; +static DEFINE_PER_CPU(atomic64_t, active_asids); +static DEFINE_PER_CPU(u64, reserved_asids); +static cpumask_t tlb_flush_pending; -/* - * We fork()ed a process, and we need a new context for the child to run in. - */ -void __init_new_context(struct task_struct *tsk, struct mm_struct *mm) +#define ASID_MASK (~GENMASK(asid_bits - 1, 0)) +#define ASID_FIRST_VERSION (1UL << asid_bits) +#define NUM_USER_ASIDS ASID_FIRST_VERSION + +static void flush_context(unsigned int cpu) { - mm->context.id = 0; - raw_spin_lock_init(&mm->context.id_lock); + int i; + u64 asid; + + /* Update the list of reserved ASIDs and the ASID bitmap. */ + bitmap_clear(asid_map, 0, NUM_USER_ASIDS); + + /* + * Ensure the generation bump is observed before we xchg the + * active_asids. + */ + smp_wmb(); + + for_each_possible_cpu(i) { + asid = atomic64_xchg_relaxed(&per_cpu(active_asids, i), 0); + /* + * If this CPU has already been through a + * rollover, but hasn't run another task in + * the meantime, we must preserve its reserved + * ASID, as this is the only trace we have of + * the process it is still running. + */ + if (asid == 0) + asid = per_cpu(reserved_asids, i); + __set_bit(asid & ~ASID_MASK, asid_map); + per_cpu(reserved_asids, i) = asid; + } + + /* Queue a TLB invalidate and flush the I-cache if necessary. */ + cpumask_setall(&tlb_flush_pending); + + if (icache_is_aivivt()) + __flush_icache_all(); } -static void flush_context(void) +static int is_reserved_asid(u64 asid) { - /* set the reserved TTBR0 before flushing the TLB */ - cpu_set_reserved_ttbr0(); - local_flush_tlb_all(); - if (icache_is_aivivt()) - __local_flush_icache_all(); + int cpu; + for_each_possible_cpu(cpu) + if (per_cpu(reserved_asids, cpu) == asid) + return 1; + return 0; } -static void set_mm_context(struct mm_struct *mm, unsigned int asid) +static u64 new_context(struct mm_struct *mm, unsigned int cpu) { - unsigned long flags; + static u32 cur_idx = 1; + u64 asid = atomic64_read(&mm->context.id); + u64 generation = atomic64_read(&asid_generation); - /* - * Locking needed for multi-threaded applications where the same - * mm->context.id could be set from different CPUs during the - * broadcast. This function is also called via IPI so the - * mm->context.id_lock has to be IRQ-safe. - */ - raw_spin_lock_irqsave(&mm->context.id_lock, flags); - if (likely((mm->context.id ^ cpu_last_asid) >> MAX_ASID_BITS)) { + if (asid != 0) { /* - * Old version of ASID found. Set the new one and reset - * mm_cpumask(mm). + * If our current ASID was active during a rollover, we + * can continue to use it and this was just a false alarm. */ - mm->context.id = asid; - cpumask_clear(mm_cpumask(mm)); + if (is_reserved_asid(asid)) + return generation | (asid & ~ASID_MASK); + + /* + * We had a valid ASID in a previous life, so try to re-use + * it if possible. + */ + asid &= ~ASID_MASK; + if (!__test_and_set_bit(asid, asid_map)) + goto bump_gen; } - raw_spin_unlock_irqrestore(&mm->context.id_lock, flags); /* - * Set the mm_cpumask(mm) bit for the current CPU. + * Allocate a free ASID. If we can't find one, take a note of the + * currently active ASIDs and mark the TLBs as requiring flushes. + * We always count from ASID #1, as we use ASID #0 when setting a + * reserved TTBR0 for the init_mm. */ - cpumask_set_cpu(smp_processor_id(), mm_cpumask(mm)); + asid = find_next_zero_bit(asid_map, NUM_USER_ASIDS, cur_idx); + if (asid != NUM_USER_ASIDS) + goto set_asid; + + /* We're out of ASIDs, so increment the global generation count */ + generation = atomic64_add_return_relaxed(ASID_FIRST_VERSION, + &asid_generation); + flush_context(cpu); + + /* We have at least 1 ASID per CPU, so this will always succeed */ + asid = find_next_zero_bit(asid_map, NUM_USER_ASIDS, 1); + +set_asid: + __set_bit(asid, asid_map); + cur_idx = asid; + +bump_gen: + asid |= generation; + cpumask_clear(mm_cpumask(mm)); + return asid; } -/* - * Reset the ASID on the current CPU. This function call is broadcast from the - * CPU handling the ASID rollover and holding cpu_asid_lock. - */ -static void reset_context(void *info) +void check_and_switch_context(struct mm_struct *mm, unsigned int cpu) { - unsigned int asid; - unsigned int cpu = smp_processor_id(); - struct mm_struct *mm = current->active_mm; + unsigned long flags; + u64 asid; + + asid = atomic64_read(&mm->context.id); /* - * current->active_mm could be init_mm for the idle thread immediately - * after secondary CPU boot or hotplug. TTBR0_EL1 is already set to - * the reserved value, so no need to reset any context. + * The memory ordering here is subtle. We rely on the control + * dependency between the generation read and the update of + * active_asids to ensure that we are synchronised with a + * parallel rollover (i.e. this pairs with the smp_wmb() in + * flush_context). */ - if (mm == &init_mm) - return; + if (!((asid ^ atomic64_read(&asid_generation)) >> asid_bits) + && atomic64_xchg_relaxed(&per_cpu(active_asids, cpu), asid)) + goto switch_mm_fastpath; + + raw_spin_lock_irqsave(&cpu_asid_lock, flags); + /* Check that our ASID belongs to the current generation. */ + asid = atomic64_read(&mm->context.id); + if ((asid ^ atomic64_read(&asid_generation)) >> asid_bits) { + asid = new_context(mm, cpu); + atomic64_set(&mm->context.id, asid); + } - smp_rmb(); - asid = cpu_last_asid + cpu; + if (cpumask_test_and_clear_cpu(cpu, &tlb_flush_pending)) + local_flush_tlb_all(); - flush_context(); - set_mm_context(mm, asid); + atomic64_set(&per_cpu(active_asids, cpu), asid); + cpumask_set_cpu(cpu, mm_cpumask(mm)); + raw_spin_unlock_irqrestore(&cpu_asid_lock, flags); - /* set the new ASID */ +switch_mm_fastpath: cpu_switch_mm(mm->pgd, mm); } -void __new_context(struct mm_struct *mm) +static int asids_init(void) { - unsigned int asid; - unsigned int bits = asid_bits(); - - raw_spin_lock(&cpu_asid_lock); - /* - * Check the ASID again, in case the change was broadcast from another - * CPU before we acquired the lock. - */ - if (!unlikely((mm->context.id ^ cpu_last_asid) >> MAX_ASID_BITS)) { - cpumask_set_cpu(smp_processor_id(), mm_cpumask(mm)); - raw_spin_unlock(&cpu_asid_lock); - return; - } - /* - * At this point, it is guaranteed that the current mm (with an old - * ASID) isn't active on any other CPU since the ASIDs are changed - * simultaneously via IPI. - */ - asid = ++cpu_last_asid; - - /* - * If we've used up all our ASIDs, we need to start a new version and - * flush the TLB. - */ - if (unlikely((asid & ((1 << bits) - 1)) == 0)) { - /* increment the ASID version */ - cpu_last_asid += (1 << MAX_ASID_BITS) - (1 << bits); - if (cpu_last_asid == 0) - cpu_last_asid = ASID_FIRST_VERSION; - asid = cpu_last_asid + smp_processor_id(); - flush_context(); - smp_wmb(); - smp_call_function(reset_context, NULL, 1); - cpu_last_asid += NR_CPUS - 1; + int fld = cpuid_feature_extract_field(read_cpuid(ID_AA64MMFR0_EL1), 4); + + switch (fld) { + default: + pr_warn("Unknown ASID size (%d); assuming 8-bit\n", fld); + /* Fallthrough */ + case 0: + asid_bits = 8; + break; + case 2: + asid_bits = 16; } - set_mm_context(mm, asid); - raw_spin_unlock(&cpu_asid_lock); + /* If we end up with more CPUs than ASIDs, expect things to crash */ + WARN_ON(NUM_USER_ASIDS < num_possible_cpus()); + atomic64_set(&asid_generation, ASID_FIRST_VERSION); + asid_map = kzalloc(BITS_TO_LONGS(NUM_USER_ASIDS) * sizeof(*asid_map), + GFP_KERNEL); + if (!asid_map) + panic("Failed to allocate bitmap for %lu ASIDs\n", + NUM_USER_ASIDS); + + pr_info("ASID allocator initialised with %lu entries\n", NUM_USER_ASIDS); + return 0; } +early_initcall(asids_init); diff --git a/arch/arm64/mm/proc.S b/arch/arm64/mm/proc.S index bbde13d77da5..91cb2eaac256 100644 --- a/arch/arm64/mm/proc.S +++ b/arch/arm64/mm/proc.S @@ -130,7 +130,7 @@ ENDPROC(cpu_do_resume) * - pgd_phys - physical address of new TTB */ ENTRY(cpu_do_switch_mm) - mmid w1, x1 // get mm->context.id + mmid x1, x1 // get mm->context.id bfi x0, x1, #48, #16 // set the ASID msr ttbr0_el1, x0 // set TTBR0 isb -- cgit v1.2.3 From f3e002c24e1f3b66f6e392ecd6928b5d04672c54 Mon Sep 17 00:00:00 2001 From: Will Deacon Date: Tue, 6 Oct 2015 18:46:25 +0100 Subject: arm64: tlbflush: remove redundant ASID casts to (unsigned long) The ASID macro returns a 64-bit (long long) value, so there is no need to cast to (unsigned long) before shifting prior to a TLBI operation. Reviewed-by: Catalin Marinas Signed-off-by: Will Deacon Signed-off-by: Catalin Marinas --- arch/arm64/include/asm/tlbflush.h | 9 ++++----- 1 file changed, 4 insertions(+), 5 deletions(-) diff --git a/arch/arm64/include/asm/tlbflush.h b/arch/arm64/include/asm/tlbflush.h index 96f944e75dc4..93e9f964805c 100644 --- a/arch/arm64/include/asm/tlbflush.h +++ b/arch/arm64/include/asm/tlbflush.h @@ -81,7 +81,7 @@ static inline void flush_tlb_all(void) static inline void flush_tlb_mm(struct mm_struct *mm) { - unsigned long asid = (unsigned long)ASID(mm) << 48; + unsigned long asid = ASID(mm) << 48; dsb(ishst); asm("tlbi aside1is, %0" : : "r" (asid)); @@ -91,8 +91,7 @@ static inline void flush_tlb_mm(struct mm_struct *mm) static inline void flush_tlb_page(struct vm_area_struct *vma, unsigned long uaddr) { - unsigned long addr = uaddr >> 12 | - ((unsigned long)ASID(vma->vm_mm) << 48); + unsigned long addr = uaddr >> 12 | (ASID(vma->vm_mm) << 48); dsb(ishst); asm("tlbi vale1is, %0" : : "r" (addr)); @@ -109,7 +108,7 @@ static inline void __flush_tlb_range(struct vm_area_struct *vma, unsigned long start, unsigned long end, bool last_level) { - unsigned long asid = (unsigned long)ASID(vma->vm_mm) << 48; + unsigned long asid = ASID(vma->vm_mm) << 48; unsigned long addr; if ((end - start) > MAX_TLB_RANGE) { @@ -162,7 +161,7 @@ static inline void flush_tlb_kernel_range(unsigned long start, unsigned long end static inline void __flush_tlb_pgtable(struct mm_struct *mm, unsigned long uaddr) { - unsigned long addr = uaddr >> 12 | ((unsigned long)ASID(mm) << 48); + unsigned long addr = uaddr >> 12 | (ASID(mm) << 48); dsb(ishst); asm("tlbi vae1is, %0" : : "r" (addr)); -- cgit v1.2.3 From 5a7862e83000ccfd36db927c6f060458fe271157 Mon Sep 17 00:00:00 2001 From: Will Deacon Date: Tue, 6 Oct 2015 18:46:26 +0100 Subject: arm64: tlbflush: avoid flushing when fullmm == 1 The TLB gather code sets fullmm=1 when tearing down the entire address space for an mm_struct on exit or execve. Given that the ASID allocator will never re-allocate a dirty ASID, this flushing is not needed and can simply be avoided in the flushing code. Reviewed-by: Catalin Marinas Signed-off-by: Will Deacon Signed-off-by: Catalin Marinas --- arch/arm64/include/asm/tlb.h | 26 +++++++++++++++----------- 1 file changed, 15 insertions(+), 11 deletions(-) diff --git a/arch/arm64/include/asm/tlb.h b/arch/arm64/include/asm/tlb.h index d6e6b6660380..ffdaea7954bb 100644 --- a/arch/arm64/include/asm/tlb.h +++ b/arch/arm64/include/asm/tlb.h @@ -37,17 +37,21 @@ static inline void __tlb_remove_table(void *_table) static inline void tlb_flush(struct mmu_gather *tlb) { - if (tlb->fullmm) { - flush_tlb_mm(tlb->mm); - } else { - struct vm_area_struct vma = { .vm_mm = tlb->mm, }; - /* - * The intermediate page table levels are already handled by - * the __(pte|pmd|pud)_free_tlb() functions, so last level - * TLBI is sufficient here. - */ - __flush_tlb_range(&vma, tlb->start, tlb->end, true); - } + struct vm_area_struct vma = { .vm_mm = tlb->mm, }; + + /* + * The ASID allocator will either invalidate the ASID or mark + * it as used. + */ + if (tlb->fullmm) + return; + + /* + * The intermediate page table levels are already handled by + * the __(pte|pmd|pud)_free_tlb() functions, so last level + * TLBI is sufficient here. + */ + __flush_tlb_range(&vma, tlb->start, tlb->end, true); } static inline void __pte_free_tlb(struct mmu_gather *tlb, pgtable_t pte, -- cgit v1.2.3 From c2775b2ee5caca19f661ee2ab5af92462596db71 Mon Sep 17 00:00:00 2001 From: Will Deacon Date: Tue, 6 Oct 2015 18:46:27 +0100 Subject: arm64: switch_mm: simplify mm and CPU checks switch_mm performs some checks to try and avoid entering the ASID allocator: (1) If we're switching to the init_mm (no user mappings), then simply set a reserved TTBR0 value with no page table (the zero page) (2) If prev == next *and* the mm_cpumask indicates that we've run on this CPU before, then we can skip the allocator. However, there is plenty of redundancy here. With the new ASID allocator, if prev == next, then we know that our ASID is valid and do not need to worry about re-allocation. Consequently, we can drop the mm_cpumask check in (2) and move the prev == next check before the init_mm check, since if prev == next == init_mm then there's nothing to do. Reviewed-by: Catalin Marinas Signed-off-by: Will Deacon Signed-off-by: Catalin Marinas --- arch/arm64/include/asm/mmu_context.h | 6 ++++-- arch/arm64/mm/context.c | 2 +- 2 files changed, 5 insertions(+), 3 deletions(-) diff --git a/arch/arm64/include/asm/mmu_context.h b/arch/arm64/include/asm/mmu_context.h index f4c74a951b6c..c0e87898ba96 100644 --- a/arch/arm64/include/asm/mmu_context.h +++ b/arch/arm64/include/asm/mmu_context.h @@ -129,6 +129,9 @@ switch_mm(struct mm_struct *prev, struct mm_struct *next, { unsigned int cpu = smp_processor_id(); + if (prev == next) + return; + /* * init_mm.pgd does not contain any user mappings and it is always * active for kernel addresses in TTBR1. Just set the reserved TTBR0. @@ -138,8 +141,7 @@ switch_mm(struct mm_struct *prev, struct mm_struct *next, return; } - if (!cpumask_test_and_set_cpu(cpu, mm_cpumask(next)) || prev != next) - check_and_switch_context(next, tsk); + check_and_switch_context(next, cpu); } #define deactivate_mm(tsk,mm) do { } while (0) diff --git a/arch/arm64/mm/context.c b/arch/arm64/mm/context.c index e902229b1a3d..4b9ec4484e3f 100644 --- a/arch/arm64/mm/context.c +++ b/arch/arm64/mm/context.c @@ -166,10 +166,10 @@ void check_and_switch_context(struct mm_struct *mm, unsigned int cpu) local_flush_tlb_all(); atomic64_set(&per_cpu(active_asids, cpu), asid); - cpumask_set_cpu(cpu, mm_cpumask(mm)); raw_spin_unlock_irqrestore(&cpu_asid_lock, flags); switch_mm_fastpath: + cpumask_set_cpu(cpu, mm_cpumask(mm)); cpu_switch_mm(mm->pgd, mm); } -- cgit v1.2.3 From 38d96287504a2478eb538bfecfa1fddd743bb6b2 Mon Sep 17 00:00:00 2001 From: Will Deacon Date: Tue, 6 Oct 2015 18:46:28 +0100 Subject: arm64: mm: kill mm_cpumask usage mm_cpumask isn't actually used for anything on arm64, so remove all the code trying to keep it up-to-date. Reviewed-by: Catalin Marinas Signed-off-by: Will Deacon Signed-off-by: Catalin Marinas --- arch/arm64/kernel/smp.c | 7 ------- arch/arm64/mm/context.c | 2 -- 2 files changed, 9 deletions(-) diff --git a/arch/arm64/kernel/smp.c b/arch/arm64/kernel/smp.c index fdd4d4dbd64f..03b0aa28ea61 100644 --- a/arch/arm64/kernel/smp.c +++ b/arch/arm64/kernel/smp.c @@ -142,7 +142,6 @@ asmlinkage void secondary_start_kernel(void) */ atomic_inc(&mm->mm_count); current->active_mm = mm; - cpumask_set_cpu(cpu, mm_cpumask(mm)); set_my_cpu_offset(per_cpu_offset(smp_processor_id())); printk("CPU%u: Booted secondary processor\n", cpu); @@ -233,12 +232,6 @@ int __cpu_disable(void) * OK - migrate IRQs away from this CPU */ migrate_irqs(); - - /* - * Remove this CPU from the vm mask set of all processes. - */ - clear_tasks_mm_cpumask(cpu); - return 0; } diff --git a/arch/arm64/mm/context.c b/arch/arm64/mm/context.c index 4b9ec4484e3f..f636a2639f03 100644 --- a/arch/arm64/mm/context.c +++ b/arch/arm64/mm/context.c @@ -132,7 +132,6 @@ set_asid: bump_gen: asid |= generation; - cpumask_clear(mm_cpumask(mm)); return asid; } @@ -169,7 +168,6 @@ void check_and_switch_context(struct mm_struct *mm, unsigned int cpu) raw_spin_unlock_irqrestore(&cpu_asid_lock, flags); switch_mm_fastpath: - cpumask_set_cpu(cpu, mm_cpumask(mm)); cpu_switch_mm(mm->pgd, mm); } -- cgit v1.2.3 From 28c6fbc3b446caf5f8d1f2d7b79e09e743158a4d Mon Sep 17 00:00:00 2001 From: Will Deacon Date: Tue, 6 Oct 2015 18:46:29 +0100 Subject: arm64: tlb: remove redundant barrier from __flush_tlb_pgtable __flush_tlb_pgtable is used to invalidate intermediate page table entries after they have been cleared and are about to be freed. Since pXd_clear imply memory barriers, we don't need the extra one here. Reviewed-by: Catalin Marinas Signed-off-by: Will Deacon Signed-off-by: Catalin Marinas --- arch/arm64/include/asm/tlbflush.h | 1 - 1 file changed, 1 deletion(-) diff --git a/arch/arm64/include/asm/tlbflush.h b/arch/arm64/include/asm/tlbflush.h index 93e9f964805c..b460ae28e346 100644 --- a/arch/arm64/include/asm/tlbflush.h +++ b/arch/arm64/include/asm/tlbflush.h @@ -163,7 +163,6 @@ static inline void __flush_tlb_pgtable(struct mm_struct *mm, { unsigned long addr = uaddr >> 12 | (ASID(mm) << 48); - dsb(ishst); asm("tlbi vae1is, %0" : : "r" (addr)); dsb(ish); } -- cgit v1.2.3 From 120798d2e7d1ac87365fe5ea91b074bb42ca1eff Mon Sep 17 00:00:00 2001 From: Will Deacon Date: Tue, 6 Oct 2015 18:46:30 +0100 Subject: arm64: mm: remove dsb from update_mmu_cache update_mmu_cache() consists of a dsb(ishst) instruction so that new user mappings are guaranteed to be visible to the page table walker on exception return. In reality this can be a very expensive operation which is rarely needed. Removing this barrier shows a modest improvement in hackbench scores and , in the worst case, we re-take the user fault and establish that there was nothing to do. Reviewed-by: Catalin Marinas Signed-off-by: Will Deacon Signed-off-by: Catalin Marinas --- arch/arm64/include/asm/pgtable.h | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/arch/arm64/include/asm/pgtable.h b/arch/arm64/include/asm/pgtable.h index 581d1406ba28..5043a84e724e 100644 --- a/arch/arm64/include/asm/pgtable.h +++ b/arch/arm64/include/asm/pgtable.h @@ -646,10 +646,10 @@ static inline void update_mmu_cache(struct vm_area_struct *vma, unsigned long addr, pte_t *ptep) { /* - * set_pte() does not have a DSB for user mappings, so make sure that - * the page table write is visible. + * We don't do anything here, so there's a very small chance of + * us retaking a user fault which we just fixed up. The alternative + * is doing a dsb(ishst), but that penalises the fastpath. */ - dsb(ishst); } #define update_mmu_cache_pmd(vma, address, pmd) do { } while (0) -- cgit v1.2.3 From 8f48c0629049fdebb6e783803325bff19176d3fd Mon Sep 17 00:00:00 2001 From: Will Deacon Date: Wed, 7 Oct 2015 11:37:36 +0100 Subject: arm64: hw_breakpoint: use target state to determine ABI behaviour The arm64 hw_breakpoint interface is slightly less flexible than its 32-bit counterpart, thanks to some changes in the architecture rendering unaligned watchpoint addresses obselete for AArch64. However, in a multi-arch environment (i.e. debugging a 32-bit target with a 64-bit GDB under a 64-bit kernel), we need to provide a feature compatible interface to GDB in order for debugging to function correctly. This patch adds a new helper, is_compat_bp, to our hw_breakpoint implementation which changes the interface behaviour based on the architecture of the debug target as opposed to the debugger itself. This allows debugged to function as expected for multi-arch configurations without relying on deprecated architectural behaviours when debugging native applications. Cc: Yao Qi Signed-off-by: Will Deacon Signed-off-by: Catalin Marinas --- arch/arm64/kernel/hw_breakpoint.c | 18 ++++++++++++++++-- 1 file changed, 16 insertions(+), 2 deletions(-) diff --git a/arch/arm64/kernel/hw_breakpoint.c b/arch/arm64/kernel/hw_breakpoint.c index bba85c8f8037..46465d9fbc4d 100644 --- a/arch/arm64/kernel/hw_breakpoint.c +++ b/arch/arm64/kernel/hw_breakpoint.c @@ -163,6 +163,20 @@ enum hw_breakpoint_ops { HW_BREAKPOINT_RESTORE }; +static int is_compat_bp(struct perf_event *bp) +{ + struct task_struct *tsk = bp->hw.target; + + /* + * tsk can be NULL for per-cpu (non-ptrace) breakpoints. + * In this case, use the native interface, since we don't have + * the notion of a "compat CPU" and could end up relying on + * deprecated behaviour if we use unaligned watchpoints in + * AArch64 state. + */ + return tsk && is_compat_thread(task_thread_info(tsk)); +} + /** * hw_breakpoint_slot_setup - Find and setup a perf slot according to * operations @@ -420,7 +434,7 @@ static int arch_build_bp_info(struct perf_event *bp) * Watchpoints can be of length 1, 2, 4 or 8 bytes. */ if (info->ctrl.type == ARM_BREAKPOINT_EXECUTE) { - if (is_compat_task()) { + if (is_compat_bp(bp)) { if (info->ctrl.len != ARM_BREAKPOINT_LEN_2 && info->ctrl.len != ARM_BREAKPOINT_LEN_4) return -EINVAL; @@ -477,7 +491,7 @@ int arch_validate_hwbkpt_settings(struct perf_event *bp) * AArch32 tasks expect some simple alignment fixups, so emulate * that here. */ - if (is_compat_task()) { + if (is_compat_bp(bp)) { if (info->ctrl.len == ARM_BREAKPOINT_LEN_8) alignment_mask = 0x7; else -- cgit v1.2.3 From 6475b2d846176e3272351266869481a21ff47866 Mon Sep 17 00:00:00 2001 From: Mark Rutland Date: Fri, 2 Oct 2015 10:55:03 +0100 Subject: arm64: perf: move to shared arm_pmu framework Now that the arm_pmu framework has been factored out to drivers/perf we can make use of it for arm64, gaining support for heterogeneous PMUs and unifying the two codebases before they diverge further. The as yet unused PMU name for PMUv3 is changed to armv8_pmuv3, matching the style previously applied to the 32-bit PMUs. Signed-off-by: Mark Rutland Acked-by: Will Deacon Signed-off-by: Catalin Marinas --- arch/arm64/Kconfig | 8 +- arch/arm64/include/asm/pmu.h | 83 ---- arch/arm64/kernel/perf_event.c | 951 ++++------------------------------------- drivers/perf/Kconfig | 2 +- 4 files changed, 93 insertions(+), 951 deletions(-) delete mode 100644 arch/arm64/include/asm/pmu.h diff --git a/arch/arm64/Kconfig b/arch/arm64/Kconfig index 07d1811aa03f..98b4f98a13de 100644 --- a/arch/arm64/Kconfig +++ b/arch/arm64/Kconfig @@ -454,12 +454,8 @@ config HAVE_ARCH_PFN_VALID def_bool ARCH_HAS_HOLES_MEMORYMODEL || !SPARSEMEM config HW_PERF_EVENTS - bool "Enable hardware performance counter support for perf events" - depends on PERF_EVENTS - default y - help - Enable hardware performance counter support for perf events. If - disabled, perf events will use software events only. + def_bool y + depends on ARM_PMU config SYS_SUPPORTS_HUGETLBFS def_bool y diff --git a/arch/arm64/include/asm/pmu.h b/arch/arm64/include/asm/pmu.h deleted file mode 100644 index b7710a59672c..000000000000 --- a/arch/arm64/include/asm/pmu.h +++ /dev/null @@ -1,83 +0,0 @@ -/* - * Based on arch/arm/include/asm/pmu.h - * - * Copyright (C) 2009 picoChip Designs Ltd, Jamie Iles - * Copyright (C) 2012 ARM Ltd. - * - * This program is free software; you can redistribute it and/or modify - * it under the terms of the GNU General Public License version 2 as - * published by the Free Software Foundation. - * - * This program is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License for more details. - * - * You should have received a copy of the GNU General Public License - * along with this program. If not, see . - */ -#ifndef __ASM_PMU_H -#define __ASM_PMU_H - -#ifdef CONFIG_HW_PERF_EVENTS - -/* The events for a given PMU register set. */ -struct pmu_hw_events { - /* - * The events that are active on the PMU for the given index. - */ - struct perf_event **events; - - /* - * A 1 bit for an index indicates that the counter is being used for - * an event. A 0 means that the counter can be used. - */ - unsigned long *used_mask; - - /* - * Hardware lock to serialize accesses to PMU registers. Needed for the - * read/modify/write sequences. - */ - raw_spinlock_t pmu_lock; -}; - -struct arm_pmu { - struct pmu pmu; - cpumask_t active_irqs; - int *irq_affinity; - const char *name; - irqreturn_t (*handle_irq)(int irq_num, void *dev); - void (*enable)(struct hw_perf_event *evt, int idx); - void (*disable)(struct hw_perf_event *evt, int idx); - int (*get_event_idx)(struct pmu_hw_events *hw_events, - struct hw_perf_event *hwc); - int (*set_event_filter)(struct hw_perf_event *evt, - struct perf_event_attr *attr); - u32 (*read_counter)(int idx); - void (*write_counter)(int idx, u32 val); - void (*start)(void); - void (*stop)(void); - void (*reset)(void *); - int (*map_event)(struct perf_event *event); - int num_events; - atomic_t active_events; - struct mutex reserve_mutex; - u64 max_period; - struct platform_device *plat_device; - struct pmu_hw_events *(*get_hw_events)(void); -}; - -#define to_arm_pmu(p) (container_of(p, struct arm_pmu, pmu)) - -int __init armpmu_register(struct arm_pmu *armpmu, char *name, int type); - -u64 armpmu_event_update(struct perf_event *event, - struct hw_perf_event *hwc, - int idx); - -int armpmu_event_set_period(struct perf_event *event, - struct hw_perf_event *hwc, - int idx); - -#endif /* CONFIG_HW_PERF_EVENTS */ -#endif /* __ASM_PMU_H */ diff --git a/arch/arm64/kernel/perf_event.c b/arch/arm64/kernel/perf_event.c index f9a74d4fff3b..a614100cd00f 100644 --- a/arch/arm64/kernel/perf_event.c +++ b/arch/arm64/kernel/perf_event.c @@ -18,651 +18,12 @@ * You should have received a copy of the GNU General Public License * along with this program. If not, see . */ -#define pr_fmt(fmt) "hw perfevents: " fmt - -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include #include -#include - -/* - * ARMv8 supports a maximum of 32 events. - * The cycle counter is included in this total. - */ -#define ARMPMU_MAX_HWEVENTS 32 - -static DEFINE_PER_CPU(struct perf_event * [ARMPMU_MAX_HWEVENTS], hw_events); -static DEFINE_PER_CPU(unsigned long [BITS_TO_LONGS(ARMPMU_MAX_HWEVENTS)], used_mask); -static DEFINE_PER_CPU(struct pmu_hw_events, cpu_hw_events); - -#define to_arm_pmu(p) (container_of(p, struct arm_pmu, pmu)) - -/* Set at runtime when we know what CPU type we are. */ -static struct arm_pmu *cpu_pmu; - -int -armpmu_get_max_events(void) -{ - int max_events = 0; - - if (cpu_pmu != NULL) - max_events = cpu_pmu->num_ev