summaryrefslogtreecommitdiffstats
path: root/arch/powerpc/mm
diff options
context:
space:
mode:
Diffstat (limited to 'arch/powerpc/mm')
-rw-r--r--arch/powerpc/mm/fsl_booke_mmu.c7
-rw-r--r--arch/powerpc/mm/numa.c122
-rw-r--r--arch/powerpc/mm/pgtable.c1
-rw-r--r--arch/powerpc/mm/tlb_hash32.c15
-rw-r--r--arch/powerpc/mm/tlb_nohash.c129
5 files changed, 196 insertions, 78 deletions
diff --git a/arch/powerpc/mm/fsl_booke_mmu.c b/arch/powerpc/mm/fsl_booke_mmu.c
index cdc7526e9c93..4b66a1ece6d8 100644
--- a/arch/powerpc/mm/fsl_booke_mmu.c
+++ b/arch/powerpc/mm/fsl_booke_mmu.c
@@ -104,9 +104,10 @@ unsigned long p_mapped_by_tlbcam(phys_addr_t pa)
}
/*
- * Set up one of the I/D BAT (block address translation) register pairs.
- * The parameters are not checked; in particular size must be a power
- * of 4 between 4k and 256M.
+ * Set up a variable-size TLB entry (tlbcam). The parameters are not checked;
+ * in particular size must be a power of 4 between 4k and 256M (or 1G, for cpus
+ * that support extended page sizes). Note that while some cpus support a
+ * page size of 4G, we don't allow its use here.
*/
static void settlbcam(int index, unsigned long virt, phys_addr_t phys,
unsigned long size, unsigned long flags, unsigned int pid)
diff --git a/arch/powerpc/mm/numa.c b/arch/powerpc/mm/numa.c
index aa731af720c0..002878ccf90b 100644
--- a/arch/powerpc/mm/numa.c
+++ b/arch/powerpc/mm/numa.c
@@ -42,6 +42,12 @@ EXPORT_SYMBOL(node_data);
static int min_common_depth;
static int n_mem_addr_cells, n_mem_size_cells;
+static int form1_affinity;
+
+#define MAX_DISTANCE_REF_POINTS 4
+static int distance_ref_points_depth;
+static const unsigned int *distance_ref_points;
+static int distance_lookup_table[MAX_NUMNODES][MAX_DISTANCE_REF_POINTS];
/*
* Allocate node_to_cpumask_map based on number of available nodes
@@ -204,6 +210,39 @@ static const u32 *of_get_usable_memory(struct device_node *memory)
return prop;
}
+int __node_distance(int a, int b)
+{
+ int i;
+ int distance = LOCAL_DISTANCE;
+
+ if (!form1_affinity)
+ return distance;
+
+ for (i = 0; i < distance_ref_points_depth; i++) {
+ if (distance_lookup_table[a][i] == distance_lookup_table[b][i])
+ break;
+
+ /* Double the distance for each NUMA level */
+ distance *= 2;
+ }
+
+ return distance;
+}
+
+static void initialize_distance_lookup_table(int nid,
+ const unsigned int *associativity)
+{
+ int i;
+
+ if (!form1_affinity)
+ return;
+
+ for (i = 0; i < distance_ref_points_depth; i++) {
+ distance_lookup_table[nid][i] =
+ associativity[distance_ref_points[i]];
+ }
+}
+
/* Returns nid in the range [0..MAX_NUMNODES-1], or -1 if no useful numa
* info is found.
*/
@@ -225,6 +264,10 @@ static int of_node_to_nid_single(struct device_node *device)
/* POWER4 LPAR uses 0xffff as invalid node */
if (nid == 0xffff || nid >= MAX_NUMNODES)
nid = -1;
+
+ if (nid > 0 && tmp[0] >= distance_ref_points_depth)
+ initialize_distance_lookup_table(nid, tmp);
+
out:
return nid;
}
@@ -251,26 +294,10 @@ int of_node_to_nid(struct device_node *device)
}
EXPORT_SYMBOL_GPL(of_node_to_nid);
-/*
- * In theory, the "ibm,associativity" property may contain multiple
- * associativity lists because a resource may be multiply connected
- * into the machine. This resource then has different associativity
- * characteristics relative to its multiple connections. We ignore
- * this for now. We also assume that all cpu and memory sets have
- * their distances represented at a common level. This won't be
- * true for hierarchical NUMA.
- *
- * In any case the ibm,associativity-reference-points should give
- * the correct depth for a normal NUMA system.
- *
- * - Dave Hansen <haveblue@us.ibm.com>
- */
static int __init find_min_common_depth(void)
{
- int depth, index;
- const unsigned int *ref_points;
+ int depth;
struct device_node *rtas_root;
- unsigned int len;
struct device_node *chosen;
const char *vec5;
@@ -280,18 +307,28 @@ static int __init find_min_common_depth(void)
return -1;
/*
- * this property is 2 32-bit integers, each representing a level of
- * depth in the associativity nodes. The first is for an SMP
- * configuration (should be all 0's) and the second is for a normal
- * NUMA configuration.
+ * This property is a set of 32-bit integers, each representing
+ * an index into the ibm,associativity nodes.
+ *
+ * With form 0 affinity the first integer is for an SMP configuration
+ * (should be all 0's) and the second is for a normal NUMA
+ * configuration. We have only one level of NUMA.
+ *
+ * With form 1 affinity the first integer is the most significant
+ * NUMA boundary and the following are progressively less significant
+ * boundaries. There can be more than one level of NUMA.
*/
- index = 1;
- ref_points = of_get_property(rtas_root,
- "ibm,associativity-reference-points", &len);
+ distance_ref_points = of_get_property(rtas_root,
+ "ibm,associativity-reference-points",
+ &distance_ref_points_depth);
+
+ if (!distance_ref_points) {
+ dbg("NUMA: ibm,associativity-reference-points not found.\n");
+ goto err;
+ }
+
+ distance_ref_points_depth /= sizeof(int);
- /*
- * For form 1 affinity information we want the first field
- */
#define VEC5_AFFINITY_BYTE 5
#define VEC5_AFFINITY 0x80
chosen = of_find_node_by_path("/chosen");
@@ -299,19 +336,38 @@ static int __init find_min_common_depth(void)
vec5 = of_get_property(chosen, "ibm,architecture-vec-5", NULL);
if (vec5 && (vec5[VEC5_AFFINITY_BYTE] & VEC5_AFFINITY)) {
dbg("Using form 1 affinity\n");
- index = 0;
+ form1_affinity = 1;
}
}
- if ((len >= 2 * sizeof(unsigned int)) && ref_points) {
- depth = ref_points[index];
+ if (form1_affinity) {
+ depth = distance_ref_points[0];
} else {
- dbg("NUMA: ibm,associativity-reference-points not found.\n");
- depth = -1;
+ if (distance_ref_points_depth < 2) {
+ printk(KERN_WARNING "NUMA: "
+ "short ibm,associativity-reference-points\n");
+ goto err;
+ }
+
+ depth = distance_ref_points[1];
}
- of_node_put(rtas_root);
+ /*
+ * Warn and cap if the hardware supports more than
+ * MAX_DISTANCE_REF_POINTS domains.
+ */
+ if (distance_ref_points_depth > MAX_DISTANCE_REF_POINTS) {
+ printk(KERN_WARNING "NUMA: distance array capped at "
+ "%d entries\n", MAX_DISTANCE_REF_POINTS);
+ distance_ref_points_depth = MAX_DISTANCE_REF_POINTS;
+ }
+
+ of_node_put(rtas_root);
return depth;
+
+err:
+ of_node_put(rtas_root);
+ return -1;
}
static void __init get_n_mem_cells(int *n_addr_cells, int *n_size_cells)
diff --git a/arch/powerpc/mm/pgtable.c b/arch/powerpc/mm/pgtable.c
index ebc2f38eb381..2c7e801ab20b 100644
--- a/arch/powerpc/mm/pgtable.c
+++ b/arch/powerpc/mm/pgtable.c
@@ -92,7 +92,6 @@ static void pte_free_rcu_callback(struct rcu_head *head)
static void pte_free_submit(struct pte_freelist_batch *batch)
{
- INIT_RCU_HEAD(&batch->rcu);
call_rcu(&batch->rcu, pte_free_rcu_callback);
}
diff --git a/arch/powerpc/mm/tlb_hash32.c b/arch/powerpc/mm/tlb_hash32.c
index 8aaa8b7eb324..690566b66e8e 100644
--- a/arch/powerpc/mm/tlb_hash32.c
+++ b/arch/powerpc/mm/tlb_hash32.c
@@ -89,17 +89,6 @@ void tlb_flush(struct mmu_gather *tlb)
* -- Cort
*/
-/*
- * 750 SMP is a Bad Idea because the 750 doesn't broadcast all
- * the cache operations on the bus. Hence we need to use an IPI
- * to get the other CPU(s) to invalidate their TLBs.
- */
-#ifdef CONFIG_SMP_750
-#define FINISH_FLUSH smp_send_tlb_invalidate(0)
-#else
-#define FINISH_FLUSH do { } while (0)
-#endif
-
static void flush_range(struct mm_struct *mm, unsigned long start,
unsigned long end)
{
@@ -138,7 +127,6 @@ static void flush_range(struct mm_struct *mm, unsigned long start,
void flush_tlb_kernel_range(unsigned long start, unsigned long end)
{
flush_range(&init_mm, start, end);
- FINISH_FLUSH;
}
EXPORT_SYMBOL(flush_tlb_kernel_range);
@@ -162,7 +150,6 @@ void flush_tlb_mm(struct mm_struct *mm)
*/
for (mp = mm->mmap; mp != NULL; mp = mp->vm_next)
flush_range(mp->vm_mm, mp->vm_start, mp->vm_end);
- FINISH_FLUSH;
}
EXPORT_SYMBOL(flush_tlb_mm);
@@ -179,7 +166,6 @@ void flush_tlb_page(struct vm_area_struct *vma, unsigned long vmaddr)
pmd = pmd_offset(pud_offset(pgd_offset(mm, vmaddr), vmaddr), vmaddr);
if (!pmd_none(*pmd))
flush_hash_pages(mm->context.id, vmaddr, pmd_val(*pmd), 1);
- FINISH_FLUSH;
}
EXPORT_SYMBOL(flush_tlb_page);
@@ -192,6 +178,5 @@ void flush_tlb_range(struct vm_area_struct *vma, unsigned long start,
unsigned long end)
{
flush_range(vma->vm_mm, start, end);
- FINISH_FLUSH;
}
EXPORT_SYMBOL(flush_tlb_range);
diff --git a/arch/powerpc/mm/tlb_nohash.c b/arch/powerpc/mm/tlb_nohash.c
index d8695b02a968..fe391e942521 100644
--- a/arch/powerpc/mm/tlb_nohash.c
+++ b/arch/powerpc/mm/tlb_nohash.c
@@ -46,6 +46,7 @@
struct mmu_psize_def mmu_psize_defs[MMU_PAGE_COUNT] = {
[MMU_PAGE_4K] = {
.shift = 12,
+ .ind = 20,
.enc = BOOK3E_PAGESZ_4K,
},
[MMU_PAGE_16K] = {
@@ -54,6 +55,7 @@ struct mmu_psize_def mmu_psize_defs[MMU_PAGE_COUNT] = {
},
[MMU_PAGE_64K] = {
.shift = 16,
+ .ind = 28,
.enc = BOOK3E_PAGESZ_64K,
},
[MMU_PAGE_1M] = {
@@ -62,6 +64,7 @@ struct mmu_psize_def mmu_psize_defs[MMU_PAGE_COUNT] = {
},
[MMU_PAGE_16M] = {
.shift = 24,
+ .ind = 36,
.enc = BOOK3E_PAGESZ_16M,
},
[MMU_PAGE_256M] = {
@@ -344,16 +347,108 @@ void tlb_flush_pgtable(struct mmu_gather *tlb, unsigned long address)
}
}
-/*
- * Early initialization of the MMU TLB code
- */
-static void __early_init_mmu(int boot_cpu)
+static void setup_page_sizes(void)
+{
+ unsigned int tlb0cfg = mfspr(SPRN_TLB0CFG);
+ unsigned int tlb0ps = mfspr(SPRN_TLB0PS);
+ unsigned int eptcfg = mfspr(SPRN_EPTCFG);
+ int i, psize;
+
+ /* Look for supported direct sizes */
+ for (psize = 0; psize < MMU_PAGE_COUNT; ++psize) {
+ struct mmu_psize_def *def = &mmu_psize_defs[psize];
+
+ if (tlb0ps & (1U << (def->shift - 10)))
+ def->flags |= MMU_PAGE_SIZE_DIRECT;
+ }
+
+ /* Indirect page sizes supported ? */
+ if ((tlb0cfg & TLBnCFG_IND) == 0)
+ goto no_indirect;
+
+ /* Now, we only deal with one IND page size for each
+ * direct size. Hopefully all implementations today are
+ * unambiguous, but we might want to be careful in the
+ * future.
+ */
+ for (i = 0; i < 3; i++) {
+ unsigned int ps, sps;
+
+ sps = eptcfg & 0x1f;
+ eptcfg >>= 5;
+ ps = eptcfg & 0x1f;
+ eptcfg >>= 5;
+ if (!ps || !sps)
+ continue;
+ for (psize = 0; psize < MMU_PAGE_COUNT; psize++) {
+ struct mmu_psize_def *def = &mmu_psize_defs[psize];
+
+ if (ps == (def->shift - 10))
+ def->flags |= MMU_PAGE_SIZE_INDIRECT;
+ if (sps == (def->shift - 10))
+ def->ind = ps + 10;
+ }
+ }
+ no_indirect:
+
+ /* Cleanup array and print summary */
+ pr_info("MMU: Supported page sizes\n");
+ for (psize = 0; psize < MMU_PAGE_COUNT; ++psize) {
+ struct mmu_psize_def *def = &mmu_psize_defs[psize];
+ const char *__page_type_names[] = {
+ "unsupported",
+ "direct",
+ "indirect",
+ "direct & indirect"
+ };
+ if (def->flags == 0) {
+ def->shift = 0;
+ continue;
+ }
+ pr_info(" %8ld KB as %s\n", 1ul << (def->shift - 10),
+ __page_type_names[def->flags & 0x3]);
+ }
+}
+
+static void setup_mmu_htw(void)
{
extern unsigned int interrupt_base_book3e;
extern unsigned int exc_data_tlb_miss_htw_book3e;
extern unsigned int exc_instruction_tlb_miss_htw_book3e;
unsigned int *ibase = &interrupt_base_book3e;
+
+ /* Check if HW tablewalk is present, and if yes, enable it by:
+ *
+ * - patching the TLB miss handlers to branch to the
+ * one dedicates to it
+ *
+ * - setting the global book3e_htw_enabled
+ */
+ unsigned int tlb0cfg = mfspr(SPRN_TLB0CFG);
+
+ if ((tlb0cfg & TLBnCFG_IND) &&
+ (tlb0cfg & TLBnCFG_PT)) {
+ /* Our exceptions vectors start with a NOP and -then- a branch
+ * to deal with single stepping from userspace which stops on
+ * the second instruction. Thus we need to patch the second
+ * instruction of the exception, not the first one
+ */
+ patch_branch(ibase + (0x1c0 / 4) + 1,
+ (unsigned long)&exc_data_tlb_miss_htw_book3e, 0);
+ patch_branch(ibase + (0x1e0 / 4) + 1,
+ (unsigned long)&exc_instruction_tlb_miss_htw_book3e, 0);
+ book3e_htw_enabled = 1;
+ }
+ pr_info("MMU: Book3E Page Tables %s\n",
+ book3e_htw_enabled ? "Enabled" : "Disabled");
+}
+
+/*
+ * Early initialization of the MMU TLB code
+ */
+static void __early_init_mmu(int boot_cpu)
+{
unsigned int mas4;
/* XXX This will have to be decided at runtime, but right
@@ -370,35 +465,17 @@ static void __early_init_mmu(int boot_cpu)
*/
mmu_vmemmap_psize = MMU_PAGE_16M;
- /* Check if HW tablewalk is present, and if yes, enable it by:
- *
- * - patching the TLB miss handlers to branch to the
- * one dedicates to it
- *
- * - setting the global book3e_htw_enabled
- *
- * - Set MAS4:INDD and default page size
- */
-
/* XXX This code only checks for TLB 0 capabilities and doesn't
* check what page size combos are supported by the HW. It
* also doesn't handle the case where a separate array holds
* the IND entries from the array loaded by the PT.
*/
if (boot_cpu) {
- unsigned int tlb0cfg = mfspr(SPRN_TLB0CFG);
+ /* Look for supported page sizes */
+ setup_page_sizes();
- /* Check if HW loader is supported */
- if ((tlb0cfg & TLBnCFG_IND) &&
- (tlb0cfg & TLBnCFG_PT)) {
- patch_branch(ibase + (0x1c0 / 4),
- (unsigned long)&exc_data_tlb_miss_htw_book3e, 0);
- patch_branch(ibase + (0x1e0 / 4),
- (unsigned long)&exc_instruction_tlb_miss_htw_book3e, 0);
- book3e_htw_enabled = 1;
- }
- pr_info("MMU: Book3E Page Tables %s\n",
- book3e_htw_enabled ? "Enabled" : "Disabled");
+ /* Look for HW tablewalk support */
+ setup_mmu_htw();
}
/* Set MAS4 based on page table setting */