[RFC PATCH v2 14/32] x86: mm: Provide support to use memblock when splitting large pages

Brijesh Singh brijesh.singh at amd.com
Thu Apr 6 14:05:03 UTC 2017


Hi Boris,

On 03/17/2017 05:17 AM, Borislav Petkov wrote:
> On Thu, Mar 16, 2017 at 11:25:36PM +0100, Paolo Bonzini wrote:
>> The kvmclock memory is initially zero so there is no need for the
>> hypervisor to allocate anything; the point of these patches is just to
>> access the data in a natural way from Linux source code.
>
> I realize that.
>
>> I also don't really like the patch as is (plus it fails modpost), but
>> IMO reusing __change_page_attr and __split_large_page is the right thing
>> to do.
>
> Right, so teaching pageattr.c about memblock could theoretically come
> around and bite us later when a page allocated with memblock gets freed
> with free_page().
>
> And looking at this more, we have all this kernel pagetable preparation
> code down the init_mem_mapping() call and the pagetable setup in
> arch/x86/mm/init_{32,64}.c
>
> And that code even does some basic page splitting. Oh and it uses
> alloc_low_pages() which knows whether to do memblock reservation or the
> common __get_free_pages() when slabs are up.
>

I looked into arch/x86/mm/init_{32,64}.c and, as you pointed out, these files
contain routines to do basic page splitting. I think that is sufficient for
our usage.

I should be able to drop the memblock patch from the series and update
Patch 15 [1] to use kernel_physical_mapping_init().

kernel_physical_mapping_init() creates the page table mappings using the
default kernel page attributes. I tried to extend the function by passing a
'bool enc' flag to hint whether to clear or set _PAGE_ENC when splitting the
pages, but the code did not look clean, hence I dropped that idea. Instead,
I took the approach below. I did some runtime testing and it seems to be
working okay.

[1] http://marc.info/?l=linux-mm&m=148846773731212&w=2

diff --git a/arch/x86/mm/mem_encrypt.c b/arch/x86/mm/mem_encrypt.c
index 7df5f4c..de16ef4 100644
--- a/arch/x86/mm/mem_encrypt.c
+++ b/arch/x86/mm/mem_encrypt.c
@@ -15,6 +15,7 @@
  #include <linux/mm.h>
  #include <linux/dma-mapping.h>
  #include <linux/swiotlb.h>
+#include <linux/mem_encrypt.h>
  
  #include <asm/tlbflush.h>
  #include <asm/fixmap.h>
@@ -22,6 +23,8 @@
  #include <asm/bootparam.h>
  #include <asm/cacheflush.h>
  
+#include "mm_internal.h"
+
  extern pmdval_t early_pmd_flags;
  int __init __early_make_pgtable(unsigned long, pmdval_t);
  void __init __early_pgtable_flush(void);
@@ -258,6 +261,72 @@ static void sme_free(struct device *dev, size_t size, void *vaddr,
         swiotlb_free_coherent(dev, size, vaddr, dma_handle);
  }
  
+static int __init early_set_memory_enc_dec(resource_size_t paddr,
+                                          unsigned long size, bool enc)
+{
+       pte_t *kpte;
+       int level;
+       unsigned long vaddr, vaddr_end, vaddr_next;
+
+       vaddr = (unsigned long)__va(paddr);
+       vaddr_next = vaddr;
+       vaddr_end = vaddr + size;
+
+       /*
+        * We are about to change the C-bit of this range. Flush the caches
+        * to ensure that all data cached with the current C-bit value is
+        * written back to memory. Any caching of the vaddr after this
+        * function returns will use the new C-bit value.
+        */
+       clflush_cache_range(__va(paddr), size);
+
+       for (; vaddr < vaddr_end; vaddr = vaddr_next) {
+               kpte = lookup_address(vaddr, &level);
+               if (!kpte || pte_none(*kpte))
+                       return 1;
+
+               if (level == PG_LEVEL_4K) {
+                       pte_t new_pte;
+                       unsigned long pfn = pte_pfn(*kpte);
+                       pgprot_t new_prot = pte_pgprot(*kpte);
+
+                       if (enc)
+                               pgprot_val(new_prot) |= _PAGE_ENC;
+                       else
+                               pgprot_val(new_prot) &= ~_PAGE_ENC;
+
+                       new_pte = pfn_pte(pfn, canon_pgprot(new_prot));
+                       pr_info("  pte %016lx -> 0x%016lx\n", pte_val(*kpte),
+                               pte_val(new_pte));
+                       set_pte_atomic(kpte, new_pte);
+                       vaddr_next = (vaddr & PAGE_MASK) + PAGE_SIZE;
+                       continue;
+               }
+
+               /*
+                * The virtual address is part of a large page; create a
+                * page table mapping that uses smaller (4K) pages. The
+                * virtual and physical addresses must be PMD aligned.
+                */
+               kernel_physical_mapping_init(__pa(vaddr & PMD_MASK),
+                                            __pa((vaddr_end & PMD_MASK) + PMD_SIZE),
+                                            0);
+       }
+
+       __flush_tlb_all();
+       return 0;
+}
+
+int __init early_set_memory_decrypted(resource_size_t paddr, unsigned long size)
+{
+       return early_set_memory_enc_dec(paddr, size, false);
+}
+
+int __init early_set_memory_encrypted(resource_size_t paddr, unsigned long size)
+{
+       return early_set_memory_enc_dec(paddr, size, true);
+}
+

> So what would be much cleaner, IMHO, is if one would reuse that code to
> change init_mm.pgd mappings early without copying pageattr.c.
>
> init_mem_mapping() gets called before kvm_guest_init() in setup_arch()
> so the guest would simply fixup its pagetable right there.
>

