On 14/04/2025 4:02 am, Alejandro Jimenez wrote:
> Caution: External email. Do not open attachments or click links, unless this
> email comes from a known sender and you know the content is safe.
>
>
> The current amdvi_page_walk() is designed to be called by the replay()
> method. Rather than drastically altering it, introduce helpers to fetch
> guest PTEs that will be used by a page walker implementation.
>
> Signed-off-by: Alejandro Jimenez <[email protected]>
> ---
> hw/i386/amd_iommu.c | 125 ++++++++++++++++++++++++++++++++++++++++++++
> hw/i386/amd_iommu.h | 42 +++++++++++++++
> 2 files changed, 167 insertions(+)
>
> diff --git a/hw/i386/amd_iommu.c b/hw/i386/amd_iommu.c
> index 0af873b66a31..d089fdc28ef1 100644
> --- a/hw/i386/amd_iommu.c
> +++ b/hw/i386/amd_iommu.c
> @@ -1563,6 +1563,131 @@ static const MemoryRegionOps amdvi_ir_ops = {
> }
> };
>
> +/*
> + * For a PTE encoding a large page, return the page size it encodes as
> described
> + * by the AMD IOMMU Specification Table 14: Example Page Size Encodings.
> + * No need to adjust the value of the PTE to point to the first PTE in the
> large
> + * page since the encoding guarantees all "base" PTEs in the large page are
> the
> + * same.
> + */
> +static uint64_t large_pte_page_size(uint64_t pte)
> +{
> + assert(PTE_NEXT_LEVEL(pte) == 7);
> +
> + /* Determine size of the large/contiguous page encoded in the PTE */
> + return PTE_LARGE_PAGE_SIZE(pte);
> +}
> +
> +/*
> + * Helper function to fetch a PTE using AMD v1 pgtable format.
> + * Returns:
> + * -2: The Page Table Root could not be read from DTE, or IOVA is larger
> than
> + * supported by current page table level encoded in DTE[Mode].
> + * -1: PTE could not be read from guest memory during a page table walk.
> + * This means that the DTE has valid data, and one of the lower level
> + * entries in the Page Table could not be read.
> + * 0: PTE is marked not present, or entry is 0.
> + * >0: Leaf PTE value resolved from walking Guest IO Page Table.
> + */
This seems a bit error-prone: since the return type is uint64_t, a statement
like "if (pte < 0)" is always false, and the comparison may be silently
removed by the compiler during optimization. If you want to reuse such "high"
sentinel values, named defines could help.
Otherwise, pte could be an out parameter and the function could return an
error code.
> +static uint64_t __attribute__((unused))
> +fetch_pte(AMDVIAddressSpace *as, const hwaddr address, uint64_t dte,
> + hwaddr *page_size)
> +{
> + IOMMUAccessFlags perms = amdvi_get_perms(dte);
> +
> + uint8_t level, mode;
> + uint64_t pte = dte, pte_addr;
> +
> + *page_size = 0;
> +
> + if (perms == IOMMU_NONE) {
> + return (uint64_t)-2;
> + }
> +
> + /*
> + * The Linux kernel driver initializes the default mode to 3,
> corresponding
> + * to a 39-bit GPA space, where each entry in the pagetable translates
> to a
> + * 1GB (2^30) page size.
> + */
> + level = mode = get_pte_translation_mode(dte);
> + assert(mode > 0 && mode < 7);
> +
> + /*
> + * If IOVA is larger than the max supported by the current pgtable level,
> + * there is nothing to do. This signals that the pagetable level should
> be
> + * increased, or is an address meant to have special behavior like
> + * invalidating the entire cache.
> + */
> + if (address > PT_LEVEL_MAX_ADDR(mode - 1)) {
> + /* IOVA too large for the current DTE */
> + return (uint64_t)-2;
> + }
> +
> + do {
> + level -= 1;
> +
> + /* Update the page_size */
> + *page_size = PTE_LEVEL_PAGE_SIZE(level);
> +
> + /* Permission bits are ANDed at every level, including the DTE */
> + perms &= amdvi_get_perms(pte);
> + if (perms == IOMMU_NONE) {
> + return pte;
> + }
> +
> + /* Not Present */
> + if (!IOMMU_PTE_PRESENT(pte)) {
> + return 0;
> + }
> +
> + /* Large or Leaf PTE found */
> + if (PTE_NEXT_LEVEL(pte) == 7 || PTE_NEXT_LEVEL(pte) == 0) {
> + /* Leaf PTE found */
> + break;
> + }
> +
> + /*
> + * Index the pgtable using the IOVA bits corresponding to current
> level
> + * and walk down to the lower level.
> + */
> + pte_addr = NEXT_PTE_ADDR(pte, level, address);
> + pte = amdvi_get_pte_entry(as->iommu_state, pte_addr, as->devfn);
> +
> + if (pte == (uint64_t)-1) {
> + /*
> + * A returned PTE of -1 indicates a failure to read the page
> table
> + * entry from guest memory.
> + */
> + if (level == mode - 1) {
> + /* Failure to retrieve the Page Table from Root Pointer */
> + *page_size = 0;
> + return (uint64_t)-2;
> + } else {
> + /* Failure to read PTE. Page walk skips a page_size chunk */
> + return pte;
> + }
> + }
> + } while (level > 0);
> +
> + /*
> + * Page walk ends when Next Level field on PTE shows that either a leaf
> PTE
> + * or a series of large PTEs have been reached. In the latter case,
> return
> + * the pointer to the first PTE of the series.
> + */
> + assert(level == 0 || PTE_NEXT_LEVEL(pte) == 0 || PTE_NEXT_LEVEL(pte) ==
> 7);
> +
> + /*
> + * In case the range starts in the middle of a contiguous page, need to
> + * return the first PTE
> + */
> + if (PTE_NEXT_LEVEL(pte) == 7) {
> + /* Update page_size with the large PTE page size */
> + *page_size = large_pte_page_size(pte);
> + }
> +
> + return pte;
> +}
> +
> /*
> * Toggle between address translation and passthrough modes by enabling the
> * corresponding memory regions.
> diff --git a/hw/i386/amd_iommu.h b/hw/i386/amd_iommu.h
> index c89e7dc9947d..fc4d2f7a4575 100644
> --- a/hw/i386/amd_iommu.h
> +++ b/hw/i386/amd_iommu.h
> @@ -25,6 +25,8 @@
> #include "hw/i386/x86-iommu.h"
> #include "qom/object.h"
>
> +#define GENMASK64(h, l) (((~0ULL) >> (63 - (h) + (l))) << (l))
> +
> /* Capability registers */
> #define AMDVI_CAPAB_BAR_LOW 0x04
> #define AMDVI_CAPAB_BAR_HIGH 0x08
> @@ -174,6 +176,46 @@
> #define AMDVI_GATS_MODE (2ULL << 12)
> #define AMDVI_HATS_MODE (2ULL << 10)
>
> +/* Page Table format */
> +
> +#define AMDVI_PTE_PR (1ULL << 0)
> +#define AMDVI_PTE_NEXT_LEVEL_MASK GENMASK64(11, 9)
> +
> +#define IOMMU_PTE_PRESENT(pte) ((pte) & AMDVI_PTE_PR)
> +
> +/* Using level=0 for leaf PTE at 4K page size */
> +#define PT_LEVEL_SHIFT(level) (12 + ((level) * 9))
> +
> +/* Return IOVA bit group used to index the Page Table at specific level */
> +#define PT_LEVEL_INDEX(level, iova) (((iova) >> PT_LEVEL_SHIFT(level)) &
> \
> + GENMASK64(8, 0))
> +
> +/* Return the max address for a specified level i.e. max_oaddr */
> +#define PT_LEVEL_MAX_ADDR(x) (((x) < 5) ? \
> + ((1ULL << PT_LEVEL_SHIFT((x + 1))) - 1) : \
> + (~(0ULL)))
> +
> +/* Extract the NextLevel field from PTE/PDE */
> +#define PTE_NEXT_LEVEL(pte) (((pte) & AMDVI_PTE_NEXT_LEVEL_MASK) >> 9)
> +
> +/* Take page table level and return default pagetable size for level */
> +#define PTE_LEVEL_PAGE_SIZE(level) (1ULL << (PT_LEVEL_SHIFT(level)))
> +
> +/*
> + * Return address of lower level page table encoded in PTE and specified by
> + * current level and corresponding IOVA bit group at such level.
> + */
> +#define NEXT_PTE_ADDR(pte, level, iova) (((pte) & AMDVI_DEV_PT_ROOT_MASK) + \
> + (PT_LEVEL_INDEX(level, iova) * 8))
> +
> +/*
> + * Take a PTE value with mode=0x07 and return the page size it encodes.
> + */
> +#define PTE_LARGE_PAGE_SIZE(pte) (1ULL << (1 + cto64(((pte) | 0xfffULL))))
> +
> +/* Return number of PTEs to use for a given page size (expected power of 2)
> */
> +#define PAGE_SIZE_PTE_COUNT(pgsz) (1ULL << ((ctz64(pgsz) - 12) % 9))
> +
> /* IOTLB */
> #define AMDVI_IOTLB_MAX_SIZE 1024
> #define AMDVI_DEVID_SHIFT 36
> --
> 2.43.5
>
>