One of the aspects of device access is whether CPU writes to a device are posted or non-posted. For non-posted writes, the CPU will wait for the device to acknowledge that the write has been performed. If the device sits on a bus far away, this can take a while and slow things down. The alternative is so-called posted writes. The CPU will "post" the write to the bus without waiting for an acknowledgement. The CPU may receive an asynchronous notification at a later time that the write didn't succeed, or a failing write may be dropped without further notification. On most architectures, whether writes are posted or not is a property of the bus between the CPU and the device. For example, memory-mapped I/O on the PCI bus is always posted and there is nothing the CPU can do about it.
On the ARM architecture, though, we can indicate to the CPU whether writes to a certain address range should be posted or not. This is done by specifying certain memory attributes in the mappings used by the MMU. The OpenBSD kernel always specifies device access as non-posted. On all ARM implementations we have seen so far, this seems to work even for writes to devices connected to a PCIe bus. There might be a penalty though, so I need to investigate this a bit further. However, on Apple's M1 SoC this isn't the case: non-posted writes to a bus that uses posted writes fail, and vice versa. So in order to use the PCIe bus on these SoCs we need to specify the right memory attributes. The diff below implements this by introducing a new BUS_SPACE_MAP_POSTED flag. At this point I don't expect generic drivers to use this flag yet, so there is no need to add it for other architectures. But I don't rule out that we may have to use this flag in sys/dev/fdt sometime in the future. That is why I posted this to a wider audience. ok?
Index: arch/arm64/arm64/locore.S =================================================================== RCS file: /cvs/src/sys/arch/arm64/arm64/locore.S,v retrieving revision 1.32 diff -u -p -r1.32 locore.S --- arch/arm64/arm64/locore.S 19 Oct 2020 17:57:40 -0000 1.32 +++ arch/arm64/arm64/locore.S 14 Feb 2021 21:28:26 -0000 @@ -233,9 +233,10 @@ switch_mmu_kernel: mair: /* Device | Normal (no cache, write-back, write-through) */ .quad MAIR_ATTR(0x00, 0) | \ - MAIR_ATTR(0x44, 1) | \ - MAIR_ATTR(0xff, 2) | \ - MAIR_ATTR(0x88, 3) + MAIR_ATTR(0x04, 1) | \ + MAIR_ATTR(0x44, 2) | \ + MAIR_ATTR(0xff, 3) | \ + MAIR_ATTR(0x88, 4) tcr: .quad (TCR_T1SZ(64 - VIRT_BITS) | TCR_T0SZ(64 - 48) | \ TCR_AS | TCR_TG1_4K | TCR_CACHE_ATTRS | TCR_SMP_ATTRS) Index: arch/arm64/arm64/locore0.S =================================================================== RCS file: /cvs/src/sys/arch/arm64/arm64/locore0.S,v retrieving revision 1.5 diff -u -p -r1.5 locore0.S --- arch/arm64/arm64/locore0.S 28 May 2019 20:32:30 -0000 1.5 +++ arch/arm64/arm64/locore0.S 14 Feb 2021 21:28:26 -0000 @@ -34,8 +34,8 @@ #include <machine/pte.h> #define DEVICE_MEM 0 -#define NORMAL_UNCACHED 1 -#define NORMAL_MEM 2 +#define NORMAL_UNCACHED 2 +#define NORMAL_MEM 3 /* * We assume: Index: arch/arm64/arm64/machdep.c =================================================================== RCS file: /cvs/src/sys/arch/arm64/arm64/machdep.c,v retrieving revision 1.57 diff -u -p -r1.57 machdep.c --- arch/arm64/arm64/machdep.c 11 Feb 2021 23:55:48 -0000 1.57 +++ arch/arm64/arm64/machdep.c 14 Feb 2021 21:28:27 -0000 @@ -1188,7 +1188,7 @@ pmap_bootstrap_bs_map(bus_space_tag_t t, for (pa = startpa; pa < endpa; pa += PAGE_SIZE, va += PAGE_SIZE) pmap_kenter_cache(va, pa, PROT_READ | PROT_WRITE, - PMAP_CACHE_DEV); + PMAP_CACHE_DEV_NGNRNE); virtual_avail = va; Index: arch/arm64/arm64/pmap.c =================================================================== RCS file: /cvs/src/sys/arch/arm64/arm64/pmap.c,v retrieving revision 1.70 diff -u -p 
-r1.70 pmap.c --- arch/arm64/arm64/pmap.c 25 Jan 2021 19:37:17 -0000 1.70 +++ arch/arm64/arm64/pmap.c 14 Feb 2021 21:28:28 -0000 @@ -472,7 +472,7 @@ pmap_enter(pmap_t pm, vaddr_t va, paddr_ if (pa & PMAP_NOCACHE) cache = PMAP_CACHE_CI; if (pa & PMAP_DEVICE) - cache = PMAP_CACHE_DEV; + cache = PMAP_CACHE_DEV_NGNRNE; pg = PHYS_TO_VM_PAGE(pa); pmap_lock(pm); @@ -648,7 +648,7 @@ _pmap_kenter_pa(vaddr_t va, paddr_t pa, pmap_pte_insert(pted); ttlb_flush(pm, va & ~PAGE_MASK); - if (cache == PMAP_CACHE_CI || cache == PMAP_CACHE_DEV) + if (cache == PMAP_CACHE_CI || cache == PMAP_CACHE_DEV_NGNRNE) cpu_idcache_wbinv_range(va & ~PAGE_MASK, PAGE_SIZE); } @@ -735,7 +735,9 @@ pmap_fill_pte(pmap_t pm, vaddr_t va, pad break; case PMAP_CACHE_CI: break; - case PMAP_CACHE_DEV: + case PMAP_CACHE_DEV_NGNRNE: + break; + case PMAP_CACHE_DEV_NGNRE: break; default: panic("pmap_fill_pte:invalid cache mode"); @@ -1637,8 +1639,12 @@ pmap_pte_update(struct pte_desc *pted, u attr |= ATTR_IDX(PTE_ATTR_CI); attr |= ATTR_SH(SH_INNER); break; - case PMAP_CACHE_DEV: - attr |= ATTR_IDX(PTE_ATTR_DEV); + case PMAP_CACHE_DEV_NGNRNE: + attr |= ATTR_IDX(PTE_ATTR_DEV_NGNRNE); + attr |= ATTR_SH(SH_INNER); + break; + case PMAP_CACHE_DEV_NGNRE: + attr |= ATTR_IDX(PTE_ATTR_DEV_NGNRE); attr |= ATTR_SH(SH_INNER); break; default: Index: arch/arm64/dev/arm64_bus_space.c =================================================================== RCS file: /cvs/src/sys/arch/arm64/dev/arm64_bus_space.c,v retrieving revision 1.7 diff -u -p -r1.7 arm64_bus_space.c --- arch/arm64/dev/arm64_bus_space.c 20 Aug 2018 19:38:07 -0000 1.7 +++ arch/arm64/dev/arm64_bus_space.c 14 Feb 2021 21:28:29 -0000 @@ -191,8 +191,12 @@ generic_space_map(bus_space_tag_t t, bus { u_long startpa, endpa, pa; vaddr_t va; - int cache = flags & BUS_SPACE_MAP_CACHEABLE ? 
- PMAP_CACHE_WB : PMAP_CACHE_DEV; + int cache = PMAP_CACHE_DEV_NGNRNE; + + if (flags & BUS_SPACE_MAP_CACHEABLE) + cache = PMAP_CACHE_WB; + if (flags & BUS_SPACE_MAP_POSTED) + cache = PMAP_CACHE_DEV_NGNRE; startpa = trunc_page(offs); endpa = round_page(offs + size); Index: arch/arm64/include/bus.h =================================================================== RCS file: /cvs/src/sys/arch/arm64/include/bus.h,v retrieving revision 1.7 diff -u -p -r1.7 bus.h --- arch/arm64/include/bus.h 13 Apr 2020 21:34:54 -0000 1.7 +++ arch/arm64/include/bus.h 14 Feb 2021 21:28:29 -0000 @@ -130,7 +130,7 @@ struct bus_space { (*(t)->_space_subregion)((t), (h), (o), (s), (p)) #define BUS_SPACE_MAP_CACHEABLE 0x01 -#define BUS_SPACE_MAP_KSEG0 0x02 +#define BUS_SPACE_MAP_POSTED 0x02 #define BUS_SPACE_MAP_LINEAR 0x04 #define BUS_SPACE_MAP_PREFETCHABLE 0x08 Index: arch/arm64/include/pmap.h =================================================================== RCS file: /cvs/src/sys/arch/arm64/include/pmap.h,v retrieving revision 1.14 diff -u -p -r1.14 pmap.h --- arch/arm64/include/pmap.h 21 Oct 2020 21:53:47 -0000 1.14 +++ arch/arm64/include/pmap.h 14 Feb 2021 21:28:29 -0000 @@ -42,7 +42,8 @@ #define PMAP_CACHE_CI (PMAP_MD0) /* cache inhibit */ #define PMAP_CACHE_WT (PMAP_MD1) /* writethru */ #define PMAP_CACHE_WB (PMAP_MD1|PMAP_MD0) /* writeback */ -#define PMAP_CACHE_DEV (PMAP_MD2) /* device mapping */ +#define PMAP_CACHE_DEV_NGNRNE (PMAP_MD2) /* device nGnRnE */ +#define PMAP_CACHE_DEV_NGNRE (PMAP_MD2|PMAP_MD0) /* device nGnRE */ #define PMAP_CACHE_BITS (PMAP_MD0|PMAP_MD1|PMAP_MD2) #define PTED_VA_MANAGED_M (PMAP_MD3) Index: arch/arm64/include/pte.h =================================================================== RCS file: /cvs/src/sys/arch/arm64/include/pte.h,v retrieving revision 1.5 diff -u -p -r1.5 pte.h --- arch/arm64/include/pte.h 13 Apr 2017 23:29:02 -0000 1.5 +++ arch/arm64/include/pte.h 14 Feb 2021 21:28:30 -0000 @@ -53,11 +53,11 @@ #define ATTR_IDX(x) ((x) << 2) #define 
ATTR_IDX_MASK (7 << 2) -#define PTE_ATTR_DEV 0 -#define PTE_ATTR_CI 1 -#define PTE_ATTR_WB 2 -#define PTE_ATTR_WT 3 - +#define PTE_ATTR_DEV_NGNRNE 0 +#define PTE_ATTR_DEV_NGNRE 1 +#define PTE_ATTR_CI 2 +#define PTE_ATTR_WB 3 +#define PTE_ATTR_WT 4 #define SH_INNER 3 #define SH_OUTER 2