From fa54b39d18954d0110440e51026f406a87419987 Mon Sep 17 00:00:00 2001 From: Kees Cook Date: Mon, 4 Aug 2025 08:40:27 -0700 Subject: [PATCH 1/9] iommu/amd: Avoid stack buffer overflow from kernel cmdline ANBZ: #32239 commit 8503d0fcb1086a7cfe26df67ca4bd9bd9e99bdec upstream. While the kernel command line is considered trusted in most environments, avoid writing 1 byte past the end of "acpiid" if the "str" argument is maximum length. Reported-by: Simcha Kosman Closes: https://lore.kernel.org/all/AS8P193MB2271C4B24BCEDA31830F37AE84A52@AS8P193MB2271.EURP193.PROD.OUTLOOK.COM Fixes: b6b26d86c61c ("iommu/amd: Add a length limitation for the ivrs_acpihid command-line parameter") Signed-off-by: Kees Cook Reviewed-by: Ankit Soni Link: https://lore.kernel.org/r/20250804154023.work.970-kees@kernel.org Signed-off-by: Joerg Roedel Signed-off-by: hr567 --- drivers/iommu/amd/init.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/drivers/iommu/amd/init.c b/drivers/iommu/amd/init.c index 36e1d33752a8..6d273b7f1c52 100644 --- a/drivers/iommu/amd/init.c +++ b/drivers/iommu/amd/init.c @@ -3616,7 +3616,7 @@ static int __init parse_ivrs_acpihid(char *str) { u32 seg = 0, bus, dev, fn; char *hid, *uid, *p, *addr; - char acpiid[ACPIID_LEN] = {0}; + char acpiid[ACPIID_LEN + 1] = { }; /* size with NULL terminator */ int i; addr = strchr(str, '@'); @@ -3642,7 +3642,7 @@ static int __init parse_ivrs_acpihid(char *str) /* We have the '@', make it the terminator to get just the acpiid */ *addr++ = 0; - if (strlen(str) > ACPIID_LEN + 1) + if (strlen(str) > ACPIID_LEN) goto not_found; if (sscanf(str, "=%s", acpiid) != 1) -- Gitee From 25a217e7260103ad973dc205594e5e47027252ae Mon Sep 17 00:00:00 2001 From: Ashish Kalra Date: Mon, 25 Aug 2025 21:46:01 +0000 Subject: [PATCH 2/9] iommu/amd: Add support to remap/unmap IOMMU buffers for kdump MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit ANBZ: #32239 commit f32fe7cb01986 upstream. 
After a panic if SNP is enabled in the previous kernel then the kdump kernel boots with IOMMU SNP enforcement still enabled. IOMMU completion wait buffers (CWBs), command buffers and event buffer registers remain locked and exclusive to the previous kernel. Attempts to allocate and use new buffers in the kdump kernel fail, as hardware ignores writes to the locked MMIO registers as per AMD IOMMU spec Section 2.12.2.1. This results in repeated "Completion-Wait loop timed out" errors and a second kernel panic: "Kernel panic - not syncing: timer doesn't work through Interrupt-remapped IO-APIC" The list of MMIO registers that are locked and ignore writes after a failed SNP shutdown is given in the AMD IOMMU specification below: Section 2.12.2.1. https://docs.amd.com/v/u/en-US/48882_3.10_PUB Reuse the pages of the previous kernel for completion wait buffers, command buffers and event buffers, memremap them during kdump boot, and essentially work with an already enabled IOMMU configuration, re-using the previous kernel’s data structures. Reusing of command buffers and event buffers is now done for kdump boot irrespective of SNP being enabled during kdump. Re-use of completion wait buffers is only done when SNP is enabled as the exclusion base register is used for the completion wait buffer (CWB) address only when SNP is enabled. 
Reviewed-by: Vasant Hegde Tested-by: Sairaj Kodilkar Signed-off-by: Ashish Kalra Link: https://lore.kernel.org/r/ff04b381a8fe774b175c23c1a336b28bc1396511.1756157913.git.ashish.kalra@amd.com Signed-off-by: Joerg Roedel Signed-off-by: hr567 --- drivers/iommu/amd/amd_iommu_types.h | 5 + drivers/iommu/amd/init.c | 152 +++++++++++++++++++++++++--- drivers/iommu/amd/iommu.c | 2 +- 3 files changed, 146 insertions(+), 13 deletions(-) diff --git a/drivers/iommu/amd/amd_iommu_types.h b/drivers/iommu/amd/amd_iommu_types.h index 687542608272..77e7049f8e15 100644 --- a/drivers/iommu/amd/amd_iommu_types.h +++ b/drivers/iommu/amd/amd_iommu_types.h @@ -791,6 +791,11 @@ struct amd_iommu { u32 flags; volatile u64 *cmd_sem; atomic64_t cmd_sem_val; + /* + * Track physical address to directly use it in build_completion_wait() + * and avoid adding any special checks and handling for kdump. + */ + u64 cmd_sem_paddr; #ifdef CONFIG_AMD_IOMMU_DEBUGFS /* DebugFS Info */ diff --git a/drivers/iommu/amd/init.c b/drivers/iommu/amd/init.c index 6d273b7f1c52..fd2f36e5937e 100644 --- a/drivers/iommu/amd/init.c +++ b/drivers/iommu/amd/init.c @@ -709,6 +709,26 @@ static void __init free_alias_table(struct amd_iommu_pci_seg *pci_seg) pci_seg->alias_table = NULL; } +static inline void *iommu_memremap(unsigned long paddr, size_t size) +{ + phys_addr_t phys; + + if (!paddr) + return NULL; + + /* + * Obtain true physical address in kdump kernel when SME is enabled. + * Currently, previous kernel with SME enabled and kdump kernel + * with SME support disabled is not supported. + */ + phys = __sme_clr(paddr); + + if (cc_platform_has(CC_ATTR_HOST_MEM_ENCRYPT)) + return (__force void *)ioremap_encrypted(phys, size); + else + return memremap(phys, size, MEMREMAP_WB); +} + /* * Allocates the command buffer. This buffer is per AMD IOMMU. 
We can * write commands to that buffer later and the IOMMU will execute them @@ -941,8 +961,91 @@ static int iommu_init_ga_log(struct amd_iommu *iommu) static int __init alloc_cwwb_sem(struct amd_iommu *iommu) { iommu->cmd_sem = iommu_alloc_4k_pages(iommu, GFP_KERNEL, 1); + if (!iommu->cmd_sem) + return -ENOMEM; + iommu->cmd_sem_paddr = iommu_virt_to_phys((void *)iommu->cmd_sem); + return 0; +} + +static int __init remap_event_buffer(struct amd_iommu *iommu) +{ + u64 paddr; + + pr_info_once("Re-using event buffer from the previous kernel\n"); + paddr = readq(iommu->mmio_base + MMIO_EVT_BUF_OFFSET) & PM_ADDR_MASK; + iommu->evt_buf = iommu_memremap(paddr, EVT_BUFFER_SIZE); + + return iommu->evt_buf ? 0 : -ENOMEM; +} + +static int __init remap_command_buffer(struct amd_iommu *iommu) +{ + u64 paddr; - return iommu->cmd_sem ? 0 : -ENOMEM; + pr_info_once("Re-using command buffer from the previous kernel\n"); + paddr = readq(iommu->mmio_base + MMIO_CMD_BUF_OFFSET) & PM_ADDR_MASK; + iommu->cmd_buf = iommu_memremap(paddr, CMD_BUFFER_SIZE); + + return iommu->cmd_buf ? 0 : -ENOMEM; +} + +static int __init remap_or_alloc_cwwb_sem(struct amd_iommu *iommu) +{ + u64 paddr; + + if (check_feature(FEATURE_SNP)) { + /* + * When SNP is enabled, the exclusion base register is used for the + * completion wait buffer (CWB) address. Read and re-use it. + */ + pr_info_once("Re-using CWB buffers from the previous kernel\n"); + paddr = readq(iommu->mmio_base + MMIO_EXCL_BASE_OFFSET) & PM_ADDR_MASK; + iommu->cmd_sem = iommu_memremap(paddr, PAGE_SIZE); + if (!iommu->cmd_sem) + return -ENOMEM; + iommu->cmd_sem_paddr = paddr; + } else { + return alloc_cwwb_sem(iommu); + } + + return 0; +} + +static int __init alloc_iommu_buffers(struct amd_iommu *iommu) +{ + int ret; + + /* + * Reuse/Remap the previous kernel's allocated completion wait + * command and event buffers for kdump boot. 
+ */ + if (is_kdump_kernel()) { + ret = remap_or_alloc_cwwb_sem(iommu); + if (ret) + return ret; + + ret = remap_command_buffer(iommu); + if (ret) + return ret; + + ret = remap_event_buffer(iommu); + if (ret) + return ret; + } else { + ret = alloc_cwwb_sem(iommu); + if (ret) + return ret; + + ret = alloc_command_buffer(iommu); + if (ret) + return ret; + + ret = alloc_event_buffer(iommu); + if (ret) + return ret; + } + + return 0; } static void __init free_cwwb_sem(struct amd_iommu *iommu) @@ -950,6 +1053,38 @@ static void __init free_cwwb_sem(struct amd_iommu *iommu) if (iommu->cmd_sem) iommu_free_pages((void *)iommu->cmd_sem); } +static void __init unmap_cwwb_sem(struct amd_iommu *iommu) +{ + if (iommu->cmd_sem) { + if (check_feature(FEATURE_SNP)) + memunmap((void *)iommu->cmd_sem); + else + iommu_free_pages((void *)iommu->cmd_sem); + } +} + +static void __init unmap_command_buffer(struct amd_iommu *iommu) +{ + memunmap((void *)iommu->cmd_buf); +} + +static void __init unmap_event_buffer(struct amd_iommu *iommu) +{ + memunmap(iommu->evt_buf); +} + +static void __init free_iommu_buffers(struct amd_iommu *iommu) +{ + if (is_kdump_kernel()) { + unmap_cwwb_sem(iommu); + unmap_command_buffer(iommu); + unmap_event_buffer(iommu); + } else { + free_cwwb_sem(iommu); + free_command_buffer(iommu); + free_event_buffer(iommu); + } +} static void iommu_enable_xt(struct amd_iommu *iommu) { @@ -1654,9 +1789,7 @@ static void __init free_sysfs(struct amd_iommu *iommu) static void __init free_iommu_one(struct amd_iommu *iommu) { free_sysfs(iommu); - free_cwwb_sem(iommu); - free_command_buffer(iommu); - free_event_buffer(iommu); + free_iommu_buffers(iommu); amd_iommu_free_ppr_log(iommu); free_ga_log(iommu); iommu_unmap_mmio_space(iommu); @@ -1820,14 +1953,9 @@ static int __init init_iommu_one_late(struct amd_iommu *iommu) { int ret; - if (alloc_cwwb_sem(iommu)) - return -ENOMEM; - - if (alloc_command_buffer(iommu)) - return -ENOMEM; - - if (alloc_event_buffer(iommu)) - return 
-ENOMEM; + ret = alloc_iommu_buffers(iommu); + if (ret) + return ret; iommu->int_enabled = false; diff --git a/drivers/iommu/amd/iommu.c b/drivers/iommu/amd/iommu.c index e539b128a90e..cbb6cf6b8027 100644 --- a/drivers/iommu/amd/iommu.c +++ b/drivers/iommu/amd/iommu.c @@ -1194,7 +1194,7 @@ static void build_completion_wait(struct iommu_cmd *cmd, struct amd_iommu *iommu, u64 data) { - u64 paddr = iommu_virt_to_phys((void *)iommu->cmd_sem); + u64 paddr = iommu->cmd_sem_paddr; memset(cmd, 0, sizeof(*cmd)); cmd->data[0] = lower_32_bits(paddr) | CMD_COMPL_WAIT_STORE_MASK; -- Gitee From cbc63dfa6c5980ae2e7f8990e407a71d89f8d92a Mon Sep 17 00:00:00 2001 From: Ashish Kalra Date: Mon, 25 Aug 2025 21:46:15 +0000 Subject: [PATCH 3/9] iommu/amd: Reuse device table for kdump ANBZ: #32239 commit 38e5f33ee3596f37ee8d1e694073a17590904004 upstream. After a panic if SNP is enabled in the previous kernel then the kdump kernel boots with IOMMU SNP enforcement still enabled. IOMMU device table register is locked and exclusive to the previous kernel. Attempts to copy old device table from the previous kernel fails in kdump kernel as hardware ignores writes to the locked device table base address register as per AMD IOMMU spec Section 2.12.2.1. This causes the IOMMU driver (OS) and the hardware to reference different memory locations. As a result, the IOMMU hardware cannot process the command which results in repeated "Completion-Wait loop timed out" errors and a second kernel panic: "Kernel panic - not syncing: timer doesn't work through Interrupt-remapped IO-APIC". Reuse device table instead of copying device table in case of kdump boot and remove all copying device table code. 
Reviewed-by: Vasant Hegde Tested-by: Sairaj Kodilkar Signed-off-by: Ashish Kalra Link: https://lore.kernel.org/r/3a31036fb2f7323e6b1a1a1921ac777e9f7bdddc.1756157913.git.ashish.kalra@amd.com Signed-off-by: Joerg Roedel Signed-off-by: hr567 --- drivers/iommu/amd/init.c | 104 +++++++++++++-------------------------- 1 file changed, 34 insertions(+), 70 deletions(-) diff --git a/drivers/iommu/amd/init.c b/drivers/iommu/amd/init.c index fd2f36e5937e..66b8bc455491 100644 --- a/drivers/iommu/amd/init.c +++ b/drivers/iommu/amd/init.c @@ -405,6 +405,9 @@ static void iommu_set_device_table(struct amd_iommu *iommu) BUG_ON(iommu->mmio_base == NULL); + if (is_kdump_kernel()) + return; + entry = iommu_virt_to_phys(dev_table); entry |= (dev_table_size >> 12) - 1; memcpy_toio(iommu->mmio_base + MMIO_DEV_TABLE_OFFSET, @@ -645,7 +648,10 @@ static inline int __init alloc_dev_table(struct amd_iommu_pci_seg *pci_seg) static inline void free_dev_table(struct amd_iommu_pci_seg *pci_seg) { - iommu_free_pages(pci_seg->dev_table); + if (is_kdump_kernel()) + memunmap((void *)pci_seg->dev_table); + else + iommu_free_pages(pci_seg->dev_table); pci_seg->dev_table = NULL; } @@ -1116,15 +1122,12 @@ static void set_dte_bit(struct dev_table_entry *dte, u8 bit) dte->data[i] |= (1UL << _bit); } -static bool __copy_device_table(struct amd_iommu *iommu) +static bool __reuse_device_table(struct amd_iommu *iommu) { - u64 int_ctl, int_tab_len, entry = 0; struct amd_iommu_pci_seg *pci_seg = iommu->pci_seg; - struct dev_table_entry *old_devtb = NULL; - u32 lo, hi, devid, old_devtb_size; + u32 lo, hi, old_devtb_size; phys_addr_t old_devtb_phys; - u16 dom_id, dte_v, irq_v; - u64 tmp; + u64 entry; /* Each IOMMU use separate device table with the same size */ lo = readl(iommu->mmio_base + MMIO_DEV_TABLE_OFFSET); @@ -1149,66 +1152,20 @@ static bool __copy_device_table(struct amd_iommu *iommu) pr_err("The address of old device table is above 4G, not trustworthy!\n"); return false; } - old_devtb = 
(cc_platform_has(CC_ATTR_HOST_MEM_ENCRYPT) && is_kdump_kernel()) - ? (__force void *)ioremap_encrypted(old_devtb_phys, - pci_seg->dev_table_size) - : memremap(old_devtb_phys, pci_seg->dev_table_size, MEMREMAP_WB); - - if (!old_devtb) - return false; - pci_seg->old_dev_tbl_cpy = iommu_alloc_pages_sz( - GFP_KERNEL | GFP_DMA32, pci_seg->dev_table_size); + /* + * Re-use the previous kernel's device table for kdump. + */ + pci_seg->old_dev_tbl_cpy = iommu_memremap(old_devtb_phys, pci_seg->dev_table_size); if (pci_seg->old_dev_tbl_cpy == NULL) { - pr_err("Failed to allocate memory for copying old device table!\n"); - memunmap(old_devtb); + pr_err("Failed to remap memory for reusing old device table!\n"); return false; } - for (devid = 0; devid <= pci_seg->last_bdf; ++devid) { - pci_seg->old_dev_tbl_cpy[devid] = old_devtb[devid]; - dom_id = old_devtb[devid].data[1] & DEV_DOMID_MASK; - dte_v = old_devtb[devid].data[0] & DTE_FLAG_V; - - if (dte_v && dom_id) { - pci_seg->old_dev_tbl_cpy[devid].data[0] = old_devtb[devid].data[0]; - pci_seg->old_dev_tbl_cpy[devid].data[1] = old_devtb[devid].data[1]; - /* Reserve the Domain IDs used by previous kernel */ - if (ida_alloc_range(&pdom_ids, dom_id, dom_id, GFP_ATOMIC) != dom_id) { - pr_err("Failed to reserve domain ID 0x%x\n", dom_id); - memunmap(old_devtb); - return false; - } - /* If gcr3 table existed, mask it out */ - if (old_devtb[devid].data[0] & DTE_FLAG_GV) { - tmp = (DTE_GCR3_30_15 | DTE_GCR3_51_31); - pci_seg->old_dev_tbl_cpy[devid].data[1] &= ~tmp; - tmp = (DTE_GCR3_14_12 | DTE_FLAG_GV); - pci_seg->old_dev_tbl_cpy[devid].data[0] &= ~tmp; - } - } - - irq_v = old_devtb[devid].data[2] & DTE_IRQ_REMAP_ENABLE; - int_ctl = old_devtb[devid].data[2] & DTE_IRQ_REMAP_INTCTL_MASK; - int_tab_len = old_devtb[devid].data[2] & DTE_INTTABLEN_MASK; - if (irq_v && (int_ctl || int_tab_len)) { - if ((int_ctl != DTE_IRQ_REMAP_INTCTL) || - (int_tab_len != DTE_INTTABLEN_512 && - int_tab_len != DTE_INTTABLEN_2K)) { - pr_err("Wrong old irq 
remapping flag: %#x\n", devid); - memunmap(old_devtb); - return false; - } - - pci_seg->old_dev_tbl_cpy[devid].data[2] = old_devtb[devid].data[2]; - } - } - memunmap(old_devtb); - return true; } -static bool copy_device_table(void) +static bool reuse_device_table(void) { struct amd_iommu *iommu; struct amd_iommu_pci_seg *pci_seg; @@ -1216,17 +1173,17 @@ static bool copy_device_table(void) if (!amd_iommu_pre_enabled) return false; - pr_warn("Translation is already enabled - trying to copy translation structures\n"); + pr_warn("Translation is already enabled - trying to reuse translation structures\n"); /* * All IOMMUs within PCI segment shares common device table. - * Hence copy device table only once per PCI segment. + * Hence reuse device table only once per PCI segment. */ for_each_pci_segment(pci_seg) { for_each_iommu(iommu) { if (pci_seg->id != iommu->pci_seg->id) continue; - if (!__copy_device_table(iommu)) + if (!__reuse_device_table(iommu)) return false; break; } @@ -2905,8 +2862,8 @@ static void early_enable_iommu(struct amd_iommu *iommu) * This function finally enables all IOMMUs found in the system after * they have been initialized. * - * Or if in kdump kernel and IOMMUs are all pre-enabled, try to copy - * the old content of device table entries. Not this case or copy failed, + * Or if in kdump kernel and IOMMUs are all pre-enabled, try to reuse + * the old content of device table entries. Not this case or reuse failed, * just continue as normal kernel does. */ static void early_enable_iommus(void) @@ -2914,18 +2871,25 @@ static void early_enable_iommus(void) struct amd_iommu *iommu; struct amd_iommu_pci_seg *pci_seg; - if (!copy_device_table()) { + if (!reuse_device_table()) { /* - * If come here because of failure in copying device table from old + * If come here because of failure in reusing device table from old * kernel with all IOMMUs enabled, print error message and try to * free allocated old_dev_tbl_cpy. 
*/ - if (amd_iommu_pre_enabled) - pr_err("Failed to copy DEV table from previous kernel.\n"); + if (amd_iommu_pre_enabled) { + pr_err("Failed to reuse DEV table from previous kernel.\n"); + /* + * Bail out early if unable to remap/reuse DEV table from + * previous kernel if SNP enabled as IOMMU commands will + * time out without DEV table and cause kdump boot panic. + */ + BUG_ON(check_feature(FEATURE_SNP)); + } for_each_pci_segment(pci_seg) { if (pci_seg->old_dev_tbl_cpy != NULL) { - iommu_free_pages(pci_seg->old_dev_tbl_cpy); + memunmap((void *)pci_seg->old_dev_tbl_cpy); pci_seg->old_dev_tbl_cpy = NULL; } } @@ -2935,7 +2899,7 @@ static void early_enable_iommus(void) early_enable_iommu(iommu); } } else { - pr_info("Copied DEV table from previous kernel.\n"); + pr_info("Reused DEV table from previous kernel.\n"); for_each_pci_segment(pci_seg) { iommu_free_pages(pci_seg->dev_table); -- Gitee From c4c9ac039274d79da4731a63a3b88ff95c080791 Mon Sep 17 00:00:00 2001 From: Vasant Hegde Date: Sat, 13 Sep 2025 06:26:57 +0000 Subject: [PATCH 4/9] iommu/amd/pgtbl: Fix possible race while increase page table level ANBZ: #32239 commit 1e56310b40fd2e7e0b9493da9ff488af145bdd0c upstream. The AMD IOMMU host page table implementation supports dynamic page table levels (up to 6 levels), starting with a 3-level configuration that expands based on IOVA address. The kernel maintains a root pointer and current page table level to enable proper page table walks in alloc_pte()/fetch_pte() operations. The IOMMU IOVA allocator initially starts with 32-bit address and once it is exhausted it switches to 64-bit address (max address is determined based on IOMMU and device DMA capability). To support larger IOVA, AMD IOMMU driver increases page table level. But in unmap path (iommu_v1_unmap_pages()), fetch_pte() reads pgtable->[root/mode] without lock. 
So it is possible that in an extreme corner case, when increase_address_space() is updating pgtable->[root/mode], fetch_pte() reads wrong page table level (pgtable->mode). It does compare the value with level encoded in page table and returns NULL. This will result in iommu_unmap ops failing and the upper layer may retry/log WARN_ON. CPU 0 CPU 1 ------ ------ map pages unmap pages alloc_pte() -> increase_address_space() iommu_v1_unmap_pages() -> fetch_pte() pgtable->root = pte (new root value) READ pgtable->[mode/root] Reads new root, old mode Updates mode (pgtable->mode += 1) Since Page table level updates are infrequent and already synchronized with a spinlock, implement seqcount to enable lock-free read operations on the read path. Fixes: 754265bcab7 ("iommu/amd: Fix race in increase_address_space()") Reported-by: Alejandro Jimenez Cc: stable@vger.kernel.org Cc: Joao Martins Cc: Suravee Suthikulpanit Signed-off-by: Vasant Hegde Signed-off-by: Joerg Roedel Signed-off-by: hr567 --- drivers/iommu/amd/amd_iommu_types.h | 1 + drivers/iommu/amd/io_pgtable.c | 25 +++++++++++++++++++++---- 2 files changed, 22 insertions(+), 4 deletions(-) diff --git a/drivers/iommu/amd/amd_iommu_types.h b/drivers/iommu/amd/amd_iommu_types.h index 77e7049f8e15..de03f7077613 100644 --- a/drivers/iommu/amd/amd_iommu_types.h +++ b/drivers/iommu/amd/amd_iommu_types.h @@ -555,6 +555,7 @@ struct gcr3_tbl_info { }; struct amd_io_pgtable { + seqcount_t seqcount; /* Protects root/mode update */ struct io_pgtable pgtbl; int mode; u64 *root; diff --git a/drivers/iommu/amd/io_pgtable.c b/drivers/iommu/amd/io_pgtable.c index a91e71f981ef..70c2f5b1631b 100644 --- a/drivers/iommu/amd/io_pgtable.c +++ b/drivers/iommu/amd/io_pgtable.c @@ -17,6 +17,7 @@ #include #include #include +#include #include @@ -130,8 +131,11 @@ static bool increase_address_space(struct amd_io_pgtable *pgtable, *pte = PM_LEVEL_PDE(pgtable->mode, iommu_virt_to_phys(pgtable->root)); + write_seqcount_begin(&pgtable->seqcount); pgtable->root = 
pte; pgtable->mode += 1; + write_seqcount_end(&pgtable->seqcount); + amd_iommu_update_and_flush_device_table(domain); pte = NULL; @@ -153,6 +157,7 @@ static u64 *alloc_pte(struct amd_io_pgtable *pgtable, { unsigned long last_addr = address + (page_size - 1); struct io_pgtable_cfg *cfg = &pgtable->pgtbl.cfg; + unsigned int seqcount; int level, end_lvl; u64 *pte, *page; @@ -170,8 +175,14 @@ static u64 *alloc_pte(struct amd_io_pgtable *pgtable, } - level = pgtable->mode - 1; - pte = &pgtable->root[PM_LEVEL_INDEX(level, address)]; + do { + seqcount = read_seqcount_begin(&pgtable->seqcount); + + level = pgtable->mode - 1; + pte = &pgtable->root[PM_LEVEL_INDEX(level, address)]; + } while (read_seqcount_retry(&pgtable->seqcount, seqcount)); + + address = PAGE_SIZE_ALIGN(address, page_size); end_lvl = PAGE_SIZE_LEVEL(page_size); @@ -249,6 +260,7 @@ static u64 *fetch_pte(struct amd_io_pgtable *pgtable, unsigned long *page_size) { int level; + unsigned int seqcount; u64 *pte; *page_size = 0; @@ -256,8 +268,12 @@ static u64 *fetch_pte(struct amd_io_pgtable *pgtable, if (address > PM_LEVEL_SIZE(pgtable->mode)) return NULL; - level = pgtable->mode - 1; - pte = &pgtable->root[PM_LEVEL_INDEX(level, address)]; + do { + seqcount = read_seqcount_begin(&pgtable->seqcount); + level = pgtable->mode - 1; + pte = &pgtable->root[PM_LEVEL_INDEX(level, address)]; + } while (read_seqcount_retry(&pgtable->seqcount, seqcount)); + *page_size = PTE_LEVEL_PAGE_SIZE(level); while (level > 0) { @@ -541,6 +557,7 @@ static struct io_pgtable *v1_alloc_pgtable(struct io_pgtable_cfg *cfg, void *coo if (!pgtable->root) return NULL; pgtable->mode = PAGE_MODE_3_LEVEL; + seqcount_init(&pgtable->seqcount); cfg->pgsize_bitmap = amd_iommu_pgsize_bitmap; cfg->ias = IOMMU_IN_ADDR_BIT_SIZE; -- Gitee From 34e2739334f7c932e89ee0f4a81e938b399bf3e3 Mon Sep 17 00:00:00 2001 From: Dheeraj Kumar Srivastava Date: Thu, 13 Nov 2025 02:43:50 +0530 Subject: [PATCH 5/9] iommu/amd: Enhance "Completion-wait Time-out" error 
message ANBZ: #32239 commit d1e281f832fcadad3c3f6c8c5f998aadd7cb33a5 upstream. Current IOMMU driver prints "Completion-wait Time-out" error message with insufficient information to further debug the issue. Enhancing the error message as following: 1. Log IOMMU PCI device ID in the error message. 2. With "amd_iommu_dump=1" kernel command line option, dump entire command buffer entries including Head and Tail offset. Dump the entire command buffer only on the first 'Completion-wait Time-out' to avoid dmesg spam. Signed-off-by: Dheeraj Kumar Srivastava Reviewed-by: Ankit Soni Reviewed-by: Vasant Hegde Signed-off-by: Joerg Roedel Signed-off-by: hr567 --- drivers/iommu/amd/amd_iommu_types.h | 4 ++++ drivers/iommu/amd/iommu.c | 28 +++++++++++++++++++++++++++- 2 files changed, 31 insertions(+), 1 deletion(-) diff --git a/drivers/iommu/amd/amd_iommu_types.h b/drivers/iommu/amd/amd_iommu_types.h index de03f7077613..411311d17025 100644 --- a/drivers/iommu/amd/amd_iommu_types.h +++ b/drivers/iommu/amd/amd_iommu_types.h @@ -247,6 +247,10 @@ #define CMD_BUFFER_ENTRIES 512 #define MMIO_CMD_SIZE_SHIFT 56 #define MMIO_CMD_SIZE_512 (0x9ULL << MMIO_CMD_SIZE_SHIFT) +#define MMIO_CMD_HEAD_MASK GENMASK_ULL(18, 4) /* Command buffer head ptr field [18:4] */ +#define MMIO_CMD_BUFFER_HEAD(x) FIELD_GET(MMIO_CMD_HEAD_MASK, (x)) +#define MMIO_CMD_TAIL_MASK GENMASK_ULL(18, 4) /* Command buffer tail ptr field [18:4] */ +#define MMIO_CMD_BUFFER_TAIL(x) FIELD_GET(MMIO_CMD_TAIL_MASK, (x)) /* constants for event buffer handling */ #define EVT_BUFFER_SIZE 8192 /* 512 entries */ diff --git a/drivers/iommu/amd/iommu.c b/drivers/iommu/amd/iommu.c index cbb6cf6b8027..a01f8f466a7c 100644 --- a/drivers/iommu/amd/iommu.c +++ b/drivers/iommu/amd/iommu.c @@ -1155,6 +1155,25 @@ irqreturn_t amd_iommu_int_handler(int irq, void *data) * ****************************************************************************/ +static void dump_command_buffer(struct amd_iommu *iommu) +{ + struct iommu_cmd *cmd; + u32 head, 
tail; + int i; + + head = readl(iommu->mmio_base + MMIO_CMD_HEAD_OFFSET); + tail = readl(iommu->mmio_base + MMIO_CMD_TAIL_OFFSET); + + pr_err("CMD Buffer head=%llu tail=%llu\n", MMIO_CMD_BUFFER_HEAD(head), + MMIO_CMD_BUFFER_TAIL(tail)); + + for (i = 0; i < CMD_BUFFER_ENTRIES; i++) { + cmd = (struct iommu_cmd *)(iommu->cmd_buf + i * sizeof(*cmd)); + pr_err("%3d: %08x %08x %08x %08x\n", i, cmd->data[0], cmd->data[1], cmd->data[2], + cmd->data[3]); + } +} + static int wait_on_sem(struct amd_iommu *iommu, u64 data) { int i = 0; @@ -1165,7 +1184,14 @@ static int wait_on_sem(struct amd_iommu *iommu, u64 data) } if (i == LOOP_TIMEOUT) { - pr_alert("Completion-Wait loop timed out\n"); + + pr_alert("IOMMU %04x:%02x:%02x.%01x: Completion-Wait loop timed out\n", + iommu->pci_seg->id, PCI_BUS_NUM(iommu->devid), + PCI_SLOT(iommu->devid), PCI_FUNC(iommu->devid)); + + if (amd_iommu_dump) + DO_ONCE_LITE(dump_command_buffer, iommu); + return -EIO; } -- Gitee From 4d358b7a1dd9ea2f017593365ce335c9d60b50e3 Mon Sep 17 00:00:00 2001 From: Jinhui Guo Date: Tue, 28 Oct 2025 00:50:17 +0800 Subject: [PATCH 6/9] iommu/amd: Fix pci_segment memleak in alloc_pci_segment() ANBZ: #32239 commit 75ba146c2674ba49ed8a222c67f9abfb4a4f2a4f upstream. Fix a memory leak of struct amd_iommu_pci_segment in alloc_pci_segment() when system memory (or contiguous memory) is insufficient. 
Fixes: 04230c119930 ("iommu/amd: Introduce per PCI segment device table") Fixes: eda797a27795 ("iommu/amd: Introduce per PCI segment rlookup table") Fixes: 99fc4ac3d297 ("iommu/amd: Introduce per PCI segment alias_table") Cc: stable@vger.kernel.org Signed-off-by: Jinhui Guo Signed-off-by: Joerg Roedel Signed-off-by: hr567 --- drivers/iommu/amd/init.c | 15 ++++++++++++--- 1 file changed, 12 insertions(+), 3 deletions(-) diff --git a/drivers/iommu/amd/init.c b/drivers/iommu/amd/init.c index 66b8bc455491..994752ab4459 100644 --- a/drivers/iommu/amd/init.c +++ b/drivers/iommu/amd/init.c @@ -1699,13 +1699,22 @@ static struct amd_iommu_pci_seg *__init alloc_pci_segment(u16 id, list_add_tail(&pci_seg->list, &amd_iommu_pci_seg_list); if (alloc_dev_table(pci_seg)) - return NULL; + goto err_free_pci_seg; if (alloc_alias_table(pci_seg)) - return NULL; + goto err_free_dev_table; if (alloc_rlookup_table(pci_seg)) - return NULL; + goto err_free_alias_table; return pci_seg; + +err_free_alias_table: + free_alias_table(pci_seg); +err_free_dev_table: + free_dev_table(pci_seg); +err_free_pci_seg: + list_del(&pci_seg->list); + kfree(pci_seg); + return NULL; } static struct amd_iommu_pci_seg *__init get_pci_segment(u16 id, -- Gitee From 7b7e888439f5dfbaebf46ce7ba5ee4e4bf002ff5 Mon Sep 17 00:00:00 2001 From: Sairaj Kodilkar Date: Fri, 21 Nov 2025 14:41:15 +0530 Subject: [PATCH 7/9] amd/iommu: Preserve domain ids inside the kdump kernel ANBZ: #32239 commit c2e8dc1222c2136e714d5d972dce7e64924e4ed8 upstream. Currently AMD IOMMU driver does not reserve domain ids programmed in the DTE while reusing the device table inside kdump kernel. This can cause reallocation of these domain ids for newer domains that are created by the kdump kernel, which can lead to potential IO_PAGE_FAULTs Hence reserve these ids inside pdom_ids. 
Fixes: 38e5f33ee359 ("iommu/amd: Reuse device table for kdump") Signed-off-by: Sairaj Kodilkar Reported-by: Jason Gunthorpe Reviewed-by: Vasant Hegde Reviewed-by: Jason Gunthorpe Signed-off-by: Joerg Roedel Signed-off-by: hr567 --- drivers/iommu/amd/init.c | 23 ++++++++++++++++++++++- 1 file changed, 22 insertions(+), 1 deletion(-) diff --git a/drivers/iommu/amd/init.c b/drivers/iommu/amd/init.c index 994752ab4459..eb70665fed00 100644 --- a/drivers/iommu/amd/init.c +++ b/drivers/iommu/amd/init.c @@ -1125,9 +1125,13 @@ static void set_dte_bit(struct dev_table_entry *dte, u8 bit) static bool __reuse_device_table(struct amd_iommu *iommu) { struct amd_iommu_pci_seg *pci_seg = iommu->pci_seg; - u32 lo, hi, old_devtb_size; + struct dev_table_entry *old_dev_tbl_entry; + u32 lo, hi, old_devtb_size, devid; phys_addr_t old_devtb_phys; + u16 dom_id; + bool dte_v; u64 entry; + int ret; /* Each IOMMU use separate device table with the same size */ lo = readl(iommu->mmio_base + MMIO_DEV_TABLE_OFFSET); @@ -1162,6 +1166,23 @@ static bool __reuse_device_table(struct amd_iommu *iommu) return false; } + for (devid = 0; devid <= pci_seg->last_bdf; devid++) { + old_dev_tbl_entry = &pci_seg->old_dev_tbl_cpy[devid]; + dte_v = FIELD_GET(DTE_FLAG_V, old_dev_tbl_entry->data[0]); + dom_id = FIELD_GET(DEV_DOMID_MASK, old_dev_tbl_entry->data[1]); + + if (!dte_v || !dom_id) + continue; + /* + * ID reservation can fail with -ENOSPC when there + * are multiple devices present in the same domain, + * hence check only for -ENOMEM. + */ + ret = ida_alloc_range(&pdom_ids, dom_id, dom_id, GFP_KERNEL); + if (ret == -ENOMEM) + return false; + } + return true; } -- Gitee From ce61d40d3d7e72aff2384e5998116e21e2ab205e Mon Sep 17 00:00:00 2001 From: Ankit Soni Date: Mon, 1 Dec 2025 14:39:40 +0000 Subject: [PATCH 8/9] iommu/amd: move wait_on_sem() out of spinlock ANBZ: #32239 commit d2a0cac10597068567d336e85fa3cbdbe8ca62bf upstream. 
With iommu.strict=1, the existing completion wait path can cause soft lockups under stressed environment, as wait_on_sem() busy-waits under the spinlock with interrupts disabled. Move the completion wait in iommu_completion_wait() out of the spinlock. wait_on_sem() only polls the hardware-updated cmd_sem and does not require iommu->lock, so holding the lock during the busy wait unnecessarily increases contention and extends the time with interrupts disabled. Signed-off-by: Ankit Soni Reviewed-by: Vasant Hegde Signed-off-by: Joerg Roedel Signed-off-by: hr567 --- drivers/iommu/amd/iommu.c | 25 +++++++++++++++++-------- 1 file changed, 17 insertions(+), 8 deletions(-) diff --git a/drivers/iommu/amd/iommu.c b/drivers/iommu/amd/iommu.c index a01f8f466a7c..51dd5eb9da1f 100644 --- a/drivers/iommu/amd/iommu.c +++ b/drivers/iommu/amd/iommu.c @@ -1178,7 +1178,12 @@ static int wait_on_sem(struct amd_iommu *iommu, u64 data) { int i = 0; - while (*iommu->cmd_sem != data && i < LOOP_TIMEOUT) { + /* + * cmd_sem holds a monotonically non-decreasing completion sequence + * number. 
+ */ + while ((__s64)(READ_ONCE(*iommu->cmd_sem) - data) < 0 && + i < LOOP_TIMEOUT) { udelay(1); i += 1; } @@ -1430,14 +1435,13 @@ static int iommu_completion_wait(struct amd_iommu *iommu) raw_spin_lock_irqsave(&iommu->lock, flags); ret = __iommu_queue_command_sync(iommu, &cmd, false); + raw_spin_unlock_irqrestore(&iommu->lock, flags); + if (ret) - goto out_unlock; + return ret; ret = wait_on_sem(iommu, data); -out_unlock: - raw_spin_unlock_irqrestore(&iommu->lock, flags); - return ret; } @@ -3121,13 +3125,18 @@ static void iommu_flush_irt_and_complete(struct amd_iommu *iommu, u16 devid) raw_spin_lock_irqsave(&iommu->lock, flags); ret = __iommu_queue_command_sync(iommu, &cmd, true); if (ret) - goto out; + goto out_err; ret = __iommu_queue_command_sync(iommu, &cmd2, false); if (ret) - goto out; + goto out_err; + raw_spin_unlock_irqrestore(&iommu->lock, flags); + wait_on_sem(iommu, data); -out: + return; + +out_err: raw_spin_unlock_irqrestore(&iommu->lock, flags); + return; } static inline u8 iommu_get_int_tablen(struct iommu_dev_data *dev_data) -- Gitee From 6762c72c0c71786016d9cc5edf3fa691027e64f3 Mon Sep 17 00:00:00 2001 From: Ankit Soni Date: Thu, 22 Jan 2026 15:30:38 +0000 Subject: [PATCH 9/9] iommu/amd: serialize sequence allocation under concurrent TLB invalidations ANBZ: #32239 commit 9e249c48412828e807afddc21527eb734dc9bd3d upstream. With concurrent TLB invalidations, completion wait randomly gets timed out because cmd_sem_val was incremented outside the IOMMU spinlock, allowing CMD_COMPL_WAIT commands to be queued out of sequence and breaking the ordering assumption in wait_on_sem(). Move the cmd_sem_val increment under iommu->lock so completion sequence allocation is serialized with command queuing. And remove the unnecessary return. 
Fixes: d2a0cac10597 ("iommu/amd: move wait_on_sem() out of spinlock") Tested-by: Srikanth Aithal Reported-by: Srikanth Aithal Signed-off-by: Ankit Soni Reviewed-by: Vasant Hegde Signed-off-by: Joerg Roedel Signed-off-by: hr567 --- drivers/iommu/amd/amd_iommu_types.h | 2 +- drivers/iommu/amd/init.c | 2 +- drivers/iommu/amd/iommu.c | 18 ++++++++++++------ 3 files changed, 14 insertions(+), 8 deletions(-) diff --git a/drivers/iommu/amd/amd_iommu_types.h b/drivers/iommu/amd/amd_iommu_types.h index 411311d17025..139404966457 100644 --- a/drivers/iommu/amd/amd_iommu_types.h +++ b/drivers/iommu/amd/amd_iommu_types.h @@ -795,7 +795,7 @@ struct amd_iommu { u32 flags; volatile u64 *cmd_sem; - atomic64_t cmd_sem_val; + u64 cmd_sem_val; /* * Track physical address to directly use it in build_completion_wait() * and avoid adding any special checks and handling for kdump. diff --git a/drivers/iommu/amd/init.c b/drivers/iommu/amd/init.c index eb70665fed00..ade072f18ca7 100644 --- a/drivers/iommu/amd/init.c +++ b/drivers/iommu/amd/init.c @@ -1868,7 +1868,7 @@ static int __init init_iommu_one(struct amd_iommu *iommu, struct ivhd_header *h, iommu->pci_seg = pci_seg; raw_spin_lock_init(&iommu->lock); - atomic64_set(&iommu->cmd_sem_val, 0); + iommu->cmd_sem_val = 0; /* Add IOMMU to internal data structures */ list_add_tail(&iommu->list, &amd_iommu_list); diff --git a/drivers/iommu/amd/iommu.c b/drivers/iommu/amd/iommu.c index 51dd5eb9da1f..fde011a884b7 100644 --- a/drivers/iommu/amd/iommu.c +++ b/drivers/iommu/amd/iommu.c @@ -1415,6 +1415,12 @@ static int iommu_queue_command(struct amd_iommu *iommu, struct iommu_cmd *cmd) return iommu_queue_command_sync(iommu, cmd, true); } +static u64 get_cmdsem_val(struct amd_iommu *iommu) +{ + lockdep_assert_held(&iommu->lock); + return ++iommu->cmd_sem_val; +} + /* * This function queues a completion wait command into the command * buffer of an IOMMU @@ -1429,11 +1435,11 @@ static int iommu_completion_wait(struct amd_iommu *iommu) if 
(!iommu->need_sync) return 0; - data = atomic64_inc_return(&iommu->cmd_sem_val); - build_completion_wait(&cmd, iommu, data); - raw_spin_lock_irqsave(&iommu->lock, flags); + data = get_cmdsem_val(iommu); + build_completion_wait(&cmd, iommu, data); + ret = __iommu_queue_command_sync(iommu, &cmd, false); raw_spin_unlock_irqrestore(&iommu->lock, flags); @@ -3119,10 +3125,11 @@ static void iommu_flush_irt_and_complete(struct amd_iommu *iommu, u16 devid) return; build_inv_irt(&cmd, devid); - data = atomic64_inc_return(&iommu->cmd_sem_val); - build_completion_wait(&cmd2, iommu, data); raw_spin_lock_irqsave(&iommu->lock, flags); + data = get_cmdsem_val(iommu); + build_completion_wait(&cmd2, iommu, data); + ret = __iommu_queue_command_sync(iommu, &cmd, true); if (ret) goto out_err; @@ -3136,7 +3143,6 @@ static void iommu_flush_irt_and_complete(struct amd_iommu *iommu, u16 devid) out_err: raw_spin_unlock_irqrestore(&iommu->lock, flags); - return; } static inline u8 iommu_get_int_tablen(struct iommu_dev_data *dev_data) -- Gitee