diff --git a/MAINTAINERS b/MAINTAINERS index 3501c95f7cde8d3461b09af17db1fe4371c20731..5b749b448e7d035cc6bf25ece4ab1426d8a2b24e 100644 --- a/MAINTAINERS +++ b/MAINTAINERS @@ -11052,6 +11052,18 @@ F: include/linux/iommu.h F: include/linux/iova.h F: include/linux/of_iommu.h +IOMMU LIVEUPDATE +M: Samiullah Khawaja +R: Pranjal Shrivastava +L: iommu@lists.linux.dev +S: Maintained +F: drivers/iommu/intel/liveupdate.c +F: drivers/iommu/iommufd/liveupdate.c +F: drivers/iommu/liveupdate.c +F: include/linux/iommu-liveupdate.h +F: include/linux/kho/abi/iommu.h +F: include/linux/kho/abi/iommufd.h + IOMMUFD M: Jason Gunthorpe M: Kevin Tian diff --git a/drivers/iommu/Kconfig b/drivers/iommu/Kconfig index 7f3aa3085c0cad9cf4bebd4b60d9604f03d68b8f..47efd45469aca73248709f0752f9fa0a6618edf6 100644 --- a/drivers/iommu/Kconfig +++ b/drivers/iommu/Kconfig @@ -560,6 +560,18 @@ config IOMMU_DEBUG_PAGEALLOC line to activate the runtime checks. If unsure, say N. + +config IOMMU_LIVEUPDATE + bool "IOMMU live update state preservation support" + depends on LIVEUPDATE && IOMMUFD + help + Enable support for preserving IOMMU state across a kexec live update. + + This allows devices managed by iommufd to maintain their DMA mappings + during a kexec-based kernel update. + + If unsure, say N. 
+ endif # IOMMU_SUPPORT source "drivers/iommu/generic_pt/Kconfig" diff --git a/drivers/iommu/Makefile b/drivers/iommu/Makefile index cba1cff4aa119afece6c064cfc100bd8a5d49b63..0f567d19b24b1f6bf1ae19b3dc6f9366b2aa06bf 100644 --- a/drivers/iommu/Makefile +++ b/drivers/iommu/Makefile @@ -12,6 +12,7 @@ obj-$(CONFIG_IOMMU_IO_PGTABLE_ARMV7S) += io-pgtable-arm-v7s.o obj-$(CONFIG_IOMMU_IO_PGTABLE_LPAE) += io-pgtable-arm.o obj-$(CONFIG_IOMMU_IO_PGTABLE_DART) += io-pgtable-dart.o obj-$(CONFIG_IOMMU_IOVA) += iova.o +obj-$(CONFIG_IOMMU_LIVEUPDATE) += liveupdate.o obj-$(CONFIG_OF_IOMMU) += of_iommu.o obj-$(CONFIG_MSM_IOMMU) += msm_iommu.o obj-$(CONFIG_IPMMU_VMSA) += ipmmu-vmsa.o diff --git a/drivers/iommu/generic_pt/iommu_pt.h b/drivers/iommu/generic_pt/iommu_pt.h index 93ef5006ebfcbd81f85155256a13433eb155b5b6..32e65d0726bc38632158b7274fa2d15d6aff45d4 100644 --- a/drivers/iommu/generic_pt/iommu_pt.h +++ b/drivers/iommu/generic_pt/iommu_pt.h @@ -920,6 +920,136 @@ int DOMAIN_NS(map_pages)(struct iommu_domain *domain, unsigned long iova, } EXPORT_SYMBOL_NS_GPL(DOMAIN_NS(map_pages), GENERIC_PT_IOMMU); +#ifdef CONFIG_IOMMU_LIVEUPDATE +/** + * unpreserve() - Unpreserve page tables and other state of a domain. + * @domain: Domain to unpreserve + * @ser: Serialized domain state; not used by this implementation + */ +void DOMAIN_NS(unpreserve)(struct iommu_domain *domain, struct iommu_domain_ser *ser) +{ + struct pt_iommu *iommu_table = + container_of(domain, struct pt_iommu, domain); + struct pt_common *common = common_from_iommu(iommu_table); + struct pt_range range = pt_all_range(common); + struct pt_iommu_collect_args collect = { + .free_list = IOMMU_PAGES_LIST_INIT(collect.free_list), + }; + + iommu_pages_list_add(&collect.free_list, range.top_table); + pt_walk_range(&range, __collect_tables, &collect); + + iommu_unpreserve_pages(&collect.free_list); +} +EXPORT_SYMBOL_NS_GPL(DOMAIN_NS(unpreserve), GENERIC_PT_IOMMU); + +/** + * preserve() - Preserve page tables and other state of a domain. 
+ * @domain: Domain to preserve + * @ser: Serialized domain state to fill in for the next kernel + * + * Returns: -ERRNO on failure, 0 on success. + */ +int DOMAIN_NS(preserve)(struct iommu_domain *domain, struct iommu_domain_ser *ser) +{ + struct pt_iommu *iommu_table = + container_of(domain, struct pt_iommu, domain); + struct pt_common *common = common_from_iommu(iommu_table); + struct pt_range range = pt_all_range(common); + struct pt_iommu_collect_args collect = { + .free_list = IOMMU_PAGES_LIST_INIT(collect.free_list), + }; + int ret; + + iommu_pages_list_add(&collect.free_list, range.top_table); + pt_walk_range(&range, __collect_tables, &collect); + + ret = iommu_preserve_pages(&collect.free_list); + if (ret) + return ret; + + ser->top_table_phys = virt_to_phys(range.top_table); + ser->top_level = range.top_level; + + /* + * VASZ and SIGN_EXTEND will be needed in next kernel for collector page + * table walk to restore and free pages. + */ + ser->vasz = common->max_vasz_lg2; + ser->sign_extend = pt_feature(common, PT_FEAT_SIGN_EXTEND); + + return 0; +} +EXPORT_SYMBOL_NS_GPL(DOMAIN_NS(preserve), GENERIC_PT_IOMMU); + +static int __restore_tables(struct pt_range *range, void *arg, + unsigned int level, struct pt_table_p *table) +{ + struct pt_state pts = pt_init(range, level, table); + int ret; + + for_each_pt_level_entry(&pts) { + if (pts.type == PT_ENTRY_TABLE) { + iommu_restore_page(virt_to_phys(pts.table_lower)); + + /* + * pt_descend() can only fail if pts.table_lower is not + * initialized, so the error check below is effectively dead code. + */ + ret = pt_descend(&pts, arg, __restore_tables); + if (ret) + return ret; + } + } + + return 0; +} + +static const struct pt_iommu_ops NS(ops_immutable); + +/** + * restore() - Restore page tables and other state of a domain. + * @domain: Domain to restore + * @ser: Serialized domain state to restore from + * + * Returns: -ERRNO on failure, 0 on success. 
+ */ +int DOMAIN_NS(restore)(struct iommu_domain *domain, struct iommu_domain_ser *ser) +{ + struct pt_iommu *iommu_table = + container_of(domain, struct pt_iommu, domain); + struct pt_common *common = common_from_iommu(iommu_table); + struct pt_range range; + + common->max_vasz_lg2 = ser->vasz; + + /* Make this domain immutable. */ + iommu_table->ops = &NS(ops_immutable); + + /* + * It is safe to override this here since this domain is immutable and + * can only be freed. + */ + common->features = 0; + if (ser->sign_extend) + common->features |= BIT(PT_FEAT_SIGN_EXTEND); + + range = pt_all_range(common); + iommu_restore_page(ser->top_table_phys); + + /* Free new table */ + iommu_free_pages(range.top_table); + + /* Set the restored top table */ + pt_top_set(common, phys_to_virt(ser->top_table_phys), ser->top_level); + + /* Restore all pages */ + range = pt_all_range(common); + return pt_walk_range(&range, __restore_tables, NULL); +} +EXPORT_SYMBOL_NS_GPL(DOMAIN_NS(restore), GENERIC_PT_IOMMU); +#endif + struct pt_unmap_args { struct iommu_pages_list free_list; pt_vaddr_t unmapped; @@ -1118,6 +1248,10 @@ static const struct pt_iommu_ops NS(ops) = { .deinit = NS(deinit), }; +static const struct pt_iommu_ops NS(ops_immutable) = { + .deinit = NS(deinit), +}; + static int pt_init_common(struct pt_common *common) { struct pt_range top_range = pt_top_range(common); diff --git a/drivers/iommu/generic_pt/kunit_iommu_pt.h b/drivers/iommu/generic_pt/kunit_iommu_pt.h index e8a63c8ea850e594d1f214f8490bb78d977dad7e..af1918d693ed11e0d554181d9433f9112a053d5e 100644 --- a/drivers/iommu/generic_pt/kunit_iommu_pt.h +++ b/drivers/iommu/generic_pt/kunit_iommu_pt.h @@ -426,6 +426,33 @@ static void test_mixed(struct kunit *test) check_iova(test, start, oa, len); } +static void test_restore_free(struct kunit *test) +{ + struct kunit_iommu_priv *priv = test->priv; + struct pt_range top_range = pt_top_range(priv->common); + u64 start = 0x3fe400ULL << 12; + u64 end = 0x4c0600ULL << 12; + 
pt_vaddr_t len = end - start; + + if (top_range.last_va <= start || sizeof(unsigned long) == 4) + kunit_skip(test, "range is too small"); + if ((priv->safe_pgsize_bitmap & GENMASK(30, 21)) != (BIT(30) | BIT(21))) + kunit_skip(test, "incompatible psize"); + + /* Map a large mixed range to populate multiple levels of page tables */ + do_map(test, start, start, len); + + /* + * Simulate a restored state by clearing all features except + * SIGN_EXTEND. This verifies that the generic page table free walker + * can correctly tear down a populated domain when other features are + * zeroed. + */ + priv->common->features &= BIT(PT_FEAT_SIGN_EXTEND); + + /* The domain will be freed when the test exits. */ +} + static struct kunit_case iommu_test_cases[] = { KUNIT_CASE_FMT(test_increase_level), KUNIT_CASE_FMT(test_map_simple), @@ -434,6 +461,7 @@ static struct kunit_case iommu_test_cases[] = { KUNIT_CASE_FMT(test_random_map), KUNIT_CASE_FMT(test_pgsize_boundary), KUNIT_CASE_FMT(test_mixed), + KUNIT_CASE_FMT(test_restore_free), {}, }; diff --git a/drivers/iommu/intel/Makefile b/drivers/iommu/intel/Makefile index 6c7528130cf9d53489c7658b4f8a31c1f8194feb..d26f8e8ad852bd654836926489fd1a2dc69f17af 100644 --- a/drivers/iommu/intel/Makefile +++ b/drivers/iommu/intel/Makefile @@ -9,3 +9,4 @@ ifdef CONFIG_INTEL_IOMMU obj-$(CONFIG_IRQ_REMAP) += irq_remapping.o endif obj-$(CONFIG_INTEL_IOMMU_PERF_EVENTS) += perfmon.o +obj-$(CONFIG_IOMMU_LIVEUPDATE) += liveupdate.o diff --git a/drivers/iommu/intel/iommu.c b/drivers/iommu/intel/iommu.c index 853cb2cccde042cde0935746b0df13f346656855..8061754fdafb1cbcbdc8ba6243d97a1f492c0db4 100644 --- a/drivers/iommu/intel/iommu.c +++ b/drivers/iommu/intel/iommu.c @@ -16,6 +16,7 @@ #include #include #include +#include #include #include #include @@ -50,6 +51,8 @@ static int intel_iommu_set_dirty_tracking(struct iommu_domain *domain, bool enable); static int rwbf_quirk; +static void clear_unpreserved_context_entries(struct intel_iommu *iommu); + /* * set to 
1 to panic kernel if can't successfully enable VT-d * (used when kernel is launched w/ TXT) @@ -58,8 +61,6 @@ static int force_on = 0; static int intel_iommu_tboot_noforce; static int no_platform_optin; -#define ROOT_ENTRY_NR (VTD_PAGE_SIZE/sizeof(struct root_entry)) - /* * Take a root_entry and return the Lower Context Table Pointer (LCTP) * if marked present. @@ -667,10 +668,17 @@ void dmar_fault_dump_ptes(struct intel_iommu *iommu, u16 source_id, #endif /* iommu handling */ -static int iommu_alloc_root_entry(struct intel_iommu *iommu) +static int iommu_alloc_root_entry(struct intel_iommu *iommu, + struct iommu_hw_ser *iommu_ser) { struct root_entry *root; + if (iommu_ser) { + intel_iommu_liveupdate_restore_root_table(iommu, iommu_ser); + __iommu_flush_cache(iommu, iommu->root_entry, ROOT_SIZE); + return 0; + } + root = iommu_alloc_pages_node_sz(iommu->node, GFP_ATOMIC, SZ_4K); if (!root) { pr_err("Allocating root entry for %s failed\n", @@ -989,15 +997,16 @@ static void disable_dmar_iommu(struct intel_iommu *iommu) iommu_disable_translation(iommu); } -static void free_dmar_iommu(struct intel_iommu *iommu) +static void free_dmar_iommu(struct intel_iommu *iommu, struct iommu_hw_ser *iommu_ser) { if (iommu->copied_tables) { bitmap_free(iommu->copied_tables); iommu->copied_tables = NULL; } - /* free context mapping */ - free_context_table(iommu); + /* free context mapping if there is no serialized state. 
*/ + if (!iommu_ser) + free_context_table(iommu); if (ecap_prs(iommu->ecap)) intel_iommu_finish_prq(iommu); @@ -1020,7 +1029,8 @@ static bool first_level_by_default(struct intel_iommu *iommu) return true; } -int domain_attach_iommu(struct dmar_domain *domain, struct intel_iommu *iommu) +int domain_attach_iommu(struct dmar_domain *domain, struct intel_iommu *iommu, + int restore_did) { struct iommu_domain_info *info, *curr; int num, ret = -ENOSPC; @@ -1040,8 +1050,11 @@ int domain_attach_iommu(struct dmar_domain *domain, struct intel_iommu *iommu) return 0; } - num = ida_alloc_range(&iommu->domain_ida, IDA_START_DID, - cap_ndoms(iommu->cap) - 1, GFP_KERNEL); + if (restore_did >= IDA_START_DID) + num = restore_did; + else + num = ida_alloc_range(&iommu->domain_ida, IDA_START_DID, + cap_ndoms(iommu->cap) - 1, GFP_KERNEL); if (num < 0) { pr_err("%s: No free domain ids\n", iommu->name); goto err_unlock; @@ -1321,10 +1334,14 @@ static int dmar_domain_attach_device(struct dmar_domain *domain, { struct device_domain_info *info = dev_iommu_priv_get(dev); struct intel_iommu *iommu = info->iommu; + struct iommu_device_ser *device_ser = NULL; unsigned long flags; int ret; - ret = domain_attach_iommu(domain, iommu); + device_ser = dev_iommu_restored_state(dev); + + ret = domain_attach_iommu(domain, iommu, + dev_iommu_restore_did(dev, &domain->domain)); if (ret) return ret; @@ -1337,16 +1354,18 @@ static int dmar_domain_attach_device(struct dmar_domain *domain, if (dev_is_real_dma_subdevice(dev)) return 0; - if (!sm_supported(iommu)) - ret = domain_context_mapping(domain, dev); - else if (intel_domain_is_fs_paging(domain)) - ret = domain_setup_first_level(iommu, domain, dev, - IOMMU_NO_PASID, NULL); - else if (intel_domain_is_ss_paging(domain)) - ret = domain_setup_second_level(iommu, domain, dev, - IOMMU_NO_PASID, NULL); - else if (WARN_ON(true)) - ret = -EINVAL; + if (!device_ser) { + if (!sm_supported(iommu)) + ret = domain_context_mapping(domain, dev); + else if 
(intel_domain_is_fs_paging(domain)) + ret = domain_setup_first_level(iommu, domain, dev, + IOMMU_NO_PASID, NULL); + else if (intel_domain_is_ss_paging(domain)) + ret = domain_setup_second_level(iommu, domain, dev, + IOMMU_NO_PASID, NULL); + else if (WARN_ON(true)) + ret = -EINVAL; + } if (ret) goto out_block_translation; @@ -1622,6 +1641,7 @@ static int copy_translation_tables(struct intel_iommu *iommu) static int __init init_dmars(void) { + struct iommu_hw_ser *iommu_ser = NULL; struct dmar_drhd_unit *drhd; struct intel_iommu *iommu; int ret; @@ -1644,8 +1664,12 @@ static int __init init_dmars(void) intel_pasid_max_id); } + iommu_ser = iommu_get_preserved_data(iommu->reg_phys, IOMMU_INTEL); + intel_iommu_init_qi(iommu); - init_translation_status(iommu); + + if (!iommu_ser) + init_translation_status(iommu); if (translation_pre_enabled(iommu) && !is_kdump_kernel()) { iommu_disable_translation(iommu); @@ -1659,7 +1683,7 @@ static int __init init_dmars(void) * we could share the same root & context tables * among all IOMMU's. Need to Split it later. */ - ret = iommu_alloc_root_entry(iommu); + ret = iommu_alloc_root_entry(iommu, iommu_ser); if (ret) goto free_iommu; @@ -1743,8 +1767,12 @@ static int __init init_dmars(void) free_iommu: for_each_active_iommu(iommu, drhd) { - disable_dmar_iommu(iommu); - free_dmar_iommu(iommu); + iommu_ser = iommu_get_preserved_data(iommu->reg_phys, IOMMU_INTEL); + + if (!iommu_ser) + disable_dmar_iommu(iommu); + + free_dmar_iommu(iommu, iommu_ser); } return ret; @@ -2114,15 +2142,19 @@ int dmar_parse_one_satc(struct acpi_dmar_header *hdr, void *arg) static int intel_iommu_add(struct dmar_drhd_unit *dmaru) { struct intel_iommu *iommu = dmaru->iommu; + struct iommu_hw_ser *iommu_ser = NULL; int ret; + /* Use IOMMU HW unit MMIO base to identify the preserved state. */ + iommu_ser = iommu_get_preserved_data(iommu->reg_phys, IOMMU_INTEL); + /* * Disable translation if already enabled prior to OS handover. 
*/ - if (iommu->gcmd & DMA_GCMD_TE) + if (!iommu_ser && iommu->gcmd & DMA_GCMD_TE) iommu_disable_translation(iommu); - ret = iommu_alloc_root_entry(iommu); + ret = iommu_alloc_root_entry(iommu, iommu_ser); if (ret) goto out; @@ -2157,9 +2189,10 @@ static int intel_iommu_add(struct dmar_drhd_unit *dmaru) return 0; disable_iommu: - disable_dmar_iommu(iommu); + if (!iommu_ser) + disable_dmar_iommu(iommu); out: - free_dmar_iommu(iommu); + free_dmar_iommu(iommu, iommu_ser); return ret; } @@ -2167,6 +2200,7 @@ int dmar_iommu_hotplug(struct dmar_drhd_unit *dmaru, bool insert) { int ret = 0; struct intel_iommu *iommu = dmaru->iommu; + struct iommu_hw_ser *iommu_ser; if (!intel_iommu_enabled) return 0; @@ -2176,8 +2210,12 @@ int dmar_iommu_hotplug(struct dmar_drhd_unit *dmaru, bool insert) if (insert) { ret = intel_iommu_add(dmaru); } else { - disable_dmar_iommu(iommu); - free_dmar_iommu(iommu); + iommu_ser = iommu_get_preserved_data(iommu->reg_phys, IOMMU_INTEL); + + if (!iommu_ser) + disable_dmar_iommu(iommu); + + free_dmar_iommu(iommu, iommu_ser); } return ret; @@ -2401,8 +2439,11 @@ void intel_iommu_shutdown(void) /* Disable PMRs explicitly here. */ iommu_disable_protect_mem_regions(iommu); - /* Make sure the IOMMUs are switched off */ - iommu_disable_translation(iommu); + /* Make sure the IOMMUs are switched off if not preserved. 
*/ + if (iommu_preserved_state(&iommu->iommu)) + clear_unpreserved_context_entries(iommu); + else + iommu_disable_translation(iommu); } } @@ -2961,6 +3002,43 @@ static const struct iommu_dirty_ops intel_second_stage_dirty_ops = { .set_dirty_tracking = intel_iommu_set_dirty_tracking, }; +#ifdef CONFIG_IOMMU_LIVEUPDATE +static int clear_unpreserve_context_entry_fn(struct device *dev, + struct iommu_device *iommu, + void *arg) +{ + struct device_domain_info *info; + + info = dev_iommu_priv_get(dev); + if (!info) + return 0; + + if (dev_is_pci(dev) && dev_iommu_preserved_state(dev)) { + pasid_cleanup_preserved_table(dev); + return 0; + } + + domain_context_clear(info); + return 0; +} + +static void clear_unpreserved_context_entries(struct intel_iommu *iommu) +{ + struct iommu_dev_iter iter = { + .fn = clear_unpreserve_context_entry_fn, + .iommu = &iommu->iommu, + .arg = NULL, + + }; + + iommu_for_each_dev(&iter); +} +#else +static void clear_unpreserved_context_entries(struct intel_iommu *iommu) +{ +} +#endif + static struct iommu_domain * intel_iommu_domain_alloc_second_stage(struct device *dev, struct intel_iommu *iommu, u32 flags) @@ -3149,6 +3227,15 @@ int paging_domain_compatible(struct iommu_domain *domain, struct device *dev) struct intel_iommu *iommu = info->iommu; int ret = -EINVAL; +#ifdef CONFIG_IOMMU_LIVEUPDATE + /* + * Restored IOMMU domains are already attached to the device and can + * only be freed. So no need to check the compatibility. 
+ */ + if (iommu_domain_restored_state(domain)) + return 0; +#endif + if (intel_domain_is_fs_paging(dmar_domain)) ret = paging_domain_compatible_first_stage(dmar_domain, iommu); else if (intel_domain_is_ss_paging(dmar_domain)) @@ -3625,7 +3712,7 @@ domain_add_dev_pasid(struct iommu_domain *domain, if (!dev_pasid) return ERR_PTR(-ENOMEM); - ret = domain_attach_iommu(dmar_domain, iommu); + ret = domain_attach_iommu(dmar_domain, iommu, -1); if (ret) goto out_free; @@ -3967,6 +4054,12 @@ const struct iommu_ops intel_iommu_ops = { .is_attach_deferred = intel_iommu_is_attach_deferred, .def_domain_type = device_def_domain_type, .page_response = intel_iommu_page_response, +#ifdef CONFIG_IOMMU_LIVEUPDATE + .preserve_device = intel_iommu_preserve_device, + .unpreserve_device = intel_iommu_unpreserve_device, + .preserve = intel_iommu_preserve, + .unpreserve = intel_iommu_unpreserve, +#endif }; static void quirk_iommu_igfx(struct pci_dev *dev) diff --git a/drivers/iommu/intel/iommu.h b/drivers/iommu/intel/iommu.h index c6eb0227e33bfde392e9ad5435e449fc7198f9b7..a18726f1734b7f110fd6dfcdfa839983ec3d364a 100644 --- a/drivers/iommu/intel/iommu.h +++ b/drivers/iommu/intel/iommu.h @@ -563,6 +563,8 @@ struct root_entry { u64 hi; }; +#define ROOT_ENTRY_NR (VTD_PAGE_SIZE / sizeof(struct root_entry)) + /* * low 64 bits: * 0: present @@ -1196,7 +1198,8 @@ void __iommu_flush_iotlb(struct intel_iommu *iommu, u16 did, u64 addr, */ #define QI_OPT_WAIT_DRAIN BIT(0) -int domain_attach_iommu(struct dmar_domain *domain, struct intel_iommu *iommu); +int domain_attach_iommu(struct dmar_domain *domain, struct intel_iommu *iommu, + int restore_did); void domain_detach_iommu(struct dmar_domain *domain, struct intel_iommu *iommu); void device_block_translation(struct device *dev); int paging_domain_compatible(struct iommu_domain *domain, struct device *dev); @@ -1300,6 +1303,51 @@ static inline int iopf_for_domain_replace(struct iommu_domain *new, return 0; } +#ifdef CONFIG_IOMMU_LIVEUPDATE +int 
intel_iommu_preserve_device(struct device *dev, + struct iommu_device_ser *device_ser); +void intel_iommu_unpreserve_device(struct device *dev, + struct iommu_device_ser *device_ser); +int intel_iommu_preserve(struct iommu_device *iommu, + struct iommu_hw_ser *iommu_ser); +void intel_iommu_unpreserve(struct iommu_device *iommu, + struct iommu_hw_ser *iommu_ser); +void intel_iommu_liveupdate_restore_root_table(struct intel_iommu *iommu, + struct iommu_hw_ser *iommu_ser); +void pasid_cleanup_preserved_table(struct device *dev); +#else +static inline int intel_iommu_preserve_device(struct device *dev, + struct iommu_device_ser *device_ser) +{ + return -EOPNOTSUPP; +} + +static inline void intel_iommu_unpreserve_device(struct device *dev, + struct iommu_device_ser *device_ser) +{ +} + +static inline int intel_iommu_preserve(struct iommu_device *iommu, + struct iommu_hw_ser *iommu_ser) +{ + return -EOPNOTSUPP; +} + +static inline void intel_iommu_unpreserve(struct iommu_device *iommu, + struct iommu_hw_ser *iommu_ser) +{ +} + +static inline void intel_iommu_liveupdate_restore_root_table(struct intel_iommu *iommu, + struct iommu_hw_ser *iommu_ser) +{ +} + +static inline void pasid_cleanup_preserved_table(struct device *dev) +{ +} +#endif + #ifdef CONFIG_INTEL_IOMMU_SVM void intel_svm_check(struct intel_iommu *iommu); struct iommu_domain *intel_svm_domain_alloc(struct device *dev, diff --git a/drivers/iommu/intel/liveupdate.c b/drivers/iommu/intel/liveupdate.c new file mode 100644 index 0000000000000000000000000000000000000000..404b485e97b93cb24576210c960ac3057d2aafee --- /dev/null +++ b/drivers/iommu/intel/liveupdate.c @@ -0,0 +1,337 @@ +// SPDX-License-Identifier: GPL-2.0-only + +/* + * Copyright (C) 2026, Google LLC + * Author: Samiullah Khawaja + */ + +#define pr_fmt(fmt) "DMAR: liveupdate: " fmt + +#include +#include +#include +#include +#include + +#include "iommu.h" +#include "pasid.h" +#include "../iommu-pages.h" + +static void 
unpreserve_iommu_context_table(struct intel_iommu *iommu, int end) +{ + struct context_entry *context; + int i; + + for (i = 0; i < end; i++) { + context = iommu_context_addr(iommu, i, 0, 0); + if (context) + iommu_unpreserve_page(context); + + if (!sm_supported(iommu)) + continue; + + context = iommu_context_addr(iommu, i, 0x80, 0); + if (context) + iommu_unpreserve_page(context); + } +} + +static int preserve_iommu_context_table(struct intel_iommu *iommu) +{ + struct context_entry *context; + int ret; + int i; + + for (i = 0; i < ROOT_ENTRY_NR; i++) { + /* + * Alloc the context tables now to make sure the iommu unit is + * properly preserved. These might stay unused and waste around + * 32MB max in scalable mode. + */ + spin_lock(&iommu->lock); + context = iommu_context_addr(iommu, i, 0, 1); + spin_unlock(&iommu->lock); + if (!context) { + ret = -ENOMEM; + goto error; + } + ret = iommu_preserve_page(context); + if (ret) + goto error; + + if (!sm_supported(iommu)) + continue; + + spin_lock(&iommu->lock); + context = iommu_context_addr(iommu, i, 0x80, 1); + spin_unlock(&iommu->lock); + if (!context) { + ret = -ENOMEM; + goto error_sm; + } + ret = iommu_preserve_page(context); + if (ret) + goto error_sm; + } + + return 0; + +error_sm: + context = iommu_context_addr(iommu, i, 0, 0); + iommu_unpreserve_page(context); +error: + unpreserve_iommu_context_table(iommu, i); + return ret; +} + +static void restore_iommu_context(struct intel_iommu *iommu) +{ + struct context_entry *context; + int i; + + for (i = 0; i < ROOT_ENTRY_NR; i++) { + context = iommu_context_addr(iommu, i, 0, 0); + if (context) + iommu_restore_page(virt_to_phys(context)); + + if (!sm_supported(iommu)) + continue; + + context = iommu_context_addr(iommu, i, 0x80, 0); + if (context) + iommu_restore_page(virt_to_phys(context)); + } +} + +static int _restore_used_domain_ids(struct iommu_device_ser *ser, void *arg) +{ + int id = ser->domain_iommu_ser.attachment_id; + struct iommu_hw_ser 
*iommu_hw_ser; + struct intel_iommu *iommu = arg; + + iommu_hw_ser = phys_to_virt(ser->domain_iommu_ser.iommu_phys); + if (iommu_hw_ser->type != IOMMU_INTEL) + return 0; + + /* Only allocate domain ID from associated IOMMU HW unit */ + if (iommu_hw_ser->intel.phys_addr != iommu->reg_phys) + return 0; + + /* + * This can fail as multiple preserved devices can share the same domain + * ID. Since this is done during DMAR init, these failures can be + * ignored. + */ + ida_alloc_range(&iommu->domain_ida, id, id, GFP_ATOMIC); + return 0; +} + +void intel_iommu_liveupdate_restore_root_table(struct intel_iommu *iommu, + struct iommu_hw_ser *iommu_ser) +{ + if (!iommu_ser->intel.restored) + iommu_restore_page(iommu_ser->intel.root_table); + + iommu->root_entry = __va(iommu_ser->intel.root_table); + + if (!iommu_ser->intel.restored) + restore_iommu_context(iommu); + + iommu_ser->intel.restored = 1; + iommu_for_each_preserved_device(_restore_used_domain_ids, iommu); +} + +enum pasid_lu_op { + PASID_LU_OP_PRESERVE = 1, + PASID_LU_OP_UNPRESERVE, + PASID_LU_OP_RESTORE, + PASID_LU_OP_FREE, +}; + +static int pasid_lu_do_op(void *table, enum pasid_lu_op op) +{ + int ret = 0; + + switch (op) { + case PASID_LU_OP_PRESERVE: + ret = iommu_preserve_page(table); + break; + case PASID_LU_OP_UNPRESERVE: + iommu_unpreserve_page(table); + break; + case PASID_LU_OP_RESTORE: + iommu_restore_page(virt_to_phys(table)); + break; + case PASID_LU_OP_FREE: + iommu_free_pages(table); + break; + } + + return ret; +} + +static int pasid_lu_handle_pd(struct pasid_dir_entry *dir, enum pasid_lu_op op) +{ + struct pasid_entry *table; + int ret; + + /* Only preserve first table for NO_PASID. 
*/ + table = get_pasid_table_from_pde(&dir[0]); + if (!table) + return -EINVAL; + + ret = pasid_lu_do_op(table, op); + if (ret) + return ret; + + ret = pasid_lu_do_op(dir, op); + if (ret) + goto err; + + return 0; +err: + if (op == PASID_LU_OP_PRESERVE) + pasid_lu_do_op(table, PASID_LU_OP_UNPRESERVE); + + return ret; +} + +void pasid_cleanup_preserved_table(struct device *dev) +{ + struct pasid_table *pasid_table; + struct pasid_dir_entry *dir; + struct pasid_entry *table; + size_t dir_size; + + pasid_table = intel_pasid_get_table(dev); + if (!pasid_table) + return; + + dir = pasid_table->table; + table = get_pasid_table_from_pde(&dir[0]); + if (!table) + return; + + /* Clear everything except the first entry in table. */ + memset(&table[1], 0, SZ_4K - sizeof(*table)); + + /* Use the folio order to calculate the size of Pasid Directory */ + dir_size = (1 << (folio_order(virt_to_folio(dir)) + PAGE_SHIFT)); + + /* Clear everything except the first entry in directory */ + memset(&dir[1], 0, dir_size - sizeof(struct pasid_dir_entry)); + + clflush_cache_range(&table[0], SZ_4K); + clflush_cache_range(&dir[0], dir_size); +} + +int intel_iommu_preserve_device(struct device *dev, + struct iommu_device_ser *device_ser) +{ + struct device_domain_info *info = dev_iommu_priv_get(dev); + struct pasid_table *pasid_table; + int ret; + + if (!dev_is_pci(dev)) { + dev_err(dev, "Cannot preserve non-PCI device\n"); + return -EOPNOTSUPP; + } + + if (!info) + return -EINVAL; + + device_ser->domain_iommu_ser.attachment_id = domain_id_iommu(info->domain, + info->iommu); + + if (!sm_supported(info->iommu)) + return 0; + + pasid_table = intel_pasid_get_table(dev); + if (!pasid_table) + return -EINVAL; + + ret = pasid_lu_handle_pd(pasid_table->table, PASID_LU_OP_PRESERVE); + if (ret) + return ret; + + device_ser->intel.pasid_table = virt_to_phys(pasid_table->table); + device_ser->intel.max_pasid = pasid_table->max_pasid; + return 0; +} + +void intel_iommu_unpreserve_device(struct device 
*dev, + struct iommu_device_ser *device_ser) +{ + struct device_domain_info *info = dev_iommu_priv_get(dev); + struct pasid_table *pasid_table; + + if (!dev_is_pci(dev)) + return; + + if (!info) + return; + + if (!sm_supported(info->iommu)) + return; + + pasid_table = intel_pasid_get_table(dev); + if (!pasid_table) + return; + + pasid_lu_handle_pd(pasid_table->table, PASID_LU_OP_UNPRESERVE); +} + +int intel_iommu_preserve(struct iommu_device *iommu_dev, + struct iommu_hw_ser *ser) +{ + struct intel_iommu *iommu; + int ret; + + iommu = container_of(iommu_dev, struct intel_iommu, iommu); + + ret = preserve_iommu_context_table(iommu); + if (ret) + return ret; + + ret = iommu_preserve_page(iommu->root_entry); + if (ret) { + unpreserve_iommu_context_table(iommu, ROOT_ENTRY_NR); + return ret; + } + + ser->intel.phys_addr = iommu->reg_phys; + ser->intel.root_table = __pa(iommu->root_entry); + ser->type = IOMMU_INTEL; + ser->token = ser->intel.phys_addr; + + return 0; +} + +void intel_iommu_unpreserve(struct iommu_device *iommu_dev, + struct iommu_hw_ser *iommu_ser) +{ + struct intel_iommu *iommu; + + iommu = container_of(iommu_dev, struct intel_iommu, iommu); + + unpreserve_iommu_context_table(iommu, ROOT_ENTRY_NR); + iommu_unpreserve_page(iommu->root_entry); +} + +void *intel_pasid_try_restore_table(struct device *dev, u64 max_pasid) +{ + struct iommu_device_ser *ser = dev_iommu_restored_state(dev); + + if (!ser) + return NULL; + + BUG_ON(pasid_lu_handle_pd(phys_to_virt(ser->intel.pasid_table), + PASID_LU_OP_RESTORE)); + if (WARN_ON_ONCE(ser->intel.max_pasid != max_pasid)) { + pasid_lu_handle_pd(phys_to_virt(ser->intel.pasid_table), + PASID_LU_OP_FREE); + return NULL; + } + + return phys_to_virt(ser->intel.pasid_table); +} diff --git a/drivers/iommu/intel/nested.c b/drivers/iommu/intel/nested.c index e9a440e9c960b219c7d35a0285cdc43117484dee..879b614fc5841fe31eb9922b7369adb114429d26 100644 --- a/drivers/iommu/intel/nested.c +++ b/drivers/iommu/intel/nested.c @@ -40,7 
+40,7 @@ static int intel_nested_attach_dev(struct iommu_domain *domain, return ret; } - ret = domain_attach_iommu(dmar_domain, iommu); + ret = domain_attach_iommu(dmar_domain, iommu, -1); if (ret) { dev_err_ratelimited(dev, "Failed to attach domain to iommu\n"); return ret; diff --git a/drivers/iommu/intel/pasid.c b/drivers/iommu/intel/pasid.c index b63a71904cfb8b05463ce45a9904a7b3b65d91af..cc9756300e46178839b100120f8bcc6a5abf3099 100644 --- a/drivers/iommu/intel/pasid.c +++ b/drivers/iommu/intel/pasid.c @@ -60,8 +60,11 @@ int intel_pasid_alloc_table(struct device *dev) size = max_pasid >> (PASID_PDE_SHIFT - 3); order = size ? get_order(size) : 0; - dir = iommu_alloc_pages_node_sz(info->iommu->node, GFP_KERNEL, - 1 << (order + PAGE_SHIFT)); + + dir = intel_pasid_try_restore_table(dev, 1 << (order + PAGE_SHIFT + 3)); + if (!dir) + dir = iommu_alloc_pages_node_sz(info->iommu->node, GFP_KERNEL, + 1 << (order + PAGE_SHIFT)); if (!dir) { kfree(pasid_table); return -ENOMEM; diff --git a/drivers/iommu/intel/pasid.h b/drivers/iommu/intel/pasid.h index 48d3bb6b68dea0e5e9e3a2eb2c081b1015b59113..44e673a4ad8f7e45b178a48c37db2cf83e67ab50 100644 --- a/drivers/iommu/intel/pasid.h +++ b/drivers/iommu/intel/pasid.h @@ -301,6 +301,15 @@ static inline void pasid_set_eafe(struct pasid_entry *pe) extern unsigned int intel_pasid_max_id; int intel_pasid_alloc_table(struct device *dev); +#ifdef CONFIG_IOMMU_LIVEUPDATE +void *intel_pasid_try_restore_table(struct device *dev, u64 max_pasid); +#else +static inline void *intel_pasid_try_restore_table(struct device *dev, + u64 max_pasid) +{ + return NULL; +} +#endif void intel_pasid_free_table(struct device *dev); struct pasid_table *intel_pasid_get_table(struct device *dev); int intel_pasid_setup_first_level(struct intel_iommu *iommu, struct device *dev, diff --git a/drivers/iommu/iommu-pages.c b/drivers/iommu/iommu-pages.c index 0f2d8267abc4a4d511d39274a60c81a73899b5c4..c9027540a6ffc66aa0b7a184094d75dc78fc952b 100644 --- 
a/drivers/iommu/iommu-pages.c +++ b/drivers/iommu/iommu-pages.c @@ -6,6 +6,7 @@ #include "iommu-pages.h" #include #include +#include #include #define IOPTDESC_MATCH(pg_elm, elm) \ @@ -28,6 +29,14 @@ static inline size_t ioptdesc_mem_size(struct ioptdesc *desc) return 1UL << (folio_order(ioptdesc_folio(desc)) + PAGE_SHIFT); } +/* Adjust the IOMMU page counters for @folio by @nr_pages; pass a negative delta when freeing. */ +static inline void iommu_folio_update_stats(struct folio *folio, + long nr_pages) +{ + mod_node_page_state(folio_pgdat(folio), NR_IOMMU_PAGES, nr_pages); + lruvec_stat_mod_folio(folio, NR_SECONDARY_PAGETABLE, nr_pages); +} + /** * iommu_alloc_pages_node_sz - Allocate a zeroed page of a given size from * specific NUMA node @@ -80,8 +89,7 @@ void *iommu_alloc_pages_node_sz(int nid, gfp_t gfp, size_t size) * rather large, i.e. multiple gigabytes in size. */ pgcnt = 1UL << order; - mod_node_page_state(folio_pgdat(folio), NR_IOMMU_PAGES, pgcnt); - lruvec_stat_mod_folio(folio, NR_SECONDARY_PAGETABLE, pgcnt); + iommu_folio_update_stats(folio, pgcnt); return folio_address(folio); } @@ -95,8 +103,7 @@ static void __iommu_free_desc(struct ioptdesc *iopt) if (IOMMU_PAGES_USE_DMA_API) WARN_ON_ONCE(iopt->incoherent); - mod_node_page_state(folio_pgdat(folio), NR_IOMMU_PAGES, -pgcnt); - lruvec_stat_mod_folio(folio, NR_SECONDARY_PAGETABLE, -pgcnt); + iommu_folio_update_stats(folio, -pgcnt); folio_put(folio); } @@ -131,6 +138,100 @@ void iommu_put_pages_list(struct iommu_pages_list *list) } EXPORT_SYMBOL_GPL(iommu_put_pages_list); +#if IS_ENABLED(CONFIG_IOMMU_LIVEUPDATE) +/** + * iommu_unpreserve_page - Unpreserve a page that was preserved in KHO + * @virt: Virtual address of a page + */ +void iommu_unpreserve_page(void *virt) +{ + kho_unpreserve_folio(ioptdesc_folio(virt_to_ioptdesc(virt))); +} +EXPORT_SYMBOL_GPL(iommu_unpreserve_page); + +/** + * iommu_preserve_page - Preserve a page during kexec handover + * @virt: Virtual address of the page to preserve + * + * Returns 0 on success, negative error on failure + */ +int iommu_preserve_page(void 
*virt) +{ + return kho_preserve_folio(ioptdesc_folio(virt_to_ioptdesc(virt))); +} +EXPORT_SYMBOL_GPL(iommu_preserve_page); + +/** + * iommu_unpreserve_pages - Unpreserve pages that were preserved in KHO + * @list: List of pages to unpreserve + */ +void iommu_unpreserve_pages(struct iommu_pages_list *list) +{ + struct ioptdesc *iopt; + + list_for_each_entry(iopt, &list->pages, iopt_freelist_elm) + kho_unpreserve_folio(ioptdesc_folio(iopt)); +} +EXPORT_SYMBOL_GPL(iommu_unpreserve_pages); + +/** + * iommu_restore_page - Restore a page that was preserved in KHO + * @phys: Physical address of a page + */ +void iommu_restore_page(u64 phys) +{ + struct ioptdesc *iopt; + struct folio *folio; + unsigned long pgcnt; + unsigned int order; + + folio = kho_restore_folio(phys); + BUG_ON(!folio); + + iopt = folio_ioptdesc(folio); + + /* + * For the restored pages incoherent is set to false as these are not + * mapped using the DMA_API. The remapping of these pages using DMA_API + * is not needed as these are not going to be written to by the new + * kernel. 
/*
 * Stub used when CONFIG_IOMMU_LIVEUPDATE is disabled: nothing was preserved,
 * so there is nothing to undo.
 *
 * The prototype must match the real
 * iommu_unpreserve_pages(struct iommu_pages_list *list), otherwise callers
 * that pass a single argument fail to build when the option is off.
 */
static inline void iommu_unpreserve_pages(struct iommu_pages_list *list)
{
}
+#endif + /** * iommu_pages_list_add - add the page to a iommu_pages_list * @list: List to add the page to diff --git a/drivers/iommu/iommu.c b/drivers/iommu/iommu.c index 5a3da091a3644e9922a8d982df665d1d35e4b9a3..d0eb3c10a35b333119ef1d280a4e83b643a533df 100644 --- a/drivers/iommu/iommu.c +++ b/drivers/iommu/iommu.c @@ -18,6 +18,7 @@ #include #include #include +#include #include #include #include @@ -305,6 +306,24 @@ void iommu_device_unregister(struct iommu_device *iommu) } EXPORT_SYMBOL_GPL(iommu_device_unregister); +static int _iommu_for_each_dev_cb(struct device *dev, void *data) +{ + struct iommu_dev_iter *iter = data; + + if (dev->iommu && dev->iommu->iommu_dev == iter->iommu) + return iter->fn(dev, iter->iommu, iter->arg); + + return 0; +} + +void iommu_for_each_dev(struct iommu_dev_iter *iter) +{ + for (int i = 0; i < ARRAY_SIZE(iommu_buses); i++) + bus_for_each_dev(iommu_buses[i], NULL, iter, + _iommu_for_each_dev_cb); +} +EXPORT_SYMBOL_GPL(iommu_for_each_dev); + #if IS_ENABLED(CONFIG_IOMMUFD_TEST) void iommu_device_unregister_bus(struct iommu_device *iommu, const struct bus_type *bus, @@ -485,6 +504,10 @@ static int iommu_init_device(struct device *dev) goto err_free; } +#ifdef CONFIG_IOMMU_LIVEUPDATE + dev->iommu->device_ser = iommu_get_device_preserved_data(dev); +#endif + iommu_dev = ops->probe_device(dev); if (IS_ERR(iommu_dev)) { ret = PTR_ERR(iommu_dev); @@ -2193,6 +2216,13 @@ static int __iommu_attach_device(struct iommu_domain *domain, ret = domain->ops->attach_dev(domain, dev, old); if (ret) return ret; + +#ifdef CONFIG_IOMMU_LIVEUPDATE + /* The associated state can be unset once restored. 
/*
 * __iommu_group_restore_domain - Restore the preserved domain of a group
 * @group: Group whose first device is checked for restored live update state
 *
 * Must be called with group->mutex held.
 *
 * Looks only at the first device of @group. If that device is a PCI device
 * with restored state, the domain is rebuilt through iommu_restore_domain()
 * and the group is marked as owned by the owner reported by that call.
 *
 * Return: the restored domain, or NULL when the first device is not PCI, has
 * no restored state, or iommu_restore_domain() returned an error pointer (a
 * warning is emitted in that last case).
 *
 * NOTE(review): every failure is reported as NULL, so callers must check the
 * result before using it as a domain.
 */
static struct iommu_domain *__iommu_group_restore_domain(struct iommu_group *group)
{
	struct iommu_device_ser *device_ser;
	struct iommu_domain *domain;
	struct device *dev;
	void *owner;

	lockdep_assert_held(&group->mutex);
	dev = iommu_group_first_dev(group);
	if (!dev_is_pci(dev))
		return NULL;

	device_ser = dev_iommu_restored_state(dev);
	if (!device_ser)
		return NULL;

	domain = iommu_restore_domain(dev, device_ser, &owner);
	if (WARN_ON(IS_ERR(domain)))
		return NULL;

	/*
	 * Ownership of groups with preserved devices is set during boot. These
	 * will be reclaimed later by the entity (iommufd) that preserved them.
	 */
	/* A group that already has an owner here is unexpected. */
	WARN_ON(group->owner);
	group->owner = owner;
	group->owner_cnt = 1;
	return domain;
}
*/ ret = __iommu_group_set_domain_internal( - group, dom, IOMMU_SET_DOMAIN_MUST_SUCCEED); + group, restored_domain, IOMMU_SET_DOMAIN_MUST_SUCCEED); if (WARN_ON(ret)) goto out_free_old; } else { diff --git a/drivers/iommu/iommufd/Makefile b/drivers/iommu/iommufd/Makefile index 71d692c9a8f49d04d6c1b32115483207af895b32..1407fb58ef6d9596ad82482d3ffd61603ed319d4 100644 --- a/drivers/iommu/iommufd/Makefile +++ b/drivers/iommu/iommufd/Makefile @@ -17,3 +17,4 @@ obj-$(CONFIG_IOMMUFD_DRIVER) += iova_bitmap.o iommufd_driver-y := driver.o obj-$(CONFIG_IOMMUFD_DRIVER_CORE) += iommufd_driver.o +iommufd-$(CONFIG_IOMMU_LIVEUPDATE) += liveupdate.o diff --git a/drivers/iommu/iommufd/device.c b/drivers/iommu/iommufd/device.c index 5bacc473f7cddac37bec8fc66962e6dadf120cae..777eb8e47d79c599b7bb6b99db15d58493f8e613 100644 --- a/drivers/iommu/iommufd/device.c +++ b/drivers/iommu/iommufd/device.c @@ -2,6 +2,7 @@ /* Copyright (c) 2021-2022, NVIDIA CORPORATION & AFFILIATES */ #include +#include #include #include #include @@ -601,6 +602,10 @@ int iommufd_hw_pagetable_attach(struct iommufd_hw_pagetable *hwpt, int rc; mutex_lock(&igroup->lock); + if (iommufd_device_is_preserved(idev)) { + rc = -EBUSY; + goto err_unlock; + } attach = xa_cmpxchg(&igroup->pasid_attach, pasid, NULL, XA_ZERO_ENTRY, GFP_KERNEL); @@ -1662,3 +1667,100 @@ int iommufd_get_hw_info(struct iommufd_ucmd *ucmd) iommufd_put_object(ucmd->ictx, &idev->obj); return rc; } + +#ifdef CONFIG_IOMMU_LIVEUPDATE +static bool _iommufd_device_has_pasid_attachments(struct iommufd_device *idev) +{ + struct iommufd_group *igroup = idev->igroup; + unsigned long start = IOMMU_NO_PASID; + + if (xa_find_after(&igroup->pasid_attach, + &start, UINT_MAX, XA_PRESENT)) + return true; + + return false; +} + +int iommufd_device_preserve(struct liveupdate_session *s, + struct iommufd_device *idev, + u64 *iommufd_tokenp, + u64 *preserved_state) +{ + struct iommufd_group *igroup = idev->igroup; + struct iommufd_hwpt_paging *hwpt_paging; + struct 
iommufd_hw_pagetable *hwpt; + struct iommufd_attach *attach; + int ret; + + mutex_lock(&igroup->lock); + if (_iommufd_device_has_pasid_attachments(idev)) { + ret = -EOPNOTSUPP; + goto out; + } + + attach = xa_load(&igroup->pasid_attach, IOMMU_NO_PASID); + if (!attach) { + ret = -ENOENT; + goto out; + } + + hwpt = attach->hwpt; + hwpt_paging = find_hwpt_paging(hwpt); + if (!hwpt_paging || !hwpt_paging->liveupdate_preserved) { + ret = -EINVAL; + goto out; + } + + ret = liveupdate_get_token_outgoing(s, idev->ictx->file, iommufd_tokenp); + if (ret) + goto out; + + ret = iommu_preserve_device(hwpt_paging->common.domain, + idev->dev, + preserved_state); + + if (!ret) + igroup->liveupdate_preserved = true; +out: + mutex_unlock(&igroup->lock); + return ret; +} +EXPORT_SYMBOL_NS_GPL(iommufd_device_preserve, IOMMUFD); + +void iommufd_device_unpreserve(struct liveupdate_session *s, + struct iommufd_device *idev) +{ + struct iommufd_group *igroup = idev->igroup; + struct iommufd_hwpt_paging *hwpt_paging; + struct iommufd_hw_pagetable *hwpt; + struct iommufd_attach *attach; + + mutex_lock(&igroup->lock); + attach = xa_load(&igroup->pasid_attach, IOMMU_NO_PASID); + if (!attach) { + WARN(1, "IOMMU_NO_PASID attachment not found"); + igroup->liveupdate_preserved = false; + goto out; + } + + hwpt = attach->hwpt; + hwpt_paging = find_hwpt_paging(hwpt); + if (!hwpt_paging || !hwpt_paging->liveupdate_preserved) { + WARN(1, "Attached domain is not preserved"); + igroup->liveupdate_preserved = false; + goto out; + } + + iommu_unpreserve_device(hwpt_paging->common.domain, idev->dev); + igroup->liveupdate_preserved = false; +out: + mutex_unlock(&igroup->lock); +} +EXPORT_SYMBOL_NS_GPL(iommufd_device_unpreserve, IOMMUFD); + +bool iommufd_device_is_preserved(struct iommufd_device *idev) +{ + return idev && idev->igroup && idev->igroup->liveupdate_preserved; +} +EXPORT_SYMBOL_NS_GPL(iommufd_device_is_preserved, IOMMUFD); +#endif diff --git a/drivers/iommu/iommufd/io_pagetable.c 
b/drivers/iommu/iommufd/io_pagetable.c index 436992331111c60959841209308753a0645ffdd4..495cfdb73905c8bc43c7bec5daff474744facb42 100644 --- a/drivers/iommu/iommufd/io_pagetable.c +++ b/drivers/iommu/iommufd/io_pagetable.c @@ -384,6 +384,11 @@ int iopt_map_pages(struct io_pagetable *iopt, struct list_head *pages_list, return rc; down_read(&iopt->domains_rwsem); + if (iopt_liveupdate_immutable(iopt)) { + rc = -EBUSY; + goto out_unlock_domains; + } + rc = iopt_fill_domains_pages(pages_list); if (rc) goto out_unlock_domains; @@ -755,6 +760,12 @@ static int iopt_unmap_iova_range(struct io_pagetable *iopt, unsigned long start, again: down_read(&iopt->domains_rwsem); down_write(&iopt->iova_rwsem); + + if (iopt_liveupdate_immutable(iopt)) { + rc = -EBUSY; + goto out_unlock_iova; + } + while ((area = iopt_area_iter_first(iopt, start, last))) { unsigned long area_last = iopt_area_last_iova(area); unsigned long area_first = iopt_area_iova(area); diff --git a/drivers/iommu/iommufd/io_pagetable.h b/drivers/iommu/iommufd/io_pagetable.h index 14cd052fd3204e94684eebe230b97ec72d384ac3..b64cb4cf300c54294edb6078ba9f74f06bb6284c 100644 --- a/drivers/iommu/iommufd/io_pagetable.h +++ b/drivers/iommu/iommufd/io_pagetable.h @@ -234,6 +234,7 @@ struct iopt_pages { struct { /* IOPT_ADDRESS_FILE */ struct file *file; unsigned long start; + u32 seals; }; /* IOPT_ADDRESS_DMABUF */ struct iopt_pages_dmabuf dmabuf; diff --git a/drivers/iommu/iommufd/iommufd_private.h b/drivers/iommu/iommufd/iommufd_private.h index eb6d1a70f6732cd3e0ffd4083dc9393dcf6a7ae5..b0ecd7035af016f8ca90257c491488fde80c1bf5 100644 --- a/drivers/iommu/iommufd/iommufd_private.h +++ b/drivers/iommu/iommufd/iommufd_private.h @@ -45,6 +45,11 @@ struct iommufd_ctx { struct file *file; struct xarray objects; struct xarray groups; +#ifdef CONFIG_IOMMU_LIVEUPDATE +#define IOMMUFD_OBJ_LIVEUPDATE_MARK XA_MARK_1 + /* @liveupdate_mutex: Protects the preservation of HWPTs. 
/*
 * Live update hooks of iommufd. When CONFIG_IOMMU_LIVEUPDATE is disabled the
 * stubs below keep the callers free of #ifdefs.
 */
#ifdef CONFIG_IOMMU_LIVEUPDATE
int iommufd_liveupdate_register(void);
void iommufd_liveupdate_unregister(void);

int iommufd_hwpt_liveupdate_mark_preserve(struct iommufd_ucmd *ucmd);

/*
 * True while the IOAS behind @iopt is frozen for live update. The map and
 * unmap paths check this and return -EBUSY.
 */
static inline bool iopt_liveupdate_immutable(const struct io_pagetable *iopt)
{
	return iopt->liveupdate_immutable;
}
#else
/* Nothing to register; report success so iommufd_init() proceeds. */
static inline int iommufd_liveupdate_register(void)
{
	return 0;
}

static inline void iommufd_liveupdate_unregister(void)
{
}

/* The ioctl is not available without live update support. */
static inline int iommufd_hwpt_liveupdate_mark_preserve(struct iommufd_ucmd *ucmd)
{
	return -ENOTTY;
}

/* Without live update support an IOAS is never frozen. */
static inline bool iopt_liveupdate_immutable(const struct io_pagetable *iopt)
{
	return false;
}
#endif
/*
 * check_iopt_pages_preserved - Verify that every mapping of an IOAS can
 * survive a live update
 * @s: Outgoing live update session
 * @hwpt: Paging HWPT whose IOAS is inspected
 *
 * Walks all areas of the IOAS under the read side of iova_rwsem and stops at
 * the first area that does not qualify.
 *
 * Return: 0 when every area is backed by a suitably sealed file that is part
 * of session @s, -EINVAL for a mapping that is not file based or lacks the
 * required seals, or the error returned by liveupdate_get_token_outgoing().
 */
static int check_iopt_pages_preserved(struct liveupdate_session *s,
				      struct iommufd_hwpt_paging *hwpt)
{
	u32 req_seals = F_SEAL_SEAL | F_SEAL_GROW | F_SEAL_SHRINK;
	struct iopt_area *area;
	int ret = 0;

	down_read(&hwpt->ioas->iopt.iova_rwsem);
	for (area = iopt_area_iter_first(&hwpt->ioas->iopt, 0, ULONG_MAX); area;
	     area = iopt_area_iter_next(area, 0, ULONG_MAX)) {
		struct iopt_pages *pages = area->pages;

		/* Only file-based mappings are allowed. */
		if (pages->type != IOPT_ADDRESS_FILE) {
			ret = -EINVAL;
			break;
		}

		/*
		 * The seals recorded when the memory file was mapped must
		 * forbid growing and shrinking it, and the seal set itself
		 * must be sealed (F_SEAL_SEAL). Together this means the file
		 * cannot have been resized since it was mapped, so the pages
		 * in use until now remain pinned and preserved.
		 */
		if ((pages->seals & req_seals) != req_seals) {
			ret = -EINVAL;
			break;
		}

		/* Make sure that the file was preserved. */
		ret = liveupdate_get_token_outgoing(s, pages->file, NULL);
		if (ret)
			break;
	}
	up_read(&hwpt->ioas->iopt.iova_rwsem);

	return ret;
}
+ */ + ioas_made_immutable = true; + ioas_set_immutable(hwpt->ioas, true); + + rc = check_iopt_pages_preserved(session, hwpt); + if (rc) + goto err; + } + + hwpt_ser->token = hwpt->liveupdate_token; + hwpt_ser->reclaimed = false; + + rc = iommu_domain_preserve(hwpt->common.domain, &domain_ser); + if (rc < 0) + goto err; + + hwpt_ser->domain_data = virt_to_phys(domain_ser); + return 0; + +err: + if (ioas_made_immutable) + ioas_set_immutable(hwpt->ioas, false); + + return rc; +} + +static void _iommufd_unpreserve(struct iommufd_ctx *ictx, + struct iommufd_ser *ser) +{ + struct iommufd_hwpt_paging *hwpt; + struct iommufd_object *obj; + unsigned long index; + + xa_lock(&ictx->objects); + xa_for_each_marked(&ictx->objects, index, obj, IOMMUFD_OBJ_LIVEUPDATE_MARK) { + if (obj->type != IOMMUFD_OBJ_HWPT_PAGING) + continue; + + hwpt = to_hwpt_paging(container_of(obj, struct iommufd_hw_pagetable, obj)); + if (!hwpt->liveupdate_preserved) + continue; + + xa_unlock(&ictx->objects); + + iommu_domain_unpreserve(hwpt->common.domain); + if (hwpt->ioas->iopt.liveupdate_immutable) + ioas_set_immutable(hwpt->ioas, false); + + hwpt->liveupdate_preserved = false; + iommufd_put_object(ictx, obj); + + xa_lock(&ictx->objects); + } + xa_unlock(&ictx->objects); + + kho_unpreserve_free(ser); +} + +static int iommufd_liveupdate_preserve(struct liveupdate_file_op_args *args) +{ + struct iommufd_ctx *ictx = iommufd_ctx_from_file(args->file); + struct iommufd_hwpt_paging *hwpt; + struct iommufd_ser *iommufd_ser; + struct iommufd_object *obj; + unsigned int nr_hwpts; + unsigned long index; + unsigned int i; + void *mem; + int rc; + + if (IS_ERR(ictx)) + return PTR_ERR(ictx); + + mutex_lock(&ictx->liveupdate_mutex); + + /* Count the number of HWPTs to preserve */ + nr_hwpts = 0; + xa_lock(&ictx->objects); + xa_for_each_marked(&ictx->objects, index, obj, IOMMUFD_OBJ_LIVEUPDATE_MARK) { + if (obj->type != IOMMUFD_OBJ_HWPT_PAGING) + continue; + + hwpt = to_hwpt_paging(container_of(obj, struct 
iommufd_hw_pagetable, obj)); + if (!hwpt->common.domain) { + rc = -EINVAL; + xa_unlock(&ictx->objects); + goto out_unlock; + } + nr_hwpts++; + } + xa_unlock(&ictx->objects); + + mem = kho_alloc_preserve(struct_size(iommufd_ser, + hwpt_array, nr_hwpts)); + if (!mem) { + rc = -ENOMEM; + goto out_unlock; + } + + iommufd_ser = mem; + iommufd_ser->nr_hwpts = nr_hwpts; + + /* Preserve HWPTs */ + i = 0; + xa_lock(&ictx->objects); + xa_for_each_marked(&ictx->objects, index, obj, IOMMUFD_OBJ_LIVEUPDATE_MARK) { + if (obj->type != IOMMUFD_OBJ_HWPT_PAGING) + continue; + + if (!iommufd_lock_obj(obj)) { + rc = -ENOENT; + xa_unlock(&ictx->objects); + goto out_unpreserve; + } + + /* + * HWPT is locked so it will not be destroyed. The xarray lock + * can be released here before preserving the HWPT. + */ + xa_unlock(&ictx->objects); + hwpt = to_hwpt_paging(container_of(obj, struct iommufd_hw_pagetable, obj)); + rc = iommufd_preserve_hwpt(hwpt, &iommufd_ser->hwpt_array[i++], args->session); + if (rc) { + iommufd_put_object(ictx, obj); + goto out_unpreserve; + } + + /* Mark as preserved */ + hwpt->liveupdate_preserved = true; + xa_lock(&ictx->objects); + } + xa_unlock(&ictx->objects); + + args->serialized_data = virt_to_phys(iommufd_ser); + mutex_unlock(&ictx->liveupdate_mutex); + iommufd_ctx_put(ictx); + return 0; + +out_unpreserve: + _iommufd_unpreserve(ictx, iommufd_ser); +out_unlock: + mutex_unlock(&ictx->liveupdate_mutex); + iommufd_ctx_put(ictx); + return rc; +} + +static void iommufd_liveupdate_unpreserve(struct liveupdate_file_op_args *args) +{ + struct iommufd_ctx *ictx = iommufd_ctx_from_file(args->file); + + if (WARN_ON(IS_ERR(ictx))) + return; + + mutex_lock(&ictx->liveupdate_mutex); + _iommufd_unpreserve(ictx, phys_to_virt(args->serialized_data)); + mutex_unlock(&ictx->liveupdate_mutex); + + iommufd_ctx_put(ictx); +} + +static int iommufd_liveupdate_retrieve(struct liveupdate_file_op_args *args) +{ + return -EOPNOTSUPP; +} + +static bool 
iommufd_liveupdate_can_finish(struct liveupdate_file_op_args *args) +{ + return false; +} + +static void iommufd_liveupdate_finish(struct liveupdate_file_op_args *args) +{ +} + +static bool iommufd_liveupdate_can_preserve(struct liveupdate_file_handler *handler, + struct file *file) +{ + struct iommufd_ctx *ictx = iommufd_ctx_from_file(file); + + if (IS_ERR(ictx)) + return false; + + iommufd_ctx_put(ictx); + return true; +} + +static struct liveupdate_file_ops iommufd_ser_file_ops = { + .can_preserve = iommufd_liveupdate_can_preserve, + .preserve = iommufd_liveupdate_preserve, + .unpreserve = iommufd_liveupdate_unpreserve, + .retrieve = iommufd_liveupdate_retrieve, + .can_finish = iommufd_liveupdate_can_finish, + .finish = iommufd_liveupdate_finish, +}; + +static struct liveupdate_file_handler iommufd_ser_handler = { + .compatible = IOMMUFD_LUO_COMPATIBLE, + .ops = &iommufd_ser_file_ops, +}; + +int iommufd_liveupdate_register(void) +{ + int ret; + + ret = liveupdate_register_file_handler(&iommufd_ser_handler); + if (ret) + return ret; + + ret = iommu_liveupdate_register_flb(&iommufd_ser_handler); + if (ret) + liveupdate_unregister_file_handler(&iommufd_ser_handler); + + return ret; +} + +void iommufd_liveupdate_unregister(void) +{ + iommu_liveupdate_unregister_flb(&iommufd_ser_handler); + liveupdate_unregister_file_handler(&iommufd_ser_handler); +} diff --git a/drivers/iommu/iommufd/main.c b/drivers/iommu/iommufd/main.c index abeb41680e3f27d3b258fde7e5e1a57356ce8bc5..db52aa2f5a3494e73b0877417c9aae1d463714dd 100644 --- a/drivers/iommu/iommufd/main.c +++ b/drivers/iommu/iommufd/main.c @@ -313,6 +313,9 @@ static int iommufd_fops_open(struct inode *inode, struct file *filp) init_rwsem(&ictx->ioas_creation_lock); xa_init_flags(&ictx->objects, XA_FLAGS_ALLOC1 | XA_FLAGS_ACCOUNT); xa_init(&ictx->groups); +#ifdef CONFIG_IOMMU_LIVEUPDATE + mutex_init(&ictx->liveupdate_mutex); +#endif ictx->file = filp; mt_init_flags(&ictx->mt_mmap, MT_FLAGS_ALLOC_RANGE); 
init_waitqueue_head(&ictx->destroy_wait); @@ -375,6 +378,9 @@ static int iommufd_fops_release(struct inode *inode, struct file *filp) * iommufd_object_tombstone_user() */ xa_destroy(&ictx->objects); +#ifdef CONFIG_IOMMU_LIVEUPDATE + mutex_destroy(&ictx->liveupdate_mutex); +#endif WARN_ON(!xa_empty(&ictx->groups)); @@ -420,6 +426,7 @@ union ucmd_buffer { struct iommu_hwpt_alloc hwpt; struct iommu_hwpt_get_dirty_bitmap get_dirty_bitmap; struct iommu_hwpt_invalidate cache; + struct iommu_hwpt_liveupdate_mark_preserve mark_preserve; struct iommu_hwpt_set_dirty_tracking set_dirty_tracking; struct iommu_ioas_alloc alloc; struct iommu_ioas_allow_iovas allow_iovas; @@ -493,6 +500,8 @@ static const struct iommufd_ioctl_op iommufd_ioctl_ops[] = { __reserved), IOCTL_OP(IOMMU_VIOMMU_ALLOC, iommufd_viommu_alloc_ioctl, struct iommu_viommu_alloc, out_viommu_id), + IOCTL_OP(IOMMU_HWPT_LIVEUPDATE_MARK_PRESERVE, iommufd_hwpt_liveupdate_mark_preserve, + struct iommu_hwpt_liveupdate_mark_preserve, hwpt_token), #ifdef CONFIG_IOMMUFD_TEST IOCTL_OP(IOMMU_TEST_CMD, iommufd_test, struct iommu_test_cmd, last), #endif @@ -773,11 +782,18 @@ static int __init iommufd_init(void) if (ret) goto err_misc; } - ret = iommufd_test_init(); + + ret = iommufd_liveupdate_register(); if (ret) goto err_vfio_misc; + + ret = iommufd_test_init(); + if (ret) + goto err_liveupdate; return 0; +err_liveupdate: + iommufd_liveupdate_unregister(); err_vfio_misc: if (IS_ENABLED(CONFIG_IOMMUFD_VFIO_CONTAINER)) misc_deregister(&vfio_misc_dev); @@ -789,6 +805,7 @@ static int __init iommufd_init(void) static void __exit iommufd_exit(void) { iommufd_test_exit(); + iommufd_liveupdate_unregister(); if (IS_ENABLED(CONFIG_IOMMUFD_VFIO_CONTAINER)) misc_deregister(&vfio_misc_dev); misc_deregister(&iommu_misc_dev); diff --git a/drivers/iommu/iommufd/pages.c b/drivers/iommu/iommufd/pages.c index 6b73fcbf41812bd2a8f4834dcbb65d9d52ee1706..f4c45282648ee7eae4bdf2f1099552c7f33732d5 100644 --- a/drivers/iommu/iommufd/pages.c +++ 
/*
 * iommu_liveupdate_for_each_obj - Iterate over the live objects of a chain of
 * serialized arrays
 * @_arr: Cursor over the arrays; must point to the first array on entry. It
 *        is advanced through hdr.next_array_phys and is NULL once the whole
 *        chain has been walked.
 * @_obj: Cursor over the objects of the current array
 * @_idx: Index of @_obj within the current array
 *
 * Objects whose hdr.deleted is set are skipped.
 *
 * The macro expands to two nested loops, so "break" in the body only leaves
 * the current array; use "goto" or "return" to stop the whole walk.
 *
 * The filter is written as "if (deleted) {} else" so that an "else" following
 * the loop body cannot bind to the macro's internal condition.
 */
#define iommu_liveupdate_for_each_obj(_arr, _obj, _idx)			\
	for (; (_arr);							\
	     (_arr) = (_arr)->hdr.next_array_phys ?			\
		      phys_to_virt((_arr)->hdr.next_array_phys) : NULL)	\
		for ((_idx) = 0, (_obj) = (_arr)->objects;		\
		     (_idx) < (_arr)->hdr.nr_objects; (_idx)++, (_obj)++) \
			if ((_obj)->hdr.deleted) {} else
Not being + * able to restore means that the memory mapped into preserved + * domains might be already corrupted by the preserved devices. + * + * There is no way to confirm the integrity of the memory that + * was mapped. BUG_ON is the safest option at this point. + */ + BUG_ON(!kho_restore_folio(array_phys)); + array_hdr = phys_to_virt(array_phys); + array_phys = array_hdr->next_array_phys; + } + + return vaddr; +} + +static void iommu_liveupdate_unpreserve_free(u64 array_phys) +{ + struct iommu_array_hdr_ser *array_hdr; + + while (array_phys) { + array_hdr = phys_to_virt(array_phys); + array_phys = array_hdr->next_array_phys; + kho_unpreserve_free(array_hdr); + } +} + +static void iommu_liveupdate_folio_put(u64 array_phys) +{ + struct iommu_array_hdr_ser *array_hdr; + + while (array_phys) { + array_hdr = phys_to_virt(array_phys); + array_phys = array_hdr->next_array_phys; + folio_put(virt_to_folio(array_hdr)); + } +} + +static void iommu_liveupdate_flb_free(struct iommu_flb_obj *obj) +{ + if (obj->ser->iommu_domain_array_phys) + iommu_liveupdate_unpreserve_free(obj->ser->iommu_domain_array_phys); + + if (obj->ser->device_array_phys) + iommu_liveupdate_unpreserve_free(obj->ser->device_array_phys); + + if (obj->ser->iommu_array_phys) + iommu_liveupdate_unpreserve_free(obj->ser->iommu_array_phys); + + kho_unpreserve_free(obj->ser); + kfree(obj); +} + +static int iommu_liveupdate_flb_preserve(struct liveupdate_flb_op_args *argp) +{ + struct iommu_flb_obj *obj; + struct iommu_flb_ser *ser; + void *mem; + + /* obj exists only in the current kernel to track preserved state */ + obj = kzalloc(sizeof(*obj), GFP_KERNEL); + if (!obj) + return -ENOMEM; + + mutex_init(&obj->lock); + + /* mem is allocated via KHO and will survive the kexec */ + mem = kho_alloc_preserve(sizeof(*ser)); + if (IS_ERR(mem)) + goto err_free_obj; + + ser = mem; + obj->ser = ser; + + mem = kho_alloc_preserve(PAGE_SIZE); + if (IS_ERR(mem)) + goto err_free_ser; + + obj->curr_domain_array = mem; + 
ser->iommu_domain_array_phys = virt_to_phys(obj->curr_domain_array); + + mem = kho_alloc_preserve(PAGE_SIZE); + if (IS_ERR(mem)) + goto err_free_domains; + + obj->curr_device_array = mem; + ser->device_array_phys = virt_to_phys(obj->curr_device_array); + + mem = kho_alloc_preserve(PAGE_SIZE); + if (IS_ERR(mem)) + goto err_free_devices; + + obj->curr_iommu_array = mem; + ser->iommu_array_phys = virt_to_phys(obj->curr_iommu_array); + + argp->obj = obj; + argp->data = virt_to_phys(ser); + return 0; + +err_free_devices: + kho_unpreserve_free(obj->curr_device_array); +err_free_domains: + kho_unpreserve_free(obj->curr_domain_array); +err_free_ser: + kho_unpreserve_free(obj->ser); +err_free_obj: + kfree(obj); + return PTR_ERR(mem); +} + +static void iommu_liveupdate_flb_unpreserve(struct liveupdate_flb_op_args *argp) +{ + iommu_liveupdate_flb_free(argp->obj); +} + +static void iommu_liveupdate_flb_finish(struct liveupdate_flb_op_args *argp) +{ + struct iommu_flb_obj *obj = argp->obj; + + iommu_liveupdate_folio_put(obj->ser->iommu_domain_array_phys); + iommu_liveupdate_folio_put(obj->ser->device_array_phys); + iommu_liveupdate_folio_put(obj->ser->iommu_array_phys); + + folio_put(virt_to_folio(obj->ser)); + kfree(obj); +} + +static int iommu_liveupdate_flb_retrieve(struct liveupdate_flb_op_args *argp) +{ + struct iommu_flb_obj *obj; + struct iommu_flb_ser *ser; + + obj = kzalloc(sizeof(*obj), GFP_KERNEL); + if (!obj) + return -ENOMEM; + + /* Data must be present and valid from the previous kernel */ + BUG_ON(!kho_restore_folio(argp->data)); + + mutex_init(&obj->lock); + ser = phys_to_virt(argp->data); + obj->ser = ser; + + obj->curr_domain_array = iommu_liveupdate_restore_array(ser->iommu_domain_array_phys); + obj->curr_device_array = iommu_liveupdate_restore_array(ser->device_array_phys); + obj->curr_iommu_array = iommu_liveupdate_restore_array(ser->iommu_array_phys); + argp->obj = obj; + return 0; +} + +static struct liveupdate_flb_ops iommu_flb_ops = { + .preserve = 
iommu_liveupdate_flb_preserve, + .unpreserve = iommu_liveupdate_flb_unpreserve, + .finish = iommu_liveupdate_flb_finish, + .retrieve = iommu_liveupdate_flb_retrieve, +}; + +static struct liveupdate_flb iommu_flb = { + .compatible = IOMMU_LUO_FLB_COMPATIBLE, + .ops = &iommu_flb_ops, +}; + +int iommu_liveupdate_register_flb(struct liveupdate_file_handler *handler) +{ + return liveupdate_register_flb(handler, &iommu_flb); +} +EXPORT_SYMBOL(iommu_liveupdate_register_flb); + +void iommu_liveupdate_unregister_flb(struct liveupdate_file_handler *handler) +{ + liveupdate_unregister_flb(handler, &iommu_flb); +} +EXPORT_SYMBOL(iommu_liveupdate_unregister_flb); + +int iommu_for_each_preserved_device(iommu_preserved_device_iter_fn fn, + void *arg) +{ + struct iommu_flb_obj *flb_obj; + struct iommu_device_array_ser *array; + struct iommu_device_ser *device_ser; + int ret, idx; + + ret = liveupdate_flb_get_incoming(&iommu_flb, (void **)&flb_obj); + if (ret) + return -ENOENT; + + array = phys_to_virt(flb_obj->ser->device_array_phys); + iommu_liveupdate_for_each_obj(array, device_ser, idx) { + ret = fn(device_ser, arg); + if (ret) + goto out; + } + +out: + liveupdate_flb_put_incoming(&iommu_flb); + return ret; +} +EXPORT_SYMBOL(iommu_for_each_preserved_device); + +static inline bool match_device_ser(struct iommu_device_ser *match, + struct pci_dev *pdev) +{ + return match->devid == pci_dev_id(pdev) && match->pci_domain_nr == pci_domain_nr(pdev->bus); +} + +struct iommu_device_ser *iommu_get_device_preserved_data(struct device *dev) +{ + struct iommu_device_ser *device_ser = NULL; + struct iommu_device_array_ser *array; + struct iommu_flb_obj *flb_obj; + int ret, idx; + + if (!dev_is_pci(dev)) + return NULL; + + ret = liveupdate_flb_get_incoming(&iommu_flb, (void **)&flb_obj); + if (ret) + return NULL; + + array = phys_to_virt(flb_obj->ser->device_array_phys); + iommu_liveupdate_for_each_obj(array, device_ser, idx) { + if (match_device_ser(device_ser, to_pci_dev(dev))) { + 
device_ser->hdr.incoming = true; + goto out; + } + } + + device_ser = NULL; +out: + liveupdate_flb_put_incoming(&iommu_flb); + return device_ser; +} +EXPORT_SYMBOL(iommu_get_device_preserved_data); + +struct iommu_hw_ser *iommu_get_preserved_data(u64 token, enum iommu_type_ser type) +{ + struct iommu_hw_ser *iommu_ser = NULL; + struct iommu_hw_array_ser *array; + struct iommu_flb_obj *flb_obj; + int ret, idx; + + ret = liveupdate_flb_get_incoming(&iommu_flb, (void **)&flb_obj); + if (ret) + return NULL; + + array = phys_to_virt(flb_obj->ser->iommu_array_phys); + iommu_liveupdate_for_each_obj(array, iommu_ser, idx) { + if (iommu_ser->token == token && iommu_ser->type == type) + goto out; + } + + iommu_ser = NULL; +out: + liveupdate_flb_put_incoming(&iommu_flb); + return iommu_ser; +} +EXPORT_SYMBOL(iommu_get_preserved_data); + +static int alloc_object_ser(struct iommu_array_hdr_ser **curr_array_ptr, u64 max_objs) +{ + struct iommu_array_hdr_ser *curr_array = *curr_array_ptr; + struct iommu_array_hdr_ser *next_array; + + if (curr_array->nr_objects >= max_objs) { + next_array = kho_alloc_preserve(PAGE_SIZE); + if (IS_ERR(next_array)) + return PTR_ERR(next_array); + + curr_array->next_array_phys = virt_to_phys(next_array); + *curr_array_ptr = next_array; + curr_array = next_array; + } + + return curr_array->nr_objects++; +} + +static struct iommu_domain_ser *alloc_iommu_domain_ser(struct iommu_flb_obj *flb) +{ + int idx; + + idx = alloc_object_ser((struct iommu_array_hdr_ser **)&flb->curr_domain_array, + iommu_max_objs_per_page(flb->curr_domain_array)); + if (idx < 0) + return ERR_PTR(idx); + + flb->curr_domain_array->objects[idx].hdr.ref_count = 1; + return &flb->curr_domain_array->objects[idx]; +} + +int iommu_domain_preserve(struct iommu_domain *domain, struct iommu_domain_ser **ser) +{ + struct iommu_domain_ser *domain_ser; + struct iommu_flb_obj *flb_obj; + int ret; + + if (!domain->ops->preserve) + return -EOPNOTSUPP; + + ret = 
liveupdate_flb_get_outgoing(&iommu_flb, (void **)&flb_obj); + if (ret) + return ret; + + guard(mutex)(&flb_obj->lock); + domain_ser = alloc_iommu_domain_ser(flb_obj); + if (IS_ERR(domain_ser)) + return PTR_ERR(domain_ser); + + ret = domain->ops->preserve(domain, domain_ser); + if (ret) { + domain_ser->hdr.deleted = true; + return ret; + } + + domain->preserved_state = domain_ser; + *ser = domain_ser; + return 0; +} +EXPORT_SYMBOL_GPL(iommu_domain_preserve); + +void iommu_domain_unpreserve(struct iommu_domain *domain) +{ + struct iommu_domain_ser *domain_ser; + struct iommu_flb_obj *flb_obj; + int ret; + + if (!domain->ops->unpreserve) + return; + + ret = liveupdate_flb_get_outgoing(&iommu_flb, (void **)&flb_obj); + if (WARN_ON(ret)) + return; + + guard(mutex)(&flb_obj->lock); + + if (!domain->preserved_state) + return; + + /* + * There is no check for attached devices here. The correctness relies + * on the Live Update Orchestrator's session lifecycle. All resources + * (iommufd, vfio devices) are preserved within a single session. If the + * session is torn down, the .unpreserve callbacks for all files will be + * invoked, ensuring a consistent cleanup without needing explicit + * refcounting for the serialized objects here. 
+ */ + domain_ser = domain->preserved_state; + domain->ops->unpreserve(domain, domain_ser); + domain_ser->hdr.deleted = true; + domain->preserved_state = NULL; +} +EXPORT_SYMBOL_GPL(iommu_domain_unpreserve); + +static struct iommu_hw_ser *alloc_iommu_hw_ser(struct iommu_flb_obj *flb) +{ + int idx; + + idx = alloc_object_ser((struct iommu_array_hdr_ser **)&flb->curr_iommu_array, + iommu_max_objs_per_page(flb->curr_iommu_array)); + if (idx < 0) + return ERR_PTR(idx); + + flb->curr_iommu_array->objects[idx].hdr.ref_count = 1; + return &flb->curr_iommu_array->objects[idx]; +} + +static int iommu_preserve_locked(struct iommu_device *iommu, + struct iommu_flb_obj *flb_obj) +{ + struct iommu_hw_ser *iommu_hw_ser; + int ret; + + if (!iommu->ops->preserve) + return -EOPNOTSUPP; + + lockdep_assert_held(&flb_obj->lock); + if (iommu->outgoing_preserved_state) { + iommu->outgoing_preserved_state->hdr.ref_count++; + return 0; + } + + iommu_hw_ser = alloc_iommu_hw_ser(flb_obj); + if (IS_ERR(iommu_hw_ser)) + return PTR_ERR(iommu_hw_ser); + + ret = iommu->ops->preserve(iommu, iommu_hw_ser); + if (ret) { + iommu_hw_ser->hdr.deleted = true; + return ret; + } + + iommu->outgoing_preserved_state = iommu_hw_ser; + return ret; +} + +static void iommu_unpreserve_locked(struct iommu_device *iommu, + struct iommu_flb_obj *flb_obj) +{ + struct iommu_hw_ser *iommu_hw_ser = iommu->outgoing_preserved_state; + + lockdep_assert_held(&flb_obj->lock); + iommu_hw_ser->hdr.ref_count--; + if (iommu_hw_ser->hdr.ref_count) + return; + + iommu->outgoing_preserved_state = NULL; + iommu->ops->unpreserve(iommu, iommu_hw_ser); + iommu_hw_ser->hdr.deleted = true; +} + +static struct iommu_device_ser *alloc_iommu_device_ser(struct iommu_flb_obj *flb) +{ + int idx; + + idx = alloc_object_ser((struct iommu_array_hdr_ser **)&flb->curr_device_array, + iommu_max_objs_per_page(flb->curr_device_array)); + if (idx < 0) + return ERR_PTR(idx); + + flb->curr_device_array->objects[idx].hdr.ref_count = 1; + return 
&flb->curr_device_array->objects[idx]; +} + +int iommu_preserve_device(struct iommu_domain *domain, + struct device *dev, u64 *preserved_state) +{ + struct iommu_flb_obj *flb_obj; + struct iommu_device_ser *device_ser; + struct dev_iommu *iommu; + struct pci_dev *pdev; + int ret; + + if (!dev_is_pci(dev)) + return -EOPNOTSUPP; + + if (!domain->preserved_state) + return -EINVAL; + + if (!iommu_group_dma_owner_claimed(dev->iommu_group)) + return -EINVAL; + + pdev = to_pci_dev(dev); + iommu = dev->iommu; + if (!iommu->iommu_dev->ops->preserve_device || + !iommu->iommu_dev->ops->preserve) + return -EOPNOTSUPP; + + ret = liveupdate_flb_get_outgoing(&iommu_flb, (void **)&flb_obj); + if (ret) + return ret; + + guard(mutex)(&flb_obj->lock); + device_ser = alloc_iommu_device_ser(flb_obj); + if (IS_ERR(device_ser)) + return PTR_ERR(device_ser); + + ret = iommu_preserve_locked(iommu->iommu_dev, flb_obj); + if (ret) { + device_ser->hdr.deleted = true; + return ret; + } + + device_ser->domain_iommu_ser.domain_phys = __pa(domain->preserved_state); + device_ser->domain_iommu_ser.iommu_phys = __pa(iommu->iommu_dev->outgoing_preserved_state); + device_ser->devid = pci_dev_id(pdev); + device_ser->pci_domain_nr = pci_domain_nr(pdev->bus); + + ret = iommu->iommu_dev->ops->preserve_device(dev, device_ser); + if (ret) { + device_ser->hdr.deleted = true; + iommu_unpreserve_locked(iommu->iommu_dev, flb_obj); + return ret; + } + + dev->iommu->device_ser = device_ser; + *preserved_state = virt_to_phys(device_ser); + return 0; +} +EXPORT_SYMBOL_GPL(iommu_preserve_device); + +void iommu_unpreserve_device(struct iommu_domain *domain, struct device *dev) +{ + struct iommu_flb_obj *flb_obj; + struct iommu_device_ser *iommu_device_ser; + struct dev_iommu *iommu; + struct pci_dev *pdev; + int ret; + + if (!dev_is_pci(dev)) + return; + + if (!iommu_group_dma_owner_claimed(dev->iommu_group)) + return; + + pdev = to_pci_dev(dev); + iommu = dev->iommu; + if (!iommu->iommu_dev->ops->unpreserve_device 
|| + !iommu->iommu_dev->ops->unpreserve) + return; + + ret = liveupdate_flb_get_outgoing(&iommu_flb, (void **)&flb_obj); + if (WARN_ON(ret)) + return; + + guard(mutex)(&flb_obj->lock); + iommu_device_ser = dev_iommu_preserved_state(dev); + if (WARN_ON(!iommu_device_ser)) + return; + + iommu->iommu_dev->ops->unpreserve_device(dev, iommu_device_ser); + /* + * Mark the serialized slot deleted, as the domain and IOMMU unpreserve + * paths do, so the entry is not handed over as a live preserved device. + */ + iommu_device_ser->hdr.deleted = true; + dev->iommu->device_ser = NULL; + + iommu_unpreserve_locked(iommu->iommu_dev, flb_obj); +} +EXPORT_SYMBOL_GPL(iommu_unpreserve_device); + +struct iommu_domain *iommu_restore_domain(struct device *dev, + struct iommu_device_ser *ser, + void **owner) +{ + struct iommu_domain_ser *domain_ser; + struct iommu_flb_obj *flb_obj; + struct iommu_domain *domain; + int ret; + + domain_ser = phys_to_virt(ser->domain_iommu_ser.domain_phys); + + ret = liveupdate_flb_get_incoming(&iommu_flb, (void **)&flb_obj); + if (ret) + return ERR_PTR(ret); + + guard(mutex)(&flb_obj->lock); + if (domain_ser->restored_domain) { + domain = domain_ser->restored_domain; + goto out; + } + + domain_ser->hdr.incoming = true; + domain = iommu_paging_domain_alloc(dev); + if (IS_ERR(domain)) + goto out; + + ret = domain->ops->restore(domain, domain_ser); + if (ret) { + iommu_domain_free(domain); + domain = ERR_PTR(ret); + goto out; + } + + /* The device is owned by the preserved state. 
*/ + *owner = ser; + domain->preserved_state = domain_ser; + domain_ser->restored_domain = domain; + +out: + liveupdate_flb_put_incoming(&iommu_flb); + return domain; +} diff --git a/drivers/vfio/device_cdev.c b/drivers/vfio/device_cdev.c index e58739749b58faad8e4d17c386dc6726ad3bc3cd..b271b59bb3f56213cb91bbb476403fb967484610 100644 --- a/drivers/vfio/device_cdev.c +++ b/drivers/vfio/device_cdev.c @@ -221,6 +221,11 @@ int vfio_df_ioctl_attach_pt(struct vfio_device_file *df, return -EINVAL; mutex_lock(&device->dev_set->lock); + if (iommufd_device_is_preserved(device->iommufd_device)) { + ret = -EBUSY; + goto out_unlock; + } + ret = device->ops->attach_ioas(device, &attach.pt_id); if (ret) goto out_unlock; @@ -256,6 +261,11 @@ int vfio_df_ioctl_detach_pt(struct vfio_device_file *df, return -EINVAL; mutex_lock(&device->dev_set->lock); + if (iommufd_device_is_preserved(device->iommufd_device)) { + mutex_unlock(&device->dev_set->lock); + return -EBUSY; + } + device->ops->detach_ioas(device); mutex_unlock(&device->dev_set->lock); diff --git a/drivers/vfio/pci/vfio_pci.c b/drivers/vfio/pci/vfio_pci.c index ac2f34866fe1db9973b35ac00b204a09971f2927..34984d440e1d46ff99af6cbe64fc23608d21343b 100644 --- a/drivers/vfio/pci/vfio_pci.c +++ b/drivers/vfio/pci/vfio_pci.c @@ -295,3 +295,4 @@ module_exit(vfio_pci_cleanup); MODULE_LICENSE("GPL v2"); MODULE_AUTHOR(DRIVER_AUTHOR); MODULE_DESCRIPTION(DRIVER_DESC); +MODULE_IMPORT_NS(IOMMUFD); diff --git a/drivers/vfio/pci/vfio_pci_liveupdate.c b/drivers/vfio/pci/vfio_pci_liveupdate.c index 150996f93927fa9735280e0ae230c98a7b6b6a56..5ea4bd23d0f4ddf264c53169161c5db18d978e15 100644 --- a/drivers/vfio/pci/vfio_pci_liveupdate.c +++ b/drivers/vfio/pci/vfio_pci_liveupdate.c @@ -108,10 +108,13 @@ #include #include #include +#include #include #include "vfio_pci_priv.h" +MODULE_IMPORT_NS(IOMMUFD); /* unquoted namespace token, same form as in vfio_pci.c */ + static bool vfio_pci_liveupdate_can_preserve(struct liveupdate_file_handler *handler, struct file *file) { @@ -153,9 +156,26 @@ static int
vfio_pci_liveupdate_preserve(struct liveupdate_file_op_args *args) vdev = container_of(device, struct vfio_pci_core_device, vdev); pdev = vdev->pdev; +#ifdef CONFIG_IOMMU_LIVEUPDATE + /* If iommufd is attached, preserve the underlying domain */ + mutex_lock(&device->dev_set->lock); + if (device->iommufd_attached) { + u64 token, preserved_state; + + ret = iommufd_device_preserve(args->session, + device->iommufd_device, + &token, &preserved_state); + if (ret) { + mutex_unlock(&device->dev_set->lock); + return ret; + } + } + mutex_unlock(&device->dev_set->lock); +#endif + ret = pci_liveupdate_preserve(pdev); if (ret) - return ret; + goto err_iommufd_unpreserve; ser = kho_alloc_preserve(sizeof(*ser)); if (IS_ERR(ser)) { @@ -170,6 +190,16 @@ static int vfio_pci_liveupdate_preserve(struct liveupdate_file_op_args *args) args->serialized_data = virt_to_phys(ser); return 0; err_unpreserve: pci_liveupdate_unpreserve(pdev); +err_iommufd_unpreserve: + /* + * Entered directly when pci_liveupdate_preserve() fails, so only the + * iommufd preservation is undone here (mirrors the unpreserve callback). + */ + mutex_lock(&device->dev_set->lock); + if (device->iommufd_attached) + iommufd_device_unpreserve(args->session, + device->iommufd_device); + mutex_unlock(&device->dev_set->lock); return ret; @@ -178,6 +208,14 @@ static int vfio_pci_liveupdate_preserve(struct liveupdate_file_op_args *args) static void vfio_pci_liveupdate_unpreserve(struct liveupdate_file_op_args *args) { struct vfio_device *device = vfio_device_from_file(args->file); + struct vfio_pci_core_device_ser *ser; + + ser = phys_to_virt(args->serialized_data); + mutex_lock(&device->dev_set->lock); + if (device->iommufd_attached) + iommufd_device_unpreserve(args->session, + device->iommufd_device); + mutex_unlock(&device->dev_set->lock); pci_liveupdate_unpreserve(to_pci_dev(device->dev)); kho_unpreserve_free(phys_to_virt(args->serialized_data)); diff --git a/include/linux/generic_pt/iommu.h b/include/linux/generic_pt/iommu.h index 9eefbb74efd087815c8d2e4d34f0b9c7a33de68d..a5d478ca9a1177ecb54ad5fad1c2e5a968a88505 100644 --- a/include/linux/generic_pt/iommu.h +++ b/include/linux/generic_pt/iommu.h @@ -13,6 +13,7 @@ struct iommu_iotlb_gather; struct pt_iommu_ops; struct
pt_iommu_driver_ops; struct iommu_dirty_bitmap; +struct iommu_domain_ser; /** * DOC: IOMMU Radix Page Table @@ -202,6 +203,12 @@ struct pt_iommu_cfg { struct iommu_domain *domain, unsigned long iova, \ size_t pgsize, size_t pgcount, \ struct iommu_iotlb_gather *iotlb_gather); \ + int pt_iommu_##fmt##_preserve(struct iommu_domain *domain, \ + struct iommu_domain_ser *ser); \ + void pt_iommu_##fmt##_unpreserve(struct iommu_domain *domain, \ + struct iommu_domain_ser *ser); \ + int pt_iommu_##fmt##_restore(struct iommu_domain *domain, \ + struct iommu_domain_ser *ser); \ int pt_iommu_##fmt##_read_and_clear_dirty( \ struct iommu_domain *domain, unsigned long iova, size_t size, \ unsigned long flags, struct iommu_dirty_bitmap *dirty); \ @@ -217,6 +224,15 @@ struct pt_iommu_cfg { }; \ IOMMU_PROTOTYPES(fmt) +#ifdef CONFIG_IOMMU_LIVEUPDATE +#define IOMMU_PT_LIVEUPDATE_OPS(fmt) \ + , .preserve = &pt_iommu_##fmt##_preserve, \ + .unpreserve = &pt_iommu_##fmt##_unpreserve, \ + .restore = &pt_iommu_##fmt##_restore +#else +#define IOMMU_PT_LIVEUPDATE_OPS(fmt) +#endif + /* * A driver uses IOMMU_PT_DOMAIN_OPS to populate the iommu_domain_ops for the * iommu_pt @@ -224,7 +240,8 @@ struct pt_iommu_cfg { #define IOMMU_PT_DOMAIN_OPS(fmt) \ .iova_to_phys = &pt_iommu_##fmt##_iova_to_phys, \ .map_pages = &pt_iommu_##fmt##_map_pages, \ - .unmap_pages = &pt_iommu_##fmt##_unmap_pages + .unmap_pages = &pt_iommu_##fmt##_unmap_pages \ + IOMMU_PT_LIVEUPDATE_OPS(fmt) #define IOMMU_PT_DIRTY_OPS(fmt) \ .read_and_clear_dirty = &pt_iommu_##fmt##_read_and_clear_dirty diff --git a/include/linux/iommu-liveupdate.h b/include/linux/iommu-liveupdate.h new file mode 100644 index 0000000000000000000000000000000000000000..75d27256c883810edab87c4bfd23a6188d392787 --- /dev/null +++ b/include/linux/iommu-liveupdate.h @@ -0,0 +1,156 @@ +/* SPDX-License-Identifier: GPL-2.0 */ + +/* + * Copyright (C) 2026, Google LLC + * Author: Samiullah Khawaja + */ + +#ifndef _LINUX_IOMMU_LIVEUPDATE_H +#define 
_LINUX_IOMMU_LIVEUPDATE_H + +#include +#include +#include +#include + +typedef int (*iommu_preserved_device_iter_fn)(struct iommu_device_ser *ser, + void *arg); +#ifdef CONFIG_IOMMU_LIVEUPDATE +static inline void *dev_iommu_preserved_state(struct device *dev) +{ + struct iommu_device_ser *ser; + + if (!dev->iommu) + return NULL; + + ser = dev->iommu->device_ser; + if (ser && !ser->hdr.incoming) + return ser; + + return NULL; +} + +static inline void *dev_iommu_restored_state(struct device *dev) +{ + struct iommu_device_ser *ser; + + if (!dev->iommu) + return NULL; + + ser = dev->iommu->device_ser; + if (ser && ser->hdr.incoming) + return ser; + + return NULL; +} + +static inline void *iommu_domain_restored_state(struct iommu_domain *domain) +{ + struct iommu_domain_ser *ser; + + ser = domain->preserved_state; + if (ser && ser->hdr.incoming) + return ser; + + return NULL; +} + +static inline int dev_iommu_restore_did(struct device *dev, struct iommu_domain *domain) +{ + struct iommu_device_ser *ser = dev_iommu_restored_state(dev); + + if (ser && iommu_domain_restored_state(domain)) + return ser->domain_iommu_ser.attachment_id; + + return -1; +} + +struct iommu_domain *iommu_restore_domain(struct device *dev, + struct iommu_device_ser *ser, + void **owner); +int iommu_for_each_preserved_device(iommu_preserved_device_iter_fn fn, + void *arg); +struct iommu_device_ser *iommu_get_device_preserved_data(struct device *dev); +struct iommu_hw_ser *iommu_get_preserved_data(u64 token, enum iommu_type_ser type); +int iommu_domain_preserve(struct iommu_domain *domain, struct iommu_domain_ser **ser); +void iommu_domain_unpreserve(struct iommu_domain *domain); +int iommu_preserve_device(struct iommu_domain *domain, + struct device *dev, u64 *preserved_state); +void iommu_unpreserve_device(struct iommu_domain *domain, struct device *dev); + +static inline void *iommu_preserved_state(struct iommu_device *iommu) +{ + return iommu->outgoing_preserved_state; +} +#else +static inline 
void *dev_iommu_preserved_state(struct device *dev) +{ + return NULL; +} + +static inline void *dev_iommu_restored_state(struct device *dev) +{ + return NULL; +} + +static inline int dev_iommu_restore_did(struct device *dev, struct iommu_domain *domain) +{ + return -1; +} + +static inline void *iommu_domain_restored_state(struct iommu_domain *domain) +{ + return NULL; +} + +static inline struct iommu_domain *iommu_restore_domain(struct device *dev, + struct iommu_device_ser *ser, + void **owner) +{ + return NULL; +} + +static inline int iommu_for_each_preserved_device(iommu_preserved_device_iter_fn fn, void *arg) +{ + return -EOPNOTSUPP; +} + +static inline struct iommu_device_ser *iommu_get_device_preserved_data(struct device *dev) +{ + return NULL; +} + +static inline struct iommu_hw_ser *iommu_get_preserved_data(u64 token, enum iommu_type_ser type) +{ + return NULL; +} + +static inline int iommu_domain_preserve(struct iommu_domain *domain, struct iommu_domain_ser **ser) +{ + return -EOPNOTSUPP; +} + +static inline void iommu_domain_unpreserve(struct iommu_domain *domain) +{ +} + +static inline int iommu_preserve_device(struct iommu_domain *domain, + struct device *dev, u64 *preserved_state) +{ + return -EOPNOTSUPP; +} + +static inline void iommu_unpreserve_device(struct iommu_domain *domain, struct device *dev) +{ +} + +static inline void *iommu_preserved_state(struct iommu_device *iommu) +{ + return NULL; +} +#endif + +int iommu_liveupdate_register_flb(struct liveupdate_file_handler *handler); +void iommu_liveupdate_unregister_flb(struct liveupdate_file_handler *handler); + +#endif /* _LINUX_IOMMU_LIVEUPDATE_H */ diff --git a/include/linux/iommu.h b/include/linux/iommu.h index 6eb4967a2c59318febdf4274392f38816cd16774..1d3700a5ed5b9906de56a72ba4d399db8cc0a45c 100644 --- a/include/linux/iommu.h +++ b/include/linux/iommu.h @@ -14,6 +14,7 @@ #include #include #include +#include #include #define IOMMU_READ (1 << 0) @@ -248,7 +249,9 @@ struct iommu_domain { struct 
list_head next; }; }; - +#ifdef CONFIG_IOMMU_LIVEUPDATE + struct iommu_domain_ser *preserved_state; +#endif CK_KABI_RESERVE(1) CK_KABI_RESERVE(2) CK_KABI_RESERVE(3) @@ -655,6 +658,10 @@ __iommu_copy_struct_to_user(const struct iommu_user_data *dst_data, * resources shared/passed to user space IOMMU instance. Associate * it with a nesting @parent_domain. It is required for driver to * set @viommu->ops pointing to its own viommu_ops + * @preserve_device: Preserve state of a device for liveupdate. + * @unpreserve_device: Unpreserve state that was preserved earlier. + * @preserve: Preserve state of iommu translation hardware for liveupdate. + * @unpreserve: Unpreserve state of iommu that was preserved earlier. * @pgsize_bitmap: bitmap of all possible supported page sizes * @owner: Driver module providing these ops * @identity_domain: An always available, always attachable identity @@ -720,7 +727,12 @@ struct iommu_ops { struct iommu_domain *release_domain; struct iommu_domain *default_domain; u8 user_pasid_table:1; - +#ifdef CONFIG_IOMMU_LIVEUPDATE + int (*preserve_device)(struct device *dev, struct iommu_device_ser *device_ser); + void (*unpreserve_device)(struct device *dev, struct iommu_device_ser *device_ser); + int (*preserve)(struct iommu_device *iommu, struct iommu_hw_ser *iommu_ser); + void (*unpreserve)(struct iommu_device *iommu, struct iommu_hw_ser *iommu_ser); +#endif CK_KABI_RESERVE(1) CK_KABI_RESERVE(2) CK_KABI_RESERVE(3) @@ -768,6 +780,11 @@ struct iommu_ops { * specific mechanisms. * @set_pgtable_quirks: Set io page table quirks (IO_PGTABLE_QUIRK_*) * @free: Release the domain after use. + * @preserve: Preserve the iommu domain for liveupdate. + * Returns 0 on success, a negative errno on failure. + * @unpreserve: Unpreserve the iommu domain that was preserved earlier. + * @restore: Restore the iommu domain after liveupdate. + * Returns 0 on success, a negative errno on failure. 
*/ struct iommu_domain_ops { int (*attach_dev)(struct iommu_domain *domain, struct device *dev, @@ -798,6 +815,9 @@ struct iommu_domain_ops { unsigned long quirks); void (*free)(struct iommu_domain *domain); + int (*preserve)(struct iommu_domain *domain, struct iommu_domain_ser *ser); + void (*unpreserve)(struct iommu_domain *domain, struct iommu_domain_ser *ser); + int (*restore)(struct iommu_domain *domain, struct iommu_domain_ser *ser); }; /** @@ -809,6 +829,8 @@ struct iommu_domain_ops { * @singleton_group: Used internally for drivers that have only one group * @max_pasids: number of supported PASIDs * @ready: set once iommu_device_register() has completed successfully + * @outgoing_preserved_state: preserved iommu state of outgoing kernel for + * liveupdate. */ struct iommu_device { struct list_head list; @@ -818,6 +840,10 @@ struct iommu_device { struct iommu_group *singleton_group; u32 max_pasids; bool ready; + +#ifdef CONFIG_IOMMU_LIVEUPDATE + struct iommu_hw_ser *outgoing_preserved_state; +#endif }; /** @@ -872,6 +898,9 @@ struct dev_iommu { u32 pci_32bit_workaround:1; u32 require_direct:1; u32 shadow_on_flush:1; +#ifdef CONFIG_IOMMU_LIVEUPDATE + struct iommu_device_ser *device_ser; +#endif }; int iommu_device_register(struct iommu_device *iommu, @@ -1202,6 +1231,20 @@ static inline void *dev_iommu_priv_get(struct device *dev) void dev_iommu_priv_set(struct device *dev, void *priv); +typedef int (*iommu_dev_iter_fn)(struct device *dev, + struct iommu_device *iommu, void *arg); + +/** + * struct iommu_dev_iter - Iterator for devices attached to an IOMMU + */ +struct iommu_dev_iter { + struct iommu_device *iommu; + iommu_dev_iter_fn fn; + void *arg; +}; + +void iommu_for_each_dev(struct iommu_dev_iter *iter); + extern struct mutex iommu_probe_device_lock; int iommu_probe_device(struct device *dev); diff --git a/include/linux/iommufd.h b/include/linux/iommufd.h index 6e7efe83bc5d83e8d4f5cede9a680860bc5fed9b..d1fd5d71e0fdb582feba42f5362d884de2bf47ae 100644 --- 
a/include/linux/iommufd.h +++ b/include/linux/iommufd.h @@ -9,6 +9,7 @@ #include #include #include +#include #include #include #include @@ -71,6 +72,34 @@ void iommufd_device_detach(struct iommufd_device *idev, ioasid_t pasid); struct iommufd_ctx *iommufd_device_to_ictx(struct iommufd_device *idev); u32 iommufd_device_to_id(struct iommufd_device *idev); +#ifdef CONFIG_IOMMU_LIVEUPDATE +int iommufd_device_preserve(struct liveupdate_session *s, + struct iommufd_device *idev, + u64 *iommufd_tokenp, + u64 *preserved_state); +void iommufd_device_unpreserve(struct liveupdate_session *s, + struct iommufd_device *idev); +bool iommufd_device_is_preserved(struct iommufd_device *idev); +#else +static inline int iommufd_device_preserve(struct liveupdate_session *s, + struct iommufd_device *idev, + u64 *iommufd_tokenp, + u64 *preserved_state) +{ + return -EOPNOTSUPP; +} + +static inline void iommufd_device_unpreserve(struct liveupdate_session *s, + struct iommufd_device *idev) +{ +} + +static inline bool iommufd_device_is_preserved(struct iommufd_device *idev) +{ + return false; +} +#endif + struct iommufd_access_ops { u8 needs_pin_pages : 1; void (*unmap)(void *data, unsigned long iova, unsigned long length); diff --git a/include/linux/kho/abi/iommu.h b/include/linux/kho/abi/iommu.h new file mode 100644 index 0000000000000000000000000000000000000000..66314551396d7f8c55b0db5002e29edb0b6e175f --- /dev/null +++ b/include/linux/kho/abi/iommu.h @@ -0,0 +1,249 @@ +/* SPDX-License-Identifier: GPL-2.0 */ + +/* + * Copyright (C) 2026, Google LLC + * Author: Samiullah Khawaja + */ + +#ifndef _LINUX_KHO_ABI_IOMMU_H +#define _LINUX_KHO_ABI_IOMMU_H + +#include +#include +#include + +/** + * DOC: IOMMU File-Lifecycle Bound (FLB) Live Update ABI + * + * This header defines the ABI for preserving IOMMU state across kexec using + * Live Update File-Lifecycle Bound (FLB) data. + * + * This interface is a contract. 
Any modification to any of the serialization + * structs defined here constitutes a breaking change. Such changes require + * incrementing the version number in the IOMMU_LUO_FLB_COMPATIBLE string. + * + * Memory Layout of Serialization Structures: + * ========================================== + * + * Each serialized type (IOMMU, Domain, Device) is stored in a linked list of + * arrays. The first array is allocated initially. When an array is full, a new + * array is allocated and its physical address is stored in the next_array_phys + * field of the hdr of the current array. + * + * Top Level (struct iommu_flb_ser): + * +---------------------------+ + * | - iommu_array_phys | + * | - iommu_domain_array_phys | + * | - device_array_phys | + * +---------------------------+ + * + * Each Array contains the serialized objects of the respective type. For + * example see below the representation of struct iommu_domain_array_ser. + * + * +---------------------------+ +---------------------------+ + * | iommu_domain_array_ser |-->| iommu_domain_array_ser |--> NULL + * | - hdr.next_array_phys | | - hdr.next_array_phys | + * | - hdr.nr_objects | | - hdr.nr_objects | + * | | | | + * | objects[]: | | objects[]: | + * | [ iommu_domain_ser ] | | [ iommu_domain_ser ] | + * | [ iommu_domain_ser ] | | [ iommu_domain_ser ] | + * | ... | | ... | + * +---------------------------+ +---------------------------+ + * + * Each object in the array starts with a common header (iommu_hdr_ser). + * For example, the layout of struct iommu_domain_ser is: + * + * +-----------------------------+ + * | iommu_domain_ser | + * | +-------------------------+ | + * | | hdr (iommu_hdr_ser) | | + * | | - ref_count | | + * | | - deleted / incoming | | + * | +-------------------------+ | + * | - top_table_phys | | + * | - top_level | | + * | - restored_domain | | + * +-----------------------------+ + * + * This pattern applies identically to iommu_device_ser and iommu_hw_ser. 
+ */ + +#define IOMMU_LUO_FLB_COMPATIBLE "iommu-liveupdate-v1" + +enum iommu_type_ser { + IOMMU_INVALID, + IOMMU_INTEL, +}; + +/** + * struct iommu_hdr_ser - Common header for all serialized IOMMU objects + * @ref_count: Reference count for the object + * @deleted: Flag indicating if the object is deleted + * @incoming: Flag indicating if the object was preserved in previous kernel + */ +struct iommu_hdr_ser { + u32 ref_count; + u32 deleted:1; + u32 incoming:1; +} __packed; + +/** + * struct iommu_domain_ser - Serialized state of an IOMMU domain + * @hdr: Common object header + * @top_table_phys: Physical address of the top-level page table + * @top_level: Level of the top-level page table + * @vasz: Virtual Address Size + * @sign_extend: FEAT_SIGN_EXTEND is enabled for this domain + * @restored_domain: Pointer to the restored domain (valid only after restore) + */ +struct iommu_domain_ser { + struct iommu_hdr_ser hdr; + u64 top_table_phys; + u64 top_level; + u32 vasz; + u32 sign_extend:1; + struct iommu_domain *restored_domain; +} __packed; + +/** + * struct iommu_dev_map_ser - Serialized mapping between device, domain, + * and IOMMU instance. + * @attachment_id: ID of the attachment between device and domain. 
+ * @domain_phys: Physical address of the domain + * @iommu_phys: Physical address of the IOMMU + */ +struct iommu_dev_map_ser { + u64 attachment_id; + u64 domain_phys; + u64 iommu_phys; +} __packed; + +/** + * struct iommu_device_intel_ser - Intel specific state of serialized device + * @pasid_table: Physical address of pasid table + * @max_pasid: Maximum supported pasid + */ +struct iommu_device_intel_ser { + u64 pasid_table; + u64 max_pasid; +} __packed; + +/** + * struct iommu_device_ser - Serialized state of a device + * @hdr: Common object header + * @devid: Device ID + * @pci_domain_nr: PCI domain number + * @domain_iommu_ser: Domain and IOMMU mapping + */ +struct iommu_device_ser { + struct iommu_hdr_ser hdr; + u32 devid; + u32 pci_domain_nr; + struct iommu_dev_map_ser domain_iommu_ser; + union { + struct iommu_device_intel_ser intel; + }; +} __packed; + +/** + * struct iommu_intel_ser - Serialized state of an Intel IOMMU instance + * @restored: Whether IOMMU state is restored + * @phys_addr: Physical address of the IOMMU register base + * @root_table: Physical address of the root entry table + */ +struct iommu_intel_ser { + u8 restored; + u8 padding[7]; + u64 phys_addr; + u64 root_table; +}; + +/** + * struct iommu_hw_ser - Serialized state of an IOMMU instance + * @hdr: Common object header + * @token: Unique token for the IOMMU + * @type: IOMMU type serialized state belongs to + * @intel: Intel specific serialization data + */ +struct iommu_hw_ser { + struct iommu_hdr_ser hdr; + u64 token; + u64 type; + union { + struct iommu_intel_ser intel; + }; +} __packed; + +/** + * struct iommu_array_hdr_ser - Header for an array of serialized objects + * @next_array_phys: Physical address of the next array of objects + * @nr_objects: Number of objects in the current array + */ +struct iommu_array_hdr_ser { + u64 next_array_phys; + u64 nr_objects; +} __packed; + +/** + * struct iommu_hw_array_ser - An array containing serialized IOMMU HWs + * @hdr: Array header + * 
@objects: Array of serialized IOMMU HWs + */ +struct iommu_hw_array_ser { + struct iommu_array_hdr_ser hdr; + struct iommu_hw_ser objects[]; +} __packed; + +/** + * struct iommu_domain_array_ser - An array containing serialized domains + * @hdr: Array header + * @objects: Array of serialized domains + */ +struct iommu_domain_array_ser { + struct iommu_array_hdr_ser hdr; + struct iommu_domain_ser objects[]; +} __packed; + +/** + * struct iommu_device_array_ser - An array containing serialized devices + * @hdr: Array header + * @objects: Array of serialized devices + */ +struct iommu_device_array_ser { + struct iommu_array_hdr_ser hdr; + struct iommu_device_ser objects[]; +} __packed; + +/** + * struct iommu_flb_ser - Top-level serialization structure + * @iommu_array_phys: Physical address of the first array of IOMMU HWs + * @iommu_domain_array_phys: Physical address of the first array of domains + * @device_array_phys: Physical address of the first array of devices + */ +struct iommu_flb_ser { + u64 iommu_array_phys; + u64 iommu_domain_array_phys; + u64 device_array_phys; +} __packed; + +/** + * struct iommu_flb_obj - FLB object allocated in current kernel pointing to + * preserved state in FLB; in-kernel only, so it is not packed + * @lock: Mutex protecting the object + * @ser: Pointer to the serialized state in FLB + * @curr_iommu_array: Pointer to the current array of IOMMU instances + * @curr_domain_array: Pointer to the current array of domains + * @curr_device_array: Pointer to the current array of devices + */ +struct iommu_flb_obj { + /* @lock: Protects the serialized objects during concurrent preservation */ + struct mutex lock; + struct iommu_flb_ser *ser; + + struct iommu_hw_array_ser *curr_iommu_array; + struct iommu_domain_array_ser *curr_domain_array; + struct iommu_device_array_ser *curr_device_array; +}; + +#endif /* _LINUX_KHO_ABI_IOMMU_H */ diff --git a/include/linux/kho/abi/iommufd.h b/include/linux/kho/abi/iommufd.h new file mode 100644 index 
0000000000000000000000000000000000000000..557952123ba404e6155f1bd0c9a06f39ab46eb3a --- /dev/null +++ b/include/linux/kho/abi/iommufd.h @@ -0,0 +1,51 @@ +/* SPDX-License-Identifier: GPL-2.0 */ + +/* + * Copyright (C) 2026, Google LLC + * Author: Samiullah Khawaja + */ + +#ifndef _LINUX_KHO_ABI_IOMMUFD_H +#define _LINUX_KHO_ABI_IOMMUFD_H + +#include +#include +#include + +/** + * DOC: IOMMUFD Live Update ABI + * + * This header defines the ABI for preserving the state of an IOMMUFD file + * across a kexec reboot using LUO. + * + * This interface is a contract. Any modification to any of the serialization + * structs defined here constitutes a breaking change. Such changes require + * incrementing the version number in the IOMMUFD_LUO_COMPATIBLE string. + */ + +#define IOMMUFD_LUO_COMPATIBLE "iommufd-v1" + +/** + * struct iommufd_hwpt_ser - IOMMUFD HWPT serialized state + * @domain_data: Physical address of the serialized state of the associated domain + * @token: User provided token + * @reclaimed: Whether the HWPT is reclaimed (followed by explicit padding) + */ +struct iommufd_hwpt_ser { + u64 domain_data; + u64 token; + u8 reclaimed; + u8 padding[7]; +} __packed; + +/** + * struct iommufd_ser - IOMMUFD serialized state + * @nr_hwpts: Number of preserved HWPTs + * @hwpt_array: Array of serialized state of preserved HWPTs + */ +struct iommufd_ser { + u64 nr_hwpts; + struct iommufd_hwpt_ser hwpt_array[]; +} __packed; + +#endif /* _LINUX_KHO_ABI_IOMMUFD_H */ diff --git a/include/linux/liveupdate.h b/include/linux/liveupdate.h index 30c5a39ff9e9c29936cecf664e4f985b24c6eb21..cdf42e9b963a65e9bf192be96e6fbffa004b05e3 100644 --- a/include/linux/liveupdate.h +++ b/include/linux/liveupdate.h @@ -12,6 +12,7 @@ #include #include #include +#include #include #include #include @@ -24,6 +25,7 @@ struct file; /** * struct liveupdate_file_op_args - Arguments for file operation callbacks. * @handler: The file handler being called. + * @session: The session this file belongs to. 
* @retrieve_status: The retrieve status for the 'can_finish / finish' * operation. A value of 0 means the retrieve has not been * attempted, a positive value means the retrieve was @@ -44,6 +46,7 @@ struct file; */ struct liveupdate_file_op_args { struct liveupdate_file_handler *handler; + struct liveupdate_session *session; int retrieve_status; struct file *file; u64 serialized_data; @@ -175,7 +178,7 @@ struct liveupdate_flb_ops { * @retrieved: True once the FLB's retrieve() callback has run. */ struct luo_flb_private_state { - long count; + refcount_t count; u64 data; void *obj; struct mutex lock; @@ -239,7 +242,15 @@ void liveupdate_unregister_flb(struct liveupdate_file_handler *fh, struct liveupdate_flb *flb); int liveupdate_flb_get_incoming(struct liveupdate_flb *flb, void **objp); +void liveupdate_flb_put_incoming(struct liveupdate_flb *flb); int liveupdate_flb_get_outgoing(struct liveupdate_flb *flb, void **objp); +/* kernel can internally retrieve files */ +int liveupdate_get_file_incoming(struct liveupdate_session *s, u64 token, + struct file **filep); + +/* Get a token for an outgoing file, or -ENOENT if file is not preserved */ +int liveupdate_get_token_outgoing(struct liveupdate_session *s, + struct file *file, u64 *tokenp); #else /* CONFIG_LIVEUPDATE */ @@ -279,11 +290,27 @@ static inline int liveupdate_flb_get_incoming(struct liveupdate_flb *flb, return -EOPNOTSUPP; } +static inline void liveupdate_flb_put_incoming(struct liveupdate_flb *flb) +{ +} + static inline int liveupdate_flb_get_outgoing(struct liveupdate_flb *flb, void **objp) { return -EOPNOTSUPP; } +static inline int liveupdate_get_file_incoming(struct liveupdate_session *s, + u64 token, struct file **filep) +{ + return -EOPNOTSUPP; +} + +static inline int liveupdate_get_token_outgoing(struct liveupdate_session *s, + struct file *file, u64 *tokenp) +{ + return -EOPNOTSUPP; +} + #endif /* CONFIG_LIVEUPDATE */ #endif /* _LINUX_LIVEUPDATE_H */ diff --git a/include/uapi/linux/iommufd.h 
b/include/uapi/linux/iommufd.h index 9f1acb4eb2a8f48cb4b62112963429c813d2eef6..9afe8515f43b0241864f6ba6b5c9219c228a66fa 100644 --- a/include/uapi/linux/iommufd.h +++ b/include/uapi/linux/iommufd.h @@ -57,6 +57,7 @@ enum { IOMMUFD_CMD_IOAS_CHANGE_PROCESS = 0x92, IOMMUFD_CMD_VEVENTQ_ALLOC = 0x93, IOMMUFD_CMD_HW_QUEUE_ALLOC = 0x94, + IOMMUFD_CMD_HWPT_LU_MARK_PRESERVE = 0x95, }; /** @@ -1299,4 +1300,29 @@ struct iommu_hw_queue_alloc { __aligned_u64 length; }; #define IOMMU_HW_QUEUE_ALLOC _IO(IOMMUFD_TYPE, IOMMUFD_CMD_HW_QUEUE_ALLOC) + +/** + * struct iommu_hwpt_liveupdate_mark_preserve - ioctl(IOMMU_HWPT_LIVEUPDATE_MARK_PRESERVE) + * @size: sizeof(struct iommu_hwpt_liveupdate_mark_preserve) + * @hwpt_id: Iommufd object ID of the target HWPT + * @hwpt_token: Token to identify this hwpt upon restore + * + * The target HWPT will be preserved during iommufd preservation. + * Only file-based memory mappings (e.g. memfd) are supported for HWPTs marked + * for preservation. Mapping anonymous memory into a preserved HWPT will result + * in a failure during the preservation phase. + * + * The hwpt_token is provided by userspace. If userspace enters a token + * already in use within this iommufd, -EADDRINUSE is returned from this ioctl. + * + * Note: There is no 'unmark' operation, so any HWPTs pooled in userspace that + * are marked for preservation must be destroyed after use. 
+ */ +struct iommu_hwpt_liveupdate_mark_preserve { + __u32 size; + __u32 hwpt_id; + __u64 hwpt_token; +}; +#define IOMMU_HWPT_LIVEUPDATE_MARK_PRESERVE _IO(IOMMUFD_TYPE, IOMMUFD_CMD_HWPT_LU_MARK_PRESERVE) + #endif diff --git a/kernel/liveupdate/luo_file.c b/kernel/liveupdate/luo_file.c index 451daff0856cc24dc4d78036a8c16b2295ac06b7..4a27af7c93ee0f6802a27d0f6593bb19f76cc502 100644 --- a/kernel/liveupdate/luo_file.c +++ b/kernel/liveupdate/luo_file.c @@ -325,6 +325,7 @@ int luo_preserve_file(struct luo_file_set *file_set, u64 token, int fd) mutex_init(&luo_file->mutex); args.handler = fh; + args.session = luo_session_from_file_set(file_set); args.file = file; err = fh->ops->preserve(&args); if (err) @@ -382,6 +383,7 @@ void luo_file_unpreserve_files(struct luo_file_set *file_set) struct luo_file, list); args.handler = luo_file->fh; + args.session = luo_session_from_file_set(file_set); args.file = luo_file->file; args.serialized_data = luo_file->serialized_data; args.private_data = luo_file->private_data; @@ -413,6 +415,7 @@ static int luo_file_freeze_one(struct luo_file_set *file_set, struct liveupdate_file_op_args args = {0}; args.handler = luo_file->fh; + args.session = luo_session_from_file_set(file_set); args.file = luo_file->file; args.serialized_data = luo_file->serialized_data; args.private_data = luo_file->private_data; @@ -434,6 +437,7 @@ static void luo_file_unfreeze_one(struct luo_file_set *file_set, struct liveupdate_file_op_args args = {0}; args.handler = luo_file->fh; + args.session = luo_session_from_file_set(file_set); args.file = luo_file->file; args.serialized_data = luo_file->serialized_data; args.private_data = luo_file->private_data; @@ -623,6 +627,7 @@ int luo_retrieve_file(struct luo_file_set *file_set, u64 token, } args.handler = luo_file->fh; + args.session = luo_session_from_file_set(file_set); args.serialized_data = luo_file->serialized_data; err = luo_file->fh->ops->retrieve(&args); if (err) { @@ -656,6 +661,7 @@ static int 
luo_file_can_finish_one(struct luo_file_set *file_set, struct liveupdate_file_op_args args = {0}; args.handler = luo_file->fh; + args.session = luo_session_from_file_set(file_set); args.file = luo_file->file; args.serialized_data = luo_file->serialized_data; args.retrieve_status = luo_file->retrieve_status; @@ -673,6 +679,7 @@ static void luo_file_finish_one(struct luo_file_set *file_set, guard(mutex)(&luo_file->mutex); args.handler = luo_file->fh; + args.session = luo_session_from_file_set(file_set); args.file = luo_file->file; args.serialized_data = luo_file->serialized_data; args.retrieve_status = luo_file->retrieve_status; @@ -929,3 +936,66 @@ void liveupdate_unregister_file_handler(struct liveupdate_file_handler *fh) list_del(&ACCESS_PRIVATE(fh, list)); } EXPORT_SYMBOL_GPL(liveupdate_unregister_file_handler); + +/** + * liveupdate_get_token_outgoing - Get the token for a preserved file. + * @s: The outgoing liveupdate session. + * @file: The file object to search for. + * @tokenp: Optional output for the found token; may be NULL. + * + * Searches the list of preserved files in an outgoing session for a matching + * file object. If found, the corresponding user-provided token is returned. + * + * This function is intended for in-kernel callers that need to correlate a + * file with its liveupdate token. + * + * Context: Must be called with the session mutex held. + * Return: 0 on success, -ENOENT if the file is not preserved in this session. 
+ */ +int liveupdate_get_token_outgoing(struct liveupdate_session *s, + struct file *file, u64 *tokenp) +{ + struct luo_file_set *file_set = luo_file_set_from_session_locked(s); + struct luo_file *luo_file; + int err = -ENOENT; + + list_for_each_entry(luo_file, &file_set->files_list, list) { + if (luo_file->file == file) { + if (tokenp) + *tokenp = luo_file->token; + err = 0; + break; + } + } + + return err; +} +EXPORT_SYMBOL_GPL(liveupdate_get_token_outgoing); + +/** + * liveupdate_get_file_incoming - Retrieves a preserved file for in-kernel use. + * @s: The incoming liveupdate session (restored from the previous kernel). + * @token: The unique token identifying the file to retrieve. + * @filep: On success, this will be populated with a pointer to the retrieved + * 'struct file'. + * + * Provides a kernel-internal API for other subsystems to retrieve their + * preserved files after a live update. This function is a simple wrapper + * around luo_retrieve_file(), allowing callers to find a file by its token. + * + * The caller receives a new reference to the file and must call fput() when it + * is no longer needed. The file's lifetime is managed by LUO and any userspace + * file descriptors. If the caller needs to hold a reference to the file beyond + * the immediate scope, it must call get_file() itself. + * + * Context: Must be called with the mutex of a restored session held. + * Return: 0 on success. Returns -ENOENT if no file with the matching token is + * found, or any other negative errno on failure. 
+ */ +int liveupdate_get_file_incoming(struct liveupdate_session *s, u64 token, + struct file **filep) +{ + return luo_retrieve_file(luo_file_set_from_session_locked(s), + token, filep); +} +EXPORT_SYMBOL_GPL(liveupdate_get_file_incoming); diff --git a/kernel/liveupdate/luo_flb.c b/kernel/liveupdate/luo_flb.c index edd932c1e205d8752718ba0ba1e980374026ab81..cb8484df73b0133da14211359192f34de413e182 100644 --- a/kernel/liveupdate/luo_flb.c +++ b/kernel/liveupdate/luo_flb.c @@ -111,7 +111,7 @@ static int luo_flb_file_preserve_one(struct liveupdate_flb *flb) struct luo_flb_private *private = luo_flb_get_private(flb); scoped_guard(mutex, &private->outgoing.lock) { - if (!private->outgoing.count) { + if (!refcount_read(&private->outgoing.count)) { struct liveupdate_flb_op_args args = {0}; int err; @@ -126,8 +126,10 @@ static int luo_flb_file_preserve_one(struct liveupdate_flb *flb) } private->outgoing.data = args.data; private->outgoing.obj = args.obj; + refcount_set(&private->outgoing.count, 1); + } else { + refcount_inc(&private->outgoing.count); } - private->outgoing.count++; } return 0; @@ -138,8 +140,7 @@ static void luo_flb_file_unpreserve_one(struct liveupdate_flb *flb) struct luo_flb_private *private = luo_flb_get_private(flb); scoped_guard(mutex, &private->outgoing.lock) { - private->outgoing.count--; - if (!private->outgoing.count) { + if (refcount_dec_and_test(&private->outgoing.count)) { struct liveupdate_flb_op_args args = {0}; args.flb = flb; @@ -164,7 +165,7 @@ static int luo_flb_retrieve_one(struct liveupdate_flb *flb) bool found = false; int err; - guard(mutex)(&private->incoming.lock); + lockdep_assert_held(&private->incoming.lock); if (private->incoming.finished) return -ENODATA; @@ -178,7 +179,7 @@ static int luo_flb_retrieve_one(struct liveupdate_flb *flb) for (int i = 0; i < fh->header_ser->count; i++) { if (!strcmp(fh->ser[i].name, flb->compatible)) { private->incoming.data = fh->ser[i].data; - private->incoming.count = fh->ser[i].count; + 
refcount_set(&private->incoming.count, fh->ser[i].count); 
found = true; break; } @@ -205,16 +206,14 @@ static int luo_flb_retrieve_one(struct liveupdate_flb *flb) return 0; } -static void luo_flb_file_finish_one(struct liveupdate_flb *flb) +void liveupdate_flb_put_incoming(struct liveupdate_flb *flb) { struct luo_flb_private *private = luo_flb_get_private(flb); - u64 count; - - scoped_guard(mutex, &private->incoming.lock) - count = --private->incoming.count; + struct liveupdate_flb_op_args args = {0}; - if (!count) { - struct liveupdate_flb_op_args args = {0}; + scoped_guard(mutex, &private->incoming.lock) { + if (!refcount_dec_and_test(&private->incoming.count)) + return; if (!private->incoming.retrieved) { int err = luo_flb_retrieve_one(flb); @@ -223,16 +222,14 @@ static void luo_flb_file_finish_one(struct liveupdate_flb *flb) return; } - scoped_guard(mutex, &private->incoming.lock) { - args.flb = flb; - args.obj = private->incoming.obj; - flb->ops->finish(&args); + args.flb = flb; + args.obj = private->incoming.obj; + flb->ops->finish(&args); - private->incoming.data = 0; - private->incoming.obj = NULL; - private->incoming.finished = true; - module_put(flb->ops->owner); - } + private->incoming.data = 0; + private->incoming.obj = NULL; + private->incoming.finished = true; + module_put(flb->ops->owner); } } @@ -315,7 +312,7 @@ void luo_flb_file_finish(struct liveupdate_file_handler *fh) guard(rwsem_read)(&luo_register_rwlock); list_for_each_entry_reverse(iter, flb_list, list) - luo_flb_file_finish_one(iter->flb); + liveupdate_flb_put_incoming(iter->flb); } static void luo_flb_unregister_one(struct liveupdate_file_handler *fh, @@ -513,6 +510,8 @@ int liveupdate_flb_get_incoming(struct liveupdate_flb *flb, void **objp) if (!liveupdate_enabled()) return -EOPNOTSUPP; + guard(mutex)(&private->incoming.lock); + if (!private->incoming.obj) { int err = luo_flb_retrieve_one(flb); @@ -520,7 +519,7 @@ int liveupdate_flb_get_incoming(struct liveupdate_flb *flb, void **objp) return err; } - guard(mutex)(&private->incoming.lock); + 
refcount_inc(&private->incoming.count); *objp = private->incoming.obj; return 0; @@ -653,12 +652,13 @@ void luo_flb_serialize(void) guard(rwsem_read)(&luo_register_rwlock); list_private_for_each_entry(gflb, &luo_flb_global.list, private.list) { struct luo_flb_private *private = luo_flb_get_private(gflb); + long count = refcount_read(&private->outgoing.count); - if (private->outgoing.count > 0) { + if (count > 0) { strscpy(fh->ser[i].name, gflb->compatible, sizeof(fh->ser[i].name)); fh->ser[i].data = private->outgoing.data; - fh->ser[i].count = private->outgoing.count; + fh->ser[i].count = count; i++; } } diff --git a/kernel/liveupdate/luo_internal.h b/kernel/liveupdate/luo_internal.h index 875844d7a41dd50dac0e3cf4bf96797e8dd207d5..08b198802e7f4f4e358cdb3d6c2e4e557e9423a1 100644 --- a/kernel/liveupdate/luo_internal.h +++ b/kernel/liveupdate/luo_internal.h @@ -79,6 +79,23 @@ struct luo_session { extern struct rw_semaphore luo_register_rwlock; +static inline struct liveupdate_session *luo_session_from_file_set(struct luo_file_set *file_set) +{ + struct luo_session *session; + + session = container_of(file_set, struct luo_session, file_set); + + return (struct liveupdate_session *)session; +} + +static inline struct luo_file_set *luo_file_set_from_session_locked(struct liveupdate_session *s) +{ + struct luo_session *session = (struct luo_session *)s; + + lockdep_assert_held(&session->mutex); + return &session->file_set; +} + int luo_session_create(const char *name, struct file **filep); int luo_session_retrieve(const char *name, struct file **filep); int __init luo_session_setup_outgoing(void *fdt); diff --git a/mm/memfd.c b/mm/memfd.c index c5ef07a710ec6100cf41cd185fada683a904471d..8bd8704a293d8ae3a094ba2894d36ce9d928fadb 100644 --- a/mm/memfd.c +++ b/mm/memfd.c @@ -316,6 +316,7 @@ int memfd_get_seals(struct file *file) return seals ? 
*seals : -EINVAL; } +EXPORT_SYMBOL_GPL(memfd_get_seals); long memfd_fcntl(struct file *file, unsigned int cmd, unsigned int arg) { diff --git a/tools/testing/selftests/iommu/Makefile b/tools/testing/selftests/iommu/Makefile index f824582a253f225a1c1095dd99568593b7260c93..d0f65286d64b336776756a9a6c4f183205d2029a 100644 --- a/tools/testing/selftests/iommu/Makefile +++ b/tools/testing/selftests/iommu/Makefile @@ -9,4 +9,16 @@ TEST_GEN_PROGS := TEST_GEN_PROGS += iommufd TEST_GEN_PROGS += iommufd_fail_nth +TEST_GEN_PROGS_EXTENDED += iommufd_liveupdate_kexec_test + include ../lib.mk +include ../liveupdate/lib/libliveupdate.mk + +CFLAGS += -I$(top_srcdir)/tools/include +CFLAGS += -MD +CFLAGS += $(EXTRA_CFLAGS) + +$(TEST_GEN_PROGS_EXTENDED): %: %.o $(LIBLIVEUPDATE_O) + $(CC) $(CFLAGS) $(CPPFLAGS) $(LDFLAGS) $(TARGET_ARCH) $< $(LIBLIVEUPDATE_O) $(LDLIBS) -o $@ + +EXTRA_CLEAN += $(LIBLIVEUPDATE_O) diff --git a/tools/testing/selftests/iommu/iommufd_liveupdate_kexec_test.c b/tools/testing/selftests/iommu/iommufd_liveupdate_kexec_test.c new file mode 100644 index 0000000000000000000000000000000000000000..cad57aba056f3cbfb12aca0cbaf2f6453b54ddd4 --- /dev/null +++ b/tools/testing/selftests/iommu/iommufd_liveupdate_kexec_test.c @@ -0,0 +1,240 @@ +// SPDX-License-Identifier: GPL-2.0-only + +/* + * Copyright (c) 2026, Google LLC. 
+ * Samiullah Khawaja + */ + +#include +#include +#include +#include +#include +#include + +#define __EXPORTED_HEADERS__ +#include +#include +#include +#include +#include +#include + +#include "../kselftest.h" + +#define ksft_assert(condition) \ + do { \ + if (!(condition)) \ + fail_exit("Failed: %s", #condition); \ + } while (0) + +static const char *device_cdev_path; +static char state_session[LIVEUPDATE_SESSION_NAME_LENGTH]; +static char iommufd_session[LIVEUPDATE_SESSION_NAME_LENGTH]; + +static const uint64_t STATE_TOKEN; +static const uint64_t IOMMUFD_TOKEN = 0x123456; +static const uint64_t CDEV_TOKEN = 0x654321; +static const uint64_t HWPT_TOKEN = 0x789012; +static const uint64_t MEMFD_TOKEN = 0x890123; + +static int open_cdev(const char *vfio_cdev_path) +{ + int cdev_fd; + + cdev_fd = open(vfio_cdev_path, O_RDWR); + if (cdev_fd < 0) + ksft_exit_skip("Failed to open VFIO cdev: %s\n", vfio_cdev_path); + + return cdev_fd; +} + +static int open_iommufd(void) +{ + int iommufd; + + iommufd = open("/dev/iommu", O_RDWR); + if (iommufd < 0) + ksft_exit_skip("Failed to open /dev/iommu. 
IOMMUFD support not enabled.\n"); + + return iommufd; +} + +static int create_sealed_memfd(size_t size) +{ + int fd, ret; + + fd = memfd_create("buffer", MFD_ALLOW_SEALING); + if (fd < 0) + fail_exit("memfd_create failed"); + + ret = ftruncate(fd, size); + if (ret) + fail_exit("ftruncate failed"); + + ret = fcntl(fd, F_ADD_SEALS, F_SEAL_GROW | F_SEAL_SHRINK | F_SEAL_SEAL); + if (ret) + fail_exit("fcntl F_ADD_SEALS failed"); + + return fd; +} + +#define test_ioctl(fd, cmd, arg) \ + do { \ + if (ioctl(fd, cmd, arg)) \ + fail_exit("ioctl(%s) failed", #cmd); \ + } while (0) + +#define test_luo_session_preserve_fd(session, fd, token) \ + do { \ + if (luo_session_preserve_fd(session, fd, token)) \ + fail_exit("luo_session_preserve_fd(%s) failed", #token); \ + } while (0) + +#define test_luo_session_retrieve_fd(session, token) \ + ({ \ + int _fd = luo_session_retrieve_fd(session, token); \ + if (_fd < 0) \ + fail_exit("luo_session_retrieve_fd(%s) failed", #token); \ + _fd; \ + }) + +static void setup_iommufd(int iommufd, int memfd, int cdev_fd) +{ + struct vfio_device_bind_iommufd bind = { + .argsz = sizeof(bind), + .flags = 0, + .iommufd = iommufd, + }; + struct iommu_ioas_alloc alloc_data = { + .size = sizeof(alloc_data), + .flags = 0, + }; + struct iommu_hwpt_alloc hwpt_alloc = { + .size = sizeof(hwpt_alloc), + .flags = 0, + }; + struct vfio_device_attach_iommufd_pt attach_data = { + .argsz = sizeof(attach_data), + .flags = 0, + }; + struct iommu_hwpt_liveupdate_mark_preserve mark_preserve = { + .size = sizeof(mark_preserve), + .hwpt_token = HWPT_TOKEN, + }; + struct iommu_ioas_map_file map_file = { + .size = sizeof(map_file), + .length = SZ_1M, + .flags = IOMMU_IOAS_MAP_WRITEABLE | IOMMU_IOAS_MAP_READABLE, + .iova = SZ_4G, + .fd = memfd, + .start = 0, + }; + + test_ioctl(cdev_fd, VFIO_DEVICE_BIND_IOMMUFD, &bind); + + test_ioctl(iommufd, IOMMU_IOAS_ALLOC, &alloc_data); + + hwpt_alloc.dev_id = bind.out_devid; + hwpt_alloc.pt_id = alloc_data.out_ioas_id; + 
test_ioctl(iommufd, IOMMU_HWPT_ALLOC, &hwpt_alloc); + + attach_data.pt_id = hwpt_alloc.out_hwpt_id; + test_ioctl(cdev_fd, VFIO_DEVICE_ATTACH_IOMMUFD_PT, &attach_data); + + map_file.ioas_id = alloc_data.out_ioas_id; + test_ioctl(iommufd, IOMMU_IOAS_MAP_FILE, &map_file); + + mark_preserve.hwpt_id = attach_data.pt_id; + test_ioctl(iommufd, IOMMU_HWPT_LIVEUPDATE_MARK_PRESERVE, &mark_preserve); +} + +static void before_kexec(int luo_fd) +{ + int iommufd, cdev_fd, memfd, session; + + create_state_file(luo_fd, state_session, STATE_TOKEN, /*next_stage=*/2); + + session = luo_create_session(luo_fd, iommufd_session); + if (session < 0) + fail_exit("luo_create_session failed"); + + iommufd = open_iommufd(); + memfd = create_sealed_memfd(SZ_1M); + cdev_fd = open_cdev(device_cdev_path); + + setup_iommufd(iommufd, memfd, cdev_fd); + + /* Cannot preserve cdev without iommufd */ + if (!luo_session_preserve_fd(session, cdev_fd, CDEV_TOKEN)) + fail_exit("Preserving cdev without iommufd should fail"); + + /* Cannot preserve iommufd without preserving memfd. 
*/ + if (!luo_session_preserve_fd(session, iommufd, IOMMUFD_TOKEN)) + fail_exit("Preserving iommufd without memfd should fail"); + + test_luo_session_preserve_fd(session, memfd, MEMFD_TOKEN); + test_luo_session_preserve_fd(session, iommufd, IOMMUFD_TOKEN); + test_luo_session_preserve_fd(session, cdev_fd, CDEV_TOKEN); + + close(session); + session = luo_create_session(luo_fd, iommufd_session); + if (session < 0) + fail_exit("luo_create_session failed"); + + test_luo_session_preserve_fd(session, memfd, MEMFD_TOKEN); + test_luo_session_preserve_fd(session, iommufd, IOMMUFD_TOKEN); + test_luo_session_preserve_fd(session, cdev_fd, CDEV_TOKEN); + + close(luo_fd); + daemonize_and_wait(); +} + +static void after_kexec(int luo_fd, int state_session_fd) +{ + int iommufd, cdev_fd, session, stage; + struct vfio_device_bind_iommufd bind = { + .argsz = sizeof(bind), + .flags = 0, + }; + + restore_and_read_stage(state_session_fd, STATE_TOKEN, &stage); + ksft_assert(stage == 2); + + session = luo_retrieve_session(luo_fd, iommufd_session); + if (session < 0) + fail_exit("luo_retrieve_session failed"); + + cdev_fd = test_luo_session_retrieve_fd(session, CDEV_TOKEN); + + iommufd = luo_session_retrieve_fd(session, IOMMUFD_TOKEN); + if (iommufd >= 0) + fail_exit("iommufd should not be retrievable yet"); + + iommufd = open_iommufd(); + + bind.iommufd = iommufd; + if (ioctl(cdev_fd, VFIO_DEVICE_BIND_IOMMUFD, &bind) == 0 || errno != EPERM) + fail_exit("Binding cdev to new iommufd should fail with EPERM"); + + /* Should fail */ + if (luo_session_finish(session) == 0) + fail_exit("luo_session_finish should fail if iommufd is not restored"); + + close(iommufd); + close(cdev_fd); +} + +int main(int argc, char *argv[]) +{ + if (argc < 2) { + printf("Usage: %s <vfio_cdev_path>\n", argv[0]); + return 1; + } + + device_cdev_path = argv[1]; /* VFIO cdev node later opened by open_cdev() */ + snprintf(iommufd_session, sizeof(iommufd_session), "iommufd-test-%s", "cdev"); + snprintf(state_session, sizeof(state_session), "state-%s", "iommufd-cdev"); + + return luo_test(argc, argv, state_session, before_kexec, 
after_kexec); +}