From 57f8ba3032765424124c1c1a985341911899e725 Mon Sep 17 00:00:00 2001 From: David Matlack Date: Wed, 22 Apr 2026 17:47:51 +0000 Subject: [PATCH 01/18] liveupdate: Use refcount_t for FLB reference counts ANBZ: #26808 cherry-picked from https://lore.kernel.org/all/20260423174032.3140399-2-dmatlack@google.com/ Use refcount_t instead of a raw integer to keep track of references on incoming and outgoing FLBs. Using refcount_t provides protection from overflow, underflow, and other issues. Fixes: cab056f2aae7 ("liveupdate: luo_flb: introduce File-Lifecycle-Bound global state") Signed-off-by: David Matlack Signed-off-by: Zelin Deng --- include/linux/liveupdate.h | 3 ++- kernel/liveupdate/luo_flb.c | 22 ++++++++++------------ 2 files changed, 12 insertions(+), 13 deletions(-) diff --git a/include/linux/liveupdate.h b/include/linux/liveupdate.h index 30c5a39ff9e9..8d3bbc35c828 100644 --- a/include/linux/liveupdate.h +++ b/include/linux/liveupdate.h @@ -12,6 +12,7 @@ #include #include #include +#include #include #include #include @@ -175,7 +176,7 @@ struct liveupdate_flb_ops { * @retrieved: True once the FLB's retrieve() callback has run. 
*/ struct luo_flb_private_state { - long count; + refcount_t count; u64 data; void *obj; struct mutex lock; diff --git a/kernel/liveupdate/luo_flb.c b/kernel/liveupdate/luo_flb.c index edd932c1e205..85446111c8d9 100644 --- a/kernel/liveupdate/luo_flb.c +++ b/kernel/liveupdate/luo_flb.c @@ -111,7 +111,7 @@ static int luo_flb_file_preserve_one(struct liveupdate_flb *flb) struct luo_flb_private *private = luo_flb_get_private(flb); scoped_guard(mutex, &private->outgoing.lock) { - if (!private->outgoing.count) { + if (!refcount_read(&private->outgoing.count)) { struct liveupdate_flb_op_args args = {0}; int err; @@ -126,8 +126,10 @@ static int luo_flb_file_preserve_one(struct liveupdate_flb *flb) } private->outgoing.data = args.data; private->outgoing.obj = args.obj; + refcount_set(&private->outgoing.count, 1); + } else { + refcount_inc(&private->outgoing.count); } - private->outgoing.count++; } return 0; @@ -138,8 +140,7 @@ static void luo_flb_file_unpreserve_one(struct liveupdate_flb *flb) struct luo_flb_private *private = luo_flb_get_private(flb); scoped_guard(mutex, &private->outgoing.lock) { - private->outgoing.count--; - if (!private->outgoing.count) { + if (refcount_dec_and_test(&private->outgoing.count)) { struct liveupdate_flb_op_args args = {0}; args.flb = flb; @@ -178,7 +179,7 @@ static int luo_flb_retrieve_one(struct liveupdate_flb *flb) for (int i = 0; i < fh->header_ser->count; i++) { if (!strcmp(fh->ser[i].name, flb->compatible)) { private->incoming.data = fh->ser[i].data; - private->incoming.count = fh->ser[i].count; + refcount_set(&private->incoming.count, fh->ser[i].count); found = true; break; } @@ -208,12 +209,8 @@ static int luo_flb_retrieve_one(struct liveupdate_flb *flb) static void luo_flb_file_finish_one(struct liveupdate_flb *flb) { struct luo_flb_private *private = luo_flb_get_private(flb); - u64 count; - scoped_guard(mutex, &private->incoming.lock) - count = --private->incoming.count; - - if (!count) { + if 
(refcount_dec_and_test(&private->incoming.count)) { struct liveupdate_flb_op_args args = {0}; if (!private->incoming.retrieved) { @@ -653,12 +650,13 @@ void luo_flb_serialize(void) guard(rwsem_read)(&luo_register_rwlock); list_private_for_each_entry(gflb, &luo_flb_global.list, private.list) { struct luo_flb_private *private = luo_flb_get_private(gflb); + long count = refcount_read(&private->outgoing.count); - if (private->outgoing.count > 0) { + if (count > 0) { strscpy(fh->ser[i].name, gflb->compatible, sizeof(fh->ser[i].name)); fh->ser[i].data = private->outgoing.data; - fh->ser[i].count = private->outgoing.count; + fh->ser[i].count = count; i++; } } -- Gitee From c959827337bbc23d7351e8fe304cd4da1d822e0d Mon Sep 17 00:00:00 2001 From: David Matlack Date: Thu, 9 Apr 2026 21:27:53 +0000 Subject: [PATCH 02/18] liveupdate: Reference count incoming FLB data ANBZ: #26808 cherry-picked from https://lore.kernel.org/all/20260423174032.3140399-3-dmatlack@google.com/ Increment the incoming FLB refcount in liveupdate_flb_get_incoming() so that the FLB structure cannot be freed while the caller is actively using it. Add an additional liveupdate_flb_put_incoming() function so the caller can explicitly indicate when it is done using the FLB data. During a Live Update, a subsystem might need to hold onto the incoming File-Lifecycle-Bound (FLB) data for an extended period, such as during device enumeration. Incrementing the reference count guarantees that the data remains valid and accessible until the subsystem releases it, preventing future use-after-free bugs. 
Fixes: cab056f2aae7 ("liveupdate: luo_flb: introduce File-Lifecycle-Bound global state") Signed-off-by: David Matlack Signed-off-by: Zelin Deng --- include/linux/liveupdate.h | 5 +++++ kernel/liveupdate/luo_flb.c | 32 +++++++++++++++++--------------- 2 files changed, 22 insertions(+), 15 deletions(-) diff --git a/include/linux/liveupdate.h b/include/linux/liveupdate.h index 8d3bbc35c828..d9fcf0bb07c4 100644 --- a/include/linux/liveupdate.h +++ b/include/linux/liveupdate.h @@ -240,6 +240,7 @@ void liveupdate_unregister_flb(struct liveupdate_file_handler *fh, struct liveupdate_flb *flb); int liveupdate_flb_get_incoming(struct liveupdate_flb *flb, void **objp); +void liveupdate_flb_put_incoming(struct liveupdate_flb *flb); int liveupdate_flb_get_outgoing(struct liveupdate_flb *flb, void **objp); #else /* CONFIG_LIVEUPDATE */ @@ -280,6 +281,10 @@ static inline int liveupdate_flb_get_incoming(struct liveupdate_flb *flb, return -EOPNOTSUPP; } +static inline void liveupdate_flb_put_incoming(struct liveupdate_flb *flb) +{ +} + static inline int liveupdate_flb_get_outgoing(struct liveupdate_flb *flb, void **objp) { diff --git a/kernel/liveupdate/luo_flb.c b/kernel/liveupdate/luo_flb.c index 85446111c8d9..cb8484df73b0 100644 --- a/kernel/liveupdate/luo_flb.c +++ b/kernel/liveupdate/luo_flb.c @@ -165,7 +165,7 @@ static int luo_flb_retrieve_one(struct liveupdate_flb *flb) bool found = false; int err; - guard(mutex)(&private->incoming.lock); + lockdep_assert_held(&private->incoming.lock); if (private->incoming.finished) return -ENODATA; @@ -206,12 +206,14 @@ static int luo_flb_retrieve_one(struct liveupdate_flb *flb) return 0; } -static void luo_flb_file_finish_one(struct liveupdate_flb *flb) +void liveupdate_flb_put_incoming(struct liveupdate_flb *flb) { struct luo_flb_private *private = luo_flb_get_private(flb); + struct liveupdate_flb_op_args args = {0}; - if (refcount_dec_and_test(&private->incoming.count)) { - struct liveupdate_flb_op_args args = {0}; + scoped_guard(mutex, 
&private->incoming.lock) { + if (!refcount_dec_and_test(&private->incoming.count)) + return; if (!private->incoming.retrieved) { int err = luo_flb_retrieve_one(flb); @@ -220,16 +222,14 @@ static void luo_flb_file_finish_one(struct liveupdate_flb *flb) return; } - scoped_guard(mutex, &private->incoming.lock) { - args.flb = flb; - args.obj = private->incoming.obj; - flb->ops->finish(&args); + args.flb = flb; + args.obj = private->incoming.obj; + flb->ops->finish(&args); - private->incoming.data = 0; - private->incoming.obj = NULL; - private->incoming.finished = true; - module_put(flb->ops->owner); - } + private->incoming.data = 0; + private->incoming.obj = NULL; + private->incoming.finished = true; + module_put(flb->ops->owner); } } @@ -312,7 +312,7 @@ void luo_flb_file_finish(struct liveupdate_file_handler *fh) guard(rwsem_read)(&luo_register_rwlock); list_for_each_entry_reverse(iter, flb_list, list) - luo_flb_file_finish_one(iter->flb); + liveupdate_flb_put_incoming(iter->flb); } static void luo_flb_unregister_one(struct liveupdate_file_handler *fh, @@ -510,6 +510,8 @@ int liveupdate_flb_get_incoming(struct liveupdate_flb *flb, void **objp) if (!liveupdate_enabled()) return -EOPNOTSUPP; + guard(mutex)(&private->incoming.lock); + if (!private->incoming.obj) { int err = luo_flb_retrieve_one(flb); @@ -517,7 +519,7 @@ int liveupdate_flb_get_incoming(struct liveupdate_flb *flb, void **objp) return err; } - guard(mutex)(&private->incoming.lock); + refcount_inc(&private->incoming.count); *objp = private->incoming.obj; return 0; -- Gitee From a1d4326dcf6d48de265c5533dee77b8f4bbf6ac3 Mon Sep 17 00:00:00 2001 From: Pasha Tatashin Date: Sat, 22 Nov 2025 17:23:47 -0500 Subject: [PATCH 03/18] liveupdate: luo_file: Add internal APIs for file preservation ANBZ: #26808 cherry-picked from https://lore.kernel.org/all/20260427175633.1978233-2-skhawaja@google.com/ The core liveupdate mechanism allows userspace to preserve file descriptors. 
However, kernel subsystems often manage struct file objects directly and need to participate in the preservation process programmatically without relying solely on userspace interaction. Signed-off-by: Pasha Tatashin Signed-off-by: Samiullah Khawaja Signed-off-by: Zelin Deng --- include/linux/liveupdate.h | 21 ++++++++++ kernel/liveupdate/luo_file.c | 68 ++++++++++++++++++++++++++++++++ kernel/liveupdate/luo_internal.h | 17 ++++++++ 3 files changed, 106 insertions(+) diff --git a/include/linux/liveupdate.h b/include/linux/liveupdate.h index d9fcf0bb07c4..cdf42e9b963a 100644 --- a/include/linux/liveupdate.h +++ b/include/linux/liveupdate.h @@ -25,6 +25,7 @@ struct file; /** * struct liveupdate_file_op_args - Arguments for file operation callbacks. * @handler: The file handler being called. + * @session: The session this file belongs to. * @retrieve_status: The retrieve status for the 'can_finish / finish' * operation. A value of 0 means the retrieve has not been * attempted, a positive value means the retrieve was @@ -45,6 +46,7 @@ struct file; */ struct liveupdate_file_op_args { struct liveupdate_file_handler *handler; + struct liveupdate_session *session; int retrieve_status; struct file *file; u64 serialized_data; @@ -242,6 +244,13 @@ void liveupdate_unregister_flb(struct liveupdate_file_handler *fh, int liveupdate_flb_get_incoming(struct liveupdate_flb *flb, void **objp); void liveupdate_flb_put_incoming(struct liveupdate_flb *flb); int liveupdate_flb_get_outgoing(struct liveupdate_flb *flb, void **objp); +/* kernel can internally retrieve files */ +int liveupdate_get_file_incoming(struct liveupdate_session *s, u64 token, + struct file **filep); + +/* Get a token for an outgoing file, or -ENOENT if file is not preserved */ +int liveupdate_get_token_outgoing(struct liveupdate_session *s, + struct file *file, u64 *tokenp); #else /* CONFIG_LIVEUPDATE */ @@ -291,5 +300,17 @@ static inline int liveupdate_flb_get_outgoing(struct liveupdate_flb *flb, return 
-EOPNOTSUPP; } +static inline int liveupdate_get_file_incoming(struct liveupdate_session *s, + u64 token, struct file **filep) +{ + return -EOPNOTSUPP; +} + +static inline int liveupdate_get_token_outgoing(struct liveupdate_session *s, + struct file *file, u64 *tokenp) +{ + return -EOPNOTSUPP; +} + #endif /* CONFIG_LIVEUPDATE */ #endif /* _LINUX_LIVEUPDATE_H */ diff --git a/kernel/liveupdate/luo_file.c b/kernel/liveupdate/luo_file.c index 451daff0856c..2a1c93e97f7d 100644 --- a/kernel/liveupdate/luo_file.c +++ b/kernel/liveupdate/luo_file.c @@ -325,6 +325,7 @@ int luo_preserve_file(struct luo_file_set *file_set, u64 token, int fd) mutex_init(&luo_file->mutex); args.handler = fh; + args.session = luo_session_from_file_set(file_set); args.file = file; err = fh->ops->preserve(&args); if (err) @@ -382,6 +383,7 @@ void luo_file_unpreserve_files(struct luo_file_set *file_set) struct luo_file, list); args.handler = luo_file->fh; + args.session = luo_session_from_file_set(file_set); args.file = luo_file->file; args.serialized_data = luo_file->serialized_data; args.private_data = luo_file->private_data; @@ -413,6 +415,7 @@ static int luo_file_freeze_one(struct luo_file_set *file_set, struct liveupdate_file_op_args args = {0}; args.handler = luo_file->fh; + args.session = luo_session_from_file_set(file_set); args.file = luo_file->file; args.serialized_data = luo_file->serialized_data; args.private_data = luo_file->private_data; @@ -434,6 +437,7 @@ static void luo_file_unfreeze_one(struct luo_file_set *file_set, struct liveupdate_file_op_args args = {0}; args.handler = luo_file->fh; + args.session = luo_session_from_file_set(file_set); args.file = luo_file->file; args.serialized_data = luo_file->serialized_data; args.private_data = luo_file->private_data; @@ -623,6 +627,7 @@ int luo_retrieve_file(struct luo_file_set *file_set, u64 token, } args.handler = luo_file->fh; + args.session = luo_session_from_file_set(file_set); args.serialized_data = luo_file->serialized_data; err = 
luo_file->fh->ops->retrieve(&args); if (err) { @@ -656,6 +661,7 @@ static int luo_file_can_finish_one(struct luo_file_set *file_set, struct liveupdate_file_op_args args = {0}; args.handler = luo_file->fh; + args.session = luo_session_from_file_set(file_set); args.file = luo_file->file; args.serialized_data = luo_file->serialized_data; args.retrieve_status = luo_file->retrieve_status; @@ -673,6 +679,7 @@ static void luo_file_finish_one(struct luo_file_set *file_set, guard(mutex)(&luo_file->mutex); args.handler = luo_file->fh; + args.session = luo_session_from_file_set(file_set); args.file = luo_file->file; args.serialized_data = luo_file->serialized_data; args.retrieve_status = luo_file->retrieve_status; @@ -929,3 +936,64 @@ void liveupdate_unregister_file_handler(struct liveupdate_file_handler *fh) list_del(&ACCESS_PRIVATE(fh, list)); } EXPORT_SYMBOL_GPL(liveupdate_unregister_file_handler); + +/** + * liveupdate_get_token_outgoing - Get the token for a preserved file. + * @s: The outgoing liveupdate session. + * @file: The file object to search for. + * @tokenp: Output parameter for the found token. + * + * Searches the list of preserved files in an outgoing session for a matching + * file object. If found, the corresponding user-provided token is returned. + * + * This function is intended for in-kernel callers that need to correlate a + * file with its liveupdate token. + * + * Context: It must be called with session mutex acquired. + * Return: 0 on success, -ENOENT if the file is not preserved in this session. 
+ */ +int liveupdate_get_token_outgoing(struct liveupdate_session *s, + struct file *file, u64 *tokenp) +{ + struct luo_file_set *file_set = luo_file_set_from_session_locked(s); + struct luo_file *luo_file; + int err = -ENOENT; + + list_for_each_entry(luo_file, &file_set->files_list, list) { + if (luo_file->file == file) { + if (tokenp) + *tokenp = luo_file->token; + err = 0; + break; + } + } + + return err; +} + +/** + * liveupdate_get_file_incoming - Retrieves a preserved file for in-kernel use. + * @s: The incoming liveupdate session (restored from the previous kernel). + * @token: The unique token identifying the file to retrieve. + * @filep: On success, this will be populated with a pointer to the retrieved + * 'struct file'. + * + * Provides a kernel-internal API for other subsystems to retrieve their + * preserved files after a live update. This function is a simple wrapper + * around luo_retrieve_file(), allowing callers to find a file by its token. + * + * The caller receives a new reference to the file and must call fput() when it + * is no longer needed. The file's lifetime is managed by LUO and any userspace + * file descriptors. If the caller needs to hold a reference to the file beyond + * the immediate scope, it must call get_file() itself. + * + * Context: It must be called with session mutex acquired of a restored session. + * Return: 0 on success. Returns -ENOENT if no file with the matching token is + * found, or any other negative errno on failure. 
+ */ +int liveupdate_get_file_incoming(struct liveupdate_session *s, u64 token, + struct file **filep) +{ + return luo_retrieve_file(luo_file_set_from_session_locked(s), + token, filep); +} diff --git a/kernel/liveupdate/luo_internal.h b/kernel/liveupdate/luo_internal.h index 875844d7a41d..08b198802e7f 100644 --- a/kernel/liveupdate/luo_internal.h +++ b/kernel/liveupdate/luo_internal.h @@ -79,6 +79,23 @@ struct luo_session { extern struct rw_semaphore luo_register_rwlock; +static inline struct liveupdate_session *luo_session_from_file_set(struct luo_file_set *file_set) +{ + struct luo_session *session; + + session = container_of(file_set, struct luo_session, file_set); + + return (struct liveupdate_session *)session; +} + +static inline struct luo_file_set *luo_file_set_from_session_locked(struct liveupdate_session *s) +{ + struct luo_session *session = (struct luo_session *)s; + + lockdep_assert_held(&session->mutex); + return &session->file_set; +} + int luo_session_create(const char *name, struct file **filep); int luo_session_retrieve(const char *name, struct file **filep); int __init luo_session_setup_outgoing(void *fdt); -- Gitee From 5cb3940d85327f919809f3b50200e2eff07d46f3 Mon Sep 17 00:00:00 2001 From: Samiullah Khawaja Date: Sun, 30 Nov 2025 03:08:28 +0000 Subject: [PATCH 04/18] iommu: Implement IOMMU Live update FLB callbacks ANBZ: #26808 cherry-picked from https://lore.kernel.org/all/20260427175633.1978233-3-skhawaja@google.com/ Add liveupdate FLB for IOMMU state preservation. Use KHO preserve memory alloc/free helper functions to allocate memory for the IOMMU Live update FLB object and the serialization structs for device, domain and iommu. During retrieve, walk through the preserved obj array headers and restore each folio. Also recreate the FLB obj. [Zelin Deng: 1. use mutex.h instead of mutex_types.h. 2. use kzalloc instead of kzalloc_obj.] 
Signed-off-by: Samiullah Khawaja Signed-off-by: Zelin Deng --- MAINTAINERS | 9 ++ drivers/iommu/Kconfig | 12 ++ drivers/iommu/Makefile | 1 + drivers/iommu/liveupdate.c | 198 ++++++++++++++++++++++++++++ include/linux/iommu-liveupdate.h | 18 +++ include/linux/kho/abi/iommu.h | 218 +++++++++++++++++++++++++++++++ 6 files changed, 456 insertions(+) create mode 100644 drivers/iommu/liveupdate.c create mode 100644 include/linux/iommu-liveupdate.h create mode 100644 include/linux/kho/abi/iommu.h diff --git a/MAINTAINERS b/MAINTAINERS index 3501c95f7cde..1e368a45dd12 100644 --- a/MAINTAINERS +++ b/MAINTAINERS @@ -11052,6 +11052,15 @@ F: include/linux/iommu.h F: include/linux/iova.h F: include/linux/of_iommu.h +IOMMU LIVEUPDATE +M: Samiullah Khawaja +R: Pranjal Shrivastava +L: iommu@lists.linux.dev +S: Maintained +F: drivers/iommu/liveupdate.c +F: include/linux/iommu-liveupdate.h +F: include/linux/kho/abi/iommu.h + IOMMUFD M: Jason Gunthorpe M: Kevin Tian diff --git a/drivers/iommu/Kconfig b/drivers/iommu/Kconfig index 7f3aa3085c0c..47efd45469ac 100644 --- a/drivers/iommu/Kconfig +++ b/drivers/iommu/Kconfig @@ -560,6 +560,18 @@ config IOMMU_DEBUG_PAGEALLOC line to activate the runtime checks. If unsure, say N. + +config IOMMU_LIVEUPDATE + bool "IOMMU live update state preservation support" + depends on LIVEUPDATE && IOMMUFD + help + Enable support for preserving IOMMU state across a kexec live update. + + This allows devices managed by iommufd to maintain their DMA mappings + during a kexec-based kernel update. + + If unsure, say N. 
+ endif # IOMMU_SUPPORT source "drivers/iommu/generic_pt/Kconfig" diff --git a/drivers/iommu/Makefile b/drivers/iommu/Makefile index cba1cff4aa11..0f567d19b24b 100644 --- a/drivers/iommu/Makefile +++ b/drivers/iommu/Makefile @@ -12,6 +12,7 @@ obj-$(CONFIG_IOMMU_IO_PGTABLE_ARMV7S) += io-pgtable-arm-v7s.o obj-$(CONFIG_IOMMU_IO_PGTABLE_LPAE) += io-pgtable-arm.o obj-$(CONFIG_IOMMU_IO_PGTABLE_DART) += io-pgtable-dart.o obj-$(CONFIG_IOMMU_IOVA) += iova.o +obj-$(CONFIG_IOMMU_LIVEUPDATE) += liveupdate.o obj-$(CONFIG_OF_IOMMU) += of_iommu.o obj-$(CONFIG_MSM_IOMMU) += msm_iommu.o obj-$(CONFIG_IPMMU_VMSA) += ipmmu-vmsa.o diff --git a/drivers/iommu/liveupdate.c b/drivers/iommu/liveupdate.c new file mode 100644 index 000000000000..0590400dc931 --- /dev/null +++ b/drivers/iommu/liveupdate.c @@ -0,0 +1,198 @@ +// SPDX-License-Identifier: GPL-2.0-only + +/* + * Copyright (C) 2026, Google LLC + * Author: Samiullah Khawaja + */ + +#define pr_fmt(fmt) "iommu: liveupdate: " fmt + +#include +#include +#include +#include +#include + +static void *iommu_liveupdate_restore_array(u64 array_phys) +{ + struct iommu_array_hdr_ser *array_hdr; + void *vaddr = array_phys ? phys_to_virt(array_phys) : NULL; + + while (array_phys) { + /* + * Failure to restore preserved IOMMU state is considered fatal. + * + * This is because the IOMMU translations for preserved IOMMUs + * were kept enabled in the previous kernel and the preserved + * devices have their IOMMU domains still present. Not being + * able to restore means that the memory mapped into preserved + * domains might be already corrupted by the preserved devices. + * + * There is no way to confirm the integrity of the memory that + * was mapped. BUG_ON is the safest option at this point. 
+ */ + BUG_ON(!kho_restore_folio(array_phys)); + array_hdr = phys_to_virt(array_phys); + array_phys = array_hdr->next_array_phys; + } + + return vaddr; +} + +static void iommu_liveupdate_unpreserve_free(u64 array_phys) +{ + struct iommu_array_hdr_ser *array_hdr; + + while (array_phys) { + array_hdr = phys_to_virt(array_phys); + array_phys = array_hdr->next_array_phys; + kho_unpreserve_free(array_hdr); + } +} + +static void iommu_liveupdate_folio_put(u64 array_phys) +{ + struct iommu_array_hdr_ser *array_hdr; + + while (array_phys) { + array_hdr = phys_to_virt(array_phys); + array_phys = array_hdr->next_array_phys; + folio_put(virt_to_folio(array_hdr)); + } +} + +static void iommu_liveupdate_flb_free(struct iommu_flb_obj *obj) +{ + if (obj->ser->iommu_domain_array_phys) + iommu_liveupdate_unpreserve_free(obj->ser->iommu_domain_array_phys); + + if (obj->ser->device_array_phys) + iommu_liveupdate_unpreserve_free(obj->ser->device_array_phys); + + if (obj->ser->iommu_array_phys) + iommu_liveupdate_unpreserve_free(obj->ser->iommu_array_phys); + + kho_unpreserve_free(obj->ser); + kfree(obj); +} + +static int iommu_liveupdate_flb_preserve(struct liveupdate_flb_op_args *argp) +{ + struct iommu_flb_obj *obj; + struct iommu_flb_ser *ser; + void *mem; + + /* obj exists only in the current kernel to track preserved state */ + obj = kzalloc(sizeof(*obj), GFP_KERNEL); + if (!obj) + return -ENOMEM; + + mutex_init(&obj->lock); + + /* mem is allocated via KHO and will survive the kexec */ + mem = kho_alloc_preserve(sizeof(*ser)); + if (IS_ERR(mem)) + goto err_free_obj; + + ser = mem; + obj->ser = ser; + + mem = kho_alloc_preserve(PAGE_SIZE); + if (IS_ERR(mem)) + goto err_free_ser; + + obj->curr_domain_array = mem; + ser->iommu_domain_array_phys = virt_to_phys(obj->curr_domain_array); + + mem = kho_alloc_preserve(PAGE_SIZE); + if (IS_ERR(mem)) + goto err_free_domains; + + obj->curr_device_array = mem; + ser->device_array_phys = virt_to_phys(obj->curr_device_array); + + mem = 
kho_alloc_preserve(PAGE_SIZE); + if (IS_ERR(mem)) + goto err_free_devices; + + obj->curr_iommu_array = mem; + ser->iommu_array_phys = virt_to_phys(obj->curr_iommu_array); + + argp->obj = obj; + argp->data = virt_to_phys(ser); + return 0; + +err_free_devices: + kho_unpreserve_free(obj->curr_device_array); +err_free_domains: + kho_unpreserve_free(obj->curr_domain_array); +err_free_ser: + kho_unpreserve_free(obj->ser); +err_free_obj: + kfree(obj); + return PTR_ERR(mem); +} + +static void iommu_liveupdate_flb_unpreserve(struct liveupdate_flb_op_args *argp) +{ + iommu_liveupdate_flb_free(argp->obj); +} + +static void iommu_liveupdate_flb_finish(struct liveupdate_flb_op_args *argp) +{ + struct iommu_flb_obj *obj = argp->obj; + + iommu_liveupdate_folio_put(obj->ser->iommu_domain_array_phys); + iommu_liveupdate_folio_put(obj->ser->device_array_phys); + iommu_liveupdate_folio_put(obj->ser->iommu_array_phys); + + folio_put(virt_to_folio(obj->ser)); + kfree(obj); +} + +static int iommu_liveupdate_flb_retrieve(struct liveupdate_flb_op_args *argp) +{ + struct iommu_flb_obj *obj; + struct iommu_flb_ser *ser; + + obj = kzalloc(sizeof(*obj), GFP_KERNEL); + if (!obj) + return -ENOMEM; + + /* Data must be present and valid from the previous kernel */ + BUG_ON(!kho_restore_folio(argp->data)); + + mutex_init(&obj->lock); + ser = phys_to_virt(argp->data); + obj->ser = ser; + + obj->curr_domain_array = iommu_liveupdate_restore_array(ser->iommu_domain_array_phys); + obj->curr_device_array = iommu_liveupdate_restore_array(ser->device_array_phys); + obj->curr_iommu_array = iommu_liveupdate_restore_array(ser->iommu_array_phys); + argp->obj = obj; + return 0; +} + +static struct liveupdate_flb_ops iommu_flb_ops = { + .preserve = iommu_liveupdate_flb_preserve, + .unpreserve = iommu_liveupdate_flb_unpreserve, + .finish = iommu_liveupdate_flb_finish, + .retrieve = iommu_liveupdate_flb_retrieve, +}; + +static struct liveupdate_flb iommu_flb = { + .compatible = IOMMU_LUO_FLB_COMPATIBLE, + .ops = 
&iommu_flb_ops, +}; + +int iommu_liveupdate_register_flb(struct liveupdate_file_handler *handler) +{ + return liveupdate_register_flb(handler, &iommu_flb); +} +EXPORT_SYMBOL(iommu_liveupdate_register_flb); + +void iommu_liveupdate_unregister_flb(struct liveupdate_file_handler *handler) +{ + liveupdate_unregister_flb(handler, &iommu_flb); +} +EXPORT_SYMBOL(iommu_liveupdate_unregister_flb); diff --git a/include/linux/iommu-liveupdate.h b/include/linux/iommu-liveupdate.h new file mode 100644 index 000000000000..3d1c65ed76fa --- /dev/null +++ b/include/linux/iommu-liveupdate.h @@ -0,0 +1,18 @@ +/* SPDX-License-Identifier: GPL-2.0 */ + +/* + * Copyright (C) 2026, Google LLC + * Author: Samiullah Khawaja + */ + +#ifndef _LINUX_IOMMU_LIVEUPDATE_H +#define _LINUX_IOMMU_LIVEUPDATE_H + +#include +#include +#include + +int iommu_liveupdate_register_flb(struct liveupdate_file_handler *handler); +void iommu_liveupdate_unregister_flb(struct liveupdate_file_handler *handler); + +#endif /* _LINUX_IOMMU_LIVEUPDATE_H */ diff --git a/include/linux/kho/abi/iommu.h b/include/linux/kho/abi/iommu.h new file mode 100644 index 000000000000..c7fab98dd933 --- /dev/null +++ b/include/linux/kho/abi/iommu.h @@ -0,0 +1,218 @@ +/* SPDX-License-Identifier: GPL-2.0 */ + +/* + * Copyright (C) 2026, Google LLC + * Author: Samiullah Khawaja + */ + +#ifndef _LINUX_KHO_ABI_IOMMU_H +#define _LINUX_KHO_ABI_IOMMU_H + +#include +#include +#include + +/** + * DOC: IOMMU File-Lifecycle Bound (FLB) Live Update ABI + * + * This header defines the ABI for preserving IOMMU state across kexec using + * Live Update File-Lifecycle Bound (FLB) data. + * + * This interface is a contract. Any modification to any of the serialization + * structs defined here constitutes a breaking change. Such changes require + * incrementing the version number in the IOMMU_LUO_FLB_COMPATIBLE string. 
+ * + * Memory Layout of Serialization Structures: + * ========================================== + * + * Each serialized type (IOMMU, Domain, Device) is stored in a linked list of + * arrays. The first array is allocated initially. When an array is full, a new + * array is allocated and its physical address is stored in the next_array_phys + * field of the hdr of the current array. + * + * Top Level (struct iommu_flb_ser): + * +---------------------------+ + * | - iommu_array_phys | + * | - iommu_domain_array_phys | + * | - device_array_phys | + * +---------------------------+ + * + * Each Array contains the serialized objects of the respective type. For + * example see below the representation of struct iommu_domain_array_ser. + * + * +---------------------------+ +---------------------------+ + * | iommu_domain_array_ser |-->| iommu_domain_array_ser |--> NULL + * | - hdr.next_array_phys | | - hdr.next_array_phys | + * | - hdr.nr_objects | | - hdr.nr_objects | + * | | | | + * | objects[]: | | objects[]: | + * | [ iommu_domain_ser ] | | [ iommu_domain_ser ] | + * | [ iommu_domain_ser ] | | [ iommu_domain_ser ] | + * | ... | | ... | + * +---------------------------+ +---------------------------+ + * + * Each object in the array starts with a common header (iommu_hdr_ser). + * For example, the layout of struct iommu_domain_ser is: + * + * +-----------------------------+ + * | iommu_domain_ser | + * | +-------------------------+ | + * | | hdr (iommu_hdr_ser) | | + * | | - ref_count | | + * | | - deleted / incoming | | + * | +-------------------------+ | + * | - top_table_phys | | + * | - top_level | | + * | - restored_domain | | + * +-----------------------------+ + * + * This pattern applies identically to iommu_device_ser and iommu_hw_ser. 
+ */ + +#define IOMMU_LUO_FLB_COMPATIBLE "iommu-liveupdate-v1" + +enum iommu_type_ser { + IOMMU_INVALID, +}; + +/** + * struct iommu_hdr_ser - Common header for all serialized IOMMU objects + * @ref_count: Reference count for the object + * @deleted: Flag indicating if the object is deleted + * @incoming: Flag indicating if the object was preserved in previous kernel + */ +struct iommu_hdr_ser { + u32 ref_count; + u32 deleted:1; + u32 incoming:1; +} __packed; + +/** + * struct iommu_domain_ser - Serialized state of an IOMMU domain + * @hdr: Common object header + * @top_table_phys: Physical address of the top-level page table + * @top_level: Level of the top-level page table + * @vasz: Virtual Address Size + * @sign_extend: FEAT_SIGN_EXTEND is enabled for this domain + * @restored_domain: Pointer to the restored domain (valid only after restore) + */ +struct iommu_domain_ser { + struct iommu_hdr_ser hdr; + u64 top_table_phys; + u64 top_level; + u32 vasz; + u32 sign_extend:1; + struct iommu_domain *restored_domain; +} __packed; + +/** + * struct iommu_dev_map_ser - Serialized mapping between device, domain, + * and IOMMU instance. + * @attachment_id: ID of the attachment between device and domain. 
+ * @domain_phys: Physical address of the domain + * @iommu_phys: Physical address of the IOMMU + */ +struct iommu_dev_map_ser { + u64 attachment_id; + u64 domain_phys; + u64 iommu_phys; +} __packed; + +/** + * struct iommu_device_ser - Serialized state of a device + * @hdr: Common object header + * @devid: Device ID + * @pci_domain_nr: PCI domain number + * @domain_iommu_ser: Domain and IOMMU mapping + */ +struct iommu_device_ser { + struct iommu_hdr_ser hdr; + u32 devid; + u32 pci_domain_nr; + struct iommu_dev_map_ser domain_iommu_ser; +} __packed; + +/** + * struct iommu_hw_ser - Serialized state of an IOMMU instance + * @hdr: Common object header + * @token: Unique token for the IOMMU + * @type: Type of the IOMMU this serialized state belongs to + */ +struct iommu_hw_ser { + struct iommu_hdr_ser hdr; + u64 token; + u64 type; +} __packed; + +/** + * struct iommu_array_hdr_ser - Header for an array of serialized objects + * @next_array_phys: Physical address of the next array of objects + * @nr_objects: Number of objects in the current array + */ +struct iommu_array_hdr_ser { + u64 next_array_phys; + u64 nr_objects; +} __packed; + +/** + * struct iommu_hw_array_ser - An array containing serialized IOMMU HWs + * @hdr: Array header + * @objects: Array of serialized IOMMU HWs + */ +struct iommu_hw_array_ser { + struct iommu_array_hdr_ser hdr; + struct iommu_hw_ser objects[]; +} __packed; + +/** + * struct iommu_domain_array_ser - An array containing serialized domains + * @hdr: Array header + * @objects: Array of serialized domains + */ +struct iommu_domain_array_ser { + struct iommu_array_hdr_ser hdr; + struct iommu_domain_ser objects[]; +} __packed; + +/** + * struct iommu_device_array_ser - An array containing serialized devices + * @hdr: Array header + * @objects: Array of serialized devices + */ +struct iommu_device_array_ser { + struct iommu_array_hdr_ser hdr; + struct iommu_device_ser objects[]; +} __packed; + +/** + * struct iommu_flb_ser - Top-level serialization 
structure + * @iommu_array_phys: Physical address of the first array of IOMMU HWs + * @iommu_domain_array_phys: Physical address of the first array of domains + * @device_array_phys: Physical address of the first array of devices + */ +struct iommu_flb_ser { + u64 iommu_array_phys; + u64 iommu_domain_array_phys; + u64 device_array_phys; +} __packed; + +/** + * struct iommu_flb_obj - FLB object allocated in current kernel pointing to + * preserved state in FLB + * @lock: Mutex protecting the object + * @ser: Pointer to the serialized state in FLB + * @curr_iommu_array: Pointer to the current array of IOMMU instances + * @curr_domain_array: Pointer to the current array of domains + * @curr_device_array: Pointer to the current array of devices + */ +struct iommu_flb_obj { + /* @lock: Protects the serialized objects during concurrent preservation */ + struct mutex lock; + struct iommu_flb_ser *ser; + + struct iommu_hw_array_ser *curr_iommu_array; + struct iommu_domain_array_ser *curr_domain_array; + struct iommu_device_array_ser *curr_device_array; +} __packed; + +#endif /* _LINUX_KHO_ABI_IOMMU_H */ -- Gitee From 2cf91ae46b2dcde9a500b9ff4fc6c310432fe575 Mon Sep 17 00:00:00 2001 From: Samiullah Khawaja Date: Sun, 30 Nov 2025 03:57:29 +0000 Subject: [PATCH 05/18] iommu: Implement IOMMU domain preservation ANBZ: #26808 cherry-picked from https://lore.kernel.org/all/20260427175633.1978233-4-skhawaja@google.com/ Add IOMMU domain ops that can be implemented by the IOMMU drivers if they support IOMMU domain preservation across liveupdate. The new IOMMU domain preserve, unpreserve and restore APIs call these ops to perform respective live update operations. [Zelin Deng: Use kabi field.] 
Signed-off-by: Samiullah Khawaja Signed-off-by: Zelin Deng --- drivers/iommu/liveupdate.c | 97 ++++++++++++++++++++++++++++++++ include/linux/iommu-liveupdate.h | 14 +++++ include/linux/iommu.h | 14 ++++- 3 files changed, 124 insertions(+), 1 deletion(-) diff --git a/drivers/iommu/liveupdate.c b/drivers/iommu/liveupdate.c index 0590400dc931..85d5f1afd81d 100644 --- a/drivers/iommu/liveupdate.c +++ b/drivers/iommu/liveupdate.c @@ -13,6 +13,9 @@ #include #include +#define iommu_max_objs_per_page(_array) \ + ((PAGE_SIZE - sizeof(struct iommu_array_hdr_ser)) / sizeof((_array)->objects[0])) + static void *iommu_liveupdate_restore_array(u64 array_phys) { struct iommu_array_hdr_ser *array_hdr; @@ -196,3 +199,97 @@ void iommu_liveupdate_unregister_flb(struct liveupdate_file_handler *handler) liveupdate_unregister_flb(handler, &iommu_flb); } EXPORT_SYMBOL(iommu_liveupdate_unregister_flb); + +static int alloc_object_ser(struct iommu_array_hdr_ser **curr_array_ptr, u64 max_objs) +{ + struct iommu_array_hdr_ser *curr_array = *curr_array_ptr; + struct iommu_array_hdr_ser *next_array; + + if (curr_array->nr_objects >= max_objs) { + next_array = kho_alloc_preserve(PAGE_SIZE); + if (IS_ERR(next_array)) + return PTR_ERR(next_array); + + curr_array->next_array_phys = virt_to_phys(next_array); + *curr_array_ptr = next_array; + curr_array = next_array; + } + + return curr_array->nr_objects++; +} + +static struct iommu_domain_ser *alloc_iommu_domain_ser(struct iommu_flb_obj *flb) +{ + int idx; + + idx = alloc_object_ser((struct iommu_array_hdr_ser **)&flb->curr_domain_array, + iommu_max_objs_per_page(flb->curr_domain_array)); + if (idx < 0) + return ERR_PTR(idx); + + flb->curr_domain_array->objects[idx].hdr.ref_count = 1; + return &flb->curr_domain_array->objects[idx]; +} + +int iommu_domain_preserve(struct iommu_domain *domain, struct iommu_domain_ser **ser) +{ + struct iommu_domain_ser *domain_ser; + struct iommu_flb_obj *flb_obj; + int ret; + + if (!domain->ops->preserve) + return 
-EOPNOTSUPP; + + ret = liveupdate_flb_get_outgoing(&iommu_flb, (void **)&flb_obj); + if (ret) + return ret; + + guard(mutex)(&flb_obj->lock); + domain_ser = alloc_iommu_domain_ser(flb_obj); + if (IS_ERR(domain_ser)) + return PTR_ERR(domain_ser); + + ret = domain->ops->preserve(domain, domain_ser); + if (ret) { + domain_ser->hdr.deleted = true; + return ret; + } + + domain->preserved_state = domain_ser; + *ser = domain_ser; + return 0; +} +EXPORT_SYMBOL_GPL(iommu_domain_preserve); + +void iommu_domain_unpreserve(struct iommu_domain *domain) +{ + struct iommu_domain_ser *domain_ser; + struct iommu_flb_obj *flb_obj; + int ret; + + if (!domain->ops->unpreserve) + return; + + ret = liveupdate_flb_get_outgoing(&iommu_flb, (void **)&flb_obj); + if (WARN_ON(ret)) + return; + + guard(mutex)(&flb_obj->lock); + + if (!domain->preserved_state) + return; + + /* + * There is no check for attached devices here. The correctness relies + * on the Live Update Orchestrator's session lifecycle. All resources + * (iommufd, vfio devices) are preserved within a single session. If the + * session is torn down, the .unpreserve callbacks for all files will be + * invoked, ensuring a consistent cleanup without needing explicit + * refcounting for the serialized objects here. 
+ */ + domain_ser = domain->preserved_state; + domain->ops->unpreserve(domain, domain_ser); + domain_ser->hdr.deleted = true; + domain->preserved_state = NULL; +} +EXPORT_SYMBOL_GPL(iommu_domain_unpreserve); diff --git a/include/linux/iommu-liveupdate.h b/include/linux/iommu-liveupdate.h index 3d1c65ed76fa..6019cfc27428 100644 --- a/include/linux/iommu-liveupdate.h +++ b/include/linux/iommu-liveupdate.h @@ -12,6 +12,20 @@ #include #include +#ifdef CONFIG_IOMMU_LIVEUPDATE +int iommu_domain_preserve(struct iommu_domain *domain, struct iommu_domain_ser **ser); +void iommu_domain_unpreserve(struct iommu_domain *domain); +#else +static inline int iommu_domain_preserve(struct iommu_domain *domain, struct iommu_domain_ser **ser) +{ + return -EOPNOTSUPP; +} + +static inline void iommu_domain_unpreserve(struct iommu_domain *domain) +{ +} +#endif + int iommu_liveupdate_register_flb(struct liveupdate_file_handler *handler); void iommu_liveupdate_unregister_flb(struct liveupdate_file_handler *handler); diff --git a/include/linux/iommu.h b/include/linux/iommu.h index 6eb4967a2c59..591edaee9942 100644 --- a/include/linux/iommu.h +++ b/include/linux/iommu.h @@ -14,6 +14,7 @@ #include #include #include +#include #include #define IOMMU_READ (1 << 0) @@ -248,8 +249,11 @@ struct iommu_domain { struct list_head next; }; }; - +#ifdef CONFIG_IOMMU_LIVEUPDATE + CK_KABI_USE(1, struct iommu_domain_ser *preserved_state); +#else CK_KABI_RESERVE(1) +#endif CK_KABI_RESERVE(2) CK_KABI_RESERVE(3) CK_KABI_RESERVE(4) @@ -768,6 +772,11 @@ struct iommu_ops { * specific mechanisms. * @set_pgtable_quirks: Set io page table quirks (IO_PGTABLE_QUIRK_*) * @free: Release the domain after use. + * @preserve: Preserve the iommu domain for liveupdate. + * Returns 0 on success, a negative errno on failure. + * @unpreserve: Unpreserve the iommu domain that was preserved earlier. + * @restore: Restore the iommu domain after liveupdate. + * Returns 0 on success, a negative errno on failure. 
*/ struct iommu_domain_ops { int (*attach_dev)(struct iommu_domain *domain, struct device *dev, @@ -798,6 +807,9 @@ struct iommu_domain_ops { unsigned long quirks); void (*free)(struct iommu_domain *domain); + int (*preserve)(struct iommu_domain *domain, struct iommu_domain_ser *ser); + void (*unpreserve)(struct iommu_domain *domain, struct iommu_domain_ser *ser); + int (*restore)(struct iommu_domain *domain, struct iommu_domain_ser *ser); }; /** -- Gitee From a7ab2adf109412e3b562a765f4d8404575ebc144 Mon Sep 17 00:00:00 2001 From: Samiullah Khawaja Date: Sun, 30 Nov 2025 03:57:29 +0000 Subject: [PATCH 06/18] iommu: Implement device and IOMMU HW preservation ANBZ: #26808 cherry-picked from https://lore.kernel.org/all/20260427175633.1978233-5-skhawaja@google.com/ Add IOMMU ops to preserve/unpreserve a device. These can be implemented by the IOMMU drivers that support preservation of devices that have their IOMMU domains preserved. During device preservation the state of the associated IOMMU is also preserved as dependency. [Zelin Deng: use kabi fields.] 
Signed-off-by: Samiullah Khawaja Signed-off-by: Zelin Deng --- drivers/iommu/liveupdate.c | 162 +++++++++++++++++++++++++++++++ include/linux/iommu-liveupdate.h | 33 +++++++ include/linux/iommu.h | 21 +++- 3 files changed, 215 insertions(+), 1 deletion(-) diff --git a/drivers/iommu/liveupdate.c b/drivers/iommu/liveupdate.c index 85d5f1afd81d..21ea52e234e0 100644 --- a/drivers/iommu/liveupdate.c +++ b/drivers/iommu/liveupdate.c @@ -11,6 +11,7 @@ #include #include #include +#include #include #define iommu_max_objs_per_page(_array) \ @@ -293,3 +294,164 @@ void iommu_domain_unpreserve(struct iommu_domain *domain) domain->preserved_state = NULL; } EXPORT_SYMBOL_GPL(iommu_domain_unpreserve); + +static struct iommu_hw_ser *alloc_iommu_hw_ser(struct iommu_flb_obj *flb) +{ + int idx; + + idx = alloc_object_ser((struct iommu_array_hdr_ser **)&flb->curr_iommu_array, + iommu_max_objs_per_page(flb->curr_iommu_array)); + if (idx < 0) + return ERR_PTR(idx); + + flb->curr_iommu_array->objects[idx].hdr.ref_count = 1; + return &flb->curr_iommu_array->objects[idx]; +} + +static int iommu_preserve_locked(struct iommu_device *iommu, + struct iommu_flb_obj *flb_obj) +{ + struct iommu_hw_ser *iommu_hw_ser; + int ret; + + if (!iommu->ops->preserve) + return -EOPNOTSUPP; + + lockdep_assert_held(&flb_obj->lock); + if (iommu->outgoing_preserved_state) { + iommu->outgoing_preserved_state->hdr.ref_count++; + return 0; + } + + iommu_hw_ser = alloc_iommu_hw_ser(flb_obj); + if (IS_ERR(iommu_hw_ser)) + return PTR_ERR(iommu_hw_ser); + + ret = iommu->ops->preserve(iommu, iommu_hw_ser); + if (ret) { + iommu_hw_ser->hdr.deleted = true; + return ret; + } + + iommu->outgoing_preserved_state = iommu_hw_ser; + return ret; +} + +static void iommu_unpreserve_locked(struct iommu_device *iommu, + struct iommu_flb_obj *flb_obj) +{ + struct iommu_hw_ser *iommu_hw_ser = iommu->outgoing_preserved_state; + + lockdep_assert_held(&flb_obj->lock); + iommu_hw_ser->hdr.ref_count--; + if (iommu_hw_ser->hdr.ref_count) + 
return; + + iommu->outgoing_preserved_state = NULL; + iommu->ops->unpreserve(iommu, iommu_hw_ser); + iommu_hw_ser->hdr.deleted = true; +} + +static struct iommu_device_ser *alloc_iommu_device_ser(struct iommu_flb_obj *flb) +{ + int idx; + + idx = alloc_object_ser((struct iommu_array_hdr_ser **)&flb->curr_device_array, + iommu_max_objs_per_page(flb->curr_device_array)); + if (idx < 0) + return ERR_PTR(idx); + + flb->curr_device_array->objects[idx].hdr.ref_count = 1; + return &flb->curr_device_array->objects[idx]; +} + +int iommu_preserve_device(struct iommu_domain *domain, + struct device *dev, u64 *preserved_state) +{ + struct iommu_flb_obj *flb_obj; + struct iommu_device_ser *device_ser; + struct dev_iommu *iommu; + struct pci_dev *pdev; + int ret; + + if (!dev_is_pci(dev)) + return -EOPNOTSUPP; + + if (!domain->preserved_state) + return -EINVAL; + + if (!iommu_group_dma_owner_claimed(dev->iommu_group)) + return -EINVAL; + + pdev = to_pci_dev(dev); + iommu = dev->iommu; + if (!iommu->iommu_dev->ops->preserve_device || + !iommu->iommu_dev->ops->preserve) + return -EOPNOTSUPP; + + ret = liveupdate_flb_get_outgoing(&iommu_flb, (void **)&flb_obj); + if (ret) + return ret; + + guard(mutex)(&flb_obj->lock); + device_ser = alloc_iommu_device_ser(flb_obj); + if (IS_ERR(device_ser)) + return PTR_ERR(device_ser); + + ret = iommu_preserve_locked(iommu->iommu_dev, flb_obj); + if (ret) { + device_ser->hdr.deleted = true; + return ret; + } + + device_ser->domain_iommu_ser.domain_phys = __pa(domain->preserved_state); + device_ser->domain_iommu_ser.iommu_phys = __pa(iommu->iommu_dev->outgoing_preserved_state); + device_ser->devid = pci_dev_id(pdev); + device_ser->pci_domain_nr = pci_domain_nr(pdev->bus); + + ret = iommu->iommu_dev->ops->preserve_device(dev, device_ser); + if (ret) { + device_ser->hdr.deleted = true; + iommu_unpreserve_locked(iommu->iommu_dev, flb_obj); + return ret; + } + + dev->iommu->device_ser = device_ser; + *preserved_state = virt_to_phys(device_ser); + 
return 0; +} + +void iommu_unpreserve_device(struct iommu_domain *domain, struct device *dev) +{ + struct iommu_flb_obj *flb_obj; + struct iommu_device_ser *iommu_device_ser; + struct dev_iommu *iommu; + struct pci_dev *pdev; + int ret; + + if (!dev_is_pci(dev)) + return; + + if (!iommu_group_dma_owner_claimed(dev->iommu_group)) + return; + + pdev = to_pci_dev(dev); + iommu = dev->iommu; + if (!iommu->iommu_dev->ops->unpreserve_device || + !iommu->iommu_dev->ops->unpreserve) + return; + + ret = liveupdate_flb_get_outgoing(&iommu_flb, (void **)&flb_obj); + if (WARN_ON(ret)) + return; + + guard(mutex)(&flb_obj->lock); + iommu_device_ser = dev_iommu_preserved_state(dev); + if (WARN_ON(!iommu_device_ser)) + return; + + iommu->iommu_dev->ops->unpreserve_device(dev, iommu_device_ser); + dev->iommu->device_ser = NULL; + + iommu_unpreserve_locked(iommu->iommu_dev, flb_obj); +} diff --git a/include/linux/iommu-liveupdate.h b/include/linux/iommu-liveupdate.h index 6019cfc27428..279c7ab04f09 100644 --- a/include/linux/iommu-liveupdate.h +++ b/include/linux/iommu-liveupdate.h @@ -8,14 +8,37 @@ #ifndef _LINUX_IOMMU_LIVEUPDATE_H #define _LINUX_IOMMU_LIVEUPDATE_H +#include #include #include #include #ifdef CONFIG_IOMMU_LIVEUPDATE +static inline void *dev_iommu_preserved_state(struct device *dev) +{ + struct iommu_device_ser *ser; + + if (!dev->iommu) + return NULL; + + ser = dev->iommu->device_ser; + if (ser && !ser->hdr.incoming) + return ser; + + return NULL; +} + int iommu_domain_preserve(struct iommu_domain *domain, struct iommu_domain_ser **ser); void iommu_domain_unpreserve(struct iommu_domain *domain); +int iommu_preserve_device(struct iommu_domain *domain, + struct device *dev, u64 *preserved_state); +void iommu_unpreserve_device(struct iommu_domain *domain, struct device *dev); #else +static inline void *dev_iommu_preserved_state(struct device *dev) +{ + return NULL; +} + static inline int iommu_domain_preserve(struct iommu_domain *domain, struct iommu_domain_ser **ser) 
{ return -EOPNOTSUPP; @@ -24,6 +47,16 @@ static inline int iommu_domain_preserve(struct iommu_domain *domain, struct iomm static inline void iommu_domain_unpreserve(struct iommu_domain *domain) { } + +static inline int iommu_preserve_device(struct iommu_domain *domain, + struct device *dev, u64 *preserved_state) +{ + return -EOPNOTSUPP; +} + +static inline void iommu_unpreserve_device(struct iommu_domain *domain, struct device *dev) +{ +} #endif int iommu_liveupdate_register_flb(struct liveupdate_file_handler *handler); diff --git a/include/linux/iommu.h b/include/linux/iommu.h index 591edaee9942..9aa8f237d7d7 100644 --- a/include/linux/iommu.h +++ b/include/linux/iommu.h @@ -659,6 +659,10 @@ __iommu_copy_struct_to_user(const struct iommu_user_data *dst_data, * resources shared/passed to user space IOMMU instance. Associate * it with a nesting @parent_domain. It is required for driver to * set @viommu->ops pointing to its own viommu_ops + * @preserve_device: Preserve state of a device for liveupdate. + * @unpreserve_device: Unpreserve state that was preserved earlier. + * @preserve: Preserve state of iommu translation hardware for liveupdate. + * @unpreserve: Unpreserve state of iommu that was preserved earlier. 
* @pgsize_bitmap: bitmap of all possible supported page sizes * @owner: Driver module providing these ops * @identity_domain: An always available, always attachable identity @@ -724,11 +728,17 @@ struct iommu_ops { struct iommu_domain *release_domain; struct iommu_domain *default_domain; u8 user_pasid_table:1; - +#ifdef CONFIG_IOMMU_LIVEUPDATE + CK_KABI_USE(1, int (*preserve_device)(struct device *dev, struct iommu_device_ser *device_ser)); + CK_KABI_USE(2, void (*unpreserve_device)(struct device *dev, struct iommu_device_ser *device_ser)); + CK_KABI_USE(3, int (*preserve)(struct iommu_device *iommu, struct iommu_hw_ser *iommu_ser)); + CK_KABI_USE(4, void (*unpreserve)(struct iommu_device *iommu, struct iommu_hw_ser *iommu_ser)); +#else CK_KABI_RESERVE(1) CK_KABI_RESERVE(2) CK_KABI_RESERVE(3) CK_KABI_RESERVE(4) +#endif CK_KABI_RESERVE(5) CK_KABI_RESERVE(6) CK_KABI_RESERVE(7) @@ -821,6 +831,8 @@ struct iommu_domain_ops { * @singleton_group: Used internally for drivers that have only one group * @max_pasids: number of supported PASIDs * @ready: set once iommu_device_register() has completed successfully + * @outgoing_preserved_state: preserved iommu state of outgoing kernel for + * liveupdate. 
*/ struct iommu_device { struct list_head list; @@ -830,6 +842,10 @@ struct iommu_device { struct iommu_group *singleton_group; u32 max_pasids; bool ready; + +#ifdef CONFIG_IOMMU_LIVEUPDATE + struct iommu_hw_ser *outgoing_preserved_state; +#endif }; /** @@ -884,6 +900,9 @@ struct dev_iommu { u32 pci_32bit_workaround:1; u32 require_direct:1; u32 shadow_on_flush:1; +#ifdef CONFIG_IOMMU_LIVEUPDATE + struct iommu_device_ser *device_ser; +#endif }; int iommu_device_register(struct iommu_device *iommu, -- Gitee From 57bc5a44f98e9afd82a396597e22f0f76d7fdcb6 Mon Sep 17 00:00:00 2001 From: Samiullah Khawaja Date: Sun, 30 Nov 2025 04:03:31 +0000 Subject: [PATCH 07/18] iommu/pages: Add APIs to preserve/unpreserve/restore iommu pages ANBZ: #26808 cherry-picked from https://lore.kernel.org/all/20260427175633.1978233-6-skhawaja@google.com/ IOMMU pages are allocated/freed using APIs using struct ioptdesc. For the proper preservation and restoration of ioptdesc add helper functions. Signed-off-by: Samiullah Khawaja Signed-off-by: Zelin Deng --- drivers/iommu/iommu-pages.c | 108 ++++++++++++++++++++++++++++++++++-- drivers/iommu/iommu-pages.h | 30 ++++++++++ 2 files changed, 134 insertions(+), 4 deletions(-) diff --git a/drivers/iommu/iommu-pages.c b/drivers/iommu/iommu-pages.c index 0f2d8267abc4..c9027540a6ff 100644 --- a/drivers/iommu/iommu-pages.c +++ b/drivers/iommu/iommu-pages.c @@ -6,6 +6,7 @@ #include "iommu-pages.h" #include #include +#include #include #define IOPTDESC_MATCH(pg_elm, elm) \ @@ -28,6 +29,13 @@ static inline size_t ioptdesc_mem_size(struct ioptdesc *desc) return 1UL << (folio_order(ioptdesc_folio(desc)) + PAGE_SHIFT); } +static inline void iommu_folio_update_stats(struct folio *folio, + unsigned long nr_pages) +{ + mod_node_page_state(folio_pgdat(folio), NR_IOMMU_PAGES, nr_pages); + lruvec_stat_mod_folio(folio, NR_SECONDARY_PAGETABLE, nr_pages); +} + /** * iommu_alloc_pages_node_sz - Allocate a zeroed page of a given size from * specific NUMA node @@ -80,8 
+88,7 @@ void *iommu_alloc_pages_node_sz(int nid, gfp_t gfp, size_t size) * rather large, i.e. multiple gigabytes in size. */ pgcnt = 1UL << order; - mod_node_page_state(folio_pgdat(folio), NR_IOMMU_PAGES, pgcnt); - lruvec_stat_mod_folio(folio, NR_SECONDARY_PAGETABLE, pgcnt); + iommu_folio_update_stats(folio, pgcnt); return folio_address(folio); } @@ -95,8 +102,7 @@ static void __iommu_free_desc(struct ioptdesc *iopt) if (IOMMU_PAGES_USE_DMA_API) WARN_ON_ONCE(iopt->incoherent); - mod_node_page_state(folio_pgdat(folio), NR_IOMMU_PAGES, -pgcnt); - lruvec_stat_mod_folio(folio, NR_SECONDARY_PAGETABLE, -pgcnt); + iommu_folio_update_stats(folio, -pgcnt); folio_put(folio); } @@ -131,6 +137,100 @@ void iommu_put_pages_list(struct iommu_pages_list *list) } EXPORT_SYMBOL_GPL(iommu_put_pages_list); +#if IS_ENABLED(CONFIG_IOMMU_LIVEUPDATE) +/** + * iommu_unpreserve_page - Unpreserve a page that was preserved in KHO + * @virt: Virtual address of a page + */ +void iommu_unpreserve_page(void *virt) +{ + kho_unpreserve_folio(ioptdesc_folio(virt_to_ioptdesc(virt))); +} +EXPORT_SYMBOL_GPL(iommu_unpreserve_page); + +/** + * iommu_preserve_page - Preserve a page during kexec handover + * @virt: Virtual address of the page to preserve + * + * Returns 0 on success, negative error on failure + */ +int iommu_preserve_page(void *virt) +{ + return kho_preserve_folio(ioptdesc_folio(virt_to_ioptdesc(virt))); +} +EXPORT_SYMBOL_GPL(iommu_preserve_page); + +/** + * iommu_unpreserve_pages - Unpreserve pages that were preserved in KHO + * @list: List of pages to unpreserve + */ +void iommu_unpreserve_pages(struct iommu_pages_list *list) +{ + struct ioptdesc *iopt; + + list_for_each_entry(iopt, &list->pages, iopt_freelist_elm) + kho_unpreserve_folio(ioptdesc_folio(iopt)); +} +EXPORT_SYMBOL_GPL(iommu_unpreserve_pages); + +/** + * iommu_restore_page - Restore a page that was preserved in KHO + * @phys: Physical address of a page + */ +void iommu_restore_page(u64 phys) +{ + struct ioptdesc *iopt; + 
struct folio *folio; + unsigned long pgcnt; + unsigned int order; + + folio = kho_restore_folio(phys); + BUG_ON(!folio); + + iopt = folio_ioptdesc(folio); + + /* + * For the restored pages incoherent is set to false as these are not + * mapped using the DMA_API. The remapping of these pages using DMA_API + * is not needed as these are not going to be written to by the new + * kernel. + */ + iopt->incoherent = false; + + order = folio_order(folio); + pgcnt = 1UL << order; + iommu_folio_update_stats(folio, pgcnt); +} +EXPORT_SYMBOL_GPL(iommu_restore_page); + +/** + * iommu_preserve_pages - Preserve pages during kexec handover + * @list: List of pages to preserve + * + * Returns 0 on success, negative error on failure + */ +int iommu_preserve_pages(struct iommu_pages_list *list) +{ + struct ioptdesc *iopt; + int ret; + + list_for_each_entry(iopt, &list->pages, iopt_freelist_elm) { + ret = kho_preserve_folio(ioptdesc_folio(iopt)); + if (ret) + goto err; + } + + return 0; + +err: + list_for_each_entry_continue_reverse(iopt, &list->pages, iopt_freelist_elm) + kho_unpreserve_folio(ioptdesc_folio(iopt)); + + return ret; +} +EXPORT_SYMBOL_GPL(iommu_preserve_pages); +#endif + /** * iommu_pages_start_incoherent - Setup the page for cache incoherent operation * @virt: The page to setup diff --git a/drivers/iommu/iommu-pages.h b/drivers/iommu/iommu-pages.h index ae9da4f571f6..7b9b6bb504b2 100644 --- a/drivers/iommu/iommu-pages.h +++ b/drivers/iommu/iommu-pages.h @@ -53,6 +53,36 @@ void *iommu_alloc_pages_node_sz(int nid, gfp_t gfp, size_t size); void iommu_free_pages(void *virt); void iommu_put_pages_list(struct iommu_pages_list *list); +#if IS_ENABLED(CONFIG_IOMMU_LIVEUPDATE) +int iommu_preserve_page(void *virt); +void iommu_unpreserve_page(void *virt); +int iommu_preserve_pages(struct iommu_pages_list *list); +void iommu_unpreserve_pages(struct iommu_pages_list *list); +void iommu_restore_page(u64 phys); +#else +static inline int iommu_preserve_page(void *virt) +{ + return 
-EOPNOTSUPP; +} + +static inline void iommu_unpreserve_page(void *virt) +{ +} + +static inline int iommu_preserve_pages(struct iommu_pages_list *list) +{ + return -EOPNOTSUPP; +} + +static inline void iommu_unpreserve_pages(struct iommu_pages_list *list) +{ +} + +static inline void iommu_restore_page(u64 phys) +{ +} +#endif + /** * iommu_pages_list_add - add the page to a iommu_pages_list * @list: List to add the page to -- Gitee From 3497411d7270a8d751a2f37783f323d8d8a5d35f Mon Sep 17 00:00:00 2001 From: Samiullah Khawaja Date: Sun, 30 Nov 2025 04:09:24 +0000 Subject: [PATCH 08/18] iommupt: Implement preserve/unpreserve/restore callbacks ANBZ: #26808 cherry-picked from https://lore.kernel.org/all/20260427175633.1978233-7-skhawaja@google.com/ Implement the iommu domain ops for preservation, unpreservation and restoration of iommu domains for liveupdate. Use the existing page walker to preserve the ioptdesc of the top_table and the lower tables. Preserve top_level, VASZ and FEAT Sign Extended to restore the domain in the next kernel. On restore the domain has only the preserved features enabled and all the other features are zeroed. This is ok since the restored domain is made immutable and can only be freed. A kunit test is added to verify that the IOMMU domain free can be done with trimmed features. [Zelin Deng: EXPORT_SYMBOL_NS_GPL is not string literal, remove "" in it.]
Signed-off-by: Samiullah Khawaja Signed-off-by: Zelin Deng --- drivers/iommu/generic_pt/iommu_pt.h | 131 ++++++++++++++++++++++ drivers/iommu/generic_pt/kunit_iommu_pt.h | 28 +++++ include/linux/generic_pt/iommu.h | 19 +++- 3 files changed, 177 insertions(+), 1 deletion(-) diff --git a/drivers/iommu/generic_pt/iommu_pt.h b/drivers/iommu/generic_pt/iommu_pt.h index 93ef5006ebfc..32e65d0726bc 100644 --- a/drivers/iommu/generic_pt/iommu_pt.h +++ b/drivers/iommu/generic_pt/iommu_pt.h @@ -920,6 +920,133 @@ int DOMAIN_NS(map_pages)(struct iommu_domain *domain, unsigned long iova, } EXPORT_SYMBOL_NS_GPL(DOMAIN_NS(map_pages), GENERIC_PT_IOMMU); +#ifdef CONFIG_IOMMU_LIVEUPDATE +/** + * unpreserve() - Unpreserve page tables and other state of a domain. + * @domain: Domain to unpreserve + */ +void DOMAIN_NS(unpreserve)(struct iommu_domain *domain, struct iommu_domain_ser *ser) +{ + struct pt_iommu *iommu_table = + container_of(domain, struct pt_iommu, domain); + struct pt_common *common = common_from_iommu(iommu_table); + struct pt_range range = pt_all_range(common); + struct pt_iommu_collect_args collect = { + .free_list = IOMMU_PAGES_LIST_INIT(collect.free_list), + }; + + iommu_pages_list_add(&collect.free_list, range.top_table); + pt_walk_range(&range, __collect_tables, &collect); + + iommu_unpreserve_pages(&collect.free_list); +} +EXPORT_SYMBOL_NS_GPL(DOMAIN_NS(unpreserve), GENERIC_PT_IOMMU); + +/** + * preserve() - Preserve page tables and other state of a domain. + * @domain: Domain to preserve + * + * Returns: -ERRNO on failure, 0 on success. 
+ */ +int DOMAIN_NS(preserve)(struct iommu_domain *domain, struct iommu_domain_ser *ser) +{ + struct pt_iommu *iommu_table = + container_of(domain, struct pt_iommu, domain); + struct pt_common *common = common_from_iommu(iommu_table); + struct pt_range range = pt_all_range(common); + struct pt_iommu_collect_args collect = { + .free_list = IOMMU_PAGES_LIST_INIT(collect.free_list), + }; + int ret; + + iommu_pages_list_add(&collect.free_list, range.top_table); + pt_walk_range(&range, __collect_tables, &collect); + + ret = iommu_preserve_pages(&collect.free_list); + if (ret) + return ret; + + ser->top_table_phys = virt_to_phys(range.top_table); + ser->top_level = range.top_level; + + /* + * VASZ and SIGN_EXTEND will be needed in next kernel for collector page + * table walk to restore and free pages. + */ + ser->vasz = common->max_vasz_lg2; + ser->sign_extend = pt_feature(common, PT_FEAT_SIGN_EXTEND); + + return 0; +} +EXPORT_SYMBOL_NS_GPL(DOMAIN_NS(preserve), GENERIC_PT_IOMMU); + +static int __restore_tables(struct pt_range *range, void *arg, + unsigned int level, struct pt_table_p *table) +{ + struct pt_state pts = pt_init(range, level, table); + int ret; + + for_each_pt_level_entry(&pts) { + if (pts.type == PT_ENTRY_TABLE) { + iommu_restore_page(virt_to_phys(pts.table_lower)); + + /* + * pt_descend can only fail if pts.table_lower is not + * init. So the if statement below is dead code. + */ + ret = pt_descend(&pts, arg, __restore_tables); + if (ret) + return ret; + } + } + + return 0; +} + +static const struct pt_iommu_ops NS(ops_immutable); + +/** + * restore() - Restore page tables and other state of a domain. + * @domain: Domain to restore + * + * Returns: -ERRNO on failure, 0 on success.
+ */ +int DOMAIN_NS(restore)(struct iommu_domain *domain, struct iommu_domain_ser *ser) +{ + struct pt_iommu *iommu_table = + container_of(domain, struct pt_iommu, domain); + struct pt_common *common = common_from_iommu(iommu_table); + struct pt_range range; + + common->max_vasz_lg2 = ser->vasz; + + /* Make this domain immutable.*/ + iommu_table->ops = &NS(ops_immutable); + + /* + * It is safe to override this here since this domain is immutable and + * can only be freed. + */ + common->features = 0; + if (ser->sign_extend) + common->features |= BIT(PT_FEAT_SIGN_EXTEND); + + range = pt_all_range(common); + iommu_restore_page(ser->top_table_phys); + + /* Free new table */ + iommu_free_pages(range.top_table); + + /* Set the restored top table */ + pt_top_set(common, phys_to_virt(ser->top_table_phys), ser->top_level); + + /* Restore all pages*/ + range = pt_all_range(common); + return pt_walk_range(&range, __restore_tables, NULL); +} +EXPORT_SYMBOL_NS_GPL(DOMAIN_NS(restore), GENERIC_PT_IOMMU); +#endif + struct pt_unmap_args { struct iommu_pages_list free_list; pt_vaddr_t unmapped; @@ -1118,6 +1245,10 @@ static const struct pt_iommu_ops NS(ops) = { .deinit = NS(deinit), }; +static const struct pt_iommu_ops NS(ops_immutable) = { + .deinit = NS(deinit), +}; + static int pt_init_common(struct pt_common *common) { struct pt_range top_range = pt_top_range(common); diff --git a/drivers/iommu/generic_pt/kunit_iommu_pt.h b/drivers/iommu/generic_pt/kunit_iommu_pt.h index e8a63c8ea850..af1918d693ed 100644 --- a/drivers/iommu/generic_pt/kunit_iommu_pt.h +++ b/drivers/iommu/generic_pt/kunit_iommu_pt.h @@ -426,6 +426,33 @@ static void test_mixed(struct kunit *test) check_iova(test, start, oa, len); } +static void test_restore_free(struct kunit *test) +{ + struct kunit_iommu_priv *priv = test->priv; + struct pt_range top_range = pt_top_range(priv->common); + u64 start = 0x3fe400ULL << 12; + u64 end = 0x4c0600ULL << 12; + pt_vaddr_t len = end - start; + + if (top_range.last_va <= 
start || sizeof(unsigned long) == 4) + kunit_skip(test, "range is too small"); + if ((priv->safe_pgsize_bitmap & GENMASK(30, 21)) != (BIT(30) | BIT(21))) + kunit_skip(test, "incompatible psize"); + + /* Map a large mixed range to populate multiple levels of page tables */ + do_map(test, start, start, len); + + /* + * Simulate a restored state by clearing all features except + * SIGN_EXTEND. This verifies that the generic page table free walker + * can correctly tear down a populated domain when other features are + * zeroed. + */ + priv->common->features &= BIT(PT_FEAT_SIGN_EXTEND); + + /* The domain will be freed when the test exits. */ +} + static struct kunit_case iommu_test_cases[] = { KUNIT_CASE_FMT(test_increase_level), KUNIT_CASE_FMT(test_map_simple), @@ -434,6 +461,7 @@ static struct kunit_case iommu_test_cases[] = { KUNIT_CASE_FMT(test_random_map), KUNIT_CASE_FMT(test_pgsize_boundary), KUNIT_CASE_FMT(test_mixed), + KUNIT_CASE_FMT(test_restore_free), {}, }; diff --git a/include/linux/generic_pt/iommu.h b/include/linux/generic_pt/iommu.h index 9eefbb74efd0..a5d478ca9a11 100644 --- a/include/linux/generic_pt/iommu.h +++ b/include/linux/generic_pt/iommu.h @@ -13,6 +13,7 @@ struct iommu_iotlb_gather; struct pt_iommu_ops; struct pt_iommu_driver_ops; struct iommu_dirty_bitmap; +struct iommu_domain_ser; /** * DOC: IOMMU Radix Page Table @@ -202,6 +203,12 @@ struct pt_iommu_cfg { struct iommu_domain *domain, unsigned long iova, \ size_t pgsize, size_t pgcount, \ struct iommu_iotlb_gather *iotlb_gather); \ + int pt_iommu_##fmt##_preserve(struct iommu_domain *domain, \ + struct iommu_domain_ser *ser); \ + void pt_iommu_##fmt##_unpreserve(struct iommu_domain *domain, \ + struct iommu_domain_ser *ser); \ + int pt_iommu_##fmt##_restore(struct iommu_domain *domain, \ + struct iommu_domain_ser *ser); \ int pt_iommu_##fmt##_read_and_clear_dirty( \ struct iommu_domain *domain, unsigned long iova, size_t size, \ unsigned long flags, struct iommu_dirty_bitmap *dirty); \ @@ 
-217,6 +224,15 @@ struct pt_iommu_cfg { }; \ IOMMU_PROTOTYPES(fmt) +#ifdef CONFIG_IOMMU_LIVEUPDATE +#define IOMMU_PT_LIVEUPDATE_OPS(fmt) \ + , .preserve = &pt_iommu_##fmt##_preserve, \ + .unpreserve = &pt_iommu_##fmt##_unpreserve, \ + .restore = &pt_iommu_##fmt##_restore +#else +#define IOMMU_PT_LIVEUPDATE_OPS(fmt) +#endif + /* * A driver uses IOMMU_PT_DOMAIN_OPS to populate the iommu_domain_ops for the * iommu_pt @@ -224,7 +240,8 @@ struct pt_iommu_cfg { #define IOMMU_PT_DOMAIN_OPS(fmt) \ .iova_to_phys = &pt_iommu_##fmt##_iova_to_phys, \ .map_pages = &pt_iommu_##fmt##_map_pages, \ - .unmap_pages = &pt_iommu_##fmt##_unmap_pages + .unmap_pages = &pt_iommu_##fmt##_unmap_pages \ + IOMMU_PT_LIVEUPDATE_OPS(fmt) #define IOMMU_PT_DIRTY_OPS(fmt) \ .read_and_clear_dirty = &pt_iommu_##fmt##_read_and_clear_dirty -- Gitee From 7c959f718ffdfaa24e54674986426300c2b14588 Mon Sep 17 00:00:00 2001 From: Samiullah Khawaja Date: Sun, 30 Nov 2025 05:16:49 +0000 Subject: [PATCH 09/18] iommu/vt-d: Implement device and iommu preserve/unpreserve ops ANBZ: #26808 cherry-picked from https://lore.kernel.org/all/20260427175633.1978233-8-skhawaja@google.com/ Add implementation of the device and iommu preservation in a separate file. Also set the device and iommu preserve/unpreserve ops in the struct iommu_ops. During normal shutdown the iommu translation is disabled. Since the root table is preserved during live update, it needs to be cleaned up and the context entries of the unpreserved devices need to be cleared.
Signed-off-by: Samiullah Khawaja Signed-off-by: Zelin Deng --- MAINTAINERS | 1 + drivers/iommu/intel/Makefile | 1 + drivers/iommu/intel/iommu.c | 52 +++++++++++- drivers/iommu/intel/iommu.h | 28 +++++++ drivers/iommu/intel/liveupdate.c | 139 +++++++++++++++++++++++++++++++ drivers/iommu/iommu.c | 18 ++++ include/linux/iommu-liveupdate.h | 10 +++ include/linux/iommu.h | 14 ++++ include/linux/kho/abi/iommu.h | 18 ++++ 9 files changed, 277 insertions(+), 4 deletions(-) create mode 100644 drivers/iommu/intel/liveupdate.c diff --git a/MAINTAINERS b/MAINTAINERS index 1e368a45dd12..4904c661d8bd 100644 --- a/MAINTAINERS +++ b/MAINTAINERS @@ -11057,6 +11057,7 @@ M: Samiullah Khawaja R: Pranjal Shrivastava L: iommu@lists.linux.dev S: Maintained +F: drivers/iommu/intel/liveupdate.c F: drivers/iommu/liveupdate.c F: include/linux/iommu-liveupdate.h F: include/linux/kho/abi/iommu.h diff --git a/drivers/iommu/intel/Makefile b/drivers/iommu/intel/Makefile index 6c7528130cf9..d26f8e8ad852 100644 --- a/drivers/iommu/intel/Makefile +++ b/drivers/iommu/intel/Makefile @@ -9,3 +9,4 @@ ifdef CONFIG_INTEL_IOMMU obj-$(CONFIG_IRQ_REMAP) += irq_remapping.o endif obj-$(CONFIG_INTEL_IOMMU_PERF_EVENTS) += perfmon.o +obj-$(CONFIG_IOMMU_LIVEUPDATE) += liveupdate.o diff --git a/drivers/iommu/intel/iommu.c b/drivers/iommu/intel/iommu.c index 853cb2cccde0..ec526970911b 100644 --- a/drivers/iommu/intel/iommu.c +++ b/drivers/iommu/intel/iommu.c @@ -16,6 +16,7 @@ #include #include #include +#include #include #include #include @@ -50,6 +51,8 @@ static int intel_iommu_set_dirty_tracking(struct iommu_domain *domain, bool enable); static int rwbf_quirk; +static void clear_unpreserved_context_entries(struct intel_iommu *iommu); + /* * set to 1 to panic kernel if can't successfully enable VT-d * (used when kernel is launched w/ TXT) @@ -58,8 +61,6 @@ static int force_on = 0; static int intel_iommu_tboot_noforce; static int no_platform_optin; -#define ROOT_ENTRY_NR (VTD_PAGE_SIZE/sizeof(struct root_entry)) - 
/* * Take a root_entry and return the Lower Context Table Pointer (LCTP) * if marked present. @@ -2401,8 +2402,11 @@ void intel_iommu_shutdown(void) /* Disable PMRs explicitly here. */ iommu_disable_protect_mem_regions(iommu); - /* Make sure the IOMMUs are switched off */ - iommu_disable_translation(iommu); + /* Make sure the IOMMUs are switched off if not preserved. */ + if (iommu_preserved_state(&iommu->iommu)) + clear_unpreserved_context_entries(iommu); + else + iommu_disable_translation(iommu); } } @@ -2961,6 +2965,41 @@ static const struct iommu_dirty_ops intel_second_stage_dirty_ops = { .set_dirty_tracking = intel_iommu_set_dirty_tracking, }; +#ifdef CONFIG_IOMMU_LIVEUPDATE +static int clear_unpreserve_context_entry_fn(struct device *dev, + struct iommu_device *iommu, + void *arg) +{ + struct device_domain_info *info; + + info = dev_iommu_priv_get(dev); + if (!info) + return 0; + + if (dev_is_pci(dev) && dev_iommu_preserved_state(dev)) + return 0; + + domain_context_clear(info); + return 0; +} + +static void clear_unpreserved_context_entries(struct intel_iommu *iommu) +{ + struct iommu_dev_iter iter = { + .fn = clear_unpreserve_context_entry_fn, + .iommu = &iommu->iommu, + .arg = NULL, + + }; + + iommu_for_each_dev(&iter); +} +#else +static void clear_unpreserved_context_entries(struct intel_iommu *iommu) +{ +} +#endif + static struct iommu_domain * intel_iommu_domain_alloc_second_stage(struct device *dev, struct intel_iommu *iommu, u32 flags) @@ -3967,6 +4006,11 @@ const struct iommu_ops intel_iommu_ops = { .is_attach_deferred = intel_iommu_is_attach_deferred, .def_domain_type = device_def_domain_type, .page_response = intel_iommu_page_response, +#ifdef CONFIG_IOMMU_LIVEUPDATE + .preserve_device = intel_iommu_preserve_device, + .preserve = intel_iommu_preserve, + .unpreserve = intel_iommu_unpreserve, +#endif }; static void quirk_iommu_igfx(struct pci_dev *dev) diff --git a/drivers/iommu/intel/iommu.h b/drivers/iommu/intel/iommu.h index 
c6eb0227e33b..bea02c37ce86 100644 --- a/drivers/iommu/intel/iommu.h +++ b/drivers/iommu/intel/iommu.h @@ -563,6 +563,8 @@ struct root_entry { u64 hi; }; +#define ROOT_ENTRY_NR (VTD_PAGE_SIZE / sizeof(struct root_entry)) + /* * low 64 bits: * 0: present @@ -1300,6 +1302,32 @@ static inline int iopf_for_domain_replace(struct iommu_domain *new, return 0; } +#ifdef CONFIG_IOMMU_LIVEUPDATE +int intel_iommu_preserve_device(struct device *dev, + struct iommu_device_ser *device_ser); +int intel_iommu_preserve(struct iommu_device *iommu, + struct iommu_hw_ser *iommu_ser); +void intel_iommu_unpreserve(struct iommu_device *iommu, + struct iommu_hw_ser *iommu_ser); +#else +static inline int intel_iommu_preserve_device(struct device *dev, + struct iommu_device_ser *device_ser) +{ + return -EOPNOTSUPP; +} + +static inline int intel_iommu_preserve(struct iommu_device *iommu, + struct iommu_hw_ser *iommu_ser) +{ + return -EOPNOTSUPP; +} + +static inline void intel_iommu_unpreserve(struct iommu_device *iommu, + struct iommu_hw_ser *iommu_ser) +{ +} +#endif + #ifdef CONFIG_INTEL_IOMMU_SVM void intel_svm_check(struct intel_iommu *iommu); struct iommu_domain *intel_svm_domain_alloc(struct device *dev, diff --git a/drivers/iommu/intel/liveupdate.c b/drivers/iommu/intel/liveupdate.c new file mode 100644 index 000000000000..75fa68b701bf --- /dev/null +++ b/drivers/iommu/intel/liveupdate.c @@ -0,0 +1,139 @@ +// SPDX-License-Identifier: GPL-2.0-only + +/* + * Copyright (C) 2026, Google LLC + * Author: Samiullah Khawaja + */ + +#define pr_fmt(fmt) "DMAR: liveupdate: " fmt + +#include +#include +#include +#include +#include + +#include "iommu.h" +#include "../iommu-pages.h" + +static void unpreserve_iommu_context_table(struct intel_iommu *iommu, int end) +{ + struct context_entry *context; + int i; + + for (i = 0; i < end; i++) { + context = iommu_context_addr(iommu, i, 0, 0); + if (context) + iommu_unpreserve_page(context); + + if (!sm_supported(iommu)) + continue; + + context = 
iommu_context_addr(iommu, i, 0x80, 0); + if (context) + iommu_unpreserve_page(context); + } +} + +static int preserve_iommu_context_table(struct intel_iommu *iommu) +{ + struct context_entry *context; + int ret; + int i; + + for (i = 0; i < ROOT_ENTRY_NR; i++) { + /* + * Alloc the context tables now to make sure the iommu unit is + * properly preserved. These might stay unused and waste around + * 32MB max in scalable mode. + */ + spin_lock(&iommu->lock); + context = iommu_context_addr(iommu, i, 0, 1); + spin_unlock(&iommu->lock); + if (!context) { + ret = -ENOMEM; + goto error; + } + ret = iommu_preserve_page(context); + if (ret) + goto error; + + if (!sm_supported(iommu)) + continue; + + spin_lock(&iommu->lock); + context = iommu_context_addr(iommu, i, 0x80, 1); + spin_unlock(&iommu->lock); + if (!context) { + ret = -ENOMEM; + goto error_sm; + } + ret = iommu_preserve_page(context); + if (ret) + goto error_sm; + } + + return 0; + +error_sm: + context = iommu_context_addr(iommu, i, 0, 0); + iommu_unpreserve_page(context); +error: + unpreserve_iommu_context_table(iommu, i); + return ret; +} + +int intel_iommu_preserve_device(struct device *dev, + struct iommu_device_ser *device_ser) +{ + struct device_domain_info *info = dev_iommu_priv_get(dev); + + if (!dev_is_pci(dev)) { + dev_err(dev, "Cannot preserve non-PCI device\n"); + return -EOPNOTSUPP; + } + + if (!info) + return -EINVAL; + + device_ser->domain_iommu_ser.attachment_id = domain_id_iommu(info->domain, + info->iommu); + return 0; +} + +int intel_iommu_preserve(struct iommu_device *iommu_dev, + struct iommu_hw_ser *ser) +{ + struct intel_iommu *iommu; + int ret; + + iommu = container_of(iommu_dev, struct intel_iommu, iommu); + + ret = preserve_iommu_context_table(iommu); + if (ret) + return ret; + + ret = iommu_preserve_page(iommu->root_entry); + if (ret) { + unpreserve_iommu_context_table(iommu, ROOT_ENTRY_NR); + return ret; + } + + ser->intel.phys_addr = iommu->reg_phys; + ser->intel.root_table = 
__pa(iommu->root_entry); + ser->type = IOMMU_INTEL; + ser->token = ser->intel.phys_addr; + + return 0; +} + +void intel_iommu_unpreserve(struct iommu_device *iommu_dev, + struct iommu_hw_ser *iommu_ser) +{ + struct intel_iommu *iommu; + + iommu = container_of(iommu_dev, struct intel_iommu, iommu); + + unpreserve_iommu_context_table(iommu, ROOT_ENTRY_NR); + iommu_unpreserve_page(iommu->root_entry); +} diff --git a/drivers/iommu/iommu.c b/drivers/iommu/iommu.c index 5a3da091a364..5c86126844ea 100644 --- a/drivers/iommu/iommu.c +++ b/drivers/iommu/iommu.c @@ -305,6 +305,24 @@ void iommu_device_unregister(struct iommu_device *iommu) } EXPORT_SYMBOL_GPL(iommu_device_unregister); +static int _iommu_for_each_dev_cb(struct device *dev, void *data) +{ + struct iommu_dev_iter *iter = data; + + if (dev->iommu && dev->iommu->iommu_dev == iter->iommu) + return iter->fn(dev, iter->iommu, iter->arg); + + return 0; +} + +void iommu_for_each_dev(struct iommu_dev_iter *iter) +{ + for (int i = 0; i < ARRAY_SIZE(iommu_buses); i++) + bus_for_each_dev(iommu_buses[i], NULL, iter, + _iommu_for_each_dev_cb); +} +EXPORT_SYMBOL_GPL(iommu_for_each_dev); + #if IS_ENABLED(CONFIG_IOMMUFD_TEST) void iommu_device_unregister_bus(struct iommu_device *iommu, const struct bus_type *bus, diff --git a/include/linux/iommu-liveupdate.h b/include/linux/iommu-liveupdate.h index 279c7ab04f09..c9d75c6b3be9 100644 --- a/include/linux/iommu-liveupdate.h +++ b/include/linux/iommu-liveupdate.h @@ -33,6 +33,11 @@ void iommu_domain_unpreserve(struct iommu_domain *domain); int iommu_preserve_device(struct iommu_domain *domain, struct device *dev, u64 *preserved_state); void iommu_unpreserve_device(struct iommu_domain *domain, struct device *dev); + +static inline void *iommu_preserved_state(struct iommu_device *iommu) +{ + return iommu->outgoing_preserved_state; +} #else static inline void *dev_iommu_preserved_state(struct device *dev) { @@ -57,6 +62,11 @@ static inline int iommu_preserve_device(struct iommu_domain 
*domain, static inline void iommu_unpreserve_device(struct iommu_domain *domain, struct device *dev) { } + +static inline void *iommu_preserved_state(struct iommu_device *iommu) +{ + return NULL; +} #endif int iommu_liveupdate_register_flb(struct liveupdate_file_handler *handler); diff --git a/include/linux/iommu.h b/include/linux/iommu.h index 9aa8f237d7d7..9c41efb69946 100644 --- a/include/linux/iommu.h +++ b/include/linux/iommu.h @@ -1233,6 +1233,20 @@ static inline void *dev_iommu_priv_get(struct device *dev) void dev_iommu_priv_set(struct device *dev, void *priv); +typedef int (*iommu_dev_iter_fn)(struct device *dev, + struct iommu_device *iommu, void *arg); + +/** + * struct iommu_dev_iter - Iterator for devices attached to an IOMMU + */ +struct iommu_dev_iter { + struct iommu_device *iommu; + iommu_dev_iter_fn fn; + void *arg; +}; + +void iommu_for_each_dev(struct iommu_dev_iter *iter); + extern struct mutex iommu_probe_device_lock; int iommu_probe_device(struct device *dev); diff --git a/include/linux/kho/abi/iommu.h b/include/linux/kho/abi/iommu.h index c7fab98dd933..238926429361 100644 --- a/include/linux/kho/abi/iommu.h +++ b/include/linux/kho/abi/iommu.h @@ -73,6 +73,7 @@ enum iommu_type_ser { IOMMU_INVALID, + IOMMU_INTEL, }; /** @@ -132,16 +133,33 @@ struct iommu_device_ser { struct iommu_dev_map_ser domain_iommu_ser; } __packed; +/** + * struct iommu_intel_ser - Serialized state of an Intel IOMMU instance + * @restored: Whether IOMMU state is restored + * @phys_addr: Physical address of the IOMMU register base + * @root_table: Physical address of the root entry table + */ +struct iommu_intel_ser { + u8 restored; + u8 padding[7]; + u64 phys_addr; + u64 root_table; +}; + /** * struct iommu_hw_ser - Serialized state of an IOMMU instance * @hdr: Common object header * @token: Unique token for the IOMMU * @type: IOMMU type serialized state belongs to + * @intel: Intel specific serialization data */ struct iommu_hw_ser { struct iommu_hdr_ser hdr; u64 token; 
u64 type; + union { + struct iommu_intel_ser intel; + }; } __packed; /** -- Gitee From 0cc42de19d7b8f499e910e81be16868a2926047e Mon Sep 17 00:00:00 2001 From: Samiullah Khawaja Date: Sun, 30 Nov 2025 03:57:29 +0000 Subject: [PATCH 10/18] iommu: Add APIs to get iommu and device preserved state ANBZ: #26808 cherry-picked from https://lore.kernel.org/all/20260427175633.1978233-9-skhawaja@google.com/ The preserved state of the device and IOMMU needs to be fetched during shutdown and boot in the next kernel. Add APIs that can be used to fetch the preserved state of a device and IOMMU. The APIs will only be used during shutdown and after liveupdate so no locking needed. Signed-off-by: Samiullah Khawaja Signed-off-by: Zelin Deng --- drivers/iommu/liveupdate.c | 57 ++++++++++++++++++++++++++++++++ include/linux/iommu-liveupdate.h | 31 +++++++++++++++++ 2 files changed, 88 insertions(+) diff --git a/drivers/iommu/liveupdate.c b/drivers/iommu/liveupdate.c index 21ea52e234e0..1af033f6a18e 100644 --- a/drivers/iommu/liveupdate.c +++ b/drivers/iommu/liveupdate.c @@ -17,6 +17,14 @@ #define iommu_max_objs_per_page(_array) \ ((PAGE_SIZE - sizeof(struct iommu_array_hdr_ser)) / sizeof((_array)->objects[0])) +#define iommu_liveupdate_for_each_obj(_arr, _obj, _idx) \ + for (; (_arr); \ + (_arr) = (_arr)->hdr.next_array_phys ? 
\ + phys_to_virt((_arr)->hdr.next_array_phys) : NULL) \ + for ((_idx) = 0, (_obj) = (_arr)->objects; \ + (_idx) < (_arr)->hdr.nr_objects; (_idx)++, (_obj)++) \ + if (!(_obj)->hdr.deleted) + static void *iommu_liveupdate_restore_array(u64 array_phys) { struct iommu_array_hdr_ser *array_hdr; @@ -201,6 +209,55 @@ void iommu_liveupdate_unregister_flb(struct liveupdate_file_handler *handler) } EXPORT_SYMBOL(iommu_liveupdate_unregister_flb); +int iommu_for_each_preserved_device(iommu_preserved_device_iter_fn fn, + void *arg) +{ + struct iommu_flb_obj *flb_obj; + struct iommu_device_array_ser *array; + struct iommu_device_ser *device_ser; + int ret, idx; + + ret = liveupdate_flb_get_incoming(&iommu_flb, (void **)&flb_obj); + if (ret) + return -ENOENT; + + array = phys_to_virt(flb_obj->ser->device_array_phys); + iommu_liveupdate_for_each_obj(array, device_ser, idx) { + ret = fn(device_ser, arg); + if (ret) + goto out; + } + +out: + liveupdate_flb_put_incoming(&iommu_flb); + return ret; +} +EXPORT_SYMBOL(iommu_for_each_preserved_device); + +struct iommu_hw_ser *iommu_get_preserved_data(u64 token, enum iommu_type_ser type) +{ + struct iommu_hw_ser *iommu_ser = NULL; + struct iommu_hw_array_ser *array; + struct iommu_flb_obj *flb_obj; + int ret, idx; + + ret = liveupdate_flb_get_incoming(&iommu_flb, (void **)&flb_obj); + if (ret) + return NULL; + + array = phys_to_virt(flb_obj->ser->iommu_array_phys); + iommu_liveupdate_for_each_obj(array, iommu_ser, idx) { + if (iommu_ser->token == token && iommu_ser->type == type) + goto out; + } + + iommu_ser = NULL; +out: + liveupdate_flb_put_incoming(&iommu_flb); + return iommu_ser; +} +EXPORT_SYMBOL(iommu_get_preserved_data); + static int alloc_object_ser(struct iommu_array_hdr_ser **curr_array_ptr, u64 max_objs) { struct iommu_array_hdr_ser *curr_array = *curr_array_ptr; diff --git a/include/linux/iommu-liveupdate.h b/include/linux/iommu-liveupdate.h index c9d75c6b3be9..0baf6bc2d93f 100644 --- a/include/linux/iommu-liveupdate.h +++ 
b/include/linux/iommu-liveupdate.h @@ -13,6 +13,8 @@ #include #include +typedef int (*iommu_preserved_device_iter_fn)(struct iommu_device_ser *ser, + void *arg); #ifdef CONFIG_IOMMU_LIVEUPDATE static inline void *dev_iommu_preserved_state(struct device *dev) { @@ -28,6 +30,20 @@ static inline void *dev_iommu_preserved_state(struct device *dev) return NULL; } +static inline void *iommu_domain_restored_state(struct iommu_domain *domain) +{ + struct iommu_domain_ser *ser; + + ser = domain->preserved_state; + if (ser && ser->hdr.incoming) + return ser; + + return NULL; +} + +int iommu_for_each_preserved_device(iommu_preserved_device_iter_fn fn, + void *arg); +struct iommu_hw_ser *iommu_get_preserved_data(u64 token, enum iommu_type_ser type); int iommu_domain_preserve(struct iommu_domain *domain, struct iommu_domain_ser **ser); void iommu_domain_unpreserve(struct iommu_domain *domain); int iommu_preserve_device(struct iommu_domain *domain, @@ -44,6 +60,21 @@ static inline void *dev_iommu_preserved_state(struct device *dev) return NULL; } +static inline void *iommu_domain_restored_state(struct iommu_domain *domain) +{ + return NULL; +} + +static inline int iommu_for_each_preserved_device(iommu_preserved_device_iter_fn fn, void *arg) +{ + return -EOPNOTSUPP; +} + +static inline struct iommu_hw_ser *iommu_get_preserved_data(u64 token, enum iommu_type_ser type) +{ + return NULL; +} + static inline int iommu_domain_preserve(struct iommu_domain *domain, struct iommu_domain_ser **ser) { return -EOPNOTSUPP; -- Gitee From e85132a406ec085108fcd502f55ee19bfac2808d Mon Sep 17 00:00:00 2001 From: Samiullah Khawaja Date: Sun, 30 Nov 2025 22:36:42 +0000 Subject: [PATCH 11/18] iommu/vt-d: Restore IOMMU state and reclaimed domain ids ANBZ: #26808 cherry-picked from https://lore.kernel.org/all/20260427175633.1978233-10-skhawaja@google.com/ During boot fetch the preserved state of IOMMU unit and if found then restore the state. 
- Reuse the root_table that was preserved in the previous kernel. - Reclaim the domain ids of the preserved domains for each preserved device so that they are not acquired by another domain. Signed-off-by: Samiullah Khawaja Signed-off-by: Zelin Deng --- drivers/iommu/intel/iommu.c | 55 ++++++++++++++++++++++-------- drivers/iommu/intel/iommu.h | 7 ++++ drivers/iommu/intel/liveupdate.c | 57 ++++++++++++++++++++++++++++++++ 3 files changed, 105 insertions(+), 14 deletions(-) diff --git a/drivers/iommu/intel/iommu.c b/drivers/iommu/intel/iommu.c index ec526970911b..f077c8aab4fe 100644 --- a/drivers/iommu/intel/iommu.c +++ b/drivers/iommu/intel/iommu.c @@ -668,10 +668,17 @@ void dmar_fault_dump_ptes(struct intel_iommu *iommu, u16 source_id, #endif /* iommu handling */ -static int iommu_alloc_root_entry(struct intel_iommu *iommu) +static int iommu_alloc_root_entry(struct intel_iommu *iommu, + struct iommu_hw_ser *iommu_ser) { struct root_entry *root; + if (iommu_ser) { + intel_iommu_liveupdate_restore_root_table(iommu, iommu_ser); + __iommu_flush_cache(iommu, iommu->root_entry, ROOT_SIZE); + return 0; + } + root = iommu_alloc_pages_node_sz(iommu->node, GFP_ATOMIC, SZ_4K); if (!root) { pr_err("Allocating root entry for %s failed\n", @@ -990,15 +997,16 @@ static void disable_dmar_iommu(struct intel_iommu *iommu) iommu_disable_translation(iommu); } -static void free_dmar_iommu(struct intel_iommu *iommu) +static void free_dmar_iommu(struct intel_iommu *iommu, struct iommu_hw_ser *iommu_ser) { if (iommu->copied_tables) { bitmap_free(iommu->copied_tables); iommu->copied_tables = NULL; } - /* free context mapping */ - free_context_table(iommu); + /* free context mapping if there is no serialized state. 
*/ + if (!iommu_ser) + free_context_table(iommu); if (ecap_prs(iommu->ecap)) intel_iommu_finish_prq(iommu); @@ -1623,6 +1631,7 @@ static int copy_translation_tables(struct intel_iommu *iommu) static int __init init_dmars(void) { + struct iommu_hw_ser *iommu_ser = NULL; struct dmar_drhd_unit *drhd; struct intel_iommu *iommu; int ret; @@ -1645,8 +1654,12 @@ static int __init init_dmars(void) intel_pasid_max_id); } + iommu_ser = iommu_get_preserved_data(iommu->reg_phys, IOMMU_INTEL); + intel_iommu_init_qi(iommu); - init_translation_status(iommu); + + if (!iommu_ser) + init_translation_status(iommu); if (translation_pre_enabled(iommu) && !is_kdump_kernel()) { iommu_disable_translation(iommu); @@ -1660,7 +1673,7 @@ static int __init init_dmars(void) * we could share the same root & context tables * among all IOMMU's. Need to Split it later. */ - ret = iommu_alloc_root_entry(iommu); + ret = iommu_alloc_root_entry(iommu, iommu_ser); if (ret) goto free_iommu; @@ -1744,8 +1757,12 @@ static int __init init_dmars(void) free_iommu: for_each_active_iommu(iommu, drhd) { - disable_dmar_iommu(iommu); - free_dmar_iommu(iommu); + iommu_ser = iommu_get_preserved_data(iommu->reg_phys, IOMMU_INTEL); + + if (!iommu_ser) + disable_dmar_iommu(iommu); + + free_dmar_iommu(iommu, iommu_ser); } return ret; @@ -2115,15 +2132,19 @@ int dmar_parse_one_satc(struct acpi_dmar_header *hdr, void *arg) static int intel_iommu_add(struct dmar_drhd_unit *dmaru) { struct intel_iommu *iommu = dmaru->iommu; + struct iommu_hw_ser *iommu_ser = NULL; int ret; + /* Use IOMMU HW unit MMIO base to identify the preserved state. */ + iommu_ser = iommu_get_preserved_data(iommu->reg_phys, IOMMU_INTEL); + /* * Disable translation if already enabled prior to OS handover. 
*/ - if (iommu->gcmd & DMA_GCMD_TE) + if (!iommu_ser && iommu->gcmd & DMA_GCMD_TE) iommu_disable_translation(iommu); - ret = iommu_alloc_root_entry(iommu); + ret = iommu_alloc_root_entry(iommu, iommu_ser); if (ret) goto out; @@ -2158,9 +2179,10 @@ static int intel_iommu_add(struct dmar_drhd_unit *dmaru) return 0; disable_iommu: - disable_dmar_iommu(iommu); + if (!iommu_ser) + disable_dmar_iommu(iommu); out: - free_dmar_iommu(iommu); + free_dmar_iommu(iommu, iommu_ser); return ret; } @@ -2168,6 +2190,7 @@ int dmar_iommu_hotplug(struct dmar_drhd_unit *dmaru, bool insert) { int ret = 0; struct intel_iommu *iommu = dmaru->iommu; + struct iommu_hw_ser *iommu_ser; if (!intel_iommu_enabled) return 0; @@ -2177,8 +2200,12 @@ int dmar_iommu_hotplug(struct dmar_drhd_unit *dmaru, bool insert) if (insert) { ret = intel_iommu_add(dmaru); } else { - disable_dmar_iommu(iommu); - free_dmar_iommu(iommu); + iommu_ser = iommu_get_preserved_data(iommu->reg_phys, IOMMU_INTEL); + + if (!iommu_ser) + disable_dmar_iommu(iommu); + + free_dmar_iommu(iommu, iommu_ser); } return ret; diff --git a/drivers/iommu/intel/iommu.h b/drivers/iommu/intel/iommu.h index bea02c37ce86..4a8d29b0082c 100644 --- a/drivers/iommu/intel/iommu.h +++ b/drivers/iommu/intel/iommu.h @@ -1309,6 +1309,8 @@ int intel_iommu_preserve(struct iommu_device *iommu, struct iommu_hw_ser *iommu_ser); void intel_iommu_unpreserve(struct iommu_device *iommu, struct iommu_hw_ser *iommu_ser); +void intel_iommu_liveupdate_restore_root_table(struct intel_iommu *iommu, + struct iommu_hw_ser *iommu_ser); #else static inline int intel_iommu_preserve_device(struct device *dev, struct iommu_device_ser *device_ser) @@ -1326,6 +1328,11 @@ static inline void intel_iommu_unpreserve(struct iommu_device *iommu, struct iommu_hw_ser *iommu_ser) { } + +static inline void intel_iommu_liveupdate_restore_root_table(struct intel_iommu *iommu, + struct iommu_hw_ser *iommu_ser) +{ +} #endif #ifdef CONFIG_INTEL_IOMMU_SVM diff --git 
a/drivers/iommu/intel/liveupdate.c b/drivers/iommu/intel/liveupdate.c index 75fa68b701bf..50a63812533f 100644 --- a/drivers/iommu/intel/liveupdate.c +++ b/drivers/iommu/intel/liveupdate.c @@ -83,6 +83,63 @@ static int preserve_iommu_context_table(struct intel_iommu *iommu) return ret; } +static void restore_iommu_context(struct intel_iommu *iommu) +{ + struct context_entry *context; + int i; + + for (i = 0; i < ROOT_ENTRY_NR; i++) { + context = iommu_context_addr(iommu, i, 0, 0); + if (context) + BUG_ON(!kho_restore_folio(virt_to_phys(context))); + + if (!sm_supported(iommu)) + continue; + + context = iommu_context_addr(iommu, i, 0x80, 0); + if (context) + BUG_ON(!kho_restore_folio(virt_to_phys(context))); + } +} + +static int _restore_used_domain_ids(struct iommu_device_ser *ser, void *arg) +{ + int id = ser->domain_iommu_ser.attachment_id; + struct iommu_hw_ser *iommu_hw_ser; + struct intel_iommu *iommu = arg; + + iommu_hw_ser = phys_to_virt(ser->domain_iommu_ser.iommu_phys); + if (iommu_hw_ser->type != IOMMU_INTEL) + return 0; + + /* Only allocate domain ID from associated IOMMU HW unit */ + if (iommu_hw_ser->intel.phys_addr != iommu->reg_phys) + return 0; + + /* + * This can fail as multiple preserved devices can share the same domain + * ID. Since this is done during DMAR init, these failures can be + * ignored. 
+ */ + ida_alloc_range(&iommu->domain_ida, id, id, GFP_ATOMIC); + return 0; +} + +void intel_iommu_liveupdate_restore_root_table(struct intel_iommu *iommu, + struct iommu_hw_ser *iommu_ser) +{ + if (!iommu_ser->intel.restored) + BUG_ON(!kho_restore_folio(iommu_ser->intel.root_table)); + + iommu->root_entry = __va(iommu_ser->intel.root_table); + + if (!iommu_ser->intel.restored) + restore_iommu_context(iommu); + + iommu_ser->intel.restored = 1; + iommu_for_each_preserved_device(_restore_used_domain_ids, iommu); +} + int intel_iommu_preserve_device(struct device *dev, struct iommu_device_ser *device_ser) { -- Gitee From 920e6ee576c83b4c0c296fbec408b944c4482e2f Mon Sep 17 00:00:00 2001 From: Samiullah Khawaja Date: Thu, 29 Jan 2026 18:33:01 +0000 Subject: [PATCH 12/18] iommu: Restore and reattach preserved domains to devices ANBZ: #26808 cherry-picked from https://lore.kernel.org/all/20260427175633.1978233-11-skhawaja@google.com/ Restore the preserved domains by restoring the page tables using restore IOMMU domain op. Reattach the preserved domain to the device during default domain setup. While attaching, reuse the domain ID that was used in the previous kernel. The context entry setup is not needed as that is preserved during liveupdate. 
Signed-off-by: Samiullah Khawaja Signed-off-by: Zelin Deng --- drivers/iommu/intel/iommu.c | 49 ++++++++++++++------ drivers/iommu/intel/iommu.h | 3 +- drivers/iommu/intel/nested.c | 2 +- drivers/iommu/iommu.c | 61 ++++++++++++++++++++++++- drivers/iommu/liveupdate.c | 78 ++++++++++++++++++++++++++++++++ include/linux/iommu-liveupdate.h | 50 ++++++++++++++++++++ 6 files changed, 224 insertions(+), 19 deletions(-) diff --git a/drivers/iommu/intel/iommu.c b/drivers/iommu/intel/iommu.c index f077c8aab4fe..0ac10a6c8a1a 100644 --- a/drivers/iommu/intel/iommu.c +++ b/drivers/iommu/intel/iommu.c @@ -1029,7 +1029,8 @@ static bool first_level_by_default(struct intel_iommu *iommu) return true; } -int domain_attach_iommu(struct dmar_domain *domain, struct intel_iommu *iommu) +int domain_attach_iommu(struct dmar_domain *domain, struct intel_iommu *iommu, + int restore_did) { struct iommu_domain_info *info, *curr; int num, ret = -ENOSPC; @@ -1049,8 +1050,11 @@ int domain_attach_iommu(struct dmar_domain *domain, struct intel_iommu *iommu) return 0; } - num = ida_alloc_range(&iommu->domain_ida, IDA_START_DID, - cap_ndoms(iommu->cap) - 1, GFP_KERNEL); + if (restore_did >= IDA_START_DID) + num = restore_did; + else + num = ida_alloc_range(&iommu->domain_ida, IDA_START_DID, + cap_ndoms(iommu->cap) - 1, GFP_KERNEL); if (num < 0) { pr_err("%s: No free domain ids\n", iommu->name); goto err_unlock; @@ -1330,10 +1334,14 @@ static int dmar_domain_attach_device(struct dmar_domain *domain, { struct device_domain_info *info = dev_iommu_priv_get(dev); struct intel_iommu *iommu = info->iommu; + struct device_ser *device_ser = NULL; unsigned long flags; int ret; - ret = domain_attach_iommu(domain, iommu); + device_ser = dev_iommu_restored_state(dev); + + ret = domain_attach_iommu(domain, iommu, + dev_iommu_restore_did(dev, &domain->domain)); if (ret) return ret; @@ -1346,16 +1354,18 @@ static int dmar_domain_attach_device(struct dmar_domain *domain, if (dev_is_real_dma_subdevice(dev)) return 0; 
- if (!sm_supported(iommu)) - ret = domain_context_mapping(domain, dev); - else if (intel_domain_is_fs_paging(domain)) - ret = domain_setup_first_level(iommu, domain, dev, - IOMMU_NO_PASID, NULL); - else if (intel_domain_is_ss_paging(domain)) - ret = domain_setup_second_level(iommu, domain, dev, - IOMMU_NO_PASID, NULL); - else if (WARN_ON(true)) - ret = -EINVAL; + if (!device_ser) { + if (!sm_supported(iommu)) + ret = domain_context_mapping(domain, dev); + else if (intel_domain_is_fs_paging(domain)) + ret = domain_setup_first_level(iommu, domain, dev, + IOMMU_NO_PASID, NULL); + else if (intel_domain_is_ss_paging(domain)) + ret = domain_setup_second_level(iommu, domain, dev, + IOMMU_NO_PASID, NULL); + else if (WARN_ON(true)) + ret = -EINVAL; + } if (ret) goto out_block_translation; @@ -3215,6 +3225,15 @@ int paging_domain_compatible(struct iommu_domain *domain, struct device *dev) struct intel_iommu *iommu = info->iommu; int ret = -EINVAL; +#ifdef CONFIG_IOMMU_LIVEUPDATE + /* + * Restored IOMMU domains are already attached to the device and can + * only be freed. So no need to check the compatibility. 
+ */ + if (iommu_domain_restored_state(domain)) + return 0; +#endif + if (intel_domain_is_fs_paging(dmar_domain)) ret = paging_domain_compatible_first_stage(dmar_domain, iommu); else if (intel_domain_is_ss_paging(dmar_domain)) @@ -3691,7 +3710,7 @@ domain_add_dev_pasid(struct iommu_domain *domain, if (!dev_pasid) return ERR_PTR(-ENOMEM); - ret = domain_attach_iommu(dmar_domain, iommu); + ret = domain_attach_iommu(dmar_domain, iommu, -1); if (ret) goto out_free; diff --git a/drivers/iommu/intel/iommu.h b/drivers/iommu/intel/iommu.h index 4a8d29b0082c..873fd291e7af 100644 --- a/drivers/iommu/intel/iommu.h +++ b/drivers/iommu/intel/iommu.h @@ -1198,7 +1198,8 @@ void __iommu_flush_iotlb(struct intel_iommu *iommu, u16 did, u64 addr, */ #define QI_OPT_WAIT_DRAIN BIT(0) -int domain_attach_iommu(struct dmar_domain *domain, struct intel_iommu *iommu); +int domain_attach_iommu(struct dmar_domain *domain, struct intel_iommu *iommu, + int restore_did); void domain_detach_iommu(struct dmar_domain *domain, struct intel_iommu *iommu); void device_block_translation(struct device *dev); int paging_domain_compatible(struct iommu_domain *domain, struct device *dev); diff --git a/drivers/iommu/intel/nested.c b/drivers/iommu/intel/nested.c index e9a440e9c960..879b614fc584 100644 --- a/drivers/iommu/intel/nested.c +++ b/drivers/iommu/intel/nested.c @@ -40,7 +40,7 @@ static int intel_nested_attach_dev(struct iommu_domain *domain, return ret; } - ret = domain_attach_iommu(dmar_domain, iommu); + ret = domain_attach_iommu(dmar_domain, iommu, -1); if (ret) { dev_err_ratelimited(dev, "Failed to attach domain to iommu\n"); return ret; diff --git a/drivers/iommu/iommu.c b/drivers/iommu/iommu.c index 5c86126844ea..d0eb3c10a35b 100644 --- a/drivers/iommu/iommu.c +++ b/drivers/iommu/iommu.c @@ -18,6 +18,7 @@ #include #include #include +#include #include #include #include @@ -503,6 +504,10 @@ static int iommu_init_device(struct device *dev) goto err_free; } +#ifdef CONFIG_IOMMU_LIVEUPDATE + 
dev->iommu->device_ser = iommu_get_device_preserved_data(dev); +#endif + iommu_dev = ops->probe_device(dev); if (IS_ERR(iommu_dev)) { ret = PTR_ERR(iommu_dev); @@ -2211,6 +2216,13 @@ static int __iommu_attach_device(struct iommu_domain *domain, ret = domain->ops->attach_dev(domain, dev, old); if (ret) return ret; + +#ifdef CONFIG_IOMMU_LIVEUPDATE + /* The associated state can be unset once restored. */ + if (dev_iommu_restored_state(dev)) + WRITE_ONCE(dev->iommu->device_ser, NULL); +#endif + dev->iommu->attach_deferred = 0; trace_attach_device_to_domain(dev); return 0; @@ -3123,6 +3135,47 @@ int iommu_fwspec_add_ids(struct device *dev, const u32 *ids, int num_ids) } EXPORT_SYMBOL_GPL(iommu_fwspec_add_ids); +static inline void *__iommu_group_restored_state(struct iommu_group *group) +{ + struct device *dev; + + dev = iommu_group_first_dev(group); + if (!dev_is_pci(dev)) + return NULL; + + return dev_iommu_restored_state(dev); +} + +static struct iommu_domain *__iommu_group_restore_domain(struct iommu_group *group) +{ + struct iommu_device_ser *device_ser; + struct iommu_domain *domain; + struct device *dev; + void *owner; + + lockdep_assert_held(&group->mutex); + dev = iommu_group_first_dev(group); + if (!dev_is_pci(dev)) + return NULL; + + device_ser = dev_iommu_restored_state(dev); + if (!device_ser) + return NULL; + + domain = iommu_restore_domain(dev, device_ser, &owner); + if (WARN_ON(IS_ERR(domain))) + return NULL; + + /* + * Ownership of groups with preserved devices is set during boot. These + * will be reclaimed later by the entity (iommufd) that preserved them. 
+ */ + WARN_ON(group->owner); + group->owner = owner; + group->owner_cnt = 1; + return domain; +} + /** * iommu_setup_default_domain - Set the default_domain for the group * @group: Group to change @@ -3137,8 +3190,8 @@ static int iommu_setup_default_domain(struct iommu_group *group, int target_type) { struct iommu_domain *old_dom = group->default_domain; + struct iommu_domain *dom, *restored_domain; struct group_device *gdev; - struct iommu_domain *dom; bool direct_failed; int req_type; int ret; @@ -3182,6 +3235,10 @@ static int iommu_setup_default_domain(struct iommu_group *group, /* We must set default_domain early for __iommu_device_set_domain */ group->default_domain = dom; if (!group->domain) { + if (__iommu_group_restored_state(group)) + restored_domain = __iommu_group_restore_domain(group); + else + restored_domain = dom; /* * Drivers are not allowed to fail the first domain attach. * The only way to recover from this is to fail attaching the @@ -3189,7 +3246,7 @@ static int iommu_setup_default_domain(struct iommu_group *group, * in group->default_domain so it is freed after. 
*/ ret = __iommu_group_set_domain_internal( - group, dom, IOMMU_SET_DOMAIN_MUST_SUCCEED); + group, restored_domain, IOMMU_SET_DOMAIN_MUST_SUCCEED); if (WARN_ON(ret)) goto out_free_old; } else { diff --git a/drivers/iommu/liveupdate.c b/drivers/iommu/liveupdate.c index 1af033f6a18e..f846f36a7b54 100644 --- a/drivers/iommu/liveupdate.c +++ b/drivers/iommu/liveupdate.c @@ -234,6 +234,41 @@ int iommu_for_each_preserved_device(iommu_preserved_device_iter_fn fn, } EXPORT_SYMBOL(iommu_for_each_preserved_device); +static inline bool match_device_ser(struct iommu_device_ser *match, + struct pci_dev *pdev) +{ + return match->devid == pci_dev_id(pdev) && match->pci_domain_nr == pci_domain_nr(pdev->bus); +} + +struct iommu_device_ser *iommu_get_device_preserved_data(struct device *dev) +{ + struct iommu_device_ser *device_ser = NULL; + struct iommu_device_array_ser *array; + struct iommu_flb_obj *flb_obj; + int ret, idx; + + if (!dev_is_pci(dev)) + return NULL; + + ret = liveupdate_flb_get_incoming(&iommu_flb, (void **)&flb_obj); + if (ret) + return NULL; + + array = phys_to_virt(flb_obj->ser->device_array_phys); + iommu_liveupdate_for_each_obj(array, device_ser, idx) { + if (match_device_ser(device_ser, to_pci_dev(dev))) { + device_ser->hdr.incoming = true; + goto out; + } + } + + device_ser = NULL; +out: + liveupdate_flb_put_incoming(&iommu_flb); + return device_ser; +} +EXPORT_SYMBOL(iommu_get_device_preserved_data); + struct iommu_hw_ser *iommu_get_preserved_data(u64 token, enum iommu_type_ser type) { struct iommu_hw_ser *iommu_ser = NULL; @@ -512,3 +547,46 @@ void iommu_unpreserve_device(struct iommu_domain *domain, struct device *dev) iommu_unpreserve_locked(iommu->iommu_dev, flb_obj); } + +struct iommu_domain *iommu_restore_domain(struct device *dev, + struct iommu_device_ser *ser, + void **owner) +{ + struct iommu_domain_ser *domain_ser; + struct iommu_flb_obj *flb_obj; + struct iommu_domain *domain; + int ret; + + domain_ser = 
phys_to_virt(ser->domain_iommu_ser.domain_phys); + + ret = liveupdate_flb_get_incoming(&iommu_flb, (void **)&flb_obj); + if (ret) + return ERR_PTR(ret); + + guard(mutex)(&flb_obj->lock); + if (domain_ser->restored_domain) { + domain = domain_ser->restored_domain; + goto out; + } + + domain_ser->hdr.incoming = true; + domain = iommu_paging_domain_alloc(dev); + if (IS_ERR(domain)) + goto out; + + ret = domain->ops->restore(domain, domain_ser); + if (ret) { + iommu_domain_free(domain); + domain = ERR_PTR(ret); + goto out; + } + + /* The device is owned by the preserved state. */ + *owner = ser; + domain->preserved_state = domain_ser; + domain_ser->restored_domain = domain; + +out: + liveupdate_flb_put_incoming(&iommu_flb); + return domain; +} diff --git a/include/linux/iommu-liveupdate.h b/include/linux/iommu-liveupdate.h index 0baf6bc2d93f..75d27256c883 100644 --- a/include/linux/iommu-liveupdate.h +++ b/include/linux/iommu-liveupdate.h @@ -30,6 +30,20 @@ static inline void *dev_iommu_preserved_state(struct device *dev) return NULL; } +static inline void *dev_iommu_restored_state(struct device *dev) +{ + struct iommu_device_ser *ser; + + if (!dev->iommu) + return NULL; + + ser = dev->iommu->device_ser; + if (ser && ser->hdr.incoming) + return ser; + + return NULL; +} + static inline void *iommu_domain_restored_state(struct iommu_domain *domain) { struct iommu_domain_ser *ser; @@ -41,8 +55,22 @@ static inline void *iommu_domain_restored_state(struct iommu_domain *domain) return NULL; } +static inline int dev_iommu_restore_did(struct device *dev, struct iommu_domain *domain) +{ + struct iommu_device_ser *ser = dev_iommu_restored_state(dev); + + if (ser && iommu_domain_restored_state(domain)) + return ser->domain_iommu_ser.attachment_id; + + return -1; +} + +struct iommu_domain *iommu_restore_domain(struct device *dev, + struct iommu_device_ser *ser, + void **owner); int iommu_for_each_preserved_device(iommu_preserved_device_iter_fn fn, void *arg); +struct 
iommu_device_ser *iommu_get_device_preserved_data(struct device *dev); struct iommu_hw_ser *iommu_get_preserved_data(u64 token, enum iommu_type_ser type); int iommu_domain_preserve(struct iommu_domain *domain, struct iommu_domain_ser **ser); void iommu_domain_unpreserve(struct iommu_domain *domain); @@ -60,16 +88,38 @@ static inline void *dev_iommu_preserved_state(struct device *dev) return NULL; } +static inline void *dev_iommu_restored_state(struct device *dev) +{ + return NULL; +} + +static inline int dev_iommu_restore_did(struct device *dev, struct iommu_domain *domain) +{ + return -1; +} + static inline void *iommu_domain_restored_state(struct iommu_domain *domain) { return NULL; } +static inline struct iommu_domain *iommu_restore_domain(struct device *dev, + struct iommu_device_ser *ser, + void **owner) +{ + return NULL; +} + static inline int iommu_for_each_preserved_device(iommu_preserved_device_iter_fn fn, void *arg) { return -EOPNOTSUPP; } +static inline struct iommu_device_ser *iommu_get_device_preserved_data(struct device *dev) +{ + return NULL; +} + static inline struct iommu_hw_ser *iommu_get_preserved_data(u64 token, enum iommu_type_ser type) { return NULL; -- Gitee From 6c53aa0c2f7f8ed59437beef8057ed49b4082f86 Mon Sep 17 00:00:00 2001 From: Samiullah Khawaja Date: Wed, 17 Dec 2025 00:38:31 +0000 Subject: [PATCH 13/18] iommu/vt-d: preserve PASID table of preserved device ANBZ: #26808 cherry-picked from https://lore.kernel.org/all/20260427175633.1978233-12-skhawaja@google.com/ In scalable mode the PASID table is used to fetch the io page tables. Preserve and restore the PASID table of the preserved devices. 
Signed-off-by: Samiullah Khawaja Signed-off-by: Zelin Deng --- drivers/iommu/intel/iommu.c | 5 +- drivers/iommu/intel/iommu.h | 12 +++ drivers/iommu/intel/liveupdate.c | 141 +++++++++++++++++++++++++++++++ drivers/iommu/intel/pasid.c | 7 +- drivers/iommu/intel/pasid.h | 9 ++ include/linux/kho/abi/iommu.h | 13 +++ 6 files changed, 184 insertions(+), 3 deletions(-) diff --git a/drivers/iommu/intel/iommu.c b/drivers/iommu/intel/iommu.c index 0ac10a6c8a1a..8061754fdafb 100644 --- a/drivers/iommu/intel/iommu.c +++ b/drivers/iommu/intel/iommu.c @@ -3013,8 +3013,10 @@ static int clear_unpreserve_context_entry_fn(struct device *dev, if (!info) return 0; - if (dev_is_pci(dev) && dev_iommu_preserved_state(dev)) + if (dev_is_pci(dev) && dev_iommu_preserved_state(dev)) { + pasid_cleanup_preserved_table(dev); return 0; + } domain_context_clear(info); return 0; @@ -4054,6 +4056,7 @@ const struct iommu_ops intel_iommu_ops = { .page_response = intel_iommu_page_response, #ifdef CONFIG_IOMMU_LIVEUPDATE .preserve_device = intel_iommu_preserve_device, + .unpreserve_device = intel_iommu_unpreserve_device, .preserve = intel_iommu_preserve, .unpreserve = intel_iommu_unpreserve, #endif diff --git a/drivers/iommu/intel/iommu.h b/drivers/iommu/intel/iommu.h index 873fd291e7af..a18726f1734b 100644 --- a/drivers/iommu/intel/iommu.h +++ b/drivers/iommu/intel/iommu.h @@ -1306,12 +1306,15 @@ static inline int iopf_for_domain_replace(struct iommu_domain *new, #ifdef CONFIG_IOMMU_LIVEUPDATE int intel_iommu_preserve_device(struct device *dev, struct iommu_device_ser *device_ser); +void intel_iommu_unpreserve_device(struct device *dev, + struct iommu_device_ser *device_ser); int intel_iommu_preserve(struct iommu_device *iommu, struct iommu_hw_ser *iommu_ser); void intel_iommu_unpreserve(struct iommu_device *iommu, struct iommu_hw_ser *iommu_ser); void intel_iommu_liveupdate_restore_root_table(struct intel_iommu *iommu, struct iommu_hw_ser *iommu_ser); +void pasid_cleanup_preserved_table(struct 
device *dev); #else static inline int intel_iommu_preserve_device(struct device *dev, struct iommu_device_ser *device_ser) @@ -1319,6 +1322,11 @@ static inline int intel_iommu_preserve_device(struct device *dev, return -EOPNOTSUPP; } +static inline void intel_iommu_unpreserve_device(struct device *dev, + struct iommu_device_ser *device_ser) +{ +} + static inline int intel_iommu_preserve(struct iommu_device *iommu, struct iommu_hw_ser *iommu_ser) { @@ -1334,6 +1342,10 @@ static inline void intel_iommu_liveupdate_restore_root_table(struct intel_iommu struct iommu_hw_ser *iommu_ser) { } + +static inline void pasid_cleanup_preserved_table(struct device *dev) +{ +} #endif #ifdef CONFIG_INTEL_IOMMU_SVM diff --git a/drivers/iommu/intel/liveupdate.c b/drivers/iommu/intel/liveupdate.c index 50a63812533f..404b485e97b9 100644 --- a/drivers/iommu/intel/liveupdate.c +++ b/drivers/iommu/intel/liveupdate.c @@ -14,6 +14,7 @@ #include #include "iommu.h" +#include "pasid.h" #include "../iommu-pages.h" static void unpreserve_iommu_context_table(struct intel_iommu *iommu, int end) @@ -140,10 +141,96 @@ void intel_iommu_liveupdate_restore_root_table(struct intel_iommu *iommu, iommu_for_each_preserved_device(_restore_used_domain_ids, iommu); } +enum pasid_lu_op { + PASID_LU_OP_PRESERVE = 1, + PASID_LU_OP_UNPRESERVE, + PASID_LU_OP_RESTORE, + PASID_LU_OP_FREE, +}; + +static int pasid_lu_do_op(void *table, enum pasid_lu_op op) +{ + int ret = 0; + + switch (op) { + case PASID_LU_OP_PRESERVE: + ret = iommu_preserve_page(table); + break; + case PASID_LU_OP_UNPRESERVE: + iommu_unpreserve_page(table); + break; + case PASID_LU_OP_RESTORE: + iommu_restore_page(virt_to_phys(table)); + break; + case PASID_LU_OP_FREE: + iommu_free_pages(table); + break; + } + + return ret; +} + +static int pasid_lu_handle_pd(struct pasid_dir_entry *dir, enum pasid_lu_op op) +{ + struct pasid_entry *table; + int ret; + + /* Only preserve first table for NO_PASID. 
*/ + table = get_pasid_table_from_pde(&dir[0]); + if (!table) + return -EINVAL; + + ret = pasid_lu_do_op(table, op); + if (ret) + return ret; + + ret = pasid_lu_do_op(dir, op); + if (ret) + goto err; + + return 0; +err: + if (op == PASID_LU_OP_PRESERVE) + pasid_lu_do_op(table, PASID_LU_OP_UNPRESERVE); + + return ret; +} + +void pasid_cleanup_preserved_table(struct device *dev) +{ + struct pasid_table *pasid_table; + struct pasid_dir_entry *dir; + struct pasid_entry *table; + size_t dir_size; + + pasid_table = intel_pasid_get_table(dev); + if (!pasid_table) + return; + + dir = pasid_table->table; + table = get_pasid_table_from_pde(&dir[0]); + if (!table) + return; + + /* Clear everything except the first entry in table. */ + memset(&table[1], 0, SZ_4K - sizeof(*table)); + + /* Use the folio order to calculate the size of Pasid Directory */ + dir_size = (1 << (folio_order(virt_to_folio(dir)) + PAGE_SHIFT)); + + /* Clear everything except the first entry in directory */ + memset(&dir[1], 0, dir_size - sizeof(struct pasid_dir_entry)); + + clflush_cache_range(&table[0], SZ_4K); + clflush_cache_range(&dir[0], dir_size); +} + int intel_iommu_preserve_device(struct device *dev, struct iommu_device_ser *device_ser) { struct device_domain_info *info = dev_iommu_priv_get(dev); + struct pasid_table *pasid_table; + int ret; if (!dev_is_pci(dev)) { dev_err(dev, "Cannot preserve non-PCI device\n"); @@ -155,9 +242,45 @@ int intel_iommu_preserve_device(struct device *dev, device_ser->domain_iommu_ser.attachment_id = domain_id_iommu(info->domain, info->iommu); + + if (!sm_supported(info->iommu)) + return 0; + + pasid_table = intel_pasid_get_table(dev); + if (!pasid_table) + return -EINVAL; + + ret = pasid_lu_handle_pd(pasid_table->table, PASID_LU_OP_PRESERVE); + if (ret) + return ret; + + device_ser->intel.pasid_table = virt_to_phys(pasid_table->table); + device_ser->intel.max_pasid = pasid_table->max_pasid; return 0; } +void intel_iommu_unpreserve_device(struct device *dev, + 
struct iommu_device_ser *device_ser) +{ + struct device_domain_info *info = dev_iommu_priv_get(dev); + struct pasid_table *pasid_table; + + if (!dev_is_pci(dev)) + return; + + if (!info) + return; + + if (!sm_supported(info->iommu)) + return; + + pasid_table = intel_pasid_get_table(dev); + if (!pasid_table) + return; + + pasid_lu_handle_pd(pasid_table->table, PASID_LU_OP_UNPRESERVE); +} + int intel_iommu_preserve(struct iommu_device *iommu_dev, struct iommu_hw_ser *ser) { @@ -194,3 +317,21 @@ void intel_iommu_unpreserve(struct iommu_device *iommu_dev, unpreserve_iommu_context_table(iommu, ROOT_ENTRY_NR); iommu_unpreserve_page(iommu->root_entry); } + +void *intel_pasid_try_restore_table(struct device *dev, u64 max_pasid) +{ + struct iommu_device_ser *ser = dev_iommu_restored_state(dev); + + if (!ser) + return NULL; + + BUG_ON(pasid_lu_handle_pd(phys_to_virt(ser->intel.pasid_table), + PASID_LU_OP_RESTORE)); + if (WARN_ON_ONCE(ser->intel.max_pasid != max_pasid)) { + pasid_lu_handle_pd(phys_to_virt(ser->intel.pasid_table), + PASID_LU_OP_FREE); + return NULL; + } + + return phys_to_virt(ser->intel.pasid_table); +} diff --git a/drivers/iommu/intel/pasid.c b/drivers/iommu/intel/pasid.c index b63a71904cfb..cc9756300e46 100644 --- a/drivers/iommu/intel/pasid.c +++ b/drivers/iommu/intel/pasid.c @@ -60,8 +60,11 @@ int intel_pasid_alloc_table(struct device *dev) size = max_pasid >> (PASID_PDE_SHIFT - 3); order = size ? 
get_order(size) : 0; - dir = iommu_alloc_pages_node_sz(info->iommu->node, GFP_KERNEL, - 1 << (order + PAGE_SHIFT)); + + dir = intel_pasid_try_restore_table(dev, 1 << (order + PAGE_SHIFT + 3)); + if (!dir) + dir = iommu_alloc_pages_node_sz(info->iommu->node, GFP_KERNEL, + 1 << (order + PAGE_SHIFT)); if (!dir) { kfree(pasid_table); return -ENOMEM; diff --git a/drivers/iommu/intel/pasid.h b/drivers/iommu/intel/pasid.h index 48d3bb6b68de..44e673a4ad8f 100644 --- a/drivers/iommu/intel/pasid.h +++ b/drivers/iommu/intel/pasid.h @@ -301,6 +301,15 @@ static inline void pasid_set_eafe(struct pasid_entry *pe) extern unsigned int intel_pasid_max_id; int intel_pasid_alloc_table(struct device *dev); +#ifdef CONFIG_IOMMU_LIVEUPDATE +void *intel_pasid_try_restore_table(struct device *dev, u64 max_pasid); +#else +static inline void *intel_pasid_try_restore_table(struct device *dev, + u64 max_pasid) +{ + return NULL; +} +#endif void intel_pasid_free_table(struct device *dev); struct pasid_table *intel_pasid_get_table(struct device *dev); int intel_pasid_setup_first_level(struct intel_iommu *iommu, struct device *dev, diff --git a/include/linux/kho/abi/iommu.h b/include/linux/kho/abi/iommu.h index 238926429361..66314551396d 100644 --- a/include/linux/kho/abi/iommu.h +++ b/include/linux/kho/abi/iommu.h @@ -119,6 +119,16 @@ struct iommu_dev_map_ser { u64 iommu_phys; } __packed; +/** + * struct iommu_device_intel_ser - Intel specific state of serialized device + * @pasid_table: Physical address of pasid table + * @max_pasid: Maximum supported pasid + */ +struct iommu_device_intel_ser { + u64 pasid_table; + u64 max_pasid; +} __packed; + /** * struct iommu_device_ser - Serialized state of a device * @hdr: Common object header @@ -131,6 +141,9 @@ struct iommu_device_ser { u32 devid; u32 pci_domain_nr; struct iommu_dev_map_ser domain_iommu_ser; + union { + struct iommu_device_intel_ser intel; + }; } __packed; /** -- Gitee From 424d651b4a2edc237b4cb869999db013d9fe84df Mon Sep 17 00:00:00 
2001 From: YiFei Zhu Date: Tue, 8 Jul 2025 21:34:03 +0000 Subject: [PATCH 14/18] iommufd: Implement ioctl to mark HWPT for preservation ANBZ: #26808 cherry-picked from https://lore.kernel.org/all/20260427175633.1978233-13-skhawaja@google.com/ Userspace provides a token to mark the HWPT for preservation. Note that this token is not the LUO token that is used to preserve the iommufd. Once all the required HWPTs are marked for preservation, the user can preserve the iommufd into LUO. The iommufd will preserve the HWPTs that are marked for preservation. The marked HWPTs are tracked using a new XArray mark protected by a new liveupdate mutex. This mutex will also be used during iommufd preservation to protect against any race with the mark preserve ioctl. The HWPT token will be used during restore to identify this HWPT. The restoration logic is not implemented and will be added later. [Zelin Deng: Fix makefile issue to build iommufd.ko with liveupdate support --> should be iommufd-$(xxx) += liveupdate.o.]
Signed-off-by: YiFei Zhu Signed-off-by: Samiullah Khawaja Signed-off-by: Zelin Deng --- MAINTAINERS | 1 + drivers/iommu/iommufd/Makefile | 1 + drivers/iommu/iommufd/iommufd_private.h | 18 +++++++++ drivers/iommu/iommufd/liveupdate.c | 52 +++++++++++++++++++++++++ drivers/iommu/iommufd/main.c | 9 +++++ include/uapi/linux/iommufd.h | 26 +++++++++++++ 6 files changed, 107 insertions(+) create mode 100644 drivers/iommu/iommufd/liveupdate.c diff --git a/MAINTAINERS b/MAINTAINERS index 4904c661d8bd..2eecf23fd2a7 100644 --- a/MAINTAINERS +++ b/MAINTAINERS @@ -11058,6 +11058,7 @@ R: Pranjal Shrivastava L: iommu@lists.linux.dev S: Maintained F: drivers/iommu/intel/liveupdate.c +F: drivers/iommu/iommufd/liveupdate.c F: drivers/iommu/liveupdate.c F: include/linux/iommu-liveupdate.h F: include/linux/kho/abi/iommu.h diff --git a/drivers/iommu/iommufd/Makefile b/drivers/iommu/iommufd/Makefile index 71d692c9a8f4..1407fb58ef6d 100644 --- a/drivers/iommu/iommufd/Makefile +++ b/drivers/iommu/iommufd/Makefile @@ -17,3 +17,4 @@ obj-$(CONFIG_IOMMUFD_DRIVER) += iova_bitmap.o iommufd_driver-y := driver.o obj-$(CONFIG_IOMMUFD_DRIVER_CORE) += iommufd_driver.o +iommufd-$(CONFIG_IOMMU_LIVEUPDATE) += liveupdate.o diff --git a/drivers/iommu/iommufd/iommufd_private.h b/drivers/iommu/iommufd/iommufd_private.h index eb6d1a70f673..6613301ad6b8 100644 --- a/drivers/iommu/iommufd/iommufd_private.h +++ b/drivers/iommu/iommufd/iommufd_private.h @@ -45,6 +45,11 @@ struct iommufd_ctx { struct file *file; struct xarray objects; struct xarray groups; +#ifdef CONFIG_IOMMU_LIVEUPDATE +#define IOMMUFD_OBJ_LIVEUPDATE_MARK XA_MARK_1 + /* @liveupdate_mutex: Protects the preservation of HWPTs. 
*/ + struct mutex liveupdate_mutex; +#endif wait_queue_head_t destroy_wait; struct rw_semaphore ioas_creation_lock; struct maple_tree mt_mmap; @@ -374,6 +379,10 @@ struct iommufd_hwpt_paging { bool auto_domain : 1; bool enforce_cache_coherency : 1; bool nest_parent : 1; +#ifdef CONFIG_IOMMU_LIVEUPDATE + bool liveupdate_preserve : 1; + u64 liveupdate_token; +#endif /* Head at iommufd_ioas::hwpt_list */ struct list_head hwpt_item; struct iommufd_sw_msi_maps present_sw_msi; @@ -707,6 +716,15 @@ iommufd_get_vdevice(struct iommufd_ctx *ictx, u32 id) struct iommufd_vdevice, obj); } +#ifdef CONFIG_IOMMU_LIVEUPDATE +int iommufd_hwpt_liveupdate_mark_preserve(struct iommufd_ucmd *ucmd); +#else +static inline int iommufd_hwpt_liveupdate_mark_preserve(struct iommufd_ucmd *ucmd) +{ + return -ENOTTY; +} +#endif + #ifdef CONFIG_IOMMUFD_TEST int iommufd_test(struct iommufd_ucmd *ucmd); void iommufd_selftest_destroy(struct iommufd_object *obj); diff --git a/drivers/iommu/iommufd/liveupdate.c b/drivers/iommu/iommufd/liveupdate.c new file mode 100644 index 000000000000..2d3abfa9e9f8 --- /dev/null +++ b/drivers/iommu/iommufd/liveupdate.c @@ -0,0 +1,52 @@ +// SPDX-License-Identifier: GPL-2.0-only + +/* + * Copyright (C) 2026, Google LLC + * Author: Samiullah Khawaja + */ + +#define pr_fmt(fmt) "iommufd: " fmt + +#include +#include +#include + +#include "iommufd_private.h" + +int iommufd_hwpt_liveupdate_mark_preserve(struct iommufd_ucmd *ucmd) +{ + struct iommu_hwpt_liveupdate_mark_preserve *cmd = ucmd->cmd; + struct iommufd_hwpt_paging *hwpt_target; + struct iommufd_hwpt_paging *hwpt_paging; + struct iommufd_ctx *ictx = ucmd->ictx; + struct iommufd_object *obj; + unsigned long index; + int rc = 0; + + hwpt_target = iommufd_get_hwpt_paging(ucmd, cmd->hwpt_id); + if (IS_ERR(hwpt_target)) + return PTR_ERR(hwpt_target); + + mutex_lock(&ictx->liveupdate_mutex); + + xa_lock(&ictx->objects); + xa_for_each_marked(&ictx->objects, index, obj, IOMMUFD_OBJ_LIVEUPDATE_MARK) { + if 
(WARN_ON_ONCE(obj->type != IOMMUFD_OBJ_HWPT_PAGING)) + continue; + + hwpt_paging = to_hwpt_paging(container_of(obj, struct iommufd_hw_pagetable, obj)); + if (hwpt_paging->liveupdate_token == cmd->hwpt_token) { + rc = -EADDRINUSE; + goto out_unlock; + } + } + + __xa_set_mark(&ictx->objects, hwpt_target->common.obj.id, IOMMUFD_OBJ_LIVEUPDATE_MARK); + hwpt_target->liveupdate_token = cmd->hwpt_token; + +out_unlock: + xa_unlock(&ictx->objects); + mutex_unlock(&ictx->liveupdate_mutex); + iommufd_put_object(ictx, &hwpt_target->common.obj); + return rc; +} diff --git a/drivers/iommu/iommufd/main.c b/drivers/iommu/iommufd/main.c index abeb41680e3f..30745c87bd4a 100644 --- a/drivers/iommu/iommufd/main.c +++ b/drivers/iommu/iommufd/main.c @@ -313,6 +313,9 @@ static int iommufd_fops_open(struct inode *inode, struct file *filp) init_rwsem(&ictx->ioas_creation_lock); xa_init_flags(&ictx->objects, XA_FLAGS_ALLOC1 | XA_FLAGS_ACCOUNT); xa_init(&ictx->groups); +#ifdef CONFIG_IOMMU_LIVEUPDATE + mutex_init(&ictx->liveupdate_mutex); +#endif ictx->file = filp; mt_init_flags(&ictx->mt_mmap, MT_FLAGS_ALLOC_RANGE); init_waitqueue_head(&ictx->destroy_wait); @@ -375,6 +378,9 @@ static int iommufd_fops_release(struct inode *inode, struct file *filp) * iommufd_object_tombstone_user() */ xa_destroy(&ictx->objects); +#ifdef CONFIG_IOMMU_LIVEUPDATE + mutex_destroy(&ictx->liveupdate_mutex); +#endif WARN_ON(!xa_empty(&ictx->groups)); @@ -420,6 +426,7 @@ union ucmd_buffer { struct iommu_hwpt_alloc hwpt; struct iommu_hwpt_get_dirty_bitmap get_dirty_bitmap; struct iommu_hwpt_invalidate cache; + struct iommu_hwpt_liveupdate_mark_preserve mark_preserve; struct iommu_hwpt_set_dirty_tracking set_dirty_tracking; struct iommu_ioas_alloc alloc; struct iommu_ioas_allow_iovas allow_iovas; @@ -493,6 +500,8 @@ static const struct iommufd_ioctl_op iommufd_ioctl_ops[] = { __reserved), IOCTL_OP(IOMMU_VIOMMU_ALLOC, iommufd_viommu_alloc_ioctl, struct iommu_viommu_alloc, out_viommu_id), + 
IOCTL_OP(IOMMU_HWPT_LIVEUPDATE_MARK_PRESERVE, iommufd_hwpt_liveupdate_mark_preserve, + struct iommu_hwpt_liveupdate_mark_preserve, hwpt_token), #ifdef CONFIG_IOMMUFD_TEST IOCTL_OP(IOMMU_TEST_CMD, iommufd_test, struct iommu_test_cmd, last), #endif diff --git a/include/uapi/linux/iommufd.h b/include/uapi/linux/iommufd.h index 9f1acb4eb2a8..9afe8515f43b 100644 --- a/include/uapi/linux/iommufd.h +++ b/include/uapi/linux/iommufd.h @@ -57,6 +57,7 @@ enum { IOMMUFD_CMD_IOAS_CHANGE_PROCESS = 0x92, IOMMUFD_CMD_VEVENTQ_ALLOC = 0x93, IOMMUFD_CMD_HW_QUEUE_ALLOC = 0x94, + IOMMUFD_CMD_HWPT_LU_MARK_PRESERVE = 0x95, }; /** @@ -1299,4 +1300,29 @@ struct iommu_hw_queue_alloc { __aligned_u64 length; }; #define IOMMU_HW_QUEUE_ALLOC _IO(IOMMUFD_TYPE, IOMMUFD_CMD_HW_QUEUE_ALLOC) + +/** + * struct iommu_hwpt_liveupdate_mark_preserve - ioctl(IOMMU_HWPT_LIVEUPDATE_MARK_PRESERVE) + * @size: sizeof(struct iommu_hwpt_liveupdate_mark_preserve) + * @hwpt_id: Iommufd object ID of the target HWPT + * @hwpt_token: Token to identify this hwpt upon restore + * + * The target HWPT will be preserved during iommufd preservation. + * Only file-based memory mappings (e.g. memfd) are supported for HWPTs marked + * for preservation. Mapping anonymous memory into a preserved HWPT will result + * in a failure during the preservation phase. + * + * The hwpt_token is provided by userspace. If userspace enters a token + * already in use within this iommufd, -EADDRINUSE is returned from this ioctl. + * + * Note: There is no 'unmark' operation, so any HWPTs pooled in userspace that + * are marked for preservation must be destroyed after use. 
+ */ +struct iommu_hwpt_liveupdate_mark_preserve { + __u32 size; + __u32 hwpt_id; + __u64 hwpt_token; +}; +#define IOMMU_HWPT_LIVEUPDATE_MARK_PRESERVE _IO(IOMMUFD_TYPE, IOMMUFD_CMD_HWPT_LU_MARK_PRESERVE) + #endif -- Gitee From d2455cfada1576e13a1753fff5078d5d7e61b957 Mon Sep 17 00:00:00 2001 From: YiFei Zhu Date: Tue, 8 Jul 2025 23:36:58 +0000 Subject: [PATCH 15/18] iommufd: Persist iommu hardware pagetables for live update ANBZ: #26808 cherry-picked from https://lore.kernel.org/all/20260427175633.1978233-14-skhawaja@google.com/ Register iommufd with the LUO framework and implement the preserve and unpreserve ops to save marked HWPTs. To make sure mappings do not change during preserved state, add a liveupdate_immutable flag to IOAS. When an HWPT is preserved, its IOAS is marked immutable and any map/unmap attempts will fail with -EBUSY. This is synchronized using the domains_rwsem to prevent races with concurrent mapping operations. The preserve callback iterates over the marked HWPTs, verifies that the backing memory pages are preserved, and calls iommu_domain_preserve() to preserve the associated IOMMU domain. [Zelin Deng: 1. include <linux/memfd.h> for memfd_get_seals(). 2. also export necessary symbols referenced by this patch.]
Signed-off-by: YiFei Zhu Signed-off-by: Samiullah Khawaja Signed-off-by: Zelin Deng --- MAINTAINERS | 1 + drivers/iommu/iommufd/io_pagetable.c | 11 + drivers/iommu/iommufd/io_pagetable.h | 1 + drivers/iommu/iommufd/iommufd_private.h | 27 ++- drivers/iommu/iommufd/liveupdate.c | 287 ++++++++++++++++++++++++ drivers/iommu/iommufd/main.c | 10 +- drivers/iommu/iommufd/pages.c | 7 + include/linux/kho/abi/iommufd.h | 51 +++++ kernel/liveupdate/luo_file.c | 1 + mm/memfd.c | 1 + 10 files changed, 395 insertions(+), 2 deletions(-) create mode 100644 include/linux/kho/abi/iommufd.h diff --git a/MAINTAINERS b/MAINTAINERS index 2eecf23fd2a7..5b749b448e7d 100644 --- a/MAINTAINERS +++ b/MAINTAINERS @@ -11062,6 +11062,7 @@ F: drivers/iommu/iommufd/liveupdate.c F: drivers/iommu/liveupdate.c F: include/linux/iommu-liveupdate.h F: include/linux/kho/abi/iommu.h +F: include/linux/kho/abi/iommufd.h IOMMUFD M: Jason Gunthorpe diff --git a/drivers/iommu/iommufd/io_pagetable.c b/drivers/iommu/iommufd/io_pagetable.c index 436992331111..495cfdb73905 100644 --- a/drivers/iommu/iommufd/io_pagetable.c +++ b/drivers/iommu/iommufd/io_pagetable.c @@ -384,6 +384,11 @@ int iopt_map_pages(struct io_pagetable *iopt, struct list_head *pages_list, return rc; down_read(&iopt->domains_rwsem); + if (iopt_liveupdate_immutable(iopt)) { + rc = -EBUSY; + goto out_unlock_domains; + } + rc = iopt_fill_domains_pages(pages_list); if (rc) goto out_unlock_domains; @@ -755,6 +760,12 @@ static int iopt_unmap_iova_range(struct io_pagetable *iopt, unsigned long start, again: down_read(&iopt->domains_rwsem); down_write(&iopt->iova_rwsem); + + if (iopt_liveupdate_immutable(iopt)) { + rc = -EBUSY; + goto out_unlock_iova; + } + while ((area = iopt_area_iter_first(iopt, start, last))) { unsigned long area_last = iopt_area_last_iova(area); unsigned long area_first = iopt_area_iova(area); diff --git a/drivers/iommu/iommufd/io_pagetable.h b/drivers/iommu/iommufd/io_pagetable.h index 14cd052fd320..b64cb4cf300c 100644 --- 
a/drivers/iommu/iommufd/io_pagetable.h +++ b/drivers/iommu/iommufd/io_pagetable.h @@ -234,6 +234,7 @@ struct iopt_pages { struct { /* IOPT_ADDRESS_FILE */ struct file *file; unsigned long start; + u32 seals; }; /* IOPT_ADDRESS_DMABUF */ struct iopt_pages_dmabuf dmabuf; diff --git a/drivers/iommu/iommufd/iommufd_private.h b/drivers/iommu/iommufd/iommufd_private.h index 6613301ad6b8..fdd8b448fc9e 100644 --- a/drivers/iommu/iommufd/iommufd_private.h +++ b/drivers/iommu/iommufd/iommufd_private.h @@ -99,6 +99,9 @@ struct io_pagetable { /* IOVA that cannot be allocated, struct iopt_reserved */ struct rb_root_cached reserved_itree; u8 disable_large_pages; +#ifdef CONFIG_IOMMU_LIVEUPDATE + bool liveupdate_immutable; +#endif unsigned long iova_alignment; }; @@ -380,7 +383,7 @@ struct iommufd_hwpt_paging { bool enforce_cache_coherency : 1; bool nest_parent : 1; #ifdef CONFIG_IOMMU_LIVEUPDATE - bool liveupdate_preserve : 1; + bool liveupdate_preserved : 1; u64 liveupdate_token; #endif /* Head at iommufd_ioas::hwpt_list */ @@ -717,12 +720,34 @@ iommufd_get_vdevice(struct iommufd_ctx *ictx, u32 id) } #ifdef CONFIG_IOMMU_LIVEUPDATE +int iommufd_liveupdate_register(void); +void iommufd_liveupdate_unregister(void); + int iommufd_hwpt_liveupdate_mark_preserve(struct iommufd_ucmd *ucmd); + +static inline bool iopt_liveupdate_immutable(const struct io_pagetable *iopt) +{ + return iopt->liveupdate_immutable; +} #else +static inline int iommufd_liveupdate_register(void) +{ + return 0; +} + +static inline void iommufd_liveupdate_unregister(void) +{ +} + static inline int iommufd_hwpt_liveupdate_mark_preserve(struct iommufd_ucmd *ucmd) { return -ENOTTY; } + +static inline bool iopt_liveupdate_immutable(const struct io_pagetable *iopt) +{ + return false; +} #endif #ifdef CONFIG_IOMMUFD_TEST diff --git a/drivers/iommu/iommufd/liveupdate.c b/drivers/iommu/iommufd/liveupdate.c index 2d3abfa9e9f8..3cb220557d0d 100644 --- a/drivers/iommu/iommufd/liveupdate.c +++ 
b/drivers/iommu/iommufd/liveupdate.c @@ -9,9 +9,22 @@ #include #include +#include +#include #include +#include +#include +#include #include "iommufd_private.h" +#include "io_pagetable.h" + +static void ioas_set_immutable(struct iommufd_ioas *ioas, bool immutable) +{ + down_write(&ioas->iopt.domains_rwsem); + ioas->iopt.liveupdate_immutable = immutable; + up_write(&ioas->iopt.domains_rwsem); +} int iommufd_hwpt_liveupdate_mark_preserve(struct iommufd_ucmd *ucmd) { @@ -50,3 +63,277 @@ int iommufd_hwpt_liveupdate_mark_preserve(struct iommufd_ucmd *ucmd) iommufd_put_object(ictx, &hwpt_target->common.obj); return rc; } + +static int check_iopt_pages_preserved(struct liveupdate_session *s, + struct iommufd_hwpt_paging *hwpt) +{ + u32 req_seals = F_SEAL_SEAL | F_SEAL_GROW | F_SEAL_SHRINK; + struct iopt_area *area; + int ret = 0; + + down_read(&hwpt->ioas->iopt.iova_rwsem); + for (area = iopt_area_iter_first(&hwpt->ioas->iopt, 0, ULONG_MAX); area; + area = iopt_area_iter_next(area, 0, ULONG_MAX)) { + struct iopt_pages *pages = area->pages; + + /* Only allow file based mapping */ + if (pages->type != IOPT_ADDRESS_FILE) { + ret = -EINVAL; + break; + } + + /* + * Require that the seals recorded when this file was mapped + * include F_SEAL_GROW, F_SEAL_SHRINK and F_SEAL_SEAL. The file + * then cannot have grown or shrunk since the mapping was made, + * so the pages in use remain pinned and preserved. + */ + if ((pages->seals & req_seals) != req_seals) { + ret = -EINVAL; + break; + } + + /* Make sure that the file was preserved.
*/ + ret = liveupdate_get_token_outgoing(s, pages->file, NULL); + if (ret) + break; + } + up_read(&hwpt->ioas->iopt.iova_rwsem); + + return ret; +} + +static int iommufd_preserve_hwpt(struct iommufd_hwpt_paging *hwpt, + struct iommufd_hwpt_ser *hwpt_ser, + struct liveupdate_session *session) +{ + struct iommu_domain_ser *domain_ser; + bool ioas_made_immutable = false; + int rc; + + if (!hwpt->ioas->iopt.liveupdate_immutable) { + /* + * Make IOAS immutable so the DMA mappings do not change while + * the HWPT is preserved. Since one IOAS can have multiple + * HWPTs, if an error occurs this call needs to make the IOAS + * mutable again if it was the one that made it immutable. + */ + ioas_made_immutable = true; + ioas_set_immutable(hwpt->ioas, true); + + rc = check_iopt_pages_preserved(session, hwpt); + if (rc) + goto err; + } + + hwpt_ser->token = hwpt->liveupdate_token; + hwpt_ser->reclaimed = false; + + rc = iommu_domain_preserve(hwpt->common.domain, &domain_ser); + if (rc < 0) + goto err; + + hwpt_ser->domain_data = virt_to_phys(domain_ser); + return 0; + +err: + if (ioas_made_immutable) + ioas_set_immutable(hwpt->ioas, false); + + return rc; +} + +static void _iommufd_unpreserve(struct iommufd_ctx *ictx, + struct iommufd_ser *ser) +{ + struct iommufd_hwpt_paging *hwpt; + struct iommufd_object *obj; + unsigned long index; + + xa_lock(&ictx->objects); + xa_for_each_marked(&ictx->objects, index, obj, IOMMUFD_OBJ_LIVEUPDATE_MARK) { + if (obj->type != IOMMUFD_OBJ_HWPT_PAGING) + continue; + + hwpt = to_hwpt_paging(container_of(obj, struct iommufd_hw_pagetable, obj)); + if (!hwpt->liveupdate_preserved) + continue; + + xa_unlock(&ictx->objects); + + iommu_domain_unpreserve(hwpt->common.domain); + if (hwpt->ioas->iopt.liveupdate_immutable) + ioas_set_immutable(hwpt->ioas, false); + + hwpt->liveupdate_preserved = false; + iommufd_put_object(ictx, obj); + + xa_lock(&ictx->objects); + } + xa_unlock(&ictx->objects); + + kho_unpreserve_free(ser); +} + +static int 
iommufd_liveupdate_preserve(struct liveupdate_file_op_args *args) +{ + struct iommufd_ctx *ictx = iommufd_ctx_from_file(args->file); + struct iommufd_hwpt_paging *hwpt; + struct iommufd_ser *iommufd_ser; + struct iommufd_object *obj; + unsigned int nr_hwpts; + unsigned long index; + unsigned int i; + void *mem; + int rc; + + if (IS_ERR(ictx)) + return PTR_ERR(ictx); + + mutex_lock(&ictx->liveupdate_mutex); + + /* Count the number of HWPTs to preserve */ + nr_hwpts = 0; + xa_lock(&ictx->objects); + xa_for_each_marked(&ictx->objects, index, obj, IOMMUFD_OBJ_LIVEUPDATE_MARK) { + if (obj->type != IOMMUFD_OBJ_HWPT_PAGING) + continue; + + hwpt = to_hwpt_paging(container_of(obj, struct iommufd_hw_pagetable, obj)); + if (!hwpt->common.domain) { + rc = -EINVAL; + xa_unlock(&ictx->objects); + goto out_unlock; + } + nr_hwpts++; + } + xa_unlock(&ictx->objects); + + mem = kho_alloc_preserve(struct_size(iommufd_ser, + hwpt_array, nr_hwpts)); + if (!mem) { + rc = -ENOMEM; + goto out_unlock; + } + + iommufd_ser = mem; + iommufd_ser->nr_hwpts = nr_hwpts; + + /* Preserve HWPTs */ + i = 0; + xa_lock(&ictx->objects); + xa_for_each_marked(&ictx->objects, index, obj, IOMMUFD_OBJ_LIVEUPDATE_MARK) { + if (obj->type != IOMMUFD_OBJ_HWPT_PAGING) + continue; + + if (!iommufd_lock_obj(obj)) { + rc = -ENOENT; + xa_unlock(&ictx->objects); + goto out_unpreserve; + } + + /* + * HWPT is locked so it will not be destroyed. The xarray lock + * can be released here before preserving the HWPT. 
+ */ + xa_unlock(&ictx->objects); + hwpt = to_hwpt_paging(container_of(obj, struct iommufd_hw_pagetable, obj)); + rc = iommufd_preserve_hwpt(hwpt, &iommufd_ser->hwpt_array[i++], args->session); + if (rc) { + iommufd_put_object(ictx, obj); + goto out_unpreserve; + } + + /* Mark as preserved */ + hwpt->liveupdate_preserved = true; + xa_lock(&ictx->objects); + } + xa_unlock(&ictx->objects); + + args->serialized_data = virt_to_phys(iommufd_ser); + mutex_unlock(&ictx->liveupdate_mutex); + iommufd_ctx_put(ictx); + return 0; + +out_unpreserve: + _iommufd_unpreserve(ictx, iommufd_ser); +out_unlock: + mutex_unlock(&ictx->liveupdate_mutex); + iommufd_ctx_put(ictx); + return rc; +} + +static void iommufd_liveupdate_unpreserve(struct liveupdate_file_op_args *args) +{ + struct iommufd_ctx *ictx = iommufd_ctx_from_file(args->file); + + if (WARN_ON(IS_ERR(ictx))) + return; + + mutex_lock(&ictx->liveupdate_mutex); + _iommufd_unpreserve(ictx, phys_to_virt(args->serialized_data)); + mutex_unlock(&ictx->liveupdate_mutex); + + iommufd_ctx_put(ictx); +} + +static int iommufd_liveupdate_retrieve(struct liveupdate_file_op_args *args) +{ + return -EOPNOTSUPP; +} + +static bool iommufd_liveupdate_can_finish(struct liveupdate_file_op_args *args) +{ + return false; +} + +static void iommufd_liveupdate_finish(struct liveupdate_file_op_args *args) +{ +} + +static bool iommufd_liveupdate_can_preserve(struct liveupdate_file_handler *handler, + struct file *file) +{ + struct iommufd_ctx *ictx = iommufd_ctx_from_file(file); + + if (IS_ERR(ictx)) + return false; + + iommufd_ctx_put(ictx); + return true; +} + +static struct liveupdate_file_ops iommufd_ser_file_ops = { + .can_preserve = iommufd_liveupdate_can_preserve, + .preserve = iommufd_liveupdate_preserve, + .unpreserve = iommufd_liveupdate_unpreserve, + .retrieve = iommufd_liveupdate_retrieve, + .can_finish = iommufd_liveupdate_can_finish, + .finish = iommufd_liveupdate_finish, +}; + +static struct liveupdate_file_handler iommufd_ser_handler = 
{ + .compatible = IOMMUFD_LUO_COMPATIBLE, + .ops = &iommufd_ser_file_ops, +}; + +int iommufd_liveupdate_register(void) +{ + int ret; + + ret = liveupdate_register_file_handler(&iommufd_ser_handler); + if (ret) + return ret; + + ret = iommu_liveupdate_register_flb(&iommufd_ser_handler); + if (ret) + liveupdate_unregister_file_handler(&iommufd_ser_handler); + + return ret; +} + +void iommufd_liveupdate_unregister(void) +{ + iommu_liveupdate_unregister_flb(&iommufd_ser_handler); + liveupdate_unregister_file_handler(&iommufd_ser_handler); +} diff --git a/drivers/iommu/iommufd/main.c b/drivers/iommu/iommufd/main.c index 30745c87bd4a..db52aa2f5a34 100644 --- a/drivers/iommu/iommufd/main.c +++ b/drivers/iommu/iommufd/main.c @@ -782,11 +782,18 @@ static int __init iommufd_init(void) if (ret) goto err_misc; } - ret = iommufd_test_init(); + + ret = iommufd_liveupdate_register(); if (ret) goto err_vfio_misc; + + ret = iommufd_test_init(); + if (ret) + goto err_liveupdate; return 0; +err_liveupdate: + iommufd_liveupdate_unregister(); err_vfio_misc: if (IS_ENABLED(CONFIG_IOMMUFD_VFIO_CONTAINER)) misc_deregister(&vfio_misc_dev); @@ -798,6 +805,7 @@ static int __init iommufd_init(void) static void __exit iommufd_exit(void) { iommufd_test_exit(); + iommufd_liveupdate_unregister(); if (IS_ENABLED(CONFIG_IOMMUFD_VFIO_CONTAINER)) misc_deregister(&vfio_misc_dev); misc_deregister(&iommu_misc_dev); diff --git a/drivers/iommu/iommufd/pages.c b/drivers/iommu/iommufd/pages.c index 6b73fcbf4181..f4c45282648e 100644 --- a/drivers/iommu/iommufd/pages.c +++ b/drivers/iommu/iommufd/pages.c @@ -56,6 +56,7 @@ #include #include #include +#include #include "double_span.h" #include "io_pagetable.h" @@ -1514,6 +1515,7 @@ struct iopt_pages *iopt_alloc_file_pages(struct file *file, { struct iopt_pages *pages; + int seals; pages = iopt_alloc_pages(start_byte, length, writable); if (IS_ERR(pages)) @@ -1521,6 +1523,11 @@ struct iopt_pages *iopt_alloc_file_pages(struct file *file, pages->file = 
get_file(file); pages->start = start - start_byte; pages->type = IOPT_ADDRESS_FILE; + + seals = memfd_get_seals(file); + if (seals > 0) + pages->seals = seals; + return pages; } diff --git a/include/linux/kho/abi/iommufd.h b/include/linux/kho/abi/iommufd.h new file mode 100644 index 000000000000..557952123ba4 --- /dev/null +++ b/include/linux/kho/abi/iommufd.h @@ -0,0 +1,51 @@ +/* SPDX-License-Identifier: GPL-2.0 */ + +/* + * Copyright (C) 2026, Google LLC + * Author: Samiullah Khawaja + */ + +#ifndef _LINUX_KHO_ABI_IOMMUFD_H +#define _LINUX_KHO_ABI_IOMMUFD_H + +#include +#include +#include + +/** + * DOC: IOMMUFD Live Update ABI + * + * This header defines the ABI for preserving the state of an IOMMUFD file + * across a kexec reboot using LUO. + * + * This interface is a contract. Any modification to any of the serialization + * structs defined here constitutes a breaking change. Such changes require + * incrementing the version number in the IOMMUFD_LUO_COMPATIBLE string. + */ + +#define IOMMUFD_LUO_COMPATIBLE "iommufd-v1" + +/** + * struct iommufd_hwpt_ser - IOMMUFD HWPT serialized state + * @domain_data: Physical address of the serialized state of associated domain + * @token: User provided token + * @reclaimed: Whether the HWPT is reclaimed + */ +struct iommufd_hwpt_ser { + u64 domain_data; + u64 token; + u8 reclaimed; + u8 padding[7]; +} __packed; + +/** + * struct iommufd_ser - IOMMUFD serialized state + * @nr_hwpts: Number of preserved HWPTs + * @hwpt_array: Array of serialized state of preserved HWPTs + */ +struct iommufd_ser { + u64 nr_hwpts; + struct iommufd_hwpt_ser hwpt_array[]; +} __packed; + +#endif /* _LINUX_KHO_ABI_IOMMUFD_H */ diff --git a/kernel/liveupdate/luo_file.c b/kernel/liveupdate/luo_file.c index 2a1c93e97f7d..4a27af7c93ee 100644 --- a/kernel/liveupdate/luo_file.c +++ b/kernel/liveupdate/luo_file.c @@ -970,6 +970,7 @@ int liveupdate_get_token_outgoing(struct liveupdate_session *s, return err; } 
+EXPORT_SYMBOL_GPL(liveupdate_get_token_outgoing); /** * liveupdate_get_file_incoming - Retrieves a preserved file for in-kernel use. diff --git a/mm/memfd.c b/mm/memfd.c index c5ef07a710ec..8bd8704a293d 100644 --- a/mm/memfd.c +++ b/mm/memfd.c @@ -316,6 +316,7 @@ int memfd_get_seals(struct file *file) return seals ? *seals : -EINVAL; } +EXPORT_SYMBOL_GPL(memfd_get_seals); long memfd_fcntl(struct file *file, unsigned int cmd, unsigned int arg) { -- Gitee From 4abb82d1114cfe5ea822db789113f7139fea0442 Mon Sep 17 00:00:00 2001 From: Samiullah Khawaja Date: Sun, 30 Nov 2025 05:29:24 +0000 Subject: [PATCH 16/18] iommufd: Add APIs to preserve/unpreserve a vfio cdev ANBZ: #26808 cherry-picked from https://lore.kernel.org/all/20260427175633.1978233-15-skhawaja@google.com/ Add APIs that can be used to preserve and unpreserve a vfio cdev. Use the APIs exported by the IOMMU core to preserve/unpreserve device. The LUO token of the preserved iommufd is fetched and returned back to the caller as that can be used during restore to get the restored iommufd. Handle to the preserved state of the device is also returned to reassociate with the restored state after live update kexec. [Zelin Deng: correct symbols export, otherwise there are build issues.] 
Signed-off-by: Samiullah Khawaja Signed-off-by: Zelin Deng --- drivers/iommu/iommufd/device.c | 102 ++++++++++++++++++++++++ drivers/iommu/iommufd/iommufd_private.h | 3 + drivers/iommu/liveupdate.c | 2 + include/linux/iommufd.h | 29 +++++++ 4 files changed, 136 insertions(+) diff --git a/drivers/iommu/iommufd/device.c b/drivers/iommu/iommufd/device.c index 5bacc473f7cd..777eb8e47d79 100644 --- a/drivers/iommu/iommufd/device.c +++ b/drivers/iommu/iommufd/device.c @@ -2,6 +2,7 @@ /* Copyright (c) 2021-2022, NVIDIA CORPORATION & AFFILIATES */ #include +#include #include #include #include @@ -601,6 +602,10 @@ int iommufd_hw_pagetable_attach(struct iommufd_hw_pagetable *hwpt, int rc; mutex_lock(&igroup->lock); + if (iommufd_device_is_preserved(idev)) { + rc = -EBUSY; + goto err_unlock; + } attach = xa_cmpxchg(&igroup->pasid_attach, pasid, NULL, XA_ZERO_ENTRY, GFP_KERNEL); @@ -1662,3 +1667,100 @@ int iommufd_get_hw_info(struct iommufd_ucmd *ucmd) iommufd_put_object(ucmd->ictx, &idev->obj); return rc; } + +#ifdef CONFIG_IOMMU_LIVEUPDATE +static bool _iommufd_device_has_pasid_attachments(struct iommufd_device *idev) +{ + struct iommufd_group *igroup = idev->igroup; + unsigned long start = IOMMU_NO_PASID; + + if (xa_find_after(&igroup->pasid_attach, + &start, UINT_MAX, XA_PRESENT)) + return true; + + return false; +} + +int iommufd_device_preserve(struct liveupdate_session *s, + struct iommufd_device *idev, + u64 *iommufd_tokenp, + u64 *preserved_state) +{ + struct iommufd_group *igroup = idev->igroup; + struct iommufd_hwpt_paging *hwpt_paging; + struct iommufd_hw_pagetable *hwpt; + struct iommufd_attach *attach; + int ret; + + mutex_lock(&igroup->lock); + if (_iommufd_device_has_pasid_attachments(idev)) { + ret = -EOPNOTSUPP; + goto out; + } + + attach = xa_load(&igroup->pasid_attach, IOMMU_NO_PASID); + if (!attach) { + ret = -ENOENT; + goto out; + } + + hwpt = attach->hwpt; + hwpt_paging = find_hwpt_paging(hwpt); + if (!hwpt_paging || !hwpt_paging->liveupdate_preserved) { 
+ ret = -EINVAL; + goto out; + } + + ret = liveupdate_get_token_outgoing(s, idev->ictx->file, iommufd_tokenp); + if (ret) + goto out; + + ret = iommu_preserve_device(hwpt_paging->common.domain, + idev->dev, + preserved_state); + + if (!ret) + igroup->liveupdate_preserved = true; +out: + mutex_unlock(&igroup->lock); + return ret; +} +EXPORT_SYMBOL_NS_GPL(iommufd_device_preserve, IOMMUFD); + +void iommufd_device_unpreserve(struct liveupdate_session *s, + struct iommufd_device *idev) +{ + struct iommufd_group *igroup = idev->igroup; + struct iommufd_hwpt_paging *hwpt_paging; + struct iommufd_hw_pagetable *hwpt; + struct iommufd_attach *attach; + + mutex_lock(&igroup->lock); + attach = xa_load(&igroup->pasid_attach, IOMMU_NO_PASID); + if (!attach) { + WARN(1, "IOMMU_NO_PASID attachment not found"); + igroup->liveupdate_preserved = false; + goto out; + } + + hwpt = attach->hwpt; + hwpt_paging = find_hwpt_paging(hwpt); + if (!hwpt_paging || !hwpt_paging->liveupdate_preserved) { + WARN(1, "Attached domain is not preserved"); + igroup->liveupdate_preserved = false; + goto out; + } + + iommu_unpreserve_device(hwpt_paging->common.domain, idev->dev); + igroup->liveupdate_preserved = false; +out: + mutex_unlock(&igroup->lock); +} +EXPORT_SYMBOL_NS_GPL(iommufd_device_unpreserve, IOMMUFD); + +bool iommufd_device_is_preserved(struct iommufd_device *idev) +{ + return idev && idev->igroup && idev->igroup->liveupdate_preserved; +} +EXPORT_SYMBOL_NS_GPL(iommufd_device_is_preserved, IOMMUFD); +#endif diff --git a/drivers/iommu/iommufd/iommufd_private.h b/drivers/iommu/iommufd/iommufd_private.h index fdd8b448fc9e..b0ecd7035af0 100644 --- a/drivers/iommu/iommufd/iommufd_private.h +++ b/drivers/iommu/iommufd/iommufd_private.h @@ -487,6 +487,9 @@ struct iommufd_group { struct xarray pasid_attach; struct iommufd_sw_msi_maps required_sw_msi; phys_addr_t sw_msi_start; +#ifdef CONFIG_IOMMU_LIVEUPDATE + bool liveupdate_preserved; +#endif }; /* diff --git a/drivers/iommu/liveupdate.c 
b/drivers/iommu/liveupdate.c index f846f36a7b54..e837d6600485 100644 --- a/drivers/iommu/liveupdate.c +++ b/drivers/iommu/liveupdate.c @@ -512,6 +512,7 @@ int iommu_preserve_device(struct iommu_domain *domain, *preserved_state = virt_to_phys(device_ser); return 0; } +EXPORT_SYMBOL_GPL(iommu_preserve_device); void iommu_unpreserve_device(struct iommu_domain *domain, struct device *dev) { @@ -547,6 +548,7 @@ void iommu_unpreserve_device(struct iommu_domain *domain, struct device *dev) iommu_unpreserve_locked(iommu->iommu_dev, flb_obj); } +EXPORT_SYMBOL_GPL(iommu_unpreserve_device); struct iommu_domain *iommu_restore_domain(struct device *dev, struct iommu_device_ser *ser, diff --git a/include/linux/iommufd.h b/include/linux/iommufd.h index 6e7efe83bc5d..d1fd5d71e0fd 100644 --- a/include/linux/iommufd.h +++ b/include/linux/iommufd.h @@ -9,6 +9,7 @@ #include #include #include +#include #include #include #include @@ -71,6 +72,34 @@ void iommufd_device_detach(struct iommufd_device *idev, ioasid_t pasid); struct iommufd_ctx *iommufd_device_to_ictx(struct iommufd_device *idev); u32 iommufd_device_to_id(struct iommufd_device *idev); +#ifdef CONFIG_IOMMU_LIVEUPDATE +int iommufd_device_preserve(struct liveupdate_session *s, + struct iommufd_device *idev, + u64 *iommufd_tokenp, + u64 *preserved_state); +void iommufd_device_unpreserve(struct liveupdate_session *s, + struct iommufd_device *idev); +bool iommufd_device_is_preserved(struct iommufd_device *idev); +#else +static inline int iommufd_device_preserve(struct liveupdate_session *s, + struct iommufd_device *idev, + u64 *iommufd_tokenp, + u64 *preserved_state) +{ + return -EOPNOTSUPP; +} + +static inline void iommufd_device_unpreserve(struct liveupdate_session *s, + struct iommufd_device *idev) +{ +} + +static inline bool iommufd_device_is_preserved(struct iommufd_device *idev) +{ + return false; +} +#endif + struct iommufd_access_ops { u8 needs_pin_pages : 1; void (*unmap)(void *data, unsigned long iova, unsigned long 
length); -- Gitee From 9d1713eecbc2ca3524854e687ce77a2d17ba03e5 Mon Sep 17 00:00:00 2001 From: Samiullah Khawaja Date: Sun, 30 Nov 2025 05:32:14 +0000 Subject: [PATCH 17/18] vfio/pci: Preserve the iommufd state of the vfio cdev ANBZ: #26808 cherry-picked from https://lore.kernel.org/all/20260427175633.1978233-16-skhawaja@google.com/ If the vfio cdev is attached to an iommufd, preserve the state of the attached iommufd also. Basically preserve the iommu specific state of the device and also the attach iommu HW unit. Once the device and its iommufd attachment is preserved, it cannot be detached or attached to another IOAS until it is unpreserved. Signed-off-by: Samiullah Khawaja Signed-off-by: Zelin Deng --- drivers/vfio/device_cdev.c | 10 ++++++++ drivers/vfio/pci/vfio_pci.c | 1 + drivers/vfio/pci/vfio_pci_liveupdate.c | 33 +++++++++++++++++++++++++- 3 files changed, 43 insertions(+), 1 deletion(-) diff --git a/drivers/vfio/device_cdev.c b/drivers/vfio/device_cdev.c index e58739749b58..b271b59bb3f5 100644 --- a/drivers/vfio/device_cdev.c +++ b/drivers/vfio/device_cdev.c @@ -221,6 +221,11 @@ int vfio_df_ioctl_attach_pt(struct vfio_device_file *df, return -EINVAL; mutex_lock(&device->dev_set->lock); + if (iommufd_device_is_preserved(device->iommufd_device)) { + ret = -EBUSY; + goto out_unlock; + } + ret = device->ops->attach_ioas(device, &attach.pt_id); if (ret) goto out_unlock; @@ -256,6 +261,11 @@ int vfio_df_ioctl_detach_pt(struct vfio_device_file *df, return -EINVAL; mutex_lock(&device->dev_set->lock); + if (iommufd_device_is_preserved(device->iommufd_device)) { + mutex_unlock(&device->dev_set->lock); + return -EBUSY; + } + device->ops->detach_ioas(device); mutex_unlock(&device->dev_set->lock); diff --git a/drivers/vfio/pci/vfio_pci.c b/drivers/vfio/pci/vfio_pci.c index ac2f34866fe1..34984d440e1d 100644 --- a/drivers/vfio/pci/vfio_pci.c +++ b/drivers/vfio/pci/vfio_pci.c @@ -295,3 +295,4 @@ module_exit(vfio_pci_cleanup); MODULE_LICENSE("GPL v2"); 
MODULE_AUTHOR(DRIVER_AUTHOR); MODULE_DESCRIPTION(DRIVER_DESC); +MODULE_IMPORT_NS(IOMMUFD); diff --git a/drivers/vfio/pci/vfio_pci_liveupdate.c b/drivers/vfio/pci/vfio_pci_liveupdate.c index 150996f93927..5ea4bd23d0f4 100644 --- a/drivers/vfio/pci/vfio_pci_liveupdate.c +++ b/drivers/vfio/pci/vfio_pci_liveupdate.c @@ -108,10 +108,13 @@ #include #include #include +#include #include #include "vfio_pci_priv.h" +MODULE_IMPORT_NS(IOMMUFD); + static bool vfio_pci_liveupdate_can_preserve(struct liveupdate_file_handler *handler, struct file *file) { @@ -153,9 +156,26 @@ static int vfio_pci_liveupdate_preserve(struct liveupdate_file_op_args *args) vdev = container_of(device, struct vfio_pci_core_device, vdev); pdev = vdev->pdev; +#ifdef CONFIG_IOMMU_LIVEUPDATE + /* If iommufd is attached, preserve the underlying domain */ + mutex_lock(&device->dev_set->lock); + if (device->iommufd_attached) { + u64 token, preserved_state; + + ret = iommufd_device_preserve(args->session, + device->iommufd_device, + &token, &preserved_state); + if (ret) { + mutex_unlock(&device->dev_set->lock); + return ret; + } + } + mutex_unlock(&device->dev_set->lock); +#endif + ret = pci_liveupdate_preserve(pdev); if (ret) - return ret; + goto err_iommufd_unpreserve; ser = kho_alloc_preserve(sizeof(*ser)); if (IS_ERR(ser)) { @@ -170,6 +190,13 @@ static int vfio_pci_liveupdate_preserve(struct liveupdate_file_op_args *args) args->serialized_data = virt_to_phys(ser); return 0; err_unpreserve: pci_liveupdate_unpreserve(pdev); +err_iommufd_unpreserve: + /* Undo the iommufd_device_preserve() done above, if any */ + mutex_lock(&device->dev_set->lock); + if (device->iommufd_attached) + iommufd_device_unpreserve(args->session, + device->iommufd_device); + mutex_unlock(&device->dev_set->lock); return ret; @@ -178,6 +205,14 @@ static int vfio_pci_liveupdate_preserve(struct liveupdate_file_op_args *args) static void vfio_pci_liveupdate_unpreserve(struct liveupdate_file_op_args *args) { struct vfio_device *device = vfio_device_from_file(args->file); + struct vfio_pci_core_device_ser *ser; + + ser = phys_to_virt(args->serialized_data); + mutex_lock(&device->dev_set->lock); + if 
(device->iommufd_attached) + iommufd_device_unpreserve(args->session, + device->iommufd_device); + mutex_unlock(&device->dev_set->lock); pci_liveupdate_unpreserve(to_pci_dev(device->dev)); kho_unpreserve_free(phys_to_virt(args->serialized_data)); -- Gitee From ff83a44d409fe53c2799219d2ee5aba543733df1 Mon Sep 17 00:00:00 2001 From: Samiullah Khawaja Date: Sat, 20 Sep 2025 00:04:32 +0000 Subject: [PATCH 18/18] iommufd/selftest: Add test to verify iommufd preservation ANBZ: #26808 cherry-picked from https://lore.kernel.org/all/20260427175633.1978233-17-skhawaja@google.com/ Test iommufd preservation by setting up an iommufd and vfio cdev and preserve it across live update. Test takes VFIO cdev path of a device bound to vfio-pci driver and binds it to an iommufd being preserved. It also preserves the vfio cdev so the iommufd state associated with it is also preserved. The restore path is tested by restoring the preserved vfio cdev only. On restore, test verifies that the bind with a new iommufd fails as the device is attached to the restored IOMMU domain. Also the LUO session finish fails as the preserved iommufd is not restored. [Zelin Deng: included uapi headers, for __aligned_xxx definition.] 
Signed-off-by: Samiullah Khawaja Signed-off-by: YiFei Zhu Signed-off-by: Zelin Deng --- tools/testing/selftests/iommu/Makefile | 12 + .../iommu/iommufd_liveupdate_kexec_test.c | 240 ++++++++++++++++++ 2 files changed, 252 insertions(+) create mode 100644 tools/testing/selftests/iommu/iommufd_liveupdate_kexec_test.c diff --git a/tools/testing/selftests/iommu/Makefile b/tools/testing/selftests/iommu/Makefile index f824582a253f..d0f65286d64b 100644 --- a/tools/testing/selftests/iommu/Makefile +++ b/tools/testing/selftests/iommu/Makefile @@ -9,4 +9,16 @@ TEST_GEN_PROGS := TEST_GEN_PROGS += iommufd TEST_GEN_PROGS += iommufd_fail_nth +TEST_GEN_PROGS_EXTENDED += iommufd_liveupdate_kexec_test + include ../lib.mk +include ../liveupdate/lib/libliveupdate.mk + +CFLAGS += -I$(top_srcdir)/tools/include +CFLAGS += -MD +CFLAGS += $(EXTRA_CFLAGS) + +$(TEST_GEN_PROGS_EXTENDED): %: %.o $(LIBLIVEUPDATE_O) + $(CC) $(CFLAGS) $(CPPFLAGS) $(LDFLAGS) $(TARGET_ARCH) $< $(LIBLIVEUPDATE_O) $(LDLIBS) -o $@ + +EXTRA_CLEAN += $(LIBLIVEUPDATE_O) diff --git a/tools/testing/selftests/iommu/iommufd_liveupdate_kexec_test.c b/tools/testing/selftests/iommu/iommufd_liveupdate_kexec_test.c new file mode 100644 index 000000000000..cad57aba056f --- /dev/null +++ b/tools/testing/selftests/iommu/iommufd_liveupdate_kexec_test.c @@ -0,0 +1,240 @@ +// SPDX-License-Identifier: GPL-2.0-only + +/* + * Copyright (c) 2026, Google LLC. 
+ * Samiullah Khawaja + */ + +#include +#include +#include +#include +#include +#include + +#define __EXPORTED_HEADERS__ +#include +#include +#include +#include +#include +#include + +#include "../kselftest.h" + +#define ksft_assert(condition) \ + do { \ + if (!(condition)) \ + fail_exit("Failed: %s", #condition); \ + } while (0) + +static const char *device_cdev_path; +static char state_session[LIVEUPDATE_SESSION_NAME_LENGTH]; +static char iommufd_session[LIVEUPDATE_SESSION_NAME_LENGTH]; + +static const uint64_t STATE_TOKEN; +static const uint64_t IOMMUFD_TOKEN = 0x123456; +static const uint64_t CDEV_TOKEN = 0x654321; +static const uint64_t HWPT_TOKEN = 0x789012; +static const uint64_t MEMFD_TOKEN = 0x890123; + +static int open_cdev(const char *vfio_cdev_path) +{ + int cdev_fd; + + cdev_fd = open(vfio_cdev_path, O_RDWR); + if (cdev_fd < 0) + ksft_exit_skip("Failed to open VFIO cdev: %s\n", vfio_cdev_path); + + return cdev_fd; +} + +static int open_iommufd(void) +{ + int iommufd; + + iommufd = open("/dev/iommu", O_RDWR); + if (iommufd < 0) + ksft_exit_skip("Failed to open /dev/iommu. 
IOMMUFD support not enabled.\n"); + + return iommufd; +} + +static int create_sealed_memfd(size_t size) +{ + int fd, ret; + + fd = memfd_create("buffer", MFD_ALLOW_SEALING); + if (fd < 0) + fail_exit("memfd_create failed"); + + ret = ftruncate(fd, size); + if (ret) + fail_exit("ftruncate failed"); + + ret = fcntl(fd, F_ADD_SEALS, F_SEAL_GROW | F_SEAL_SHRINK | F_SEAL_SEAL); + if (ret) + fail_exit("fcntl F_ADD_SEALS failed"); + + return fd; +} + +#define test_ioctl(fd, cmd, arg) \ + do { \ + if (ioctl(fd, cmd, arg)) \ + fail_exit("ioctl(%s) failed", #cmd); \ + } while (0) + +#define test_luo_session_preserve_fd(session, fd, token) \ + do { \ + if (luo_session_preserve_fd(session, fd, token)) \ + fail_exit("luo_session_preserve_fd(%s) failed", #token); \ + } while (0) + +#define test_luo_session_retrieve_fd(session, token) \ + ({ \ + int _fd = luo_session_retrieve_fd(session, token); \ + if (_fd < 0) \ + fail_exit("luo_session_retrieve_fd(%s) failed", #token); \ + _fd; \ + }) + +static void setup_iommufd(int iommufd, int memfd, int cdev_fd) +{ + struct vfio_device_bind_iommufd bind = { + .argsz = sizeof(bind), + .flags = 0, + .iommufd = iommufd, + }; + struct iommu_ioas_alloc alloc_data = { + .size = sizeof(alloc_data), + .flags = 0, + }; + struct iommu_hwpt_alloc hwpt_alloc = { + .size = sizeof(hwpt_alloc), + .flags = 0, + }; + struct vfio_device_attach_iommufd_pt attach_data = { + .argsz = sizeof(attach_data), + .flags = 0, + }; + struct iommu_hwpt_liveupdate_mark_preserve mark_preserve = { + .size = sizeof(mark_preserve), + .hwpt_token = HWPT_TOKEN, + }; + struct iommu_ioas_map_file map_file = { + .size = sizeof(map_file), + .length = SZ_1M, + .flags = IOMMU_IOAS_MAP_WRITEABLE | IOMMU_IOAS_MAP_READABLE, + .iova = SZ_4G, + .fd = memfd, + .start = 0, + }; + + test_ioctl(cdev_fd, VFIO_DEVICE_BIND_IOMMUFD, &bind); + + test_ioctl(iommufd, IOMMU_IOAS_ALLOC, &alloc_data); + + hwpt_alloc.dev_id = bind.out_devid; + hwpt_alloc.pt_id = alloc_data.out_ioas_id; + 
test_ioctl(iommufd, IOMMU_HWPT_ALLOC, &hwpt_alloc); + + attach_data.pt_id = hwpt_alloc.out_hwpt_id; + test_ioctl(cdev_fd, VFIO_DEVICE_ATTACH_IOMMUFD_PT, &attach_data); + + map_file.ioas_id = alloc_data.out_ioas_id; + test_ioctl(iommufd, IOMMU_IOAS_MAP_FILE, &map_file); + + mark_preserve.hwpt_id = attach_data.pt_id; + test_ioctl(iommufd, IOMMU_HWPT_LIVEUPDATE_MARK_PRESERVE, &mark_preserve); +} + +static void before_kexec(int luo_fd) +{ + int iommufd, cdev_fd, memfd, session; + + create_state_file(luo_fd, state_session, STATE_TOKEN, /*next_stage=*/2); + + session = luo_create_session(luo_fd, iommufd_session); + if (session < 0) + fail_exit("luo_create_session failed"); + + iommufd = open_iommufd(); + memfd = create_sealed_memfd(SZ_1M); + cdev_fd = open_cdev(device_cdev_path); + + setup_iommufd(iommufd, memfd, cdev_fd); + + /* Cannot preserve cdev without iommufd */ + if (!luo_session_preserve_fd(session, cdev_fd, CDEV_TOKEN)) + fail_exit("Preserving cdev without iommufd should fail"); + + /* Cannot preserve iommufd without preserving memfd. 
*/ + if (!luo_session_preserve_fd(session, iommufd, IOMMUFD_TOKEN)) + fail_exit("Preserving iommufd without memfd should fail"); + + test_luo_session_preserve_fd(session, memfd, MEMFD_TOKEN); + test_luo_session_preserve_fd(session, iommufd, IOMMUFD_TOKEN); + test_luo_session_preserve_fd(session, cdev_fd, CDEV_TOKEN); + + close(session); + session = luo_create_session(luo_fd, iommufd_session); + if (session < 0) + fail_exit("luo_create_session failed"); + + test_luo_session_preserve_fd(session, memfd, MEMFD_TOKEN); + test_luo_session_preserve_fd(session, iommufd, IOMMUFD_TOKEN); + test_luo_session_preserve_fd(session, cdev_fd, CDEV_TOKEN); + + close(luo_fd); + daemonize_and_wait(); +} + +static void after_kexec(int luo_fd, int state_session_fd) +{ + int iommufd, cdev_fd, session, stage; + struct vfio_device_bind_iommufd bind = { + .argsz = sizeof(bind), + .flags = 0, + }; + + restore_and_read_stage(state_session_fd, STATE_TOKEN, &stage); + ksft_assert(stage == 2); + + session = luo_retrieve_session(luo_fd, iommufd_session); + if (session < 0) + fail_exit("luo_retrieve_session failed"); + + cdev_fd = test_luo_session_retrieve_fd(session, CDEV_TOKEN); + + iommufd = luo_session_retrieve_fd(session, IOMMUFD_TOKEN); + if (iommufd >= 0) + fail_exit("iommufd should not be retrievable yet"); + + iommufd = open_iommufd(); + + bind.iommufd = iommufd; + if (ioctl(cdev_fd, VFIO_DEVICE_BIND_IOMMUFD, &bind) == 0 || errno != EPERM) + fail_exit("Binding cdev to new iommufd should fail with EPERM"); + + /* Should fail */ + if (luo_session_finish(session) == 0) + fail_exit("luo_session_finish should fail if iommufd is not restored"); + + close(iommufd); + close(cdev_fd); +} + +int main(int argc, char *argv[]) +{ + if (argc < 2) { + printf("Usage: %s \n", argv[0]); + return 1; + } + + device_cdev_path = argv[1]; + sprintf(iommufd_session, "iommufd-test-%s", "cdev"); + sprintf(state_session, "state-%s", "iommufd-cdev"); + + return luo_test(argc, argv, state_session, before_kexec, 
after_kexec); +} -- Gitee