diff --git a/accel/kvm/kvm-all.c b/accel/kvm/kvm-all.c index e39a810a4e92..89e58a174516 100644 --- a/accel/kvm/kvm-all.c +++ b/accel/kvm/kvm-all.c @@ -101,6 +101,8 @@ bool kvm_msi_use_devid; bool kvm_has_guest_debug; static int kvm_sstep_flags; static bool kvm_immediate_exit; +static bool kvm_guest_memfd_supported; +static uint64_t kvm_supported_memory_attributes; static hwaddr kvm_max_slot_size = ~0; static const KVMCapabilityInfo kvm_required_capabilites[] = { @@ -292,34 +294,69 @@ int kvm_physical_memory_addr_from_host(KVMState *s, void *ram, static int kvm_set_user_memory_region(KVMMemoryListener *kml, KVMSlot *slot, bool new) { KVMState *s = kvm_state; - struct kvm_userspace_memory_region mem; + struct kvm_userspace_memory_region2 mem; + static int cap_user_memory2 = -1; int ret; + if (cap_user_memory2 == -1) { + cap_user_memory2 = kvm_check_extension(s, KVM_CAP_USER_MEMORY2); + } + + if (!cap_user_memory2 && slot->guest_memfd >= 0) { + error_report("%s, KVM doesn't support KVM_CAP_USER_MEMORY2," + " which is required by guest memfd!", __func__); + exit(1); + } + mem.slot = slot->slot | (kml->as_id << 16); mem.guest_phys_addr = slot->start_addr; mem.userspace_addr = (unsigned long)slot->ram; mem.flags = slot->flags; + mem.guest_memfd = slot->guest_memfd; + mem.guest_memfd_offset = slot->guest_memfd_offset; if (slot->memory_size && !new && (mem.flags ^ slot->old_flags) & KVM_MEM_READONLY) { /* Set the slot size to 0 before setting the slot to the desired * value. This is needed based on KVM commit 75d61fbc. */ mem.memory_size = 0; - ret = kvm_vm_ioctl(s, KVM_SET_USER_MEMORY_REGION, &mem); + + if (cap_user_memory2) { + ret = kvm_vm_ioctl(s, KVM_SET_USER_MEMORY_REGION2, &mem); + } else { + ret = kvm_vm_ioctl(s, KVM_SET_USER_MEMORY_REGION, &mem); + } if (ret < 0) { goto err; } } mem.memory_size = slot->memory_size; - ret = kvm_vm_ioctl(s, KVM_SET_USER_MEMORY_REGION, &mem); + if (cap_user_memory2) { + ret = kvm_vm_ioctl(s, KVM_SET_USER_MEMORY_REGION2, &mem); + } else { + ret = kvm_vm_ioctl(s, KVM_SET_USER_MEMORY_REGION, &mem); + } slot->old_flags = mem.flags; err: - trace_kvm_set_user_memory(mem.slot, mem.flags, mem.guest_phys_addr, - mem.memory_size, mem.userspace_addr, ret); + trace_kvm_set_user_memory(mem.slot >> 16, (uint16_t)mem.slot, mem.flags, + mem.guest_phys_addr, mem.memory_size, + mem.userspace_addr, mem.guest_memfd, + mem.guest_memfd_offset, ret); if (ret < 0) { - error_report("%s: KVM_SET_USER_MEMORY_REGION failed, slot=%d," - " start=0x%" PRIx64 ", size=0x%" PRIx64 ": %s", - __func__, mem.slot, slot->start_addr, - (uint64_t)mem.memory_size, strerror(errno)); + if (cap_user_memory2) { + error_report("%s: KVM_SET_USER_MEMORY_REGION2 failed, slot=%d," + " start=0x%" PRIx64 ", size=0x%" PRIx64 "," + " flags=0x%" PRIx32 ", guest_memfd=%" PRId32 "," + " guest_memfd_offset=0x%" PRIx64 ": %s", + __func__, mem.slot, slot->start_addr, + (uint64_t)mem.memory_size, mem.flags, + mem.guest_memfd, (uint64_t)mem.guest_memfd_offset, + strerror(errno)); + } else { + error_report("%s: KVM_SET_USER_MEMORY_REGION failed, slot=%d," + " start=0x%" PRIx64 ", size=0x%" PRIx64 ": %s", + __func__, mem.slot, slot->start_addr, + (uint64_t)mem.memory_size, strerror(errno)); + } } return ret; } @@ -475,6 +512,9 @@ static int kvm_mem_flags(MemoryRegion *mr) if (readonly && kvm_readonly_mem_allowed) { flags |= KVM_MEM_READONLY; } + if (memory_region_has_guest_memfd(mr)) { + flags |= KVM_MEM_GUEST_MEMFD; + } return flags; } @@ -1266,6 +1306,44 @@ void kvm_set_max_memslot_size(hwaddr max_slot_size) 
 {
     kvm_max_slot_size = max_slot_size;
 }
 
+static int kvm_set_memory_attributes(hwaddr start, hwaddr size, uint64_t attr)
+{
+    struct kvm_memory_attributes attrs;
+    int r;
+
+    attrs.attributes = attr;
+    attrs.address = start;
+    attrs.size = size;
+    attrs.flags = 0;
+
+    r = kvm_vm_ioctl(kvm_state, KVM_SET_MEMORY_ATTRIBUTES, &attrs);
+    if (r) {
+        warn_report("%s: failed to set memory (0x%" HWADDR_PRIx "+0x%" HWADDR_PRIx ")"
+                    " with attr 0x%" PRIx64 ": %s",
+                    __func__, start, size, attr, strerror(errno));
+    }
+    return r;
+}
+
+int kvm_set_memory_attributes_private(hwaddr start, hwaddr size)
+{
+    if (!(kvm_supported_memory_attributes & KVM_MEMORY_ATTRIBUTE_PRIVATE)) {
+        error_report("KVM doesn't support the PRIVATE memory attribute");
+        return -EINVAL;
+    }
+
+    return kvm_set_memory_attributes(start, size, KVM_MEMORY_ATTRIBUTE_PRIVATE);
+}
+
+int kvm_set_memory_attributes_shared(hwaddr start, hwaddr size)
+{
+    if (!(kvm_supported_memory_attributes & KVM_MEMORY_ATTRIBUTE_PRIVATE)) {
+        error_report("KVM doesn't support the PRIVATE memory attribute");
+        return -EINVAL;
+    }
+
+    return kvm_set_memory_attributes(start, size, 0);
+}
+
 /* Called with KVMMemoryListener.slots_lock held */
 static void kvm_set_phys_mem(KVMMemoryListener *kml,
                              MemoryRegionSection *section, bool add)
@@ -1362,6 +1440,9 @@ static void kvm_set_phys_mem(KVMMemoryListener *kml,
         mem->ram_start_offset = ram_start_offset;
         mem->ram = ram;
         mem->flags = kvm_mem_flags(mr);
+        mem->guest_memfd = mr->ram_block->guest_memfd;
+        mem->guest_memfd_offset = (uint8_t *)ram - mr->ram_block->host;
+
         kvm_slot_init_dirty_bitmap(mem);
         err = kvm_set_user_memory_region(kml, mem, true);
         if (err) {
@@ -1369,6 +1450,16 @@ static void kvm_set_phys_mem(KVMMemoryListener *kml,
                         strerror(-err));
             abort();
         }
+
+        if (memory_region_is_default_private(mr)) {
+            err = kvm_set_memory_attributes_private(start_addr, slot_size);
+            if (err) {
+                error_report("%s: failed to set memory attribute private: %s",
+                             __func__, strerror(-err));
+                exit(1);
+            }
+        }
+
         start_addr += slot_size;
         ram_start_offset += slot_size;
         ram += slot_size;
@@ -2396,6 +2487,11 @@ static int kvm_init(MachineState *ms)
     }
     s->as = g_new0(struct KVMAs, s->nr_as);
 
+    kvm_guest_memfd_supported = kvm_check_extension(s, KVM_CAP_GUEST_MEMFD);
+
+    ret = kvm_check_extension(s, KVM_CAP_MEMORY_ATTRIBUTES);
+    kvm_supported_memory_attributes = ret > 0 ? ret : 0;
+
     if (object_property_find(OBJECT(current_machine), "kvm-type")) {
         g_autofree char *kvm_type =
             object_property_get_str(OBJECT(current_machine), "kvm-type",
@@ -2816,6 +2912,51 @@ static void kvm_eat_signals(CPUState *cpu)
     } while (sigismember(&chkset, SIG_IPI));
 }
 
+int kvm_convert_memory(hwaddr start, hwaddr size, bool to_private)
+{
+    MemoryRegionSection section;
+    ram_addr_t offset;
+    RAMBlock *rb;
+    void *addr;
+    int ret = -1;
+
+    trace_kvm_convert_memory(start, size,
+                             to_private ? "shared_to_private" : "private_to_shared");
+    section = memory_region_find(get_system_memory(), start, size);
+    if (!section.mr) {
+        return ret;
+    }
+
+    if (memory_region_has_guest_memfd(section.mr)) {
+        if (to_private) {
+            ret = kvm_set_memory_attributes_private(start, size);
+        } else {
+            ret = kvm_set_memory_attributes_shared(start, size);
+        }
+
+        if (ret) {
+            memory_region_unref(section.mr);
+            return ret;
+        }
+
+        addr = memory_region_get_ram_ptr(section.mr) +
+               section.offset_within_region;
+        rb = qemu_ram_block_from_host(addr, false, &offset);
+        /*
+         * The memory attributes were already switched above via
+         * KVM_SET_MEMORY_ATTRIBUTES; the operation on the underlying
+         * file descriptor below only releases pages that are no longer
+         * needed.
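+         * Converting to private discards the pages backing the shared
+         * view; converting to shared discards the guest_memfd pages.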
+         */
+        ram_block_convert_range(rb, offset, size, to_private);
+    } else {
+        warn_report("Convert non guest_memfd backed memory region "
+                    "(0x%" HWADDR_PRIx " + 0x%" HWADDR_PRIx ") to %s",
+                    start, size, to_private ? "private" : "shared");
+    }
+
+    memory_region_unref(section.mr);
+    return ret;
+}
+
 int kvm_cpu_exec(CPUState *cpu)
 {
     struct kvm_run *run = cpu->kvm_run;
@@ -2883,18 +3024,20 @@ int kvm_cpu_exec(CPUState *cpu)
                 ret = EXCP_INTERRUPT;
                 break;
             }
-            fprintf(stderr, "error: kvm run failed %s\n",
-                    strerror(-run_ret));
+            if (!(run_ret == -EFAULT && run->exit_reason == KVM_EXIT_MEMORY_FAULT)) {
+                fprintf(stderr, "error: kvm run failed %s\n",
+                        strerror(-run_ret));
 #ifdef TARGET_PPC
-            if (run_ret == -EBUSY) {
-                fprintf(stderr,
-                        "This is probably because your SMT is enabled.\n"
-                        "VCPU can only run on primary threads with all "
-                        "secondary threads offline.\n");
-            }
+                if (run_ret == -EBUSY) {
+                    fprintf(stderr,
+                            "This is probably because your SMT is enabled.\n"
+                            "VCPU can only run on primary threads with all "
+                            "secondary threads offline.\n");
+                }
 #endif
-            ret = -1;
-            break;
+                ret = -1;
+                break;
+            }
         }
 
         trace_kvm_run_exit(cpu->cpu_index, run->exit_reason);
@@ -2981,6 +3124,18 @@ int kvm_cpu_exec(CPUState *cpu)
                 break;
             }
             break;
+        case KVM_EXIT_MEMORY_FAULT:
+            g_warning("memory fault: GPA 0x%llx size 0x%llx flags 0x%llx",
+                      run->memory_fault.gpa, run->memory_fault.size,
+                      run->memory_fault.flags);
+            if (run->memory_fault.flags & ~KVM_MEMORY_EXIT_FLAG_PRIVATE) {
+                error_report("KVM_EXIT_MEMORY_FAULT: Unknown flag 0x%" PRIx64,
+                             (uint64_t)run->memory_fault.flags);
+                ret = -1;
+                break;
+            }
+            ret = kvm_convert_memory(run->memory_fault.gpa, run->memory_fault.size,
+                                     run->memory_fault.flags & KVM_MEMORY_EXIT_FLAG_PRIVATE);
+            break;
         default:
             DPRINTF("kvm_arch_handle_exit\n");
             ret = kvm_arch_handle_exit(cpu, run);
@@ -4077,3 +4232,29 @@ void query_stats_schemas_cb(StatsSchemaList **result, Error **errp)
         query_stats_schema_vcpu(first_cpu, &stats_args);
     }
 }
+
+int kvm_create_guest_memfd(uint64_t size, uint64_t flags, Error **errp)
+{
+    int fd;
+    struct kvm_create_guest_memfd guest_memfd = {
+        .size = size,
+        .flags = flags,
+    };
+
+    if (!kvm_guest_memfd_supported) {
+        error_setg(errp, "KVM doesn't support guest memfd");
+        return -EOPNOTSUPP;
+    }
+
+    fd = kvm_vm_ioctl(kvm_state, KVM_CREATE_GUEST_MEMFD, &guest_memfd);
+    if (fd < 0) {
+        error_setg_errno(errp, errno, "%s: error creating kvm guest memfd", __func__);
+    }
+
+    return fd;
+}
+
+bool kvm_has_restricted_memory(void)
+{
+    return current_machine->require_guest_memfd;
+}
diff --git a/accel/kvm/trace-events b/accel/kvm/trace-events
index 399aaeb0ec75..bca51f877b12 100644
--- a/accel/kvm/trace-events
+++ b/accel/kvm/trace-events
@@ -15,7 +15,7 @@ kvm_irqchip_update_msi_route(int virq) "Updating MSI route virq=%d"
 kvm_irqchip_release_virq(int virq) "virq %d"
 kvm_set_ioeventfd_mmio(int fd, uint64_t addr, uint32_t val, bool assign, uint32_t size, bool datamatch) "fd: %d @0x%" PRIx64 " val=0x%x assign: %d size: %d match: %d"
 kvm_set_ioeventfd_pio(int fd, uint16_t addr, uint32_t val, bool assign, uint32_t size, bool datamatch) "fd: %d @0x%x val=0x%x assign: %d size: %d match: %d"
-kvm_set_user_memory(uint32_t slot, uint32_t flags, uint64_t guest_phys_addr, uint64_t memory_size, uint64_t userspace_addr, int ret) "Slot#%d flags=0x%x gpa=0x%"PRIx64 " size=0x%"PRIx64 " ua=0x%"PRIx64 " ret=%d"
int ret) "AddrSpace#%d Slot#%d flags=0x%x gpa=0x%"PRIx64 " size=0x%"PRIx64 " ua=0x%"PRIx64 " guest_memfd=%d" " guest_memfd_offset=0x%" PRIx64 " ret=%d" kvm_clear_dirty_log(uint32_t slot, uint64_t start, uint32_t size) "slot#%"PRId32" start 0x%"PRIx64" size 0x%"PRIx32 kvm_resample_fd_notify(int gsi) "gsi %d" kvm_dirty_ring_full(int id) "vcpu %d" @@ -25,4 +25,4 @@ kvm_dirty_ring_reaper(const char *s) "%s" kvm_dirty_ring_reap(uint64_t count, int64_t t) "reaped %"PRIu64" pages (took %"PRIi64" us)" kvm_dirty_ring_reaper_kick(const char *reason) "%s" kvm_dirty_ring_flush(int finished) "%d" - +kvm_convert_memory(uint64_t start, uint64_t size, const char *msg) "start 0x%" PRIx64 " size 0x%" PRIx64 " %s" diff --git a/accel/stubs/kvm-stub.c b/accel/stubs/kvm-stub.c index 1b37d9a302cc..9c235dd457c7 100644 --- a/accel/stubs/kvm-stub.c +++ b/accel/stubs/kvm-stub.c @@ -124,3 +124,8 @@ uint32_t kvm_dirty_ring_size(void) { return 0; } + +bool kvm_has_restricted_memory(void) +{ + return false; +} diff --git a/backends/confidential-guest-support.c b/backends/confidential-guest-support.c index 052fde8db049..fe5d7a46cc44 100644 --- a/backends/confidential-guest-support.c +++ b/backends/confidential-guest-support.c @@ -14,20 +14,107 @@ #include "qemu/osdep.h" #include "exec/confidential-guest-support.h" +#include "qemu/error-report.h" +#include "qapi/error.h" +#include "exec/igvm.h" OBJECT_DEFINE_ABSTRACT_TYPE(ConfidentialGuestSupport, confidential_guest_support, CONFIDENTIAL_GUEST_SUPPORT, OBJECT) +#if defined(CONFIG_IGVM) +static char *get_igvm(Object *obj, Error **errp) +{ + ConfidentialGuestSupport *cgs = CONFIDENTIAL_GUEST_SUPPORT(obj); + return g_strdup(cgs->igvm_filename); +} + +static void set_igvm(Object *obj, const char *value, Error **errp) +{ + ConfidentialGuestSupport *cgs = CONFIDENTIAL_GUEST_SUPPORT(obj); + g_free(cgs->igvm_filename); + cgs->igvm_filename = g_strdup(value); +#if defined(CONFIG_IGVM) + igvm_file_init(cgs, errp); +#endif +} +#endif + static void confidential_guest_support_class_init(ObjectClass *oc, void *data) { +#if defined(CONFIG_IGVM) + object_class_property_add_str(oc, "igvm-file", + get_igvm, set_igvm); + object_class_property_set_description(oc, "igvm-file", + "Set the IGVM filename to use"); +#endif +} + +static int check_support(ConfidentialGuestPlatformType platform, + uint16_t platform_version, uint8_t highest_vtl, + uint64_t shared_gpa_boundary) +{ + /* Default: no support. 
*/ + return 0; +} + +static int set_guest_state(hwaddr gpa, uint8_t *ptr, uint64_t len, + ConfidentialGuestPageType memory_type, + uint16_t cpu_index, Error **errp) +{ + error_setg(errp, + "Setting confidential guest state is not supported for this platform"); + return -1; +} + +static int set_guest_policy(ConfidentialGuestPolicyType policy_type, + uint64_t policy, + void *policy_data1, uint32_t policy_data1_size, + void *policy_data2, uint32_t policy_data2_size, + Error **errp) +{ + error_setg(errp, + "Setting confidential guest policy is not supported for this platform"); + return -1; +} + +static int get_mem_map_entry(int index, ConfidentialGuestMemoryMapEntry *entry, + Error **errp) +{ + error_setg( + errp, + "Obtaining the confidential guest memory map is not supported for this platform"); + return -1; } static void confidential_guest_support_init(Object *obj) { + ConfidentialGuestSupport *cgs = CONFIDENTIAL_GUEST_SUPPORT(obj); + cgs->check_support = check_support; + cgs->set_guest_state = set_guest_state; + cgs->set_guest_policy = set_guest_policy; + cgs->get_mem_map_entry = get_mem_map_entry; } static void confidential_guest_support_finalize(Object *obj) { } + +bool cgs_is_igvm(ConfidentialGuestSupport *cgs) +{ +#if defined(CONFIG_IGVM) + return cgs && cgs->igvm; +#else + return false; +#endif +} + +void cgs_process_igvm(ConfidentialGuestSupport *cgs) +{ +#if defined(CONFIG_IGVM) + if (cgs && cgs_is_igvm(cgs)) { + igvm_process(cgs, &error_fatal); + } +#endif +} diff --git a/backends/hostmem-file.c b/backends/hostmem-file.c index 361d4a8103ef..d5ea2879f321 100644 --- a/backends/hostmem-file.c +++ b/backends/hostmem-file.c @@ -84,6 +84,7 @@ file_backend_memory_alloc(HostMemoryBackend *backend, Error **errp) ram_flags |= fb->readonly ? RAM_READONLY_FD : 0; ram_flags |= fb->rom == ON_OFF_AUTO_ON ? RAM_READONLY : 0; ram_flags |= backend->reserve ? 0 : RAM_NORESERVE; + ram_flags |= backend->require_guest_memfd ? RAM_GUEST_MEMFD : 0; ram_flags |= fb->is_pmem ? RAM_PMEM : 0; ram_flags |= RAM_NAMED_FILE; memory_region_init_ram_from_file(&backend->mr, OBJECT(backend), name, diff --git a/backends/hostmem-memfd.c b/backends/hostmem-memfd.c index 3fc85c3db81b..011e2311f088 100644 --- a/backends/hostmem-memfd.c +++ b/backends/hostmem-memfd.c @@ -55,6 +55,7 @@ memfd_backend_memory_alloc(HostMemoryBackend *backend, Error **errp) name = host_memory_backend_get_name(backend); ram_flags = backend->share ? RAM_SHARED : 0; ram_flags |= backend->reserve ? 0 : RAM_NORESERVE; + ram_flags |= backend->require_guest_memfd ? RAM_GUEST_MEMFD : 0; memory_region_init_ram_from_fd(&backend->mr, OBJECT(backend), name, backend->size, ram_flags, fd, 0, errp); g_free(name); diff --git a/backends/hostmem-ram.c b/backends/hostmem-ram.c index b8e55cdbd0f8..7d2e1327f8c8 100644 --- a/backends/hostmem-ram.c +++ b/backends/hostmem-ram.c @@ -30,6 +30,7 @@ ram_backend_memory_alloc(HostMemoryBackend *backend, Error **errp) name = host_memory_backend_get_name(backend); ram_flags = backend->share ? RAM_SHARED : 0; ram_flags |= backend->reserve ? 0 : RAM_NORESERVE; + ram_flags |= backend->require_guest_memfd ? 
RAM_GUEST_MEMFD : 0;
     memory_region_init_ram_flags_nomigrate(&backend->mr, OBJECT(backend), name,
                                            backend->size, ram_flags, errp);
     g_free(name);
diff --git a/backends/hostmem.c b/backends/hostmem.c
index 747e7838c031..2deb2b78bcb8 100644
--- a/backends/hostmem.c
+++ b/backends/hostmem.c
@@ -279,6 +279,7 @@ static void host_memory_backend_init(Object *obj)
     /* TODO: convert access to globals to compat properties */
     backend->merge = machine_mem_merge(machine);
     backend->dump = machine_dump_guest_core(machine);
+    backend->require_guest_memfd = machine_require_guest_memfd(machine);
     backend->reserve = true;
     backend->prealloc_threads = machine->smp.cpus;
 }
diff --git a/backends/igvm.c b/backends/igvm.c
new file mode 100644
index 000000000000..aba5586bc01d
--- /dev/null
+++ b/backends/igvm.c
@@ -0,0 +1,903 @@
+/*
+ * QEMU IGVM configuration backend for Confidential Guests
+ *
+ * Copyright (C) 2023-2024 SUSE
+ *
+ * Authors:
+ *  Roy Hopkins
+ *
+ * This work is licensed under the terms of the GNU GPL, version 2 or later.
+ * See the COPYING file in the top-level directory.
+ */
+
+#include "qemu/osdep.h"
+
+#if defined(CONFIG_IGVM)
+
+#include "exec/confidential-guest-support.h"
+#include "qemu/queue.h"
+#include "qemu/typedefs.h"
+
+#include "exec/igvm.h"
+#include "qemu/error-report.h"
+#include "hw/boards.h"
+#include "qapi/error.h"
+#include "exec/address-spaces.h"
+
+#include <igvm/igvm.h>
+#include <igvm/igvm_defs.h>
+#include
+
+typedef struct IgvmParameterData {
+    QTAILQ_ENTRY(IgvmParameterData) next;
+    uint8_t *data;
+    uint32_t size;
+    uint32_t index;
+} IgvmParameterData;
+
+/*
+ * Some directives are specific to individual confidential computing platforms.
+ * Define required types for each of those platforms here.
+ */
+
+/* SEV/SEV-ES/SEV-SNP */
+struct QEMU_PACKED sev_id_block {
+    uint8_t ld[48];
+    uint8_t family_id[16];
+    uint8_t image_id[16];
+    uint32_t version;
+    uint32_t guest_svn;
+    uint64_t policy;
+};
+
+struct QEMU_PACKED sev_id_authentication {
+    uint32_t id_key_alg;
+    uint32_t auth_key_algo;
+    uint8_t reserved[56];
+    uint8_t id_block_sig[512];
+    uint8_t id_key[1028];
+    uint8_t reserved2[60];
+    uint8_t id_key_sig[512];
+    uint8_t author_key[1028];
+    uint8_t reserved3[892];
+};
+
+struct igvm_context {
+    /*
+     * Compatibility mask that is used to check if IGVM directives apply
+     * to the current platform.
+     */
+    uint32_t compatibility_mask;
+
+    /*
+     * IGVM definition of the current platform type.
+     */
+    IgvmPlatformType platform_type;
+
+    /*
+     * The ConfidentialGuestSupport object that is used to process directives
+     * in the IGVM file.
+     */
+    ConfidentialGuestSupport *cgs;
+
+    /*
+     * For SEV platforms, optionally contains the ID block and authentication
+     * that should be verified by the guest.
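+     * Both remain NULL unless the IGVM file contains an
+     * IGVM_VHT_SNP_ID_BLOCK directive for the current platform.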
+ */ + struct sev_id_block *id_block; + struct sev_id_authentication *id_auth; + + /* Define the guest policy for SEV guests */ + uint64_t sev_policy; + + /* List of all parameters to populate in the guest */ + QTAILQ_HEAD(, IgvmParameterData) parameter_data; +}; + +static int directive_page_data(struct igvm_context *ctx, int i, + const uint8_t *header_data, Error **errp); +static int directive_vp_context(struct igvm_context *ctx, int i, + const uint8_t *header_data, Error **errp); +static int directive_parameter_area(struct igvm_context *ctx, int i, + const uint8_t *header_data, Error **errp); +static int directive_parameter_insert(struct igvm_context *ctx, int i, + const uint8_t *header_data, Error **errp); +static int directive_memory_map(struct igvm_context *ctx, int i, + const uint8_t *header_data, Error **errp); +static int directive_vp_count(struct igvm_context *ctx, int i, + const uint8_t *header_data, Error **errp); +static int directive_environment_info(struct igvm_context *ctx, int i, + const uint8_t *header_data, Error **errp); +static int directive_required_memory(struct igvm_context *ctx, int i, + const uint8_t *header_data, Error **errp); +static int directive_snp_id_block(struct igvm_context *ctx, int i, + const uint8_t *header_data, Error **errp); + +static int initialization_guest_policy(struct igvm_context *ctx, int i, + const uint8_t *header_data, Error **errp); + +struct IGVMHandler { + uint32_t type; + uint32_t section; + int (*handler)(struct igvm_context *ctx, int i, + const uint8_t *header_data, Error **errp); +}; + +static struct IGVMHandler handlers[] = { + { IGVM_VHT_PAGE_DATA, HEADER_SECTION_DIRECTIVE, directive_page_data }, + { IGVM_VHT_VP_CONTEXT, HEADER_SECTION_DIRECTIVE, directive_vp_context }, + { IGVM_VHT_PARAMETER_AREA, HEADER_SECTION_DIRECTIVE, directive_parameter_area }, + { IGVM_VHT_PARAMETER_INSERT, HEADER_SECTION_DIRECTIVE, directive_parameter_insert }, + { IGVM_VHT_MEMORY_MAP, HEADER_SECTION_DIRECTIVE, directive_memory_map }, + { IGVM_VHT_VP_COUNT_PARAMETER, HEADER_SECTION_DIRECTIVE, directive_vp_count }, + { IGVM_VHT_ENVIRONMENT_INFO_PARAMETER, HEADER_SECTION_DIRECTIVE, directive_environment_info }, + { IGVM_VHT_REQUIRED_MEMORY, HEADER_SECTION_DIRECTIVE, directive_required_memory }, + { IGVM_VHT_SNP_ID_BLOCK, HEADER_SECTION_DIRECTIVE, directive_snp_id_block }, + { IGVM_VHT_GUEST_POLICY, HEADER_SECTION_INITIALIZATION, initialization_guest_policy }, +}; + +static int handle(uint32_t type, struct igvm_context *ctx, int i, Error **errp) +{ + size_t handler; + IgvmHandle header_handle; + const uint8_t *header_data; + int result; + + for (handler = 0; handler < (sizeof(handlers) / + sizeof(struct IGVMHandler)); + ++handler) { + if (handlers[handler].type == type) { + header_handle = + igvm_get_header(ctx->cgs->igvm, handlers[handler].section, i); + if (header_handle < 0) { + error_setg( + errp, + "IGVM file is invalid: Failed to read header (code: %d)", + (int)header_handle); + return -1; + } + header_data = igvm_get_buffer(ctx->cgs->igvm, header_handle) + + sizeof(IGVM_VHS_VARIABLE_HEADER); + result = handlers[handler].handler(ctx, i, header_data, errp); + igvm_free_buffer(ctx->cgs->igvm, header_handle); + return result; + } + } + error_setg(errp, + "IGVM: Unknown header type encountered when processing file: " + "(type 0x%X)", + type); + return -1; +} + +static void *igvm_prepare_memory(uint64_t addr, uint64_t size, + int region_identifier, Error **errp) +{ + ERRP_GUARD(); + MemoryRegion *igvm_pages = NULL; + Int128 gpa_region_size; + 
MemoryRegionSection mrs =
+        memory_region_find(get_system_memory(), addr, size);
+    if (mrs.mr) {
+        if (!memory_region_is_ram(mrs.mr)) {
+            memory_region_unref(mrs.mr);
+            error_setg(
+                errp,
+                "Processing of IGVM file failed: Could not prepare memory "
+                "at address 0x%" PRIX64 " due to existing non-RAM region",
+                addr);
+            return NULL;
+        }
+
+        gpa_region_size = int128_make64(size);
+        if (int128_lt(mrs.size, gpa_region_size)) {
+            memory_region_unref(mrs.mr);
+            error_setg(
+                errp,
+                "Processing of IGVM file failed: Could not prepare memory "
+                "at address 0x%" PRIX64 ": region size exceeded",
+                addr);
+            return NULL;
+        }
+        return qemu_map_ram_ptr(mrs.mr->ram_block, mrs.offset_within_region);
+    } else {
+        /*
+         * The region_identifier is the index of the IGVM directive that
+         * contains the page with the lowest GPA in the region. This will
+         * generate a unique region name.
+         */
+        g_autofree char *region_name =
+            g_strdup_printf("igvm.%X", region_identifier);
+        igvm_pages = g_malloc(sizeof(*igvm_pages));
+        memory_region_init_ram_guest_memfd(igvm_pages, NULL, region_name, size,
+                                           errp);
+        if (*errp) {
+            return NULL;
+        }
+        memory_region_add_subregion(get_system_memory(), addr, igvm_pages);
+        return memory_region_get_ram_ptr(igvm_pages);
+    }
+}
+
+static int igvm_type_to_cgs_type(IgvmPageDataType memory_type, bool unmeasured,
+                                 bool zero)
+{
+    switch (memory_type) {
+    case NORMAL: {
+        if (unmeasured) {
+            return CGS_PAGE_TYPE_UNMEASURED;
+        } else {
+            return zero ? CGS_PAGE_TYPE_ZERO : CGS_PAGE_TYPE_NORMAL;
+        }
+    }
+    case SECRETS:
+        return CGS_PAGE_TYPE_SECRETS;
+    case CPUID_DATA:
+        return CGS_PAGE_TYPE_CPUID;
+    case CPUID_XF:
+        return CGS_PAGE_TYPE_CPUID;
+    default:
+        return -1;
+    }
+}
+
+static bool page_attrs_equal(IgvmHandle igvm, int i,
+                             const IGVM_VHS_PAGE_DATA *page_1,
+                             const IGVM_VHS_PAGE_DATA *page_2)
+{
+    IgvmHandle data_handle1, data_handle2;
+
+    /*
+     * If one page has data and the other doesn't then this results in
+     * different page types: NORMAL vs ZERO.
+     */
+    data_handle1 = igvm_get_header_data(igvm, HEADER_SECTION_DIRECTIVE, i - 1);
+    data_handle2 = igvm_get_header_data(igvm, HEADER_SECTION_DIRECTIVE, i);
+    if ((data_handle1 == IGVMAPI_NO_DATA) &&
+        (data_handle2 != IGVMAPI_NO_DATA)) {
+        return false;
+    } else if ((data_handle1 != IGVMAPI_NO_DATA) &&
+               (data_handle2 == IGVMAPI_NO_DATA)) {
+        return false;
+    }
+    return ((*(const uint32_t *)&page_1->flags ==
+             *(const uint32_t *)&page_2->flags) &&
+            (page_1->data_type == page_2->data_type) &&
+            (page_1->compatibility_mask == page_2->compatibility_mask));
+}
+
+static int igvm_process_mem_region(struct igvm_context *ctx,
+                                   int start_index,
+                                   uint64_t gpa_start, int page_count,
+                                   const IgvmPageDataFlags *flags,
+                                   const IgvmPageDataType page_type,
+                                   Error **errp)
+{
+    ERRP_GUARD();
+    uint8_t *region;
+    IgvmHandle data_handle;
+    const void *data;
+    uint32_t data_size;
+    int i;
+    bool zero = true;
+    const uint64_t page_size = flags->is_2mb_page ? 0x200000 : 0x1000;
+    int result;
+    int cgs_page_type;
+
+    region = igvm_prepare_memory(gpa_start, page_count * page_size, start_index,
+                                 errp);
+    if (!region) {
+        return -1;
+    }
+
+    for (i = 0; i < page_count; ++i) {
+        data_handle = igvm_get_header_data(ctx->cgs->igvm,
+                                           HEADER_SECTION_DIRECTIVE,
+                                           i + start_index);
+        if (data_handle == IGVMAPI_NO_DATA) {
+            /* No data indicates a zero page */
+            memset(&region[i * page_size], 0, page_size);
+        } else if (data_handle < 0) {
+            error_setg(
+                errp,
+                "IGVM file contains invalid page data for directive with "
+                "index %d",
+                i + start_index);
+            return -1;
+        } else {
+            zero = false;
+            data_size = igvm_get_buffer_size(ctx->cgs->igvm, data_handle);
+            if (data_size < page_size) {
+                memset(&region[i * page_size], 0, page_size);
+            } else if (data_size > page_size) {
+                error_setg(errp,
+                           "IGVM file contains page data with invalid size for "
+                           "directive with index %d",
+                           i + start_index);
+                return -1;
+            }
+            data = igvm_get_buffer(ctx->cgs->igvm, data_handle);
+            memcpy(&region[i * page_size], data, data_size);
+            igvm_free_buffer(ctx->cgs->igvm, data_handle);
+        }
+    }
+
+    cgs_page_type = igvm_type_to_cgs_type(page_type, flags->unmeasured, zero);
+    if (cgs_page_type < 0) {
+        error_setg(
+            errp,
+            "Invalid page type in IGVM file. Directives: %d to %d, "
+            "page type: %d",
+            start_index, start_index + page_count, page_type);
+        return -1;
+    }
+
+    result = ctx->cgs->set_guest_state(gpa_start, region, page_size * page_count,
+                                       cgs_page_type, 0, errp);
+    if ((result < 0) && !*errp) {
+        error_setg(errp, "IGVM set guest state failed with code %d", result);
+        return -1;
+    }
+    return 0;
+}
+
+static int process_mem_page(struct igvm_context *ctx, int i,
+                            const IGVM_VHS_PAGE_DATA *page_data, Error **errp)
+{
+    ERRP_GUARD();
+    static IGVM_VHS_PAGE_DATA prev_page_data;
+    static uint64_t region_start;
+    static int region_start_i;
+    static int last_i;
+    static int page_count;
+
+    if (page_data) {
+        if (page_count == 0) {
+            region_start = page_data->gpa;
+            region_start_i = i;
+        } else {
+            if (!page_attrs_equal(ctx->cgs->igvm, i, page_data,
+                                  &prev_page_data) ||
+                ((prev_page_data.gpa +
+                  (prev_page_data.flags.is_2mb_page ?
0x200000 : 0x1000)) != + page_data->gpa) || + (last_i != (i - 1))) { + /* End of current region */ + if (igvm_process_mem_region(ctx, region_start_i, + region_start, page_count, + &prev_page_data.flags, + prev_page_data.data_type, errp) < 0) { + return -1; + } + page_count = 0; + region_start = page_data->gpa; + region_start_i = i; + } + } + memcpy(&prev_page_data, page_data, sizeof(prev_page_data)); + last_i = i; + ++page_count; + } else { + if (page_count > 0) { + if (igvm_process_mem_region(ctx, region_start_i, + region_start, page_count, + &prev_page_data.flags, + prev_page_data.data_type, errp) < 0) { + return -1; + } + page_count = 0; + } + } + return 0; +} + +static int directive_page_data(struct igvm_context *ctx, int i, + const uint8_t *header_data, Error **errp) +{ + const IGVM_VHS_PAGE_DATA *page_data = + (const IGVM_VHS_PAGE_DATA *)header_data; + if (page_data->compatibility_mask & ctx->compatibility_mask) { + return process_mem_page(ctx, i, page_data, errp); + } + return 0; +} + +static int directive_vp_context(struct igvm_context *ctx, int i, + const uint8_t *header_data, Error **errp) +{ + ERRP_GUARD(); + const IGVM_VHS_VP_CONTEXT *vp_context = + (const IGVM_VHS_VP_CONTEXT *)header_data; + IgvmHandle data_handle; + uint8_t *data; + int result; + + if (vp_context->compatibility_mask & ctx->compatibility_mask) { + data_handle = + igvm_get_header_data(ctx->cgs->igvm, HEADER_SECTION_DIRECTIVE, i); + if (data_handle < 0) { + error_setg(errp, "Invalid VP context in IGVM file. Error code: %X", + data_handle); + return -1; + } + + data = (uint8_t *)igvm_get_buffer(ctx->cgs->igvm, data_handle); + result = ctx->cgs->set_guest_state( + vp_context->gpa, data, + igvm_get_buffer_size(ctx->cgs->igvm, data_handle), + CGS_PAGE_TYPE_VMSA, vp_context->vp_index, errp); + igvm_free_buffer(ctx->cgs->igvm, data_handle); + if (result != 0) { + if (!*errp) { + error_setg(errp, + "IGVM: Failed to set CPU context: error_code=%d", + result); + } + return -1; + } + } + return 0; +} + +static int directive_parameter_area(struct igvm_context *ctx, int i, + const uint8_t *header_data, Error **errp) +{ + const IGVM_VHS_PARAMETER_AREA *param_area = + (const IGVM_VHS_PARAMETER_AREA *)header_data; + IgvmParameterData *param_entry; + + param_entry = g_new0(IgvmParameterData, 1); + param_entry->size = param_area->number_of_bytes; + param_entry->index = param_area->parameter_area_index; + param_entry->data = g_malloc0(param_entry->size); + + QTAILQ_INSERT_TAIL(&ctx->parameter_data, param_entry, next); + return 0; +} + +static int directive_parameter_insert(struct igvm_context *ctx, int i, + const uint8_t *header_data, Error **errp) +{ + ERRP_GUARD(); + const IGVM_VHS_PARAMETER_INSERT *param = + (const IGVM_VHS_PARAMETER_INSERT *)header_data; + IgvmParameterData *param_entry; + int result; + void *region; + + QTAILQ_FOREACH(param_entry, &ctx->parameter_data, next) + { + if (param_entry->index == param->parameter_area_index) { + region = + igvm_prepare_memory(param->gpa, param_entry->size, i, errp); + if (!region) { + return -1; + } + memcpy(region, param_entry->data, param_entry->size); + g_free(param_entry->data); + param_entry->data = NULL; + + result = ctx->cgs->set_guest_state(param->gpa, region, param_entry->size, + CGS_PAGE_TYPE_UNMEASURED, 0, errp); + if (result != 0) { + if (!*errp) { + error_setg(errp, + "IGVM: Failed to set guest state: error_code=%d", + result); + } + return -1; + } + } + } + return 0; +} + +static int cmp_mm_entry(const void *a, const void *b) +{ + const IGVM_VHS_MEMORY_MAP_ENTRY 
*entry_a =
+        (const IGVM_VHS_MEMORY_MAP_ENTRY *)a;
+    const IGVM_VHS_MEMORY_MAP_ENTRY *entry_b =
+        (const IGVM_VHS_MEMORY_MAP_ENTRY *)b;
+    if (entry_a->starting_gpa_page_number < entry_b->starting_gpa_page_number) {
+        return -1;
+    } else if (entry_a->starting_gpa_page_number >
+               entry_b->starting_gpa_page_number) {
+        return 1;
+    } else {
+        return 0;
+    }
+}
+
+static int directive_memory_map(struct igvm_context *ctx, int i,
+                                const uint8_t *header_data, Error **errp)
+{
+    const IGVM_VHS_PARAMETER *param = (const IGVM_VHS_PARAMETER *)header_data;
+    IgvmParameterData *param_entry;
+    int max_entry_count;
+    int entry = 0;
+    IGVM_VHS_MEMORY_MAP_ENTRY *mm_entry;
+    ConfidentialGuestMemoryMapEntry cgmm_entry;
+    int retval = 0;
+
+    /* Find the parameter area that should hold the memory map */
+    QTAILQ_FOREACH(param_entry, &ctx->parameter_data, next)
+    {
+        if (param_entry->index == param->parameter_area_index) {
+            max_entry_count =
+                param_entry->size / sizeof(IGVM_VHS_MEMORY_MAP_ENTRY);
+            mm_entry = (IGVM_VHS_MEMORY_MAP_ENTRY *)param_entry->data;
+
+            retval = ctx->cgs->get_mem_map_entry(entry, &cgmm_entry, errp);
+            while (retval == 0) {
+                if (entry >= max_entry_count) {
+                    error_setg(
+                        errp,
+                        "IGVM: guest memory map size exceeds parameter area "
+                        "defined in IGVM file");
+                    return -1;
+                }
+                mm_entry[entry].starting_gpa_page_number = cgmm_entry.gpa >> 12;
+                mm_entry[entry].number_of_pages = cgmm_entry.size >> 12;
+
+                switch (cgmm_entry.type) {
+                case CGS_MEM_RAM:
+                    mm_entry[entry].entry_type = MEMORY;
+                    break;
+                case CGS_MEM_RESERVED:
+                    mm_entry[entry].entry_type = PLATFORM_RESERVED;
+                    break;
+                case CGS_MEM_ACPI:
+                    mm_entry[entry].entry_type = PLATFORM_RESERVED;
+                    break;
+                case CGS_MEM_NVS:
+                    mm_entry[entry].entry_type = PERSISTENT;
+                    break;
+                case CGS_MEM_UNUSABLE:
+                    mm_entry[entry].entry_type = PLATFORM_RESERVED;
+                    break;
+                }
+                retval = ctx->cgs->get_mem_map_entry(++entry, &cgmm_entry, errp);
+            }
+            if (retval < 0) {
+                return retval;
+            }
+            /* The entries need to be sorted */
+            qsort(mm_entry, entry, sizeof(IGVM_VHS_MEMORY_MAP_ENTRY),
+                  cmp_mm_entry);
+
+            break;
+        }
+    }
+    return 0;
+}
+
+static int directive_vp_count(struct igvm_context *ctx, int i,
+                              const uint8_t *header_data, Error **errp)
+{
+    const IGVM_VHS_PARAMETER *param = (const IGVM_VHS_PARAMETER *)header_data;
+    IgvmParameterData *param_entry;
+    uint32_t *vp_count;
+    CPUState *cpu;
+
+    QTAILQ_FOREACH(param_entry, &ctx->parameter_data, next)
+    {
+        if (param_entry->index == param->parameter_area_index) {
+            vp_count = (uint32_t *)(param_entry->data + param->byte_offset);
+            *vp_count = 0;
+            CPU_FOREACH(cpu)
+            {
+                (*vp_count)++;
+            }
+            break;
+        }
+    }
+    return 0;
+}
+
+static int directive_environment_info(struct igvm_context *ctx, int i,
+                                      const uint8_t *header_data, Error **errp)
+{
+    const IGVM_VHS_PARAMETER *param = (const IGVM_VHS_PARAMETER *)header_data;
+    IgvmParameterData *param_entry;
+    IgvmEnvironmentInfo *environmental_state;
+
+    QTAILQ_FOREACH(param_entry, &ctx->parameter_data, next)
+    {
+        if (param_entry->index == param->parameter_area_index) {
+            environmental_state =
+                (IgvmEnvironmentInfo *)(param_entry->data + param->byte_offset);
+            environmental_state->memory_is_shared = 1;
+            break;
+        }
+    }
+    return 0;
+}
+
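+/*
+ * IGVM_VHT_REQUIRED_MEMORY directives describe address ranges that the
+ * guest expects to be populated; the range is allocated and registered
+ * with the platform via set_guest_state() without any measured data
+ * payload.
+ */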
+static int directive_required_memory(struct igvm_context *ctx, int i,
+                                     const uint8_t *header_data, Error **errp)
+{
+    ERRP_GUARD();
+    const IGVM_VHS_REQUIRED_MEMORY *mem =
+        (const IGVM_VHS_REQUIRED_MEMORY *)header_data;
+    uint8_t *region;
+    int result;
+
+    if (mem->compatibility_mask & ctx->compatibility_mask) {
+        region = igvm_prepare_memory(mem->gpa, mem->number_of_bytes, i, errp);
+        if (!region) {
+            return -1;
+        }
+        result = ctx->cgs->set_guest_state(mem->gpa, region,
+                                           mem->number_of_bytes,
+                                           CGS_PAGE_TYPE_REQUIRED_MEMORY,
+                                           0, errp);
+        if (result < 0) {
+            if (!*errp) {
+                error_setg(errp,
+                           "IGVM: Failed to set guest state: error_code=%d",
+                           result);
+            }
+            return -1;
+        }
+    }
+    return 0;
+}
+
+static int directive_snp_id_block(struct igvm_context *ctx, int i,
+                                  const uint8_t *header_data, Error **errp)
+{
+    const IGVM_VHS_SNP_ID_BLOCK *igvm_id =
+        (const IGVM_VHS_SNP_ID_BLOCK *)header_data;
+
+    if (ctx->compatibility_mask & igvm_id->compatibility_mask) {
+        if (ctx->id_block) {
+            error_setg(errp, "IGVM: Multiple ID blocks encountered "
+                             "in IGVM file.");
+            return -1;
+        }
+        ctx->id_block = g_malloc0(sizeof(struct sev_id_block));
+        ctx->id_auth = g_malloc0(sizeof(struct sev_id_authentication));
+
+        memcpy(ctx->id_block->family_id, igvm_id->family_id,
+               sizeof(ctx->id_block->family_id));
+        memcpy(ctx->id_block->image_id, igvm_id->image_id,
+               sizeof(ctx->id_block->image_id));
+        ctx->id_block->guest_svn = igvm_id->guest_svn;
+        ctx->id_block->version = 1;
+        memcpy(ctx->id_block->ld, igvm_id->ld, sizeof(ctx->id_block->ld));
+
+        ctx->id_auth->id_key_alg = igvm_id->id_key_algorithm;
+        memcpy(ctx->id_auth->id_block_sig, &igvm_id->id_key_signature,
+               sizeof(igvm_id->id_key_signature));
+
+        ctx->id_auth->auth_key_algo = igvm_id->author_key_algorithm;
+        memcpy(ctx->id_auth->id_key_sig, &igvm_id->author_key_signature,
+               sizeof(igvm_id->author_key_signature));
+
+        /*
+         * SEV and IGVM public key structure population are slightly different.
+         * See SEV Secure Nested Paging Firmware ABI Specification, Chapter 10.
+         */
+        *((uint32_t *)ctx->id_auth->id_key) = igvm_id->id_public_key.curve;
+        memcpy(&ctx->id_auth->id_key[4], &igvm_id->id_public_key.qx, 72);
+        memcpy(&ctx->id_auth->id_key[76], &igvm_id->id_public_key.qy, 72);
+
+        *((uint32_t *)ctx->id_auth->author_key) =
+            igvm_id->author_public_key.curve;
+        memcpy(&ctx->id_auth->author_key[4], &igvm_id->author_public_key.qx,
+               72);
+        memcpy(&ctx->id_auth->author_key[76], &igvm_id->author_public_key.qy,
+               72);
+    }
+
+    return 0;
+}
+
+static int initialization_guest_policy(struct igvm_context *ctx, int i,
+                                       const uint8_t *header_data, Error **errp)
+{
+    const IGVM_VHS_GUEST_POLICY *guest =
+        (const IGVM_VHS_GUEST_POLICY *)header_data;
+
+    if (guest->compatibility_mask & ctx->compatibility_mask) {
+        ctx->sev_policy = guest->policy;
+    }
+    return 0;
+}
+
+static int supported_platform_compat_mask(struct igvm_context *ctx,
+                                          Error **errp)
+{
+    int32_t result;
+    int i;
+    IgvmHandle header_handle;
+    IGVM_VHS_SUPPORTED_PLATFORM *platform;
+
+    ctx->compatibility_mask = 0;
+
+    result = igvm_header_count(ctx->cgs->igvm, HEADER_SECTION_PLATFORM);
+    if (result < 0) {
+        error_setg(errp,
+                   "Invalid platform header count in IGVM file. Error code: %X",
+                   result);
+        return -1;
+    }
+
+    for (i = 0; i < (int)result; ++i) {
+        IgvmVariableHeaderType typ =
+            igvm_get_header_type(ctx->cgs->igvm, HEADER_SECTION_PLATFORM, i);
+        if (typ == IGVM_VHT_SUPPORTED_PLATFORM) {
+            header_handle =
+                igvm_get_header(ctx->cgs->igvm, HEADER_SECTION_PLATFORM, i);
+            if (header_handle < 0) {
+                error_setg(errp,
+                           "Invalid platform header in IGVM file. "
+                           "Index: %d, Error code: %X",
+                           i, header_handle);
+                return -1;
+            }
+            platform =
+                (IGVM_VHS_SUPPORTED_PLATFORM *)(igvm_get_buffer(ctx->cgs->igvm,
+                                                                header_handle) +
+                                                sizeof(
+                                                    IGVM_VHS_VARIABLE_HEADER));
+            /* Currently only SEV-SNP is supported. */
+            if (platform->platform_type == SEV_SNP) {
+                /*
+                 * IGVM does not define separate platform types for SEV or
+                 * SEV-ES. Translate SEV_SNP into CGS_PLATFORM_SEV_ES and
+                 * CGS_PLATFORM_SEV and let the cgs function implementations
+                 * check whether each IGVM directive results in an operation
+                 * that is supported by the particular derivative of SEV.
+                 */
+                if (ctx->cgs->check_support(
+                        CGS_PLATFORM_SEV_SNP, platform->platform_version,
+                        platform->highest_vtl, platform->shared_gpa_boundary) ||
+                    ctx->cgs->check_support(
+                        CGS_PLATFORM_SEV_ES, platform->platform_version,
+                        platform->highest_vtl, platform->shared_gpa_boundary) ||
+                    ctx->cgs->check_support(
+                        CGS_PLATFORM_SEV, platform->platform_version,
+                        platform->highest_vtl, platform->shared_gpa_boundary)) {
+                    ctx->compatibility_mask = platform->compatibility_mask;
+                    ctx->platform_type = platform->platform_type;
+                    break;
+                }
+            }
+            igvm_free_buffer(ctx->cgs->igvm, header_handle);
+        }
+    }
+    if (ctx->compatibility_mask == 0) {
+        error_setg(
+            errp,
+            "IGVM file does not describe a compatible supported platform");
+        return -1;
+    }
+    return 0;
+}
+
+static int handle_policy(struct igvm_context *ctx, Error **errp)
+{
+    if (ctx->platform_type == SEV_SNP) {
+        int id_block_len = 0;
+        int id_auth_len = 0;
+        if (ctx->id_block) {
+            ctx->id_block->policy = ctx->sev_policy;
+            id_block_len = sizeof(struct sev_id_block);
+            id_auth_len = sizeof(struct sev_id_authentication);
+        }
+        return ctx->cgs->set_guest_policy(GUEST_POLICY_SEV, ctx->sev_policy,
+                                          ctx->id_block, id_block_len,
+                                          ctx->id_auth, id_auth_len, errp);
+    }
+    return 0;
+}
+
+int igvm_file_init(ConfidentialGuestSupport *cgs, Error **errp)
+{
+    g_autofree uint8_t *buf = NULL;
+    unsigned long len;
+    g_autoptr(GError) gerr = NULL;
+
+    if (!cgs->igvm_filename) {
+        return 0;
+    }
+
+    if (!g_file_get_contents(cgs->igvm_filename, (gchar **)&buf, &len, &gerr)) {
+        error_setg(errp, "Unable to load %s: %s", cgs->igvm_filename,
+                   gerr->message);
+        return -1;
+    }
+
+    if ((cgs->igvm = igvm_new_from_binary(buf, len)) < 0) {
+        error_setg(errp, "Unable to parse IGVM file %s: %d", cgs->igvm_filename,
+                   cgs->igvm);
+        return -1;
+    }
+
+    return 0;
+}
+
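+/*
+ * Directive headers are processed strictly in file order.  Contiguous
+ * PAGE_DATA directives with compatible flags are batched by
+ * process_mem_page() and flushed when an incompatible page is seen, or
+ * by the final flush call at the end of this function.
+ */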
Error code: %X", + result); + return -1; + } + + for (i = 0; i < (int)result; ++i) { + IgvmVariableHeaderType type = + igvm_get_header_type(cgs->igvm, HEADER_SECTION_INITIALIZATION, i); + if (handle(type, &ctx, i, errp) < 0) { + retval = -1; + break; + } + } + + /* + * Contiguous pages of data with compatible flags are grouped together in + * order to reduce the number of memory regions we create. Make sure the + * last group is processed with this call. + */ + if (retval == 0) { + retval = process_mem_page(&ctx, i, NULL, errp); + } + + if (retval == 0) { + retval = handle_policy(&ctx, errp); + } + + /* Clean up the context */ + QTAILQ_FOREACH(parameter, &ctx.parameter_data, next) + { + g_free(parameter->data); + parameter->data = NULL; + } + g_free(ctx.id_block); + g_free(ctx.id_auth); + + return retval; +} + +#endif diff --git a/backends/meson.build b/backends/meson.build index 914c7c4afb90..f4481dafb721 100644 --- a/backends/meson.build +++ b/backends/meson.build @@ -25,5 +25,9 @@ if have_vhost_user_crypto endif system_ss.add(when: gio, if_true: files('dbus-vmstate.c')) system_ss.add(when: 'CONFIG_SGX', if_true: files('hostmem-epc.c')) +if igvm.found() + system_ss.add(igvm) + system_ss.add(files('igvm.c')) +endif subdir('tpm') diff --git a/docs/system/i386/amd-memory-encryption.rst b/docs/system/i386/amd-memory-encryption.rst index e9bc142bc130..9d6b63acd967 100644 --- a/docs/system/i386/amd-memory-encryption.rst +++ b/docs/system/i386/amd-memory-encryption.rst @@ -25,8 +25,8 @@ support for notifying a guest's operating system when certain types of VMEXITs are about to occur. This allows the guest to selectively share information with the hypervisor to satisfy the requested function. -Launching ---------- +Launching (SEV and SEV-ES) +-------------------------- Boot images (such as bios) must be encrypted before a guest can be booted. The ``MEMORY_ENCRYPT_OP`` ioctl provides commands to encrypt the images: ``LAUNCH_START``, @@ -161,6 +161,80 @@ The value of GCTX.LD is If kernel hashes are not used, or SEV-ES is disabled, use empty blobs for ``kernel_hashes_blob`` and ``vmsas_blob`` as needed. +Launching (SEV-SNP) +------------------- +Boot images (such as bios) must be encrypted before a guest can be booted. The +``MEMORY_ENCRYPT_OP`` ioctl provides commands to encrypt the images: +``KVM_SNP_INIT``, ``SNP_LAUNCH_START``, ``SNP_LAUNCH_UPDATE``, and +``SNP_LAUNCH_FINISH``. These four commands together generate a fresh memory +encryption key for the VM, encrypt the boot images for a successful launch. + +KVM_SNP_INIT is called first to initialize the SEV-SNP firmware and SNP +features in the KVM. The feature flags value can be provided through the +init-flags property of the sev-snp-guest object. + ++------------+-------+----------+---------------------------------+ +| key | type | default | meaning | ++------------+-------+----------+---------------------------------+ +| init_flags | hex | 0 | SNP feature flags | ++-----------------------------------------------------------------+ + +Note: currently the init_flags must be zero. + +``SNP_LAUNCH_START`` is called first to create a cryptographic launch context +within the firmware. To create this context, guest owner must provide a guest +policy and other parameters as described in the SEV-SNP firmware +specification. The launch parameters should be specified as described in the +QAPI schema for the sev-snp-guest object. 
+
+The ``SNP_LAUNCH_START`` uses the following parameters (see the SEV-SNP
+specification for more details):
+
++--------+-------+----------+----------------------------------------------+
+| key    | type  | default  | meaning                                      |
++--------+-------+----------+----------------------------------------------+
+| policy | hex   | 0x30000  | a 64-bit guest policy                        |
+| imi_en | bool  | 0        | 1 when IMI is enabled                        |
+| ma_en  | bool  | 0        | 1 when the migration agent is used           |
+| gosvw  | string| 0        | 16-byte base64 encoded string for the guest  |
+|        |       |          | OS visible workaround.                       |
++--------+-------+----------+----------------------------------------------+
+
+``SNP_LAUNCH_UPDATE`` encrypts the memory region using the cryptographic context
+created via the ``SNP_LAUNCH_START`` command. If required, this command can be
+called multiple times to encrypt different memory regions. The command also
+calculates the measurement of the memory contents as it encrypts.
+
+``SNP_LAUNCH_FINISH`` finalizes the guest launch flow. Optionally, while
+finalizing the launch, the firmware can perform checks on the launch digest
+computed through ``SNP_LAUNCH_UPDATE``. To perform the check the user must
+supply the ID block, the authentication blob and the host data that should be
+included in the attestation report. See the SEV-SNP spec for further details.
+
+The ``SNP_LAUNCH_FINISH`` uses the following parameters, which can be configured
+by the corresponding parameters documented in the QAPI schema for the
+'sev-snp-guest' object.
+
++------------+-------+----------+----------------------------------------------+
+| key        | type  | default  | meaning                                      |
++------------+-------+----------+----------------------------------------------+
+| id_block   | string| none     | base64 encoded ID block                      |
++------------+-------+----------+----------------------------------------------+
+| id_auth    | string| none     | base64 encoded authentication information    |
++------------+-------+----------+----------------------------------------------+
+| auth_key_en| bool  | 0        | auth block contains author key               |
++------------+-------+----------+----------------------------------------------+
+| host_data  | string| none     | host provided data                           |
++------------+-------+----------+----------------------------------------------+
+
+To launch a SEV-SNP guest (additional parameters are documented in the QAPI
+schema for the 'sev-snp-guest' object)::
+
+    # ${QEMU} \
+      -machine ...,confidential-guest-support=sev0 \
+      -object sev-snp-guest,id=sev0,cbitpos=51,reduced-phys-bits=1
+
+
 Debugging
 ---------
 
diff --git a/docs/system/igvm.rst b/docs/system/igvm.rst
new file mode 100644
index 000000000000..bb0c43f0eea4
--- /dev/null
+++ b/docs/system/igvm.rst
@@ -0,0 +1,58 @@
+Independent Guest Virtual Machine (IGVM) support
+================================================
+
+IGVM files are designed to encapsulate all the information required to launch a
+virtual machine on any given virtualization stack in a deterministic way. This
+allows the cryptographic measurement of initial guest state for Confidential
+Guests to be calculated when the IGVM file is built, allowing a relying party to
+verify the initial state of a guest via a remote attestation.
+
+QEMU supports IGVM files through the Confidential Guest Support object. An IGVM
+filename can optionally be passed to the object which will subsequently be
+parsed and used to configure the guest state prior to launching the guest.
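+
+The directives currently processed by QEMU cover page data, VP context,
+parameter areas and parameter inserts, the memory map, the VP count,
+environment info, required memory, the SNP ID block and the guest policy
+initialization header.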
+
+Further Information on IGVM
+---------------------------
+
+Information about the IGVM format, including links to the format specification
+and documentation for the Rust and C libraries, can be found at the project
+repository:
+
+https://github.com/microsoft/igvm
+
+
+Supported Confidential Guests
+-----------------------------
+
+Currently, IGVM files can be provided for Confidential Guests on host systems
+that support AMD SEV and SEV-ES.
+
+IGVM files contain a set of directives. Not every directive is supported by
+every Confidential Guest type. For example, setting the initial CPU state is not
+supported on AMD SEV due to the platform not supporting encrypted save state
+regions. However, this is supported on SEV-ES.
+
+When an IGVM file contains directives that are not supported for the active
+platform, an error is displayed and the guest launch is aborted.
+
+Firmware Images with IGVM
+-------------------------
+
+When an IGVM filename is specified for a Confidential Guest Support object, it
+overrides the default handling of system firmware: the firmware image, such as
+an OVMF binary, should be contained as a payload of the IGVM file and not
+provided as a flash drive. The default QEMU firmware is not automatically mapped
+into guest memory.
+
+Running a Confidential Guest configured using IGVM
+--------------------------------------------------
+
+To run a confidential guest configured with IGVM, you need to add the
+``igvm-file`` parameter to the "confidential guest support" object:
+
+Example (for AMD SEV)::
+
+    qemu-system-x86_64 \
+        \
+        -machine ...,confidential-guest-support=sev0 \
+        -object sev-guest,id=sev0,cbitpos=47,reduced-phys-bits=1,igvm-file=/path/to/guest.igvm
diff --git a/docs/system/index.rst b/docs/system/index.rst
index c21065e51932..6235dfab87cf 100644
--- a/docs/system/index.rst
+++ b/docs/system/index.rst
@@ -38,4 +38,5 @@ or Hypervisor.Framework.
security multi-process confidential-guest-support + igvm vm-templating diff --git a/hw/core/machine.c b/hw/core/machine.c index 0c1739814124..b1b0a46ea52f 100644 --- a/hw/core/machine.c +++ b/hw/core/machine.c @@ -1189,6 +1189,11 @@ bool machine_mem_merge(MachineState *machine) return machine->mem_merge; } +bool machine_require_guest_memfd(MachineState *machine) +{ + return machine->require_guest_memfd; +} + static char *cpu_slot_to_string(const CPUArchId *cpu) { GString *s = g_string_new(NULL); diff --git a/hw/i386/pc.c b/hw/i386/pc.c index 29b9964733ed..d94c54840704 100644 --- a/hw/i386/pc.c +++ b/hw/i386/pc.c @@ -1037,8 +1037,8 @@ void pc_memory_init(PCMachineState *pcms, pc_system_firmware_init(pcms, rom_memory); option_rom_mr = g_malloc(sizeof(*option_rom_mr)); - memory_region_init_ram(option_rom_mr, NULL, "pc.rom", PC_ROM_SIZE, - &error_fatal); + memory_region_init_ram_guest_memfd(option_rom_mr, NULL, "pc.rom", PC_ROM_SIZE, + &error_fatal); if (pcmc->pci_enabled) { memory_region_set_readonly(option_rom_mr, true); } @@ -1646,6 +1646,21 @@ static void pc_machine_set_smbios_ep(Object *obj, Visitor *v, const char *name, visit_type_SmbiosEntryPointType(v, name, &pcms->smbios_entry_point_type, errp); } +static bool pc_machine_get_svsm_virtio_mmio(Object *obj, Error **errp) +{ + PCMachineState *pcms = PC_MACHINE(obj); + + return pcms->svsm_virtio_mmio; +} + +static void pc_machine_set_svsm_virtio_mmio(Object *obj, bool value, + Error **errp) +{ + PCMachineState *pcms = PC_MACHINE(obj); + + pcms->svsm_virtio_mmio = value; +} + static void pc_machine_get_max_ram_below_4g(Object *obj, Visitor *v, const char *name, void *opaque, Error **errp) @@ -1755,11 +1770,6 @@ static void pc_machine_initfn(Object *obj) cxl_machine_init(obj, &pcms->cxl_devices_state); } -int pc_machine_kvm_type(MachineState *machine, const char *kvm_type) -{ - return 0; -} - static void pc_machine_reset(MachineState *machine, ShutdownCause reason) { CPUState *cs; @@ -1894,6 +1904,9 @@ static void pc_machine_class_init(ObjectClass *oc, void *data) NULL, NULL); object_class_property_set_description(oc, PC_MACHINE_SMBIOS_EP, "SMBIOS Entry Point type [32, 64]"); + + object_class_property_add_bool(oc, "x-svsm-virtio-mmio", + pc_machine_get_svsm_virtio_mmio, pc_machine_set_svsm_virtio_mmio); } static const TypeInfo pc_machine_info = { diff --git a/hw/i386/pc_piix.c b/hw/i386/pc_piix.c index eace8543358a..ff0d9d226d96 100644 --- a/hw/i386/pc_piix.c +++ b/hw/i386/pc_piix.c @@ -67,6 +67,7 @@ #include "hw/i386/acpi-build.h" #include "kvm/kvm-cpu.h" #include "target/i386/cpu.h" +#include "exec/confidential-guest-support.h" #define MAX_IDE_BUS 2 #define XEN_IOAPIC_NUM_PIRQS 128ULL @@ -392,6 +393,9 @@ static void pc_init1(MachineState *machine, x86_nvdimm_acpi_dsmio, x86ms->fw_cfg, OBJECT(pcms)); } + + /* Apply confidential guest state from IGVM if supplied */ + cgs_process_igvm(machine->cgs); } typedef enum PCSouthBridgeOption { diff --git a/hw/i386/pc_q35.c b/hw/i386/pc_q35.c index 4f3e5412f6b8..c1209508080d 100644 --- a/hw/i386/pc_q35.c +++ b/hw/i386/pc_q35.c @@ -59,6 +59,7 @@ #include "hw/mem/nvdimm.h" #include "hw/i386/acpi-build.h" #include "target/i386/cpu.h" +#include "exec/confidential-guest-support.h" /* ICH9 AHCI has 6 ports */ #define MAX_SATA_PORTS 6 @@ -236,6 +237,8 @@ static void pc_q35_init(MachineState *machine) x86ms->above_4g_mem_size, NULL); object_property_set_bool(phb, PCI_HOST_BYPASS_IOMMU, pcms->default_bus_bypass_iommu, NULL); + object_property_set_bool(phb, PCI_HOST_PROP_SMM_RANGES, + 
x86_machine_is_smm_enabled(x86ms), NULL); sysbus_realize_and_unref(SYS_BUS_DEVICE(phb), &error_fatal); /* pci */ @@ -347,6 +350,16 @@ static void pc_q35_init(MachineState *machine) x86_nvdimm_acpi_dsmio, x86ms->fw_cfg, OBJECT(pcms)); } + + /* Apply confidential guest state from IGVM if supplied */ + cgs_process_igvm(machine->cgs); + + if (pcms->svsm_virtio_mmio) { + for (int dev = 0; dev < 4; dev++) { + hwaddr addr = 0xfef00000 + dev * TARGET_PAGE_SIZE; + sysbus_create_simple("virtio-mmio", addr, /* no irq */ NULL); + } + } } #define DEFINE_Q35_MACHINE(suffix, name, compatfn, optionfn) \ diff --git a/hw/i386/pc_sysfw.c b/hw/i386/pc_sysfw.c index c8d9e71b889b..bc17c7bfe638 100644 --- a/hw/i386/pc_sysfw.c +++ b/hw/i386/pc_sysfw.c @@ -54,8 +54,8 @@ static void pc_isa_bios_init(MemoryRegion *rom_memory, /* map the last 128KB of the BIOS in ISA space */ isa_bios_size = MIN(flash_size, 128 * KiB); isa_bios = g_malloc(sizeof(*isa_bios)); - memory_region_init_ram(isa_bios, NULL, "isa-bios", isa_bios_size, - &error_fatal); + memory_region_init_ram_guest_memfd(isa_bios, NULL, "isa-bios", isa_bios_size, + &error_fatal); memory_region_add_subregion_overlap(rom_memory, 0x100000 - isa_bios_size, isa_bios, @@ -151,6 +151,8 @@ static void pc_system_flash_map(PCMachineState *pcms, assert(PC_MACHINE_GET_CLASS(pcms)->pci_enabled); for (i = 0; i < ARRAY_SIZE(pcms->flash); i++) { + hwaddr gpa; + system_flash = pcms->flash[i]; blk = pflash_cfi01_get_blk(system_flash); if (!blk) { @@ -180,11 +182,11 @@ static void pc_system_flash_map(PCMachineState *pcms, } total_size += size; + gpa = 0x100000000ULL - total_size; /* where the flash is mapped */ qdev_prop_set_uint32(DEVICE(system_flash), "num-blocks", size / FLASH_SECTOR_SIZE); sysbus_realize_and_unref(SYS_BUS_DEVICE(system_flash), &error_fatal); - sysbus_mmio_map(SYS_BUS_DEVICE(system_flash), 0, - 0x100000000ULL - total_size); + sysbus_mmio_map(SYS_BUS_DEVICE(system_flash), 0, gpa); if (i == 0) { flash_mem = pflash_cfi01_get_memory(system_flash); @@ -194,7 +196,7 @@ static void pc_system_flash_map(PCMachineState *pcms, if (sev_enabled()) { flash_ptr = memory_region_get_ram_ptr(flash_mem); flash_size = memory_region_size(flash_mem); - x86_firmware_configure(flash_ptr, flash_size); + x86_firmware_configure(gpa, flash_ptr, flash_size); } } } @@ -228,8 +230,13 @@ void pc_system_firmware_init(PCMachineState *pcms, } if (!pflash_blk[0]) { - /* Machine property pflash0 not set, use ROM mode */ - x86_bios_rom_init(MACHINE(pcms), "bios.bin", rom_memory, false); + /* + * Machine property pflash0 not set, use ROM mode unless using IGVM, + * in which case the firmware must be provided by the IGVM file. + */ + if (!cgs_is_igvm(MACHINE(pcms)->cgs)) { + x86_bios_rom_init(MACHINE(pcms), "bios.bin", rom_memory, false); + } } else { if (kvm_enabled() && !kvm_readonly_mem_enabled()) { /* @@ -245,9 +252,22 @@ void pc_system_firmware_init(PCMachineState *pcms, } pc_system_flash_cleanup_unused(pcms); + + /* + * The user should not have specified any pflash devices when using IGVM + * to configure the guest. 
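+     * The firmware is instead supplied as a payload within the IGVM
+     * file itself.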
+ */ + if (cgs_is_igvm(MACHINE(pcms)->cgs)) { + for (i = 0; i < ARRAY_SIZE(pcms->flash); i++) { + if (pcms->flash[i]) { + error_report("pflash devices cannot be configured when using IGVM"); + exit(1); + } + } + } } -void x86_firmware_configure(void *ptr, int size) +void x86_firmware_configure(hwaddr gpa, void *ptr, int size) { int ret; @@ -264,6 +284,6 @@ void x86_firmware_configure(void *ptr, int size) exit(1); } - sev_encrypt_flash(ptr, size, &error_fatal); + sev_encrypt_flash(gpa, ptr, size, &error_fatal); } } diff --git a/hw/i386/pc_sysfw_ovmf.c b/hw/i386/pc_sysfw_ovmf.c index 07a4c267faae..32efa34614fb 100644 --- a/hw/i386/pc_sysfw_ovmf.c +++ b/hw/i386/pc_sysfw_ovmf.c @@ -35,6 +35,31 @@ static const int bytes_after_table_footer = 32; static bool ovmf_flash_parsed; static uint8_t *ovmf_table; static int ovmf_table_len; +static OvmfSevMetadata *ovmf_sev_metadata_table; + +#define OVMF_SEV_META_DATA_GUID "dc886566-984a-4798-A75e-5585a7bf67cc" +typedef struct __attribute__((__packed__)) OvmfSevMetadataOffset { + uint32_t offset; +} OvmfSevMetadataOffset; + +static void pc_system_parse_sev_metadata(uint8_t *flash_ptr, size_t flash_size) +{ + OvmfSevMetadata *metadata; + OvmfSevMetadataOffset *data; + + if (!pc_system_ovmf_table_find(OVMF_SEV_META_DATA_GUID, (uint8_t **)&data, + NULL)) { + return; + } + + metadata = (OvmfSevMetadata *)(flash_ptr + flash_size - data->offset); + if (memcmp(metadata->signature, "ASEV", 4) != 0) { + return; + } + + ovmf_sev_metadata_table = g_malloc(metadata->len); + memcpy(ovmf_sev_metadata_table, metadata, metadata->len); +} void pc_system_parse_ovmf_flash(uint8_t *flash_ptr, size_t flash_size) { @@ -90,6 +115,9 @@ */ memcpy(ovmf_table, ptr - tot_len, tot_len); ovmf_table += tot_len; + + /* Copy the SEV metadata table (if it exists) */ + pc_system_parse_sev_metadata(flash_ptr, flash_size); } /** @@ -159,3 +187,8 @@ bool pc_system_ovmf_table_find(const char *entry, uint8_t **data, } return false; } + +OvmfSevMetadata *pc_system_get_ovmf_sev_metadata_ptr(void) +{ + return ovmf_sev_metadata_table; +} diff --git a/hw/i386/x86.c b/hw/i386/x86.c index 2b6291ad8d5f..67fb8f1dbf34 100644 --- a/hw/i386/x86.c +++ b/hw/i386/x86.c @@ -1166,7 +1166,7 @@ void x86_bios_rom_init(MachineState *ms, const char *default_firmware, */ void *ptr = memory_region_get_ram_ptr(bios); load_image_size(filename, ptr, bios_size); - x86_firmware_configure(ptr, bios_size); + x86_firmware_configure(0x100000000ULL - bios_size, ptr, bios_size); } else { if (!isapc_ram_fw) { memory_region_set_readonly(bios, true); } @@ -1386,6 +1386,27 @@ static void machine_set_sgx_epc(Object *obj, Visitor *v, const char *name, qapi_free_SgxEPCList(list); } +static int x86_kvm_type(MachineState *ms, const char *vm_type) +{ + X86MachineState *x86ms = X86_MACHINE(ms); + int kvm_type; + + kvm_type = kvm_get_vm_type(ms, vm_type); + x86ms->vm_type = kvm_type; + + if (kvm_type > 0) { + ms->require_guest_memfd = true; + if (x86ms->smm == ON_OFF_AUTO_AUTO) { + x86ms->smm = ON_OFF_AUTO_OFF; + } else if (x86ms->smm == ON_OFF_AUTO_ON) { + error_report("VM type doesn't support SMM"); + return -EINVAL; + } + } + + return kvm_type; +} + static void x86_machine_initfn(Object *obj) { X86MachineState *x86ms = X86_MACHINE(obj); @@ -1410,6 +1431,7 @@ static void x86_machine_class_init(ObjectClass *oc, void *data) mc->cpu_index_to_instance_props = x86_cpu_index_to_props; mc->get_default_cpu_node_id = x86_get_default_cpu_node_id; mc->possible_cpu_arch_ids =
x86_possible_cpu_arch_ids; + mc->kvm_type = x86_kvm_type; x86mc->save_tsc_khz = true; x86mc->fwcfg_dma_enabled = true; nc->nmi_monitor_handler = x86_nmi; diff --git a/hw/pci-host/q35.c b/hw/pci-host/q35.c index 08534bc7cc09..8facd8b63f76 100644 --- a/hw/pci-host/q35.c +++ b/hw/pci-host/q35.c @@ -179,6 +179,8 @@ static Property q35_host_props[] = { mch.below_4g_mem_size, 0), DEFINE_PROP_SIZE(PCI_HOST_ABOVE_4G_MEM_SIZE, Q35PCIHost, mch.above_4g_mem_size, 0), + DEFINE_PROP_BOOL(PCI_HOST_PROP_SMM_RANGES, Q35PCIHost, + mch.has_smm_ranges, true), DEFINE_PROP_BOOL("x-pci-hole64-fix", Q35PCIHost, pci_hole64_fix, true), DEFINE_PROP_END_OF_LIST(), }; @@ -214,6 +216,7 @@ static void q35_host_initfn(Object *obj) /* mch's object_initialize resets the default value, set it again */ qdev_prop_set_uint64(DEVICE(s), PCI_HOST_PROP_PCI_HOLE64_SIZE, Q35_PCI_HOST_HOLE64_SIZE_DEFAULT); + object_property_add(obj, PCI_HOST_PROP_PCI_HOLE_START, "uint32", q35_host_get_pci_hole_start, NULL, NULL, NULL); @@ -476,6 +479,10 @@ static void mch_write_config(PCIDevice *d, mch_update_pciexbar(mch); } + if (!mch->has_smm_ranges) { + return; + } + if (ranges_overlap(address, len, MCH_HOST_BRIDGE_SMRAM, MCH_HOST_BRIDGE_SMRAM_SIZE)) { mch_update_smram(mch); @@ -494,10 +501,13 @@ static void mch_write_config(PCIDevice *d, static void mch_update(MCHPCIState *mch) { mch_update_pciexbar(mch); + mch_update_pam(mch); - mch_update_smram(mch); - mch_update_ext_tseg_mbytes(mch); - mch_update_smbase_smram(mch); + if (mch->has_smm_ranges) { + mch_update_smram(mch); + mch_update_ext_tseg_mbytes(mch); + mch_update_smbase_smram(mch); + } /* * pci hole goes from end-of-low-ram to io-apic. @@ -538,18 +548,20 @@ static void mch_reset(DeviceState *qdev) pci_set_quad(d->config + MCH_HOST_BRIDGE_PCIEXBAR, MCH_HOST_BRIDGE_PCIEXBAR_DEFAULT); - d->config[MCH_HOST_BRIDGE_SMRAM] = MCH_HOST_BRIDGE_SMRAM_DEFAULT; - d->config[MCH_HOST_BRIDGE_ESMRAMC] = MCH_HOST_BRIDGE_ESMRAMC_DEFAULT; - d->wmask[MCH_HOST_BRIDGE_SMRAM] = MCH_HOST_BRIDGE_SMRAM_WMASK; - d->wmask[MCH_HOST_BRIDGE_ESMRAMC] = MCH_HOST_BRIDGE_ESMRAMC_WMASK; + if (mch->has_smm_ranges) { + d->config[MCH_HOST_BRIDGE_SMRAM] = MCH_HOST_BRIDGE_SMRAM_DEFAULT; + d->config[MCH_HOST_BRIDGE_ESMRAMC] = MCH_HOST_BRIDGE_ESMRAMC_DEFAULT; + d->wmask[MCH_HOST_BRIDGE_SMRAM] = MCH_HOST_BRIDGE_SMRAM_WMASK; + d->wmask[MCH_HOST_BRIDGE_ESMRAMC] = MCH_HOST_BRIDGE_ESMRAMC_WMASK; - if (mch->ext_tseg_mbytes > 0) { - pci_set_word(d->config + MCH_HOST_BRIDGE_EXT_TSEG_MBYTES, - MCH_HOST_BRIDGE_EXT_TSEG_MBYTES_QUERY); - } + if (mch->ext_tseg_mbytes > 0) { + pci_set_word(d->config + MCH_HOST_BRIDGE_EXT_TSEG_MBYTES, + MCH_HOST_BRIDGE_EXT_TSEG_MBYTES_QUERY); + } - d->config[MCH_HOST_BRIDGE_F_SMBASE] = 0; - d->wmask[MCH_HOST_BRIDGE_F_SMBASE] = 0xff; + d->config[MCH_HOST_BRIDGE_F_SMBASE] = 0; + d->wmask[MCH_HOST_BRIDGE_F_SMBASE] = 0xff; + } mch_update(mch); } @@ -568,6 +580,20 @@ static void mch_realize(PCIDevice *d, Error **errp) /* setup pci memory mapping */ pc_pci_as_mapping_init(mch->system_memory, mch->pci_address_space); + /* PAM */ + init_pam(&mch->pam_regions[0], OBJECT(mch), mch->ram_memory, + mch->system_memory, mch->pci_address_space, + PAM_BIOS_BASE, PAM_BIOS_SIZE); + for (i = 0; i < ARRAY_SIZE(mch->pam_regions) - 1; ++i) { + init_pam(&mch->pam_regions[i + 1], OBJECT(mch), mch->ram_memory, + mch->system_memory, mch->pci_address_space, + PAM_EXPAN_BASE + i * PAM_EXPAN_SIZE, PAM_EXPAN_SIZE); + } + + if (!mch->has_smm_ranges) { + return; + } + /* if *disabled* show SMRAM to all CPUs */ 
memory_region_init_alias(&mch->smram_region, OBJECT(mch), "smram-region", mch->pci_address_space, MCH_HOST_BRIDGE_SMRAM_C_BASE, @@ -634,15 +660,6 @@ static void mch_realize(PCIDevice *d, Error **errp) object_property_add_const_link(qdev_get_machine(), "smram", OBJECT(&mch->smram)); - - init_pam(&mch->pam_regions[0], OBJECT(mch), mch->ram_memory, - mch->system_memory, mch->pci_address_space, - PAM_BIOS_BASE, PAM_BIOS_SIZE); - for (i = 0; i < ARRAY_SIZE(mch->pam_regions) - 1; ++i) { - init_pam(&mch->pam_regions[i + 1], OBJECT(mch), mch->ram_memory, - mch->system_memory, mch->pci_address_space, - PAM_EXPAN_BASE + i * PAM_EXPAN_SIZE, PAM_EXPAN_SIZE); - } } uint64_t mch_mcfg_base(void) diff --git a/include/exec/confidential-guest-support.h b/include/exec/confidential-guest-support.h index ba2dd4b5dfc4..4ad0ed9ea461 100644 --- a/include/exec/confidential-guest-support.h +++ b/include/exec/confidential-guest-support.h @@ -21,10 +21,49 @@ #ifndef CONFIG_USER_ONLY #include "qom/object.h" +#include "exec/hwaddr.h" + +#if defined(CONFIG_IGVM) +#include "igvm/igvm.h" +#endif #define TYPE_CONFIDENTIAL_GUEST_SUPPORT "confidential-guest-support" OBJECT_DECLARE_SIMPLE_TYPE(ConfidentialGuestSupport, CONFIDENTIAL_GUEST_SUPPORT) +typedef enum ConfidentialGuestPlatformType { + CGS_PLATFORM_SEV, + CGS_PLATFORM_SEV_ES, + CGS_PLATFORM_SEV_SNP, +} ConfidentialGuestPlatformType; + +typedef enum ConfidentialGuestMemoryType { + CGS_MEM_RAM, + CGS_MEM_RESERVED, + CGS_MEM_ACPI, + CGS_MEM_NVS, + CGS_MEM_UNUSABLE, +} ConfidentialGuestMemoryType; + +typedef struct ConfidentialGuestMemoryMapEntry { + uint64_t gpa; + uint64_t size; + ConfidentialGuestMemoryType type; +} ConfidentialGuestMemoryMapEntry; + +typedef enum ConfidentialGuestPageType { + CGS_PAGE_TYPE_NORMAL, + CGS_PAGE_TYPE_VMSA, + CGS_PAGE_TYPE_ZERO, + CGS_PAGE_TYPE_UNMEASURED, + CGS_PAGE_TYPE_SECRETS, + CGS_PAGE_TYPE_CPUID, + CGS_PAGE_TYPE_REQUIRED_MEMORY, +} ConfidentialGuestPageType; + +typedef enum ConfidentialGuestPolicyType { + GUEST_POLICY_SEV, +} ConfidentialGuestPolicyType; + struct ConfidentialGuestSupport { Object parent; @@ -51,12 +90,96 @@ struct ConfidentialGuestSupport { * so 'ready' is not set, we'll abort. */ bool ready; + +#if defined(CONFIG_IGVM) + /* + * igvm_filename: Optional filename that specifies a file that contains + * the configuration of the guest in Independent Guest + * Virtual Machine (IGVM) format. + */ + char *igvm_filename; + IgvmHandle igvm; +#endif + + /* + * The following virtual methods need to be implemented by systems that + * support confidential guests that can be configured with IGVM and are + * used during processing of the IGVM file with cgs_process_igvm(). + */ + + /* + * Check to see whether this confidential guest supports a particular + * platform or configuration. + */ + int (*check_support)(ConfidentialGuestPlatformType platform, + uint16_t platform_version, uint8_t highest_vtl, + uint64_t shared_gpa_boundary); + + /* + * Configure part of the state of a guest for a particular set of data, page + * type and gpa. This can be used for example to pre-populate and measure + * guest memory contents, define private ranges or set the initial CPU state + * for one or more CPUs. + * + * If memory_type is CGS_PAGE_TYPE_VMSA then ptr points to the initial CPU + * context for a virtual CPU. The format of the data depends on the type of + * confidential virtual machine.
For example, for SEV-ES ptr will point to a + * vmcb_save_area structure that should be copied into guest memory at the + * address specified in gpa. The cpu_index parameter contains the index of + * the CPU the VMSA applies to. + */ + int (*set_guest_state)(hwaddr gpa, uint8_t *ptr, uint64_t len, + ConfidentialGuestPageType memory_type, + uint16_t cpu_index, Error **errp); + + /* + * Set the guest policy. The policy can be used to configure the + * confidential platform, such as whether debug is enabled, and can contain + * information about expected launch measurements, signed verification of + * guest configuration and other platform data. + * + * The format of the policy data is specific to each platform. For example, + * SEV-SNP uses a policy bitfield in the 'policy' argument and provides an + * ID block and ID authentication in the 'policy_data' parameters. The type + * of policy data is identified by the 'policy_type' argument. + */ + int (*set_guest_policy)(ConfidentialGuestPolicyType policy_type, + uint64_t policy, + void *policy_data1, uint32_t policy_data1_size, + void *policy_data2, uint32_t policy_data2_size, + Error **errp); + + /* + * Iterate the system memory map, getting the entry with the given index + * that can be populated into guest memory. + * + * Returns 0 on success, 1 if the index is out of range and -1 on error. + */ + int (*get_mem_map_entry)(int index, ConfidentialGuestMemoryMapEntry *entry, + Error **errp); }; typedef struct ConfidentialGuestSupportClass { ObjectClass parent; } ConfidentialGuestSupportClass; +/* + * Check whether the configuration of the confidential guest is provided + * using an IGVM file. IGVM configuration can include the system firmware, + * initial CPU state and other configuration that should override standard + * system initialization. This function should be used by platforms to + * determine which devices and configuration to include during system + * initialization. + */ +bool cgs_is_igvm(ConfidentialGuestSupport *cgs); +/* + * If IGVM is supported and an IGVM file has been specified, then the + * configuration described in the file is applied to the guest. + * Configuration of a confidential guest includes the layout of the + * guest memory, including firmware and initial CPU state. + */ +void cgs_process_igvm(ConfidentialGuestSupport *cgs); + #endif /* !CONFIG_USER_ONLY */ #endif /* QEMU_CONFIDENTIAL_GUEST_SUPPORT_H */ diff --git a/include/exec/cpu-common.h b/include/exec/cpu-common.h index 41115d891940..de728a18eef2 100644 --- a/include/exec/cpu-common.h +++ b/include/exec/cpu-common.h @@ -175,6 +175,8 @@ typedef int (RAMBlockIterFunc)(RAMBlock *rb, void *opaque); int qemu_ram_foreach_block(RAMBlockIterFunc func, void *opaque); int ram_block_discard_range(RAMBlock *rb, uint64_t start, size_t length); +int ram_block_convert_range(RAMBlock *rb, uint64_t start, size_t length, + bool shared_to_private); #endif diff --git a/include/exec/igvm.h b/include/exec/igvm.h new file mode 100644 index 000000000000..148d5fe5b52d --- /dev/null +++ b/include/exec/igvm.h @@ -0,0 +1,37 @@ +/* + * QEMU IGVM configuration backend for Confidential Guests + * + * Copyright (C) 2023-2024 SUSE + * + * Authors: + * Roy Hopkins + * + * This work is licensed under the terms of the GNU GPL, version 2 or later. + * See the COPYING file in the top-level directory.
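To make the callback contract above concrete, here is a sketch of how a consumer such as the IGVM processor might walk the memory map; this is an illustration under assumptions (the function name is invented), not code from this series:

static int example_walk_mem_map(ConfidentialGuestSupport *cgs, Error **errp)
{
    ConfidentialGuestMemoryMapEntry entry;
    int index = 0;
    int ret;

    /* get_mem_map_entry() returns 0 for a valid entry, 1 once the index
     * runs past the end of the map, and -1 on error. */
    while ((ret = cgs->get_mem_map_entry(index++, &entry, errp)) == 0) {
        if (entry.type == CGS_MEM_RAM) {
            /* entry.gpa and entry.size bound a region that may be
             * populated into guest memory. */
        }
    }
    return ret == 1 ? 0 : ret;
}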
+ */ + +#ifndef EXEC_IGVM_H +#define EXEC_IGVM_H + +#include "exec/confidential-guest-support.h" + +#if defined(CONFIG_IGVM) + +int igvm_file_init(ConfidentialGuestSupport *cgs, Error **errp); +int igvm_process(ConfidentialGuestSupport *cgs, Error **errp); + +#else + +static inline int igvm_file_init(ConfidentialGuestSupport *cgs, Error **errp) +{ + return 0; +} + +static inline int igvm_process(ConfidentialGuestSupport *cgs, Error **errp) +{ + return 0; +} + +#endif + +#endif diff --git a/include/exec/memory.h b/include/exec/memory.h index 831f7c996d9d..42d30898f697 100644 --- a/include/exec/memory.h +++ b/include/exec/memory.h @@ -243,6 +243,12 @@ typedef struct IOMMUTLBEvent { /* RAM FD is opened read-only */ #define RAM_READONLY_FD (1 << 11) +/* RAM can be private, backed by a KVM guest_memfd */ +#define RAM_GUEST_MEMFD (1 << 12) + +/* RAM is private by default */ +#define RAM_DEFAULT_PRIVATE (1 << 13) + static inline void iommu_notifier_init(IOMMUNotifier *n, IOMMUNotify fn, IOMMUNotifierFlag flags, hwaddr start, hwaddr end, @@ -1583,6 +1589,12 @@ void memory_region_init_ram(MemoryRegion *mr, uint64_t size, Error **errp); +void memory_region_init_ram_guest_memfd(MemoryRegion *mr, + Object *owner, + const char *name, + uint64_t size, + Error **errp); + /** * memory_region_init_rom: Initialize a ROM memory region. * @@ -1643,6 +1655,13 @@ void memory_region_init_rom_device(MemoryRegion *mr, uint64_t size, Error **errp); +void memory_region_init_rom_device_private(MemoryRegion *mr, + Object *owner, + const MemoryRegionOps *ops, + void *opaque, + const char *name, + uint64_t size, + Error **errp); /** * memory_region_owner: get a memory region's owner. * @@ -1702,6 +1721,19 @@ static inline bool memory_region_is_romd(MemoryRegion *mr) */ bool memory_region_is_protected(MemoryRegion *mr); +/** + * memory_region_has_guest_memfd: check whether a memory region has guest_memfd + * associated + * + * Returns %true if a memory region's ram_block has a valid guest_memfd assigned.
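As a usage sketch of the allocator and query helper declared here (the region name and size are arbitrary examples, not taken from the patch):

static void example_init_private_ram(void)
{
    MemoryRegion *mr = g_new0(MemoryRegion, 1);

    /* Allocates RAM with an associated KVM guest_memfd, i.e. with
     * RAM_GUEST_MEMFD set, mirroring the "pc.rom" and "isa-bios"
     * conversions earlier in this series. */
    memory_region_init_ram_guest_memfd(mr, NULL, "example.ram", 0x20000,
                                       &error_fatal);
    assert(memory_region_has_guest_memfd(mr));
}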
+ * + * @mr: the memory region being queried + */ +bool memory_region_has_guest_memfd(MemoryRegion *mr); + +void memory_region_set_default_private(MemoryRegion *mr); +bool memory_region_is_default_private(MemoryRegion *mr); + /** * memory_region_get_iommu: check whether a memory region is an iommu * diff --git a/include/exec/ramblock.h b/include/exec/ramblock.h index 69c6a5390293..0a17ba882729 100644 --- a/include/exec/ramblock.h +++ b/include/exec/ramblock.h @@ -41,6 +41,7 @@ struct RAMBlock { QLIST_HEAD(, RAMBlockNotifier) ramblock_notifiers; int fd; uint64_t fd_offset; + int guest_memfd; size_t page_size; /* dirty bitmap used during migration */ unsigned long *bmap; diff --git a/include/hw/boards.h b/include/hw/boards.h index da85f86efb91..2d411a12a96d 100644 --- a/include/hw/boards.h +++ b/include/hw/boards.h @@ -30,6 +30,7 @@ bool machine_usb(MachineState *machine); int machine_phandle_start(MachineState *machine); bool machine_dump_guest_core(MachineState *machine); bool machine_mem_merge(MachineState *machine); +bool machine_require_guest_memfd(MachineState *machine); HotpluggableCPUList *machine_query_hotpluggable_cpus(MachineState *machine); void machine_set_cpu_numa_node(MachineState *machine, const CpuInstanceProperties *props, @@ -364,6 +365,7 @@ struct MachineState { char *dt_compatible; bool dump_guest_core; bool mem_merge; + bool require_guest_memfd; bool usb; bool usb_disabled; char *firmware; diff --git a/include/hw/i386/pc.h b/include/hw/i386/pc.h index a10ceeabbfac..beb3c1965f63 100644 --- a/include/hw/i386/pc.h +++ b/include/hw/i386/pc.h @@ -51,6 +51,7 @@ typedef struct PCMachineState { bool hpet_enabled; bool i8042_enabled; bool default_bus_bypass_iommu; + bool svsm_virtio_mmio; uint64_t max_fw_size; /* ACPI Memory hotplug IO base address */ @@ -165,7 +166,36 @@ void pc_guest_info_init(PCMachineState *pcms); #define PCI_HOST_PROP_PCI_HOLE64_SIZE "pci-hole64-size" #define PCI_HOST_BELOW_4G_MEM_SIZE "below-4g-mem-size" #define PCI_HOST_ABOVE_4G_MEM_SIZE "above-4g-mem-size" - +#define PCI_HOST_PROP_SMM_RANGES "smm-ranges" + +typedef enum { + SEV_DESC_TYPE_UNDEF, + /* The section contains the region that must be validated by the VMM. 
*/ + SEV_DESC_TYPE_SNP_SEC_MEM, + /* The section contains the SNP secrets page */ + SEV_DESC_TYPE_SNP_SECRETS, + /* The section contains address that can be used as a CPUID page */ + SEV_DESC_TYPE_CPUID, + /* The section contains the region for kernel hashes for measured direct boot */ + SEV_DESC_TYPE_SNP_KERNEL_HASHES = 0x10, + +} ovmf_sev_metadata_desc_type; + +typedef struct __attribute__((__packed__)) OvmfSevMetadataDesc { + uint32_t base; + uint32_t len; + ovmf_sev_metadata_desc_type type; +} OvmfSevMetadataDesc; + +typedef struct __attribute__((__packed__)) OvmfSevMetadata { + uint8_t signature[4]; + uint32_t len; + uint32_t version; + uint32_t num_desc; + OvmfSevMetadataDesc descs[]; +} OvmfSevMetadata; + +OvmfSevMetadata *pc_system_get_ovmf_sev_metadata_ptr(void); void pc_pci_as_mapping_init(MemoryRegion *system_memory, MemoryRegion *pci_address_space); @@ -309,15 +339,12 @@ extern const size_t pc_compat_1_5_len; extern GlobalProperty pc_compat_1_4[]; extern const size_t pc_compat_1_4_len; -int pc_machine_kvm_type(MachineState *machine, const char *vm_type); - #define DEFINE_PC_MACHINE(suffix, namestr, initfn, optsfn) \ static void pc_machine_##suffix##_class_init(ObjectClass *oc, void *data) \ { \ MachineClass *mc = MACHINE_CLASS(oc); \ optsfn(mc); \ mc->init = initfn; \ - mc->kvm_type = pc_machine_kvm_type; \ } \ static const TypeInfo pc_machine_type_##suffix = { \ .name = namestr TYPE_MACHINE_SUFFIX, \ diff --git a/include/hw/i386/x86.h b/include/hw/i386/x86.h index da19ae15463a..9d6d4da01f8a 100644 --- a/include/hw/i386/x86.h +++ b/include/hw/i386/x86.h @@ -41,6 +41,7 @@ struct X86MachineState { MachineState parent; /*< public >*/ + unsigned int vm_type; /* Pointers to devices and objects: */ ISADevice *rtc; @@ -142,6 +143,6 @@ void ioapic_init_gsi(GSIState *gsi_state, const char *parent_name); DeviceState *ioapic_init_secondary(GSIState *gsi_state); /* pc_sysfw.c */ -void x86_firmware_configure(void *ptr, int size); +void x86_firmware_configure(hwaddr gpa, void *ptr, int size); #endif diff --git a/include/hw/pci-host/q35.h b/include/hw/pci-host/q35.h index bafcbe675214..22fadfa3ed76 100644 --- a/include/hw/pci-host/q35.h +++ b/include/hw/pci-host/q35.h @@ -50,6 +50,7 @@ struct MCHPCIState { MemoryRegion tseg_blackhole, tseg_window; MemoryRegion smbase_blackhole, smbase_window; bool has_smram_at_smbase; + bool has_smm_ranges; Range pci_hole; uint64_t below_4g_mem_size; uint64_t above_4g_mem_size; diff --git a/include/standard-headers/drm/drm_fourcc.h b/include/standard-headers/drm/drm_fourcc.h index 72279f4d25d4..b72917073d8d 100644 --- a/include/standard-headers/drm/drm_fourcc.h +++ b/include/standard-headers/drm/drm_fourcc.h @@ -53,7 +53,7 @@ extern "C" { * Format modifiers may change any property of the buffer, including the number * of planes and/or the required allocation size. Format modifiers are * vendor-namespaced, and as such the relationship between a fourcc code and a - * modifier is specific to the modifer being used. For example, some modifiers + * modifier is specific to the modifier being used. For example, some modifiers * may preserve meaning - such as number of planes - from the fourcc code, * whereas others may not. * @@ -78,7 +78,7 @@ extern "C" { * format. * - Higher-level programs interfacing with KMS/GBM/EGL/Vulkan/etc: these users * see modifiers as opaque tokens they can check for equality and intersect. - * These users musn't need to know to reason about the modifier value + * These users mustn't need to know to reason about the modifier value * (i.e. 
they are not expected to extract information out of the modifier). * * Vendors should document their modifier usage in as much detail as @@ -322,6 +322,8 @@ extern "C" { * index 1 = Cr:Cb plane, [39:0] Cr1:Cb1:Cr0:Cb0 little endian */ #define DRM_FORMAT_NV15 fourcc_code('N', 'V', '1', '5') /* 2x2 subsampled Cr:Cb plane */ +#define DRM_FORMAT_NV20 fourcc_code('N', 'V', '2', '0') /* 2x1 subsampled Cr:Cb plane */ +#define DRM_FORMAT_NV30 fourcc_code('N', 'V', '3', '0') /* non-subsampled Cr:Cb plane */ /* * 2 plane YCbCr MSB aligned @@ -537,7 +539,7 @@ extern "C" { * This is a tiled layout using 4Kb tiles in row-major layout. * Within the tile pixels are laid out in 16 256 byte units / sub-tiles which * are arranged in four groups (two wide, two high) with column-major layout. - * Each group therefore consits out of four 256 byte units, which are also laid + * Each group therefore consists out of four 256 byte units, which are also laid * out as 2x2 column-major. * 256 byte units are made out of four 64 byte blocks of pixels, producing * either a square block or a 2:1 unit. @@ -1100,7 +1102,7 @@ drm_fourcc_canonicalize_nvidia_format_mod(uint64_t modifier) */ /* - * The top 4 bits (out of the 56 bits alloted for specifying vendor specific + * The top 4 bits (out of the 56 bits allotted for specifying vendor specific * modifiers) denote the category for modifiers. Currently we have three * categories of modifiers ie AFBC, MISC and AFRC. We can have a maximum of * sixteen different categories. @@ -1416,7 +1418,7 @@ drm_fourcc_canonicalize_nvidia_format_mod(uint64_t modifier) * Amlogic FBC Memory Saving mode * * Indicates the storage is packed when pixel size is multiple of word - * boudaries, i.e. 8bit should be stored in this mode to save allocation + * boundaries, i.e. 8bit should be stored in this mode to save allocation * memory. * * This mode reduces body layout to 3072 bytes per 64x32 superblock with diff --git a/include/standard-headers/linux/ethtool.h b/include/standard-headers/linux/ethtool.h index 99fcddf04f88..2cd1feee0847 100644 --- a/include/standard-headers/linux/ethtool.h +++ b/include/standard-headers/linux/ethtool.h @@ -1266,6 +1266,8 @@ struct ethtool_rxfh_indir { * hardware hash key. * @hfunc: Defines the current RSS hash function used by HW (or to be set to). * Valid values are one of the %ETH_RSS_HASH_*. + * @input_xfrm: Defines how the input data is transformed. Valid values are one + * of %RXH_XFRM_*. * @rsvd8: Reserved for future use; see the note on reserved space. * @rsvd32: Reserved for future use; see the note on reserved space. * @rss_config: RX ring/queue index for each hash value i.e., indirection table @@ -1285,7 +1287,8 @@ struct ethtool_rxfh { uint32_t indir_size; uint32_t key_size; uint8_t hfunc; - uint8_t rsvd8[3]; + uint8_t input_xfrm; + uint8_t rsvd8[2]; uint32_t rsvd32; uint32_t rss_config[]; }; @@ -1992,6 +1995,14 @@ static inline int ethtool_validate_duplex(uint8_t duplex) #define WOL_MODE_COUNT 8 +/* RSS hash function data + * XOR the corresponding source and destination fields of each specified + * protocol. Both copies of the XOR'ed fields are fed into the RSS and RXHASH + * calculation. Note that this XORing reduces the input set entropy and could + * be exploited to reduce the RSS queue spread. 
+ */ +#define RXH_XFRM_SYM_XOR (1 << 0) + /* L2-L4 network traffic flow types */ #define TCP_V4_FLOW 0x01 /* hash or spec (tcp_ip4_spec) */ #define UDP_V4_FLOW 0x02 /* hash or spec (udp_ip4_spec) */ diff --git a/include/standard-headers/linux/fuse.h b/include/standard-headers/linux/fuse.h index 6b9793842c98..fc0dcd10aede 100644 --- a/include/standard-headers/linux/fuse.h +++ b/include/standard-headers/linux/fuse.h @@ -209,7 +209,7 @@ * - add FUSE_HAS_EXPIRE_ONLY * * 7.39 - * - add FUSE_DIRECT_IO_RELAX + * - add FUSE_DIRECT_IO_ALLOW_MMAP * - add FUSE_STATX and related structures */ @@ -405,8 +405,7 @@ struct fuse_file_lock { * FUSE_CREATE_SUPP_GROUP: add supplementary group info to create, mkdir, * symlink and mknod (single group that matches parent) * FUSE_HAS_EXPIRE_ONLY: kernel supports expiry-only entry invalidation - * FUSE_DIRECT_IO_RELAX: relax restrictions in FOPEN_DIRECT_IO mode, for now - * allow shared mmap + * FUSE_DIRECT_IO_ALLOW_MMAP: allow shared mmap in FOPEN_DIRECT_IO mode. */ #define FUSE_ASYNC_READ (1 << 0) #define FUSE_POSIX_LOCKS (1 << 1) @@ -445,7 +444,10 @@ struct fuse_file_lock { #define FUSE_HAS_INODE_DAX (1ULL << 33) #define FUSE_CREATE_SUPP_GROUP (1ULL << 34) #define FUSE_HAS_EXPIRE_ONLY (1ULL << 35) -#define FUSE_DIRECT_IO_RELAX (1ULL << 36) +#define FUSE_DIRECT_IO_ALLOW_MMAP (1ULL << 36) + +/* Obsolete alias for FUSE_DIRECT_IO_ALLOW_MMAP */ +#define FUSE_DIRECT_IO_RELAX FUSE_DIRECT_IO_ALLOW_MMAP /** * CUSE INIT request/reply flags diff --git a/include/standard-headers/linux/pci_regs.h b/include/standard-headers/linux/pci_regs.h index e5f558d96493..a39193213ff2 100644 --- a/include/standard-headers/linux/pci_regs.h +++ b/include/standard-headers/linux/pci_regs.h @@ -80,6 +80,7 @@ #define PCI_HEADER_TYPE_NORMAL 0 #define PCI_HEADER_TYPE_BRIDGE 1 #define PCI_HEADER_TYPE_CARDBUS 2 +#define PCI_HEADER_TYPE_MFD 0x80 /* Multi-Function Device (possible) */ #define PCI_BIST 0x0f /* 8 bits */ #define PCI_BIST_CODE_MASK 0x0f /* Return result */ @@ -637,6 +638,7 @@ #define PCI_EXP_RTCAP 0x1e /* Root Capabilities */ #define PCI_EXP_RTCAP_CRSVIS 0x0001 /* CRS Software Visibility capability */ #define PCI_EXP_RTSTA 0x20 /* Root Status */ +#define PCI_EXP_RTSTA_PME_RQ_ID 0x0000ffff /* PME Requester ID */ #define PCI_EXP_RTSTA_PME 0x00010000 /* PME status */ #define PCI_EXP_RTSTA_PENDING 0x00020000 /* PME pending */ /* @@ -930,12 +932,13 @@ /* Process Address Space ID */ #define PCI_PASID_CAP 0x04 /* PASID feature register */ -#define PCI_PASID_CAP_EXEC 0x02 /* Exec permissions Supported */ -#define PCI_PASID_CAP_PRIV 0x04 /* Privilege Mode Supported */ +#define PCI_PASID_CAP_EXEC 0x0002 /* Exec permissions Supported */ +#define PCI_PASID_CAP_PRIV 0x0004 /* Privilege Mode Supported */ +#define PCI_PASID_CAP_WIDTH 0x1f00 #define PCI_PASID_CTRL 0x06 /* PASID control register */ -#define PCI_PASID_CTRL_ENABLE 0x01 /* Enable bit */ -#define PCI_PASID_CTRL_EXEC 0x02 /* Exec permissions Enable */ -#define PCI_PASID_CTRL_PRIV 0x04 /* Privilege Mode Enable */ +#define PCI_PASID_CTRL_ENABLE 0x0001 /* Enable bit */ +#define PCI_PASID_CTRL_EXEC 0x0002 /* Exec permissions Enable */ +#define PCI_PASID_CTRL_PRIV 0x0004 /* Privilege Mode Enable */ #define PCI_EXT_CAP_PASID_SIZEOF 8 /* Single Root I/O Virtualization */ @@ -975,6 +978,8 @@ #define PCI_LTR_VALUE_MASK 0x000003ff #define PCI_LTR_SCALE_MASK 0x00001c00 #define PCI_LTR_SCALE_SHIFT 10 +#define PCI_LTR_NOSNOOP_VALUE 0x03ff0000 /* Max No-Snoop Latency Value */ +#define PCI_LTR_NOSNOOP_SCALE 0x1c000000 /* Scale for Max Value */ #define 
PCI_EXT_CAP_LTR_SIZEOF 8 /* Access Control Service */ @@ -1042,9 +1047,16 @@ #define PCI_EXP_DPC_STATUS 0x08 /* DPC Status */ #define PCI_EXP_DPC_STATUS_TRIGGER 0x0001 /* Trigger Status */ #define PCI_EXP_DPC_STATUS_TRIGGER_RSN 0x0006 /* Trigger Reason */ +#define PCI_EXP_DPC_STATUS_TRIGGER_RSN_UNCOR 0x0000 /* Uncorrectable error */ +#define PCI_EXP_DPC_STATUS_TRIGGER_RSN_NFE 0x0002 /* Rcvd ERR_NONFATAL */ +#define PCI_EXP_DPC_STATUS_TRIGGER_RSN_FE 0x0004 /* Rcvd ERR_FATAL */ +#define PCI_EXP_DPC_STATUS_TRIGGER_RSN_IN_EXT 0x0006 /* Reason in Trig Reason Extension field */ #define PCI_EXP_DPC_STATUS_INTERRUPT 0x0008 /* Interrupt Status */ #define PCI_EXP_DPC_RP_BUSY 0x0010 /* Root Port Busy */ #define PCI_EXP_DPC_STATUS_TRIGGER_RSN_EXT 0x0060 /* Trig Reason Extension */ +#define PCI_EXP_DPC_STATUS_TRIGGER_RSN_RP_PIO 0x0000 /* RP PIO error */ +#define PCI_EXP_DPC_STATUS_TRIGGER_RSN_SW_TRIGGER 0x0020 /* DPC SW Trigger bit */ +#define PCI_EXP_DPC_RP_PIO_FEP 0x1f00 /* RP PIO First Err Ptr */ #define PCI_EXP_DPC_SOURCE_ID 0x0A /* DPC Source Identifier */ @@ -1088,6 +1100,8 @@ #define PCI_L1SS_CTL1_LTR_L12_TH_VALUE 0x03ff0000 /* LTR_L1.2_THRESHOLD_Value */ #define PCI_L1SS_CTL1_LTR_L12_TH_SCALE 0xe0000000 /* LTR_L1.2_THRESHOLD_Scale */ #define PCI_L1SS_CTL2 0x0c /* Control 2 Register */ +#define PCI_L1SS_CTL2_T_PWR_ON_SCALE 0x00000003 /* T_POWER_ON Scale */ +#define PCI_L1SS_CTL2_T_PWR_ON_VALUE 0x000000f8 /* T_POWER_ON Value */ /* Designated Vendor-Specific (DVSEC, PCI_EXT_CAP_ID_DVSEC) */ #define PCI_DVSEC_HEADER1 0x4 /* Designated Vendor-Specific Header1 */ diff --git a/include/standard-headers/linux/vhost_types.h b/include/standard-headers/linux/vhost_types.h index 5ad07e134aed..fd54044936fc 100644 --- a/include/standard-headers/linux/vhost_types.h +++ b/include/standard-headers/linux/vhost_types.h @@ -185,5 +185,12 @@ struct vhost_vdpa_iova_range { * DRIVER_OK */ #define VHOST_BACKEND_F_ENABLE_AFTER_DRIVER_OK 0x6 +/* Device may expose the virtqueue's descriptor area, driver area and + * device area to a different group for ASID binding than where its + * buffers may reside. Requires VHOST_BACKEND_F_IOTLB_ASID. + */ +#define VHOST_BACKEND_F_DESC_ASID 0x7 +/* IOTLB don't flush memory mapping across device reset */ +#define VHOST_BACKEND_F_IOTLB_PERSIST 0x8 #endif diff --git a/include/standard-headers/linux/virtio_config.h b/include/standard-headers/linux/virtio_config.h index 8a7d0dc8b007..45be0fa1bcdb 100644 --- a/include/standard-headers/linux/virtio_config.h +++ b/include/standard-headers/linux/virtio_config.h @@ -52,7 +52,7 @@ * rest are per-device feature bits. */ #define VIRTIO_TRANSPORT_F_START 28 -#define VIRTIO_TRANSPORT_F_END 41 +#define VIRTIO_TRANSPORT_F_END 42 #ifndef VIRTIO_CONFIG_NO_LEGACY /* Do we get callbacks when the ring is completely used, even if we've @@ -103,8 +103,19 @@ */ #define VIRTIO_F_NOTIFICATION_DATA 38 +/* This feature indicates that the driver uses the data provided by the device + * as a virtqueue identifier in available buffer notifications. + */ +#define VIRTIO_F_NOTIF_CONFIG_DATA 39 + /* * This feature indicates that the driver can reset a queue individually. */ #define VIRTIO_F_RING_RESET 40 + +/* + * This feature indicates that the device support administration virtqueues. 
+ */ +#define VIRTIO_F_ADMIN_VQ 41 + #endif /* _LINUX_VIRTIO_CONFIG_H */ diff --git a/include/standard-headers/linux/virtio_pci.h b/include/standard-headers/linux/virtio_pci.h index be912cfc957c..3e2bc2c97e6e 100644 --- a/include/standard-headers/linux/virtio_pci.h +++ b/include/standard-headers/linux/virtio_pci.h @@ -166,6 +166,20 @@ struct virtio_pci_common_cfg { uint32_t queue_used_hi; /* read-write */ }; +/* + * Warning: do not use sizeof on this: use offsetofend for + * specific fields you need. + */ +struct virtio_pci_modern_common_cfg { + struct virtio_pci_common_cfg cfg; + + uint16_t queue_notify_data; /* read-write */ + uint16_t queue_reset; /* read-write */ + + uint16_t admin_queue_index; /* read-only */ + uint16_t admin_queue_num; /* read-only */ +}; + /* Fields in VIRTIO_PCI_CAP_PCI_CFG: */ struct virtio_pci_cfg_cap { struct virtio_pci_cap cap; @@ -204,7 +218,72 @@ struct virtio_pci_cfg_cap { #define VIRTIO_PCI_COMMON_Q_USEDHI 52 #define VIRTIO_PCI_COMMON_Q_NDATA 56 #define VIRTIO_PCI_COMMON_Q_RESET 58 +#define VIRTIO_PCI_COMMON_ADM_Q_IDX 60 +#define VIRTIO_PCI_COMMON_ADM_Q_NUM 62 #endif /* VIRTIO_PCI_NO_MODERN */ +/* Admin command status. */ +#define VIRTIO_ADMIN_STATUS_OK 0 + +/* Admin command opcode. */ +#define VIRTIO_ADMIN_CMD_LIST_QUERY 0x0 +#define VIRTIO_ADMIN_CMD_LIST_USE 0x1 + +/* Admin command group type. */ +#define VIRTIO_ADMIN_GROUP_TYPE_SRIOV 0x1 + +/* Transitional device admin command. */ +#define VIRTIO_ADMIN_CMD_LEGACY_COMMON_CFG_WRITE 0x2 +#define VIRTIO_ADMIN_CMD_LEGACY_COMMON_CFG_READ 0x3 +#define VIRTIO_ADMIN_CMD_LEGACY_DEV_CFG_WRITE 0x4 +#define VIRTIO_ADMIN_CMD_LEGACY_DEV_CFG_READ 0x5 +#define VIRTIO_ADMIN_CMD_LEGACY_NOTIFY_INFO 0x6 + +struct QEMU_PACKED virtio_admin_cmd_hdr { + uint16_t opcode; + /* + * 1 - SR-IOV + * 2-65535 - reserved + */ + uint16_t group_type; + /* Unused, reserved for future extensions. */ + uint8_t reserved1[12]; + uint64_t group_member_id; +}; + +struct QEMU_PACKED virtio_admin_cmd_status { + uint16_t status; + uint16_t status_qualifier; + /* Unused, reserved for future extensions. */ + uint8_t reserved2[4]; +}; + +struct QEMU_PACKED virtio_admin_cmd_legacy_wr_data { + uint8_t offset; /* Starting offset of the register(s) to write. */ + uint8_t reserved[7]; + uint8_t registers[]; +}; + +struct QEMU_PACKED virtio_admin_cmd_legacy_rd_data { + uint8_t offset; /* Starting offset of the register(s) to read. */ +}; + +#define VIRTIO_ADMIN_CMD_NOTIFY_INFO_FLAGS_END 0 +#define VIRTIO_ADMIN_CMD_NOTIFY_INFO_FLAGS_OWNER_DEV 0x1 +#define VIRTIO_ADMIN_CMD_NOTIFY_INFO_FLAGS_OWNER_MEM 0x2 + +#define VIRTIO_ADMIN_CMD_MAX_NOTIFY_INFO 4 + +struct QEMU_PACKED virtio_admin_cmd_notify_info_data { + uint8_t flags; /* 0 = end of list, 1 = owner device, 2 = member device */ + uint8_t bar; /* BAR of the member or the owner device */ + uint8_t padding[6]; + uint64_t offset; /* Offset within bar. 
*/ +}; + +struct virtio_admin_cmd_notify_info_result { + struct virtio_admin_cmd_notify_info_data entries[VIRTIO_ADMIN_CMD_MAX_NOTIFY_INFO]; +}; + #endif diff --git a/include/sysemu/hostmem.h b/include/sysemu/hostmem.h index 39326f1d4f9c..92f7fd469639 100644 --- a/include/sysemu/hostmem.h +++ b/include/sysemu/hostmem.h @@ -66,6 +66,7 @@ struct HostMemoryBackend { uint64_t size; bool merge, dump, use_canonical_path; bool prealloc, is_mapped, share, reserve; + bool require_guest_memfd; uint32_t prealloc_threads; ThreadContext *prealloc_context; DECLARE_BITMAP(host_nodes, MAX_NODES + 1); diff --git a/include/sysemu/kvm.h b/include/sysemu/kvm.h index d61487816421..c5d2831b2d6a 100644 --- a/include/sysemu/kvm.h +++ b/include/sysemu/kvm.h @@ -538,4 +538,12 @@ bool kvm_arch_cpu_check_are_resettable(void); bool kvm_dirty_ring_enabled(void); uint32_t kvm_dirty_ring_size(void); + +int kvm_create_guest_memfd(uint64_t size, uint64_t flags, Error **errp); + +int kvm_set_memory_attributes_private(hwaddr start, hwaddr size); +int kvm_set_memory_attributes_shared(hwaddr start, hwaddr size); + +int kvm_convert_memory(hwaddr start, hwaddr size, bool to_private); +bool kvm_has_restricted_memory(void); #endif diff --git a/include/sysemu/kvm_int.h b/include/sysemu/kvm_int.h index fd846394be10..58b7aa3fc786 100644 --- a/include/sysemu/kvm_int.h +++ b/include/sysemu/kvm_int.h @@ -30,6 +30,8 @@ typedef struct KVMSlot int as_id; /* Cache of the offset in ram address space */ ram_addr_t ram_start_offset; + int guest_memfd; + hwaddr guest_memfd_offset; } KVMSlot; typedef struct KVMMemoryUpdate { diff --git a/linux-headers/asm-arm64/kvm.h b/linux-headers/asm-arm64/kvm.h index 38e5957526c2..c59ea55cd8eb 100644 --- a/linux-headers/asm-arm64/kvm.h +++ b/linux-headers/asm-arm64/kvm.h @@ -491,6 +491,38 @@ struct kvm_smccc_filter { #define KVM_HYPERCALL_EXIT_SMC (1U << 0) #define KVM_HYPERCALL_EXIT_16BIT (1U << 1) +/* + * Get feature ID registers userspace writable mask. + * + * From DDI0487J.a, D19.2.66 ("ID_AA64MMFR2_EL1, AArch64 Memory Model + * Feature Register 2"): + * + * "The Feature ID space is defined as the System register space in + * AArch64 with op0==3, op1=={0, 1, 3}, CRn==0, CRm=={0-7}, + * op2=={0-7}." + * + * This covers all currently known R/O registers that indicate + * anything useful feature wise, including the ID registers. + * + * If we ever need to introduce a new range, it will be described as + * such in the range field. 
+ */ +#define KVM_ARM_FEATURE_ID_RANGE_IDX(op0, op1, crn, crm, op2) \ + ({ \ + __u64 __op1 = (op1) & 3; \ + __op1 -= (__op1 == 3); \ + (__op1 << 6 | ((crm) & 7) << 3 | (op2)); \ + }) + +#define KVM_ARM_FEATURE_ID_RANGE 0 +#define KVM_ARM_FEATURE_ID_RANGE_SIZE (3 * 8 * 8) + +struct reg_mask_range { + __u64 addr; /* Pointer to mask array */ + __u32 range; /* Requested range */ + __u32 reserved[13]; +}; + #endif #endif /* __ARM_KVM_H__ */ diff --git a/linux-headers/asm-generic/unistd.h b/linux-headers/asm-generic/unistd.h index abe087c53b4b..75f00965ab15 100644 --- a/linux-headers/asm-generic/unistd.h +++ b/linux-headers/asm-generic/unistd.h @@ -71,7 +71,7 @@ __SYSCALL(__NR_fremovexattr, sys_fremovexattr) #define __NR_getcwd 17 __SYSCALL(__NR_getcwd, sys_getcwd) #define __NR_lookup_dcookie 18 -__SC_COMP(__NR_lookup_dcookie, sys_lookup_dcookie, compat_sys_lookup_dcookie) +__SYSCALL(__NR_lookup_dcookie, sys_ni_syscall) #define __NR_eventfd2 19 __SYSCALL(__NR_eventfd2, sys_eventfd2) #define __NR_epoll_create1 20 @@ -816,15 +816,34 @@ __SYSCALL(__NR_process_mrelease, sys_process_mrelease) __SYSCALL(__NR_futex_waitv, sys_futex_waitv) #define __NR_set_mempolicy_home_node 450 __SYSCALL(__NR_set_mempolicy_home_node, sys_set_mempolicy_home_node) - #define __NR_cachestat 451 __SYSCALL(__NR_cachestat, sys_cachestat) - #define __NR_fchmodat2 452 __SYSCALL(__NR_fchmodat2, sys_fchmodat2) +#define __NR_map_shadow_stack 453 +__SYSCALL(__NR_map_shadow_stack, sys_map_shadow_stack) +#define __NR_futex_wake 454 +__SYSCALL(__NR_futex_wake, sys_futex_wake) +#define __NR_futex_wait 455 +__SYSCALL(__NR_futex_wait, sys_futex_wait) +#define __NR_futex_requeue 456 +__SYSCALL(__NR_futex_requeue, sys_futex_requeue) + +#define __NR_statmount 457 +__SYSCALL(__NR_statmount, sys_statmount) + +#define __NR_listmount 458 +__SYSCALL(__NR_listmount, sys_listmount) + +#define __NR_lsm_get_self_attr 459 +__SYSCALL(__NR_lsm_get_self_attr, sys_lsm_get_self_attr) +#define __NR_lsm_set_self_attr 460 +__SYSCALL(__NR_lsm_set_self_attr, sys_lsm_set_self_attr) +#define __NR_lsm_list_modules 461 +__SYSCALL(__NR_lsm_list_modules, sys_lsm_list_modules) #undef __NR_syscalls -#define __NR_syscalls 453 +#define __NR_syscalls 462 /* * 32 bit systems traditionally used different diff --git a/linux-headers/asm-loongarch/bitsperlong.h b/linux-headers/asm-loongarch/bitsperlong.h new file mode 100644 index 000000000000..6dc0bb0c13b2 --- /dev/null +++ b/linux-headers/asm-loongarch/bitsperlong.h @@ -0,0 +1 @@ +#include diff --git a/linux-headers/asm-loongarch/kvm.h b/linux-headers/asm-loongarch/kvm.h new file mode 100644 index 000000000000..923d0bd38294 --- /dev/null +++ b/linux-headers/asm-loongarch/kvm.h @@ -0,0 +1,109 @@ +/* SPDX-License-Identifier: GPL-2.0 WITH Linux-syscall-note */ +/* + * Copyright (C) 2020-2023 Loongson Technology Corporation Limited + */ + +#ifndef __UAPI_ASM_LOONGARCH_KVM_H +#define __UAPI_ASM_LOONGARCH_KVM_H + +#include + +/* + * KVM LoongArch specific structures and definitions. + * + * Some parts derived from the x86 version of this file. 
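A worked example of the index computation above (illustrative only; the encoding of ID_AA64PFR0_EL1 as op0=3, op1=0, CRn=0, CRm=4, op2=0 is standard architecture knowledge, not something this header spells out):

/* ((0 & 3) << 6) | ((4 & 7) << 3) | 0 == 32 */
int idx = KVM_ARM_FEATURE_ID_RANGE_IDX(3, 0, 0, 4, 0);
/* 32 fits within KVM_ARM_FEATURE_ID_RANGE_SIZE == 3 * 8 * 8 == 192 */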
+ */ + +#define __KVM_HAVE_READONLY_MEM + +#define KVM_COALESCED_MMIO_PAGE_OFFSET 1 +#define KVM_DIRTY_LOG_PAGE_OFFSET 64 + +/* + * for KVM_GET_REGS and KVM_SET_REGS + */ +struct kvm_regs { + /* out (KVM_GET_REGS) / in (KVM_SET_REGS) */ + __u64 gpr[32]; + __u64 pc; +}; + +/* + * for KVM_GET_FPU and KVM_SET_FPU + */ +struct kvm_fpu { + __u32 fcsr; + __u64 fcc; /* 8x8 */ + struct kvm_fpureg { + __u64 val64[4]; + } fpr[32]; +}; + +/* + * For LoongArch, we use KVM_SET_ONE_REG and KVM_GET_ONE_REG to access various + * registers. The id field is broken down as follows: + * + * bits[63..52] - As per linux/kvm.h + * bits[51..32] - Must be zero. + * bits[31..16] - Register set. + * + * Register set = 0: GP registers from kvm_regs (see definitions below). + * + * Register set = 1: CSR registers. + * + * Register set = 2: KVM specific registers (see definitions below). + * + * Register set = 3: FPU / SIMD registers (see definitions below). + * + * Other sets registers may be added in the future. Each set would + * have its own identifier in bits[31..16]. + */ + +#define KVM_REG_LOONGARCH_GPR (KVM_REG_LOONGARCH | 0x00000ULL) +#define KVM_REG_LOONGARCH_CSR (KVM_REG_LOONGARCH | 0x10000ULL) +#define KVM_REG_LOONGARCH_KVM (KVM_REG_LOONGARCH | 0x20000ULL) +#define KVM_REG_LOONGARCH_FPSIMD (KVM_REG_LOONGARCH | 0x30000ULL) +#define KVM_REG_LOONGARCH_CPUCFG (KVM_REG_LOONGARCH | 0x40000ULL) +#define KVM_REG_LOONGARCH_MASK (KVM_REG_LOONGARCH | 0x70000ULL) +#define KVM_CSR_IDX_MASK 0x7fff +#define KVM_CPUCFG_IDX_MASK 0x7fff + +/* + * KVM_REG_LOONGARCH_KVM - KVM specific control registers. + */ + +#define KVM_REG_LOONGARCH_COUNTER (KVM_REG_LOONGARCH_KVM | KVM_REG_SIZE_U64 | 1) +#define KVM_REG_LOONGARCH_VCPU_RESET (KVM_REG_LOONGARCH_KVM | KVM_REG_SIZE_U64 | 2) + +#define LOONGARCH_REG_SHIFT 3 +#define LOONGARCH_REG_64(TYPE, REG) (TYPE | KVM_REG_SIZE_U64 | (REG << LOONGARCH_REG_SHIFT)) +#define KVM_IOC_CSRID(REG) LOONGARCH_REG_64(KVM_REG_LOONGARCH_CSR, REG) +#define KVM_IOC_CPUCFG(REG) LOONGARCH_REG_64(KVM_REG_LOONGARCH_CPUCFG, REG) +#define KVM_LOONGARCH_VCPU_CPUCFG 0 + +struct kvm_debug_exit_arch { +}; + +/* for KVM_SET_GUEST_DEBUG */ +struct kvm_guest_debug_arch { +}; + +/* definition of registers in kvm_run */ +struct kvm_sync_regs { +}; + +/* dummy definition */ +struct kvm_sregs { +}; + +struct kvm_iocsr_entry { + __u32 addr; + __u32 pad; + __u64 data; +}; + +#define KVM_NR_IRQCHIPS 1 +#define KVM_IRQCHIP_NUM_PINS 64 +#define KVM_MAX_CORES 256 + +#endif /* __UAPI_ASM_LOONGARCH_KVM_H */ diff --git a/linux-headers/asm-loongarch/mman.h b/linux-headers/asm-loongarch/mman.h new file mode 100644 index 000000000000..8eebf89f5ab1 --- /dev/null +++ b/linux-headers/asm-loongarch/mman.h @@ -0,0 +1 @@ +#include diff --git a/linux-headers/asm-loongarch/unistd.h b/linux-headers/asm-loongarch/unistd.h new file mode 100644 index 000000000000..fcb668984f03 --- /dev/null +++ b/linux-headers/asm-loongarch/unistd.h @@ -0,0 +1,5 @@ +/* SPDX-License-Identifier: GPL-2.0 WITH Linux-syscall-note */ +#define __ARCH_WANT_SYS_CLONE +#define __ARCH_WANT_SYS_CLONE3 + +#include diff --git a/linux-headers/asm-mips/unistd_n32.h b/linux-headers/asm-mips/unistd_n32.h index 46d8500654c3..ce2e050a9ba4 100644 --- a/linux-headers/asm-mips/unistd_n32.h +++ b/linux-headers/asm-mips/unistd_n32.h @@ -381,5 +381,14 @@ #define __NR_set_mempolicy_home_node (__NR_Linux + 450) #define __NR_cachestat (__NR_Linux + 451) #define __NR_fchmodat2 (__NR_Linux + 452) +#define __NR_map_shadow_stack (__NR_Linux + 453) +#define __NR_futex_wake (__NR_Linux + 454) 
+#define __NR_futex_wait (__NR_Linux + 455) +#define __NR_futex_requeue (__NR_Linux + 456) +#define __NR_statmount (__NR_Linux + 457) +#define __NR_listmount (__NR_Linux + 458) +#define __NR_lsm_get_self_attr (__NR_Linux + 459) +#define __NR_lsm_set_self_attr (__NR_Linux + 460) +#define __NR_lsm_list_modules (__NR_Linux + 461) #endif /* _ASM_UNISTD_N32_H */ diff --git a/linux-headers/asm-mips/unistd_n64.h b/linux-headers/asm-mips/unistd_n64.h index c2f7ac673bb5..5bfb3733ffdf 100644 --- a/linux-headers/asm-mips/unistd_n64.h +++ b/linux-headers/asm-mips/unistd_n64.h @@ -357,5 +357,14 @@ #define __NR_set_mempolicy_home_node (__NR_Linux + 450) #define __NR_cachestat (__NR_Linux + 451) #define __NR_fchmodat2 (__NR_Linux + 452) +#define __NR_map_shadow_stack (__NR_Linux + 453) +#define __NR_futex_wake (__NR_Linux + 454) +#define __NR_futex_wait (__NR_Linux + 455) +#define __NR_futex_requeue (__NR_Linux + 456) +#define __NR_statmount (__NR_Linux + 457) +#define __NR_listmount (__NR_Linux + 458) +#define __NR_lsm_get_self_attr (__NR_Linux + 459) +#define __NR_lsm_set_self_attr (__NR_Linux + 460) +#define __NR_lsm_list_modules (__NR_Linux + 461) #endif /* _ASM_UNISTD_N64_H */ diff --git a/linux-headers/asm-mips/unistd_o32.h b/linux-headers/asm-mips/unistd_o32.h index 757c68f2add8..02eaecd020ec 100644 --- a/linux-headers/asm-mips/unistd_o32.h +++ b/linux-headers/asm-mips/unistd_o32.h @@ -427,5 +427,14 @@ #define __NR_set_mempolicy_home_node (__NR_Linux + 450) #define __NR_cachestat (__NR_Linux + 451) #define __NR_fchmodat2 (__NR_Linux + 452) +#define __NR_map_shadow_stack (__NR_Linux + 453) +#define __NR_futex_wake (__NR_Linux + 454) +#define __NR_futex_wait (__NR_Linux + 455) +#define __NR_futex_requeue (__NR_Linux + 456) +#define __NR_statmount (__NR_Linux + 457) +#define __NR_listmount (__NR_Linux + 458) +#define __NR_lsm_get_self_attr (__NR_Linux + 459) +#define __NR_lsm_set_self_attr (__NR_Linux + 460) +#define __NR_lsm_list_modules (__NR_Linux + 461) #endif /* _ASM_UNISTD_O32_H */ diff --git a/linux-headers/asm-powerpc/unistd_32.h b/linux-headers/asm-powerpc/unistd_32.h index 8ef94bbac138..bbab08d6ec26 100644 --- a/linux-headers/asm-powerpc/unistd_32.h +++ b/linux-headers/asm-powerpc/unistd_32.h @@ -434,6 +434,15 @@ #define __NR_set_mempolicy_home_node 450 #define __NR_cachestat 451 #define __NR_fchmodat2 452 +#define __NR_map_shadow_stack 453 +#define __NR_futex_wake 454 +#define __NR_futex_wait 455 +#define __NR_futex_requeue 456 +#define __NR_statmount 457 +#define __NR_listmount 458 +#define __NR_lsm_get_self_attr 459 +#define __NR_lsm_set_self_attr 460 +#define __NR_lsm_list_modules 461 #endif /* _ASM_UNISTD_32_H */ diff --git a/linux-headers/asm-powerpc/unistd_64.h b/linux-headers/asm-powerpc/unistd_64.h index 0e7ee43e884f..af34cde70f20 100644 --- a/linux-headers/asm-powerpc/unistd_64.h +++ b/linux-headers/asm-powerpc/unistd_64.h @@ -406,6 +406,15 @@ #define __NR_set_mempolicy_home_node 450 #define __NR_cachestat 451 #define __NR_fchmodat2 452 +#define __NR_map_shadow_stack 453 +#define __NR_futex_wake 454 +#define __NR_futex_wait 455 +#define __NR_futex_requeue 456 +#define __NR_statmount 457 +#define __NR_listmount 458 +#define __NR_lsm_get_self_attr 459 +#define __NR_lsm_set_self_attr 460 +#define __NR_lsm_list_modules 461 #endif /* _ASM_UNISTD_64_H */ diff --git a/linux-headers/asm-riscv/kvm.h b/linux-headers/asm-riscv/kvm.h index 992c5e407104..60d3b21dead7 100644 --- a/linux-headers/asm-riscv/kvm.h +++ b/linux-headers/asm-riscv/kvm.h @@ -80,6 +80,7 @@ struct kvm_riscv_csr { unsigned 
long sip; unsigned long satp; unsigned long scounteren; + unsigned long senvcfg; }; /* AIA CSR registers for KVM_GET_ONE_REG and KVM_SET_ONE_REG */ @@ -93,6 +94,11 @@ struct kvm_riscv_aia_csr { unsigned long iprio2h; }; +/* Smstateen CSR for KVM_GET_ONE_REG and KVM_SET_ONE_REG */ +struct kvm_riscv_smstateen_csr { + unsigned long sstateen0; +}; + /* TIMER registers for KVM_GET_ONE_REG and KVM_SET_ONE_REG */ struct kvm_riscv_timer { __u64 frequency; @@ -131,6 +137,8 @@ enum KVM_RISCV_ISA_EXT_ID { KVM_RISCV_ISA_EXT_ZICSR, KVM_RISCV_ISA_EXT_ZIFENCEI, KVM_RISCV_ISA_EXT_ZIHPM, + KVM_RISCV_ISA_EXT_SMSTATEEN, + KVM_RISCV_ISA_EXT_ZICOND, KVM_RISCV_ISA_EXT_MAX, }; @@ -148,6 +156,7 @@ enum KVM_RISCV_SBI_EXT_ID { KVM_RISCV_SBI_EXT_PMU, KVM_RISCV_SBI_EXT_EXPERIMENTAL, KVM_RISCV_SBI_EXT_VENDOR, + KVM_RISCV_SBI_EXT_DBCN, KVM_RISCV_SBI_EXT_MAX, }; @@ -178,10 +187,13 @@ enum KVM_RISCV_SBI_EXT_ID { #define KVM_REG_RISCV_CSR (0x03 << KVM_REG_RISCV_TYPE_SHIFT) #define KVM_REG_RISCV_CSR_GENERAL (0x0 << KVM_REG_RISCV_SUBTYPE_SHIFT) #define KVM_REG_RISCV_CSR_AIA (0x1 << KVM_REG_RISCV_SUBTYPE_SHIFT) +#define KVM_REG_RISCV_CSR_SMSTATEEN (0x2 << KVM_REG_RISCV_SUBTYPE_SHIFT) #define KVM_REG_RISCV_CSR_REG(name) \ (offsetof(struct kvm_riscv_csr, name) / sizeof(unsigned long)) #define KVM_REG_RISCV_CSR_AIA_REG(name) \ (offsetof(struct kvm_riscv_aia_csr, name) / sizeof(unsigned long)) +#define KVM_REG_RISCV_CSR_SMSTATEEN_REG(name) \ + (offsetof(struct kvm_riscv_smstateen_csr, name) / sizeof(unsigned long)) /* Timer registers are mapped as type 4 */ #define KVM_REG_RISCV_TIMER (0x04 << KVM_REG_RISCV_TYPE_SHIFT) diff --git a/linux-headers/asm-s390/unistd_32.h b/linux-headers/asm-s390/unistd_32.h index 716fa368ca71..a3ece69d8241 100644 --- a/linux-headers/asm-s390/unistd_32.h +++ b/linux-headers/asm-s390/unistd_32.h @@ -425,5 +425,14 @@ #define __NR_set_mempolicy_home_node 450 #define __NR_cachestat 451 #define __NR_fchmodat2 452 +#define __NR_map_shadow_stack 453 +#define __NR_futex_wake 454 +#define __NR_futex_wait 455 +#define __NR_futex_requeue 456 +#define __NR_statmount 457 +#define __NR_listmount 458 +#define __NR_lsm_get_self_attr 459 +#define __NR_lsm_set_self_attr 460 +#define __NR_lsm_list_modules 461 #endif /* _ASM_S390_UNISTD_32_H */ diff --git a/linux-headers/asm-s390/unistd_64.h b/linux-headers/asm-s390/unistd_64.h index b2a11b1d139f..8c5fd93495ce 100644 --- a/linux-headers/asm-s390/unistd_64.h +++ b/linux-headers/asm-s390/unistd_64.h @@ -373,5 +373,14 @@ #define __NR_set_mempolicy_home_node 450 #define __NR_cachestat 451 #define __NR_fchmodat2 452 +#define __NR_map_shadow_stack 453 +#define __NR_futex_wake 454 +#define __NR_futex_wait 455 +#define __NR_futex_requeue 456 +#define __NR_statmount 457 +#define __NR_listmount 458 +#define __NR_lsm_get_self_attr 459 +#define __NR_lsm_set_self_attr 460 +#define __NR_lsm_list_modules 461 #endif /* _ASM_S390_UNISTD_64_H */ diff --git a/linux-headers/asm-x86/kvm.h b/linux-headers/asm-x86/kvm.h index 2b3a8f7bd2c0..beec5facaeb9 100644 --- a/linux-headers/asm-x86/kvm.h +++ b/linux-headers/asm-x86/kvm.h @@ -560,4 +560,8 @@ struct kvm_pmu_event_filter { /* x86-specific KVM_EXIT_HYPERCALL flags. 
*/ #define KVM_EXIT_HYPERCALL_LONG_MODE BIT(0) +#define KVM_X86_DEFAULT_VM 0 +#define KVM_X86_SW_PROTECTED_VM 1 +#define KVM_X86_SNP_VM 3 + #endif /* _ASM_X86_KVM_H */ diff --git a/linux-headers/asm-x86/unistd_32.h b/linux-headers/asm-x86/unistd_32.h index d749ad1c24ec..5c9c329e9390 100644 --- a/linux-headers/asm-x86/unistd_32.h +++ b/linux-headers/asm-x86/unistd_32.h @@ -443,6 +443,15 @@ #define __NR_set_mempolicy_home_node 450 #define __NR_cachestat 451 #define __NR_fchmodat2 452 +#define __NR_map_shadow_stack 453 +#define __NR_futex_wake 454 +#define __NR_futex_wait 455 +#define __NR_futex_requeue 456 +#define __NR_statmount 457 +#define __NR_listmount 458 +#define __NR_lsm_get_self_attr 459 +#define __NR_lsm_set_self_attr 460 +#define __NR_lsm_list_modules 461 #endif /* _ASM_UNISTD_32_H */ diff --git a/linux-headers/asm-x86/unistd_64.h b/linux-headers/asm-x86/unistd_64.h index cea67282ebfe..d9aab7ae87d8 100644 --- a/linux-headers/asm-x86/unistd_64.h +++ b/linux-headers/asm-x86/unistd_64.h @@ -366,6 +366,14 @@ #define __NR_cachestat 451 #define __NR_fchmodat2 452 #define __NR_map_shadow_stack 453 +#define __NR_futex_wake 454 +#define __NR_futex_wait 455 +#define __NR_futex_requeue 456 +#define __NR_statmount 457 +#define __NR_listmount 458 +#define __NR_lsm_get_self_attr 459 +#define __NR_lsm_set_self_attr 460 +#define __NR_lsm_list_modules 461 #endif /* _ASM_UNISTD_64_H */ diff --git a/linux-headers/asm-x86/unistd_x32.h b/linux-headers/asm-x86/unistd_x32.h index 5b2e79bf4c46..63cdd1ee43df 100644 --- a/linux-headers/asm-x86/unistd_x32.h +++ b/linux-headers/asm-x86/unistd_x32.h @@ -318,6 +318,14 @@ #define __NR_set_mempolicy_home_node (__X32_SYSCALL_BIT + 450) #define __NR_cachestat (__X32_SYSCALL_BIT + 451) #define __NR_fchmodat2 (__X32_SYSCALL_BIT + 452) +#define __NR_futex_wake (__X32_SYSCALL_BIT + 454) +#define __NR_futex_wait (__X32_SYSCALL_BIT + 455) +#define __NR_futex_requeue (__X32_SYSCALL_BIT + 456) +#define __NR_statmount (__X32_SYSCALL_BIT + 457) +#define __NR_listmount (__X32_SYSCALL_BIT + 458) +#define __NR_lsm_get_self_attr (__X32_SYSCALL_BIT + 459) +#define __NR_lsm_set_self_attr (__X32_SYSCALL_BIT + 460) +#define __NR_lsm_list_modules (__X32_SYSCALL_BIT + 461) #define __NR_rt_sigaction (__X32_SYSCALL_BIT + 512) #define __NR_rt_sigreturn (__X32_SYSCALL_BIT + 513) #define __NR_ioctl (__X32_SYSCALL_BIT + 514) diff --git a/linux-headers/linux/iommufd.h b/linux-headers/linux/iommufd.h index 218bf7ac98d0..806d98d09c01 100644 --- a/linux-headers/linux/iommufd.h +++ b/linux-headers/linux/iommufd.h @@ -47,6 +47,8 @@ enum { IOMMUFD_CMD_VFIO_IOAS, IOMMUFD_CMD_HWPT_ALLOC, IOMMUFD_CMD_GET_HW_INFO, + IOMMUFD_CMD_HWPT_SET_DIRTY_TRACKING, + IOMMUFD_CMD_HWPT_GET_DIRTY_BITMAP, }; /** @@ -347,20 +349,86 @@ struct iommu_vfio_ioas { }; #define IOMMU_VFIO_IOAS _IO(IOMMUFD_TYPE, IOMMUFD_CMD_VFIO_IOAS) +/** + * enum iommufd_hwpt_alloc_flags - Flags for HWPT allocation + * @IOMMU_HWPT_ALLOC_NEST_PARENT: If set, allocate a HWPT that can serve as + * the parent HWPT in a nesting configuration. 
+ * @IOMMU_HWPT_ALLOC_DIRTY_TRACKING: Dirty tracking support for device IOMMU is + * enforced on device attachment + */ +enum iommufd_hwpt_alloc_flags { + IOMMU_HWPT_ALLOC_NEST_PARENT = 1 << 0, + IOMMU_HWPT_ALLOC_DIRTY_TRACKING = 1 << 1, +}; + +/** + * enum iommu_hwpt_vtd_s1_flags - Intel VT-d stage-1 page table + * entry attributes + * @IOMMU_VTD_S1_SRE: Supervisor request + * @IOMMU_VTD_S1_EAFE: Extended access enable + * @IOMMU_VTD_S1_WPE: Write protect enable + */ +enum iommu_hwpt_vtd_s1_flags { + IOMMU_VTD_S1_SRE = 1 << 0, + IOMMU_VTD_S1_EAFE = 1 << 1, + IOMMU_VTD_S1_WPE = 1 << 2, +}; + +/** + * struct iommu_hwpt_vtd_s1 - Intel VT-d stage-1 page table + * info (IOMMU_HWPT_DATA_VTD_S1) + * @flags: Combination of enum iommu_hwpt_vtd_s1_flags + * @pgtbl_addr: The base address of the stage-1 page table. + * @addr_width: The address width of the stage-1 page table + * @__reserved: Must be 0 + */ +struct iommu_hwpt_vtd_s1 { + __aligned_u64 flags; + __aligned_u64 pgtbl_addr; + __u32 addr_width; + __u32 __reserved; +}; + +/** + * enum iommu_hwpt_data_type - IOMMU HWPT Data Type + * @IOMMU_HWPT_DATA_NONE: no data + * @IOMMU_HWPT_DATA_VTD_S1: Intel VT-d stage-1 page table + */ +enum iommu_hwpt_data_type { + IOMMU_HWPT_DATA_NONE, + IOMMU_HWPT_DATA_VTD_S1, +}; + /** * struct iommu_hwpt_alloc - ioctl(IOMMU_HWPT_ALLOC) * @size: sizeof(struct iommu_hwpt_alloc) - * @flags: Must be 0 + * @flags: Combination of enum iommufd_hwpt_alloc_flags * @dev_id: The device to allocate this HWPT for - * @pt_id: The IOAS to connect this HWPT to + * @pt_id: The IOAS or HWPT to connect this HWPT to * @out_hwpt_id: The ID of the new HWPT * @__reserved: Must be 0 + * @data_type: One of enum iommu_hwpt_data_type + * @data_len: Length of the type specific data + * @data_uptr: User pointer to the type specific data * * Explicitly allocate a hardware page table object. This is the same object * type that is returned by iommufd_device_attach() and represents the * underlying iommu driver's iommu_domain kernel object. * - * A HWPT will be created with the IOVA mappings from the given IOAS. + * A kernel-managed HWPT will be created with the mappings from the given + * IOAS via the @pt_id. The @data_type for this allocation must be set to + * IOMMU_HWPT_DATA_NONE. The HWPT can be allocated as a parent HWPT for a + * nesting configuration by passing IOMMU_HWPT_ALLOC_NEST_PARENT via @flags. + * + * A user-managed nested HWPT will be created from a given parent HWPT via + * @pt_id, in which the parent HWPT must be allocated previously via the + * same ioctl from a given IOAS (@pt_id). In this case, the @data_type + * must be set to a pre-defined type corresponding to an I/O page table + * type supported by the underlying IOMMU hardware. + * + * If the @data_type is set to IOMMU_HWPT_DATA_NONE, @data_len and + * @data_uptr should be zero. Otherwise, both @data_len and @data_uptr + * must be given. */ struct iommu_hwpt_alloc { __u32 size; @@ -369,13 +437,26 @@ struct iommu_hwpt_alloc { __u32 pt_id; __u32 out_hwpt_id; __u32 __reserved; + __u32 data_type; + __u32 data_len; + __aligned_u64 data_uptr; }; #define IOMMU_HWPT_ALLOC _IO(IOMMUFD_TYPE, IOMMUFD_CMD_HWPT_ALLOC) +/** + * enum iommu_hw_info_vtd_flags - Flags for VT-d hw_info + * @IOMMU_HW_INFO_VTD_ERRATA_772415_SPR17: If set, disallow read-only mappings + * on a nested_parent domain. 
+ * https://www.intel.com/content/www/us/en/content-details/772415/content-details.html + */ +enum iommu_hw_info_vtd_flags { + IOMMU_HW_INFO_VTD_ERRATA_772415_SPR17 = 1 << 0, +}; + /** * struct iommu_hw_info_vtd - Intel VT-d hardware information * - * @flags: Must be 0 + * @flags: Combination of enum iommu_hw_info_vtd_flags * @__reserved: Must be 0 * * @cap_reg: Value of Intel VT-d capability register defined in VT-d spec @@ -404,6 +485,20 @@ enum iommu_hw_info_type { IOMMU_HW_INFO_TYPE_INTEL_VTD, }; +/** + * enum iommufd_hw_capabilities + * @IOMMU_HW_CAP_DIRTY_TRACKING: IOMMU hardware support for dirty tracking + * If available, it means the following APIs + * are supported: + * + * IOMMU_HWPT_GET_DIRTY_BITMAP + * IOMMU_HWPT_SET_DIRTY_TRACKING + * + */ +enum iommufd_hw_capabilities { + IOMMU_HW_CAP_DIRTY_TRACKING = 1 << 0, +}; + /** * struct iommu_hw_info - ioctl(IOMMU_GET_HW_INFO) * @size: sizeof(struct iommu_hw_info) @@ -415,6 +510,8 @@ enum iommu_hw_info_type { * the iommu type specific hardware information data * @out_data_type: Output the iommu hardware info type as defined in the enum * iommu_hw_info_type. + * @out_capabilities: Output the generic iommu capability info type as defined + * in the enum iommu_hw_capabilities. * @__reserved: Must be 0 * * Query an iommu type specific hardware information data from an iommu behind @@ -439,6 +536,81 @@ struct iommu_hw_info { __aligned_u64 data_uptr; __u32 out_data_type; __u32 __reserved; + __aligned_u64 out_capabilities; }; #define IOMMU_GET_HW_INFO _IO(IOMMUFD_TYPE, IOMMUFD_CMD_GET_HW_INFO) + +/* + * enum iommufd_hwpt_set_dirty_tracking_flags - Flags for steering dirty + * tracking + * @IOMMU_HWPT_DIRTY_TRACKING_ENABLE: Enable dirty tracking + */ +enum iommufd_hwpt_set_dirty_tracking_flags { + IOMMU_HWPT_DIRTY_TRACKING_ENABLE = 1, +}; + +/** + * struct iommu_hwpt_set_dirty_tracking - ioctl(IOMMU_HWPT_SET_DIRTY_TRACKING) + * @size: sizeof(struct iommu_hwpt_set_dirty_tracking) + * @flags: Combination of enum iommufd_hwpt_set_dirty_tracking_flags + * @hwpt_id: HW pagetable ID that represents the IOMMU domain + * @__reserved: Must be 0 + * + * Toggle dirty tracking on an HW pagetable. + */ +struct iommu_hwpt_set_dirty_tracking { + __u32 size; + __u32 flags; + __u32 hwpt_id; + __u32 __reserved; +}; +#define IOMMU_HWPT_SET_DIRTY_TRACKING _IO(IOMMUFD_TYPE, \ + IOMMUFD_CMD_HWPT_SET_DIRTY_TRACKING) + +/** + * enum iommufd_hwpt_get_dirty_bitmap_flags - Flags for getting dirty bits + * @IOMMU_HWPT_GET_DIRTY_BITMAP_NO_CLEAR: Just read the PTEs without clearing + * any dirty bits metadata. This flag + * can be passed in the expectation + * where the next operation is an unmap + * of the same IOVA range. + * + */ +enum iommufd_hwpt_get_dirty_bitmap_flags { + IOMMU_HWPT_GET_DIRTY_BITMAP_NO_CLEAR = 1, +}; + +/** + * struct iommu_hwpt_get_dirty_bitmap - ioctl(IOMMU_HWPT_GET_DIRTY_BITMAP) + * @size: sizeof(struct iommu_hwpt_get_dirty_bitmap) + * @hwpt_id: HW pagetable ID that represents the IOMMU domain + * @flags: Combination of enum iommufd_hwpt_get_dirty_bitmap_flags + * @__reserved: Must be 0 + * @iova: base IOVA of the bitmap first bit + * @length: IOVA range size + * @page_size: page size granularity of each bit in the bitmap + * @data: bitmap where to set the dirty bits. The bitmap bits each + * represent a page_size which you deviate from an arbitrary iova. 
+ * + * Checking a given IOVA is dirty: + * + * data[(iova / page_size) / 64] & (1ULL << ((iova / page_size) % 64)) + * + * Walk the IOMMU pagetables for a given IOVA range to return a bitmap + * with the dirty IOVAs. In doing so it will also by default clear any + * dirty bit metadata set in the IOPTE. + */ +struct iommu_hwpt_get_dirty_bitmap { + __u32 size; + __u32 hwpt_id; + __u32 flags; + __u32 __reserved; + __aligned_u64 iova; + __aligned_u64 length; + __aligned_u64 page_size; + __aligned_u64 data; +}; +#define IOMMU_HWPT_GET_DIRTY_BITMAP _IO(IOMMUFD_TYPE, \ + IOMMUFD_CMD_HWPT_GET_DIRTY_BITMAP) + #endif diff --git a/linux-headers/linux/kvm.h b/linux-headers/linux/kvm.h index 0d74ee999aa9..154027b94d8f 100644 --- a/linux-headers/linux/kvm.h +++ b/linux-headers/linux/kvm.h @@ -16,76 +16,6 @@ #define KVM_API_VERSION 12 -/* *** Deprecated interfaces *** */ - -#define KVM_TRC_SHIFT 16 - -#define KVM_TRC_ENTRYEXIT (1 << KVM_TRC_SHIFT) -#define KVM_TRC_HANDLER (1 << (KVM_TRC_SHIFT + 1)) - -#define KVM_TRC_VMENTRY (KVM_TRC_ENTRYEXIT + 0x01) -#define KVM_TRC_VMEXIT (KVM_TRC_ENTRYEXIT + 0x02) -#define KVM_TRC_PAGE_FAULT (KVM_TRC_HANDLER + 0x01) - -#define KVM_TRC_HEAD_SIZE 12 -#define KVM_TRC_CYCLE_SIZE 8 -#define KVM_TRC_EXTRA_MAX 7 - -#define KVM_TRC_INJ_VIRQ (KVM_TRC_HANDLER + 0x02) -#define KVM_TRC_REDELIVER_EVT (KVM_TRC_HANDLER + 0x03) -#define KVM_TRC_PEND_INTR (KVM_TRC_HANDLER + 0x04) -#define KVM_TRC_IO_READ (KVM_TRC_HANDLER + 0x05) -#define KVM_TRC_IO_WRITE (KVM_TRC_HANDLER + 0x06) -#define KVM_TRC_CR_READ (KVM_TRC_HANDLER + 0x07) -#define KVM_TRC_CR_WRITE (KVM_TRC_HANDLER + 0x08) -#define KVM_TRC_DR_READ (KVM_TRC_HANDLER + 0x09) -#define KVM_TRC_DR_WRITE (KVM_TRC_HANDLER + 0x0A) -#define KVM_TRC_MSR_READ (KVM_TRC_HANDLER + 0x0B) -#define KVM_TRC_MSR_WRITE (KVM_TRC_HANDLER + 0x0C) -#define KVM_TRC_CPUID (KVM_TRC_HANDLER + 0x0D) -#define KVM_TRC_INTR (KVM_TRC_HANDLER + 0x0E) -#define KVM_TRC_NMI (KVM_TRC_HANDLER + 0x0F) -#define KVM_TRC_VMMCALL (KVM_TRC_HANDLER + 0x10) -#define KVM_TRC_HLT (KVM_TRC_HANDLER + 0x11) -#define KVM_TRC_CLTS (KVM_TRC_HANDLER + 0x12) -#define KVM_TRC_LMSW (KVM_TRC_HANDLER + 0x13) -#define KVM_TRC_APIC_ACCESS (KVM_TRC_HANDLER + 0x14) -#define KVM_TRC_TDP_FAULT (KVM_TRC_HANDLER + 0x15) -#define KVM_TRC_GTLB_WRITE (KVM_TRC_HANDLER + 0x16) -#define KVM_TRC_STLB_WRITE (KVM_TRC_HANDLER + 0x17) -#define KVM_TRC_STLB_INVAL (KVM_TRC_HANDLER + 0x18) -#define KVM_TRC_PPC_INSTR (KVM_TRC_HANDLER + 0x19) - -struct kvm_user_trace_setup { - __u32 buf_size; - __u32 buf_nr; -}; - -#define __KVM_DEPRECATED_MAIN_W_0x06 \ - _IOW(KVMIO, 0x06, struct kvm_user_trace_setup) -#define __KVM_DEPRECATED_MAIN_0x07 _IO(KVMIO, 0x07) -#define __KVM_DEPRECATED_MAIN_0x08 _IO(KVMIO, 0x08) - -#define __KVM_DEPRECATED_VM_R_0x70 _IOR(KVMIO, 0x70, struct kvm_assigned_irq) - -struct kvm_breakpoint { - __u32 enabled; - __u32 padding; - __u64 address; -}; - -struct kvm_debug_guest { - __u32 enabled; - __u32 pad; - struct kvm_breakpoint breakpoints[4]; - __u32 singlestep; -}; - -#define __KVM_DEPRECATED_VCPU_W_0x87 _IOW(KVMIO, 0x87, struct kvm_debug_guest) - -/* *** End of deprecated interfaces *** */ - - /* for KVM_SET_USER_MEMORY_REGION */ struct kvm_userspace_memory_region { __u32 slot; @@ -95,6 +25,19 @@ struct kvm_userspace_memory_region { __u64 userspace_addr; /* start of the userspace allocated memory */ }; +/* for KVM_SET_USER_MEMORY_REGION2 */ +struct kvm_userspace_memory_region2 { + __u32 slot; + __u32 flags; + __u64 guest_phys_addr; + __u64 memory_size; + __u64 userspace_addr; + __u64 
guest_memfd_offset; + __u32 guest_memfd; + __u32 pad1; + __u64 pad2[14]; +}; + /* * The bit 0 ~ bit 15 of kvm_userspace_memory_region::flags are visible for * userspace, other bits are reserved for kvm internal use which are defined @@ -102,6 +45,7 @@ struct kvm_userspace_memory_region { */ #define KVM_MEM_LOG_DIRTY_PAGES (1UL << 0) #define KVM_MEM_READONLY (1UL << 1) +#define KVM_MEM_GUEST_MEMFD (1UL << 2) /* for KVM_IRQ_LINE */ struct kvm_irq_level { @@ -223,6 +167,31 @@ struct kvm_xen_exit { } u; }; +struct kvm_user_vmgexit { +#define KVM_USER_VMGEXIT_PSC_MSR 1 +#define KVM_USER_VMGEXIT_PSC 2 +#define KVM_USER_VMGEXIT_EXT_GUEST_REQ 3 + __u32 type; /* KVM_USER_VMGEXIT_* type */ + union { + struct { + __u64 gpa; +#define KVM_USER_VMGEXIT_PSC_MSR_OP_PRIVATE 1 +#define KVM_USER_VMGEXIT_PSC_MSR_OP_SHARED 2 + __u8 op; + __u32 ret; + } psc_msr; + struct { + __u64 shared_gpa; + __u64 ret; + } psc; + struct { + __u64 data_gpa; + __u64 data_npages; + __u32 ret; + } ext_guest_req; + }; +}; + #define KVM_S390_GET_SKEYS_NONE 1 #define KVM_S390_SKEYS_MAX 1048576 @@ -264,6 +233,9 @@ struct kvm_xen_exit { #define KVM_EXIT_RISCV_SBI 35 #define KVM_EXIT_RISCV_CSR 36 #define KVM_EXIT_NOTIFY 37 +#define KVM_EXIT_LOONGARCH_IOCSR 38 +#define KVM_EXIT_MEMORY_FAULT 39 +#define KVM_EXIT_VMGEXIT 40 /* For KVM_EXIT_INTERNAL_ERROR */ /* Emulate instruction failed. */ @@ -336,6 +308,13 @@ struct kvm_run { __u32 len; __u8 is_write; } mmio; + /* KVM_EXIT_LOONGARCH_IOCSR */ + struct { + __u64 phys_addr; + __u8 data[8]; + __u32 len; + __u8 is_write; + } iocsr_io; /* KVM_EXIT_HYPERCALL */ struct { __u64 nr; @@ -506,6 +485,15 @@ struct kvm_run { #define KVM_NOTIFY_CONTEXT_INVALID (1 << 0) __u32 flags; } notify; + /* KVM_EXIT_MEMORY_FAULT */ + struct { +#define KVM_MEMORY_EXIT_FLAG_PRIVATE (1ULL << 3) + __u64 flags; + __u64 gpa; + __u64 size; + } memory_fault; + /* KVM_EXIT_VMGEXIT */ + struct kvm_user_vmgexit vmgexit; /* Fix the size of the union. 
*/ char padding[256]; }; @@ -933,9 +921,6 @@ struct kvm_ppc_resize_hpt { */ #define KVM_GET_VCPU_MMAP_SIZE _IO(KVMIO, 0x04) /* in bytes */ #define KVM_GET_SUPPORTED_CPUID _IOWR(KVMIO, 0x05, struct kvm_cpuid2) -#define KVM_TRACE_ENABLE __KVM_DEPRECATED_MAIN_W_0x06 -#define KVM_TRACE_PAUSE __KVM_DEPRECATED_MAIN_0x07 -#define KVM_TRACE_DISABLE __KVM_DEPRECATED_MAIN_0x08 #define KVM_GET_EMULATED_CPUID _IOWR(KVMIO, 0x09, struct kvm_cpuid2) #define KVM_GET_MSR_FEATURE_INDEX_LIST _IOWR(KVMIO, 0x0a, struct kvm_msr_list) @@ -1188,6 +1173,12 @@ struct kvm_ppc_resize_hpt { #define KVM_CAP_COUNTER_OFFSET 227 #define KVM_CAP_ARM_EAGER_SPLIT_CHUNK_SIZE 228 #define KVM_CAP_ARM_SUPPORTED_BLOCK_SIZES 229 +#define KVM_CAP_ARM_SUPPORTED_REG_MASK_RANGES 230 +#define KVM_CAP_USER_MEMORY2 231 +#define KVM_CAP_MEMORY_FAULT_INFO 232 +#define KVM_CAP_MEMORY_ATTRIBUTES 233 +#define KVM_CAP_GUEST_MEMFD 234 +#define KVM_CAP_VM_TYPES 235 #ifdef KVM_CAP_IRQ_ROUTING @@ -1278,6 +1269,7 @@ struct kvm_x86_mce { #define KVM_XEN_HVM_CONFIG_EVTCHN_2LEVEL (1 << 4) #define KVM_XEN_HVM_CONFIG_EVTCHN_SEND (1 << 5) #define KVM_XEN_HVM_CONFIG_RUNSTATE_UPDATE_FLAG (1 << 6) +#define KVM_XEN_HVM_CONFIG_PVCLOCK_TSC_UNSTABLE (1 << 7) struct kvm_xen_hvm_config { __u32 flags; @@ -1358,6 +1350,7 @@ struct kvm_dirty_tlb { #define KVM_REG_ARM64 0x6000000000000000ULL #define KVM_REG_MIPS 0x7000000000000000ULL #define KVM_REG_RISCV 0x8000000000000000ULL +#define KVM_REG_LOONGARCH 0x9000000000000000ULL #define KVM_REG_SIZE_SHIFT 52 #define KVM_REG_SIZE_MASK 0x00f0000000000000ULL @@ -1469,6 +1462,8 @@ struct kvm_vfio_spapr_tce { struct kvm_userspace_memory_region) #define KVM_SET_TSS_ADDR _IO(KVMIO, 0x47) #define KVM_SET_IDENTITY_MAP_ADDR _IOW(KVMIO, 0x48, __u64) +#define KVM_SET_USER_MEMORY_REGION2 _IOW(KVMIO, 0x49, \ + struct kvm_userspace_memory_region2) /* enable ucontrol for s390 */ struct kvm_s390_ucas_mapping { @@ -1493,20 +1488,8 @@ struct kvm_s390_ucas_mapping { _IOW(KVMIO, 0x67, struct kvm_coalesced_mmio_zone) #define KVM_UNREGISTER_COALESCED_MMIO \ _IOW(KVMIO, 0x68, struct kvm_coalesced_mmio_zone) -#define KVM_ASSIGN_PCI_DEVICE _IOR(KVMIO, 0x69, \ - struct kvm_assigned_pci_dev) #define KVM_SET_GSI_ROUTING _IOW(KVMIO, 0x6a, struct kvm_irq_routing) -/* deprecated, replaced by KVM_ASSIGN_DEV_IRQ */ -#define KVM_ASSIGN_IRQ __KVM_DEPRECATED_VM_R_0x70 -#define KVM_ASSIGN_DEV_IRQ _IOW(KVMIO, 0x70, struct kvm_assigned_irq) #define KVM_REINJECT_CONTROL _IO(KVMIO, 0x71) -#define KVM_DEASSIGN_PCI_DEVICE _IOW(KVMIO, 0x72, \ - struct kvm_assigned_pci_dev) -#define KVM_ASSIGN_SET_MSIX_NR _IOW(KVMIO, 0x73, \ - struct kvm_assigned_msix_nr) -#define KVM_ASSIGN_SET_MSIX_ENTRY _IOW(KVMIO, 0x74, \ - struct kvm_assigned_msix_entry) -#define KVM_DEASSIGN_DEV_IRQ _IOW(KVMIO, 0x75, struct kvm_assigned_irq) #define KVM_IRQFD _IOW(KVMIO, 0x76, struct kvm_irqfd) #define KVM_CREATE_PIT2 _IOW(KVMIO, 0x77, struct kvm_pit_config) #define KVM_SET_BOOT_CPU_ID _IO(KVMIO, 0x78) @@ -1523,9 +1506,6 @@ struct kvm_s390_ucas_mapping { * KVM_CAP_VM_TSC_CONTROL to set defaults for a VM */ #define KVM_SET_TSC_KHZ _IO(KVMIO, 0xa2) #define KVM_GET_TSC_KHZ _IO(KVMIO, 0xa3) -/* Available with KVM_CAP_PCI_2_3 */ -#define KVM_ASSIGN_SET_INTX_MASK _IOW(KVMIO, 0xa4, \ - struct kvm_assigned_pci_dev) /* Available with KVM_CAP_SIGNAL_MSI */ #define KVM_SIGNAL_MSI _IOW(KVMIO, 0xa5, struct kvm_msi) /* Available with KVM_CAP_PPC_GET_SMMU_INFO */ @@ -1558,6 +1538,7 @@ struct kvm_s390_ucas_mapping { #define KVM_ARM_MTE_COPY_TAGS _IOR(KVMIO, 0xb4, struct kvm_arm_copy_mte_tags) /* Available 
with KVM_CAP_COUNTER_OFFSET */ #define KVM_ARM_SET_COUNTER_OFFSET _IOW(KVMIO, 0xb5, struct kvm_arm_counter_offset) +#define KVM_ARM_GET_REG_WRITABLE_MASKS _IOR(KVMIO, 0xb6, struct reg_mask_range) /* ioctl for vm fd */ #define KVM_CREATE_DEVICE _IOWR(KVMIO, 0xe0, struct kvm_create_device) @@ -1577,8 +1558,6 @@ struct kvm_s390_ucas_mapping { #define KVM_SET_SREGS _IOW(KVMIO, 0x84, struct kvm_sregs) #define KVM_TRANSLATE _IOWR(KVMIO, 0x85, struct kvm_translation) #define KVM_INTERRUPT _IOW(KVMIO, 0x86, struct kvm_interrupt) -/* KVM_DEBUG_GUEST is no longer supported, use KVM_SET_GUEST_DEBUG instead */ -#define KVM_DEBUG_GUEST __KVM_DEPRECATED_VCPU_W_0x87 #define KVM_GET_MSRS _IOWR(KVMIO, 0x88, struct kvm_msrs) #define KVM_SET_MSRS _IOW(KVMIO, 0x89, struct kvm_msrs) #define KVM_SET_CPUID _IOW(KVMIO, 0x8a, struct kvm_cpuid) @@ -1914,6 +1893,12 @@ enum sev_cmd_id { /* Guest Migration Extension */ KVM_SEV_SEND_CANCEL, + /* SNP specific commands */ + KVM_SEV_SNP_INIT, + KVM_SEV_SNP_LAUNCH_START, + KVM_SEV_SNP_LAUNCH_UPDATE, + KVM_SEV_SNP_LAUNCH_FINISH, + KVM_SEV_NR_MAX, }; @@ -2010,6 +1995,56 @@ struct kvm_sev_receive_update_data { __u32 trans_len; }; +/* enable the restricted injection */ +#define KVM_SEV_SNP_RESTRICTED_INJET (1 << 0) + +/* enable the restricted injection timer */ +#define KVM_SEV_SNP_RESTRICTED_TIMER_INJET (1 << 1) + +struct kvm_snp_init { + __u64 flags; +}; + +struct kvm_sev_snp_launch_start { + __u64 policy; + __u64 ma_uaddr; + __u8 ma_en; + __u8 imi_en; + __u8 gosvw[16]; + __u8 pad[6]; +}; + +#define KVM_SEV_SNP_PAGE_TYPE_NORMAL 0x1 +#define KVM_SEV_SNP_PAGE_TYPE_VMSA 0x2 +#define KVM_SEV_SNP_PAGE_TYPE_ZERO 0x3 +#define KVM_SEV_SNP_PAGE_TYPE_UNMEASURED 0x4 +#define KVM_SEV_SNP_PAGE_TYPE_SECRETS 0x5 +#define KVM_SEV_SNP_PAGE_TYPE_CPUID 0x6 + +struct kvm_sev_snp_launch_update { + __u64 start_gfn; + __u64 uaddr; + __u32 len; + __u8 imi_page; + __u8 page_type; + __u8 vmpl3_perms; + __u8 vmpl2_perms; + __u8 vmpl1_perms; +}; + +#define KVM_SEV_SNP_ID_BLOCK_SIZE 96 +#define KVM_SEV_SNP_ID_AUTH_SIZE 4096 +#define KVM_SEV_SNP_FINISH_DATA_SIZE 32 + +struct kvm_sev_snp_launch_finish { + __u64 id_block_uaddr; + __u64 id_auth_uaddr; + __u8 id_block_en; + __u8 auth_key_en; + __u8 host_data[KVM_SEV_SNP_FINISH_DATA_SIZE]; + __u8 pad[6]; +}; + #define KVM_DEV_ASSIGN_ENABLE_IOMMU (1 << 0) #define KVM_DEV_ASSIGN_PCI_2_3 (1 << 1) #define KVM_DEV_ASSIGN_MASK_INTX (1 << 2) @@ -2252,4 +2287,24 @@ struct kvm_s390_zpci_op { /* flags for kvm_s390_zpci_op->u.reg_aen.flags */ #define KVM_S390_ZPCIOP_REGAEN_HOST (1 << 0) +/* Available with KVM_CAP_MEMORY_ATTRIBUTES */ +#define KVM_SET_MEMORY_ATTRIBUTES _IOW(KVMIO, 0xd2, struct kvm_memory_attributes) + +struct kvm_memory_attributes { + __u64 address; + __u64 size; + __u64 attributes; + __u64 flags; +}; + +#define KVM_MEMORY_ATTRIBUTE_PRIVATE (1ULL << 3) + +#define KVM_CREATE_GUEST_MEMFD _IOWR(KVMIO, 0xd4, struct kvm_create_guest_memfd) + +struct kvm_create_guest_memfd { + __u64 size; + __u64 flags; + __u64 reserved[6]; +}; + #endif /* __LINUX_KVM_H */ diff --git a/linux-headers/linux/psp-sev.h b/linux-headers/linux/psp-sev.h index 12ccb70099d4..1cd9727d9a0f 100644 --- a/linux-headers/linux/psp-sev.h +++ b/linux-headers/linux/psp-sev.h @@ -28,6 +28,11 @@ enum { SEV_PEK_CERT_IMPORT, SEV_GET_ID, /* This command is deprecated, use SEV_GET_ID2 */ SEV_GET_ID2, + SNP_PLATFORM_STATUS, + SNP_COMMIT, + SNP_SET_CONFIG, + SNP_SET_CONFIG_START, + SNP_SET_CONFIG_END, SEV_MAX, }; @@ -68,6 +73,13 @@ typedef enum { SEV_RET_INVALID_PARAM, SEV_RET_RESOURCE_LIMIT, 
SEV_RET_SECURE_DATA_INVALID, + SEV_RET_INVALID_KEY = 0x27, + SEV_RET_INVALID_PAGE_SIZE, + SEV_RET_INVALID_PAGE_STATE, + SEV_RET_INVALID_MDATA_ENTRY, + SEV_RET_INVALID_PAGE_OWNER, + SEV_RET_INVALID_PAGE_AEAD_OFLOW, + SEV_RET_RMP_INIT_REQUIRED, SEV_RET_MAX, } sev_ret_code; @@ -154,6 +166,66 @@ struct sev_user_data_get_id2 { __u32 length; /* In/Out */ } __attribute__((packed)); +/** + * struct sev_user_data_snp_status - SNP status + * + * @api_major: API major version + * @api_minor: API minor version + * @state: current platform state + * @is_rmp_initialized: whether RMP is initialized or not + * @rsvd: reserved + * @build_id: firmware build id for the API version + * @mask_chip_id: whether chip id is present in attestation reports or not + * @mask_chip_key: whether attestation reports are signed or not + * @vlek_en: VLEK hashstick is loaded + * @rsvd1: reserved + * @guest_count: the number of guest currently managed by the firmware + * @current_tcb_version: current TCB version + * @reported_tcb_version: reported TCB version + */ +struct sev_user_data_snp_status { + __u8 api_major; /* Out */ + __u8 api_minor; /* Out */ + __u8 state; /* Out */ + __u8 is_rmp_initialized:1; /* Out */ + __u8 rsvd:7; + __u32 build_id; /* Out */ + __u32 mask_chip_id:1; /* Out */ + __u32 mask_chip_key:1; /* Out */ + __u32 vlek_en:1; /* Out */ + __u32 rsvd1:29; + __u32 guest_count; /* Out */ + __u64 current_tcb_version; /* Out */ + __u64 reported_tcb_version; /* Out */ +} __attribute__((packed)); + +/** + * struct sev_user_data_snp_config - system wide configuration value for SNP. + * + * @reported_tcb: the TCB version to report in the guest attestation report. + * @mask_chip_id: whether chip id is present in attestation reports or not + * @mask_chip_key: whether attestation reports are signed or not + * @rsvd: reserved + * @rsvd1: reserved + */ +struct sev_user_data_snp_config { + __u64 reported_tcb ; /* In */ + __u32 mask_chip_id:1; /* In */ + __u32 mask_chip_key:1; /* In */ + __u32 rsvd:30; /* In */ + __u8 rsvd1[52]; +} __attribute__((packed)); + +/** + * struct sev_user_data_snp_config_transaction - metadata for config transactions + * + * @id: the ID of the transaction started/ended by a call to SNP_SET_CONFIG_START + * or SNP_SET_CONFIG_END, respectively. + */ +struct sev_user_data_snp_config_transaction { + __u64 id; /* Out */ +} __attribute__((packed)); + /** * struct sev_issue_cmd - SEV ioctl parameters * diff --git a/linux-headers/linux/stddef.h b/linux-headers/linux/stddef.h index 9bb07083ac89..bf9749dd1422 100644 --- a/linux-headers/linux/stddef.h +++ b/linux-headers/linux/stddef.h @@ -27,8 +27,13 @@ union { \ struct { MEMBERS } ATTRS; \ struct TAG { MEMBERS } ATTRS NAME; \ - } + } ATTRS +#ifdef __cplusplus +/* sizeof(struct{}) is 1 in C++, not 0, can't use C version of the macro. 
*/ +#define __DECLARE_FLEX_ARRAY(T, member) \ + T member[0] +#else /** * __DECLARE_FLEX_ARRAY() - Declare a flexible array usable in a union * @@ -49,3 +54,5 @@ #ifndef __counted_by #define __counted_by(m) #endif + +#endif /* _LINUX_STDDEF_H */ diff --git a/linux-headers/linux/userfaultfd.h b/linux-headers/linux/userfaultfd.h index 59978fbaae33..4283de22d5b6 100644 --- a/linux-headers/linux/userfaultfd.h +++ b/linux-headers/linux/userfaultfd.h @@ -40,7 +40,9 @@ UFFD_FEATURE_EXACT_ADDRESS | \ UFFD_FEATURE_WP_HUGETLBFS_SHMEM | \ UFFD_FEATURE_WP_UNPOPULATED | \ - UFFD_FEATURE_POISON) + UFFD_FEATURE_POISON | \ + UFFD_FEATURE_WP_ASYNC | \ + UFFD_FEATURE_MOVE) #define UFFD_API_IOCTLS \ ((__u64)1 << _UFFDIO_REGISTER | \ (__u64)1 << _UFFDIO_UNREGISTER | \ @@ -49,6 +51,7 @@ ((__u64)1 << _UFFDIO_WAKE | \ (__u64)1 << _UFFDIO_COPY | \ (__u64)1 << _UFFDIO_ZEROPAGE | \ + (__u64)1 << _UFFDIO_MOVE | \ (__u64)1 << _UFFDIO_WRITEPROTECT | \ (__u64)1 << _UFFDIO_CONTINUE | \ (__u64)1 << _UFFDIO_POISON) @@ -72,6 +75,7 @@ #define _UFFDIO_WAKE (0x02) #define _UFFDIO_COPY (0x03) #define _UFFDIO_ZEROPAGE (0x04) +#define _UFFDIO_MOVE (0x05) #define _UFFDIO_WRITEPROTECT (0x06) #define _UFFDIO_CONTINUE (0x07) #define _UFFDIO_POISON (0x08) @@ -91,6 +95,8 @@ struct uffdio_copy) #define UFFDIO_ZEROPAGE _IOWR(UFFDIO, _UFFDIO_ZEROPAGE, \ struct uffdio_zeropage) +#define UFFDIO_MOVE _IOWR(UFFDIO, _UFFDIO_MOVE, \ + struct uffdio_move) #define UFFDIO_WRITEPROTECT _IOWR(UFFDIO, _UFFDIO_WRITEPROTECT, \ struct uffdio_writeprotect) #define UFFDIO_CONTINUE _IOWR(UFFDIO, _UFFDIO_CONTINUE, \ @@ -216,6 +222,14 @@ struct uffdio_api { * (i.e. empty ptes). This will be the default behavior for shmem * & hugetlbfs, so this flag only affects anonymous memory behavior * when userfault write-protection mode is registered. + * + * UFFD_FEATURE_WP_ASYNC indicates that userfaultfd write-protection + * asynchronous mode is supported in which the write fault is + * automatically resolved and write-protection is un-set. + * It implies UFFD_FEATURE_WP_UNPOPULATED. + * + * UFFD_FEATURE_MOVE indicates that the kernel supports moving an + * existing page contents from userspace. */ #define UFFD_FEATURE_PAGEFAULT_FLAG_WP (1<<0) #define UFFD_FEATURE_EVENT_FORK (1<<1) @@ -232,6 +246,8 @@ struct uffdio_api { #define UFFD_FEATURE_WP_HUGETLBFS_SHMEM (1<<12) #define UFFD_FEATURE_WP_UNPOPULATED (1<<13) #define UFFD_FEATURE_POISON (1<<14) +#define UFFD_FEATURE_WP_ASYNC (1<<15) +#define UFFD_FEATURE_MOVE (1<<16) __u64 features; __u64 ioctls; @@ -340,6 +356,24 @@ struct uffdio_poison { __s64 updated; }; +struct uffdio_move { + __u64 dst; + __u64 src; + __u64 len; + /* + * Especially if used to atomically remove memory from the + * address space the wake on the dst range is not needed. + */ +#define UFFDIO_MOVE_MODE_DONTWAKE ((__u64)1<<0) +#define UFFDIO_MOVE_MODE_ALLOW_SRC_HOLES ((__u64)1<<1) + __u64 mode; + /* + * "move" is written by the ioctl and must be at the end: the + * copy_from_user will not read the last 8 bytes. + */ + __s64 move; +}; + /* * Flags for the userfaultfd(2) system call itself. 
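For the UFFDIO_MOVE ioctl declared above, a minimal sketch of the expected calling convention, assuming uffd is a userfaultfd descriptor on which UFFD_FEATURE_MOVE was negotiated via UFFDIO_API (the helper name is illustrative, not part of this patch):

#include <string.h>
#include <sys/ioctl.h>
#include <linux/userfaultfd.h>

/* Move 'len' bytes of already-populated pages from 'src' to 'dst'
 * within a registered range, instead of copying them. */
static int uffd_move(int uffd, unsigned long dst, unsigned long src,
                     unsigned long len)
{
    struct uffdio_move mv;

    memset(&mv, 0, sizeof(mv));
    mv.dst = dst;
    mv.src = src;
    mv.len = len;
    mv.mode = 0;   /* or UFFDIO_MOVE_MODE_DONTWAKE / _ALLOW_SRC_HOLES */

    if (ioctl(uffd, UFFDIO_MOVE, &mv) < 0) {
        return -1;
    }
    /* 'move' is written back by the kernel with the bytes moved. */
    return mv.move == (__s64)len ? 0 : -1;
}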
*/ diff --git a/linux-headers/linux/vfio.h b/linux-headers/linux/vfio.h index acf72b4999fa..b4be37b2255c 100644 --- a/linux-headers/linux/vfio.h +++ b/linux-headers/linux/vfio.h @@ -277,8 +277,8 @@ struct vfio_region_info { #define VFIO_REGION_INFO_FLAG_CAPS (1 << 3) /* Info supports caps */ __u32 index; /* Region index */ __u32 cap_offset; /* Offset within info struct of first cap */ - __u64 size; /* Region size (bytes) */ - __u64 offset; /* Region offset from start of device fd */ + __aligned_u64 size; /* Region size (bytes) */ + __aligned_u64 offset; /* Region offset from start of device fd */ }; #define VFIO_DEVICE_GET_REGION_INFO _IO(VFIO_TYPE, VFIO_BASE + 8) @@ -294,8 +294,8 @@ struct vfio_region_info { #define VFIO_REGION_INFO_CAP_SPARSE_MMAP 1 struct vfio_region_sparse_mmap_area { - __u64 offset; /* Offset of mmap'able area within region */ - __u64 size; /* Size of mmap'able area */ + __aligned_u64 offset; /* Offset of mmap'able area within region */ + __aligned_u64 size; /* Size of mmap'able area */ }; struct vfio_region_info_cap_sparse_mmap { @@ -450,9 +450,9 @@ struct vfio_device_migration_info { VFIO_DEVICE_STATE_V1_RESUMING) __u32 reserved; - __u64 pending_bytes; - __u64 data_offset; - __u64 data_size; + __aligned_u64 pending_bytes; + __aligned_u64 data_offset; + __aligned_u64 data_size; }; /* @@ -476,7 +476,7 @@ struct vfio_device_migration_info { struct vfio_region_info_cap_nvlink2_ssatgt { struct vfio_info_cap_header header; - __u64 tgt; + __aligned_u64 tgt; }; /* @@ -816,7 +816,7 @@ struct vfio_device_gfx_plane_info { __u32 drm_plane_type; /* type of plane: DRM_PLANE_TYPE_* */ /* out */ __u32 drm_format; /* drm format of plane */ - __u64 drm_format_mod; /* tiled mode */ + __aligned_u64 drm_format_mod; /* tiled mode */ __u32 width; /* width of plane */ __u32 height; /* height of plane */ __u32 stride; /* stride of plane */ @@ -829,6 +829,7 @@ struct vfio_device_gfx_plane_info { __u32 region_index; /* region index */ __u32 dmabuf_id; /* dma-buf id */ }; + __u32 reserved; }; #define VFIO_DEVICE_QUERY_GFX_PLANE _IO(VFIO_TYPE, VFIO_BASE + 14) @@ -863,9 +864,10 @@ struct vfio_device_ioeventfd { #define VFIO_DEVICE_IOEVENTFD_32 (1 << 2) /* 4-byte write */ #define VFIO_DEVICE_IOEVENTFD_64 (1 << 3) /* 8-byte write */ #define VFIO_DEVICE_IOEVENTFD_SIZE_MASK (0xf) - __u64 offset; /* device fd offset of write */ - __u64 data; /* data to be written */ + __aligned_u64 offset; /* device fd offset of write */ + __aligned_u64 data; /* data to be written */ __s32 fd; /* -1 for de-assignment */ + __u32 reserved; }; #define VFIO_DEVICE_IOEVENTFD _IO(VFIO_TYPE, VFIO_BASE + 16) @@ -1217,6 +1219,7 @@ enum vfio_device_mig_state { VFIO_DEVICE_STATE_RUNNING_P2P = 5, VFIO_DEVICE_STATE_PRE_COPY = 6, VFIO_DEVICE_STATE_PRE_COPY_P2P = 7, + VFIO_DEVICE_STATE_NR, }; /** @@ -1434,6 +1437,27 @@ struct vfio_device_feature_mig_data_size { #define VFIO_DEVICE_FEATURE_MIG_DATA_SIZE 9 +/** + * Upon VFIO_DEVICE_FEATURE_SET, set or clear the BUS mastering for the device + * based on the operation specified in op flag. + * + * The functionality is incorporated for devices that needs bus master control, + * but the in-band device interface lacks the support. Consequently, it is not + * applicable to PCI devices, as bus master control for PCI devices is managed + * in-band through the configuration space. At present, this feature is supported + * only for CDX devices. + * When the device's BUS MASTER setting is configured as CLEAR, it will result in + * blocking all incoming DMA requests from the device. 
On the other hand, configuring + * the device's BUS MASTER setting as SET (enable) will grant the device the + * capability to perform DMA to the host memory. + */ +struct vfio_device_feature_bus_master { + __u32 op; +#define VFIO_DEVICE_FEATURE_CLEAR_MASTER 0 /* Clear Bus Master */ +#define VFIO_DEVICE_FEATURE_SET_MASTER 1 /* Set Bus Master */ +}; +#define VFIO_DEVICE_FEATURE_BUS_MASTER 10 + /* -------- API for Type1 VFIO IOMMU -------- */ /** @@ -1449,7 +1473,7 @@ struct vfio_iommu_type1_info { __u32 flags; #define VFIO_IOMMU_INFO_PGSIZES (1 << 0) /* supported page sizes info */ #define VFIO_IOMMU_INFO_CAPS (1 << 1) /* Info supports caps */ - __u64 iova_pgsizes; /* Bitmap of supported page sizes */ + __aligned_u64 iova_pgsizes; /* Bitmap of supported page sizes */ __u32 cap_offset; /* Offset within info struct of first cap */ __u32 pad; }; diff --git a/linux-headers/linux/vhost.h b/linux-headers/linux/vhost.h index f5c48b61ab62..649560c685f1 100644 --- a/linux-headers/linux/vhost.h +++ b/linux-headers/linux/vhost.h @@ -219,4 +219,12 @@ */ #define VHOST_VDPA_RESUME _IO(VHOST_VIRTIO, 0x7E) +/* Get the group for the descriptor table including driver & device areas + * of a virtqueue: read index, write group in num. + * The virtqueue index is stored in the index field of vhost_vring_state. + * The group ID of the descriptor table for this specific virtqueue + * is returned via num field of vhost_vring_state. + */ +#define VHOST_VDPA_GET_VRING_DESC_GROUP _IOWR(VHOST_VIRTIO, 0x7F, \ + struct vhost_vring_state) #endif diff --git a/meson.build b/meson.build index 6c77d9687ded..73f2c3fa687e 100644 --- a/meson.build +++ b/meson.build @@ -1079,6 +1079,12 @@ if targetos == 'linux' and (have_system or have_tools) method: 'pkg-config', required: get_option('libudev')) endif +igvm = not_found +if not get_option('igvm').auto() or have_system + igvm = dependency('igvm', + method: 'pkg-config', + required: get_option('igvm')) +endif mpathlibs = [libudev] mpathpersist = not_found @@ -2223,6 +2229,7 @@ config_host_data.set('CONFIG_CFI', get_option('cfi')) config_host_data.set('CONFIG_SELINUX', selinux.found()) config_host_data.set('CONFIG_XEN_BACKEND', xen.found()) config_host_data.set('CONFIG_LIBDW', libdw.found()) +config_host_data.set('CONFIG_IGVM', igvm.found()) if xen.found() # protect from xen.version() having less than three components xen_version = xen.version().split('.') + ['0', '0'] @@ -4373,6 +4380,7 @@ summary_info += {'seccomp support': seccomp} summary_info += {'GlusterFS support': glusterfs} summary_info += {'hv-balloon support': hv_balloon} summary_info += {'TPM support': have_tpm} +summary_info += {'IGVM support': igvm} summary_info += {'libssh support': libssh} summary_info += {'lzo support': lzo} summary_info += {'snappy support': snappy} diff --git a/meson_options.txt b/meson_options.txt index c9baeda63956..fd2e381a35e4 100644 --- a/meson_options.txt +++ b/meson_options.txt @@ -109,6 +109,8 @@ option('dbus_display', type: 'feature', value: 'auto', description: '-display dbus support') option('tpm', type : 'feature', value : 'auto', description: 'TPM support') +option('igvm', type: 'feature', value: 'auto', + description: 'Independent Guest Virtual Machine (IGVM) file support') # Do not enable it by default even for Mingw32, because it doesn't # work on Wine. 
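The new VHOST_VDPA_GET_VRING_DESC_GROUP ioctl above follows the read-index/write-num convention described in its comment. A minimal sketch, assuming fd is an open vhost-vdpa character device (the helper name is illustrative):

#include <sys/ioctl.h>
#include <linux/vhost.h>

/* Return the group ID of virtqueue 'qidx's descriptor table, or -1. */
static int vdpa_get_desc_group(int fd, unsigned int qidx)
{
    /* .index is read by the kernel; .num is written back with the group */
    struct vhost_vring_state state = { .index = qidx };

    if (ioctl(fd, VHOST_VDPA_GET_VRING_DESC_GROUP, &state) < 0) {
        return -1;
    }
    return state.num;
}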
diff --git a/qapi/misc-target.json b/qapi/misc-target.json index 88291453ba47..1b4bbbe1fa3f 100644 --- a/qapi/misc-target.json +++ b/qapi/misc-target.json @@ -47,6 +47,49 @@ 'send-update', 'receive-update' ], 'if': 'TARGET_I386' } +## +# @SevGuestType: +# +# An enumeration indicating the type of SEV guest being run. +# +# @sev: The guest is a legacy SEV or SEV-ES guest. +# @sev-snp: The guest is an SEV-SNP guest. +# +# Since: 6.2 +## +{ 'enum': 'SevGuestType', + 'data': [ 'sev', 'sev-snp' ], + 'if': 'TARGET_I386' } + +## +# @SevGuestInfo: +# +# Information specific to legacy SEV/SEV-ES guests. +# +# @policy: SEV policy value +# +# @handle: SEV firmware handle +# +# Since: 2.12 +## +{ 'struct': 'SevGuestInfo', + 'data': { 'policy': 'uint32', + 'handle': 'uint32' }, + 'if': 'TARGET_I386' } + +## +# @SevSnpGuestInfo: +# +# Information specific to SEV-SNP guests. +# +# @snp-policy: SEV-SNP policy value +# +# Since: 6.2 +## +{ 'struct': 'SevSnpGuestInfo', + 'data': { 'snp-policy': 'uint64' }, + 'if': 'TARGET_I386' } + ## # @SevInfo: # @@ -60,25 +103,25 @@ # # @build-id: SEV FW build id # -# @policy: SEV policy value -# # @state: SEV guest state # -# @handle: SEV firmware handle +# @sev-type: Type of SEV guest being run # # Since: 2.12 ## -{ 'struct': 'SevInfo', - 'data': { 'enabled': 'bool', - 'api-major': 'uint8', - 'api-minor' : 'uint8', - 'build-id' : 'uint8', - 'policy' : 'uint32', - 'state' : 'SevState', - 'handle' : 'uint32' - }, - 'if': 'TARGET_I386' -} +{ 'union': 'SevInfo', + 'base': { 'enabled': 'bool', + 'api-major': 'uint8', + 'api-minor' : 'uint8', + 'build-id' : 'uint8', + 'state' : 'SevState', + 'sev-type' : 'SevGuestType' }, + 'discriminator': 'sev-type', + 'data': { + 'sev': 'SevGuestInfo', + 'sev-snp': 'SevSnpGuestInfo' }, + 'if': 'TARGET_I386' } + ## # @query-sev: diff --git a/qapi/qom.json b/qapi/qom.json index c53ef978ff7e..6abdb57489a7 100644 --- a/qapi/qom.json +++ b/qapi/qom.json @@ -843,19 +843,23 @@ 'data': { '*filename': 'str' } } ## -# @SevGuestProperties: +# @ConfidentialGuestProperties: # -# Properties for sev-guest objects. +# Properties common to objects that are derivatives of confidential-guest-support. # -# @sev-device: SEV device to use (default: "/dev/sev") +# @igvm-file: IGVM file to use to configure guest (default: none) # -# @dh-cert-file: guest owners DH certificate (encoded with base64) -# -# @session-file: guest owners session parameters (encoded with base64) +# Since: 9.1 +## +{ 'struct': 'ConfidentialGuestProperties', + 'data': { '*igvm-file': 'str' } } + +## +# @SevCommonProperties: # -# @policy: SEV policy value (default: 0x1) +# Properties common to objects that are derivatives of sev-common. # -# @handle: SEV firmware handle (default: 0) +# @sev-device: SEV device to use (default: "/dev/sev") # # @cbitpos: C-bit location in page table entry (default: 0) # @@ -868,16 +872,93 @@ # # Since: 2.12 ## -{ 'struct': 'SevGuestProperties', +{ 'struct': 'SevCommonProperties', + 'base': 'ConfidentialGuestProperties', 'data': { '*sev-device': 'str', - '*dh-cert-file': 'str', - '*session-file': 'str', - '*policy': 'uint32', - '*handle': 'uint32', '*cbitpos': 'uint32', 'reduced-phys-bits': 'uint32', '*kernel-hashes': 'bool' } } +## +# @SevGuestProperties: +# +# Properties for sev-guest objects. 
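As a usage illustration only (the property values are placeholders, not schema defaults beyond policy=0x1), a guest built from these objects is configured along the lines of:

    qemu-system-x86_64 \
        -object sev-guest,id=sev0,policy=0x1,cbitpos=47,reduced-phys-bits=1 \
        -machine ...,memory-encryption=sev0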
+# +# @dh-cert-file: guest owners DH certificate (encoded with base64) +# +# @session-file: guest owners session parameters (encoded with base64) +# +# @policy: SEV policy value (default: 0x1) +# +# @handle: SEV firmware handle (default: 0) +# +# Since: 2.12 +## +{ 'struct': 'SevGuestProperties', + 'base': 'SevCommonProperties', + 'data': { '*dh-cert-file': 'str', + '*session-file': 'str', + '*policy': 'uint32', + '*handle': 'uint32' } } + +## +# @SevSnpGuestProperties: +# +# Properties for sev-snp-guest objects. Most of these are direct arguments +# for the KVM_SNP_* interfaces documented in the linux kernel source +# under Documentation/virt/kvm/amd-memory-encryption.rst, which are in +# turn closely coupled with the SNP_INIT/SNP_LAUNCH_* firmware commands +# documented in the SEV-SNP Firmware ABI Specification (Rev 0.9). +# +# More usage information is also available in the QEMU source tree under +# docs/amd-memory-encryption. +# +# @init-flags: as documented for the 'flags' parameter of the +# KVM_SNP_INIT KVM command (default: 0) +# +# @policy: the 'POLICY' parameter to the SNP_LAUNCH_START command, as +# defined in the SEV-SNP firmware ABI (default: 0x30000) +# +# @guest-visible-workarounds: 16-byte, base64-encoded blob to report +# hypervisor-defined workarounds, corresponding +# to the 'GOSVW' parameter of the +# SNP_LAUNCH_START command defined in the +# SEV-SNP firmware ABI (default: all-zero) +# +# @id-block: 96-byte, base64-encoded blob to provide the 'ID Block' +# structure for the SNP_LAUNCH_FINISH command defined in the +# SEV-SNP firmware ABI (default: all-zero) +# +# @id-auth: 4096-byte, base64-encoded blob to provide the 'ID Authentication +# Information Structure' for the SNP_LAUNCH_FINISH command defined +# in the SEV-SNP firmware ABI (default: all-zero) +# +# @auth-key-enabled: true if 'id-auth' blob contains the 'AUTHOR_KEY' field +# defined SEV-SNP firmware ABI (default: false) +# +# @host-data: 32-byte, base64-encoded, user-defined blob to provide to the +# guest, as documented for the 'HOST_DATA' parameter of the +# SNP_LAUNCH_FINISH command in the SEV-SNP firmware ABI +# (default: all-zero) +# +# @certs-path: path to certificate data that can be passed to guests via +# SNP Extended Guest Requests. File should be in the format +# described in the GHCB specification. 
(default: none) +# +# Since: 7.2 +## +{ 'struct': 'SevSnpGuestProperties', + 'base': 'SevCommonProperties', + 'data': { + '*init-flags': 'uint64', + '*policy': 'uint64', + '*guest-visible-workarounds': 'str', + '*id-block': 'str', + '*id-auth': 'str', + '*auth-key-enabled': 'bool', + '*host-data': 'str', + '*certs-path': 'str' } } + ## # @ThreadContextProperties: # @@ -954,6 +1035,7 @@ { 'name': 'secret_keyring', 'if': 'CONFIG_SECRET_KEYRING' }, 'sev-guest', + 'sev-snp-guest', 'thread-context', 's390-pv-guest', 'throttle-group', @@ -1022,6 +1104,7 @@ 'secret_keyring': { 'type': 'SecretKeyringProperties', 'if': 'CONFIG_SECRET_KEYRING' }, 'sev-guest': 'SevGuestProperties', + 'sev-snp-guest': 'SevSnpGuestProperties', 'thread-context': 'ThreadContextProperties', 'throttle-group': 'ThrottleGroupProperties', 'tls-creds-anon': 'TlsCredsAnonProperties', diff --git a/qemu-options.hx b/qemu-options.hx index 42fd09e4de96..6dbad2e5b953 100644 --- a/qemu-options.hx +++ b/qemu-options.hx @@ -5637,7 +5637,7 @@ SRST -object secret,id=sec0,keyid=secmaster0,format=base64,\\ data=$SECRET,iv=$(terminates = true; mr->rom_device = true; mr->destructor = memory_region_destructor_ram; - mr->ram_block = qemu_ram_alloc(size, 0, mr, &err); + if (kvm_has_restricted_memory()) { + mr->ram_block = qemu_ram_alloc(size, RAM_GUEST_MEMFD, mr, &err); + } else { + mr->ram_block = qemu_ram_alloc(size, 0, mr, &err); + } if (err) { mr->size = int128_zero(); object_unparent(OBJECT(mr)); @@ -1732,6 +1737,37 @@ void memory_region_init_rom_device_nomigrate(MemoryRegion *mr, } } +void memory_region_init_rom_device_private(MemoryRegion *mr, + Object *owner, + const MemoryRegionOps *ops, + void *opaque, + const char *name, + uint64_t size, + Error **errp) +{ +#if 0 + DeviceState *owner_dev; + Error *err = NULL; + + g_warning("creating ROM device with private memory."); + + memory_region_init_rom_device_nomigrate_private(mr, owner, ops, opaque, + name, size, &err); + if (err) { + error_propagate(errp, err); + return; + } + /* This will assert if owner is neither NULL nor a DeviceState. + * We only want the owner here for the purposes of defining a + * unique name for migration. TODO: Ideally we should implement + * a naming scheme for Objects which are not DeviceStates, in + * which case we can relax this restriction. 
+ */ + owner_dev = DEVICE(owner); + vmstate_register_ram(mr, owner_dev); +#endif +} + void memory_region_init_iommu(void *_iommu_mr, size_t instance_size, const char *mrtypename, @@ -1834,6 +1870,24 @@ bool memory_region_is_protected(MemoryRegion *mr) return mr->ram && (mr->ram_block->flags & RAM_PROTECTED); } +bool memory_region_has_guest_memfd(MemoryRegion *mr) +{ + return mr->ram_block && mr->ram_block->guest_memfd >= 0; +} + +bool memory_region_is_default_private(MemoryRegion *mr) +{ + return memory_region_has_guest_memfd(mr) && + (mr->ram_block->flags & RAM_DEFAULT_PRIVATE); +} + +void memory_region_set_default_private(MemoryRegion *mr) +{ + if (memory_region_has_guest_memfd(mr)) { + mr->ram_block->flags |= RAM_DEFAULT_PRIVATE; + } +} + uint8_t memory_region_get_dirty_log_mask(MemoryRegion *mr) { uint8_t mask = mr->dirty_log_mask; @@ -3586,6 +3640,41 @@ void memory_region_init_ram(MemoryRegion *mr, vmstate_register_ram(mr, owner_dev); } +void memory_region_init_ram_guest_memfd(MemoryRegion *mr, + Object *owner, + const char *name, + uint64_t size, + Error **errp) +{ + DeviceState *owner_dev; + Error *err = NULL; + + /* + * TODO: drop this whole function and just have memory_region_init_ram() + * handle this case automatically. + */ + if (!kvm_has_restricted_memory()) { + return memory_region_init_ram(mr, owner, name, size, errp); + } + + memory_region_init_ram_flags_nomigrate(mr, owner, name, size, + RAM_GUEST_MEMFD, &err); + if (err) { + error_propagate(errp, err); + return; + } + memory_region_set_default_private(mr); + + /* This will assert if owner is neither NULL nor a DeviceState. + * We only want the owner here for the purposes of defining a + * unique name for migration. TODO: Ideally we should implement + * a naming scheme for Objects which are not DeviceStates, in + * which case we can relax this restriction. + */ + owner_dev = DEVICE(owner); + vmstate_register_ram(mr, owner_dev); +} + void memory_region_init_rom(MemoryRegion *mr, Object *owner, const char *name, diff --git a/system/physmem.c b/system/physmem.c index a63853a7bc9d..83ea4613d72b 100644 --- a/system/physmem.c +++ b/system/physmem.c @@ -1803,6 +1803,42 @@ static void dirty_memory_extend(ram_addr_t old_ram_size, } } +#ifdef CONFIG_KVM +#define HPAGE_PMD_SIZE_PATH "/sys/kernel/mm/transparent_hugepage/hpage_pmd_size" +#define DEFAULT_PMD_SIZE (1ul << 21) + +#if 0 +static uint32_t get_thp_size(void) +{ + gchar *content = NULL; + const char *endptr; + static uint64_t thp_size = 0; + uint64_t tmp; + + if (thp_size != 0) { + return thp_size; + } + + if (g_file_get_contents(HPAGE_PMD_SIZE_PATH, &content, NULL, NULL) && + !qemu_strtou64(content, &endptr, 0, &tmp) && + (!endptr || *endptr == '\n')) { + /* Sanity-check the value and fallback to something reasonable. */ + if (!tmp || !is_power_of_2(tmp)) { + warn_report("Read unsupported THP size: %" PRIx64, tmp); + } else { + thp_size = tmp; + } + } + + if (!thp_size) { + thp_size = DEFAULT_PMD_SIZE; + } + + return thp_size; +} +#endif +#endif + static void ram_block_add(RAMBlock *new_block, Error **errp) { const bool noreserve = qemu_ram_is_noreserve(new_block); @@ -1841,6 +1877,22 @@ static void ram_block_add(RAMBlock *new_block, Error **errp) } } +#ifdef CONFIG_KVM + if (kvm_enabled() && new_block->flags & RAM_GUEST_MEMFD && + new_block->guest_memfd < 0) { +#if 0 + uint64_t flags = QEMU_IS_ALIGNED(new_block->max_length, get_thp_size()) ? 
+ KVM_GUEST_MEMFD_ALLOW_HUGEPAGE : 0; +#endif + new_block->guest_memfd = kvm_create_guest_memfd(new_block->max_length, + 0, errp); + if (new_block->guest_memfd < 0) { + qemu_mutex_unlock_ramlist(); + return; + } + } +#endif + new_ram_size = MAX(old_ram_size, (new_block->offset + new_block->max_length) >> TARGET_PAGE_BITS); if (new_ram_size > old_ram_size) { @@ -1903,7 +1955,7 @@ RAMBlock *qemu_ram_alloc_from_fd(ram_addr_t size, MemoryRegion *mr, /* Just support these ram flags by now. */ assert((ram_flags & ~(RAM_SHARED | RAM_PMEM | RAM_NORESERVE | RAM_PROTECTED | RAM_NAMED_FILE | RAM_READONLY | - RAM_READONLY_FD)) == 0); + RAM_READONLY_FD | RAM_GUEST_MEMFD)) == 0); if (xen_enabled()) { error_setg(errp, "-mem-path not supported with Xen"); @@ -1938,6 +1990,7 @@ RAMBlock *qemu_ram_alloc_from_fd(ram_addr_t size, MemoryRegion *mr, new_block->used_length = size; new_block->max_length = size; new_block->flags = ram_flags; + new_block->guest_memfd = -1; new_block->host = file_ram_alloc(new_block, size, fd, !file_size, offset, errp); if (!new_block->host) { @@ -2016,7 +2069,7 @@ RAMBlock *qemu_ram_alloc_internal(ram_addr_t size, ram_addr_t max_size, Error *local_err = NULL; assert((ram_flags & ~(RAM_SHARED | RAM_RESIZEABLE | RAM_PREALLOC | - RAM_NORESERVE)) == 0); + RAM_NORESERVE| RAM_GUEST_MEMFD)) == 0); assert(!host ^ (ram_flags & RAM_PREALLOC)); size = HOST_PAGE_ALIGN(size); @@ -2028,6 +2081,7 @@ RAMBlock *qemu_ram_alloc_internal(ram_addr_t size, ram_addr_t max_size, new_block->max_length = max_size; assert(max_size >= size); new_block->fd = -1; + new_block->guest_memfd = -1; new_block->page_size = qemu_real_host_page_size(); new_block->host = host; new_block->flags = ram_flags; @@ -2050,7 +2104,7 @@ RAMBlock *qemu_ram_alloc_from_ptr(ram_addr_t size, void *host, RAMBlock *qemu_ram_alloc(ram_addr_t size, uint32_t ram_flags, MemoryRegion *mr, Error **errp) { - assert((ram_flags & ~(RAM_SHARED | RAM_NORESERVE)) == 0); + assert((ram_flags & ~(RAM_SHARED | RAM_NORESERVE | RAM_GUEST_MEMFD)) == 0); return qemu_ram_alloc_internal(size, size, NULL, NULL, ram_flags, mr, errp); } @@ -2078,6 +2132,11 @@ static void reclaim_ramblock(RAMBlock *block) } else { qemu_anon_ram_free(block->host, block->max_length); } + + if (block->guest_memfd >= 0) { + close(block->guest_memfd); + } + g_free(block); } @@ -3499,17 +3558,16 @@ int ram_block_discard_range(RAMBlock *rb, uint64_t start, size_t length) uint8_t *host_startaddr = rb->host + start; - if (!QEMU_PTR_IS_ALIGNED(host_startaddr, rb->page_size)) { - error_report("ram_block_discard_range: Unaligned start address: %p", - host_startaddr); + if (!QEMU_PTR_IS_ALIGNED(host_startaddr, qemu_host_page_size)) { + error_report("%s: Unaligned start address: %p", + __func__, host_startaddr); goto err; } if ((start + length) <= rb->max_length) { bool need_madvise, need_fallocate; if (!QEMU_IS_ALIGNED(length, rb->page_size)) { - error_report("ram_block_discard_range: Unaligned length: %zx", - length); + error_report("%s: Unaligned length: %zx", __func__, length); goto err; } @@ -3533,8 +3591,8 @@ int ram_block_discard_range(RAMBlock *rb, uint64_t start, size_t length) * proper error message. */ if (rb->flags & RAM_READONLY_FD) { - error_report("ram_block_discard_range: Discarding RAM" - " with readonly files is not supported"); + error_report("%s: Discarding RAM with readonly files is not" + " supported", __func__); goto err; } @@ -3549,27 +3607,26 @@ int ram_block_discard_range(RAMBlock *rb, uint64_t start, size_t length) * file. 
*/ if (!qemu_ram_is_shared(rb)) { - warn_report_once("ram_block_discard_range: Discarding RAM" + warn_report_once("%s: Discarding RAM" " in private file mappings is possibly" " dangerous, because it will modify the" " underlying file and will affect other" - " users of the file"); + " users of the file", __func__); } ret = fallocate(rb->fd, FALLOC_FL_PUNCH_HOLE | FALLOC_FL_KEEP_SIZE, start, length); if (ret) { ret = -errno; - error_report("ram_block_discard_range: Failed to fallocate " - "%s:%" PRIx64 " +%zx (%d)", - rb->idstr, start, length, ret); + error_report("%s: Failed to fallocate %s:%" PRIx64 " +%zx (%d)", + __func__, rb->idstr, start, length, ret); goto err; } #else ret = -ENOSYS; - error_report("ram_block_discard_range: fallocate not available/file" + error_report("%s: fallocate not available/file" "%s:%" PRIx64 " +%zx (%d)", - rb->idstr, start, length, ret); + __func__, rb->idstr, start, length, ret); goto err; #endif } @@ -3587,31 +3644,52 @@ int ram_block_discard_range(RAMBlock *rb, uint64_t start, size_t length) } if (ret) { ret = -errno; - error_report("ram_block_discard_range: Failed to discard range " + error_report("%s: Failed to discard range " "%s:%" PRIx64 " +%zx (%d)", - rb->idstr, start, length, ret); + __func__, rb->idstr, start, length, ret); goto err; } #else ret = -ENOSYS; - error_report("ram_block_discard_range: MADVISE not available" - "%s:%" PRIx64 " +%zx (%d)", - rb->idstr, start, length, ret); + error_report("%s: MADVISE not available %s:%" PRIx64 " +%zx (%d)", + __func__, rb->idstr, start, length, ret); goto err; #endif } trace_ram_block_discard_range(rb->idstr, host_startaddr, length, need_madvise, need_fallocate, ret); } else { - error_report("ram_block_discard_range: Overrun block '%s' (%" PRIu64 - "/%zx/" RAM_ADDR_FMT")", - rb->idstr, start, length, rb->max_length); + error_report("%s: Overrun block '%s' (%" PRIu64 "/%zx/" RAM_ADDR_FMT")", + __func__, rb->idstr, start, length, rb->max_length); } err: return ret; } +static int ram_block_discard_guest_memfd_range(RAMBlock *rb, uint64_t start, + size_t length) +{ + int ret = -1; + +#ifdef CONFIG_FALLOCATE_PUNCH_HOLE + ret = fallocate(rb->guest_memfd, FALLOC_FL_PUNCH_HOLE | FALLOC_FL_KEEP_SIZE, + start, length); + + if (ret) { + ret = -errno; + error_report("%s: Failed to fallocate %s:%" PRIx64 " +%zx (%d)", + __func__, rb->idstr, start, length, ret); + } +#else + ret = -ENOSYS; + error_report("%s: fallocate not available %s:%" PRIx64 " +%zx (%d)", + __func__, rb->idstr, start, length, ret); +#endif + + return ret; +} + bool ramblock_is_pmem(RAMBlock *rb) { return rb->flags & RAM_PMEM; @@ -3799,3 +3877,30 @@ bool ram_block_discard_is_required(void) return qatomic_read(&ram_block_discard_required_cnt) || qatomic_read(&ram_block_coordinated_discard_required_cnt); } + +int ram_block_convert_range(RAMBlock *rb, uint64_t start, size_t length, + bool shared_to_private) +{ + if (!rb || rb->guest_memfd < 0) { + return -1; + } + + if (!QEMU_PTR_IS_ALIGNED(start, qemu_host_page_size) || + !QEMU_PTR_IS_ALIGNED(length, qemu_host_page_size)) { + return -1; + } + + if (!length) { + return -1; + } + + if (start + length > rb->max_length) { + return -1; + } + + if (shared_to_private) { + return ram_block_discard_range(rb, start, length); + } else { + return ram_block_discard_guest_memfd_range(rb, start, length); + } +} diff --git a/target/i386/cpu.c b/target/i386/cpu.c index cd16cb893daf..b17412da1d16 100644 --- a/target/i386/cpu.c +++ b/target/i386/cpu.c @@ -6660,6 +6660,7 @@ void cpu_x86_cpuid(CPUX86State *env, uint32_t 
index, uint32_t count, if (sev_enabled()) { *eax = 0x2; *eax |= sev_es_enabled() ? 0x8 : 0; + *eax |= sev_snp_enabled() ? 0x10 : 0; *ebx = sev_get_cbit_position() & 0x3f; /* EBX[5:0] */ *ebx |= (sev_get_reduced_phys_bits() & 0x3f) << 6; /* EBX[11:6] */ } diff --git a/target/i386/cpu.h b/target/i386/cpu.h index ef987f344cff..340e31cdc801 100644 --- a/target/i386/cpu.h +++ b/target/i386/cpu.h @@ -2129,7 +2129,14 @@ static inline void cpu_x86_load_seg_cache(CPUX86State *env, SegmentCache *sc; unsigned int new_hflags; - sc = &env->segs[seg_reg]; + if (seg_reg == R_LDTR) { + sc = &env->ldt; + } else if (seg_reg == R_TR) { + sc = &env->tr; + } else { + sc = &env->segs[seg_reg]; + } + sc->selector = selector; sc->base = base; sc->limit = limit; diff --git a/target/i386/kvm/kvm.c b/target/i386/kvm/kvm.c index 4ce80555b45c..99f5ebded4af 100644 --- a/target/i386/kvm/kvm.c +++ b/target/i386/kvm/kvm.c @@ -161,6 +161,36 @@ static KVMMSRHandlers msr_handlers[KVM_MSR_FILTER_MAX_RANGES]; static RateLimit bus_lock_ratelimit_ctrl; static int kvm_get_one_msr(X86CPU *cpu, int index, uint64_t *value); +static const char* vm_type_name[] = { + [KVM_X86_DEFAULT_VM] = "default", + [KVM_X86_SW_PROTECTED_VM] = "sw-protected-vm", + [KVM_X86_SNP_VM] = "snp" +}; + +int kvm_get_vm_type(MachineState *ms, const char *vm_type) +{ + int kvm_type = KVM_X86_DEFAULT_VM; + + if (ms->cgs && object_dynamic_cast(OBJECT(ms->cgs), TYPE_SEV_SNP_GUEST)) { + kvm_type = KVM_X86_SNP_VM; + } + + /* + * old KVM doesn't support KVM_CAP_VM_TYPES and KVM_X86_DEFAULT_VM + * is always supported + */ + if (kvm_type == KVM_X86_DEFAULT_VM) { + return kvm_type; + } + + if (!(kvm_check_extension(KVM_STATE(ms->accelerator), KVM_CAP_VM_TYPES) & BIT(kvm_type))) { + error_report("vm-type %s not supported by KVM", vm_type_name[kvm_type]); + exit(1); + } + + return kvm_type; +} + bool kvm_has_smm(void) { return kvm_vm_check_extension(kvm_state, KVM_CAP_X86_SMM); @@ -1914,6 +1944,7 @@ int kvm_arch_init_vcpu(CPUState *cs) } case 0x1f: if (env->nr_dies < 2) { + cpuid_i--; break; } /* fallthrough */ @@ -1925,10 +1956,6 @@ int kvm_arch_init_vcpu(CPUState *cs) break; } - if (i == 0x1f && j == 64) { - break; - } - c->function = i; c->flags = KVM_CPUID_FLAG_SIGNIFCANT_INDEX; c->index = j; @@ -2514,6 +2541,15 @@ int kvm_arch_get_default_type(MachineState *ms) return 0; } +static int kvm_confidential_guest_init(MachineState *ms, Error **errp) +{ + if (object_dynamic_cast(OBJECT(ms->cgs), TYPE_SEV_COMMON)) { + return sev_kvm_init(ms, errp); + } + + return 0; +} + int kvm_arch_init(MachineState *ms, KVMState *s) { uint64_t identity_base = 0xfffbc000; @@ -2534,7 +2570,7 @@ int kvm_arch_init(MachineState *ms, KVMState *s) * mechanisms are supported in future (e.g. TDX), they'll need * their own initialization either here or elsewhere. 
*/ - ret = sev_kvm_init(ms->cgs, &local_err); + ret = kvm_confidential_guest_init(ms, &local_err); if (ret < 0) { error_report_err(local_err); return ret; @@ -5358,6 +5394,9 @@ int kvm_arch_handle_exit(CPUState *cs, struct kvm_run *run) ret = kvm_xen_handle_exit(cpu, &run->xen); break; #endif + case KVM_EXIT_VMGEXIT: + ret = kvm_handle_vmgexit(run); + break; default: fprintf(stderr, "KVM: unknown exit reason %d\n", run->exit_reason); ret = -1; diff --git a/target/i386/kvm/kvm_i386.h b/target/i386/kvm/kvm_i386.h index 30fedcffea3e..55fb25fa8e2e 100644 --- a/target/i386/kvm/kvm_i386.h +++ b/target/i386/kvm/kvm_i386.h @@ -37,6 +37,7 @@ bool kvm_hv_vpindex_settable(void); bool kvm_enable_sgx_provisioning(KVMState *s); bool kvm_hyperv_expand_features(X86CPU *cpu, Error **errp); +int kvm_get_vm_type(MachineState *ms, const char *vm_type); void kvm_arch_reset_vcpu(X86CPU *cs); void kvm_arch_after_reset_vcpu(X86CPU *cpu); void kvm_arch_do_init_vcpu(X86CPU *cs); diff --git a/target/i386/kvm/sev-stub.c b/target/i386/kvm/sev-stub.c index 1be5341e8a6a..633ef24a69f2 100644 --- a/target/i386/kvm/sev-stub.c +++ b/target/i386/kvm/sev-stub.c @@ -14,7 +14,7 @@ #include "qemu/osdep.h" #include "sev.h" -int sev_kvm_init(ConfidentialGuestSupport *cgs, Error **errp) +int sev_kvm_init(MachineState *ms, Error **errp) { /* If we get here, cgs must be some non-SEV thing */ return 0; diff --git a/target/i386/sev-sysemu-stub.c b/target/i386/sev-sysemu-stub.c index 96e1c15cc3fa..6af643e3a12f 100644 --- a/target/i386/sev-sysemu-stub.c +++ b/target/i386/sev-sysemu-stub.c @@ -42,7 +42,7 @@ void qmp_sev_inject_launch_secret(const char *packet_header, const char *secret, error_setg(errp, "SEV is not available in this QEMU"); } -int sev_encrypt_flash(uint8_t *ptr, uint64_t len, Error **errp) +int sev_encrypt_flash(hwaddr gpa, uint8_t *ptr, uint64_t len, Error **errp) { g_assert_not_reached(); } diff --git a/target/i386/sev.c b/target/i386/sev.c index 9a7124668258..9e5a9f753c89 100644 --- a/target/i386/sev.c +++ b/target/i386/sev.c @@ -37,11 +37,55 @@ #include "qapi/qapi-commands-misc-target.h" #include "exec/confidential-guest-support.h" #include "hw/i386/pc.h" +#include "hw/i386/e820_memory_layout.h" #include "exec/address-spaces.h" +#include "qemu/queue.h" -#define TYPE_SEV_GUEST "sev-guest" +/* hard code sha256 digest size */ +#define HASH_SIZE 32 + +/* Convert between SEV-ES VMSA and SegmentCache flags/attributes */ +#define FLAGS_VMSA_TO_SEGCACHE(flags) \ + ((((flags) & 0xff00) << 12) | (((flags) & 0xff) << 8)) +#define FLAGS_SEGCACHE_TO_VMSA(flags) \ + ((((flags) & 0xff00) >> 8) | (((flags) & 0xf00000) >> 12)) + +typedef struct QEMU_PACKED SevHashTableEntry { + QemuUUID guid; + uint16_t len; + uint8_t hash[HASH_SIZE]; +} SevHashTableEntry; + +typedef struct QEMU_PACKED SevHashTable { + QemuUUID guid; + uint16_t len; + SevHashTableEntry cmdline; + SevHashTableEntry initrd; + SevHashTableEntry kernel; +} SevHashTable; + +/* + * Data encrypted by sev_encrypt_flash() must be padded to a multiple of + * 16 bytes. 
+ */ +typedef struct QEMU_PACKED PaddedSevHashTable { + SevHashTable ht; + uint8_t padding[ROUND_UP(sizeof(SevHashTable), 16) - sizeof(SevHashTable)]; +} PaddedSevHashTable; + +QEMU_BUILD_BUG_ON(sizeof(PaddedSevHashTable) % 16 != 0); + +OBJECT_DECLARE_SIMPLE_TYPE(SevCommonState, SEV_COMMON) OBJECT_DECLARE_SIMPLE_TYPE(SevGuestState, SEV_GUEST) +OBJECT_DECLARE_SIMPLE_TYPE(SevSnpGuestState, SEV_SNP_GUEST) + +typedef struct SevLaunchVmsa { + QTAILQ_ENTRY(SevLaunchVmsa) next; + uint16_t cpu_index; + uint64_t gpa; + struct sev_es_save_area vmsa; +} SevLaunchVmsa; /** * SevGuestState: @@ -53,34 +97,68 @@ OBJECT_DECLARE_SIMPLE_TYPE(SevGuestState, SEV_GUEST) * -object sev-guest,id=sev0 \ * -machine ...,memory-encryption=sev0 */ -struct SevGuestState { +struct SevCommonState { ConfidentialGuestSupport parent_obj; /* configuration parameters */ char *sev_device; - uint32_t policy; - char *dh_cert_file; - char *session_file; uint32_t cbitpos; uint32_t reduced_phys_bits; bool kernel_hashes; /* runtime state */ - uint32_t handle; uint8_t api_major; uint8_t api_minor; uint8_t build_id; int sev_fd; SevState state; + + QTAILQ_HEAD(, SevLaunchVmsa) launch_vmsa; +}; + +struct SevGuestState { + SevCommonState sev_common; gchar *measurement; - uint32_t reset_cs; - uint32_t reset_ip; - bool reset_data_valid; + /* configuration parameters */ + uint32_t handle; + uint32_t policy; + char *dh_cert_file; + char *session_file; +}; + +struct SevSnpGuestState { + SevCommonState sev_common; + + /* configuration parameters */ + char *guest_visible_workarounds; + char *id_block; + char *id_auth; + char *host_data; + char *certs_path; + + struct kvm_snp_init kvm_init_conf; + struct kvm_sev_snp_launch_start kvm_start_conf; + struct kvm_sev_snp_launch_finish kvm_finish_conf; + + uint32_t kernel_hashes_offset; + PaddedSevHashTable *kernel_hashes_data; }; #define DEFAULT_GUEST_POLICY 0x1 /* disable debug */ #define DEFAULT_SEV_DEVICE "/dev/sev" +#define DEFAULT_SEV_SNP_POLICY 0x30000 + +typedef struct SevLaunchUpdateData { + QTAILQ_ENTRY(SevLaunchUpdateData) next; + + hwaddr gpa; + void *hva; + uint64_t len; + int type; +} SevLaunchUpdateData; + +static QTAILQ_HEAD(, SevLaunchUpdateData) launch_update; #define SEV_INFO_BLOCK_GUID "00f771de-1a7e-4fcb-890e-68c77e2fb44e" typedef struct __attribute__((__packed__)) SevInfoBlock { @@ -96,35 +174,6 @@ typedef struct QEMU_PACKED SevHashTableDescriptor { uint32_t size; } SevHashTableDescriptor; -/* hard code sha256 digest size */ -#define HASH_SIZE 32 - -typedef struct QEMU_PACKED SevHashTableEntry { - QemuUUID guid; - uint16_t len; - uint8_t hash[HASH_SIZE]; -} SevHashTableEntry; - -typedef struct QEMU_PACKED SevHashTable { - QemuUUID guid; - uint16_t len; - SevHashTableEntry cmdline; - SevHashTableEntry initrd; - SevHashTableEntry kernel; -} SevHashTable; - -/* - * Data encrypted by sev_encrypt_flash() must be padded to a multiple of - * 16 bytes. 
- */ -typedef struct QEMU_PACKED PaddedSevHashTable { - SevHashTable ht; - uint8_t padding[ROUND_UP(sizeof(SevHashTable), 16) - sizeof(SevHashTable)]; -} PaddedSevHashTable; - -QEMU_BUILD_BUG_ON(sizeof(PaddedSevHashTable) % 16 != 0); - -static SevGuestState *sev_guest; static Error *sev_mig_blocker; static const char *const sev_fw_errlist[] = { @@ -157,6 +206,43 @@ static const char *const sev_fw_errlist[] = { #define SEV_FW_MAX_ERROR ARRAY_SIZE(sev_fw_errlist) +/* doesn't expose this, so re-use the max from kvm.c */ +#define KVM_MAX_CPUID_ENTRIES 100 + +typedef struct KvmCpuidInfo { + struct kvm_cpuid2 cpuid; + struct kvm_cpuid_entry2 entries[KVM_MAX_CPUID_ENTRIES]; +} KvmCpuidInfo; + +#define SNP_CPUID_FUNCTION_MAXCOUNT 64 +#define SNP_CPUID_FUNCTION_UNKNOWN 0xFFFFFFFF + +typedef struct { + uint32_t eax_in; + uint32_t ecx_in; + uint64_t xcr0_in; + uint64_t xss_in; + uint32_t eax; + uint32_t ebx; + uint32_t ecx; + uint32_t edx; + uint64_t reserved; +} __attribute__((packed)) SnpCpuidFunc; + +typedef struct { + uint32_t count; + uint32_t reserved1; + uint64_t reserved2; + SnpCpuidFunc entries[SNP_CPUID_FUNCTION_MAXCOUNT]; +} __attribute__((packed)) SnpCpuidInfo; + +static int sev_launch_update_data(SevGuestState *sev_guest, uint8_t *addr, + uint64_t len); +static int +snp_launch_update_data(uint64_t gpa, void *hva, uint32_t len, int type); +static int +snp_launch_update_cpuid(uint32_t cpuid_addr, void *hva, uint32_t cpuid_len); + static int sev_ioctl(int fd, int cmd, void *data, int *error) { @@ -205,21 +291,21 @@ fw_error_to_str(int code) } static bool -sev_check_state(const SevGuestState *sev, SevState state) +sev_check_state(const SevCommonState *sev_common, SevState state) { - assert(sev); - return sev->state == state ? true : false; + assert(sev_common); + return sev_common->state == state ? true : false; } static void -sev_set_guest_state(SevGuestState *sev, SevState new_state) +sev_set_guest_state(SevCommonState *sev_common, SevState new_state) { assert(new_state < SEV_STATE__MAX); - assert(sev); + assert(sev_common); - trace_kvm_sev_change_state(SevState_str(sev->state), + trace_kvm_sev_change_state(SevState_str(sev_common->state), SevState_str(new_state)); - sev->state = new_state; + sev_common->state = new_state; } static void @@ -281,173 +367,904 @@ sev_ram_block_removed(RAMBlockNotifier *n, void *host, size_t size, } } -static struct RAMBlockNotifier sev_ram_notifier = { - .ram_block_added = sev_ram_block_added, - .ram_block_removed = sev_ram_block_removed, -}; +static struct RAMBlockNotifier sev_ram_notifier = { + .ram_block_added = sev_ram_block_added, + .ram_block_removed = sev_ram_block_removed, +}; + +static int cgs_check_support(ConfidentialGuestPlatformType platform, + uint16_t platform_version, uint8_t highest_vtl, + uint64_t shared_gpa_boundary) +{ + return (((platform == CGS_PLATFORM_SEV_SNP) && sev_snp_enabled()) || + ((platform == CGS_PLATFORM_SEV_ES) && sev_es_enabled()) || + ((platform == CGS_PLATFORM_SEV) && sev_enabled())) ? 
1 : 0;
+}
+
+static void sev_apply_cpu_context(CPUState *cpu)
+{
+    SevCommonState *sev_common = SEV_COMMON(MACHINE(qdev_get_machine())->cgs);
+    X86CPU *x86;
+    CPUX86State *env;
+    struct SevLaunchVmsa *launch_vmsa;
+
+    /* See if an initial VMSA has been provided for this CPU */
+    QTAILQ_FOREACH(launch_vmsa, &sev_common->launch_vmsa, next)
+    {
+        if (cpu->cpu_index == launch_vmsa->cpu_index) {
+            x86 = X86_CPU(cpu);
+            env = &x86->env;
+
+            /*
+             * Ideally we would provide the VMSA directly to kvm which would
+             * ensure that the resulting initial VMSA measurement which is
+             * calculated during KVM_SEV_LAUNCH_UPDATE_VMSA is calculated from
+             * exactly what we provide here. Currently this is not possible so
+             * we need to copy the parts of the VMSA structure that we currently
+             * support into the CPU state.
+             */
+            cpu_load_efer(env, launch_vmsa->vmsa.efer);
+            cpu_x86_update_cr4(env, launch_vmsa->vmsa.cr4);
+            cpu_x86_update_cr0(env, launch_vmsa->vmsa.cr0);
+            cpu_x86_update_cr3(env, launch_vmsa->vmsa.cr3);
+
+            cpu_x86_load_seg_cache(
+                env, R_CS, launch_vmsa->vmsa.cs.selector,
+                launch_vmsa->vmsa.cs.base, launch_vmsa->vmsa.cs.limit,
+                FLAGS_VMSA_TO_SEGCACHE(launch_vmsa->vmsa.cs.attrib));
+            cpu_x86_load_seg_cache(
+                env, R_DS, launch_vmsa->vmsa.ds.selector,
+                launch_vmsa->vmsa.ds.base, launch_vmsa->vmsa.ds.limit,
+                FLAGS_VMSA_TO_SEGCACHE(launch_vmsa->vmsa.ds.attrib));
+            cpu_x86_load_seg_cache(
+                env, R_ES, launch_vmsa->vmsa.es.selector,
+                launch_vmsa->vmsa.es.base, launch_vmsa->vmsa.es.limit,
+                FLAGS_VMSA_TO_SEGCACHE(launch_vmsa->vmsa.es.attrib));
+            cpu_x86_load_seg_cache(
+                env, R_FS, launch_vmsa->vmsa.fs.selector,
+                launch_vmsa->vmsa.fs.base, launch_vmsa->vmsa.fs.limit,
+                FLAGS_VMSA_TO_SEGCACHE(launch_vmsa->vmsa.fs.attrib));
+            cpu_x86_load_seg_cache(
+                env, R_GS, launch_vmsa->vmsa.gs.selector,
+                launch_vmsa->vmsa.gs.base, launch_vmsa->vmsa.gs.limit,
+                FLAGS_VMSA_TO_SEGCACHE(launch_vmsa->vmsa.gs.attrib));
+            cpu_x86_load_seg_cache(
+                env, R_SS, launch_vmsa->vmsa.ss.selector,
+                launch_vmsa->vmsa.ss.base, launch_vmsa->vmsa.ss.limit,
+                FLAGS_VMSA_TO_SEGCACHE(launch_vmsa->vmsa.ss.attrib));
+
+            env->gdt.base = launch_vmsa->vmsa.gdtr.base;
+            env->gdt.limit = launch_vmsa->vmsa.gdtr.limit;
+            env->gdt.flags = FLAGS_VMSA_TO_SEGCACHE(launch_vmsa->vmsa.gdtr.attrib);
+            env->idt.base = launch_vmsa->vmsa.idtr.base;
+            env->idt.limit = launch_vmsa->vmsa.idtr.limit;
+            env->idt.flags = FLAGS_VMSA_TO_SEGCACHE(launch_vmsa->vmsa.idtr.attrib);
+
+            cpu_x86_load_seg_cache(
+                env, R_LDTR, launch_vmsa->vmsa.ldtr.selector,
+                launch_vmsa->vmsa.ldtr.base, launch_vmsa->vmsa.ldtr.limit,
+                FLAGS_VMSA_TO_SEGCACHE(launch_vmsa->vmsa.ldtr.attrib));
+            cpu_x86_load_seg_cache(
+                env, R_TR, launch_vmsa->vmsa.tr.selector,
+                launch_vmsa->vmsa.tr.base, launch_vmsa->vmsa.tr.limit,
+                FLAGS_VMSA_TO_SEGCACHE(launch_vmsa->vmsa.tr.attrib));
+
+            env->dr[6] = launch_vmsa->vmsa.dr6;
+            env->dr[7] = launch_vmsa->vmsa.dr7;
+
+            env->regs[R_EAX] = launch_vmsa->vmsa.rax;
+            env->regs[R_ECX] = launch_vmsa->vmsa.rcx;
+            env->regs[R_EDX] = launch_vmsa->vmsa.rdx;
+            env->regs[R_EBX] = launch_vmsa->vmsa.rbx;
+            env->regs[R_ESP] = launch_vmsa->vmsa.rsp;
+            env->regs[R_EBP] = launch_vmsa->vmsa.rbp;
+            env->regs[R_ESI] = launch_vmsa->vmsa.rsi;
+            env->regs[R_EDI] = launch_vmsa->vmsa.rdi;
+#ifdef TARGET_X86_64
+            env->regs[R_R8] = launch_vmsa->vmsa.r8;
+            env->regs[R_R9] = launch_vmsa->vmsa.r9;
+            env->regs[R_R10] = launch_vmsa->vmsa.r10;
+            env->regs[R_R11] = launch_vmsa->vmsa.r11;
+            env->regs[R_R12] = launch_vmsa->vmsa.r12;
+            env->regs[R_R13] = launch_vmsa->vmsa.r13;
+
+
+static int check_vmsa_supported(const struct sev_es_save_area *vmsa)
+{
+    struct sev_es_save_area vmsa_check;
+    size_t i;
+
+    /*
+     * Clear all supported fields so we can then check the entire structure
+     * is zero.
+     */
+    memcpy(&vmsa_check, vmsa, sizeof(struct sev_es_save_area));
+    memset(&vmsa_check.es, 0, sizeof(vmsa_check.es));
+    memset(&vmsa_check.cs, 0, sizeof(vmsa_check.cs));
+    memset(&vmsa_check.ss, 0, sizeof(vmsa_check.ss));
+    memset(&vmsa_check.ds, 0, sizeof(vmsa_check.ds));
+    memset(&vmsa_check.fs, 0, sizeof(vmsa_check.fs));
+    memset(&vmsa_check.gs, 0, sizeof(vmsa_check.gs));
+    memset(&vmsa_check.gdtr, 0, sizeof(vmsa_check.gdtr));
+    memset(&vmsa_check.idtr, 0, sizeof(vmsa_check.idtr));
+    memset(&vmsa_check.ldtr, 0, sizeof(vmsa_check.ldtr));
+    memset(&vmsa_check.tr, 0, sizeof(vmsa_check.tr));
+    vmsa_check.efer = 0;
+    vmsa_check.cr0 = 0;
+    vmsa_check.cr3 = 0;
+    vmsa_check.cr4 = 0;
+    vmsa_check.dr6 = 0;
+    vmsa_check.dr7 = 0;
+    vmsa_check.rax = 0;
+    vmsa_check.rcx = 0;
+    vmsa_check.rdx = 0;
+    vmsa_check.rbx = 0;
+    vmsa_check.rsp = 0;
+    vmsa_check.rbp = 0;
+    vmsa_check.rsi = 0;
+    vmsa_check.rdi = 0;
+    vmsa_check.r8 = 0;
+    vmsa_check.r9 = 0;
+    vmsa_check.r10 = 0;
+    vmsa_check.r11 = 0;
+    vmsa_check.r12 = 0;
+    vmsa_check.r13 = 0;
+    vmsa_check.r14 = 0;
+    vmsa_check.r15 = 0;
+    vmsa_check.rip = 0;
+    vmsa_check.rflags = 0;
+
+    vmsa_check.g_pat = 0;
+    vmsa_check.xcr0 = 0;
+
+    /* TODO: Handle setting of sev_features when KVM supports this. */
+    vmsa_check.sev_features = 0;
+
+    for (i = 0; i < sizeof(vmsa_check); ++i) {
+        if (((uint8_t *)&vmsa_check)[i]) {
+            return 0;
+        }
+    }
+    return 1;
+}
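To make the validation concrete, a minimal sketch (assuming the Linux struct
sev_es_save_area layout, where pkru is one of the fields this patch does not
synchronise and therefore does not clear in vmsa_check):

struct sev_es_save_area vmsa = { 0 };

vmsa.rip = 0xfffffff0;                    /* supported: cleared by the checker */
assert(check_vmsa_supported(&vmsa) == 1); /* accepted */

vmsa.pkru = 1;                            /* unsupported field left non-zero */
assert(check_vmsa_supported(&vmsa) == 0); /* rejected */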
+
+static int sev_set_cpu_context(uint16_t cpu_index, const void *ctx,
+                               uint32_t ctx_len, hwaddr gpa)
+{
+    SevCommonState *sev_common = SEV_COMMON(MACHINE(qdev_get_machine())->cgs);
+    SevLaunchVmsa *launch_vmsa;
+    CPUState *cpu;
+    bool exists = false;
+
+    /*
+     * Setting the CPU context is only supported for SEV-ES. The context buffer
+     * will contain a sev_es_save_area as defined by the Linux kernel, which
+     * matches "Table B-4. VMSA Layout, State Save Area for SEV-ES" in the
+     * AMD64 APM, Volume 2.
+     */
+
+    if (!sev_es_enabled()) {
+        error_report("SEV: unable to set CPU context: Not supported");
+        return 1;
+    }
+
+    if (ctx_len < sizeof(struct sev_es_save_area)) {
+        error_report("SEV: unable to set CPU context: Invalid context provided");
+        return 1;
+    }
+
+    cpu = qemu_get_cpu(cpu_index);
+    if (!cpu) {
+        error_report("SEV: unable to set CPU context for out of bounds CPU index %d",
+                     cpu_index);
+        return 1;
+    }
+
+    /*
+     * If the context of this VP has already been set then replace it with the
+     * new context.
+     */
+    QTAILQ_FOREACH(launch_vmsa, &sev_common->launch_vmsa, next)
+    {
+        if (cpu_index == launch_vmsa->cpu_index) {
+            launch_vmsa->gpa = gpa;
+            memcpy(&launch_vmsa->vmsa, ctx, sizeof(launch_vmsa->vmsa));
+            exists = true;
+            break;
+        }
+    }
+
+    if (!exists) {
+        /* New VP context */
+        launch_vmsa = g_new0(SevLaunchVmsa, 1);
+        memcpy(&launch_vmsa->vmsa, ctx, sizeof(launch_vmsa->vmsa));
+        launch_vmsa->cpu_index = cpu_index;
+        launch_vmsa->gpa = gpa;
+        QTAILQ_INSERT_TAIL(&sev_common->launch_vmsa, launch_vmsa, next);
+    }
+
+    /* Synchronise the VMSA with the current CPU state */
+    sev_apply_cpu_context(cpu);
+
+    return 0;
+}
+
+static int cgs_set_guest_state(hwaddr gpa, uint8_t *ptr, uint64_t len,
+                               ConfidentialGuestPageType memory_type,
+                               uint16_t cpu_index, Error **errp)
+{
+    int ret = 1;
+
+    if (!sev_enabled()) {
+        error_setg(errp, "%s: attempt to configure guest memory, but SEV "
+                         "is not enabled", __func__);
+        goto out;
+    }
+
+    switch (memory_type) {
+    case CGS_PAGE_TYPE_NORMAL:
+    case CGS_PAGE_TYPE_ZERO:
+        if (sev_snp_enabled()) {
+            ret = snp_launch_update_data(gpa, ptr, len,
+                                         KVM_SEV_SNP_PAGE_TYPE_NORMAL);
+        } else {
+            ret = sev_launch_update_data(SEV_GUEST(MACHINE(qdev_get_machine())->cgs),
+                                         ptr, len);
+        }
+        break;
+
+    case CGS_PAGE_TYPE_VMSA:
+        if (!sev_es_enabled()) {
+            error_setg(errp, "%s: attempt to configure initial VMSA, but SEV-ES "
+                             "is not enabled", __func__);
+            goto out;
+        } else {
+            if (!check_vmsa_supported((const struct sev_es_save_area *)ptr)) {
+                error_setg(errp,
+                           "%s: The VMSA contains fields that are not "
+                           "synchronized with KVM. Continuing would result in "
+                           "either unpredictable guest behavior, or a "
+                           "mismatched launch measurement.",
+                           __func__);
+            } else {
+                ret = sev_set_cpu_context(cpu_index, ptr, len, gpa);
+            }
+        }
+        break;
+
+    case CGS_PAGE_TYPE_UNMEASURED:
+        if (sev_snp_enabled()) {
+            ret = snp_launch_update_data(gpa, ptr, len,
+                                         KVM_SEV_SNP_PAGE_TYPE_UNMEASURED);
+        } else {
+            ret = 0;
+        }
+        break;
+
+    case CGS_PAGE_TYPE_SECRETS:
+        if (sev_snp_enabled()) {
+            ret = snp_launch_update_data(gpa, ptr, len,
+                                         KVM_SEV_SNP_PAGE_TYPE_SECRETS);
+        } else {
+            ret = 0;
+        }
+        break;
+
+    case CGS_PAGE_TYPE_REQUIRED_MEMORY:
+        ret = kvm_convert_memory(gpa, len, true);
+        break;
+
+    case CGS_PAGE_TYPE_CPUID:
+        if (!sev_snp_enabled()) {
+            error_setg(errp, "%s: attempt to configure CPUID page, but SEV-SNP "
+                             "is not enabled", __func__);
+            goto out;
+        } else {
+            ret = snp_launch_update_cpuid(gpa, ptr, len);
+        }
+        break;
+    }
+    if (ret < 0) {
+        error_setg(errp, "%s: failed to update guest. gpa: 0x%lx, type: %d",
+                   __func__, gpa, memory_type);
+    }
+out:
+    return ret;
+}
+
+static int cgs_set_guest_policy(ConfidentialGuestPolicyType policy_type,
+                                uint64_t policy,
+                                void *policy_data1, uint32_t policy_data1_size,
+                                void *policy_data2, uint32_t policy_data2_size,
+                                Error **errp)
+{
+    if (policy_type != GUEST_POLICY_SEV) {
+        error_setg(errp, "%s: Invalid guest policy type provided for SEV: %d",
+                   __func__, policy_type);
+        return -1;
+    }
+    /*
+     * SEV-SNP handles policy differently. The policy flags are defined in
+     * kvm_start_conf.policy and an ID block and ID auth can be provided.
+     */
+    if (sev_snp_enabled()) {
+        SevSnpGuestState *sev_snp_guest =
+            SEV_SNP_GUEST(MACHINE(qdev_get_machine())->cgs);
+        struct kvm_sev_snp_launch_finish *finish = &sev_snp_guest->kvm_finish_conf;
+
+        /*
+         * The policy consists of flags in 'policy' and optionally an ID block
+         * and ID auth in policy_data1 and policy_data2 respectively.
+         * The ID block and auth are optional so clear any previous ID block
+         * and auth and set them if provided, but always set the policy flags.
+         */
+        g_free(sev_snp_guest->id_block);
+        g_free((guchar *)finish->id_block_uaddr);
+        g_free(sev_snp_guest->id_auth);
+        g_free((guchar *)finish->id_auth_uaddr);
+        sev_snp_guest->id_block = NULL;
+        finish->id_block_uaddr = 0;
+        sev_snp_guest->id_auth = NULL;
+        finish->id_auth_uaddr = 0;
+
+        if (policy_data1_size > 0) {
+            struct sev_snp_id_authentication *id_auth =
+                (struct sev_snp_id_authentication *)policy_data2;
+
+            if (policy_data1_size != KVM_SEV_SNP_ID_BLOCK_SIZE) {
+                error_setg(errp, "%s: Invalid SEV-SNP ID block: incorrect size",
+                           __func__);
+                return -1;
+            }
+            if (policy_data2_size != KVM_SEV_SNP_ID_AUTH_SIZE) {
+                error_setg(errp,
+                           "%s: Invalid SEV-SNP ID auth block: incorrect size",
+                           __func__);
+                return -1;
+            }
+            finish->id_block_uaddr = (__u64)g_malloc0(KVM_SEV_SNP_ID_BLOCK_SIZE);
+            finish->id_auth_uaddr = (__u64)g_malloc0(KVM_SEV_SNP_ID_AUTH_SIZE);
+            memcpy((void *)finish->id_block_uaddr, policy_data1,
+                   KVM_SEV_SNP_ID_BLOCK_SIZE);
+            memcpy((void *)finish->id_auth_uaddr, policy_data2,
+                   KVM_SEV_SNP_ID_AUTH_SIZE);
+
+            /*
+             * Check if an author key has been provided and use that to flag
+             * whether the author key is enabled. The first byte of the author
+             * key must be non-zero to indicate the key type, which will
+             * currently always be 2.
+             */
+            sev_snp_guest->kvm_finish_conf.auth_key_en =
+                id_auth->author_key[0] ? 1 : 0;
+            finish->id_block_en = 1;
+        }
+        sev_snp_guest->kvm_start_conf.policy = policy;
+    } else {
+        SevGuestState *sev_guest = SEV_GUEST(MACHINE(qdev_get_machine())->cgs);
+
+        /* Only the policy flags are supported for SEV and SEV-ES */
+        if ((policy_data1_size > 0) || (policy_data2_size > 0) || !sev_guest) {
+            error_setg(errp, "%s: An ID block/ID auth block has been provided "
+                             "but SEV-SNP is not enabled", __func__);
+            return -1;
+        }
+        sev_guest->policy = policy;
+    }
+    return 0;
+}
+
+static int cgs_get_mem_map_entry(int index,
+                                 ConfidentialGuestMemoryMapEntry *entry,
+                                 Error **errp)
+{
+    if ((index < 0) || (index >= e820_get_num_entries())) {
+        return 1;
+    }
+    entry->gpa = e820_table[index].address;
+    entry->size = e820_table[index].length;
+    switch (e820_table[index].type) {
+    case E820_RAM:
+        entry->type = CGS_MEM_RAM;
+        break;
+    case E820_RESERVED:
+        entry->type = CGS_MEM_RESERVED;
+        break;
+    case E820_ACPI:
+        entry->type = CGS_MEM_ACPI;
+        break;
+    case E820_NVS:
+        entry->type = CGS_MEM_NVS;
+        break;
+    case E820_UNUSABLE:
+        entry->type = CGS_MEM_UNUSABLE;
+        break;
+    }
+    return 0;
+}
+
+static char *
+sev_common_get_sev_device(Object *obj, Error **errp)
+{
+    return g_strdup(SEV_COMMON(obj)->sev_device);
+}
+
+static void
+sev_common_set_sev_device(Object *obj, const char *value, Error **errp)
+{
+    SEV_COMMON(obj)->sev_device = g_strdup(value);
+}
+
+static bool sev_common_get_kernel_hashes(Object *obj, Error **errp)
+{
+    return SEV_COMMON(obj)->kernel_hashes;
+}
+
+static void sev_common_set_kernel_hashes(Object *obj, bool value, Error **errp)
+{
+    SEV_COMMON(obj)->kernel_hashes = value;
+}
+
+static void
+sev_common_class_init(ObjectClass *oc, void *data)
+{
+    object_class_property_add_str(oc, "sev-device",
+                                  sev_common_get_sev_device,
+                                  sev_common_set_sev_device);
+    object_class_property_set_description(oc, "sev-device",
+                                          "SEV device to use");
+    object_class_property_add_bool(oc, "kernel-hashes",
+                                   sev_common_get_kernel_hashes,
+                                   sev_common_set_kernel_hashes);
+    object_class_property_set_description(oc, "kernel-hashes",
+                                          "add
kernel hashes to guest firmware for measured Linux boot"); +} + +static void +sev_common_instance_init(Object *obj) +{ + SevCommonState *sev_common = SEV_COMMON(obj); + ConfidentialGuestSupport* cgs = CONFIDENTIAL_GUEST_SUPPORT(obj); + + sev_common->sev_device = g_strdup(DEFAULT_SEV_DEVICE); + + object_property_add_uint32_ptr(obj, "cbitpos", &sev_common->cbitpos, + OBJ_PROP_FLAG_READWRITE); + object_property_add_uint32_ptr(obj, "reduced-phys-bits", + &sev_common->reduced_phys_bits, + OBJ_PROP_FLAG_READWRITE); + + cgs->check_support = cgs_check_support; + cgs->set_guest_state = cgs_set_guest_state; + cgs->set_guest_policy = cgs_set_guest_policy; + cgs->get_mem_map_entry = cgs_get_mem_map_entry; + + QTAILQ_INIT(&sev_common->launch_vmsa); +} + +/* sev guest info common to sev/sev-es/sev-snp */ +static const TypeInfo sev_common_info = { + .parent = TYPE_CONFIDENTIAL_GUEST_SUPPORT, + .name = TYPE_SEV_COMMON, + .instance_size = sizeof(SevCommonState), + .class_init = sev_common_class_init, + .instance_init = sev_common_instance_init, + .abstract = true, + .interfaces = (InterfaceInfo[]) { + { TYPE_USER_CREATABLE }, + { } + } +}; + +static char * +sev_guest_get_dh_cert_file(Object *obj, Error **errp) +{ + return g_strdup(SEV_GUEST(obj)->dh_cert_file); +} + +static void +sev_guest_set_dh_cert_file(Object *obj, const char *value, Error **errp) +{ + SEV_GUEST(obj)->dh_cert_file = g_strdup(value); +} + +static char * +sev_guest_get_session_file(Object *obj, Error **errp) +{ + SevGuestState *sev_guest = SEV_GUEST(obj); + + return sev_guest->session_file ? g_strdup(sev_guest->session_file) : NULL; +} + +static void +sev_guest_set_session_file(Object *obj, const char *value, Error **errp) +{ + SEV_GUEST(obj)->session_file = g_strdup(value); +} + +static void +sev_guest_class_init(ObjectClass *oc, void *data) +{ + object_class_property_add_str(oc, "dh-cert-file", + sev_guest_get_dh_cert_file, + sev_guest_set_dh_cert_file); + object_class_property_set_description(oc, "dh-cert-file", + "guest owners DH certificate (encoded with base64)"); + object_class_property_add_str(oc, "session-file", + sev_guest_get_session_file, + sev_guest_set_session_file); + object_class_property_set_description(oc, "session-file", + "guest owners session parameters (encoded with base64)"); +} + +static void +sev_guest_instance_init(Object *obj) +{ + SevGuestState *sev_guest = SEV_GUEST(obj); + + sev_guest->policy = DEFAULT_GUEST_POLICY; + object_property_add_uint32_ptr(obj, "handle", &sev_guest->handle, + OBJ_PROP_FLAG_READWRITE); + object_property_add_uint32_ptr(obj, "policy", &sev_guest->policy, + OBJ_PROP_FLAG_READWRITE); +} + +/* guest info specific sev/sev-es */ +static const TypeInfo sev_guest_info = { + .parent = TYPE_SEV_COMMON, + .name = TYPE_SEV_GUEST, + .instance_size = sizeof(SevGuestState), + .instance_init = sev_guest_instance_init, + .class_init = sev_guest_class_init, +}; + +static void +sev_snp_guest_get_init_flags(Object *obj, Visitor *v, const char *name, + void *opaque, Error **errp) +{ + visit_type_uint64(v, name, + (uint64_t *)&SEV_SNP_GUEST(obj)->kvm_init_conf.flags, + errp); +} + +static void +sev_snp_guest_set_init_flags(Object *obj, Visitor *v, const char *name, + void *opaque, Error **errp) +{ + visit_type_uint64(v, name, + (uint64_t *)&SEV_SNP_GUEST(obj)->kvm_init_conf.flags, + errp); +} + +static void +sev_snp_guest_get_policy(Object *obj, Visitor *v, const char *name, + void *opaque, Error **errp) +{ + visit_type_uint64(v, name, + (uint64_t *)&SEV_SNP_GUEST(obj)->kvm_start_conf.policy, + errp); +} + 
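Taken together with the machine plumbing earlier in the series, these
properties would be exercised with an invocation along these lines (an
illustrative sketch: it assumes the series' DEFAULT_SEV_SNP_POLICY of 0x30000,
i.e. the must-be-one reserved bit plus SMT allowed, and a cbitpos of 51 as on
EPYC Milan-class hosts):

qemu-system-x86_64 \
    -machine q35,confidential-guest-support=sev0 \
    -object sev-snp-guest,id=sev0,cbitpos=51,reduced-phys-bits=1,policy=0x30000 \
    -bios OVMF.fd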
+static void +sev_snp_guest_set_policy(Object *obj, Visitor *v, const char *name, + void *opaque, Error **errp) +{ + visit_type_uint64(v, name, + (uint64_t *)&SEV_SNP_GUEST(obj)->kvm_start_conf.policy, + errp); +} + +static char * +sev_snp_guest_get_guest_visible_workarounds(Object *obj, Error **errp) +{ + return g_strdup(SEV_SNP_GUEST(obj)->guest_visible_workarounds); +} + +static void +sev_snp_guest_set_guest_visible_workarounds(Object *obj, const char *value, + Error **errp) +{ + SevSnpGuestState *sev_snp_guest = SEV_SNP_GUEST(obj); + struct kvm_sev_snp_launch_start *start = &sev_snp_guest->kvm_start_conf; + g_autofree guchar *blob; + gsize len; + + if (sev_snp_guest->guest_visible_workarounds) { + g_free(sev_snp_guest->guest_visible_workarounds); + } + + /* store the base64 str so we don't need to re-encode in getter */ + sev_snp_guest->guest_visible_workarounds = g_strdup(value); + + blob = qbase64_decode(sev_snp_guest->guest_visible_workarounds, -1, &len, errp); + if (!blob) { + return; + } + + if (len > sizeof(start->gosvw)) { + error_setg(errp, "parameter length of %lu exceeds max of %lu", + len, sizeof(start->gosvw)); + return; + } + + memcpy(start->gosvw, blob, len); +} + +static char * +sev_snp_guest_get_id_block(Object *obj, Error **errp) +{ + SevSnpGuestState *sev_snp_guest = SEV_SNP_GUEST(obj); + + return g_strdup(sev_snp_guest->id_block); +} + +static void +sev_snp_guest_set_id_block(Object *obj, const char *value, Error **errp) +{ + SevSnpGuestState *sev_snp_guest = SEV_SNP_GUEST(obj); + struct kvm_sev_snp_launch_finish *finish = &sev_snp_guest->kvm_finish_conf; + gsize len; + + if (sev_snp_guest->id_block) { + g_free(sev_snp_guest->id_block); + g_free((guchar *)finish->id_block_uaddr); + } + + /* store the base64 str so we don't need to re-encode in getter */ + sev_snp_guest->id_block = g_strdup(value); + + finish->id_block_uaddr = + (uint64_t)qbase64_decode(sev_snp_guest->id_block, -1, &len, errp); + + if (!finish->id_block_uaddr) { + return; + } + + if (len > KVM_SEV_SNP_ID_BLOCK_SIZE) { + error_setg(errp, "parameter length of %lu exceeds max of %u", + len, KVM_SEV_SNP_ID_BLOCK_SIZE); + return; + } -static void -sev_guest_finalize(Object *obj) -{ + finish->id_block_en = (len) ? 1 : 0; } static char * -sev_guest_get_session_file(Object *obj, Error **errp) +sev_snp_guest_get_id_auth(Object *obj, Error **errp) { - SevGuestState *s = SEV_GUEST(obj); + SevSnpGuestState *sev_snp_guest = SEV_SNP_GUEST(obj); - return s->session_file ? 
g_strdup(s->session_file) : NULL;
+    return g_strdup(sev_snp_guest->id_auth);
 }
 
 static void
-sev_guest_set_session_file(Object *obj, const char *value, Error **errp)
+sev_snp_guest_set_id_auth(Object *obj, const char *value, Error **errp)
 {
-    SevGuestState *s = SEV_GUEST(obj);
+    SevSnpGuestState *sev_snp_guest = SEV_SNP_GUEST(obj);
+    struct kvm_sev_snp_launch_finish *finish = &sev_snp_guest->kvm_finish_conf;
+    gsize len;
+
+    if (sev_snp_guest->id_auth) {
+        g_free(sev_snp_guest->id_auth);
+        g_free((guchar *)finish->id_auth_uaddr);
+    }
+
+    /* store the base64 str so we don't need to re-encode in getter */
+    sev_snp_guest->id_auth = g_strdup(value);
+
+    finish->id_auth_uaddr =
+        (uint64_t)qbase64_decode(sev_snp_guest->id_auth, -1, &len, errp);
+
+    if (!finish->id_auth_uaddr) {
+        return;
+    }
 
-    s->session_file = g_strdup(value);
+    if (len > KVM_SEV_SNP_ID_AUTH_SIZE) {
+        error_setg(errp, "parameter length of %lu exceeds max of %u",
+                   len, KVM_SEV_SNP_ID_AUTH_SIZE);
+        return;
+    }
 }
 
-static char *
-sev_guest_get_dh_cert_file(Object *obj, Error **errp)
+static bool
+sev_snp_guest_get_auth_key_en(Object *obj, Error **errp)
 {
-    SevGuestState *s = SEV_GUEST(obj);
+    SevSnpGuestState *sev_snp_guest = SEV_SNP_GUEST(obj);
 
-    return g_strdup(s->dh_cert_file);
+    return !!sev_snp_guest->kvm_finish_conf.auth_key_en;
 }
 
 static void
-sev_guest_set_dh_cert_file(Object *obj, const char *value, Error **errp)
+sev_snp_guest_set_auth_key_en(Object *obj, bool value, Error **errp)
 {
-    SevGuestState *s = SEV_GUEST(obj);
+    SevSnpGuestState *sev_snp_guest = SEV_SNP_GUEST(obj);
 
-    s->dh_cert_file = g_strdup(value);
+    sev_snp_guest->kvm_finish_conf.auth_key_en = value;
 }
 
 static char *
-sev_guest_get_sev_device(Object *obj, Error **errp)
+sev_snp_guest_get_host_data(Object *obj, Error **errp)
 {
-    SevGuestState *sev = SEV_GUEST(obj);
+    SevSnpGuestState *sev_snp_guest = SEV_SNP_GUEST(obj);
 
-    return g_strdup(sev->sev_device);
+    return g_strdup(sev_snp_guest->host_data);
 }
 
 static void
-sev_guest_set_sev_device(Object *obj, const char *value, Error **errp)
+sev_snp_guest_set_host_data(Object *obj, const char *value, Error **errp)
 {
-    SevGuestState *sev = SEV_GUEST(obj);
+    SevSnpGuestState *sev_snp_guest = SEV_SNP_GUEST(obj);
+    struct kvm_sev_snp_launch_finish *finish = &sev_snp_guest->kvm_finish_conf;
+    g_autofree guchar *blob;
+    gsize len;
+
+    if (sev_snp_guest->host_data) {
+        g_free(sev_snp_guest->host_data);
+    }
+
+    /* store the base64 str so we don't need to re-encode in getter */
+    sev_snp_guest->host_data = g_strdup(value);
+
+    blob = qbase64_decode(sev_snp_guest->host_data, -1, &len, errp);
+
+    if (!blob) {
+        return;
+    }
 
-    sev->sev_device = g_strdup(value);
+    if (len > sizeof(finish->host_data)) {
+        error_setg(errp, "parameter length of %lu exceeds max of %lu",
+                   len, sizeof(finish->host_data));
+        return;
+    }
+
+    memcpy(finish->host_data, blob, len);
 }
 
-static bool sev_guest_get_kernel_hashes(Object *obj, Error **errp)
+static char *
+sev_snp_guest_get_certs_path(Object *obj, Error **errp)
 {
-    SevGuestState *sev = SEV_GUEST(obj);
+    SevSnpGuestState *sev_snp_guest = SEV_SNP_GUEST(obj);
 
-    return sev->kernel_hashes;
+    return g_strdup(sev_snp_guest->certs_path);
 }
 
-static void sev_guest_set_kernel_hashes(Object *obj, bool value, Error **errp)
+static void
+sev_snp_guest_set_certs_path(Object *obj, const char *value, Error **errp)
 {
-    SevGuestState *sev = SEV_GUEST(obj);
+    SevSnpGuestState *sev_snp_guest = SEV_SNP_GUEST(obj);
+
+    if (sev_snp_guest->certs_path) {
+        g_free(sev_snp_guest->certs_path);
+    }
 
-    sev->kernel_hashes = value;
+ sev_snp_guest->certs_path = value ? g_strdup(value) : NULL; } static void -sev_guest_class_init(ObjectClass *oc, void *data) +sev_snp_guest_class_init(ObjectClass *oc, void *data) { - object_class_property_add_str(oc, "sev-device", - sev_guest_get_sev_device, - sev_guest_set_sev_device); - object_class_property_set_description(oc, "sev-device", - "SEV device to use"); - object_class_property_add_str(oc, "dh-cert-file", - sev_guest_get_dh_cert_file, - sev_guest_set_dh_cert_file); - object_class_property_set_description(oc, "dh-cert-file", - "guest owners DH certificate (encoded with base64)"); - object_class_property_add_str(oc, "session-file", - sev_guest_get_session_file, - sev_guest_set_session_file); - object_class_property_set_description(oc, "session-file", - "guest owners session parameters (encoded with base64)"); - object_class_property_add_bool(oc, "kernel-hashes", - sev_guest_get_kernel_hashes, - sev_guest_set_kernel_hashes); - object_class_property_set_description(oc, "kernel-hashes", - "add kernel hashes to guest firmware for measured Linux boot"); + object_class_property_add(oc, "init-flags", "uint64", + sev_snp_guest_get_init_flags, + sev_snp_guest_set_init_flags, NULL, NULL); + object_class_property_set_description(oc, "init-flags", + "guest initialization flags"); + object_class_property_add(oc, "policy", "uint64", + sev_snp_guest_get_policy, + sev_snp_guest_set_policy, NULL, NULL); + object_class_property_add_str(oc, "guest-visible-workarounds", + sev_snp_guest_get_guest_visible_workarounds, + sev_snp_guest_set_guest_visible_workarounds); + object_class_property_add_str(oc, "id-block", + sev_snp_guest_get_id_block, + sev_snp_guest_set_id_block); + object_class_property_add_str(oc, "id-auth", + sev_snp_guest_get_id_auth, + sev_snp_guest_set_id_auth); + object_class_property_add_bool(oc, "auth-key-enabled", + sev_snp_guest_get_auth_key_en, + sev_snp_guest_set_auth_key_en); + object_class_property_add_str(oc, "host-data", + sev_snp_guest_get_host_data, + sev_snp_guest_set_host_data); + object_class_property_add_str(oc, "certs-path", + sev_snp_guest_get_certs_path, + sev_snp_guest_set_certs_path); } static void -sev_guest_instance_init(Object *obj) +sev_snp_guest_instance_init(Object *obj) { - SevGuestState *sev = SEV_GUEST(obj); + SevSnpGuestState *sev_snp_guest = SEV_SNP_GUEST(obj); - sev->sev_device = g_strdup(DEFAULT_SEV_DEVICE); - sev->policy = DEFAULT_GUEST_POLICY; - object_property_add_uint32_ptr(obj, "policy", &sev->policy, - OBJ_PROP_FLAG_READWRITE); - object_property_add_uint32_ptr(obj, "handle", &sev->handle, - OBJ_PROP_FLAG_READWRITE); - object_property_add_uint32_ptr(obj, "cbitpos", &sev->cbitpos, - OBJ_PROP_FLAG_READWRITE); - object_property_add_uint32_ptr(obj, "reduced-phys-bits", - &sev->reduced_phys_bits, - OBJ_PROP_FLAG_READWRITE); + /* default init/start/finish params for kvm */ + sev_snp_guest->kvm_start_conf.policy = DEFAULT_SEV_SNP_POLICY; } -/* sev guest info */ -static const TypeInfo sev_guest_info = { - .parent = TYPE_CONFIDENTIAL_GUEST_SUPPORT, - .name = TYPE_SEV_GUEST, - .instance_size = sizeof(SevGuestState), - .instance_finalize = sev_guest_finalize, - .class_init = sev_guest_class_init, - .instance_init = sev_guest_instance_init, - .interfaces = (InterfaceInfo[]) { - { TYPE_USER_CREATABLE }, - { } - } +/* guest info specific to sev-snp */ +static const TypeInfo sev_snp_guest_info = { + .parent = TYPE_SEV_COMMON, + .name = TYPE_SEV_SNP_GUEST, + .instance_size = sizeof(SevSnpGuestState), + .class_init = sev_snp_guest_class_init, + .instance_init = 
sev_snp_guest_instance_init, }; bool sev_enabled(void) { - return !!sev_guest; + ConfidentialGuestSupport *cgs = MACHINE(qdev_get_machine())->cgs; + + return !!object_dynamic_cast(OBJECT(cgs), TYPE_SEV_COMMON); +} + +bool +sev_snp_enabled(void) +{ + ConfidentialGuestSupport *cgs = MACHINE(qdev_get_machine())->cgs; + + return !!object_dynamic_cast(OBJECT(cgs), TYPE_SEV_SNP_GUEST); } bool sev_es_enabled(void) { - return sev_enabled() && (sev_guest->policy & SEV_POLICY_ES); + ConfidentialGuestSupport *cgs = MACHINE(qdev_get_machine())->cgs; + + return sev_snp_enabled() || + (sev_enabled() && SEV_GUEST(cgs)->policy & SEV_POLICY_ES); } uint32_t sev_get_cbit_position(void) { - return sev_guest ? sev_guest->cbitpos : 0; + SevCommonState *sev_common = SEV_COMMON(MACHINE(qdev_get_machine())->cgs); + + return sev_common ? sev_common->cbitpos : 0; } uint32_t sev_get_reduced_phys_bits(void) { - return sev_guest ? sev_guest->reduced_phys_bits : 0; + SevCommonState *sev_common = SEV_COMMON(MACHINE(qdev_get_machine())->cgs); + + return sev_common ? sev_common->reduced_phys_bits : 0; } static SevInfo *sev_get_info(void) { SevInfo *info; + SevCommonState *sev_common = SEV_COMMON(MACHINE(qdev_get_machine())->cgs); info = g_new0(SevInfo, 1); info->enabled = sev_enabled(); if (info->enabled) { - info->api_major = sev_guest->api_major; - info->api_minor = sev_guest->api_minor; - info->build_id = sev_guest->build_id; - info->policy = sev_guest->policy; - info->state = sev_guest->state; - info->handle = sev_guest->handle; + info->api_major = sev_common->api_major; + info->api_minor = sev_common->api_minor; + info->build_id = sev_common->build_id; + info->state = sev_common->state; + + if (sev_snp_enabled()) { + info->sev_type = SEV_GUEST_TYPE_SEV_SNP; + info->u.sev_snp.snp_policy = + object_property_get_uint(OBJECT(sev_common), "policy", NULL); + } else { + info->sev_type = SEV_GUEST_TYPE_SEV; + info->u.sev.handle = SEV_GUEST(sev_common)->handle; + info->u.sev.policy = + (uint32_t)object_property_get_uint(OBJECT(sev_common), + "policy", NULL); + } } return info; @@ -470,20 +1287,36 @@ void hmp_info_sev(Monitor *mon, const QDict *qdict) { SevInfo *info = sev_get_info(); - if (info && info->enabled) { - monitor_printf(mon, "handle: %d\n", info->handle); + if (!info || !info->enabled) { + monitor_printf(mon, "SEV is not enabled\n"); + goto out; + } + + if (sev_snp_enabled()) { monitor_printf(mon, "state: %s\n", SevState_str(info->state)); monitor_printf(mon, "build: %d\n", info->build_id); monitor_printf(mon, "api version: %d.%d\n", info->api_major, info->api_minor); monitor_printf(mon, "debug: %s\n", - info->policy & SEV_POLICY_NODBG ? "off" : "on"); - monitor_printf(mon, "key-sharing: %s\n", - info->policy & SEV_POLICY_NOKS ? "off" : "on"); + info->u.sev_snp.snp_policy & SEV_SNP_POLICY_DBG ? "on" + : "off"); + monitor_printf(mon, "SMT allowed: %s\n", + info->u.sev_snp.snp_policy & SEV_SNP_POLICY_SMT ? "on" + : "off"); } else { - monitor_printf(mon, "SEV is not enabled\n"); + monitor_printf(mon, "handle: %d\n", info->u.sev.handle); + monitor_printf(mon, "state: %s\n", SevState_str(info->state)); + monitor_printf(mon, "build: %d\n", info->build_id); + monitor_printf(mon, "api version: %d.%d\n", + info->api_major, info->api_minor); + monitor_printf(mon, "debug: %s\n", + info->u.sev.policy & SEV_POLICY_NODBG ? "off" : "on"); + monitor_printf(mon, "key-sharing: %s\n", + info->u.sev.policy & SEV_POLICY_NOKS ? 
"off" : "on"); } + monitor_printf(mon, "SEV type: %s\n", SevGuestType_str(info->sev_type)); +out: qapi_free_SevInfo(info); } @@ -573,6 +1406,8 @@ static SevCapability *sev_get_capabilities(Error **errp) size_t pdh_len = 0, cert_chain_len = 0, cpu0_id_len = 0; uint32_t ebx; int fd; + SevCommonState *sev_common; + char *sev_device; if (!kvm_enabled()) { error_setg(errp, "KVM not enabled"); @@ -583,12 +1418,22 @@ static SevCapability *sev_get_capabilities(Error **errp) return NULL; } - fd = open(DEFAULT_SEV_DEVICE, O_RDWR); + sev_common = SEV_COMMON(MACHINE(qdev_get_machine())->cgs); + if (!sev_common) { + error_setg(errp, "SEV is not configured"); + return NULL; + } + + sev_device = object_property_get_str(OBJECT(sev_common), "sev-device", + &error_abort); + fd = open(sev_device, O_RDWR); if (fd < 0) { error_setg_errno(errp, errno, "SEV: Failed to open %s", DEFAULT_SEV_DEVICE); + g_free(sev_device); return NULL; } + g_free(sev_device); if (sev_get_pdh_info(fd, &pdh_data, &pdh_len, &cert_chain_data, &cert_chain_len, errp)) { @@ -631,7 +1476,7 @@ static SevAttestationReport *sev_get_attestation_report(const char *mnonce, { struct kvm_sev_attestation_report input = {}; SevAttestationReport *report = NULL; - SevGuestState *sev = sev_guest; + SevCommonState *sev_common; g_autofree guchar *data = NULL; g_autofree guchar *buf = NULL; gsize len; @@ -656,8 +1501,10 @@ static SevAttestationReport *sev_get_attestation_report(const char *mnonce, return NULL; } + sev_common = SEV_COMMON(MACHINE(qdev_get_machine())->cgs); + /* Query the report length */ - ret = sev_ioctl(sev->sev_fd, KVM_SEV_GET_ATTESTATION_REPORT, + ret = sev_ioctl(sev_common->sev_fd, KVM_SEV_GET_ATTESTATION_REPORT, &input, &err); if (ret < 0) { if (err != SEV_RET_INVALID_LEN) { @@ -673,7 +1520,7 @@ static SevAttestationReport *sev_get_attestation_report(const char *mnonce, memcpy(input.mnonce, buf, sizeof(input.mnonce)); /* Query the report */ - ret = sev_ioctl(sev->sev_fd, KVM_SEV_GET_ATTESTATION_REPORT, + ret = sev_ioctl(sev_common->sev_fd, KVM_SEV_GET_ATTESTATION_REPORT, &input, &err); if (ret) { error_setg_errno(errp, errno, "SEV: Failed to get attestation report" @@ -713,26 +1560,51 @@ sev_read_file_base64(const char *filename, guchar **data, gsize *len) } static int -sev_launch_start(SevGuestState *sev) +sev_snp_launch_start(SevSnpGuestState *sev_snp_guest) +{ + int fw_error, rc; + SevCommonState *sev_common = SEV_COMMON(sev_snp_guest); + struct kvm_sev_snp_launch_start *start = &sev_snp_guest->kvm_start_conf; + + trace_kvm_sev_snp_launch_start(start->policy, sev_snp_guest->guest_visible_workarounds); + + rc = sev_ioctl(sev_common->sev_fd, KVM_SEV_SNP_LAUNCH_START, + start, &fw_error); + if (rc < 0) { + error_report("%s: SNP_LAUNCH_START ret=%d fw_error=%d '%s'", + __func__, rc, fw_error, fw_error_to_str(fw_error)); + return 1; + } + + QTAILQ_INIT(&launch_update); + + sev_set_guest_state(sev_common, SEV_STATE_LAUNCH_UPDATE); + + return 0; +} + +static int +sev_launch_start(SevGuestState *sev_guest) { gsize sz; int ret = 1; int fw_error, rc; struct kvm_sev_launch_start start = { - .handle = sev->handle, .policy = sev->policy + .handle = sev_guest->handle, .policy = sev_guest->policy }; guchar *session = NULL, *dh_cert = NULL; + SevCommonState *sev_common = SEV_COMMON(sev_guest); - if (sev->session_file) { - if (sev_read_file_base64(sev->session_file, &session, &sz) < 0) { + if (sev_guest->session_file) { + if (sev_read_file_base64(sev_guest->session_file, &session, &sz) < 0) { goto out; } start.session_uaddr = (unsigned 
long)session;
         start.session_len = sz;
     }
 
-    if (sev->dh_cert_file) {
-        if (sev_read_file_base64(sev->dh_cert_file, &dh_cert, &sz) < 0) {
+    if (sev_guest->dh_cert_file) {
+        if (sev_read_file_base64(sev_guest->dh_cert_file, &dh_cert, &sz) < 0) {
             goto out;
         }
         start.dh_uaddr = (unsigned long)dh_cert;
@@ -740,15 +1612,15 @@ sev_launch_start(SevGuestState *sev)
     }
 
     trace_kvm_sev_launch_start(start.policy, session, dh_cert);
-    rc = sev_ioctl(sev->sev_fd, KVM_SEV_LAUNCH_START, &start, &fw_error);
+    rc = sev_ioctl(sev_common->sev_fd, KVM_SEV_LAUNCH_START, &start, &fw_error);
     if (rc < 0) {
         error_report("%s: LAUNCH_START ret=%d fw_error=%d '%s'",
                 __func__, ret, fw_error, fw_error_to_str(fw_error));
         goto out;
     }
 
-    sev_set_guest_state(sev, SEV_STATE_LAUNCH_UPDATE);
-    sev->handle = start.handle;
+    sev_set_guest_state(sev_common, SEV_STATE_LAUNCH_UPDATE);
+    sev_guest->handle = start.handle;
     ret = 0;
 
 out:
@@ -757,8 +1629,99 @@ sev_launch_start(SevGuestState *sev)
     return ret;
 }
 
+static void
+sev_snp_cpuid_report_mismatches(SnpCpuidInfo *old,
+                                SnpCpuidInfo *new)
+{
+    size_t i;
+
+    if (old->count != new->count) {
+        error_report("SEV-SNP: CPUID validation failed due to count mismatch, provided: %d, expected: %d",
+                     old->count, new->count);
+    }
+
+    for (i = 0; i < old->count; i++) {
+        SnpCpuidFunc *old_func, *new_func;
+
+        old_func = &old->entries[i];
+        new_func = &new->entries[i];
+
+        if (memcmp(old_func, new_func, sizeof(SnpCpuidFunc))) {
+            error_report("SEV-SNP: CPUID validation failed for function 0x%x, index: 0x%x.\n"
+                         "provided: eax:0x%08x, ebx: 0x%08x, ecx: 0x%08x, edx: 0x%08x\n"
+                         "expected: eax:0x%08x, ebx: 0x%08x, ecx: 0x%08x, edx: 0x%08x",
+                         old_func->eax_in, old_func->ecx_in,
+                         old_func->eax, old_func->ebx, old_func->ecx, old_func->edx,
+                         new_func->eax, new_func->ebx, new_func->ecx, new_func->edx);
+        }
+    }
+}
+
+static const char *
+snp_page_type_to_str(int type)
+{
+    switch (type) {
+    case KVM_SEV_SNP_PAGE_TYPE_NORMAL: return "Normal";
+    case KVM_SEV_SNP_PAGE_TYPE_VMSA: return "Vmsa";
+    case KVM_SEV_SNP_PAGE_TYPE_ZERO: return "Zero";
+    case KVM_SEV_SNP_PAGE_TYPE_UNMEASURED: return "Unmeasured";
+    case KVM_SEV_SNP_PAGE_TYPE_SECRETS: return "Secrets";
+    case KVM_SEV_SNP_PAGE_TYPE_CPUID: return "Cpuid";
+    default: return "unknown";
+    }
+}
+
+static int
+sev_snp_launch_update(SevSnpGuestState *sev_snp_guest, SevLaunchUpdateData *data)
+{
+    int ret, fw_error;
+    SnpCpuidInfo snp_cpuid_info;
+    struct kvm_sev_snp_launch_update update = {0};
+
+    if (!data->hva || !data->len) {
+        error_report("%s: SNP_LAUNCH_UPDATE called with invalid address / length: %p / %lx",
+                     __func__, data->hva, data->len);
+        return 1;
+    }
+
+    if (data->type == KVM_SEV_SNP_PAGE_TYPE_CPUID) {
+        /* Save a copy for comparison in case the LAUNCH_UPDATE fails */
+        memcpy(&snp_cpuid_info, data->hva, sizeof(snp_cpuid_info));
+    }
+
+    update.uaddr = (__u64)(unsigned long)data->hva;
+    update.start_gfn = data->gpa >> TARGET_PAGE_BITS;
+    update.len = data->len;
+    update.page_type = data->type;
+
+    trace_kvm_sev_snp_launch_update(data->hva, data->gpa, data->len,
+                                    snp_page_type_to_str(data->type));
+    ret = sev_ioctl(SEV_COMMON(sev_snp_guest)->sev_fd,
+                    KVM_SEV_SNP_LAUNCH_UPDATE,
+                    &update, &fw_error);
+    if (ret) {
+        error_report("%s: SNP_LAUNCH_UPDATE ret=%d fw_error=%d '%s'",
+                     __func__, ret, fw_error, fw_error_to_str(fw_error));
+
+        if (data->type == KVM_SEV_SNP_PAGE_TYPE_CPUID) {
+            sev_snp_cpuid_report_mismatches(&snp_cpuid_info, data->hva);
+            error_report("SEV-SNP: failed to update CPUID page");
+        }
+
+        goto out;
+    }
+
+    ret = kvm_convert_memory(data->gpa, data->len, true);
+    if (ret) {
+        error_report("SEV-SNP: failed to configure initial private guest memory");
+    }
+
+out:
+    return ret;
+}
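The start_gfn conversion above is plain 4KiB-page arithmetic. A standalone
check of what a hypothetical one-page CPUID region at GPA 0x80b000 turns into
before being handed to KVM_SEV_SNP_LAUNCH_UPDATE (the page count is shown only
to make the arithmetic concrete; the ioctl itself takes the byte length):

#include <assert.h>
#include <stdint.h>

#define TARGET_PAGE_BITS 12                      /* 4KiB x86 guest pages */
#define TARGET_PAGE_SIZE (1ULL << TARGET_PAGE_BITS)

int main(void)
{
    uint64_t gpa = 0x80b000, len = 0x1000;       /* hypothetical CPUID page */
    uint64_t start_gfn = gpa >> TARGET_PAGE_BITS;
    uint64_t npages = (len + TARGET_PAGE_SIZE - 1) >> TARGET_PAGE_BITS;

    assert(start_gfn == 0x80b && npages == 1);
    return 0;
}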
+
 static int
-sev_launch_update_data(SevGuestState *sev, uint8_t *addr, uint64_t len)
+sev_launch_update_data(SevGuestState *sev_guest, uint8_t *addr, uint64_t len)
 {
     int ret, fw_error;
     struct kvm_sev_launch_update_data update;
@@ -770,7 +1733,7 @@ sev_launch_update_data(SevGuestState *sev, uint8_t *addr, uint64_t len)
     update.uaddr = (__u64)(unsigned long)addr;
     update.len = len;
     trace_kvm_sev_launch_update_data(addr, len);
-    ret = sev_ioctl(sev->sev_fd, KVM_SEV_LAUNCH_UPDATE_DATA,
+    ret = sev_ioctl(SEV_COMMON(sev_guest)->sev_fd, KVM_SEV_LAUNCH_UPDATE_DATA,
                     &update, &fw_error);
     if (ret) {
         error_report("%s: LAUNCH_UPDATE ret=%d fw_error=%d '%s'",
@@ -781,11 +1744,22 @@ sev_launch_update_data(SevGuestState *sev, uint8_t *addr, uint64_t len)
 }
 
 static int
-sev_launch_update_vmsa(SevGuestState *sev)
+sev_launch_update_vmsa(SevGuestState *sev_guest)
 {
     int ret, fw_error;
+    CPUState *cpu;
+
+    /*
+     * The initial CPU state is measured as part of KVM_SEV_LAUNCH_UPDATE_VMSA.
+     * Synchronise the CPU state to any provided launch VMSA structures.
+     */
+    CPU_FOREACH(cpu) {
+        sev_apply_cpu_context(cpu);
+    }
 
-    ret = sev_ioctl(sev->sev_fd, KVM_SEV_LAUNCH_UPDATE_VMSA, NULL, &fw_error);
+
+    ret = sev_ioctl(SEV_COMMON(sev_guest)->sev_fd, KVM_SEV_LAUNCH_UPDATE_VMSA,
+                    NULL, &fw_error);
     if (ret) {
         error_report("%s: LAUNCH_UPDATE_VMSA ret=%d fw_error=%d '%s'",
                      __func__, ret, fw_error, fw_error_to_str(fw_error));
@@ -797,25 +1771,26 @@ sev_launch_update_vmsa(SevGuestState *sev)
 static void
 sev_launch_get_measure(Notifier *notifier, void *unused)
 {
-    SevGuestState *sev = sev_guest;
+    SevCommonState *sev_common = SEV_COMMON(MACHINE(qdev_get_machine())->cgs);
+    SevGuestState *sev_guest = SEV_GUEST(sev_common);
     int ret, error;
     g_autofree guchar *data = NULL;
     struct kvm_sev_launch_measure measurement = {};
 
-    if (!sev_check_state(sev, SEV_STATE_LAUNCH_UPDATE)) {
+    if (!sev_check_state(sev_common, SEV_STATE_LAUNCH_UPDATE)) {
         return;
     }
 
     if (sev_es_enabled()) {
         /* measure all the VM save areas before getting launch_measure */
-        ret = sev_launch_update_vmsa(sev);
+        ret = sev_launch_update_vmsa(sev_guest);
         if (ret) {
             exit(1);
         }
     }
 
     /* query the measurement blob length */
-    ret = sev_ioctl(sev->sev_fd, KVM_SEV_LAUNCH_MEASURE,
+    ret = sev_ioctl(sev_common->sev_fd, KVM_SEV_LAUNCH_MEASURE,
                     &measurement, &error);
     if (!measurement.len) {
         error_report("%s: LAUNCH_MEASURE ret=%d fw_error=%d '%s'",
@@ -827,7 +1802,7 @@ sev_launch_get_measure(Notifier *notifier, void *unused)
     measurement.uaddr = (unsigned long)data;
 
     /* get the measurement blob */
-    ret = sev_ioctl(sev->sev_fd, KVM_SEV_LAUNCH_MEASURE,
+    ret = sev_ioctl(sev_common->sev_fd, KVM_SEV_LAUNCH_MEASURE,
                     &measurement, &error);
     if (ret) {
         error_report("%s: LAUNCH_MEASURE ret=%d fw_error=%d '%s'",
@@ -835,17 +1810,21 @@ sev_launch_get_measure(Notifier *notifier, void *unused)
         return;
     }
 
-    sev_set_guest_state(sev, SEV_STATE_LAUNCH_SECRET);
+    sev_set_guest_state(sev_common, SEV_STATE_LAUNCH_SECRET);
 
     /* encode the measurement value and emit the event */
-    sev->measurement = g_base64_encode(data, measurement.len);
-    trace_kvm_sev_launch_measurement(sev->measurement);
+    sev_guest->measurement = g_base64_encode(data, measurement.len);
+    trace_kvm_sev_launch_measurement(sev_guest->measurement);
 }
 
 static char *sev_get_launch_measurement(void)
 {
+    ConfidentialGuestSupport *cgs = MACHINE(qdev_get_machine())->cgs;
+    SevGuestState
*sev_guest = + (SevGuestState *)object_dynamic_cast(OBJECT(cgs), TYPE_SEV_GUEST); + if (sev_guest && - sev_guest->state >= SEV_STATE_LAUNCH_SECRET) { + SEV_COMMON(sev_guest)->state >= SEV_STATE_LAUNCH_SECRET) { return g_strdup(sev_guest->measurement); } @@ -857,66 +1836,304 @@ SevLaunchMeasureInfo *qmp_query_sev_launch_measure(Error **errp) char *data; SevLaunchMeasureInfo *info; - data = sev_get_launch_measurement(); - if (!data) { - error_setg(errp, "SEV launch measurement is not available"); - return NULL; - } + data = sev_get_launch_measurement(); + if (!data) { + error_setg(errp, "SEV launch measurement is not available"); + return NULL; + } + + info = g_malloc0(sizeof(*info)); + info->data = data; + + return info; +} + +static Notifier sev_machine_done_notify = { + .notify = sev_launch_get_measure, +}; + +static void +sev_launch_finish(SevGuestState *sev_guest) +{ + int ret, error; + + trace_kvm_sev_launch_finish(); + ret = sev_ioctl(SEV_COMMON(sev_guest)->sev_fd, KVM_SEV_LAUNCH_FINISH, 0, + &error); + if (ret) { + error_report("%s: LAUNCH_FINISH ret=%d fw_error=%d '%s'", + __func__, ret, error, fw_error_to_str(error)); + exit(1); + } + + sev_set_guest_state(SEV_COMMON(sev_guest), SEV_STATE_RUNNING); + + /* add migration blocker */ + error_setg(&sev_mig_blocker, + "SEV: Migration is not implemented"); + migrate_add_blocker(&sev_mig_blocker, &error_fatal); +} + +static int +sev_snp_cpuid_info_fill(SnpCpuidInfo *snp_cpuid_info, + const KvmCpuidInfo *kvm_cpuid_info) +{ + size_t i; + + if (kvm_cpuid_info->cpuid.nent > SNP_CPUID_FUNCTION_MAXCOUNT) { + error_report("SEV-SNP: CPUID entry count (%d) exceeds max (%d)", + kvm_cpuid_info->cpuid.nent, SNP_CPUID_FUNCTION_MAXCOUNT); + return -1; + } + + memset(snp_cpuid_info, 0, sizeof(*snp_cpuid_info)); + + for (i = 0; i < kvm_cpuid_info->cpuid.nent; i++) { + const struct kvm_cpuid_entry2 *kvm_cpuid_entry; + SnpCpuidFunc *snp_cpuid_entry; + + kvm_cpuid_entry = &kvm_cpuid_info->entries[i]; + snp_cpuid_entry = &snp_cpuid_info->entries[i]; + + snp_cpuid_entry->eax_in = kvm_cpuid_entry->function; + if (kvm_cpuid_entry->flags == KVM_CPUID_FLAG_SIGNIFCANT_INDEX) { + snp_cpuid_entry->ecx_in = kvm_cpuid_entry->index; + } + snp_cpuid_entry->eax = kvm_cpuid_entry->eax; + snp_cpuid_entry->ebx = kvm_cpuid_entry->ebx; + snp_cpuid_entry->ecx = kvm_cpuid_entry->ecx; + snp_cpuid_entry->edx = kvm_cpuid_entry->edx; + + /* + * Guest kernels will calculate EBX themselves using the 0xD + * subfunctions corresponding to the individual XSAVE areas, so only + * encode the base XSAVE size in the initial leaves, corresponding + * to the initial XCR0=1 state. 
+     */
+        if (snp_cpuid_entry->eax_in == 0xD &&
+            (snp_cpuid_entry->ecx_in == 0x0 || snp_cpuid_entry->ecx_in == 0x1)) {
+            snp_cpuid_entry->ebx = 0x240;
+            snp_cpuid_entry->xcr0_in = 1;
+            snp_cpuid_entry->xss_in = 0;
+        }
+    }
+
+    snp_cpuid_info->count = i;
+
+    return 0;
+}
+
+static int
+snp_launch_update_data(uint64_t gpa, void *hva, uint32_t len, int type)
+{
+    SevLaunchUpdateData *data;
+
+    data = g_new0(SevLaunchUpdateData, 1);
+    data->gpa = gpa;
+    data->hva = hva;
+    data->len = len;
+    data->type = type;
+
+    QTAILQ_INSERT_TAIL(&launch_update, data, next);
+
+    return 0;
+}
+
+static int
+snp_launch_update_cpuid(uint32_t cpuid_addr, void *hva, uint32_t cpuid_len)
+{
+    KvmCpuidInfo kvm_cpuid_info = {0};
+    SnpCpuidInfo snp_cpuid_info;
+    CPUState *cs = first_cpu;
+    int ret;
+    uint32_t i = 0;
+
+    assert(sizeof(snp_cpuid_info) <= cpuid_len);
+
+    /* get the cpuid list from KVM */
+    do {
+        kvm_cpuid_info.cpuid.nent = ++i;
+        ret = kvm_vcpu_ioctl(cs, KVM_GET_CPUID2, &kvm_cpuid_info);
+    } while (ret == -E2BIG);
+
+    if (ret) {
+        error_report("SEV-SNP: unable to query CPUID values for CPU: '%s'",
+                     strerror(-ret));
+        return 1;
+    }
+
+    ret = sev_snp_cpuid_info_fill(&snp_cpuid_info, &kvm_cpuid_info);
+    if (ret) {
+        error_report("SEV-SNP: failed to generate CPUID table information");
+        return 1;
+    }
+
+    memcpy(hva, &snp_cpuid_info, sizeof(snp_cpuid_info));
+
+    return snp_launch_update_data(cpuid_addr, hva, cpuid_len,
+                                  KVM_SEV_SNP_PAGE_TYPE_CPUID);
+}
+
+static int
+snp_launch_update_kernel_hashes(SevSnpGuestState *sev_snp, uint32_t addr,
+                                void *hva, uint32_t len)
+{
+    int type = KVM_SEV_SNP_PAGE_TYPE_ZERO;
+
+    if (sev_snp->sev_common.kernel_hashes) {
+        assert(sev_snp->kernel_hashes_data);
+        assert((sev_snp->kernel_hashes_offset +
+                sizeof(*sev_snp->kernel_hashes_data)) <= len);
+        memset(hva, 0, len);
+        memcpy(hva + sev_snp->kernel_hashes_offset, sev_snp->kernel_hashes_data,
+               sizeof(*sev_snp->kernel_hashes_data));
+        type = KVM_SEV_SNP_PAGE_TYPE_NORMAL;
+    }
+    return snp_launch_update_data(addr, hva, len, type);
+}
+
+static int
+snp_metadata_desc_to_page_type(int desc_type)
+{
+    switch (desc_type) {
+    /* Add the unmeasured prevalidated pages as a zero page */
+    case SEV_DESC_TYPE_SNP_SEC_MEM: return KVM_SEV_SNP_PAGE_TYPE_ZERO;
+    case SEV_DESC_TYPE_SNP_SECRETS: return KVM_SEV_SNP_PAGE_TYPE_SECRETS;
+    case SEV_DESC_TYPE_CPUID: return KVM_SEV_SNP_PAGE_TYPE_CPUID;
+    case SEV_DESC_TYPE_SNP_KERNEL_HASHES: return KVM_SEV_SNP_PAGE_TYPE_NORMAL;
+    default: return -1;
+    }
+}
+
+static void
+snp_populate_metadata_pages(SevSnpGuestState *sev_snp, OvmfSevMetadata *metadata)
+{
+    OvmfSevMetadataDesc *desc;
+    int type, ret, i;
+    void *hva;
+    MemoryRegion *mr = NULL;
+
+    for (i = 0; i < metadata->num_desc; i++) {
+        desc = &metadata->descs[i];
+
+        type = snp_metadata_desc_to_page_type(desc->type);
+        if (type < 0) {
+            error_report("%s: Invalid memory type '%d'\n", __func__, desc->type);
+            exit(1);
+        }
+
+        hva = gpa2hva(&mr, desc->base, desc->len, NULL);
+        if (!hva) {
+            error_report("%s: Failed to get HVA for GPA 0x%x sz 0x%x\n",
+                         __func__, desc->base, desc->len);
+            exit(1);
+        }
 
-        info = g_malloc0(sizeof(*info));
-        info->data = data;
+        if (type == KVM_SEV_SNP_PAGE_TYPE_CPUID) {
+            ret = snp_launch_update_cpuid(desc->base, hva, desc->len);
+        } else if (desc->type == SEV_DESC_TYPE_SNP_KERNEL_HASHES) {
+            ret = snp_launch_update_kernel_hashes(sev_snp, desc->base, hva,
+                                                  desc->len);
+        } else {
+            ret = snp_launch_update_data(desc->base, hva, desc->len, type);
+        }
 
-        return info;
+        if (ret) {
+            error_report("%s: Failed to add metadata page
gpa 0x%x+%x type %d\n", + __func__, desc->base, desc->len, desc->type); + exit(1); + } + } } -static Notifier sev_machine_done_notify = { - .notify = sev_launch_get_measure, -}; - static void -sev_launch_finish(SevGuestState *sev) +sev_snp_launch_finish(SevSnpGuestState *sev_snp) { int ret, error; + Error *local_err = NULL; + OvmfSevMetadata *metadata; + SevLaunchUpdateData *data; + struct kvm_sev_snp_launch_finish *finish = &sev_snp->kvm_finish_conf; + ConfidentialGuestSupport *cgs = CONFIDENTIAL_GUEST_SUPPORT(sev_snp); - trace_kvm_sev_launch_finish(); - ret = sev_ioctl(sev->sev_fd, KVM_SEV_LAUNCH_FINISH, 0, &error); - if (ret) { - error_report("%s: LAUNCH_FINISH ret=%d fw_error=%d '%s'", + /* + * Populate all the metadata pages if not using an IGVM file. In the case + * where an IGVM file is provided it will be used to configure the metadata + * pages directly. + */ + if (!cgs_is_igvm(cgs)) { + /* + * To boot the SNP guest, the hypervisor is required to populate the CPUID + * and Secrets page before finalizing the launch flow. The location of + * the secrets and CPUID page is available through the OVMF metadata GUID. + */ + metadata = pc_system_get_ovmf_sev_metadata_ptr(); + if (metadata == NULL) { + error_report("%s: Failed to locate SEV metadata header\n", __func__); + exit(1); + } + + /* Populate all the metadata pages */ + snp_populate_metadata_pages(sev_snp, metadata); + } + + QTAILQ_FOREACH(data, &launch_update, next) { + ret = sev_snp_launch_update(sev_snp, data); + if (ret) { + exit(1); + } + } + + trace_kvm_sev_snp_launch_finish(sev_snp->id_block, sev_snp->id_auth, + sev_snp->host_data); + ret = sev_ioctl(SEV_COMMON(sev_snp)->sev_fd, KVM_SEV_SNP_LAUNCH_FINISH, + finish, &error); + if (ret || error) { + error_report("%s: SNP_LAUNCH_FINISH ret=%d fw_error=%d '%s'", __func__, ret, error, fw_error_to_str(error)); exit(1); } - sev_set_guest_state(sev, SEV_STATE_RUNNING); + sev_set_guest_state(SEV_COMMON(sev_snp), SEV_STATE_RUNNING); /* add migration blocker */ error_setg(&sev_mig_blocker, - "SEV: Migration is not implemented"); - migrate_add_blocker(&sev_mig_blocker, &error_fatal); + "SEV-SNP: Migration is not implemented"); + ret = migrate_add_blocker(&sev_mig_blocker, &local_err); + if (local_err) { + error_report_err(local_err); + error_free(sev_mig_blocker); + exit(1); + } } + static void sev_vm_state_change(void *opaque, bool running, RunState state) { - SevGuestState *sev = opaque; + SevCommonState *sev_common = opaque; if (running) { - if (!sev_check_state(sev, SEV_STATE_RUNNING)) { - sev_launch_finish(sev); + if (!sev_check_state(sev_common, SEV_STATE_RUNNING)) { + if (sev_snp_enabled()) { + sev_snp_launch_finish(SEV_SNP_GUEST(sev_common)); + } else { + sev_launch_finish(SEV_GUEST(sev_common)); + } } } } -int sev_kvm_init(ConfidentialGuestSupport *cgs, Error **errp) +int sev_kvm_init(MachineState *ms, Error **errp) { - SevGuestState *sev - = (SevGuestState *)object_dynamic_cast(OBJECT(cgs), TYPE_SEV_GUEST); + ConfidentialGuestSupport *cgs = ms->cgs; + SevCommonState *sev_common = SEV_COMMON(cgs); char *devname; int ret, fw_error, cmd; uint32_t ebx; uint32_t host_cbitpos; struct sev_user_data_status status = {}; + void *init_args = NULL; - if (!sev) { + if (!sev_common) { return 0; } @@ -926,8 +2143,7 @@ int sev_kvm_init(ConfidentialGuestSupport *cgs, Error **errp) return -1; } - sev_guest = sev; - sev->state = SEV_STATE_UNINIT; + sev_common->state = SEV_STATE_UNINIT; host_cpuid(0x8000001F, 0, NULL, &ebx, NULL, NULL); host_cbitpos = ebx & 0x3f; @@ -937,9 +2153,9 @@ int 
sev_kvm_init(ConfidentialGuestSupport *cgs, Error **errp) * register of CPUID 0x8000001F. No need to verify the range as the * comparison against the host value accomplishes that. */ - if (host_cbitpos != sev->cbitpos) { + if (host_cbitpos != sev_common->cbitpos) { error_setg(errp, "%s: cbitpos check failed, host '%d' requested '%d'", - __func__, host_cbitpos, sev->cbitpos); + __func__, host_cbitpos, sev_common->cbitpos); goto err; } @@ -948,16 +2164,16 @@ int sev_kvm_init(ConfidentialGuestSupport *cgs, Error **errp) * the EBX register of CPUID 0x8000001F, so verify the supplied value * is in the range of 1 to 63. */ - if (sev->reduced_phys_bits < 1 || sev->reduced_phys_bits > 63) { + if (sev_common->reduced_phys_bits < 1 || sev_common->reduced_phys_bits > 63) { error_setg(errp, "%s: reduced_phys_bits check failed," " it should be in the range of 1 to 63, requested '%d'", - __func__, sev->reduced_phys_bits); + __func__, sev_common->reduced_phys_bits); goto err; } - devname = object_property_get_str(OBJECT(sev), "sev-device", NULL); - sev->sev_fd = open(devname, O_RDWR); - if (sev->sev_fd < 0) { + devname = object_property_get_str(OBJECT(sev_common), "sev-device", NULL); + sev_common->sev_fd = open(devname, O_RDWR); + if (sev_common->sev_fd < 0) { error_setg(errp, "%s: Failed to open %s '%s'", __func__, devname, strerror(errno)); g_free(devname); @@ -965,7 +2181,7 @@ int sev_kvm_init(ConfidentialGuestSupport *cgs, Error **errp) } g_free(devname); - ret = sev_platform_ioctl(sev->sev_fd, SEV_PLATFORM_STATUS, &status, + ret = sev_platform_ioctl(sev_common->sev_fd, SEV_PLATFORM_STATUS, &status, &fw_error); if (ret) { error_setg(errp, "%s: failed to get platform status ret=%d " @@ -973,11 +2189,23 @@ int sev_kvm_init(ConfidentialGuestSupport *cgs, Error **errp) fw_error_to_str(fw_error)); goto err; } - sev->build_id = status.build; - sev->api_major = status.api_major; - sev->api_minor = status.api_minor; + sev_common->build_id = status.build; + sev_common->api_major = status.api_major; + sev_common->api_minor = status.api_minor; - if (sev_es_enabled()) { + if (sev_snp_enabled()) { + SevSnpGuestState *sev_snp_guest = SEV_SNP_GUEST(sev_common); + if (!kvm_kernel_irqchip_allowed()) { + error_setg(errp, "%s: SEV-SNP guests require in-kernel irqchip support", + __func__); + goto err; + } + + cmd = KVM_SEV_SNP_INIT; + init_args = (void *)&sev_snp_guest->kvm_init_conf; + trace_kvm_sev_init("SEV-SNP", sev_snp_guest->kvm_init_conf.flags); + ms->require_guest_memfd = true; + } else if (sev_es_enabled()) { if (!kvm_kernel_irqchip_allowed()) { error_report("%s: SEV-ES guests require in-kernel irqchip support", __func__); @@ -991,47 +2219,73 @@ int sev_kvm_init(ConfidentialGuestSupport *cgs, Error **errp) goto err; } cmd = KVM_SEV_ES_INIT; + trace_kvm_sev_init("SEV-ES", 0); } else { cmd = KVM_SEV_INIT; + trace_kvm_sev_init("SEV", 0); } - trace_kvm_sev_init(); - ret = sev_ioctl(sev->sev_fd, cmd, NULL, &fw_error); + ret = sev_ioctl(sev_common->sev_fd, cmd, init_args, &fw_error); if (ret) { error_setg(errp, "%s: failed to initialize ret=%d fw_error=%d '%s'", __func__, ret, fw_error, fw_error_to_str(fw_error)); goto err; } - ret = sev_launch_start(sev); + if (sev_snp_enabled()) { + ret = sev_snp_launch_start(SEV_SNP_GUEST(sev_common)); + } else { + ret = sev_launch_start(SEV_GUEST(sev_common)); + } + if (ret) { error_setg(errp, "%s: failed to create encryption context", __func__); goto err; } - ram_block_notifier_add(&sev_ram_notifier); - qemu_add_machine_init_done_notifier(&sev_machine_done_notify); - 
qemu_add_vm_change_state_handler(sev_vm_state_change, sev); + if (!sev_snp_enabled()) { + ram_block_notifier_add(&sev_ram_notifier); + } + + /* + * The machine done notify event is used by the SEV guest to get the + * measurement of the encrypted images. When SEV-SNP is enabled, the + * measurement is part of the attestation. So skip registering the + * notifier. + */ + if (!sev_snp_enabled()) { + qemu_add_machine_init_done_notifier(&sev_machine_done_notify); + } + + qemu_add_vm_change_state_handler(sev_vm_state_change, sev_common); cgs->ready = true; return 0; err: - sev_guest = NULL; ram_block_discard_disable(false); return -1; } int -sev_encrypt_flash(uint8_t *ptr, uint64_t len, Error **errp) +sev_encrypt_flash(hwaddr gpa, uint8_t *ptr, uint64_t len, Error **errp) { - if (!sev_guest) { + SevCommonState *sev_common = SEV_COMMON(MACHINE(qdev_get_machine())->cgs); + + if (!sev_common) { return 0; } /* if SEV is in update state then encrypt the data else do nothing */ - if (sev_check_state(sev_guest, SEV_STATE_LAUNCH_UPDATE)) { - int ret = sev_launch_update_data(sev_guest, ptr, len); + if (sev_check_state(sev_common, SEV_STATE_LAUNCH_UPDATE)) { + int ret; + + if (sev_snp_enabled()) { + ret = snp_launch_update_data(gpa, ptr, len, + KVM_SEV_SNP_PAGE_TYPE_NORMAL); + } else { + ret = sev_launch_update_data(SEV_GUEST(sev_common), ptr, len); + } if (ret < 0) { error_setg(errp, "SEV: Failed to encrypt pflash rom"); return ret; @@ -1050,16 +2304,17 @@ int sev_inject_launch_secret(const char *packet_hdr, const char *secret, void *hva; gsize hdr_sz = 0, data_sz = 0; MemoryRegion *mr = NULL; + SevCommonState *sev_common = SEV_COMMON(MACHINE(qdev_get_machine())->cgs); - if (!sev_guest) { + if (!sev_common) { error_setg(errp, "SEV not enabled for guest"); return 1; } /* secret can be injected only in this state */ - if (!sev_check_state(sev_guest, SEV_STATE_LAUNCH_SECRET)) { + if (!sev_check_state(sev_common, SEV_STATE_LAUNCH_SECRET)) { error_setg(errp, "SEV: Not in correct state. (LSECRET) %x", - sev_guest->state); + sev_common->state); return 1; } @@ -1093,7 +2348,7 @@ int sev_inject_launch_secret(const char *packet_hdr, const char *secret, trace_kvm_sev_launch_secret(gpa, input.guest_uaddr, input.trans_uaddr, input.trans_len); - ret = sev_ioctl(sev_guest->sev_fd, KVM_SEV_LAUNCH_SECRET, + ret = sev_ioctl(sev_common->sev_fd, KVM_SEV_LAUNCH_SECRET, &input, &error); if (ret) { error_setg(errp, "SEV: failed to inject secret ret=%d fw_error=%d '%s'", @@ -1196,34 +2451,99 @@ sev_es_find_reset_vector(void *flash_ptr, uint64_t flash_size, return sev_es_parse_reset_block(info, addr); } -void sev_es_set_reset_vector(CPUState *cpu) -{ - X86CPU *x86; - CPUX86State *env; - /* Only update if we have valid reset information */ - if (!sev_guest || !sev_guest->reset_data_valid) { - return; - } +static void seg_to_vmsa(const SegmentCache *cpu_seg, struct vmcb_seg *vmsa_seg) +{ + vmsa_seg->selector = cpu_seg->selector; + vmsa_seg->base = cpu_seg->base; + vmsa_seg->limit = cpu_seg->limit; + vmsa_seg->attrib = FLAGS_SEGCACHE_TO_VMSA(cpu_seg->flags); +} - /* Do not update the BSP reset state */ - if (cpu->cpu_index == 0) { - return; - } +static void initialize_vmsa(const CPUState *cpu, struct sev_es_save_area *vmsa) +{ + const X86CPU *x86 = X86_CPU(cpu); + const CPUX86State *env = &x86->env; - x86 = X86_CPU(cpu); - env = &x86->env; + /* + * Initialize the SEV-ES save area from the current state of + * the CPU. 
The entire state does not need to be copied, only the state + * that is copied back to the CPUState in sev_apply_cpu_context. + */ + memset(vmsa, 0, sizeof(struct sev_es_save_area)); + vmsa->efer = env->efer; + vmsa->cr0 = env->cr[0]; + vmsa->cr3 = env->cr[3]; + vmsa->cr4 = env->cr[4]; + + seg_to_vmsa(&env->segs[R_CS], &vmsa->cs); + seg_to_vmsa(&env->segs[R_DS], &vmsa->ds); + seg_to_vmsa(&env->segs[R_ES], &vmsa->es); + seg_to_vmsa(&env->segs[R_FS], &vmsa->fs); + seg_to_vmsa(&env->segs[R_GS], &vmsa->gs); + seg_to_vmsa(&env->segs[R_SS], &vmsa->ss); + + seg_to_vmsa(&env->gdt, &vmsa->gdtr); + seg_to_vmsa(&env->idt, &vmsa->idtr); + + vmsa->rax = env->regs[R_EAX]; + vmsa->rcx = env->regs[R_ECX]; + vmsa->rdx = env->regs[R_EDX]; + vmsa->rbx = env->regs[R_EBX]; + vmsa->rsp = env->regs[R_ESP]; + vmsa->rbp = env->regs[R_EBP]; + vmsa->rsi = env->regs[R_ESI]; + vmsa->rdi = env->regs[R_EDI]; + +#ifdef TARGET_X86_64 + vmsa->r8 = env->regs[R_R8]; + vmsa->r9 = env->regs[R_R9]; + vmsa->r10 = env->regs[R_R10]; + vmsa->r11 = env->regs[R_R11]; + vmsa->r12 = env->regs[R_R12]; + vmsa->r13 = env->regs[R_R13]; + vmsa->r14 = env->regs[R_R14]; + vmsa->r15 = env->regs[R_R15]; +#endif + + vmsa->rip = env->eip; +} - cpu_x86_load_seg_cache(env, R_CS, 0xf000, sev_guest->reset_cs, 0xffff, - DESC_P_MASK | DESC_S_MASK | DESC_CS_MASK | - DESC_R_MASK | DESC_A_MASK); +static void sev_es_set_ap_context(uint32_t reset_addr) +{ + CPUState *cpu; + struct sev_es_save_area vmsa; + SegmentCache cs; + + cs.selector = 0xf000; + cs.base = reset_addr & 0xffff0000; + cs.limit = 0xffff; + cs.flags = DESC_P_MASK | DESC_S_MASK | DESC_CS_MASK | + DESC_R_MASK | DESC_A_MASK; + + CPU_FOREACH(cpu) { + if (cpu->cpu_index == 0) { + /* Do not update the BSP reset state */ + continue; + } + initialize_vmsa(cpu, &vmsa); + seg_to_vmsa(&cs, &vmsa.cs); + vmsa.rip = reset_addr & 0x0000ffff; + sev_set_cpu_context(cpu->cpu_index, &vmsa, + sizeof(struct sev_es_save_area), 0); + sev_apply_cpu_context(cpu); + } +} - env->eip = sev_guest->reset_ip; +void sev_es_set_reset_vector(CPUState *cpu) +{ + if (sev_enabled()) { + sev_apply_cpu_context(cpu); + } } int sev_es_save_reset_vector(void *flash_ptr, uint64_t flash_size) { - CPUState *cpu; uint32_t addr; int ret; @@ -1238,16 +2558,13 @@ int sev_es_save_reset_vector(void *flash_ptr, uint64_t flash_size) return ret; } + /* + * The reset vector is saved into a CPU context for each AP but not for + * the BSP. This is applied during guest startup or when the CPU is reset. + */ if (addr) { - sev_guest->reset_cs = addr & 0xffff0000; - sev_guest->reset_ip = addr & 0x0000ffff; - sev_guest->reset_data_valid = true; - - CPU_FOREACH(cpu) { - sev_es_set_reset_vector(cpu); - } + sev_es_set_ap_context(addr); } - return 0; } @@ -1269,44 +2586,16 @@ static const QemuUUID sev_cmdline_entry_guid = { 0x4d, 0x36, 0xab, 0x2a) }; -/* - * Add the hashes of the linux kernel/initrd/cmdline to an encrypted guest page - * which is included in SEV's initial memory measurement. 
- */ -bool sev_add_kernel_loader_hashes(SevKernelLoaderContext *ctx, Error **errp) +static bool build_kernel_loader_hashes(PaddedSevHashTable *padded_ht, + SevKernelLoaderContext *ctx, + Error **errp) { - uint8_t *data; - SevHashTableDescriptor *area; SevHashTable *ht; - PaddedSevHashTable *padded_ht; uint8_t cmdline_hash[HASH_SIZE]; uint8_t initrd_hash[HASH_SIZE]; uint8_t kernel_hash[HASH_SIZE]; uint8_t *hashp; size_t hash_len = HASH_SIZE; - hwaddr mapped_len = sizeof(*padded_ht); - MemTxAttrs attrs = { 0 }; - bool ret = true; - - /* - * Only add the kernel hashes if the sev-guest configuration explicitly - * stated kernel-hashes=on. - */ - if (!sev_guest->kernel_hashes) { - return false; - } - - if (!pc_system_ovmf_table_find(SEV_HASH_TABLE_RV_GUID, &data, NULL)) { - error_setg(errp, "SEV: kernel specified but guest firmware " - "has no hashes table GUID"); - return false; - } - area = (SevHashTableDescriptor *)data; - if (!area->base || area->size < sizeof(PaddedSevHashTable)) { - error_setg(errp, "SEV: guest firmware hashes table area is invalid " - "(base=0x%x size=0x%x)", area->base, area->size); - return false; - } /* * Calculate hash of kernel command-line with the terminating null byte. If @@ -1343,16 +2632,6 @@ bool sev_add_kernel_loader_hashes(SevKernelLoaderContext *ctx, Error **errp) } assert(hash_len == HASH_SIZE); - /* - * Populate the hashes table in the guest's memory at the OVMF-designated - * area for the SEV hashes table - */ - padded_ht = address_space_map(&address_space_memory, area->base, - &mapped_len, true, attrs); - if (!padded_ht || mapped_len != sizeof(*padded_ht)) { - error_setg(errp, "SEV: cannot map hashes table guest memory area"); - return false; - } ht = &padded_ht->ht; ht->guid = sev_hash_table_header_guid; @@ -1373,7 +2652,73 @@ bool sev_add_kernel_loader_hashes(SevKernelLoaderContext *ctx, Error **errp) /* zero the excess data so the measurement can be reliably calculated */ memset(padded_ht->padding, 0, sizeof(padded_ht->padding)); - if (sev_encrypt_flash((uint8_t *)padded_ht, sizeof(*padded_ht), errp) < 0) { + return true; +} + +/* + * Add the hashes of the linux kernel/initrd/cmdline to an encrypted guest page + * which is included in SEV's initial memory measurement. + */ +bool sev_add_kernel_loader_hashes(SevKernelLoaderContext *ctx, Error **errp) +{ + uint8_t *data; + SevHashTableDescriptor *area; + PaddedSevHashTable *padded_ht; + hwaddr mapped_len = sizeof(*padded_ht); + MemTxAttrs attrs = { 0 }; + bool ret = true; + SevCommonState *sev_common = SEV_COMMON(MACHINE(qdev_get_machine())->cgs); + + /* + * Only add the kernel hashes if the sev-guest configuration explicitly + * stated kernel-hashes=on. + */ + if (!sev_common->kernel_hashes) { + return false; + } + + if (!pc_system_ovmf_table_find(SEV_HASH_TABLE_RV_GUID, &data, NULL)) { + error_setg(errp, "SEV: kernel specified but guest firmware " + "has no hashes table GUID"); + return false; + } + + area = (SevHashTableDescriptor *)data; + if (!area->base || area->size < sizeof(PaddedSevHashTable)) { + error_setg(errp, "SEV: guest firmware hashes table area is invalid " + "(base=0x%x size=0x%x)", area->base, area->size); + return false; + } + + if (sev_snp_enabled()) { + /* + * SNP: Populate the hashes table in an area that later in + * snp_launch_update_kernel_hashes() will be copied to the guest memory + * and encrypted. 
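The hashing step that build_kernel_loader_hashes() performs can be illustrated in isolation. The sketch below is a stand-in under stated assumptions: it uses plain GLib's GChecksum rather than the crypto layer QEMU actually goes through, and sha256_digest() is a hypothetical helper. The one behavior it deliberately mirrors from the patch is that the command line is hashed including its terminating NUL byte.

/*
 * Minimal sketch of the SHA-256 measurement of a kernel command line,
 * GLib-only (illustrative; not QEMU's actual crypto path).
 */
#include <glib.h>
#include <string.h>

#define HASH_SIZE 32  /* SHA-256 digest length used by the hashes table */

static gboolean sha256_digest(const void *data, gsize len,
                              guint8 out[HASH_SIZE])
{
    gsize digest_len = HASH_SIZE;
    GChecksum *cs = g_checksum_new(G_CHECKSUM_SHA256);

    if (!cs) {
        return FALSE;
    }
    g_checksum_update(cs, data, len);
    g_checksum_get_digest(cs, out, &digest_len);
    g_checksum_free(cs);
    return digest_len == HASH_SIZE;
}

int main(void)
{
    const char *cmdline = "console=ttyS0 root=/dev/sda1";
    guint8 cmdline_hash[HASH_SIZE];

    /* + 1 so the terminating NUL byte is part of the measured data */
    if (!sha256_digest(cmdline, strlen(cmdline) + 1, cmdline_hash)) {
        return 1;
    }
    return 0;
}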
+ */ + SevSnpGuestState *sev_snp_guest = SEV_SNP_GUEST(sev_common); + sev_snp_guest->kernel_hashes_offset = area->base & ~TARGET_PAGE_MASK; + sev_snp_guest->kernel_hashes_data = g_new0(PaddedSevHashTable, 1); + return build_kernel_loader_hashes(sev_snp_guest->kernel_hashes_data, ctx, errp); + } + + /* + * Populate the hashes table in the guest's memory at the OVMF-designated + * area for the SEV hashes table + */ + padded_ht = address_space_map(&address_space_memory, area->base, + &mapped_len, true, attrs); + if (!padded_ht || mapped_len != sizeof(*padded_ht)) { + error_setg(errp, "SEV: cannot map hashes table guest memory area"); + return false; + } + + if (build_kernel_loader_hashes(padded_ht, ctx, errp)) { + if (sev_encrypt_flash(area->base, (uint8_t *)padded_ht, + sizeof(*padded_ht), errp) < 0) { + ret = false; + } + } else { ret = false; } @@ -1383,10 +2728,250 @@ bool sev_add_kernel_loader_hashes(SevKernelLoaderContext *ctx, Error **errp) return ret; } +#define GHCB_MSR_PSC_OP_PRIVATE 1 +#define GHCB_MSR_PSC_OP_SHARED 2 + +#define GHCB_SHARED_BUF_SIZE 0x7f0 + +struct ghcb_save_area { + uint8_t padding[0x390]; + uint64_t sw_exit_code; + uint64_t sw_exit_info1; + uint64_t sw_exit_info2; +} __attribute__((__packed__)); + +struct ghcb { + struct ghcb_save_area save; + uint8_t reserved_save[0x800 - sizeof(struct ghcb_save_area)]; + + uint8_t shared_buffer[GHCB_SHARED_BUF_SIZE]; + + uint8_t reserved_1[10]; + uint16_t protocol_version; + uint16_t ghcb_usage; +} __attribute__((__packed__)); + +struct psc_hdr { + uint16_t cur_entry; + uint16_t end_entry; + uint32_t reserved; +} __attribute__((__packed__)); + +struct psc_entry { + uint64_t cur_page : 12, + gfn : 40, + operation : 4, + pagesize : 1, + reserved : 7; +} __attribute__((__packed__)); + +#define VMGEXIT_PSC_MAX_ENTRY 253 + +struct snp_psc_desc { + struct psc_hdr hdr; + struct psc_entry entries[VMGEXIT_PSC_MAX_ENTRY]; +} __attribute__((__packed__)); + +static int kvm_handle_vmgexit_psc_msr_protocol(__u64 gpa, __u8 op, __u32 *psc_ret) +{ + int ret; + + ret = kvm_convert_memory(gpa, TARGET_PAGE_SIZE, + op == KVM_USER_VMGEXIT_PSC_MSR_OP_PRIVATE); + + *psc_ret = ret; + + return ret; +} + +static int next_contig_gpa_range(struct snp_psc_desc *desc, + uint16_t *entries_processed, hwaddr *gfn_base, + int *gfn_count, bool *range_to_private) +{ + int i; + + *entries_processed = 0; + *gfn_base = 0; + *gfn_count = 0; + *range_to_private = false; + + for (i = desc->hdr.cur_entry; i <= desc->hdr.end_entry; i++) { + struct psc_entry *entry = &desc->entries[i]; + bool to_private = entry->operation == 1; + int page_count = entry->pagesize ? 512 : 1; + + if (!*gfn_count) { + *range_to_private = to_private; + *gfn_base = entry->gfn; + } + + /* When first non-adjacent entry is encountered, report back the previous range */ + if (entry->gfn != *gfn_base + *gfn_count || (to_private != *range_to_private)) { + return 0; + } + +#if 0 + trace_kvm_vmgexit_psc(entry->gfn, entry->pagesize ? 0x200000 : 0x1000, + entry->cur_page, entry->operation, to_private); +#endif + + *gfn_count += page_count; + + /* + * TODO: this should only be changed after success, but is a bit painful + * handling this in conjunction with batching up multiple entries, so + * just assume success for now. Guests don't currently seem to make use + * of this sort of per-page error handling anyway. + */ + entry->cur_page = page_count; + *entries_processed += 1; + } + + return *gfn_count ? 
0 : -ENOENT; +} + +#define PSC_ERROR_GENERIC (0x100UL << 32) + +static int kvm_handle_vmgexit_psc(__u64 shared_gpa, __u64 *psc_ret) +{ + hwaddr len = GHCB_SHARED_BUF_SIZE; + MemTxAttrs attrs = { 0 }; + struct snp_psc_desc *desc; + void *ghcb_shared_buf; + uint8_t shared_buf[GHCB_SHARED_BUF_SIZE]; + uint16_t entries_processed; + hwaddr gfn_base = 0; + int gfn_count = 0; + bool range_to_private; + + *psc_ret = 0; + ghcb_shared_buf = address_space_map(&address_space_memory, shared_gpa, + &len, true, attrs); + if (len < GHCB_SHARED_BUF_SIZE) { + g_warning("unable to map entire shared GHCB buffer, mapped size %ld (expected %d)", + len, GHCB_SHARED_BUF_SIZE); + *psc_ret = PSC_ERROR_GENERIC; + goto out_unmap; + } + memcpy(shared_buf, ghcb_shared_buf, GHCB_SHARED_BUF_SIZE); + address_space_unmap(&address_space_memory, ghcb_shared_buf, len, true, len); + + desc = (struct snp_psc_desc *)shared_buf; + + while (!next_contig_gpa_range(desc, &entries_processed, + &gfn_base, &gfn_count, &range_to_private)) { + int ret = kvm_convert_memory(gfn_base * 0x1000, gfn_count * 0x1000, + range_to_private); + if (ret) { + *psc_ret = PSC_ERROR_GENERIC; /* indicate interrupted processing */ + g_warning("error doing memory conversion: %d", ret); + break; + } + + desc->hdr.cur_entry += entries_processed; + } + + ghcb_shared_buf = address_space_map(&address_space_memory, shared_gpa, + &len, true, attrs); + if (len < GHCB_SHARED_BUF_SIZE) { + g_warning("unable to map entire shared GHCB buffer, mapped size %ld (expected %d)", + len, GHCB_SHARED_BUF_SIZE); + *psc_ret = PSC_ERROR_GENERIC; + goto out_unmap; + } + memcpy(ghcb_shared_buf, shared_buf, GHCB_SHARED_BUF_SIZE); +out_unmap: + address_space_unmap(&address_space_memory, ghcb_shared_buf, len, true, len); + + return 0; +} + +#define SNP_EXT_REQ_ERROR_INVALID_LEN 1 +#define SNP_EXT_REQ_ERROR_BUSY 2 +#define SNP_EXT_REQ_ERROR_GENERIC (1 << 31) + +static int kvm_handle_vmgexit_ext_req(__u64 gpa, __u64 *npages, __u32 *vmm_ret) +{ + SevSnpGuestState *sev_snp_guest; + MemTxAttrs attrs = { 0 }; + void *guest_buf; + hwaddr buf_sz; + gsize sz; + g_autofree gchar *contents = NULL; + GError *error = NULL; + + *vmm_ret = SNP_EXT_REQ_ERROR_GENERIC; + + if (!sev_snp_enabled()) { + return 0; + } + + sev_snp_guest = SEV_SNP_GUEST(MACHINE(qdev_get_machine())->cgs); + + if (!sev_snp_guest->certs_path) { + *vmm_ret = 0; + return 0; + } + + if (!g_file_get_contents(sev_snp_guest->certs_path, &contents, &sz, &error)) { + error_report("SEV: Failed to read '%s' (%s)", sev_snp_guest->certs_path, error->message); + g_error_free(error); + return 0; + } + + buf_sz = *npages * TARGET_PAGE_SIZE; + + if (buf_sz < sz) { + *vmm_ret = SNP_EXT_REQ_ERROR_INVALID_LEN; + *npages = DIV_ROUND_UP(sz, TARGET_PAGE_SIZE); + return 0; + } + + guest_buf = address_space_map(&address_space_memory, gpa, &buf_sz, true, attrs); + if (buf_sz < sz) { + g_warning("unable to map entire shared buffer, mapped size %ld (expected %zu)", + buf_sz, sz); + goto out_unmap; + } + + memcpy(guest_buf, contents, sz); + *vmm_ret = 0; + +out_unmap: + address_space_unmap(&address_space_memory, guest_buf, buf_sz, true, buf_sz); + + return 0; +} + +int kvm_handle_vmgexit(struct kvm_run *run) +{ + int ret; + + if (run->vmgexit.type == KVM_USER_VMGEXIT_PSC) { + ret = kvm_handle_vmgexit_psc(run->vmgexit.psc.shared_gpa, + &run->vmgexit.psc.ret); + } else if (run->vmgexit.type == KVM_USER_VMGEXIT_PSC_MSR) { + ret = kvm_handle_vmgexit_psc_msr_protocol(run->vmgexit.psc_msr.gpa, + run->vmgexit.psc_msr.op, +
&run->vmgexit.psc_msr.ret); + } else if (run->vmgexit.type == KVM_USER_VMGEXIT_EXT_GUEST_REQ) { + ret = kvm_handle_vmgexit_ext_req(run->vmgexit.ext_guest_req.data_gpa, + &run->vmgexit.ext_guest_req.data_npages, + &run->vmgexit.ext_guest_req.ret); + } else { + warn_report("KVM: unknown vmgexit type: %d", run->vmgexit.type); + ret = -1; + } + + return ret; +} + static void sev_register_types(void) { + type_register_static(&sev_common_info); type_register_static(&sev_guest_info); + type_register_static(&sev_snp_guest_info); } type_init(sev_register_types); diff --git a/target/i386/sev.h b/target/i386/sev.h index e7499c95b1e8..cbb967bd48cb 100644 --- a/target/i386/sev.h +++ b/target/i386/sev.h @@ -20,6 +20,10 @@ #include "exec/confidential-guest-support.h" +#define TYPE_SEV_COMMON "sev-common" +#define TYPE_SEV_GUEST "sev-guest" +#define TYPE_SEV_SNP_GUEST "sev-snp-guest" + #define SEV_POLICY_NODBG 0x1 #define SEV_POLICY_NOKS 0x2 #define SEV_POLICY_ES 0x4 @@ -27,6 +31,9 @@ #define SEV_POLICY_DOMAIN 0x10 #define SEV_POLICY_SEV 0x20 +#define SEV_SNP_POLICY_SMT 0x10000 +#define SEV_SNP_POLICY_DBG 0x80000 + typedef struct SevKernelLoaderContext { char *setup_data; size_t setup_size; @@ -38,25 +45,150 @@ typedef struct SevKernelLoaderContext { size_t cmdline_size; } SevKernelLoaderContext; +/* Save area definition for SEV-ES and SEV-SNP guests */ +struct QEMU_PACKED sev_es_save_area { + struct vmcb_seg es; + struct vmcb_seg cs; + struct vmcb_seg ss; + struct vmcb_seg ds; + struct vmcb_seg fs; + struct vmcb_seg gs; + struct vmcb_seg gdtr; + struct vmcb_seg ldtr; + struct vmcb_seg idtr; + struct vmcb_seg tr; + uint64_t vmpl0_ssp; + uint64_t vmpl1_ssp; + uint64_t vmpl2_ssp; + uint64_t vmpl3_ssp; + uint64_t u_cet; + uint8_t reserved_0xc8[2]; + uint8_t vmpl; + uint8_t cpl; + uint8_t reserved_0xcc[4]; + uint64_t efer; + uint8_t reserved_0xd8[104]; + uint64_t xss; + uint64_t cr4; + uint64_t cr3; + uint64_t cr0; + uint64_t dr7; + uint64_t dr6; + uint64_t rflags; + uint64_t rip; + uint64_t dr0; + uint64_t dr1; + uint64_t dr2; + uint64_t dr3; + uint64_t dr0_addr_mask; + uint64_t dr1_addr_mask; + uint64_t dr2_addr_mask; + uint64_t dr3_addr_mask; + uint8_t reserved_0x1c0[24]; + uint64_t rsp; + uint64_t s_cet; + uint64_t ssp; + uint64_t isst_addr; + uint64_t rax; + uint64_t star; + uint64_t lstar; + uint64_t cstar; + uint64_t sfmask; + uint64_t kernel_gs_base; + uint64_t sysenter_cs; + uint64_t sysenter_esp; + uint64_t sysenter_eip; + uint64_t cr2; + uint8_t reserved_0x248[32]; + uint64_t g_pat; + uint64_t dbgctl; + uint64_t br_from; + uint64_t br_to; + uint64_t last_excp_from; + uint64_t last_excp_to; + uint8_t reserved_0x298[80]; + uint32_t pkru; + uint32_t tsc_aux; + uint8_t reserved_0x2f0[24]; + uint64_t rcx; + uint64_t rdx; + uint64_t rbx; + uint64_t reserved_0x320; /* rsp already available at 0x01d8 */ + uint64_t rbp; + uint64_t rsi; + uint64_t rdi; + uint64_t r8; + uint64_t r9; + uint64_t r10; + uint64_t r11; + uint64_t r12; + uint64_t r13; + uint64_t r14; + uint64_t r15; + uint8_t reserved_0x380[16]; + uint64_t guest_exit_info_1; + uint64_t guest_exit_info_2; + uint64_t guest_exit_int_info; + uint64_t guest_nrip; + uint64_t sev_features; + uint64_t vintr_ctrl; + uint64_t guest_exit_code; + uint64_t virtual_tom; + uint64_t tlb_id; + uint64_t pcpu_id; + uint64_t event_inj; + uint64_t xcr0; + uint8_t reserved_0x3f0[16]; + + /* Floating point area */ + uint64_t x87_dp; + uint32_t mxcsr; + uint16_t x87_ftw; + uint16_t x87_fsw; + uint16_t x87_fcw; + uint16_t x87_fop; + uint16_t x87_ds; + uint16_t x87_cs; + 
uint64_t x87_rip; + uint8_t fpreg_x87[80]; + uint8_t fpreg_xmm[256]; + uint8_t fpreg_ymm[256]; +}; + +struct QEMU_PACKED sev_snp_id_authentication { + uint32_t id_key_alg; + uint32_t auth_key_algo; + uint8_t reserved[56]; + uint8_t id_block_sig[512]; + uint8_t id_key[1028]; + uint8_t reserved2[60]; + uint8_t id_key_sig[512]; + uint8_t author_key[1028]; + uint8_t reserved3[892]; +}; + #ifdef CONFIG_SEV bool sev_enabled(void); bool sev_es_enabled(void); +bool sev_snp_enabled(void); #else #define sev_enabled() 0 #define sev_es_enabled() 0 +#define sev_snp_enabled() 0 #endif uint32_t sev_get_cbit_position(void); uint32_t sev_get_reduced_phys_bits(void); bool sev_add_kernel_loader_hashes(SevKernelLoaderContext *ctx, Error **errp); -int sev_encrypt_flash(uint8_t *ptr, uint64_t len, Error **errp); +int sev_encrypt_flash(hwaddr gpa, uint8_t *ptr, uint64_t len, Error **errp); int sev_inject_launch_secret(const char *hdr, const char *secret, uint64_t gpa, Error **errp); int sev_es_save_reset_vector(void *flash_ptr, uint64_t flash_size); void sev_es_set_reset_vector(CPUState *cpu); -int sev_kvm_init(ConfidentialGuestSupport *cgs, Error **errp); +int sev_kvm_init(MachineState *ms, Error **errp); +int kvm_handle_vmgexit(struct kvm_run *run); #endif diff --git a/target/i386/trace-events b/target/i386/trace-events index 2cd8726eebb7..9b3e453f1607 100644 --- a/target/i386/trace-events +++ b/target/i386/trace-events @@ -1,7 +1,7 @@ # See docs/devel/tracing.rst for syntax documentation. # sev.c -kvm_sev_init(void) "" +kvm_sev_init(const char *type, uint64_t flags) "type %s flags 0x%" PRIx64 kvm_memcrypt_register_region(void *addr, size_t len) "addr %p len 0x%zx" kvm_memcrypt_unregister_region(void *addr, size_t len) "addr %p len 0x%zx" kvm_sev_change_state(const char *old, const char *new) "%s -> %s" @@ -11,3 +11,6 @@ kvm_sev_launch_measurement(const char *value) "data %s" kvm_sev_launch_finish(void) "" kvm_sev_launch_secret(uint64_t hpa, uint64_t hva, uint64_t secret, int len) "hpa 0x%" PRIx64 " hva 0x%" PRIx64 " data 0x%" PRIx64 " len %d" kvm_sev_attestation_report(const char *mnonce, const char *data) "mnonce %s data %s" +kvm_sev_snp_launch_start(uint64_t policy, char *gosvw) "policy 0x%" PRIx64 " gosvw %s" +kvm_sev_snp_launch_update(void *addr, uint32_t gpa, uint64_t len, const char *type) "addr %p gpa 0x%x len 0x%" PRIx64 " (%s page)" +kvm_sev_snp_launch_finish(char *id_block, char *id_auth, char *host_data) "id_block %s id_auth %s host_data %s"
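To make the PSC handling above concrete, here is a self-contained sketch of the range-coalescing idea behind next_contig_gpa_range(): adjacent descriptor entries converting in the same direction are batched, so a run of pages costs one conversion call instead of one per entry. psc_entry is a trimmed copy of the patch's struct, and do_convert() is a hypothetical stand-in for kvm_convert_memory().

/*
 * Standalone sketch of PSC entry batching: coalesce adjacent GFNs with
 * the same private/shared direction into one range, treating a 2MiB
 * entry as 512 4KiB pages, then "convert" each run in a single call.
 */
#include <stdint.h>
#include <stdio.h>

struct psc_entry {
    uint64_t cur_page  : 12,
             gfn       : 40,
             operation : 4,   /* 1 = to private, 2 = to shared */
             pagesize  : 1,   /* 0 = 4KiB, 1 = 2MiB */
             reserved  : 7;
};

static void do_convert(uint64_t gfn_base, int gfn_count, int to_private)
{
    /* stand-in for kvm_convert_memory(); end of range is exclusive */
    printf("convert gpa 0x%llx..0x%llx to %s\n",
           (unsigned long long)(gfn_base << 12),
           (unsigned long long)((gfn_base + gfn_count) << 12),
           to_private ? "private" : "shared");
}

int main(void)
{
    /* three adjacent private 4KiB pages, then one shared 2MiB page */
    struct psc_entry entries[] = {
        { .gfn = 0x100, .operation = 1, .pagesize = 0 },
        { .gfn = 0x101, .operation = 1, .pagesize = 0 },
        { .gfn = 0x102, .operation = 1, .pagesize = 0 },
        { .gfn = 0x200, .operation = 2, .pagesize = 1 },
    };
    int n = sizeof(entries) / sizeof(entries[0]);
    uint64_t base = 0;
    int count = 0, to_private = 0;

    for (int i = 0; i < n; i++) {
        int entry_private = entries[i].operation == 1;
        int pages = entries[i].pagesize ? 512 : 1;

        /* flush the previous run when adjacency or direction breaks */
        if (count && (entries[i].gfn != base + count ||
                      entry_private != to_private)) {
            do_convert(base, count, to_private);
            count = 0;
        }
        if (!count) {
            base = entries[i].gfn;
            to_private = entry_private;
        }
        count += pages;
    }
    if (count) {
        do_convert(base, count, to_private);
    }
    return 0;
}

Since each conversion ultimately becomes a KVM_SET_MEMORY_ATTRIBUTES ioctl, batching contiguous entries this way keeps a guest's bulk private/shared flips from turning into hundreds of system calls.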