// Copyright (c) 2020 Huawei Technologies Co.,Ltd. All rights reserved.
//
// StratoVirt is licensed under Mulan PSL v2.
// You can use this software according to the terms and conditions of the Mulan
// PSL v2.
// You may obtain a copy of Mulan PSL v2 at:
//         http://license.coscl.org.cn/MulanPSL2
// THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY
// KIND, EITHER EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO
// NON-INFRINGEMENT, MERCHANTABILITY OR FIT FOR A PARTICULAR PURPOSE.
// See the Mulan PSL v2 for more details.

mod cpuid;

use std::sync::{Arc, Mutex};

use anyhow::{Context, Result};
use kvm_bindings::{
    kvm_cpuid_entry2 as CpuidEntry2, kvm_debugregs as DebugRegs, kvm_fpu as Fpu,
    kvm_lapic_state as LapicState, kvm_mp_state as MpState, kvm_msr_entry as MsrEntry,
    kvm_regs as Regs, kvm_segment as Segment, kvm_sregs as Sregs, kvm_vcpu_events as VcpuEvents,
    kvm_xcrs as Xcrs, kvm_xsave as Xsave, CpuId,
    KVM_CPUID_FLAG_SIGNIFCANT_INDEX as CPUID_FLAG_SIGNIFICANT_INDEX,
    KVM_MP_STATE_RUNNABLE as MP_STATE_RUNNABLE,
    KVM_MP_STATE_UNINITIALIZED as MP_STATE_UNINITIALIZED,
};

use self::cpuid::host_cpuid;
use crate::CPU;
use migration::{
    DeviceStateDesc, FieldDesc, MigrationError, MigrationHook, MigrationManager, StateTransfer,
};
use migration_derive::{ByteCode, Desc};
use util::byte_code::ByteCode;

const ECX_EPB_SHIFT: u32 = 3;
const X86_FEATURE_HYPERVISOR: u32 = 31;
const X86_FEATURE_TSC_DEADLINE_TIMER: u32 = 24;

const MSR_LIST: &[u32] = &[
    0x0174,      // MSR_IA32_SYSENTER_CS
    0x0175,      // MSR_IA32_SYSENTER_ESP
    0x0176,      // MSR_IA32_SYSENTER_EIP
    0xc000_0081, // MSR_STAR, legacy mode SYSCALL target
    0xc000_0082, // MSR_LSTAR, long mode SYSCALL target
    0xc000_0083, // MSR_CSTAR, compat mode SYSCALL target
    0xc000_0084, // MSR_SYSCALL_MASK, EFLAGS mask for syscall
    0xc000_0102, // MSR_KERNEL_GS_BASE, SwapGS GS shadow
    0x0010,      // MSR_IA32_TSC
    0x01a0,      // MSR_IA32_MISC_ENABLE
    0x2ff,       // MSR_MTRRdefType
];

const MSR_IA32_MISC_ENABLE: u32 = 0x01a0;
const MSR_IA32_MISC_ENABLE_FAST_STRING: u64 = 0x1;

// Topology level type encodings placed in ECX[15:8] of CPUID leaves 0xb and 0x1f.
const ECX_INVALID: u32 = 0u32 << 8;
const ECX_THREAD: u32 = 1u32 << 8;
const ECX_CORE: u32 = 2u32 << 8;
const ECX_DIE: u32 = 5u32 << 8;

#[derive(Copy, Clone, Debug, PartialEq, Eq)]
pub enum X86RegsIndex {
    Regs,
    Sregs,
    Fpu,
    MpState,
    LapicState,
    MsrEntry,
    VcpuEvents,
    Xsave,
    Xcrs,
    DebugRegs,
}

/// X86 CPU boot configuration information.
#[allow(clippy::upper_case_acronyms)]
#[derive(Default, Clone, Debug)]
pub struct X86CPUBootConfig {
    pub prot64_mode: bool,
    /// Register %rip value
    pub boot_ip: u64,
    /// Register %rsp value
    pub boot_sp: u64,
    /// Boot selector
    pub boot_selector: u16,
    /// Zero page address, as the second parameter of __startup_64
    /// arch/x86/kernel/head_64.S:86
    pub zero_page: u64,
    pub code_segment: Segment,
    pub data_segment: Segment,
    pub gdt_base: u64,
    pub gdt_size: u16,
    pub idt_base: u64,
    pub idt_size: u16,
    pub pml4_start: u64,
}

#[allow(clippy::upper_case_acronyms)]
#[derive(Default, Copy, Clone, Debug)]
pub struct X86CPUTopology {
    threads: u8,
    cores: u8,
    dies: u8,
}

impl X86CPUTopology {
    pub fn new() -> Self {
        X86CPUTopology::default()
    }

    pub fn set_topology(mut self, topology: (u8, u8, u8)) -> Self {
        self.threads = topology.0;
        self.cores = topology.1;
        self.dies = topology.2;
        self
    }
}

/// The state of vCPU's register.
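/// It caches the general-purpose, segment, FPU/XSAVE, LAPIC and MSR state
/// exchanged with the hypervisor, and is the plain-old-data unit that gets
/// serialized during live migration.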
#[allow(clippy::upper_case_acronyms)]
#[repr(C)]
#[derive(Copy, Clone, Desc, ByteCode)]
#[desc_version(compat_version = "0.1.0")]
pub struct X86CPUState {
    max_vcpus: u32,
    nr_threads: u32,
    nr_cores: u32,
    nr_dies: u32,
    nr_sockets: u32,
    pub apic_id: u32,
    pub regs: Regs,
    pub sregs: Sregs,
    pub fpu: Fpu,
    pub mp_state: MpState,
    pub lapic: LapicState,
    pub msr_len: usize,
    pub msr_list: [MsrEntry; 256],
    pub cpu_events: VcpuEvents,
    pub xsave: Xsave,
    pub xcrs: Xcrs,
    pub debugregs: DebugRegs,
}

impl X86CPUState {
    /// Allocates a new `X86CPUState`.
    ///
    /// # Arguments
    ///
    /// * `vcpu_id` - ID of this `CPU`.
    /// * `max_vcpus` - Number of vcpus.
    pub fn new(vcpu_id: u32, max_vcpus: u32) -> Self {
        // Only the boot-strap processor (vCPU 0) starts runnable; application
        // processors stay uninitialized until the guest sends INIT-SIPI.
        let mp_state = MpState {
            mp_state: if vcpu_id == 0 {
                MP_STATE_RUNNABLE
            } else {
                MP_STATE_UNINITIALIZED
            },
        };
        X86CPUState {
            apic_id: vcpu_id,
            max_vcpus,
            mp_state,
            nr_threads: 1,
            nr_cores: 1,
            nr_dies: 1,
            nr_sockets: 1,
            ..Default::default()
        }
    }

    pub fn set(&mut self, cpu_state: &Arc<Mutex<X86CPUState>>) {
        let locked_cpu_state = cpu_state.lock().unwrap();
        self.max_vcpus = locked_cpu_state.max_vcpus;
        self.apic_id = locked_cpu_state.apic_id;
        self.regs = locked_cpu_state.regs;
        self.sregs = locked_cpu_state.sregs;
        self.fpu = locked_cpu_state.fpu;
        self.mp_state = locked_cpu_state.mp_state;
        self.lapic = locked_cpu_state.lapic;
        self.msr_len = locked_cpu_state.msr_len;
        self.msr_list = locked_cpu_state.msr_list;
        self.cpu_events = locked_cpu_state.cpu_events;
        self.xsave = locked_cpu_state.xsave;
        self.xcrs = locked_cpu_state.xcrs;
        self.debugregs = locked_cpu_state.debugregs;
    }

    /// Set CPU topology.
    ///
    /// # Arguments
    ///
    /// * `topology` - X86 CPU topology.
    pub fn set_cpu_topology(&mut self, topology: &X86CPUTopology) -> Result<()> {
        self.nr_threads = topology.threads as u32;
        self.nr_cores = topology.cores as u32;
        self.nr_dies = topology.dies as u32;
        Ok(())
    }

    pub fn setup_lapic(&mut self, lapic: LapicState) -> Result<()> {
        // Route LINT0 to external interrupt (ExtINT) and LINT1 to NMI before
        // entering protected mode.
        // See: https://elixir.bootlin.com/linux/v4.19.123/source/arch/x86/include/asm/apicdef.h
        const APIC_LVT0: usize = 0x350;
        const APIC_LVT1: usize = 0x360;
        const APIC_MODE_NMI: u32 = 0x4;
        const APIC_MODE_EXTINT: u32 = 0x7;
        const APIC_ID: usize = 0x20;

        self.lapic = lapic;

        // SAFETY: The member regs in struct LapicState is a c_char (i8) array
        // with 1024 entries, so it is safe to cast the pointer to u32 at the
        // 4-byte-aligned offsets APIC_LVT0, APIC_LVT1 and APIC_ID. All offsets
        // used in this unsafe block are fixed and in range.
        unsafe {
            // Bits 10:8 of an LVT entry select the delivery mode.
            let apic_lvt_lint0 = &mut self.lapic.regs[APIC_LVT0..] as *mut [i8] as *mut u32;
            *apic_lvt_lint0 &= !0x700;
            *apic_lvt_lint0 |= APIC_MODE_EXTINT << 8;

            let apic_lvt_lint1 = &mut self.lapic.regs[APIC_LVT1..] as *mut [i8] as *mut u32;
            *apic_lvt_lint1 &= !0x700;
            *apic_lvt_lint1 |= APIC_MODE_NMI << 8;
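            // The local APIC ID register (offset 0x20) holds the APIC ID in
            // its top byte, bits 31:24.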
            let apic_id = &mut self.lapic.regs[APIC_ID..] as *mut [i8] as *mut u32;
            *apic_id = self.apic_id << 24;
        }

        Ok(())
    }

    pub fn setup_regs(&mut self, boot_config: &X86CPUBootConfig) {
        self.regs = Regs {
            // Bit 1 of RFLAGS is reserved and must always be set.
            rflags: 0x0002,
            rip: boot_config.boot_ip,
            rsp: boot_config.boot_sp,
            rbp: boot_config.boot_sp,
            // The zero page address is passed to the kernel in %rsi.
            rsi: boot_config.zero_page,
            ..Default::default()
        };
    }

    pub fn setup_sregs(&mut self, sregs: Sregs, boot_config: &X86CPUBootConfig) -> Result<()> {
        self.sregs = sregs;

        // Real-mode segmentation: each segment base is its selector shifted
        // left by 4.
        self.sregs.cs.base = (boot_config.boot_selector as u64) << 4;
        self.sregs.cs.selector = boot_config.boot_selector;
        self.sregs.ds.base = (boot_config.boot_selector as u64) << 4;
        self.sregs.ds.selector = boot_config.boot_selector;
        self.sregs.es.base = (boot_config.boot_selector as u64) << 4;
        self.sregs.es.selector = boot_config.boot_selector;
        self.sregs.fs.base = (boot_config.boot_selector as u64) << 4;
        self.sregs.fs.selector = boot_config.boot_selector;
        self.sregs.gs.base = (boot_config.boot_selector as u64) << 4;
        self.sregs.gs.selector = boot_config.boot_selector;
        self.sregs.ss.base = (boot_config.boot_selector as u64) << 4;
        self.sregs.ss.selector = boot_config.boot_selector;

        if boot_config.prot64_mode {
            self.set_prot64_sregs(boot_config);
        }

        Ok(())
    }

    pub fn set_prot64_sregs(&mut self, boot_config: &X86CPUBootConfig) {
        // X86_CR0_PE: Protection Enable
        // EFER_LME: Long mode enable
        // EFER_LMA: Long mode active
        // arch/x86/include/uapi/asm/processor-flags.h
        const X86_CR0_PE: u64 = 0x1;
        const EFER_LME: u64 = 0x100;
        const EFER_LMA: u64 = 0x400;

        // X86_CR0_PG: enable Paging
        // X86_CR4_PAE: enable physical address extensions
        // arch/x86/include/uapi/asm/processor-flags.h
        const X86_CR0_PG: u64 = 0x8000_0000;
        const X86_CR4_PAE: u64 = 0x20;

        // Load the flat code/data segments described by the boot GDT into the
        // segment registers.
        self.sregs.cs = boot_config.code_segment;
        self.sregs.ds = boot_config.data_segment;
        self.sregs.es = boot_config.data_segment;
        self.sregs.fs = boot_config.data_segment;
        self.sregs.gs = boot_config.data_segment;
        self.sregs.ss = boot_config.data_segment;

        // Init the GDT register; the GDT itself has already been loaded into
        // guest memory.
        self.sregs.gdt.base = boot_config.gdt_base;
        self.sregs.gdt.limit = boot_config.gdt_size;

        // Init the IDT register; the IDT itself has already been loaded into
        // guest memory.
        self.sregs.idt.base = boot_config.idt_base;
        self.sregs.idt.limit = boot_config.idt_size;

        // Enter 64-bit protected mode: protection enable (CR0.PE), long mode
        // enable (EFER.LME) and long mode active (EFER.LMA).
        self.sregs.cr0 |= X86_CR0_PE;
        self.sregs.efer |= EFER_LME | EFER_LMA;

        // Set up the page table and enable PAE paging.
        self.sregs.cr3 = boot_config.pml4_start;
        self.sregs.cr4 |= X86_CR4_PAE;
        self.sregs.cr0 |= X86_CR0_PG;
    }

    pub fn setup_fpu(&mut self) {
        // Default value for fxregs_state.mxcsr
        // arch/x86/include/asm/fpu/types.h
        const MXCSR_DEFAULT: u32 = 0x1f80;

        self.fpu = Fpu {
            fcw: 0x37f,
            mxcsr: MXCSR_DEFAULT,
            ..Default::default()
        };
    }

    pub fn setup_msrs(&mut self) {
        // Enable fast-string operation to improve string store operations.
        for (index, msr) in MSR_LIST.iter().enumerate() {
            let data = match *msr {
                MSR_IA32_MISC_ENABLE => MSR_IA32_MISC_ENABLE_FAST_STRING,
                _ => 0u64,
            };

            self.msr_list[index] = MsrEntry {
                index: *msr,
                data,
                ..Default::default()
            };
            self.msr_len += 1;
        }
    }

    pub fn adjust_cpuid(&self, cpuid: &mut CpuId) -> Result<()> {
        if self.nr_dies < 2 {
            return Ok(());
        }

        // Intel CPU topology with multi-dies support requires CPUID[0x1f].
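        // Raise the highest supported leaf reported by leaf 0 to 0x1f if
        // needed, then append four 0x1f sub-leaf entries for setup_cpuid()
        // to fill in.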
        let entries = cpuid.as_mut_slice();
        for entry in entries.iter_mut() {
            if entry.function == 0 {
                if entry.eax >= 0x1f {
                    return Ok(());
                } else {
                    entry.eax = 0x1f;
                }
                break;
            }
        }
        for index in 0..4 {
            let entry = CpuidEntry2 {
                function: 0x1f,
                index,
                ..Default::default()
            };
            cpuid.push(entry)?;
        }
        Ok(())
    }

    pub fn setup_cpuid(&self, cpuid: &mut CpuId) -> Result<()> {
        // Bit widths of the thread, core and die levels in the x2APIC ID,
        // used as the shift values (EAX[4:0]) of CPUID leaves 0xb and 0x1f.
        let core_offset = 32u32 - (self.nr_threads - 1).leading_zeros();
        let die_offset = (32u32 - (self.nr_cores - 1).leading_zeros()) + core_offset;
        let pkg_offset = (32u32 - (self.nr_dies - 1).leading_zeros()) + die_offset;
        self.adjust_cpuid(cpuid)?;

        let entries = cpuid.as_mut_slice();
        for entry in entries.iter_mut() {
            match entry.function {
                1 => {
                    if entry.index == 0 {
                        entry.ecx |= 1u32 << X86_FEATURE_HYPERVISOR;
                        entry.ecx |= 1u32 << X86_FEATURE_TSC_DEADLINE_TIMER;
                        // EBX[31:24]: initial APIC ID; EBX[15:8]: CLFLUSH line
                        // size in 8-byte units (8 * 8 = 64 bytes).
                        entry.ebx = self.apic_id << 24 | 8 << 8;
                    }
                }
                2 => {
                    host_cpuid(
                        2,
                        0,
                        &mut entry.eax,
                        &mut entry.ebx,
                        &mut entry.ecx,
                        &mut entry.edx,
                    );
                }
                4 => {
                    // Cache info: needed for Pentium Pro compatibility.
                    // Pass through host cache info directly to the guest.
                    host_cpuid(
                        4,
                        entry.index,
                        &mut entry.eax,
                        &mut entry.ebx,
                        &mut entry.ecx,
                        &mut entry.edx,
                    );
                    entry.eax &= !0xfc00_0000;
                    if entry.eax & 0x0001_ffff != 0 && self.max_vcpus > 1 {
                        entry.eax |= (self.max_vcpus - 1) << 26;
                    }
                }
                6 => {
                    // Clear the energy-performance-bias (EPB) capability bit.
                    entry.ecx &= !(1u32 << ECX_EPB_SHIFT);
                }
                10 => {
                    // Architectural PMU leaf: expose it only for version 2
                    // with a non-zero counter count.
                    if entry.eax != 0 {
                        let version_id = entry.eax & 0xff;
                        let num_counters = entry.eax & 0xff00;
                        if version_id != 2 || num_counters == 0 {
                            entry.eax = 0;
                        }
                    }
                }
                0xb => {
                    // Extended Topology Enumeration Leaf
                    entry.edx = self.apic_id;
                    entry.ecx = entry.index & 0xff;
                    match entry.index {
                        0 => {
                            entry.eax = core_offset;
                            entry.ebx = self.nr_threads;
                            entry.ecx |= ECX_THREAD;
                        }
                        1 => {
                            entry.eax = pkg_offset;
                            entry.ebx = self.nr_threads * self.nr_cores;
                            entry.ecx |= ECX_CORE;
                        }
                        _ => {
                            entry.eax = 0;
                            entry.ebx = 0;
                            entry.ecx |= ECX_INVALID;
                        }
                    }
                }
                0x1f => {
                    // V2 Extended Topology Enumeration Leaf, adds a die level.
                    if self.nr_dies < 2 {
                        entry.eax = 0;
                        entry.ebx = 0;
                        entry.ecx = 0;
                        entry.edx = 0;
                        continue;
                    }

                    entry.edx = self.apic_id;
                    entry.ecx = entry.index & 0xff;
                    entry.flags = CPUID_FLAG_SIGNIFICANT_INDEX;
                    match entry.index {
                        0 => {
                            entry.eax = core_offset;
                            entry.ebx = self.nr_threads;
                            entry.ecx |= ECX_THREAD;
                        }
                        1 => {
                            entry.eax = die_offset;
                            entry.ebx = self.nr_cores * self.nr_threads;
                            entry.ecx |= ECX_CORE;
                        }
                        2 => {
                            entry.eax = pkg_offset;
                            entry.ebx = self.nr_dies * self.nr_cores * self.nr_threads;
                            entry.ecx |= ECX_DIE;
                        }
                        _ => {
                            entry.eax = 0;
                            entry.ebx = 0;
                            entry.ecx |= ECX_INVALID;
                        }
                    }
                }
                0x8000_0002..=0x8000_0004 => {
                    // Pass through the host CPU model name directly to the guest.
                    host_cpuid(
                        entry.function,
                        entry.index,
                        &mut entry.eax,
                        &mut entry.ebx,
                        &mut entry.ecx,
                        &mut entry.edx,
                    );
                }
                _ => (),
            }
        }

        Ok(())
    }
}

impl StateTransfer for CPU {
    fn get_state_vec(&self) -> Result<Vec<u8>> {
        let hypervisor_cpu = self.hypervisor_cpu();
        hypervisor_cpu.get_regs(self.arch_cpu.clone(), X86RegsIndex::MpState)?;
        hypervisor_cpu.get_regs(self.arch_cpu.clone(), X86RegsIndex::Regs)?;
        hypervisor_cpu.get_regs(self.arch_cpu.clone(), X86RegsIndex::Sregs)?;
        hypervisor_cpu.get_regs(self.arch_cpu.clone(), X86RegsIndex::Xsave)?;
        hypervisor_cpu.get_regs(self.arch_cpu.clone(), X86RegsIndex::Fpu)?;
        hypervisor_cpu.get_regs(self.arch_cpu.clone(), X86RegsIndex::Xcrs)?;
        hypervisor_cpu.get_regs(self.arch_cpu.clone(), X86RegsIndex::DebugRegs)?;
        hypervisor_cpu.get_regs(self.arch_cpu.clone(), X86RegsIndex::LapicState)?;
        hypervisor_cpu.get_regs(self.arch_cpu.clone(), X86RegsIndex::MsrEntry)?;
        hypervisor_cpu.get_regs(self.arch_cpu.clone(), X86RegsIndex::VcpuEvents)?;
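        // Every register class has been synced from the hypervisor into
        // arch_cpu above; the byte-level copy below is the migration payload.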
        Ok(self.arch_cpu.lock().unwrap().as_bytes().to_vec())
    }

    fn set_state(&self, state: &[u8]) -> Result<()> {
        let cpu_state = *X86CPUState::from_bytes(state)
            .with_context(|| MigrationError::FromBytesError("CPU"))?;

        let mut cpu_state_locked = self.arch_cpu.lock().unwrap();
        *cpu_state_locked = cpu_state;

        Ok(())
    }

    fn get_device_alias(&self) -> u64 {
        MigrationManager::get_desc_alias(&X86CPUState::descriptor().name).unwrap_or(!0)
    }
}

impl MigrationHook for CPU {}
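
// A minimal smoke-test sketch (not part of the original module): it checks
// the BSP/AP mp_state split done by `new()` and the real-mode segment layout
// produced by `setup_sregs()`. The `Default` values stand in for state that
// would normally be fetched from the hypervisor.
#[cfg(test)]
mod tests {
    use super::*;

    #[test]
    fn test_mp_state_of_bsp_and_ap() {
        // vCPU 0 is the boot-strap processor and starts runnable; the other
        // vCPUs wait uninitialized for an INIT-SIPI.
        let bsp = X86CPUState::new(0, 4);
        let ap = X86CPUState::new(1, 4);
        assert_eq!(bsp.mp_state.mp_state, MP_STATE_RUNNABLE);
        assert_eq!(ap.mp_state.mp_state, MP_STATE_UNINITIALIZED);
    }

    #[test]
    fn test_real_mode_sregs() {
        let boot_config = X86CPUBootConfig {
            prot64_mode: false,
            boot_selector: 0x1000,
            ..Default::default()
        };
        let mut cpu_state = X86CPUState::new(0, 1);
        cpu_state
            .setup_sregs(Sregs::default(), &boot_config)
            .unwrap();
        // In real mode every segment base is the selector shifted left by 4.
        assert_eq!(cpu_state.sregs.cs.selector, boot_config.boot_selector);
        assert_eq!(
            cpu_state.sregs.cs.base,
            (boot_config.boot_selector as u64) << 4
        );
        assert_eq!(cpu_state.sregs.ss.base, cpu_state.sregs.cs.base);
    }
}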