// Copyright (c) 2020 Huawei Technologies Co.,Ltd. All rights reserved.
//
// StratoVirt is licensed under Mulan PSL v2.
// You can use this software according to the terms and conditions of the Mulan
// PSL v2.
// You may obtain a copy of Mulan PSL v2 at:
//         http://license.coscl.org.cn/MulanPSL2
// THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY
// KIND, EITHER EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO
// NON-INFRINGEMENT, MERCHANTABILITY OR FIT FOR A PARTICULAR PURPOSE.
// See the Mulan PSL v2 for more details.

use std::mem::size_of;
use std::os::unix::io::{AsRawFd, RawFd};
use std::sync::atomic::{AtomicU16, Ordering};
use std::sync::{Arc, Mutex, Weak};

use anyhow::{anyhow, bail, Context, Result};
use byteorder::{ByteOrder, LittleEndian};
use log::error;
use vfio_bindings::bindings::vfio;
use vmm_sys_util::eventfd::EventFd;
use vmm_sys_util::ioctl::ioctl_with_mut_ref;

use crate::vfio_dev::*;
use crate::VfioError;
use crate::{CONTAINERS, GROUPS};
use address_space::{AddressSpace, FileBackend, GuestAddress, HostMemMapping, Region, RegionOps};
#[cfg(target_arch = "aarch64")]
use devices::pci::config::SECONDARY_BUS_NUM;
use devices::pci::config::{
    PciConfig, RegionType, BAR_0, BAR_5, BAR_IO_SPACE, BAR_MEM_64BIT, BAR_SPACE_UNMAPPED, COMMAND,
    COMMAND_BUS_MASTER, COMMAND_INTERRUPT_DISABLE, COMMAND_IO_SPACE, COMMAND_MEMORY_SPACE,
    HEADER_TYPE, IO_BASE_ADDR_MASK, MEM_BASE_ADDR_MASK, PCIE_CONFIG_SPACE_SIZE,
    PCI_CONFIG_SPACE_SIZE, REG_SIZE,
};
use devices::pci::msix::{
    Msix, MSIX_CAP_CONTROL, MSIX_CAP_ENABLE, MSIX_CAP_FUNC_MASK, MSIX_CAP_ID, MSIX_CAP_SIZE,
    MSIX_CAP_TABLE, MSIX_TABLE_BIR, MSIX_TABLE_ENTRY_SIZE, MSIX_TABLE_OFFSET, MSIX_TABLE_SIZE_MAX,
};
use devices::pci::{
    init_multifunction, le_read_u16, le_read_u32, le_write_u16, le_write_u32, pci_ext_cap_id,
    pci_ext_cap_next, pci_ext_cap_ver, PciBus, PciDevBase, PciDevOps,
};
use devices::{pci::MsiVector, Device, DeviceBase};
use util::num_ops::ranges_overlap;
use util::unix::host_page_size;

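// A standard PCI type-0 header exposes six BARs; the expansion ROM occupies the
// next region index in the vfio region list.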
const PCI_NUM_BARS: u8 = 6;
const PCI_ROM_SLOT: u8 = 6;

struct MsixTable {
    table_bar: u8,
    table_offset: u64,
    table_size: u64,
}

struct VfioMsixInfo {
    // Table bar, table offset and table size info.
    table: MsixTable,
    // Number of MSI-X table entries.
    entries: u16,
}

struct VfioBar {
    vfio_region: VfioRegion,
    region_type: RegionType,
    size: u64,
}

struct GsiMsiRoute {
    // Irqfd written by vfio when the corresponding device vector is triggered.
    irq_fd: Option<Arc<EventFd>>,
    // Allocated GSI number, or -1 if no GSI has been allocated yet.
    gsi: i32,
    // MSI-X vector number served by this route.
    nr: u32,
}

/// VfioPciDevice is a VFIO PCI device bound to a host VFIO device.
/// It implements the PciDevOps trait for a PCI device.
pub struct VfioPciDevice {
    base: PciDevBase,
    config_size: u64,
    // Offset of pci config space region within vfio device fd.
    config_offset: u64,
    // The VFIO device this PCI device is bound to.
    vfio_device: Arc<Mutex<VfioDevice>>,
    // Cache of MSI-X setup.
    msix_info: Option<VfioMsixInfo>,
    // BAR information, excluding the expansion ROM.
    vfio_bars: Arc<Mutex<Vec<VfioBar>>>,
    // Maintains a list of GSI routes with irqfds that are registered to KVM.
    gsi_msi_routes: Arc<Mutex<Vec<GsiMsiRoute>>>,
    dev_id: Arc<AtomicU16>,
    // Multi-Function flag.
    multi_func: bool,
    mem_as: Arc<AddressSpace>,
}

impl VfioPciDevice {
    /// Create a new VFIO PCI device structure for the vfio device created by the VMM.
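    ///
    /// A minimal usage sketch (assuming `vfio_dev`, `pci_bus`, `mem_as` and `devfn`
    /// already exist; the names here are illustrative):
    ///
    /// ```ignore
    /// let dev = VfioPciDevice::new(vfio_dev, devfn, "vfio-pci-0".to_string(),
    ///     Arc::downgrade(&pci_bus), false, mem_as.clone());
    /// dev.realize()?;
    /// ```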
    pub fn new(
        vfio_device: Arc<Mutex<VfioDevice>>,
        devfn: u8,
        name: String,
        parent_bus: Weak<Mutex<PciBus>>,
        multi_func: bool,
        mem_as: Arc<AddressSpace>,
    ) -> Self {
        Self {
            // The PCI or PCIe type is unknown here; allocate enough config space for
            // either.
            base: PciDevBase {
                base: DeviceBase::new(name, true),
                config: PciConfig::new(PCIE_CONFIG_SPACE_SIZE, PCI_NUM_BARS),
                devfn,
                parent_bus,
            },
            config_size: 0,
            config_offset: 0,
            vfio_device,
            msix_info: None,
            vfio_bars: Arc::new(Mutex::new(Vec::with_capacity(PCI_NUM_BARS as usize))),
            gsi_msi_routes: Arc::new(Mutex::new(Vec::new())),
            dev_id: Arc::new(AtomicU16::new(0)),
            multi_func,
            mem_as,
        }
    }

    fn get_pci_config(&mut self) -> Result<()> {
        let argsz: u32 = size_of::<vfio::vfio_region_info>() as u32;
        let mut info = vfio::vfio_region_info {
            argsz,
            flags: 0,
            index: vfio::VFIO_PCI_CONFIG_REGION_INDEX,
            cap_offset: 0,
            size: 0,
            offset: 0,
        };

        let locked_dev = self.vfio_device.lock().unwrap();
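        // Ask the vfio driver for the offset and size of the PCI config region
        // within the device fd.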
        let ret =
            // SAFETY: Device is the owner of file, and we will verify the result is valid.
            unsafe { ioctl_with_mut_ref(&locked_dev.fd, VFIO_DEVICE_GET_REGION_INFO(), &mut info) };
        if ret < 0 {
            return Err(anyhow!(VfioError::VfioIoctl(
                "VFIO_GET_PCI_CONFIG_INFO".to_string(),
                std::io::Error::last_os_error(),
            )));
        }

        self.config_size = info.size;
        self.config_offset = info.offset;
        let mut config_data = vec![0_u8; self.config_size as usize];
        locked_dev.read_region(config_data.as_mut_slice(), self.config_offset, 0)?;
        self.base.config.config[..PCI_CONFIG_SPACE_SIZE]
            .copy_from_slice(&config_data[..PCI_CONFIG_SPACE_SIZE]);

        // If the device exposes only the 256-byte config space, there are no
        // extended caps for the guest OS to see.
        if self.config_size == PCI_CONFIG_SPACE_SIZE as u64 {
            return Ok(());
        }

        // Cache the PCI config space so that the original is not overwritten: the
        // chain of extended caps is parsed from the cached copy and then inserted
        // into the original config space.
        let mut config = PciConfig::new(PCIE_CONFIG_SPACE_SIZE, PCI_NUM_BARS);
        config.config = config_data;
        let mut next = PCI_CONFIG_SPACE_SIZE;
        while (PCI_CONFIG_SPACE_SIZE..PCIE_CONFIG_SPACE_SIZE).contains(&next) {
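            // Each extended capability starts with a 4-byte header: bits 0-15 hold
            // the cap ID, bits 16-19 the version and bits 20-31 the next pointer.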
            let header = le_read_u32(&config.config, next)?;
            let cap_id = pci_ext_cap_id(header);
            let cap_version = pci_ext_cap_ver(header);
            // Get the actual size of extended capability.
            let size = config.get_ext_cap_size(next);
            let old_next = next;
            next = pci_ext_cap_next(header);

            // Drop the following extended caps:
            // * Alternate Routing ID (0x0e): needs next-function virtualization;
            // * Single Root I/O Virtualization (0x10): read-only VF BARs confuse OVMF;
            // * Resizable BAR (0x15): can't be exported read-only.
            if cap_id == 0x0e || cap_id == 0x10 || cap_id == 0x15 {
                continue;
            }
            let offset = self
                .base
                .config
                .add_pcie_ext_cap(cap_id, size, cap_version)?;
            self.base.config.config[offset..offset + size]
                .copy_from_slice(&config.config[old_next..old_next + size]);
        }

        Ok(())
    }

    /// Disable the I/O, MMIO, bus master and INTx states, and clear the host device
    /// BAR size information. Without clearing the BAR sizes, the guest OS could read
    /// residual addresses left over from the host.
    fn pci_config_reset(&mut self) -> Result<()> {
        let mut cmd = le_read_u16(&self.base.config.config, COMMAND as usize)?;
        cmd &= !(COMMAND_IO_SPACE
            | COMMAND_MEMORY_SPACE
            | COMMAND_BUS_MASTER
            | COMMAND_INTERRUPT_DISABLE);
        le_write_u16(&mut self.base.config.config, COMMAND as usize, cmd)?;

        // Write the cleared command bits through to the physical device as well.
        let mut data = vec![0u8; 2];
        LittleEndian::write_u16(&mut data, cmd);
        self.vfio_device.lock().unwrap().write_region(
            data.as_slice(),
            self.config_offset,
            COMMAND as u64,
        )?;

        for i in 0..PCI_ROM_SLOT {
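            // Clear only the base-address bits; the read-only flag bits (I/O vs
            // memory, 64-bit, prefetchable) are preserved.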
            let offset = BAR_0 as usize + REG_SIZE * i as usize;
            let v = le_read_u32(&self.base.config.config, offset)?;
            if v & BAR_IO_SPACE as u32 != 0 {
                le_write_u32(&mut self.base.config.config, offset, v & !IO_BASE_ADDR_MASK)?;
            } else {
                le_write_u32(
                    &mut self.base.config.config,
                    offset,
                    v & !MEM_BASE_ADDR_MASK as u32,
                )?;
            }
        }

        Ok(())
    }

    /// Get MSI-X table and entry information from vfio device.
    fn get_msix_info(&mut self) -> Result<VfioMsixInfo> {
        let cap_offset = self.base.config.find_pci_cap(MSIX_CAP_ID);
        let table = le_read_u32(
            &self.base.config.config,
            cap_offset + MSIX_CAP_TABLE as usize,
        )?;

        let ctrl = le_read_u16(
            &self.base.config.config,
            cap_offset + MSIX_CAP_CONTROL as usize,
        )?;
        let entries = (ctrl & MSIX_TABLE_SIZE_MAX) + 1;
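        // ctrl[10:0] encodes (number of entries - 1); e.g. a control value of 0x001f
        // means 32 MSI-X entries.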
        // Reject anything outside the valid range [1, 0x7ff + 1].
        if !(1..=(MSIX_TABLE_SIZE_MAX + 1)).contains(&entries) {
            bail!(
                "The number of MSI-X vectors is invalid, MSI-X vectors are {}",
                entries,
            );
        }

        Ok(VfioMsixInfo {
            table: MsixTable {
                table_bar: (table as u16 & MSIX_TABLE_BIR) as u8,
                table_offset: (table & MSIX_TABLE_OFFSET) as u64,
                table_size: (entries * MSIX_TABLE_ENTRY_SIZE) as u64,
            },
            entries,
        })
    }

    /// Get vfio BAR information. The vfio device does not allow the MSI-X table area
    /// to be mmapped, so the MSI-X table area must be split from the mmappable area.
    fn bar_region_info(&mut self) -> Result<Vec<VfioBar>> {
        let mut vfio_bars: Vec<VfioBar> = Vec::new();
        let locked_dev = self.vfio_device.lock().unwrap();
        let mut infos = locked_dev
            .get_regions_info()
            .with_context(|| "Failed get vfio device regions info")?;

        for i in 0..PCI_ROM_SLOT {
            let mut data = vec![0_u8; 4];
            locked_dev.read_region(
                data.as_mut_slice(),
                self.config_offset,
                (BAR_0 + (REG_SIZE as u8) * i) as u64,
            )?;
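            // The low bits of a BAR encode its type: bit 0 set means I/O space; for
            // memory BARs, bits 1-2 encode the width (0b10 means 64-bit).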
            let mut region_type = RegionType::Mem32Bit;
            let pci_bar = LittleEndian::read_u32(&data);
            if pci_bar & BAR_IO_SPACE as u32 != 0 {
                region_type = RegionType::Io;
            } else if pci_bar & BAR_MEM_64BIT as u32 != 0 {
                region_type = RegionType::Mem64Bit;
            }
            let vfio_region = infos.remove(0);
            let size = vfio_region.size;

            vfio_bars.push(VfioBar {
                vfio_region,
                region_type,
                size,
            });
        }

        self.fixup_msix_region(&mut vfio_bars)?;

        Ok(vfio_bars)
    }

    fn fixup_msix_region(&self, vfio_bars: &mut [VfioBar]) -> Result<()> {
        let msix_info = self
            .msix_info
            .as_ref()
            .with_context(|| "Failed to get MSIX info")?;

        let vfio_bar = vfio_bars
            .get_mut(msix_info.table.table_bar as usize)
            .with_context(|| "Failed to get vfio bar info")?;
        let region = &mut vfio_bar.vfio_region;
        // If the MSI-X area has already been set up, or the region does not support
        // mapping, just return.
        if region.mmaps.len() != 1
            || region.mmaps[0].offset != 0
            || region.size != region.mmaps[0].size
        {
            return Ok(());
        }

        // Align the MSI-X table start down and its end up to the host page size.
        let page_size = host_page_size();
        let start: u64 = msix_info.table.table_offset & !(page_size - 1);
        let end: u64 = (msix_info.table.table_offset + msix_info.table.table_size + page_size - 1)
            & !(page_size - 1);
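        // e.g. with a 4 KiB page size, table_offset = 0x3800 and table_size = 0x100
        // give start = 0x3000 and end = 0x4000.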

        // The remaining area of the BAR before or after MSI-X table is remappable.
        if start == 0 {
            if end >= region.size {
                region.mmaps.clear();
            } else {
                region.mmaps[0].offset = end;
                region.mmaps[0].size = region.size - end;
            }
        } else if end >= region.size {
            region.mmaps[0].size = start;
        } else {
            region.mmaps[0].offset = 0;
            region.mmaps[0].size = start;
            region.mmaps.push(MmapInfo {
                offset: end,
                size: region.size - end,
            });
        }

        Ok(())
    }

    fn register_bars(&mut self) -> Result<()> {
        let msix_info = self
            .msix_info
            .as_ref()
            .with_context(|| "Failed to get MSIX info")?;
        let table_bar = msix_info.table.table_bar;
        let table_offset = msix_info.table.table_offset;
        let table_size = msix_info.table.table_size;
        // Create a separate region for the MSI-X table; vfio does not allow the
        // MSI-X table area to be mapped.
        let table_ops = self
            .get_table_region_ops()
            .with_context(|| "Failed to get table region ops")?;
        let bar_ops = self.get_bar_region_ops();

        for i in 0..PCI_ROM_SLOT {
            let mut vfio_bars = self.vfio_bars.lock().unwrap();
            let vfio_bar = vfio_bars
                .get_mut(i as usize)
                .with_context(|| "Failed to get vfio bar info")?;
            // Skip unimplemented bars and the upper half of a 64-bit bar.
            if vfio_bar.size == 0 {
                continue;
            }
            let size = vfio_bar.size;

            let region = Region::init_container_region(size, "VfioPci");
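            // A BAR containing the MSI-X table is split into up to three subregions:
            // the table itself plus the mmappable areas before and after it; any
            // other BAR gets a single I/O subregion.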
            let bar_region = if i == table_bar {
                region
                    .add_subregion(
                        Region::init_io_region(table_size, table_ops.clone(), "VfioBar"),
                        table_offset,
                    )
                    .with_context(|| VfioError::AddRegBar(i as usize))?;

                if table_offset > 0 {
                    region
                        .add_subregion(
                            Region::init_io_region(table_offset, bar_ops.clone(), "VfioRegion"),
                            0,
                        )
                        .with_context(|| VfioError::AddRegBar(i as usize))?;
                }

                if table_offset + table_size < size {
                    region
                        .add_subregion(
                            Region::init_io_region(
                                size - table_offset - table_size,
                                bar_ops.clone(),
                                "vfio_io_region2",
                            ),
                            table_offset + table_size,
                        )
                        .with_context(|| VfioError::AddRegBar(i as usize))?;
                }
                region
            } else {
                region
                    .add_subregion(
                        Region::init_io_region(size, bar_ops.clone(), "vfio_io_region"),
                        0,
                    )
                    .with_context(|| VfioError::AddRegBar(i as usize))?;
                region
            };

            self.base.config.register_bar(
                i as usize,
                bar_region,
                vfio_bar.region_type,
                false,
                size,
            )?;
        }

        Ok(())
    }

    fn unregister_bars(&mut self) -> Result<()> {
        let bus = self.base.parent_bus.upgrade().unwrap();
        self.base.config.unregister_bars(&bus)?;
        Ok(())
    }

    /// Create region ops for MSI-X table.
    fn get_table_region_ops(&mut self) -> Result<RegionOps> {
        let msix_info = self
            .msix_info
            .as_ref()
            .with_context(|| "Failed to get MSIX info")?;
        let table_size = msix_info.table.table_size as u32;
        let cap_offset = self.base.config.find_pci_cap(MSIX_CAP_ID);

        // Only the MSI-X enable and function-mask bits of the control register are
        // writable by the guest.
        let offset: usize = cap_offset + MSIX_CAP_CONTROL as usize;
        le_write_u16(
            &mut self.base.config.write_mask,
            offset,
            MSIX_CAP_FUNC_MASK | MSIX_CAP_ENABLE,
        )?;

        let msi_irq_manager = if let Some(pci_bus) = self.base.parent_bus.upgrade() {
            let locked_pci_bus = pci_bus.lock().unwrap();
            locked_pci_bus.get_msi_irq_manager()
        } else {
            None
        };

        let msix = Arc::new(Mutex::new(Msix::new(
            table_size,
            // PBA size: one pending bit per 16-byte table entry, packed 8 per byte.
            table_size / 128,
            cap_offset as u16,
            self.dev_id.clone(),
            msi_irq_manager,
        )));
        self.base.config.msix = Some(msix.clone());

        let cloned_msix = msix.clone();
        let read = move |data: &mut [u8], _: GuestAddress, offset: u64| -> bool {
            if offset as usize + data.len() > cloned_msix.lock().unwrap().table.len() {
                error!(
                    "Failed to read vfio msix table: data length {} plus offset {} overflows",
                    data.len(),
                    offset
                );
                return false;
            }
            data.copy_from_slice(
                &cloned_msix.lock().unwrap().table[offset as usize..(offset as usize + data.len())],
            );
            true
        };

        let cloned_dev = self.vfio_device.clone();
        let cloned_gsi_routes = self.gsi_msi_routes.clone();
        let parent_bus = self.base.parent_bus.clone();
        let dev_id = self.dev_id.clone();
        let devfn = self.base.devfn;
        let cloned_msix = msix.clone();
        let write = move |data: &[u8], _: GuestAddress, offset: u64| -> bool {
            let mut locked_msix = msix.lock().unwrap();
            locked_msix.table[offset as usize..(offset as usize + data.len())]
                .copy_from_slice(data);
            let vector = offset / MSIX_TABLE_ENTRY_SIZE as u64;
            if locked_msix.is_vector_masked(vector as u16) {
                return true;
            }
            let entry = locked_msix.get_message(vector as u16);

            let parent_bus = parent_bus.upgrade().unwrap();
            parent_bus.lock().unwrap().update_dev_id(devfn, &dev_id);
            let msix_vector = MsiVector {
                msg_addr_lo: entry.address_lo,
                msg_addr_hi: entry.address_hi,
                msg_data: entry.data,
                masked: false,
                #[cfg(target_arch = "aarch64")]
                dev_id: dev_id.load(Ordering::Acquire) as u32,
            };

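            // Lazily allocate a GSI and register the irqfd the first time this
            // vector is programmed; afterwards only the routing entry is updated.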
            let mut locked_gsi_routes = cloned_gsi_routes.lock().unwrap();
            let gsi_route = locked_gsi_routes.get_mut(vector as usize).unwrap();
            if gsi_route.irq_fd.is_none() {
                let irq_fd = EventFd::new(libc::EFD_NONBLOCK).unwrap();
                gsi_route.irq_fd = Some(Arc::new(irq_fd));
            }
            let irq_fd = gsi_route.irq_fd.clone();
            let msi_irq_manager = &cloned_msix.lock().unwrap().msi_irq_manager;
            let irq_manager = msi_irq_manager.as_ref().unwrap();
            if gsi_route.gsi == -1 {
                gsi_route.gsi = match irq_manager.allocate_irq(msix_vector) {
                    Ok(g) => g as i32,
                    Err(e) => {
                        error!("Failed to init msix vector {:?}, error is {:?}", vector, e);
                        return true;
                    }
                };

                irq_manager
                    .register_irqfd(irq_fd.unwrap(), gsi_route.gsi as u32)
                    .unwrap_or_else(|e| error!("{:?}", e));
            } else {
                irq_manager
                    .update_route_table(gsi_route.gsi as u32, msix_vector)
                    .unwrap_or_else(|e| error!("{:?}", e));
            }

            let mut locked_dev = cloned_dev.lock().unwrap();
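            // vfio needs the full set of eventfds whenever the vector count grows,
            // so re-enable irqs for vectors [0, vector]; otherwise update only the
            // single vector that changed.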
            if (vector + 1) > (locked_dev.nr_vectors as u64) {
                locked_dev
                    .disable_irqs()
                    .unwrap_or_else(|e| error!("Failed to disable irq, error is {:?}", e));

                locked_dev
                    .enable_irqs(
                        get_irq_rawfds(&locked_gsi_routes, 0, (vector + 1) as u32),
                        0,
                    )
                    .unwrap_or_else(|e| error!("Failed to enable irq, error is {:?}", e));
                locked_dev.nr_vectors = (vector + 1) as usize;
            } else {
                locked_dev
                    .enable_irqs(
                        get_irq_rawfds(&locked_gsi_routes, vector as u32, 1),
                        vector as u32,
                    )
                    .unwrap_or_else(|e| error!("Failed to enable irq, error is {:?}", e));
            }
            true
        };

        Ok(RegionOps {
            read: Arc::new(read),
            write: Arc::new(write),
        })
    }

    /// Create region ops for BARs.
    fn get_bar_region_ops(&self) -> RegionOps {
        let cloned_dev = self.vfio_device.clone();
        let cloned_bars = self.vfio_bars.clone();
        let read = move |data: &mut [u8], addr: GuestAddress, offset: u64| -> bool {
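            // Find the BAR whose guest-physical window contains this access and
            // forward the read to the corresponding vfio region.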
            for locked_bar in cloned_bars.lock().unwrap().iter() {
                if locked_bar.size == 0 {
                    continue;
                }

                let r = &locked_bar.vfio_region;
                if r.guest_phys_addr != 0
                    && addr.0 >= r.guest_phys_addr
                    && addr.0 < (r.guest_phys_addr + r.size)
                {
                    if let Err(e) =
                        cloned_dev
                            .lock()
                            .unwrap()
                            .read_region(data, r.region_offset, offset)
                    {
                        error!(
                            "Failed to read bar region, address is {}, offset is {}, error is {:?}",
                            addr.0, offset, e,
                        );
                    }
                    return true;
                }
            }
            true
        };

        let cloned_dev = self.vfio_device.clone();
        let cloned_bars = self.vfio_bars.clone();
        let write = move |data: &[u8], addr: GuestAddress, offset: u64| -> bool {
            for locked_bar in cloned_bars.lock().unwrap().iter() {
                if locked_bar.size == 0 {
                    continue;
                }

                let r = &locked_bar.vfio_region;
                if r.guest_phys_addr != 0
                    && addr.0 >= r.guest_phys_addr
                    && addr.0 < (r.guest_phys_addr + r.size)
                {
                    if let Err(e) =
                        cloned_dev
                            .lock()
                            .unwrap()
                            .write_region(data, r.region_offset, offset)
                    {
                        error!(
                            "Failed to write bar region, address is {}, offset is {}, error is {:?}",
                            addr.0, offset, e,
                        );
                    }
                    return true;
                }
            }
            true
        };

        RegionOps {
            read: Arc::new(read),
            write: Arc::new(write),
        }
    }

    /// Map BAR regions into the guest address space so that the guest OS can access
    /// device MMIO regions directly, avoiding VM exits on reads and writes.
    fn setup_bars_mmap(&mut self) -> Result<()> {
        for i in vfio::VFIO_PCI_BAR0_REGION_INDEX..vfio::VFIO_PCI_ROM_REGION_INDEX {
            let gpa = self.base.config.get_bar_address(i as usize);
            if gpa == BAR_SPACE_UNMAPPED || gpa == 0 {
                continue;
            }

            let mut bars = self.vfio_bars.lock().unwrap();
            let bar = bars
                .get_mut(i as usize)
                .with_context(|| "Failed to get bar info")?;
            let region = &mut bar.vfio_region;
            // If the bar region is unimplemented or already mapped at this address,
            // just process the next one.
            if region.size == 0 || region.guest_phys_addr == gpa {
                continue;
            }

            region.guest_phys_addr = gpa;
            if region.mmaps.is_empty() {
                continue;
            }

            let read_only = region.flags & vfio::VFIO_REGION_INFO_FLAG_WRITE == 0;

            for mmap in region.mmaps.iter() {
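                // Map this mmappable chunk of the BAR from the vfio device fd and
                // expose it to the guest as a ram-device region, so accesses go
                // straight to hardware without trapping into the VMM.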
                let dev = self.vfio_device.lock().unwrap().fd.try_clone().unwrap();
                let fb = Some(FileBackend {
                    file: Arc::new(dev),
                    offset: region.region_offset + mmap.offset,
                    page_size: host_page_size(),
                });
                let host_mmap = HostMemMapping::new(
                    GuestAddress(gpa + mmap.offset),
                    None,
                    mmap.size,
                    fb,
                    false,
                    true,
                    read_only,
                )?;

                let ram_device = Region::init_ram_device_region(Arc::new(host_mmap), "VfioRam");
                let bar = self
                    .base
                    .config
                    .bars
                    .get_mut(i as usize)
                    .with_context(|| "Failed to get pci bar info")?;
                bar.region
                    .as_ref()
                    .unwrap()
                    .add_subregion(ram_device, mmap.offset)
                    .with_context(|| VfioError::AddRegBar(i as usize))?;
            }
        }
        Ok(())
    }

    fn vfio_enable_msix(&mut self) -> Result<()> {
        let mut gsi_routes = self.gsi_msi_routes.lock().unwrap();
        if gsi_routes.is_empty() {
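            // Vector 0 gets an eventfd up front so that MSI-X can be enabled with at
            // least one trigger; the remaining vectors get theirs lazily in the
            // MSI-X table write handler.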
            let irq_fd = EventFd::new(libc::EFD_NONBLOCK).unwrap();
            let gsi_route = GsiMsiRoute {
                irq_fd: Some(Arc::new(irq_fd)),
                gsi: -1,
                nr: 0,
            };
            gsi_routes.push(gsi_route);

            let entries = self.msix_info.as_ref().unwrap().entries;
            for i in 1..entries {
                let gsi_route = GsiMsiRoute {
                    irq_fd: None,
                    gsi: -1,
                    nr: i as u32,
                };
                gsi_routes.push(gsi_route);
            }
        }
        // Register a vector of irqfds with KVM interrupts. When one of the device
        // interrupt vectors is triggered, the corresponding irqfd is written and the
        // interrupt is finally injected into the VM.
        self.vfio_device
            .lock()
            .unwrap()
            .enable_irqs(get_irq_rawfds(&gsi_routes, 0, 1), 0)
            .with_context(|| "Failed to enable irqfds in kvm")?;

        Ok(())
    }

    fn vfio_disable_msix(&mut self) -> Result<()> {
        self.vfio_device
            .lock()
            .unwrap()
            .disable_irqs()
            .with_context(|| "Failed disable irqfds in kvm")?;
        Ok(())
    }

    fn vfio_unregister_all_irqfd(&mut self) -> Result<()> {
        let routes = self.gsi_msi_routes.lock().unwrap();
        let msix = self.base.config.msix.as_ref().unwrap();
        let irq_ctrl = &msix.lock().unwrap().msi_irq_manager;
        for route in routes.iter() {
            // A gsi of -1 means no GSI was ever allocated for this vector, so no
            // irqfd was registered and there is nothing to clean up.
            if route.gsi == -1 {
                continue;
            }

            if let Some(fd) = route.irq_fd.as_ref() {
                irq_ctrl
                    .as_ref()
                    .unwrap()
                    .unregister_irqfd(fd.clone(), route.gsi as u32)?;
                irq_ctrl.as_ref().unwrap().release_irq(route.gsi as u32)?;
            }
        }
        Ok(())
    }

    fn unrealize(&mut self) -> Result<()> {
        self.vfio_disable_msix()?;
        self.vfio_unregister_all_irqfd()?;
        self.unregister_bars()?;

        let locked_dev = self.vfio_device.lock().unwrap();
        let group = locked_dev.group.upgrade().unwrap();
        let mut devices = group.devices.lock().unwrap();
        devices.remove(&locked_dev.fd.as_raw_fd());
        if devices.is_empty() {
            group.del_from_kvm_device()?;
            GROUPS.lock().unwrap().remove(&group.id);

            let container = locked_dev.container.upgrade().unwrap();
            let locked_container = container.lock().unwrap();
            let container_fd = locked_container.fd.as_raw_fd();
            let mut groups = locked_container.groups.lock().unwrap();
            groups.remove(&group.id);
            if groups.is_empty() {
                drop(groups);
                drop(locked_container);
                self.mem_as.unregister_listener(container.clone())?;
                CONTAINERS.lock().unwrap().remove(&container_fd);
            }
        }
        Ok(())
    }
}

impl Device for VfioPciDevice {
    fn device_base(&self) -> &DeviceBase {
        &self.base.base
    }

    fn device_base_mut(&mut self) -> &mut DeviceBase {
        &mut self.base.base
    }
}

impl PciDevOps for VfioPciDevice {
    fn pci_base(&self) -> &PciDevBase {
        &self.base
    }

    fn pci_base_mut(&mut self) -> &mut PciDevBase {
        &mut self.base
    }

    fn realize(mut self) -> Result<()> {
        self.init_write_mask(false)?;
        self.init_write_clear_mask(false)?;
        self.vfio_device
            .lock()
            .unwrap()
            .reset()
            .with_context(|| "Failed to reset vfio device")?;

        self.get_pci_config()
            .with_context(|| "Failed to get vfio device pci config space")?;
        self.pci_config_reset()
            .with_context(|| "Failed to reset vfio device pci config space")?;
        init_multifunction(
            self.multi_func,
            &mut self.base.config.config,
            self.base.devfn,
            self.base.parent_bus.clone(),
        )
        .with_context(|| "Failed to init vfio device multifunction.")?;

        #[cfg(target_arch = "aarch64")]
        {
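            // The MSI doorbell on aarch64 needs the requester ID, which is derived
            // from the secondary bus number and devfn.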
            let bus_num = self
                .base
                .parent_bus
                .upgrade()
                .unwrap()
                .lock()
                .unwrap()
                .number(SECONDARY_BUS_NUM as usize);
            self.dev_id = Arc::new(AtomicU16::new(self.set_dev_id(bus_num, self.base.devfn)));
        }

        self.msix_info = Some(
            self.get_msix_info()
                .with_context(|| "Failed to get MSI-X info")?,
        );
        self.vfio_bars = Arc::new(Mutex::new(
            self.bar_region_info()
                .with_context(|| "Failed to get bar region info")?,
        ));
        self.register_bars()
            .with_context(|| "Failed to register bars")?;

        let devfn = self.base.devfn;
        let dev = Arc::new(Mutex::new(self));
        let pci_bus = dev.lock().unwrap().base.parent_bus.upgrade().unwrap();
        let mut locked_pci_bus = pci_bus.lock().unwrap();
        let pci_device = locked_pci_bus.devices.get(&devfn);
        if pci_device.is_none() {
            locked_pci_bus.devices.insert(devfn, dev);
        } else {
            bail!(
                "Devfn {:?} has been used by {:?}",
                &devfn,
                pci_device.unwrap().lock().unwrap().name()
            );
        }

        Ok(())
    }

    fn unrealize(&mut self) -> Result<()> {
        if let Err(e) = VfioPciDevice::unrealize(self) {
            error!("{:?}", e);
            bail!("Failed to unrealize vfio-pci.");
        }
        Ok(())
    }

    /// Read PCI config data from the emulated config space if the range is emulated,
    /// otherwise read it from the vfio device.
    fn read_config(&mut self, offset: usize, data: &mut [u8]) {
        let size = data.len();
        // SAFETY: offset is no more than 0xfff.
        let end = offset + size;
        if end > (self.config_size as usize) || size > 4 {
            error!(
                "Failed to read pci config space at offset {} with data size {}",
                offset, size
            );
            return;
        }

        // BAR, header_type and extended caps are always controlled by StratoVirt.
        let bars_size = (BAR_5 - BAR_0) as usize + REG_SIZE;
        let ext_cfg_size = PCIE_CONFIG_SPACE_SIZE - PCI_CONFIG_SPACE_SIZE;
        if ranges_overlap(offset, size, BAR_0 as usize, bars_size).unwrap()
            || ranges_overlap(offset, size, HEADER_TYPE as usize, 2).unwrap()
            || ranges_overlap(offset, size, PCI_CONFIG_SPACE_SIZE, ext_cfg_size).unwrap()
        {
            self.base.config.read(offset, data);
            return;
        }

        if let Err(e) =
            self.vfio_device
                .lock()
                .unwrap()
                .read_region(data, self.config_offset, offset as u64)
        {
            error!("Failed to read device pci config, error is {:?}", e);
            return;
        }
        for (i, data) in data.iter_mut().enumerate().take(size) {
            if i + offset == 0x3d {
                // Clear the interrupt pin register so the guest sees no INTx support.
                *data = 0;
            }
        }
    }

    /// Write data to the emulated pci config space and the vfio device at the same
    /// time.
    fn write_config(&mut self, offset: usize, data: &[u8]) {
        let size = data.len();
        // SAFETY: offset is no more than 0xfff.
        let end = offset + size;
        if end > (self.config_size as usize) || size > 4 {
            error!(
                "Failed to write pci config space at offset {} with data size {}",
                offset, size
            );
            return;
        }

        // Let vfio device filter data to write.
        if let Err(e) =
            self.vfio_device
                .lock()
                .unwrap()
                .write_region(data, self.config_offset, offset as u64)
        {
            error!("Failed to write device pci config, error is {:?}", e);
            return;
        }

        let cap_offset = self
            .base
            .config
            .msix
            .as_ref()
            .map_or(0, |m| m.lock().unwrap().msix_cap_offset as usize);
        let was_enable = self.base.config.msix.as_ref().map_or(false, |m| {
            m.lock().unwrap().is_enabled(&self.base.config.config)
        });
        let parent_bus = self.base.parent_bus.upgrade().unwrap();
        let locked_parent_bus = parent_bus.lock().unwrap();
        self.base.config.write(
            offset,
            data,
            self.dev_id.load(Ordering::Acquire),
            #[cfg(target_arch = "x86_64")]
            Some(&locked_parent_bus.io_region),
            Some(&locked_parent_bus.mem_region),
        );

        if ranges_overlap(offset, size, COMMAND as usize, REG_SIZE).unwrap() {
            if le_read_u32(&self.base.config.config, offset).unwrap() & COMMAND_MEMORY_SPACE as u32
                != 0
            {
                if let Err(e) = self.setup_bars_mmap() {
                    error!("Failed to map bar regions, error is {:?}", e);
                }
            }
        } else if ranges_overlap(offset, size, cap_offset, MSIX_CAP_SIZE as usize).unwrap() {
            let is_enable = self.base.config.msix.as_ref().map_or(false, |m| {
                m.lock().unwrap().is_enabled(&self.base.config.config)
            });

            if !was_enable && is_enable {
                if let Err(e) = self.vfio_enable_msix() {
                    error!("Failed to enable MSI-X, error is {:?}", e);
                }
            } else if was_enable && !is_enable {
                if let Err(e) = self.vfio_disable_msix() {
                    error!("Failed to disable MSI-X, error is {:?}", e);
                }
            }
        }
    }

    fn reset(&mut self, _reset_child_device: bool) -> Result<()> {
        self.vfio_device
            .lock()
            .unwrap()
            .reset()
            .with_context(|| "Failed to reset vfio device")
    }
}

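/// Collect the raw irqfds for vectors in `[start, start + count)`, in vector order.
/// Vectors that have no eventfd yet contribute -1, which leaves their triggers
/// unset on the vfio side.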
fn get_irq_rawfds(gsi_msi_routes: &[GsiMsiRoute], start: u32, count: u32) -> Vec<RawFd> {
    let mut rawfds: Vec<RawFd> = Vec::new();
    for r in gsi_msi_routes.iter() {
        if r.nr >= start && r.nr < start + count {
            if let Some(fd) = r.irq_fd.as_ref() {
                rawfds.push(fd.as_raw_fd());
            } else {
                rawfds.push(-1);
            }
        }
    }
    rawfds
}