Skip to content

Use mincore(2) to create diff snapshots without dirty page tracking #5274

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 17 commits into from
Jul 16, 2025
Merged
Show file tree
Hide file tree
Changes from 1 commit
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
4 changes: 4 additions & 0 deletions CHANGELOG.md
Original file line number Diff line number Diff line change
Expand Up @@ -16,6 +16,10 @@ and this project adheres to
- [#5175](https://github.com/firecracker-microvm/firecracker/pull/5175): Allow
including a custom cpu template directly in the json configuration file passed
to `--config-file` under the `cpu_config` key.
- [#5274](https://github.com/firecracker-microvm/firecracker/pull/5274): Allow
taking diff snapshots even if dirty page tracking is disabled, by using
`mincore(2)` to overapproximate the set of dirty pages. Only works if swap is
disabled.

### Changed

Expand Down
3 changes: 3 additions & 0 deletions resources/seccomp/aarch64-unknown-linux-musl.json
Original file line number Diff line number Diff line change
Expand Up @@ -28,6 +28,9 @@
{
"syscall": "write"
},
{
"syscall": "mincore"
},
{
"syscall": "writev",
"comment": "Used by the VirtIO net device to write to tap"
Expand Down
3 changes: 3 additions & 0 deletions resources/seccomp/x86_64-unknown-linux-musl.json
Original file line number Diff line number Diff line change
Expand Up @@ -28,6 +28,9 @@
{
"syscall": "write"
},
{
"syscall": "mincore"
},
{
"syscall": "writev",
"comment": "Used by the VirtIO net device to write to tap"
Expand Down
4 changes: 2 additions & 2 deletions src/vmm/src/persist.rs
Original file line number Diff line number Diff line change
Expand Up @@ -39,7 +39,7 @@ use crate::vstate::kvm::KvmState;
use crate::vstate::memory;
use crate::vstate::memory::{GuestMemoryState, GuestRegionMmap, MemoryError};
use crate::vstate::vcpu::{VcpuSendEventError, VcpuState};
use crate::vstate::vm::VmState;
use crate::vstate::vm::{VmError, VmState};
use crate::{EventManager, Vmm, vstate};

/// Holds information related to the VM that is not part of VmState.
Expand Down Expand Up @@ -134,7 +134,7 @@ pub enum MicrovmStateError {
#[derive(Debug, thiserror::Error, displaydoc::Display)]
pub enum CreateSnapshotError {
/// Cannot get dirty bitmap: {0}
DirtyBitmap(#[from] vmm_sys_util::errno::Error),
DirtyBitmap(#[from] VmError),
/// Cannot write memory file: {0}
Memory(#[from] MemoryError),
/// Cannot perform {0} on the memory backing file: {1}
Expand Down
7 changes: 0 additions & 7 deletions src/vmm/src/rpc_interface.rs
Original file line number Diff line number Diff line change
Expand Up @@ -747,13 +747,6 @@ impl RuntimeApiController {
) -> Result<VmmData, VmmActionError> {
if create_params.snapshot_type == SnapshotType::Diff {
log_dev_preview_warning("Virtual machine diff snapshots", None);

if !self.vm_resources.machine_config.track_dirty_pages {
return Err(VmmActionError::NotSupported(
"Diff snapshots are not allowed on uVMs with dirty page tracking disabled."
.to_string(),
));
}
}

let mut locked_vmm = self.vmm.lock().unwrap();
Expand Down
1 change: 0 additions & 1 deletion src/vmm/src/test_utils/mod.rs
Original file line number Diff line number Diff line change
Expand Up @@ -113,7 +113,6 @@ pub fn default_vmm_no_boot(kernel_image: Option<&str>) -> (Arc<Mutex<Vmm>>, Even
create_vmm(kernel_image, false, false)
}

/// Builds a test VMM by delegating to `create_vmm` with both of its boolean flags set to
/// `true`, returning the shared `Vmm` handle together with its `EventManager`.
///
/// NOTE(review): the two flags presumably mean "enable dirty page tracking" and "boot the
/// microVM" — confirm against the definition of `create_vmm`.
#[cfg(target_arch = "x86_64")]
pub fn dirty_tracking_vmm(kernel_image: Option<&str>) -> (Arc<Mutex<Vmm>>, EventManager) {
create_vmm(kernel_image, true, true)
}
Expand Down
67 changes: 58 additions & 9 deletions src/vmm/src/vstate/vm.rs
Original file line number Diff line number Diff line change
Expand Up @@ -5,7 +5,6 @@
// Use of this source code is governed by a BSD-style license that can be
// found in the THIRD-PARTY file.

use std::collections::HashMap;
use std::fs::OpenOptions;
use std::io::Write;
use std::path::Path;
Expand All @@ -15,6 +14,7 @@ use kvm_bindings::{KVM_MEM_LOG_DIRTY_PAGES, kvm_userspace_memory_region};
use kvm_ioctls::VmFd;
use vmm_sys_util::eventfd::EventFd;

use crate::arch::host_page_size;
pub use crate::arch::{ArchVm as Vm, ArchVmError, VmState};
use crate::logger::info;
use crate::persist::CreateSnapshotError;
Expand Down Expand Up @@ -45,6 +45,8 @@ pub enum VmError {
SetUserMemoryRegion(kvm_ioctls::Error),
/// Failed to create VM: {0}
CreateVm(kvm_ioctls::Error),
/// Failed to get KVM's dirty log: {0}
GetDirtyLog(kvm_ioctls::Error),
/// {0}
Arch(#[from] ArchVmError),
/// Error during eventfd operations: {0}
Expand All @@ -55,6 +57,8 @@ pub enum VmError {
NotEnoughMemorySlots,
/// Memory Error: {0}
VmMemory(#[from] vm_memory::Error),
/// Error calling mincore: {0}
Mincore(vmm_sys_util::errno::Error),
}

/// Contains Vm functions that are usable across CPU architectures
Expand Down Expand Up @@ -196,17 +200,21 @@ impl Vm {
}

/// Retrieves the KVM dirty bitmap for each of the guest's memory regions.
pub fn get_dirty_bitmap(&self) -> Result<DirtyBitmap, vmm_sys_util::errno::Error> {
let mut bitmap: DirtyBitmap = HashMap::new();
pub fn get_dirty_bitmap(&self) -> Result<DirtyBitmap, VmError> {
self.guest_memory()
.iter()
.zip(0u32..)
.try_for_each(|(region, slot)| {
self.fd()
.get_dirty_log(slot, u64_to_usize(region.len()))
.map(|bitmap_region| _ = bitmap.insert(slot, bitmap_region))
})?;
Ok(bitmap)
.map(|(region, slot)| {
let bitmap = match region.bitmap() {
Some(_) => self
.fd()
.get_dirty_log(slot, u64_to_usize(region.len()))
.map_err(VmError::GetDirtyLog)?,
None => mincore_bitmap(region)?,
};
Ok((slot, bitmap))
})
.collect()
}

/// Takes a snapshot of the virtual machine running inside the given [`Vmm`] and saves it to
Expand Down Expand Up @@ -278,6 +286,47 @@ impl Vm {
}
}

/// Use `mincore(2)` to overapproximate the dirty bitmap for the given memslot. To be used
/// if a diff snapshot is requested, but dirty page tracking wasn't enabled.
///
/// The returned vector holds one bit per host page of `region`, packed 64 pages per `u64`
/// word with the lowest page index in the least significant bit. A bit is set when
/// `mincore(2)` reports the corresponding page as resident.
///
/// # Errors
///
/// Returns [`VmError::Mincore`] carrying the last OS error if the `mincore(2)` call fails.
fn mincore_bitmap(region: &GuestRegionMmap) -> Result<Vec<u64>, VmError> {
    // TODO: Once Host 5.10 goes out of support, we can make this more robust and work on
    // swap-enabled systems, by doing mlock2(MLOCK_ONFAULT)/munlock() in this function (to
    // force swapped-out pages to get paged in, so that mincore will consider them incore).
    // However, on AMD (m6a/m7a) 5.10, doing so introduces a 100%/30ms regression to snapshot
    // creation, even if swap is disabled, so currently it cannot be done.

    // Mincore always works at PAGE_SIZE granularity, even if the VMA we are dealing with
    // is a hugetlbfs VMA (e.g. to report a single hugepage as "present", mincore will
    // give us 512 4k markers with the lowest bit set).
    let page_size = host_page_size();
    let region_len = u64_to_usize(region.len());

    // mincore(2) writes one status byte for every page that overlaps the queried range,
    // i.e. ceil(len / page_size) bytes. Round up so that a region whose length is not an
    // exact multiple of the page size can never cause the kernel to write past the end of
    // the status buffer.
    let num_pages = region_len.div_ceil(page_size);
    let mut mincore_bitmap = vec![0u8; num_pages];
    let mut bitmap = vec![0u64; num_pages.div_ceil(64)];

    // SAFETY: The safety invariants of GuestRegionMmap ensure that region.as_ptr() is a valid
    // userspace mapping of size region.len() bytes. The status buffer has one byte for each
    // page overlapping this userspace mapping (rounded up), which is what mincore requires.
    // Note that mincore does not operate on bitmaps like KVM_MEM_LOG_DIRTY_PAGES, but rather
    // it uses 8 bits per page (i.e. 1 byte), setting the least significant bit to 1 if the
    // page corresponding to a byte is in core (available in the page cache and resolvable
    // via just a minor page fault).
    let r = unsafe {
        libc::mincore(
            region.as_ptr().cast::<libc::c_void>(),
            region_len,
            mincore_bitmap.as_mut_ptr(),
        )
    };

    if r != 0 {
        return Err(VmError::Mincore(vmm_sys_util::errno::Error::last()));
    }

    // Compress the byte-per-page result into a bit-per-page bitmap. Only the least
    // significant bit of each status byte carries the residency information.
    for (page_idx, in_core) in mincore_bitmap.iter().enumerate() {
        bitmap[page_idx / 64] |= u64::from(*in_core & 0x1) << (page_idx % 64);
    }

    Ok(bitmap)
}

#[cfg(test)]
pub(crate) mod tests {
use vm_memory::GuestAddress;
Expand Down
59 changes: 21 additions & 38 deletions src/vmm/tests/integration_tests.rs
Original file line number Diff line number Diff line change
Expand Up @@ -100,47 +100,30 @@ fn test_pause_resume_microvm() {
vmm.lock().unwrap().stop(FcExitCode::Ok);
}

#[test]
fn test_dirty_bitmap_error() {
    // Failure path: `default_vmm` brings the microVM up with dirty page tracking OFF.
    let (vmm, _) = default_vmm(None);

    // Since no guest memory region has dirty page tracking enabled, KVM has nothing to
    // report and the underlying KVM_GET_DIRTY_LOG ioctl fails with ENOENT (errno 2).
    let err = vmm.lock().unwrap().vm.get_dirty_bitmap().unwrap_err();
    assert_eq!(err.errno(), 2);

    vmm.lock().unwrap().stop(FcExitCode::Ok);
}

#[test]
#[cfg(target_arch = "x86_64")]
fn test_dirty_bitmap_success() {
// The vmm will start with dirty page tracking = ON.
let (vmm, _) = vmm::test_utils::dirty_tracking_vmm(Some(NOISY_KERNEL_IMAGE));

// Let it churn for a while and dirty some pages...
thread::sleep(Duration::from_millis(100));
let bitmap = vmm.lock().unwrap().vm.get_dirty_bitmap().unwrap();
let num_dirty_pages: u32 = bitmap
.values()
.map(|bitmap_per_region| {
// Gently coerce to u32
let num_dirty_pages_per_region: u32 =
bitmap_per_region.iter().map(|n| n.count_ones()).sum();
num_dirty_pages_per_region
})
.sum();
assert!(num_dirty_pages > 0);
vmm.lock().unwrap().stop(FcExitCode::Ok);
let vmms = [
vmm::test_utils::dirty_tracking_vmm(Some(NOISY_KERNEL_IMAGE)),
default_vmm(Some(NOISY_KERNEL_IMAGE)),
];

for (vmm, _) in vmms {
// Let it churn for a while and dirty some pages...
thread::sleep(Duration::from_millis(100));
let bitmap = vmm.lock().unwrap().vm.get_dirty_bitmap().unwrap();
let num_dirty_pages: u32 = bitmap
.values()
.map(|bitmap_per_region| {
// Gently coerce to u32
let num_dirty_pages_per_region: u32 =
bitmap_per_region.iter().map(|n| n.count_ones()).sum();
num_dirty_pages_per_region
})
.sum();
assert!(num_dirty_pages > 0);
vmm.lock().unwrap().stop(FcExitCode::Ok);
}
}

#[test]
Expand Down
16 changes: 1 addition & 15 deletions tests/integration_tests/functional/test_snapshot_basic.py
Original file line number Diff line number Diff line change
Expand Up @@ -76,6 +76,7 @@ def test_resume(uvm_nano, microvm_factory, resume_at_restore):
assert restored_vm.state == "Paused"
restored_vm.resume()
assert restored_vm.state == "Running"
restored_vm.ssh.check_output("true")


def test_snapshot_current_version(uvm_nano):
Expand Down Expand Up @@ -390,21 +391,6 @@ def test_negative_snapshot_create(uvm_nano):
mem_file_path="memfile", snapshot_path="statefile", snapshot_type="Full"
)

vm.api.vm.patch(state="Paused")

# Try diff with dirty pages tracking disabled.
expected_msg = (
"Diff snapshots are not allowed on uVMs with dirty page tracking disabled"
)
with pytest.raises(RuntimeError, match=expected_msg):
vm.api.snapshot_create.put(
mem_file_path="memfile", snapshot_path="statefile", snapshot_type="Diff"
)
assert not os.path.exists("statefile")
assert not os.path.exists("memfile")

vm.kill()


def test_create_large_diff_snapshot(uvm_plain):
"""
Expand Down