Skip to content

Commit 3cdac3d

Browse files
committed
feat: enable vmrt to boot with host virtiofs drive as root fs
Allow using a host virtiofs drive (exposed as a socket) as the guest root filesystem in the QEMU VM (and successfully boot from it)! This tweaks both the runtime flags passed to `vmrt` and the `init` programs in the initramfs. The whole setup is gated behind the `VIRTIOFS` Makefile flag and a `virtiofs` Cargo feature.
1 parent 29113b2 commit 3cdac3d

File tree

6 files changed

+86
-32
lines changed

6 files changed

+86
-32
lines changed

qemu/Dockerfile

Lines changed: 16 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,8 @@
11
FROM ubuntu:22.04
22

3+
ARG NUMA=false
4+
ARG VIRTIOFS=false
5+
36
RUN apt update \
47
&& \
58
\
@@ -44,14 +47,15 @@ RUN echo CONFIG_EDID=y >> /qemu/configs/devices/i386-softmmu/default.mak
4447
RUN echo CONFIG_VGA=y >> /qemu/configs/devices/i386-softmmu/default.mak
4548
RUN echo CONFIG_VGA_PCI=y >> /qemu/configs/devices/i386-softmmu/default.mak
4649
RUN echo CONFIG_PCIE_PORT=y >> /qemu/configs/devices/i386-softmmu/default.mak
50+
RUN if [ "$VIRTIOFS" = "true" ]; then \
51+
echo CONFIG_VHOST_USER_FS=y >> /qemu/configs/devices/i386-softmmu/default.mak; \
52+
fi
4753

48-
# --without-default-devices
49-
RUN mkdir build && \
50-
cd build && \
51-
/qemu/configure \
54+
RUN mkdir build && cd build && \
55+
CONFIGURE_OPTS=" \
5256
--target-list=x86_64-softmmu \
5357
--static \
54-
--audio-drv-list="" \
58+
--audio-drv-list='' \
5559
--disable-slirp \
5660
--disable-tcg-interpreter \
5761
--disable-containers \
@@ -70,14 +74,18 @@ RUN mkdir build && \
7074
--disable-bochs \
7175
--disable-bzip2 \
7276
--disable-guest-agent \
73-
--disable-numa \
74-
--disable-tcg \
7577
--disable-vnc \
7678
--disable-live-block-migration \
7779
--disable-gio \
7880
--enable-vhost-kernel \
7981
--enable-virtfs \
80-
--without-default-devices
82+
--without-default-devices" && \
83+
if [ "$NUMA" = "true" ]; then \
84+
CONFIGURE_OPTS="$CONFIGURE_OPTS --enable-memfd --enable-mem-backend"; \
85+
else \
86+
CONFIGURE_OPTS="$CONFIGURE_OPTS --disable-numa --disable-tcg"; \
87+
fi && \
88+
/qemu/configure $CONFIGURE_OPTS
8189

8290
RUN cd build && make V=1 CFLAGS+="-Os -flto" -j4
8391

qemu/Makefile

Lines changed: 7 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,7 +1,13 @@
11
all: vmrt
22

3+
VIRTIOFS ?= false
4+
NUMA ?= $(if $(filter true,$(VIRTIOFS)),true,false)
5+
36
vmrt: Dockerfile
4-
docker build -t build-qemu .
7+
docker build \
8+
--build-arg VIRTIOFS=$(VIRTIOFS) \
9+
--build-arg NUMA=$(NUMA) \
10+
-t build-qemu .
511
$(SHELL) copy_img build-qemu vmrt .
612
$(SHELL) copy_img build-qemu /qemu/pc-bios/vgabios-stdvga.bin .
713

runtime/Cargo.toml

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -118,3 +118,8 @@ path = "src/lib.rs"
118118
[[bin]]
119119
name = "ya-runtime-vm"
120120
path = "src/main.rs"
121+
122+
[features]
123+
default = []
124+
virtiofs = []
125+
numa = ["virtiofs"]

runtime/init-container/Makefile

Lines changed: 7 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -5,6 +5,9 @@ NEW_ROOT := newroot
55
# -MMD to create dependency files (*.d) on first compilation
66
CFLAGS := -MMD -std=c11 -O2 -Wall -Wextra -Werror -fPIE -pie -Iinclude/ -Wmaybe-uninitialized -Iunpacked_headers/usr/include -I$(CURDIR)/$(LIBSECCOMP_SUBMODULE)/include '-DNEW_ROOT="$(NEW_ROOT)"'
77

8+
VIRTIOFS ?= false
9+
CFLAGS += '-DVIRTIOFS=$(VIRTIOFS)'
10+
811
ifneq ($(DEBUG), "")
912
CFLAGS += -DNDEBUG
1013
endif
@@ -135,7 +138,10 @@ initramfs.cpio.gz: init mkfs $(UNPACKED_KERNEL)
135138
cp $(UNPACKED_KERNEL)/lib/modules/$(KERNEL_VER)/kernel/net/core/failover.ko initramfs
136139
cp $(UNPACKED_KERNEL)/lib/modules/$(KERNEL_VER)/kernel/net/ipv6/ipv6.ko initramfs
137140
cp $(UNPACKED_KERNEL)/lib/modules/$(KERNEL_VER)/kernel/net/packet/af_packet.ko initramfs
138-
cp $(UNPACKED_KERNEL)/lib/modules/5.10.29-0-virt/kernel/fs/fuse/fuse.ko initramfs
141+
cp $(UNPACKED_KERNEL)/lib/modules/$(KERNEL_VER)/kernel/fs/fuse/fuse.ko initramfs
142+
ifdef VIRTIOFS
143+
cp $(UNPACKED_KERNEL)/lib/modules/$(KERNEL_VER)/kernel/fs/fuse/virtiofs.ko initramfs
144+
endif
139145

140146
cp $(BUSYBOX)/$(MKFS_NAME) initramfs
141147
mkdir initramfs/$(NEW_ROOT)

runtime/init-container/src/init.c

Lines changed: 11 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -39,6 +39,10 @@
3939
#include "proto.h"
4040
#include "init-seccomp.h"
4141

42+
#ifndef VIRTIOFS
43+
#define VIRTIOFS false
44+
#endif
45+
4246
#define SYSROOT "/mnt/newroot"
4347

4448
#define CONTAINER_OF(ptr, type, member) (type *)((char *)(ptr) - offsetof(type, member))
@@ -2604,6 +2608,12 @@ static void scan_storage(struct storage_node_t **list)
26042608
free(data);
26052609
}
26062610

2611+
#ifdef VIRTIOFS
2612+
// This assumes that a virtiofs user drive with the tag "rootfs-0" is attached to QEMU (vmrt).
2613+
// A more robust and extensible approach could be to look up the list of tags in /sys/class/virtio-ports/
2614+
storage_append(list, "/mnt/image-0", "rootfs-0", "virtiofs", "", MS_RDONLY | MS_NODEV);
2615+
#endif
2616+
26072617
fflush(stderr);
26082618

26092619
for (char **p = environ; *p; ++p)
@@ -2678,7 +2688,7 @@ int main(int argc, char **argv)
26782688
if (access("/netfs.ko", R_OK) == 0) {
26792689
load_module("/netfs.ko");
26802690
}
2681-
2691+
26822692
load_module("/fscache.ko");
26832693
load_module("/af_packet.ko");
26842694
load_module("/ipv6.ko");

runtime/src/vmrt.rs

Lines changed: 40 additions & 21 deletions
Original file line numberDiff line numberDiff line change
@@ -8,7 +8,10 @@ use std::sync::Arc;
88
use futures::lock::Mutex;
99
use futures::FutureExt;
1010
use tokio::io::AsyncBufReadExt;
11-
use tokio::{io, process, spawn};
11+
use tokio::{
12+
io, process, spawn,
13+
time::{sleep, Duration},
14+
};
1215

1316
use ya_client_model::activity::exe_script_command::VolumeMount;
1417
use ya_runtime_sdk::runtime_api::server;
@@ -105,27 +108,43 @@ pub async fn start_vmrt(
105108
"virtserialport,chardev=manager_cdev,name=manager_port",
106109
];
107110

108-
let rootfs_devices: Vec<(String, String)> = deployment
109-
.task_packages
110-
.iter()
111-
.enumerate()
112-
.map(|(i, path)| {
113-
let drive = format!(
114-
"file={},cache=unsafe,readonly=on,format=raw,id=rootfs-{},if=none",
115-
path.display(),
116-
i
117-
);
118-
let device = format!("virtio-blk-pci,drive=rootfs-{},serial=rootfs-{}", i, i);
119-
(drive, device)
120-
})
121-
.collect();
122-
123-
for (drive, device) in rootfs_devices.iter() {
124-
args.push("-drive");
125-
args.push(drive);
126-
args.push("-device");
127-
args.push(device);
111+
let mut additional_args = Vec::new();
112+
if cfg!(feature = "virtiofs") {
113+
// Reading the VIRTIOFS_SOCK_PATH environment variable isn't a robust or extensible solution.
114+
// A better approach would be to update the deployment logic to retrieve this information
115+
// from the image parameters. This would allow the image creator to specify whether a host
116+
// user virtiofs drive should be used, and if so, ensure the drive is set up and mounted
117+
// on the host before attaching it to the guest VM.
118+
let socket_path = std::env::var("VIRTIOFS_SOCK_PATH")
119+
.expect("Environment variable VIRTIOFS_SOCK_PATH is not set");
120+
additional_args.extend([
121+
"-chardev".to_string(),
122+
format!("socket,id=char-0,path={socket_path}"),
123+
"-device".to_string(),
124+
format!("vhost-user-fs-pci,queue-size=1024,chardev=char-0,tag=rootfs-0"),
125+
]);
126+
} else {
127+
for (i, path) in deployment.task_packages.iter().enumerate() {
128+
additional_args.extend([
129+
"-drive".to_string(),
130+
format!(
131+
"file={},cache=unsafe,readonly=on,format=raw,id=rootfs-{i},if=none",
132+
path.display()
133+
),
134+
"-device".to_string(),
135+
format!("virtio-blk-pci,drive=rootfs-{i},serial=rootfs-{i}"),
136+
]);
137+
}
138+
}
139+
if cfg!(feature = "numa") {
140+
additional_args.extend([
141+
"-object".to_string(),
142+
format!("memory-backend-file,id=mem,size={memory_size},mem-path=/dev/shm,share=on"),
143+
"-numa".to_string(),
144+
"node,memdev=mem".to_string(),
145+
]);
128146
}
147+
args.extend(additional_args.iter().map(String::as_str));
129148

130149
cmd.args(args);
131150

0 commit comments

Comments
 (0)