diff --git a/Justfile b/Justfile index bd569fa64..49b8405aa 100644 --- a/Justfile +++ b/Justfile @@ -24,9 +24,11 @@ export CROSS_CONTAINER_GID := if path_exists("/dev/kvm") == "true" { kvm-gid } e root := justfile_directory() default-target := "debug" -simpleguest_source := "src/tests/rust_guests/simpleguest/target/x86_64-hyperlight-none" -dummyguest_source := "src/tests/rust_guests/dummyguest/target/x86_64-hyperlight-none" -witguest_source := "src/tests/rust_guests/witguest/target/x86_64-hyperlight-none" +hyperlight-target-arch := env("HYPERLIGHT_TARGET", arch()) +hyperlight-target := if hyperlight-target-arch == "x86_64" { "x86_64-hyperlight-none" } else if hyperlight-target-arch == "aarch64" { "aarch64-hyperlight-none" } else { error("Unsupported architecture: " + arch()) } +simpleguest_source := "src/tests/rust_guests/simpleguest/target/" + hyperlight-target +dummyguest_source := "src/tests/rust_guests/dummyguest/target/" + hyperlight-target +witguest_source := "src/tests/rust_guests/witguest/target/" + hyperlight-target rust_guests_bin_dir := "src/tests/rust_guests/bin" ################ diff --git a/c.just b/c.just index 92bcf8f6c..54cf0cc37 100644 --- a/c.just +++ b/c.just @@ -1,8 +1,9 @@ mkdir := if os() == "windows" { "mkdir -f -p" } else { "mkdir -p"} # Elf options +hyperlight-target-c := if hyperlight-target-arch == "x86_64" { "x86_64-unknown-linux-none" } else if hyperlight-target-arch == "aarch64" { "aarch64-unknown-linux-none" } else { error("Unsupported architecture: " + hyperlight-target-arch) } # We don't support stack protectors at the moment, but Arch Linux clang auto-enables them for -linux platforms, so explicitly disable them. 
-c-compile-options-elf := '-nostdlibinc -H --target=x86_64-unknown-linux-none -fno-stack-protector -fstack-clash-protection -mstack-probe-size=4096 -fPIC' +c-compile-options-elf := f'-nostdlibinc -H --target={{hyperlight-target-c}} -fno-stack-protector -fstack-clash-protection -mstack-probe-size=4096 -fPIC' c-include-flags-elf := "-I " + root / "src/hyperlight_guest_capi/include/" + " -I " + root / "src/hyperlight_libc/third_party/picolibc/libc/include/" + " -I " + root / "src/hyperlight_libc/third_party/picolibc/libc/stdio/" + " -I " + root / "src/hyperlight_libc/include/" c-linker-options-elf := '--entry "entrypoint" --nostdlib -pie --no-dynamic-linker' c-flags-debug-elf := '-O0' @@ -19,7 +20,7 @@ compile-c-guest target=default-target: link-c-guest target=default-target: # elf - cd src/tests/c_guests/c_simpleguest && ld.lld -o out/{{target}}/simpleguest {{c-linker-options-elf}} out/{{target}}/main.o -l hyperlight_guest_capi -L "{{justfile_directory()}}/target/x86_64-hyperlight-none/{{target}}" + cd src/tests/c_guests/c_simpleguest && ld.lld -o out/{{target}}/simpleguest {{c-linker-options-elf}} out/{{target}}/main.o -l hyperlight_guest_capi -L "{{justfile_directory()}}/target/{{hyperlight-target}}/{{target}}" move-c-guests target=default-target: # elf diff --git a/docs/paging-development-notes.md b/docs/paging-development-notes.md index 573c2f77b..da08f6f24 100644 --- a/docs/paging-development-notes.md +++ b/docs/paging-development-notes.md @@ -172,3 +172,9 @@ below the exception stack within the scratch region. Hyperlight unconditionally uses 48-bit virtual addresses (4-level paging) and enables PAE. The guest is always entered in long mode. + +## aarch64 + +Hyperlight unconditionally uses 48-bit virtual addresses. Hyperlight +presently only uses addresses in the lower (ttbr0) half of the address range. 
+ diff --git a/flake.nix b/flake.nix index cbec54678..d1604117d 100644 --- a/flake.nix +++ b/flake.nix @@ -51,7 +51,7 @@ toolchainVersionAttrs = args; }; })) // { - targetPlatforms = [ "x86_64-linux" ]; + targetPlatforms = [ "aarch64-linux" "x86_64-linux" ]; badTargetPlatforms = [ ]; }; overrideRustPkg = pkg: self.lib.makeOverridable (origArgs: @@ -68,21 +68,21 @@ args = [ "-c" "declare > $out" ]; }); in { - shells.default = gcrootForShell devShells.x86_64-linux.default; + shells.x86_64-linux.default = gcrootForShell devShells.x86_64-linux.default; + shells.aarch64-linux.default = gcrootForShell devShells.aarch64-linux.default; }; - devShells.x86_64-linux.default = - let pkgs = import nixpkgs { - system = "x86_64-linux"; - overlays = [ (import (nixpkgs-mozilla + "/rust-overlay.nix")) overlays.fix-rust ]; - }; - in with pkgs; let + devShells = nixpkgs.lib.genAttrs nixpkgs.lib.systems.flakeExposed (system: { + default = let pkgs = import nixpkgs { + inherit system; + overlays = [ (import (nixpkgs-mozilla + "/rust-overlay.nix")) overlays.fix-rust ]; + }; in with pkgs; let customisedRustChannelOf = args: lib.flip builtins.mapAttrs (rustChannelOf args) (_: pkg: pkg.override { targets = [ "x86_64-unknown-linux-gnu" "x86_64-pc-windows-msvc" "x86_64-unknown-none" "wasm32-wasip1" "wasm32-wasip2" "wasm32-unknown-unknown" - "i686-unknown-linux-gnu" + "i686-unknown-linux-gnu" "aarch64-unknown-none" ]; extensions = [ "rust-src" ] ++ (if args.channel == "nightly" then [ "miri-preview" ] else []); }); @@ -227,10 +227,11 @@ src = fetchFromGitHub { owner = "hyperlight-dev"; repo = "cargo-hyperlight"; - tag = "v${version}"; - hash = "sha256-xq4/c69N0wG/I8WOYVloo0J0JqoSIKiWWtECdSKrsxo="; + rev = "28ac7b57e8e7b83f80bd601f1fab334aa3ae6d4a"; + hash = "sha256-a/mvPEDJycrCbmd826SmFdasE8BFtMkCsefCNR5JnkM="; }; - cargoHash = "sha256-muiMVrK1TydQiMitihfo7xYidqUIIQ+Hw3BIeo5rLFw="; + cargoHash = "sha256-wLapaao8qcB/toltV/xjQ7SXXcfh2J19nw6jWljmb2s="; + doCheck = false; }; in 
(buildRustPackageClang (mkDerivationAttrs: { pname = "hyperlight"; @@ -280,5 +281,6 @@ })).overrideAttrs(oA: { hardeningDisable = [ "all" ]; }); + }); }; } diff --git a/src/hyperlight_common/src/arch/aarch64/layout.rs b/src/hyperlight_common/src/arch/aarch64/layout.rs index 20f17026c..cb32cfe8e 100644 --- a/src/hyperlight_common/src/arch/aarch64/layout.rs +++ b/src/hyperlight_common/src/arch/aarch64/layout.rs @@ -14,12 +14,21 @@ See the License for the specific language governing permissions and limitations under the License. */ -// TODO(aarch64): change these, they are only provided in order to compile -pub const MAX_GVA: usize = 0xffff_ffff_ffff_efff; -pub const SNAPSHOT_PT_GVA_MIN: usize = 0xffff_8000_0000_0000; -pub const SNAPSHOT_PT_GVA_MAX: usize = 0xffff_80ff_ffff_ffff; -pub const MAX_GPA: usize = 0x0000_000f_ffff_ffff; +// TODO: consider using the upper half, like we do on x86; +// this would require enabling ttbr1 +pub const SCRATCH_TOP_GVA: usize = 0x0000_ffff_ffff_dfff; +pub const SNAPSHOT_PT_GVA_MIN: usize = 0x0000_8000_0000_0000; +pub const SNAPSHOT_PT_GVA_MAX: usize = 0x0000_80ff_ffff_ffff; +pub const SCRATCH_TOP_GPA: usize = 0x0000_000f_ffff_efff; -pub fn min_scratch_size(_input_data_size: usize, _output_data_size: usize) -> usize { - unimplemented!("min_scratch_size") +pub const IO_PAGE_GVA: u64 = 0x0000_ffff_ffff_e000; +pub const IO_PAGE_GPA: u64 = 0x0000_000f_ffff_f000; + +pub const fn io_page() -> Option<(crate::vmem::PhysAddr, crate::vmem::VirtAddr)> { + Some((IO_PAGE_GPA, IO_PAGE_GVA)) +} + +pub fn min_scratch_size(input_data_size: usize, output_data_size: usize) -> usize { + (input_data_size + output_data_size).next_multiple_of(crate::vmem::PAGE_SIZE) + + 12 * crate::vmem::PAGE_SIZE } diff --git a/src/hyperlight_common/src/arch/aarch64/vmem.rs b/src/hyperlight_common/src/arch/aarch64/vmem.rs index 3c83cabc8..9332494cf 100644 --- a/src/hyperlight_common/src/arch/aarch64/vmem.rs +++ b/src/hyperlight_common/src/arch/aarch64/vmem.rs @@ -14,9 
+14,18 @@ See the License for the specific language governing permissions and limitations under the License. */ -// TODO(aarch64): implement real page table operations +use alloc::collections::BTreeMap; +use alloc::collections::btree_map::Entry; +use alloc::rc::Rc; +use alloc::vec::Vec; +use core::cell::RefCell; -use crate::vmem::{Mapping, TableOps, TableReadOps, Void}; +use crate::vmem::sealed::TableMovabilityBase; +use crate::vmem::{ + BasicMapping, CowMapping, MapRequest, MapResponse, Mapping, MappingKind, SpaceAwareMapping, + SpaceId, SpaceReferenceMapping, TableOps, TableReadOps, UpdateParent, UpdateParentNone, + UpdateParentTable, Void, bits, modify_ptes, write_entry_updating, +}; pub const PAGE_SIZE: usize = 4096; pub const PAGE_TABLE_SIZE: usize = 4096; @@ -26,38 +35,506 @@ pub type PageTableEntry = u64; pub type VirtAddr = u64; pub type PhysAddr = u64; +const VA_BITS: usize = 48; +pub const ATTR_INDEX_NORMAL: u8 = 0; +const SOFTWARE_USE_COW: u8 = 0b1; + +// Utility structures +impl< + Op: TableOps, + P: UpdateParent, +> UpdateParent for UpdateParentTable +{ + type TableMoveInfo = Op::TableAddr; + type ChildType = UpdateParentTable; + fn update_parent(self, op: &Op, new_ptr: Op::TableAddr) { + let pte = desc_for_table::(new_ptr); + unsafe { + write_entry_updating(op, self.parent, self.entry_ptr, pte); + } + } + fn for_child_at_entry(self, entry_ptr: Op::TableAddr) -> Self::ChildType { + Self::ChildType::new(self, entry_ptr) + } +} +#[derive(Copy, Clone)] +pub(in crate::vmem) struct UpdateParentRoot {} +impl> UpdateParent + for UpdateParentRoot +{ + type TableMoveInfo = Op::TableAddr; + type ChildType = UpdateParentTable; + fn update_parent(self, op: &Op, new_ptr: Op::TableAddr) { + unsafe { + op.update_root(new_ptr); + } + } + fn for_child_at_entry(self, entry_ptr: Op::TableAddr) -> Self::ChildType { + Self::ChildType::new(self, entry_ptr) + } +} +/// This trait is used to select appropriate implementations of +/// [`UpdateParent`] to be used, depending on 
whether a particular +/// implementation needs the ability to move tables. +pub(in crate::vmem) trait TableMovability { + type RootUpdateParent: UpdateParent; + fn root_update_parent() -> Self::RootUpdateParent; +} +impl> TableMovability + for crate::vmem::MayMoveTable +{ + type RootUpdateParent = UpdateParentRoot; + fn root_update_parent() -> Self::RootUpdateParent { + UpdateParentRoot {} + } +} +impl TableMovability for crate::vmem::MayNotMoveTable { + type RootUpdateParent = UpdateParentNone; + fn root_update_parent() -> Self::RootUpdateParent { + UpdateParentNone {} + } +} + +#[allow(clippy::identity_op)] +#[allow(clippy::precedence)] +fn desc_for_table(table_addr: Op::TableAddr) -> u64 { + Op::to_phys(table_addr) | + // Don't set APTable[1:0] - we don't use hierachical permissions + // Don't set {U,P,}XNTable - we don't use hierarchical permissions + // Don't set set Protected - we don't use FEAT_THE + // We don't need to set AF on a table descriptor to avoid AF + // faults. Since we don't enable FEAT_HAFT, there is no AF on + // table descriptors, only on page descriptors. + 0b11 // table descriptor +} + +// We do not currently use hugepage mappings in the guest, and so we +// do not need to worry about block descriptors at intermediate +// levels. 
+ +#[allow(clippy::identity_op)] +#[allow(clippy::precedence)] +fn desc_for_page( + page_addr: u64, + _readable: bool, + writable: bool, + executable: bool, + software_use: u8, + user_accessible: bool, +) -> u64 { + // todo: make use of the Contiguous bit to reduce tlb pressure + let xn = match (executable, user_accessible) { + (true, true) => 0, + (true, false) => 0b10, + (false, _) => 0b11, + }; + let ap = match (writable, user_accessible) { + (true, true) => 0b01, + (true, false) => 0b00, + (false, true) => 0b11, + (false, false) => 0b10, + }; + page_addr | + ((software_use as u64 & 0xf) << 55) | + (xn << 53) | + // we do not use hardware management of the dirty state + // If we support hugepage block descriptors in the future, we + // will need to support setting the nT bit here when the + // hardware supports FEAT_BBM Level 1 + (0b1 << 11) | // always set nG for now, since multi-space + // support is not properly reflected in the + // mapping API. + (0b1 << 10) | // we don't need AF tracking, so set it always + (0b11 << 8) | // Inner Shareable + (ap << 6) | + ((ATTR_INDEX_NORMAL as u64) << 2) | + 0b11 +} + +#[allow(clippy::identity_op)] +#[allow(clippy::precedence)] +// Produces only page descriptors valid at Level 3; there is presently +// no support for block descriptors valid at earlier levels +unsafe fn map_page< + Op: TableOps, + P: UpdateParent< + Op, + TableMoveInfo = >::TableMoveInfo, + >, +>( + op: &Op, + mapping: &Mapping, + r: MapResponse, +) { + let presumed_base = mapping.phys_base + (r.vmin - mapping.virt_base); + let desc = match &mapping.kind { + MappingKind::Basic(bm) => desc_for_page( + presumed_base, + bm.readable, + bm.writable, + bm.executable, + 0, + mapping.user_accessible, + ), + MappingKind::Cow(cm) => desc_for_page( + presumed_base, + cm.readable, + false, + cm.executable, + SOFTWARE_USE_COW, + mapping.user_accessible, + ), + MappingKind::Unmapped => 0, + }; + unsafe { + write_entry_updating(op, r.update_parent, r.entry_ptr, desc); + 
} +} + +enum FinalLevelDescriptorKind { + Page, +} +enum EarlyLevelDescriptorKind { + Block, + Table, +} +fn final_level_descriptor_kind(desc: u64) -> Option { + if desc & 3 == 3 { + Some(FinalLevelDescriptorKind::Page) + } else { + None + } +} +fn early_level_descriptor_kind(desc: u64) -> Option { + match desc & 0b11 { + 0b01 => Some(EarlyLevelDescriptorKind::Block), + 0b11 => Some(EarlyLevelDescriptorKind::Table), + _ => None, + } +} + +unsafe fn next_level_table_if_present( + op: &Op, + addr: Op::TableAddr, +) -> Option { + let desc: u64 = unsafe { op.read_entry(addr) }; + if let Some(EarlyLevelDescriptorKind::Table) = early_level_descriptor_kind(desc) { + Some(Op::from_phys(bits::<47, 12>(desc) << 12)) + } else { + None + } +} + +/// Page-mapping callback to allocate a next-level page table if necessary. +/// +/// Should only be called on a [`MapResponse`] representing an entry +/// at level < 3, since it allocates a next-level table. +/// # Safety +/// This function modifies page table data structures, and should not be called concurrently +/// with any other operations that modify the page tables. +unsafe fn alloc_table_if_needed< + Op: TableOps, + P: UpdateParent< + Op, + TableMoveInfo = >::TableMoveInfo, + >, +>( + op: &Op, + x: MapResponse, +) -> MapRequest +where + P::ChildType: UpdateParent, +{ + #[cfg(target_os = "linux")] + extern crate std; + + let new_update_parent = x.update_parent.for_child_at_entry(x.entry_ptr); + if let Some(table_base) = unsafe { next_level_table_if_present(op, x.entry_ptr) } { + return MapRequest { + table_base, + vmin: x.vmin, + len: x.len, + update_parent: new_update_parent, + }; + } + // If we eventually support huge pages, we will need to check if + // there was a Block descriptor here and follow the + // break-before-make sequence. 
+ + let page_addr = unsafe { op.alloc_table() }; + + let pte = desc_for_table::(page_addr); + unsafe { + write_entry_updating(op, x.update_parent, x.entry_ptr, pte); + }; + MapRequest { + table_base: page_addr, + vmin: x.vmin, + len: x.len, + update_parent: new_update_parent, + } +} + +unsafe fn require_table_exist>( + op: &Op, + x: MapResponse, +) -> Option> +where + P::ChildType: UpdateParent, +{ + unsafe { + next_level_table_if_present(op, x.entry_ptr).map(|table_base| MapRequest { + table_base, + vmin: x.vmin, + len: x.len, + update_parent: x.update_parent.for_child_at_entry(x.entry_ptr), + }) + } +} + +enum WalkNextLevelResponse> { + WalkNextLevel(MapResponse), + AlreadySeen(SpaceReferenceMapping), +} + +enum WalkNextLevelRequest> { + WalkNextLevel(MapRequest), + AlreadySeen(SpaceReferenceMapping), +} +fn walk_check_request_seen>( + seen_pts: &Option>>>, + space: SpaceId, + depth: usize, + rq: MapRequest, +) -> WalkNextLevelRequest { + let Some(seen_pts) = seen_pts else { + return WalkNextLevelRequest::WalkNextLevel(rq); + }; + match seen_pts.borrow_mut().entry(Op::to_phys(rq.table_base)) { + Entry::Vacant(ve) => { + ve.insert(SpaceReferenceMapping { + depth, + space, + our_va: 0, + their_va: rq.vmin, + }); + WalkNextLevelRequest::WalkNextLevel(rq) + } + Entry::Occupied(oe) => { + let mut sm = *oe.get(); + if sm.depth != depth { + // Sharing a page table at different levels like this + // is not supported in the Hyperlight memory + // model. Ignore the "sharing". 
+ WalkNextLevelRequest::WalkNextLevel(rq) + } else { + sm.our_va = rq.vmin; + WalkNextLevelRequest::AlreadySeen(sm) + } + } + } +} +fn walk_next_level_table>( + op: &Op, + x: WalkNextLevelResponse, + next_depth: usize, + space: SpaceId, + seen_pts: &Option>>>, +) -> Option> +where + P::ChildType: UpdateParent, +{ + let rq = match x { + WalkNextLevelResponse::WalkNextLevel(rq) => rq, + WalkNextLevelResponse::AlreadySeen(sm) => { + return Some(WalkNextLevelRequest::AlreadySeen(sm)); + } + }; + let next_base = unsafe { require_table_exist(op, rq)? }; + Some(walk_check_request_seen( + seen_pts, space, next_depth, next_base, + )) +} + +fn do_walk_next_level< + const HIGH_BIT: u8, + const LOW_BIT: u8, + Op: TableReadOps, + P: UpdateParent, +>( + x: WalkNextLevelRequest, +) -> impl Iterator> { + let (iter_a, iter_b) = match x { + WalkNextLevelRequest::WalkNextLevel(rq) => ( + Some( + modify_ptes::(rq) + .map(|r| WalkNextLevelResponse::WalkNextLevel(r)), + ), + None, + ), + WalkNextLevelRequest::AlreadySeen(sm) => ( + None, + Some(core::iter::once(WalkNextLevelResponse::AlreadySeen(sm))), + ), + }; + iter_a + .into_iter() + .flatten() + .chain(iter_b.into_iter().flatten()) +} + /// # Safety /// See `TableOps` documentation. #[allow(clippy::missing_safety_doc)] -pub unsafe fn map(_op: &Op, _mapping: Mapping) { - unimplemented!("map") +pub unsafe fn map(op: &Op, mapping: Mapping) { + modify_ptes::<47, 39, Op, _>(MapRequest { + table_base: op.root_table(), + vmin: mapping.virt_base, + len: mapping.len, + update_parent: Op::TableMovability::root_update_parent(), + }) + .map(|r| unsafe { alloc_table_if_needed(op, r) }) + .flat_map(modify_ptes::<38, 30, Op, _>) + .map(|r| unsafe { alloc_table_if_needed(op, r) }) + .flat_map(modify_ptes::<29, 21, Op, _>) + .map(|r| unsafe { alloc_table_if_needed(op, r) }) + .flat_map(modify_ptes::<20, 12, Op, _>) + .map(|r| unsafe { map_page(op, &mapping, r) }) + .for_each(drop); } /// # Safety /// See `TableReadOps` documentation. 
#[allow(clippy::missing_safety_doc)] pub unsafe fn virt_to_phys<'a, Op: TableReadOps + 'a>( - _op: impl core::convert::AsRef + Copy + 'a, - _address: u64, - _len: u64, + op: impl core::convert::AsRef + Copy + 'a, + address: u64, + len: u64, ) -> impl Iterator + 'a { - unimplemented!("virt_to_phys"); - #[allow(unreachable_code)] - core::iter::empty() + let roots = core::iter::once(op.as_ref().root_table()); + unsafe { + internal_walk_va_spaces(op, roots, false, address, len) + .flat_map(|(_, mappings)| mappings) + .filter_map(|saw| match saw { + SpaceAwareMapping::ThisSpace(m) => Some(m), + // this is guaranteed to never actually happen, both since + // we only passed one root and since we passed do_dedup = + // false + SpaceAwareMapping::AnotherSpace(_) => None, + }) + } +} + +#[allow(clippy::missing_safety_doc)] +unsafe fn internal_walk_va_spaces<'a, Op: TableReadOps + 'a>( + op: impl core::convert::AsRef + Copy + 'a, + roots: impl Iterator + 'a, + // todo - type magic could unify virt_to_phys and walk_va_spaces + do_dedup: bool, + address: u64, + len: u64, +) -> impl Iterator< + Item = ( + SpaceId, + impl Iterator, + ), +> + 'a { + #[cfg(target_os = "linux")] + extern crate std; + let addr = address & ((1u64 << VA_BITS) - 1); + let vmin = addr & !((PAGE_SIZE as u64) - 1); + let vmax = core::cmp::min(addr + len, 1u64 << VA_BITS); + let seen_pts: Option>>> = if do_dedup { + Some(Rc::new(RefCell::new(BTreeMap::new()))) + } else { + None + }; + roots.into_iter().map(move |root| { + let root_id = Op::to_phys(root); + let root_req = walk_check_request_seen( + &seen_pts, + root_id, + 0, + MapRequest { + table_base: root, + vmin, + len: vmax.saturating_sub(vmin), + update_parent: UpdateParentNone {}, + }, + ); + let seen_pts_1 = seen_pts.clone(); + let seen_pts_2 = seen_pts.clone(); + let seen_pts_3 = seen_pts.clone(); + let iter = do_walk_next_level::<47, 39, Op, _>(root_req) + .filter_map(move |r| walk_next_level_table(op.as_ref(), r, 1, root_id, &seen_pts_1)) + 
.flat_map(do_walk_next_level::<38, 30, Op, _>) + .filter_map(move |r| walk_next_level_table(op.as_ref(), r, 2, root_id, &seen_pts_2)) + .flat_map(do_walk_next_level::<29, 21, Op, _>) + .filter_map(move |r| walk_next_level_table(op.as_ref(), r, 3, root_id, &seen_pts_3)) + .flat_map(do_walk_next_level::<20, 12, Op, _>) + .filter_map(move |r| { + let rq = match r { + WalkNextLevelResponse::AlreadySeen(sm) => { + return Some(SpaceAwareMapping::AnotherSpace(sm)); + } + WalkNextLevelResponse::WalkNextLevel(rq) => rq, + }; + let desc = unsafe { op.as_ref().read_entry(rq.entry_ptr) }; + if let Some(FinalLevelDescriptorKind::Page) = final_level_descriptor_kind(desc) { + let phys_addr = bits::<47, 12>(desc) << 12; + // Don't sign-extend to canonicalise, because we + // only uses addresess in the lower half right + // now---VA_BITS does not include the bit that + // selects between the ttbr0 and ttbr1 spaces. + let virt_addr = rq.vmin; + // The division of flags in the mapping structure + // does not perfectly capture the fact that + // user-level data and instruction access + // permissions can be different. For now, we just + // assume that the mapping should be marked as + // executable if it was executable to the kernel + // at all. + let executable = bits::<53, 53>(desc) == 0; + let user_accessible = bits::<6, 6>(desc) != 0; // AP[1] + let kind = if bits::<58, 55>(desc) == SOFTWARE_USE_COW as u64 { + MappingKind::Cow(CowMapping { + readable: true, + executable, + }) + } else { + MappingKind::Basic(BasicMapping { + readable: true, + writable: bits::<7, 7>(desc) == 0, // AP[2] + executable, + }) + }; + Some(SpaceAwareMapping::ThisSpace(Mapping { + phys_base: phys_addr, + virt_base: virt_addr, + len: PAGE_SIZE as u64, + kind, + user_accessible, + })) + } else { + None // do nothing - there is no mapping to record here + } + }); + (root_id, iter) + }) } -/// Stub — see [`crate::vmem::walk_va_spaces`]. 
#[allow(clippy::missing_safety_doc)] pub unsafe fn walk_va_spaces( - _op: &Op, - _roots: &[Op::TableAddr], - _address: u64, - _len: u64, -) -> ::alloc::vec::Vec<( - crate::vmem::SpaceId, - ::alloc::vec::Vec, -)> { - ::alloc::vec::Vec::new() + op: impl core::convert::AsRef + Copy, + roots: &[Op::TableAddr], + address: u64, + len: u64, +) -> Vec<(SpaceId, Vec)> { + unsafe { + internal_walk_va_spaces(&op, roots.iter().cloned(), true, address, len) + .map(|(id, mappings)| (id, mappings.collect::>())) + .collect::>() + } } /// Stub — see [`crate::vmem::space_aware_map`]. @@ -67,11 +544,7 @@ pub unsafe fn space_aware_map( _ref_map: crate::vmem::SpaceReferenceMapping, _built_roots: &::alloc::collections::BTreeMap, ) { + // in practice, we never construct page tables that would result + // in reaching this right now. todo: implement this properly + unreachable!() } - -pub trait TableMovability {} -impl> TableMovability - for crate::vmem::MayMoveTable -{ -} -impl TableMovability for crate::vmem::MayNotMoveTable {} diff --git a/src/hyperlight_common/src/arch/amd64/layout.rs b/src/hyperlight_common/src/arch/amd64/layout.rs index 14a9cd62a..4237caf51 100644 --- a/src/hyperlight_common/src/arch/amd64/layout.rs +++ b/src/hyperlight_common/src/arch/amd64/layout.rs @@ -21,7 +21,7 @@ limitations under the License. /// We have this the top of the page below the top of memory in order /// to make working with start/end ptrs in a few places more /// convenient (not needing to worry about overflow) -pub const MAX_GVA: usize = 0xffff_ffff_ffff_efff; +pub const SCRATCH_TOP_GVA: usize = 0xffff_ffff_ffff_efff; pub const SNAPSHOT_PT_GVA_MIN: usize = 0xffff_8000_0000_0000; pub const SNAPSHOT_PT_GVA_MAX: usize = 0xffff_80ff_ffff_ffff; @@ -29,7 +29,11 @@ pub const SNAPSHOT_PT_GVA_MAX: usize = 0xffff_80ff_ffff_ffff; /// supports at least 36 bits. Almost all of them support at least 40 /// bits, so we could consider bumping this in the future if we were /// ever memory-constrained. 
-pub const MAX_GPA: usize = 0x0000_000f_ffff_ffff; +pub const SCRATCH_TOP_GPA: usize = 0x0000_000f_ffff_ffff; + +pub fn io_page() -> Option<(u64, u64)> { + None +} /// On amd64, this is: /// - Two pages for the TSS and IDT diff --git a/src/hyperlight_common/src/arch/amd64/vmem.rs b/src/hyperlight_common/src/arch/amd64/vmem.rs index eb7a1104c..8f29316e4 100644 --- a/src/hyperlight_common/src/arch/amd64/vmem.rs +++ b/src/hyperlight_common/src/arch/amd64/vmem.rs @@ -27,31 +27,12 @@ limitations under the License. use crate::vmem::{ BasicMapping, CowMapping, MapRequest, MapResponse, Mapping, MappingKind, TableMovabilityBase, - TableOps, TableReadOps, UpdateParent, UpdateParentNone, Void, modify_ptes, + TableOps, TableReadOps, UpdateParent, UpdateParentNone, UpdateParentTable, Void, modify_ptes, write_entry_updating, }; -/// Parent is another page table whose ancestors may also need -/// updating when it relocates. -pub struct UpdateParentTable> { - pub(crate) parent: P, - pub(crate) entry_ptr: Op::TableAddr, -} -impl> Clone for UpdateParentTable { - fn clone(&self) -> Self { - *self - } -} -impl> Copy for UpdateParentTable {} -impl> UpdateParentTable { - pub(crate) fn new(parent: P, entry_ptr: Op::TableAddr) -> Self { - UpdateParentTable { parent, entry_ptr } - } -} - -/// Parent is the root (e.g. CR3). #[derive(Copy, Clone)] -pub struct UpdateParentRoot {} +pub(in crate::vmem) struct UpdateParentRoot {} /// Read a PTE and return it (widened to u64) if the present bit is /// set. The amd64 "present" encoding is a single bit (bit 0); other @@ -162,7 +143,7 @@ fn pte_for_table(table_addr: Op::TableAddr) -> u64 { /// This trait is used to select appropriate implementations of /// [`UpdateParent`] to be used, depending on whether a particular /// implementation needs the ability to move tables. 
-pub trait TableMovability { +pub(in crate::vmem) trait TableMovability { type RootUpdateParent: UpdateParent; fn root_update_parent() -> Self::RootUpdateParent; } diff --git a/src/hyperlight_common/src/arch/i686/layout.rs b/src/hyperlight_common/src/arch/i686/layout.rs index cdc3af7d1..81de7d505 100644 --- a/src/hyperlight_common/src/arch/i686/layout.rs +++ b/src/hyperlight_common/src/arch/i686/layout.rs @@ -16,10 +16,10 @@ limitations under the License. // i686 layout constants for 32-bit protected mode with paging. -pub const MAX_GVA: usize = 0xffff_ffff; +pub const SCRATCH_TOP_GVA: usize = 0xffff_ffff; /// Set below the KVM APIC access page at 0xFEE00000 to avoid EEXIST when scratch /// regions are large enough to reach that address. -pub const MAX_GPA: usize = 0xFEDF_FFFF; +pub const SCRATCH_TOP_GPA: usize = 0xFEDF_FFFF; /// Minimum scratch region size: IO buffers (page-aligned) plus 12 pages /// for bookkeeping and the exception stack. Page table space is validated diff --git a/src/hyperlight_common/src/layout.rs b/src/hyperlight_common/src/layout.rs index 1a7ca0880..e5dde0c2d 100644 --- a/src/hyperlight_common/src/layout.rs +++ b/src/hyperlight_common/src/layout.rs @@ -26,7 +26,7 @@ limitations under the License. #[cfg_attr(target_arch = "aarch64", path = "arch/aarch64/layout.rs")] mod arch; -pub use arch::{MAX_GPA, MAX_GVA}; +pub use arch::{SCRATCH_TOP_GPA, SCRATCH_TOP_GVA, io_page}; #[cfg(any( all(target_arch = "x86_64", not(feature = "i686-guest")), target_arch = "aarch64" @@ -50,10 +50,10 @@ pub const SCRATCH_TOP_EXN_STACK_OFFSET: u64 = 0x30; pub const SCRATCH_TOP_GUEST_COUNTER_OFFSET: u64 = 0x1008; pub fn scratch_base_gpa(size: usize) -> u64 { - (MAX_GPA - size + 1) as u64 + (SCRATCH_TOP_GPA - size + 1) as u64 } pub fn scratch_base_gva(size: usize) -> u64 { - (MAX_GVA - size + 1) as u64 + (SCRATCH_TOP_GVA - size + 1) as u64 } /// Compute the minimum scratch region size needed for a sandbox. 
diff --git a/src/hyperlight_common/src/vmem.rs b/src/hyperlight_common/src/vmem.rs index 94b67319c..bb9536b09 100644 --- a/src/hyperlight_common/src/vmem.rs +++ b/src/hyperlight_common/src/vmem.rs @@ -42,6 +42,13 @@ pub use arch::{PAGE_PRESENT, PAGE_TABLE_SIZE, PTE_ADDR_MASK, PageTableEntry, Phy pub const PAGE_TABLE_ENTRIES_PER_TABLE: usize = PAGE_TABLE_SIZE / core::mem::size_of::(); +// It would be nice not to have any arch-dependent re-exports here, +// but on arm64 the MAIR indices used need to be synced between the +// descriptor creation code and the register initialisation code to +// make sure that MAIR is set up properly. +#[cfg(target_arch = "aarch64")] +pub use arch::ATTR_INDEX_NORMAL; + // Shared page table iterator infrastructure used by each arch module. /// Utility function to extract an (inclusive on both ends) bit range @@ -115,6 +122,31 @@ impl UpdateParent for UpdateParentNone { } } +/// A struct implementing [`UpdateParent`] to be used when a table's +/// parent is another table that needs to be updated recursively. +pub(in crate::vmem) struct UpdateParentTable> { + pub(in crate::vmem) parent: P, + pub(in crate::vmem) entry_ptr: Op::TableAddr, +} +impl> Clone for UpdateParentTable { + fn clone(&self) -> Self { + *self + } +} +impl> Copy for UpdateParentTable {} +impl> UpdateParentTable { + pub(in crate::vmem) fn new(parent: P, entry_ptr: Op::TableAddr) -> Self { + UpdateParentTable { parent, entry_ptr } + } +} + +/// A struct implementing [`UpdateParent`] to be used when a table's +/// parent is the "root table" (with access to that root pointer +/// provided in an architecture/environment-insensitive manner via +/// `TableOps`) +#[derive(Copy, Clone)] +pub struct UpdateParentRoot {} + /// A helper structure indicating a mapping operation that needs to be /// performed. 
pub(in crate::vmem) struct MapRequest> { @@ -319,6 +351,7 @@ mod sealed { use sealed::*; /// A sealed trait used to collect some information about the marker structures [`MayMoveTable`] and [`MayNotMoveTable`] +#[allow(private_bounds)] // this trait is intentionally sealed pub trait TableMovability: TableMovabilityBase + arch::TableMovability>::TableMoveInfo> diff --git a/src/hyperlight_guest/src/arch/aarch64/exit.rs b/src/hyperlight_guest/src/arch/aarch64/exit.rs index 0ac27570d..01ea692d9 100644 --- a/src/hyperlight_guest/src/arch/aarch64/exit.rs +++ b/src/hyperlight_guest/src/arch/aarch64/exit.rs @@ -16,7 +16,16 @@ limitations under the License. // TODO(aarch64): implement VM exit mechanism (e.g. hvc instruction) +const IO_PAGE_GVA: u64 = hyperlight_common::layout::io_page().unwrap().1; + /// Trigger a VM exit sending a 32-bit value to the host on the given port. -pub(crate) unsafe fn out32(_port: u16, _val: u32) { - unimplemented!("aarch64 out32") +pub(crate) unsafe fn out32(port: u16, val: u32) { + if port as usize > (hyperlight_common::vmem::PAGE_SIZE / core::mem::size_of::()) { + panic!("aarch64 mmio: unsupported hypercall number {}", port); + } + unsafe { + (IO_PAGE_GVA as *mut u64) + .wrapping_add(port as usize) + .write_volatile(val as u64); + } } diff --git a/src/hyperlight_guest/src/arch/aarch64/layout.rs b/src/hyperlight_guest/src/arch/aarch64/layout.rs index 685447ce7..59ffd29b4 100644 --- a/src/hyperlight_guest/src/arch/aarch64/layout.rs +++ b/src/hyperlight_guest/src/arch/aarch64/layout.rs @@ -15,17 +15,18 @@ limitations under the License. 
*/ // TODO(aarch64): these values are placeholders copied from amd64 -pub const MAIN_STACK_TOP_GVA: u64 = 0xffff_ff00_0000_0000; -pub const MAIN_STACK_LIMIT_GVA: u64 = 0xffff_fe00_0000_0000; +pub const MAIN_STACK_TOP_GVA: u64 = 0x0000_ff00_0000_0000; +pub const MAIN_STACK_LIMIT_GVA: u64 = 0x0000_fe00_0000_0000; pub fn scratch_size() -> u64 { - unimplemented!("aarch64 scratch_size") + let addr = crate::layout::scratch_size_gva(); + unsafe { (addr as *mut u64).read_volatile() } } pub fn scratch_base_gpa() -> u64 { - unimplemented!("aarch64 scratch_base_gpa") + hyperlight_common::layout::scratch_base_gpa(scratch_size() as usize) } pub fn scratch_base_gva() -> u64 { - unimplemented!("aarch64 scratch_base_gva") + hyperlight_common::layout::scratch_base_gva(scratch_size() as usize) } diff --git a/src/hyperlight_guest/src/arch/aarch64/prim_alloc.rs b/src/hyperlight_guest/src/arch/aarch64/prim_alloc.rs index 4a5b5d137..d49e3f936 100644 --- a/src/hyperlight_guest/src/arch/aarch64/prim_alloc.rs +++ b/src/hyperlight_guest/src/arch/aarch64/prim_alloc.rs @@ -14,12 +14,40 @@ See the License for the specific language governing permissions and limitations under the License. */ -// TODO(aarch64): implement real aarch64 page allocator +use hyperlight_common::flatbuffer_wrappers::guest_error::ErrorCode; +use hyperlight_common::{layout, vmem}; // There are no notable architecture-specific safety considerations // here, and the general conditions are documented in the // architecture-independent re-export in prim_alloc.rs #[allow(clippy::missing_safety_doc)] -pub unsafe fn alloc_phys_pages(_n: u64) -> u64 { - unimplemented!("aarch64 alloc_phys_pages") +pub unsafe fn alloc_phys_pages(n: u64) -> u64 { + let addr = crate::layout::allocator_gva(); + let nbytes = n * vmem::PAGE_SIZE as u64; + let mut prev_base: u64 = 0; + unsafe { + // todo: actually check for FEAT_LSE presence. 
+ core::arch::asm!(" + ldadd {nbytes}, {prev_base}, [{addr}] + ", + addr = in(reg) addr, + nbytes = in(reg) nbytes, + prev_base = out(reg) prev_base, + ); + } + // Set aside two pages at the top of the scratch region for the + // exception stack, shared state, etc + let max_avail = layout::SCRATCH_TOP_GPA - vmem::PAGE_SIZE * 2; + if prev_base + .checked_add(nbytes) + .is_none_or(|xx| xx >= max_avail as u64) + { + unsafe { + crate::exit::abort_with_code_and_message( + &[ErrorCode::MallocFailed as u8], + c"Out of physical memory".as_ptr(), + ) + } + } + prev_base } diff --git a/src/hyperlight_guest/src/arch/amd64/prim_alloc.rs b/src/hyperlight_guest/src/arch/amd64/prim_alloc.rs index cfaad9a0b..e1d388c64 100644 --- a/src/hyperlight_guest/src/arch/amd64/prim_alloc.rs +++ b/src/hyperlight_guest/src/arch/amd64/prim_alloc.rs @@ -33,7 +33,8 @@ pub unsafe fn alloc_phys_pages(n: u64) -> u64 { } // Set aside two pages at the top of the scratch region for the // exception stack, shared state, etc - let max_avail = hyperlight_common::layout::MAX_GPA - hyperlight_common::vmem::PAGE_SIZE * 2; + let max_avail = + hyperlight_common::layout::SCRATCH_TOP_GPA - hyperlight_common::vmem::PAGE_SIZE * 2; if x.checked_add(nbytes) .is_none_or(|xx| xx >= max_avail as u64) { diff --git a/src/hyperlight_guest/src/layout.rs b/src/hyperlight_guest/src/layout.rs index e4da4dd89..a9f56d78b 100644 --- a/src/hyperlight_guest/src/layout.rs +++ b/src/hyperlight_guest/src/layout.rs @@ -21,26 +21,26 @@ mod arch; pub use arch::{MAIN_STACK_LIMIT_GVA, MAIN_STACK_TOP_GVA}; pub fn scratch_size_gva() -> *mut u64 { - use hyperlight_common::layout::{MAX_GVA, SCRATCH_TOP_SIZE_OFFSET}; - (MAX_GVA as u64 - SCRATCH_TOP_SIZE_OFFSET + 1) as *mut u64 + use hyperlight_common::layout::{SCRATCH_TOP_GVA, SCRATCH_TOP_SIZE_OFFSET}; + (SCRATCH_TOP_GVA as u64 - SCRATCH_TOP_SIZE_OFFSET + 1) as *mut u64 } pub fn allocator_gva() -> *mut u64 { - use hyperlight_common::layout::{MAX_GVA, SCRATCH_TOP_ALLOCATOR_OFFSET}; - (MAX_GVA 
as u64 - SCRATCH_TOP_ALLOCATOR_OFFSET + 1) as *mut u64 + use hyperlight_common::layout::{SCRATCH_TOP_ALLOCATOR_OFFSET, SCRATCH_TOP_GVA}; + (SCRATCH_TOP_GVA as u64 - SCRATCH_TOP_ALLOCATOR_OFFSET + 1) as *mut u64 } pub fn snapshot_pt_gpa_base_gva() -> *mut u64 { - use hyperlight_common::layout::{MAX_GVA, SCRATCH_TOP_SNAPSHOT_PT_GPA_BASE_OFFSET}; - (MAX_GVA as u64 - SCRATCH_TOP_SNAPSHOT_PT_GPA_BASE_OFFSET + 1) as *mut u64 + use hyperlight_common::layout::{SCRATCH_TOP_GVA, SCRATCH_TOP_SNAPSHOT_PT_GPA_BASE_OFFSET}; + (SCRATCH_TOP_GVA as u64 - SCRATCH_TOP_SNAPSHOT_PT_GPA_BASE_OFFSET + 1) as *mut u64 } pub fn snapshot_generation_gva() -> *mut u64 { - use hyperlight_common::layout::{MAX_GVA, SCRATCH_TOP_SNAPSHOT_GENERATION_OFFSET}; - (MAX_GVA as u64 - SCRATCH_TOP_SNAPSHOT_GENERATION_OFFSET + 1) as *mut u64 + use hyperlight_common::layout::{SCRATCH_TOP_GVA, SCRATCH_TOP_SNAPSHOT_GENERATION_OFFSET}; + (SCRATCH_TOP_GVA as u64 - SCRATCH_TOP_SNAPSHOT_GENERATION_OFFSET + 1) as *mut u64 } pub use arch::{scratch_base_gpa, scratch_base_gva}; /// Returns a pointer to the guest counter u64 in scratch memory. #[cfg(feature = "guest-counter")] pub fn guest_counter_gva() -> *const u64 { - use hyperlight_common::layout::{MAX_GVA, SCRATCH_TOP_GUEST_COUNTER_OFFSET}; - (MAX_GVA as u64 - SCRATCH_TOP_GUEST_COUNTER_OFFSET + 1) as *const u64 + use hyperlight_common::layout::{SCRATCH_TOP_GUEST_COUNTER_OFFSET, SCRATCH_TOP_GVA}; + (SCRATCH_TOP_GVA as u64 - SCRATCH_TOP_GUEST_COUNTER_OFFSET + 1) as *const u64 } diff --git a/src/hyperlight_guest_bin/src/arch/aarch64/exception/entry.rs b/src/hyperlight_guest_bin/src/arch/aarch64/exception/entry.rs new file mode 100644 index 000000000..2b3d22d21 --- /dev/null +++ b/src/hyperlight_guest_bin/src/arch/aarch64/exception/entry.rs @@ -0,0 +1,173 @@ +/* +Copyright 2026 The Hyperlight Authors. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. 
+You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. + */ + +use core::arch::global_asm; +use core::mem::{offset_of, size_of}; + +use super::types::*; + +const _: () = assert!(2 * size_of::() == 0x10); +const _: () = assert!(2 * size_of::() == 0x20); + +// sp should already have been lowered to make room for the context +// save structure +// +// x30 should have been saved already +global_asm!(" +.global context_save\n +context_save:\n + stp x0, x1, [sp, #{x_off}+0x00]\n + stp x2, x3, [sp, #{x_off}+0x10]\n + stp x4, x5, [sp, #{x_off}+0x20]\n + stp x6, x7, [sp, #{x_off}+0x30]\n + stp x8, x9, [sp, #{x_off}+0x40]\n + stp x10, x11, [sp, #{x_off}+0x50]\n + stp x12, x13, [sp, #{x_off}+0x60]\n + stp x14, x15, [sp, #{x_off}+0x70]\n + stp x16, x17, [sp, #{x_off}+0x80]\n + stp x18, x19, [sp, #{x_off}+0x90]\n + stp x20, x21, [sp, #{x_off}+0xa0]\n + stp x22, x23, [sp, #{x_off}+0xb0]\n + stp x24, x25, [sp, #{x_off}+0xc0]\n + stp x26, x27, [sp, #{x_off}+0xd0]\n + stp x28, x29, [sp, #{x_off}+0xe0]\n + mrs x0, fpcr\n + mrs x1, fpsr\n + stp x0, x1, [sp, #{fpcr_off}]\n + stp q0, q1, [sp, #{q_off}+0x000]\n + stp q2, q3, [sp, #{q_off}+0x020]\n + stp q4, q5, [sp, #{q_off}+0x040]\n + stp q6, q7, [sp, #{q_off}+0x060]\n + stp q8, q9, [sp, #{q_off}+0x080]\n + stp q10, q11, [sp, #{q_off}+0x0a0]\n + stp q12, q13, [sp, #{q_off}+0x0c0]\n + stp q14, q15, [sp, #{q_off}+0x0e0]\n + stp q16, q17, [sp, #{q_off}+0x100]\n + stp q18, q19, [sp, #{q_off}+0x120]\n + stp q20, q21, [sp, #{q_off}+0x140]\n + stp q22, q23, [sp, #{q_off}+0x160]\n + stp q24, q25, [sp, #{q_off}+0x180]\n + stp q26, q27, [sp, #{q_off}+0x1a0]\n + stp q28, q29, [sp, #{q_off}+0x1c0]\n + stp 
q30, q31, [sp, #{q_off}+0x1e0]\n + ret +", + x_off = const offset_of!(ExceptionContext, x), + fpcr_off = const offset_of!(ExceptionContext, fpcr), + q_off = const offset_of!(ExceptionContext, q), +); + +global_asm!(" +.global context_restore\n +context_restore:\n + ldp x0, x1, [sp, #{fpcr_off}]\n + msr fpcr, x0\n + msr fpsr, x1\n + ldp x0, x1, [sp, #{x_off}+0x00]\n + ldp x2, x3, [sp, #{x_off}+0x10]\n + ldp x4, x5, [sp, #{x_off}+0x20]\n + ldp x6, x7, [sp, #{x_off}+0x30]\n + ldp x8, x9, [sp, #{x_off}+0x40]\n + ldp x10, x11, [sp, #{x_off}+0x50]\n + ldp x12, x13, [sp, #{x_off}+0x60]\n + ldp x14, x15, [sp, #{x_off}+0x70]\n + ldp x16, x17, [sp, #{x_off}+0x80]\n + ldp x18, x19, [sp, #{x_off}+0x90]\n + ldp x20, x21, [sp, #{x_off}+0xa0]\n + ldp x22, x23, [sp, #{x_off}+0xb0]\n + ldp x24, x25, [sp, #{x_off}+0xc0]\n + ldp x26, x27, [sp, #{x_off}+0xd0]\n + ldp x28, x29, [sp, #{x_off}+0xe0]\n + ldr x30, [sp, #{x_off}+0xf0]\n + ldp q0, q1, [sp, #{q_off}+0x000]\n + ldp q2, q3, [sp, #{q_off}+0x020]\n + ldp q4, q5, [sp, #{q_off}+0x040]\n + ldp q6, q7, [sp, #{q_off}+0x060]\n + ldp q8, q9, [sp, #{q_off}+0x080]\n + ldp q10, q11, [sp, #{q_off}+0x0a0]\n + ldp q12, q13, [sp, #{q_off}+0x0c0]\n + ldp q14, q15, [sp, #{q_off}+0x0e0]\n + ldp q16, q17, [sp, #{q_off}+0x100]\n + ldp q18, q19, [sp, #{q_off}+0x120]\n + ldp q20, q21, [sp, #{q_off}+0x140]\n + ldp q22, q23, [sp, #{q_off}+0x160]\n + ldp q24, q25, [sp, #{q_off}+0x180]\n + ldp q26, q27, [sp, #{q_off}+0x1a0]\n + ldp q28, q29, [sp, #{q_off}+0x1c0]\n + ldp q30, q31, [sp, #{q_off}+0x1e0]\n + add sp, sp, #{ctx_size}\n + eret\n +", + ctx_size = const size_of::(), + x_off = const offset_of!(ExceptionContext, x), + fpcr_off = const offset_of!(ExceptionContext, fpcr), + q_off = const offset_of!(ExceptionContext, q), +); + +macro_rules! 
vbar_entry { + ($et:literal, $ef:literal) => { + concat!( + " + sub sp, sp, #{ctx_size}\n + str x30, [sp, #{x30_off}]\n + bl context_save\n + mov x0, {ExceptionType_", + $et, + "}\n + mov x1, {ExceptionFrom_", + $ef, + "}\n + mov x2, sp\n + bl {handler}\n + b context_restore\n + .balign 0x80\n + " + ) + }; +} + +global_asm!(" +.balign 0x800\n +.global vbar\n +vbar:\n", + vbar_entry!("Synchronous", "CurrentSP0"), + vbar_entry!("IRQ", "CurrentSP0"), + vbar_entry!("FIQ", "CurrentSP0"), + vbar_entry!("SError", "CurrentSP0"), + vbar_entry!("Synchronous", "CurrentSPx"), + vbar_entry!("IRQ", "CurrentSPx"), + vbar_entry!("FIQ", "CurrentSPx"), + vbar_entry!("SError", "CurrentSPx"), + vbar_entry!("Synchronous", "LowerAArch64"), + vbar_entry!("IRQ", "LowerAArch64"), + vbar_entry!("FIQ", "LowerAArch64"), + vbar_entry!("SError", "LowerAArch64"), + vbar_entry!("Synchronous", "LowerAArch32"), + vbar_entry!("IRQ", "LowerAArch32"), + vbar_entry!("FIQ", "LowerAArch32"), + vbar_entry!("SError", "LowerAArch32"), + ctx_size = const size_of::(), + x30_off = const offset_of!(ExceptionContext, x) + 15 * 0x010, + handler = sym super::handle::handle_exception, + ExceptionType_Synchronous = const ExceptionType::Synchronous as u64, + ExceptionType_IRQ = const ExceptionType::IRQ as u64, + ExceptionType_FIQ = const ExceptionType::FIQ as u64, + ExceptionType_SError = const ExceptionType::SError as u64, + ExceptionFrom_CurrentSP0 = const ExceptionFrom::CurrentSP0 as u64, + ExceptionFrom_CurrentSPx = const ExceptionFrom::CurrentSPx as u64, + ExceptionFrom_LowerAArch64 = const ExceptionFrom::LowerAArch64 as u64, + ExceptionFrom_LowerAArch32 = const ExceptionFrom::LowerAArch32 as u64, +); diff --git a/src/hyperlight_guest_bin/src/arch/aarch64/exception/handle.rs b/src/hyperlight_guest_bin/src/arch/aarch64/exception/handle.rs new file mode 100644 index 000000000..464ccf9ae --- /dev/null +++ b/src/hyperlight_guest_bin/src/arch/aarch64/exception/handle.rs @@ -0,0 +1,233 @@ +/* +Copyright 2026 The 
Hyperlight Authors. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. + */ +use core::fmt::Write; + +use hyperlight_common::vmem::{ + BasicMapping, CowMapping, MappingKind, PAGE_SIZE, PhysAddr, VirtAddr, +}; +use hyperlight_guest::error::ErrorCode; +use hyperlight_guest::exit::write_abort; +use hyperlight_guest::layout::{MAIN_STACK_LIMIT_GVA, MAIN_STACK_TOP_GVA}; + +use super::super::mrs; +use super::types::*; +use crate::HyperlightAbortWriter; + +/// Utility function to extract an (inclusive on both ends) bit range +/// from a quadword. 
+#[inline(always)] +fn bits(x: u64) -> u64 { + (x & ((1 << (HIGH_BIT + 1)) - 1)) >> LOW_BIT +} + +const ESR_EC_DATA_ABORT_LOWER_EL: u64 = 0b100100; +const ESR_EC_DATA_ABORT_SAME_EL: u64 = 0b100101; + +// some of the data in these is not used presently, but is logically +// part of the code being decoded & should be accounted for +#[allow(dead_code)] +#[derive(Debug, Copy, Clone)] +enum DataFault { + TranslationFault(i64), + PermissionFault(i64), + Other(u64), +} +fn decode_data_fault(dfsc: u64) -> DataFault { + if bits::<5, 2>(dfsc) == 0b0011 { + DataFault::PermissionFault(bits::<1, 0>(dfsc) as i64) + } else if bits::<5, 2>(dfsc) == 0b0001 { + DataFault::TranslationFault(bits::<1, 0>(dfsc) as i64) + } else if bits::<5, 2>(dfsc) == 0b1010 { + if bits::<1, 0>(dfsc) >= 2 { + DataFault::TranslationFault(bits::<1, 0>(dfsc) as i64 - 4) + } else { + DataFault::Other(dfsc) + } + } else { + DataFault::Other(dfsc) + } +} + +// some of the data in these is not used presently, but is logically +// part of the code being decoded & should be accounted for +#[allow(dead_code)] +#[derive(Debug, Copy, Clone)] +enum Exception { + /// lower el?, faulting address, status code + DataFault(bool, u64, DataFault), + Other(u64), +} +fn decode_syndrome(esr: u64) -> Exception { + let ec = bits::<31, 26>(esr); + match ec { + ESR_EC_DATA_ABORT_LOWER_EL => Exception::DataFault( + true, + unsafe { mrs!(FAR_EL1) }, + decode_data_fault(bits::<5, 0>(esr)), + ), + ESR_EC_DATA_ABORT_SAME_EL => Exception::DataFault( + false, + unsafe { mrs!(FAR_EL1) }, + decode_data_fault(bits::<5, 0>(esr)), + ), + _ => Exception::Other(esr), + } +} + +fn handle_stack_fault(far: u64) { + // TODO: perhaps we should have a sanity check that the + // stack grows only one page at a time, which should be + // ensured by our stack probing discipline? 
+ unsafe { + let new_page = hyperlight_guest::prim_alloc::alloc_phys_pages(1); + crate::paging::map_region( + new_page, + (far & !((PAGE_SIZE - 1) as u64)) as *mut u8, + PAGE_SIZE as u64, + MappingKind::Basic(BasicMapping { + readable: true, + writable: true, + executable: false, + }), + ); + // We don't use crate::barrier::first_valid_same_ctx, because + // we don't (presently) use FEAT_ExS and consequently don't + // need the `isb`. + core::arch::asm!("dsb sy"); + } +} + +fn handle_cow_fault(_orig_phys: PhysAddr, virt: VirtAddr, perms: CowMapping) { + unsafe { + let new_page = hyperlight_guest::prim_alloc::alloc_phys_pages(1); + let target_virt = virt as *mut u8; + let Some(scratch_mapping_access) = crate::paging::phys_to_virt(new_page) else { + write_abort(&[ErrorCode::GuestError as u8, 0xfeu8]); + write_abort("impossible: phys_to_virt failed on alloc_phys_pages return".as_bytes()); + write_abort(&[0xFF]); + // At this point, write_abort with the 0xFF terminator is + // expected to terminate guest execution, so control + // should never reach beyond this call. + unreachable!(); + }; + core::ptr::copy(target_virt, scratch_mapping_access, PAGE_SIZE); + // todo(multithreading): this will definitely require a + // break-before-make sequence + crate::paging::map_region( + new_page, + target_virt, + PAGE_SIZE as u64, + MappingKind::Basic(BasicMapping { + // Inherit R bit from the original mapping (always 1 at the moment) + readable: perms.readable, + // If we got here, the original marking was marked + // CoW, so the copied mapping should always be + // writable + writable: true, + executable: perms.executable, + }), + ); + // This is updating an entry that was already valid, changing + // its OA, so we need to actually invalidate the TLB for it. 
+ core::arch::asm!(" + dsb ish + tlbi vae1is, {} + dsb ish + isb + ", + in(reg) (virt >> 12), + options(readonly, nostack, preserves_flags) + ); + } +} + +#[unsafe(no_mangle)] +pub extern "Rust" fn _debug_print(x: &str) { + hyperlight_guest::exit::debug_print(x); +} + +fn handle_internal_fault(exn: Exception) -> bool { + match exn { + Exception::DataFault(false, far, DataFault::TranslationFault(_)) => { + if (MAIN_STACK_LIMIT_GVA..MAIN_STACK_TOP_GVA).contains(&far) { + handle_stack_fault(far); + true + } else { + false + } + } + Exception::DataFault(false, far, DataFault::PermissionFault(_)) => { + let mut orig_mappings = crate::paging::virt_to_phys(far); + if let Some(mapping) = orig_mappings.next() + && let None = orig_mappings.next() + && let MappingKind::Cow(cm) = mapping.kind + { + handle_cow_fault(mapping.phys_base, mapping.virt_base, cm); + true + } else { + false + } + } + _ => false, + } +} + +pub(super) extern "C" fn handle_exception( + typ: ExceptionType, + from: ExceptionFrom, + _regs: *mut ExceptionContext, +) { + let esr = unsafe { mrs!(ESR_EL1) }; + + if typ == ExceptionType::Synchronous && from == ExceptionFrom::CurrentSP0 { + let exn = decode_syndrome(esr); + if handle_internal_fault(exn) { + return; + } + } + + // Die with some diagnostic information + let elr = unsafe { mrs!(ELR_EL1) }; + let far = unsafe { mrs!(FAR_EL1) }; + let insn_bytes = unsafe { (elr as *const [u8; 8]).read_volatile() }; + // amd64 provides the exception vector as the first byte of the + // abort sequence after the guest error identifier code, but the + // host doesn't use it for anything except printing an error + // message, so it's not really useful to try to find an analogue + // (e.g. we could use ESR_EL1.EC---but it's only used for + // debugging and we'll include the whole syndrome in the message + // anyway). So, use 0xfe which is invalid as an exception on x86, + // to let the host know not to try to print anything extra. 
+ let mut w = HyperlightAbortWriter; + write_abort(&[ErrorCode::GuestError as u8, 0xfe as u8]); + let write_res = write!( + w, + "Exception vector: {:?} {:?}\n\ + Faulting Instruction: {:#x}\n\ + Bytes At Faulting Instruction: {:?}\n\ + Faulting Address: {:#x}\n\ + Exception Syndrome: {:#x}", + from, typ, elr, insn_bytes, far, esr + ); + if write_res.is_err() { + write_abort("exception message format failed".as_bytes()); + } + + write_abort(&[0xFF]); + // At this point, write_abort with the 0xFF terminator is expected to terminate guest execution, + // so control should never reach beyond this call. + unreachable!(); +} diff --git a/src/hyperlight_guest_bin/src/arch/aarch64/exception/mod.rs b/src/hyperlight_guest_bin/src/arch/aarch64/exception/mod.rs new file mode 100644 index 000000000..89db2bb7f --- /dev/null +++ b/src/hyperlight_guest_bin/src/arch/aarch64/exception/mod.rs @@ -0,0 +1,19 @@ +/* +Copyright 2026 The Hyperlight Authors. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. + */ + +pub(super) mod entry; +pub mod handle; +mod types; diff --git a/src/hyperlight_guest_bin/src/arch/aarch64/exception/types.rs b/src/hyperlight_guest_bin/src/arch/aarch64/exception/types.rs new file mode 100644 index 000000000..a73d7a617 --- /dev/null +++ b/src/hyperlight_guest_bin/src/arch/aarch64/exception/types.rs @@ -0,0 +1,46 @@ +/* +Copyright 2026 The Hyperlight Authors. 
+ +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. + */ + +use core::mem::{offset_of, size_of}; + +#[derive(Debug, PartialEq)] +#[repr(u64)] +pub(super) enum ExceptionType { + Synchronous, + IRQ, + FIQ, + SError, +} + +#[derive(Debug, PartialEq)] +#[repr(u64)] +pub(super) enum ExceptionFrom { + CurrentSP0, + CurrentSPx, + LowerAArch64, + LowerAArch32, +} + +#[repr(C)] +pub(super) struct ExceptionContext { + pub(super) x: [u64; 31], + pub(super) fpcr: u64, + pub(super) fpsr: u64, + // No need to store main context SP: it's in SP_EL0 + pub(super) q: [u128; 32], +} +const _: () = assert!(size_of::().is_multiple_of(16)); +const _: () = assert!(offset_of!(ExceptionContext, fpsr) == offset_of!(ExceptionContext, fpcr) + 8); diff --git a/src/hyperlight_guest_bin/src/arch/aarch64/mod.rs b/src/hyperlight_guest_bin/src/arch/aarch64/mod.rs index 4af3f518a..7da984d51 100644 --- a/src/hyperlight_guest_bin/src/arch/aarch64/mod.rs +++ b/src/hyperlight_guest_bin/src/arch/aarch64/mod.rs @@ -16,20 +16,110 @@ limitations under the License. // TODO(aarch64): implement aarch64 guest runtime +const IO_PAGE_GVA: u64 = hyperlight_common::layout::io_page().unwrap().1; +const HLT_ADDR: u64 = IO_PAGE_GVA + + (core::mem::size_of::() as u64 * hyperlight_common::outb::VmAction::Halt as u64); + pub mod dispatch { - /// Dispatch function pointer — set during initialisation and called - /// by the host for each guest function invocation. 
- #[unsafe(no_mangle)] - pub extern "C" fn dispatch_function() { - unimplemented!("aarch64 dispatch_function") + unsafe extern "C" { + /// See comments in amd64/dispatch.rs for why this + /// architecture-dependent stub exists + /// + /// # ABI + /// + /// If a TLB flush is required, the host should start executing + /// one instruction (4 bytes) after the base address of the + /// dispatch function. + pub(crate) unsafe fn dispatch_function(); + } + core::arch::global_asm!(" + .global dispatch_function + dispatch_function: + .cfi_startproc\n + .cfi_undefined x30\n + b 0f\n + tlbi vmalle1\n + dsb ish\n + isb\n + 0:\n + bl {internal_dispatch_function}\n + ldr x1, ={hlt_addr}\n + str x0, [x1]\n + .cfi_endproc\n + ", + internal_dispatch_function = sym crate::guest_function::call::internal_dispatch_function, + hlt_addr = const super::HLT_ADDR, + ); +} + +mod exception; + +macro_rules! msr { + ($sysreg:ident, $expr:expr) => { + core::arch::asm!(concat!("msr ", core::stringify!($sysreg), ", {}"), in(reg) $expr); + } +} +pub(crate) use msr; +macro_rules! mrs { + ($sysreg:ident) => { + { + let x: u64; + core::arch::asm!(concat!("mrs {}, ", core::stringify!($sysreg)), out(reg) x); + x + } } } +pub(crate) use mrs; -/// The entrypoint for the guest binary — called by the hypervisor. -/// -/// On aarch64 this is a stub that will be implemented when the -/// aarch64 hypervisor backend is ready. +unsafe fn init_vbar() { + unsafe { + core::arch::asm!(" + adrp {tmp}, vbar\n + add {tmp}, {tmp}, :lo12:vbar\n + msr VBAR_EL1, {tmp}\n + ", tmp = out(reg) _); + } +} + +/// Machine-specific initialisation; calls [`crate::generic_init`] +/// once VBAR and the main stack have been set up #[unsafe(no_mangle)] -pub extern "C" fn entrypoint() -> ! { - unimplemented!("aarch64 entrypoint") +pub extern "C" fn entrypoint(peb_address: u64, seed: u64, ops: u64, max_log_level: u64) -> ! 
{ + unsafe { + init_vbar(); + let stack_top = crate::init::init_stack(); + pivot_stack(peb_address, seed, ops, max_log_level, stack_top); + } } + +unsafe extern "C" { + unsafe fn pivot_stack( + peb_address: u64, + seed: u64, + ops: u64, + max_log_level: u64, + stack_top: u64, + ) -> !; +} + +core::arch::global_asm!(" + .global pivot_stack\n + pivot_stack:\n + .cfi_startproc\n + .cfi_undefined x30\n + ldr x5, ={exn_stack}\n + msr SPSel, #1\n + mov sp, x5\n + msr SPSel, #0\n + mov sp, x4\n + bl {generic_init}\n + ldr x1, ={hlt_addr}\n + str x0, [x1]\n + .cfi_endproc\n +", + exn_stack = const (hyperlight_common::layout::SCRATCH_TOP_GVA as u64 + - hyperlight_common::layout::SCRATCH_TOP_EXN_STACK_OFFSET + + 1), + generic_init = sym crate::generic_init, + hlt_addr = const HLT_ADDR, +); diff --git a/src/hyperlight_guest_bin/src/arch/aarch64/paging.rs b/src/hyperlight_guest_bin/src/arch/aarch64/paging.rs new file mode 100644 index 000000000..1489ccd86 --- /dev/null +++ b/src/hyperlight_guest_bin/src/arch/aarch64/paging.rs @@ -0,0 +1,180 @@ +/* +Copyright 2025 The Hyperlight Authors. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
+*/ + +use hyperlight_common::vmem; +use hyperlight_guest::prim_alloc::alloc_phys_pages; + +use crate::arch::{mrs, msr}; +// TODO: This is not at all thread-safe atm + +#[derive(Copy, Clone)] +struct GuestMappingOperations { + scratch_base_gpa: u64, + scratch_base_gva: u64, +} +impl GuestMappingOperations { + fn new() -> Self { + Self { + scratch_base_gpa: hyperlight_guest::layout::scratch_base_gpa(), + scratch_base_gva: hyperlight_guest::layout::scratch_base_gva(), + } + } + fn try_phys_to_virt(&self, addr: u64) -> Option<*mut u8> { + if addr >= self.scratch_base_gpa { + Some((self.scratch_base_gva + (addr - self.scratch_base_gpa)) as *mut u8) + } else { + None + } + } + fn phys_to_virt(&self, addr: u64) -> *mut u8 { + self.try_phys_to_virt(addr) + .expect("phys_to_virt encountered snapshot non-PT page") + } +} +// for virt_to_phys +impl core::convert::AsRef for GuestMappingOperations { + fn as_ref(&self) -> &Self { + self + } +} +impl vmem::TableReadOps for GuestMappingOperations { + type TableAddr = u64; + fn entry_addr(addr: u64, offset: u64) -> u64 { + addr + offset + } + unsafe fn read_entry(&self, addr: u64) -> u64 { + let addr = self.phys_to_virt(addr); + unsafe { (addr as *mut u64).read_volatile() } + } + fn to_phys(addr: u64) -> u64 { + addr + } + fn from_phys(addr: u64) -> u64 { + addr + } + fn root_table(&self) -> u64 { + unsafe { mrs!(TTBR0_EL1) & !0xfff } + } +} + +impl vmem::TableOps for GuestMappingOperations { + // Currently, we don't actually move tables anywhere on amd64 + // because of issues with guest PTs in IPAs that are mapped + // readonly in Stage 2 translation. However, this code all works + // and will be re-enabled as soon as there is improved + // architecture/hypervisor support.
+ type TableMovability = vmem::MayMoveTable; + unsafe fn alloc_table(&self) -> u64 { + let page_addr = unsafe { alloc_phys_pages(1) }; + unsafe { + self.phys_to_virt(page_addr) + .write_bytes(0u8, vmem::PAGE_TABLE_SIZE); + // Make sure that the zero'ing writes are ordered with the + // subsequent write that will actually link this table + // into the hierarchy, so that the table walker can never + // read+cache a stale valid entry. See e.g. litmus test + // ROT.inv+dmbst in [1] + // + // [1] Ben Simner, Alasdair Armstrong, Jean + // Pichon-Pharabod, Christopher Pulte, Richard + // Grisenthwaite, and Peter Sewell. 2022. Relaxed + // virtual memory [extended version]. In: Proceedings + // of the 31st European Symposium on Systems + // Programming, ESOP 2022. + core::arch::asm!("dmb st"); + } + page_addr + } + unsafe fn write_entry(&self, addr: u64, entry: u64) -> Option { + unsafe { + (self.phys_to_virt(addr) as *mut u64).write_volatile(entry); + } + None + } + unsafe fn update_root(&self, new_root: u64) { + unsafe { + msr!(TTBR0_EL1, new_root); + } + } +} + +/// Assumption: all are page-aligned +/// # Safety +/// This function modifies pages backing a virtual memory range which is inherently unsafe w.r.t. +/// the Rust memory model. +/// When using this function note: +/// - No locking is performed before touching page table data structures, +/// as such do not use concurrently with any other page table operations +/// - TLB invalidation is not performed, +/// if previously-unmapped ranges are not being mapped, TLB invalidation may need to be performed afterwards. 
+pub unsafe fn map_region(phys_base: u64, virt_base: *mut u8, len: u64, kind: vmem::MappingKind) { + unsafe { + vmem::map( + &GuestMappingOperations::new(), + vmem::Mapping { + phys_base, + virt_base: virt_base as u64, + len, + kind, + user_accessible: false, + }, + ); + } +} + +pub fn virt_to_phys(gva: vmem::VirtAddr) -> impl Iterator { + unsafe { vmem::virt_to_phys::<_>(GuestMappingOperations::new(), gva, 1) } +} + +pub fn phys_to_virt(gpa: vmem::PhysAddr) -> Option<*mut u8> { + GuestMappingOperations::new().try_phys_to_virt(gpa) +} + +pub mod barrier { + /// # Architecture-specific (aarch64) notes + /// + /// I_WZCBG from [1]: + /// > When a translation table entry that generates a Translation + /// > fault, Address size fault, or Access flag fault is changed to + /// > one that does not fault, all of the following apply to + /// > software: + /// > - TLB invalidation is not required because an entry that + /// > generates one of the listed faults is never cached in a TLB. + /// > - A Context synchronization event is required to ensure that + /// the completed change to the translation table entry affects + /// subsequent instruction fetches. + /// + /// In theory, without FEAT_nTLBPA, there could be some subtlety + /// here if the physical memory location used for the descriptor + /// was previously used after the last TLBI to store a valid + /// descriptor. Hyperlight does not recycle page tables in a way + /// that would cause problems here. 
+ /// + /// [1] Arm Architecture Reference Manual for A-profile architecture + /// Chapter D8: The AArch64 Virtual Memory System Architecture + /// §D8.17 TLB maintenance + #[inline(always)] + pub fn first_valid_same_ctx() { + unsafe { + core::arch::asm!( + " + dsb ish + isb + " + ); + } + } +} diff --git a/src/hyperlight_guest_bin/src/arch/amd64/init.rs b/src/hyperlight_guest_bin/src/arch/amd64/init.rs index 073bd3a2f..4dfa7e2f8 100644 --- a/src/hyperlight_guest_bin/src/arch/amd64/init.rs +++ b/src/hyperlight_guest_bin/src/arch/amd64/init.rs @@ -92,7 +92,7 @@ unsafe fn init_tss(pc: *mut ProcCtrl) { let tss_ptr = &raw mut (*pc).tss; // copy byte by byte to avoid alignment issues let ist1_ptr = &raw mut (*tss_ptr).ist1 as *mut [u8; 8]; - let exn_stack = hyperlight_common::layout::MAX_GVA as u64 + let exn_stack = hyperlight_common::layout::SCRATCH_TOP_GVA as u64 - hyperlight_common::layout::SCRATCH_TOP_EXN_STACK_OFFSET + 1; ist1_ptr.write_volatile(exn_stack.to_ne_bytes()); @@ -104,28 +104,6 @@ unsafe fn init_tss(pc: *mut ProcCtrl) { } } -/// To initialise the main stack, we just pre-emptively map the first -/// page of it. -unsafe fn init_stack() -> u64 { - use hyperlight_guest::layout::MAIN_STACK_TOP_GVA; - let stack_top_page_base = (MAIN_STACK_TOP_GVA - 1) & !0xfff; - unsafe { - use hyperlight_common::vmem::{BasicMapping, MappingKind, PAGE_SIZE}; - crate::paging::map_region( - hyperlight_guest::prim_alloc::alloc_phys_pages(1), - stack_top_page_base as *mut u8, - PAGE_SIZE as u64, - MappingKind::Basic(BasicMapping { - readable: true, - writable: true, - executable: false, - }), - ); - crate::paging::barrier::first_valid_same_ctx(); - } - MAIN_STACK_TOP_GVA -} - /// Machine-specific initialisation; calls [`crate::generic_init`] /// once stack, CoW, etc have been set up. 
#[unsafe(no_mangle)] @@ -138,7 +116,7 @@ pub extern "C" fn entrypoint(peb_address: u64, seed: u64, ops: u64, max_log_leve init_gdt(pc); init_tss(pc); init_idt(pc); - let stack_top = init_stack(); + let stack_top = crate::init::init_stack(); // Architecture early init is complete! We pivot now to // executing on the main stack, and jump into generic diff --git a/src/hyperlight_guest_bin/src/arch/amd64/paging.rs b/src/hyperlight_guest_bin/src/arch/amd64/paging.rs new file mode 100644 index 000000000..8af130eec --- /dev/null +++ b/src/hyperlight_guest_bin/src/arch/amd64/paging.rs @@ -0,0 +1,202 @@ +/* +Copyright 2025 The Hyperlight Authors. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +*/ + +use core::arch::asm; + +use hyperlight_common::vmem; +use hyperlight_guest::prim_alloc::alloc_phys_pages; + +// TODO: This is not at all thread-safe atm +// TODO: A lot of code in this file uses inline assembly to load and +// store page table entries. It would be nice to use pointer +// volatile read/writes instead, but unfortunately we have a PTE +// at physical address 0, which is currently identity-mapped at +// virtual address 0, and Rust raw pointer operations can't be +// used to read/write from address 0. 
+ +#[derive(Copy, Clone)] +struct GuestMappingOperations { + scratch_base_gpa: u64, + scratch_base_gva: u64, +} +impl GuestMappingOperations { + fn new() -> Self { + Self { + scratch_base_gpa: hyperlight_guest::layout::scratch_base_gpa(), + scratch_base_gva: hyperlight_guest::layout::scratch_base_gva(), + } + } + fn try_phys_to_virt(&self, addr: u64) -> Option<*mut u8> { + if addr >= self.scratch_base_gpa { + Some((self.scratch_base_gva + (addr - self.scratch_base_gpa)) as *mut u8) + } else { + None + } + } + fn phys_to_virt(&self, addr: u64) -> *mut u8 { + self.try_phys_to_virt(addr) + .expect("phys_to_virt encountered snapshot non-PT page") + } +} +// for virt_to_phys +impl core::convert::AsRef for GuestMappingOperations { + fn as_ref(&self) -> &Self { + self + } +} +impl vmem::TableReadOps for GuestMappingOperations { + type TableAddr = u64; + fn entry_addr(addr: u64, offset: u64) -> u64 { + addr + offset + } + unsafe fn read_entry(&self, addr: u64) -> u64 { + let addr = self.phys_to_virt(addr); + let ret: u64; + unsafe { + asm!("mov {}, qword ptr [{}]", out(reg) ret, in(reg) addr); + } + ret + } + fn to_phys(addr: u64) -> u64 { + addr + } + fn from_phys(addr: u64) -> u64 { + addr + } + fn root_table(&self) -> u64 { + let pml4_base: u64; + unsafe { + asm!("mov {}, cr3", out(reg) pml4_base); + } + pml4_base & !0xfff + } +} + +impl vmem::TableOps for GuestMappingOperations { + // Currently, we don't actually move tables anywhere on amd64 + // because of issues with guest PTs in IPAs that are mapped + // readonly in Stage 2 translation. However, this code all works + // and will be re-enabled as soon as there is improved + // architecture/hypervisor support.
+ type TableMovability = vmem::MayMoveTable; + unsafe fn alloc_table(&self) -> u64 { + let page_addr = unsafe { alloc_phys_pages(1) }; + unsafe { + self.phys_to_virt(page_addr) + .write_bytes(0u8, vmem::PAGE_TABLE_SIZE) + }; + page_addr + } + unsafe fn write_entry(&self, addr: u64, entry: u64) -> Option { + let addr = self.phys_to_virt(addr); + unsafe { + asm!("mov qword ptr [{}], {}", in(reg) addr, in(reg) entry); + } + None + } + unsafe fn update_root(&self, new_root: u64) { + unsafe { + core::arch::asm!("mov cr3, {}", in(reg) ::to_phys(new_root)); + } + } +} + +/// Assumption: all are page-aligned +/// # Safety +/// This function modifies pages backing a virtual memory range which is inherently unsafe w.r.t. +/// the Rust memory model. +/// When using this function note: +/// - No locking is performed before touching page table data structures, +/// as such do not use concurrently with any other page table operations +/// - TLB invalidation is not performed, +/// if previously-unmapped ranges are not being mapped, TLB invalidation may need to be performed afterwards. +pub unsafe fn map_region(phys_base: u64, virt_base: *mut u8, len: u64, kind: vmem::MappingKind) { + unsafe { + vmem::map( + &GuestMappingOperations::new(), + vmem::Mapping { + phys_base, + virt_base: virt_base as u64, + len, + kind, + user_accessible: false, + }, + ); + } +} + +pub fn virt_to_phys(gva: vmem::VirtAddr) -> impl Iterator { + unsafe { vmem::virt_to_phys::<_>(GuestMappingOperations::new(), gva, 1) } +} + +pub fn phys_to_virt(gpa: vmem::PhysAddr) -> Option<*mut u8> { + GuestMappingOperations::new().try_phys_to_virt(gpa) +} + +/// Barriers that other code may need to use when updating page tables +pub mod barrier { + /// Call this function when a virtual address has just been made + /// valid for the first time after the last tlb invalidate that + /// affected it, and it will be used for the first time in the + /// same execution context as has made the modification. 
+ /// + /// On most architectures, TLBs will not cache invalid entries, so + /// this does not need to issue a TLB invalidation. However, it does need to + /// ensure coherency between the previous writes and any future + /// uses by a page table walker. + /// + /// # Architecture-specific (amd64) notes + /// + /// The exact details around page walk coherency on amd64 seem a + /// bit fuzzy. The Intel manual notes that a serialising + /// instruction is necessary specifically to synchronise table + /// walks performed during instruction fetch [1], but is + /// relatively quiet about other page walks. The AMD manual notes + /// [2] that "a table entry is allowed to be upgraded (by marking + /// it as present, or by removing its write, execute or supervisor + /// restrictions) without explicitly maintaining TLB coherency", + /// but only states that any upper-level TLB cache entries + /// will be flushed before re-walking to confirm the fault, which + /// does not clearly seem strong enough. + /// + /// In some limited testing, `mfence` typically seems to be + /// enough, but as it is not a serializing instruction on Intel + /// platforms, we assume it may not be quite good enough. `cpuid` + /// is likely to be very slow, since we are definitely running + /// under a hypervisor (and often even nested). Currently, for + /// simplicity's sake, this just copies cr0 to itself, but other + /// options (including the `serialize` instruction where + /// available) could be worth exploring.
+ /// + /// [1] Intel 64 and IA-32 Architectures Software Developer's Manual, Volume 3: System Programming Guide + /// Chapter 5: Paging + /// §5.10: Caching Translation Information + /// §5.10.4: Invalidation of TLBs and Paging-Structure Caches + /// §5.10.4.3: Optional Invalidation + /// [2] AMD64 Architecture Programmer's Manual, Volume 2: System Programming + /// Section 5: Page Translation and Protection + /// §5.5: Translation-Lookaside Buffer + /// §5.5.3: TLB Management + #[inline(always)] + pub fn first_valid_same_ctx() { + unsafe { + core::arch::asm!(" + mov rax, cr0 + mov cr0, rax + ", out("rax") _); + } + } +} diff --git a/src/hyperlight_guest_bin/src/init.rs b/src/hyperlight_guest_bin/src/init.rs new file mode 100644 index 000000000..824a51576 --- /dev/null +++ b/src/hyperlight_guest_bin/src/init.rs @@ -0,0 +1,38 @@ +/* +Copyright 2025 The Hyperlight Authors. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. + */ + +/// To initialise the main stack, we just pre-emptively map the first +/// page of it. 
We assume the architecture-specific exception handler +/// will allocate pages on fault as necessary +pub(crate) unsafe fn init_stack() -> u64 { + use hyperlight_common::vmem::{BasicMapping, MappingKind, PAGE_SIZE}; + use hyperlight_guest::layout::MAIN_STACK_TOP_GVA; + let stack_top_page_base = (MAIN_STACK_TOP_GVA - 1) & !(PAGE_SIZE as u64 - 1); + unsafe { + crate::paging::map_region( + hyperlight_guest::prim_alloc::alloc_phys_pages(1), + stack_top_page_base as *mut u8, + PAGE_SIZE as u64, + MappingKind::Basic(BasicMapping { + readable: true, + writable: true, + executable: false, + }), + ); + crate::paging::barrier::first_valid_same_ctx(); + } + MAIN_STACK_TOP_GVA +} diff --git a/src/hyperlight_guest_bin/src/lib.rs b/src/hyperlight_guest_bin/src/lib.rs index 450b54930..6b8039fac 100644 --- a/src/hyperlight_guest_bin/src/lib.rs +++ b/src/hyperlight_guest_bin/src/lib.rs @@ -51,7 +51,6 @@ pub mod error; pub mod guest_logger; pub mod host_comm; pub mod memory; -#[cfg(target_arch = "x86_64")] pub mod paging; /// Bridge between picolibc's POSIX expectations and the Hyperlight host. @@ -59,6 +58,9 @@ pub mod paging; #[cfg(feature = "libc")] mod libc_stubs; +/// Shared initialisation code used by multiple architectures +mod init; + /// Re-export the libc bindings from hyperlight-libc when the libc feature is enabled. #[cfg(feature = "libc")] pub use hyperlight_libc as libc; diff --git a/src/hyperlight_guest_bin/src/paging.rs b/src/hyperlight_guest_bin/src/paging.rs index 8af130eec..9b559f6c2 100644 --- a/src/hyperlight_guest_bin/src/paging.rs +++ b/src/hyperlight_guest_bin/src/paging.rs @@ -14,138 +14,11 @@ See the License for the specific language governing permissions and limitations under the License. */ -use core::arch::asm; - -use hyperlight_common::vmem; -use hyperlight_guest::prim_alloc::alloc_phys_pages; - -// TODO: This is not at all thread-safe atm -// TODO: A lot of code in this file uses inline assembly to load and -// store page table entries. 
It would be nice to use pointer -// volatile read/writes instead, but unfortunately we have a PTE -// at physical address 0, which is currently identity-mapped at -// virtual address 0, and Rust raw pointer operations can't be -// used to read/write from address 0. - -#[derive(Copy, Clone)] -struct GuestMappingOperations { - scratch_base_gpa: u64, - scratch_base_gva: u64, -} -impl GuestMappingOperations { - fn new() -> Self { - Self { - scratch_base_gpa: hyperlight_guest::layout::scratch_base_gpa(), - scratch_base_gva: hyperlight_guest::layout::scratch_base_gva(), - } - } - fn try_phys_to_virt(&self, addr: u64) -> Option<*mut u8> { - if addr >= self.scratch_base_gpa { - Some((self.scratch_base_gva + (addr - self.scratch_base_gpa)) as *mut u8) - } else { - None - } - } - fn phys_to_virt(&self, addr: u64) -> *mut u8 { - self.try_phys_to_virt(addr) - .expect("phys_to_virt encountered snapshot non-PT page") - } -} -// for virt_to_phys -impl core::convert::AsRef for GuestMappingOperations { - fn as_ref(&self) -> &Self { - self - } -} -impl vmem::TableReadOps for GuestMappingOperations { - type TableAddr = u64; - fn entry_addr(addr: u64, offset: u64) -> u64 { - addr + offset - } - unsafe fn read_entry(&self, addr: u64) -> u64 { - let addr = self.phys_to_virt(addr); - let ret: u64; - unsafe { - asm!("mov {}, qword ptr [{}]", out(reg) ret, in(reg) addr); - } - ret - } - fn to_phys(addr: u64) -> u64 { - addr - } - fn from_phys(addr: u64) -> u64 { - addr - } - fn root_table(&self) -> u64 { - let pml4_base: u64; - unsafe { - asm!("mov {}, cr3", out(reg) pml4_base); - } - pml4_base & !0xfff - } -} - -impl vmem::TableOps for GuestMappingOperations { - // Currently, we don't actually move tables anywhere on amd64 - // because of issues with guest PTs in IPAs that are mapped - // readonly in Stage 2 translation. However, this code all works - // and will re-enabled as soon as there is improved - // architecture/hypervisor support. 
- type TableMovability = vmem::MayMoveTable; - unsafe fn alloc_table(&self) -> u64 { - let page_addr = unsafe { alloc_phys_pages(1) }; - unsafe { - self.phys_to_virt(page_addr) - .write_bytes(0u8, vmem::PAGE_TABLE_SIZE) - }; - page_addr - } - unsafe fn write_entry(&self, addr: u64, entry: u64) -> Option { - let addr = self.phys_to_virt(addr); - unsafe { - asm!("mov qword ptr [{}], {}", in(reg) addr, in(reg) entry); - } - None - } - unsafe fn update_root(&self, new_root: u64) { - unsafe { - core::arch::asm!("mov cr3, {}", in(reg) ::to_phys(new_root)); - } - } -} - -/// Assumption: all are page-aligned -/// # Safety -/// This function modifies pages backing a virtual memory range which is inherently unsafe w.r.t. -/// the Rust memory model. -/// When using this function note: -/// - No locking is performed before touching page table data structures, -/// as such do not use concurrently with any other page table operations -/// - TLB invalidation is not performed, -/// if previously-unmapped ranges are not being mapped, TLB invalidation may need to be performed afterwards. 
-pub unsafe fn map_region(phys_base: u64, virt_base: *mut u8, len: u64, kind: vmem::MappingKind) { - unsafe { - vmem::map( - &GuestMappingOperations::new(), - vmem::Mapping { - phys_base, - virt_base: virt_base as u64, - len, - kind, - user_accessible: false, - }, - ); - } -} - -pub fn virt_to_phys(gva: vmem::VirtAddr) -> impl Iterator { - unsafe { vmem::virt_to_phys::<_>(GuestMappingOperations::new(), gva, 1) } -} - -pub fn phys_to_virt(gpa: vmem::PhysAddr) -> Option<*mut u8> { - GuestMappingOperations::new().try_phys_to_virt(gpa) -} +#[cfg_attr(target_arch = "x86_64", path = "arch/amd64/paging.rs")] +#[cfg_attr(target_arch = "aarch64", path = "arch/aarch64/paging.rs")] +mod arch; +pub use arch::{map_region, phys_to_virt, virt_to_phys}; /// Barriers that other code may need to use when updating page tables pub mod barrier { /// Call this function when a virtual address has just been made @@ -157,46 +30,7 @@ pub mod barrier { /// this does not need to issue a TLB. However, it does need to /// ensure coherency between the previous writes and any future /// uses by a page table walker. - /// - /// # Architecture-specific (amd64) notes - /// - /// The exact details around page walk coherency on amd64 seem a - /// bit fuzzy. The Intel manual notes that a serialising - /// instruction is necessary specifically to synchronise table - /// walks performed during instruction fetch [1], but is - /// relatively quiet about other page walks. The AMD manual notes - /// [2] that "a table entry is allowed to be upgraded (by marking - /// it as present, or by removing its write, execute or supervisor - /// restrictions) without explicitly maintaining TLB coherency", - /// but only states that TLB any upper-level TLB cache entries - /// will be flushed before re-walking to confirm the fault, which - /// does not clearly seem strong enough. 
- /// - /// In some limited testing, `mfence` typically seems to be - /// enough, but as it is not a serializing instruction on Intel - /// platforms, we assume it may not be quite good enough. `cpuid` - /// is likely to be very slow, since we are definitely running - /// under a hypervisor (and often even nested). Currently, for - /// simplicity's sake, this just copies cr0 to itself, but other - /// options (including the `serialize` instruction where - /// available) could be worth exploring. - /// - /// [1] Intel 64 and IA-32 Architectures Software Developer's Manual, Volume 3: System Programming Guide - /// Chapter 5: Paging - /// §5.10: Caching Translation Information - /// §5.10.4: Invalidation of TLBs and Paging-Structure Caches - /// §5.10.4.3: Optional Invalidation - /// [2] AMD64 Architecture Programmer's Manual, Volume 2: System Programming - /// Section 5: Page Translation and Protection - /// §5.5: Translation-Lookaside Buffer - /// §5.5.3: TLB Management - #[inline(always)] - pub fn first_valid_same_ctx() { - unsafe { - core::arch::asm!(" - mov rax, cr0 - mov cr0, rax - ", out("rax") _); - } - } + pub use arch::first_valid_same_ctx; + + use super::arch::barrier as arch; } diff --git a/src/hyperlight_host/src/hypervisor/hyperlight_vm/aarch64.rs b/src/hyperlight_host/src/hypervisor/hyperlight_vm/aarch64.rs index 42dbb7aeb..8ce5ea8b3 100644 --- a/src/hyperlight_host/src/hypervisor/hyperlight_vm/aarch64.rs +++ b/src/hyperlight_host/src/hypervisor/hyperlight_vm/aarch64.rs @@ -17,6 +17,7 @@ limitations under the License. 
// TODO(aarch64): implement arch-specific HyperlightVm methods use std::sync::Arc; +use std::sync::atomic::{AtomicBool, AtomicU8, AtomicU64}; use super::{ AccessPageTableError, CreateHyperlightVmError, DispatchGuestCallError, HyperlightVm, @@ -24,9 +25,17 @@ use super::{ }; #[cfg(gdb)] use crate::hypervisor::gdb::{DebugCommChannel, DebugMsg, DebugResponse}; -use crate::hypervisor::regs::CommonSpecialRegisters; -use crate::hypervisor::virtual_machine::RegisterError; -use crate::mem::mgr::SandboxMemoryManager; +use crate::hypervisor::hyperlight_vm::get_guest_log_filter; +use crate::hypervisor::regs::{CommonFpu, CommonRegisters, CommonSpecialRegisters}; +#[cfg(kvm)] +use crate::hypervisor::virtual_machine::kvm::KvmVm; +#[cfg(kvm)] +use crate::hypervisor::virtual_machine::{HypervisorType, VmError}; +use crate::hypervisor::virtual_machine::{ + ResetVcpuError, VirtualMachine, get_available_hypervisor, +}; +use crate::hypervisor::{InterruptHandleImpl, LinuxInterruptHandle}; +use crate::mem::mgr::{SandboxMemoryManager, SnapshotSharedMemory}; use crate::mem::shared_mem::{GuestSharedMemory, HostSharedMemory}; use crate::sandbox::SandboxConfiguration; use crate::sandbox::host_funcs::FunctionRegistry; @@ -39,61 +48,180 @@ use crate::sandbox::uninitialized::SandboxRuntimeConfig; impl HyperlightVm { #[allow(clippy::too_many_arguments)] pub(crate) fn new( - _snapshot_mem: GuestSharedMemory, - _scratch_mem: GuestSharedMemory, - _root_pt_addr: u64, - _entrypoint: NextAction, - _rsp_gva: u64, - _config: &SandboxConfiguration, + snapshot_mem: SnapshotSharedMemory, + scratch_mem: GuestSharedMemory, + root_pt_addr: u64, + entrypoint: NextAction, + rsp_gva: u64, + page_size: usize, + config: &SandboxConfiguration, #[cfg(gdb)] _gdb_conn: Option>, #[cfg(crashdump)] _rt_cfg: SandboxRuntimeConfig, #[cfg(feature = "mem_profile")] _trace_info: MemTraceInfo, ) -> std::result::Result { - unimplemented!("new") + // TODO: support gdb on aarch64 + type VmType = Box; + let vm: VmType = match 
get_available_hypervisor() { + #[cfg(kvm)] + Some(HypervisorType::Kvm) => Box::new(KvmVm::new().map_err(VmError::CreateVm)?), + // TODO: mshv support + #[cfg(mshv3)] + Some(HypervisorType::Mshv) => return Err(CreateHyperlightVmError::NoHypervisorFound), + None => return Err(CreateHyperlightVmError::NoHypervisorFound), + }; + vm.set_sregs(&CommonSpecialRegisters::defaults(root_pt_addr)) + .map_err(VmError::Register)?; + let interrupt_handle: Arc = Arc::new(LinuxInterruptHandle { + state: AtomicU8::new(0), + tid: AtomicU64::new(unsafe { libc::pthread_self() as u64 }), + retry_delay: config.get_interrupt_retry_delay(), + sig_rt_min_offset: config.get_interrupt_vcpu_sigrtmin_offset(), + dropped: AtomicBool::new(false), + }); + + let snapshot_slot = 0u32; + let scratch_slot = 1u32; + let vm_can_reset_vcpu = vm.can_reset_vcpu(); + let mut ret = Self { + vm, + entrypoint, + rsp_gva, + interrupt_handle, + page_size, + + next_slot: scratch_slot + 1, + freed_slots: Vec::new(), + + snapshot_slot, + snapshot_memory: None, + scratch_slot, + scratch_memory: None, + + mmap_regions: Vec::new(), + + vm_can_reset_vcpu, + pending_tlb_flush: false, + }; + ret.update_snapshot_mapping(snapshot_mem)?; + ret.update_scratch_mapping(scratch_mem)?; + Ok(ret) } #[allow(clippy::too_many_arguments)] pub(crate) fn initialise( &mut self, - _peb_addr: crate::mem::ptr::RawPtr, - _seed: u64, - _page_size: u32, - _mem_mgr: &mut SandboxMemoryManager, - _host_funcs: &Arc>, - _guest_max_log_level: Option, - #[cfg(gdb)] _dbg_mem_access_fn: Arc< + peb_addr: crate::mem::ptr::RawPtr, + seed: u64, + mem_mgr: &mut SandboxMemoryManager, + host_funcs: &Arc>, + guest_max_log_level: Option, + #[cfg(gdb)] dbg_mem_access_fn: Arc< std::sync::Mutex>, >, ) -> Result<(), InitializeError> { - unimplemented!("initialise") + let NextAction::Initialise(initialise) = self.entrypoint else { + return Ok(()); + }; + let mut x: [u64; 31] = [0; 31]; + x[0] = peb_addr.into(); + x[1] = seed; + x[2] = self.page_size as u64; + x[3] 
= get_guest_log_filter(guest_max_log_level); + let regs = CommonRegisters { + pc: initialise, + sp: self.rsp_gva, + x, + // start up with interrupts disabled in EL1t + pstate: 0b11 << 6 | 0b100, + }; + self.vm.set_regs(&regs)?; + + self.run( + mem_mgr, + host_funcs, + #[cfg(gdb)] + dbg_mem_access_fn, + ) + .map_err(InitializeError::Run)?; + + let regs = self.vm.regs()?; + if !regs.sp.is_multiple_of(16) { + return Err(InitializeError::InvalidStackPointer(regs.sp)); + } + self.rsp_gva = regs.sp; + self.entrypoint = NextAction::Call(regs.x[0]); + + Ok(()) } pub(crate) fn dispatch_call_from_host( &mut self, - _mem_mgr: &mut SandboxMemoryManager, - _host_funcs: &Arc>, + mem_mgr: &mut SandboxMemoryManager, + host_funcs: &Arc>, #[cfg(gdb)] _dbg_mem_access_fn: Arc< std::sync::Mutex>, >, ) -> Result<(), DispatchGuestCallError> { - unimplemented!("dispatch_call_from_host") + let NextAction::Call(dispatch_func_addr) = self.entrypoint else { + return Err(DispatchGuestCallError::Uninitialized); + }; + let mut regs = CommonRegisters { + pc: dispatch_func_addr, + sp: self.rsp_gva, + // start with interrupts disabled in EL1t + pstate: 0b1 << 21 | 0b11 << 6 | 0b100, + ..Default::default() + }; + if self.pending_tlb_flush { + regs.pc += 4; + } + self.vm + .set_regs(&regs) + .map_err(DispatchGuestCallError::SetupRegs)?; + self.vm + .set_fpu(&CommonFpu::default()) + .map_err(DispatchGuestCallError::SetupRegs)?; + let result = self + .run( + mem_mgr, + host_funcs, + #[cfg(gdb)] + mem_access_fn, + ) + .map_err(DispatchGuestCallError::Run); + self.pending_tlb_flush = false; + result } pub(crate) fn get_root_pt(&self) -> Result { - unimplemented!("get_root_pt") + let sregs = self.vm.sregs()?; + Ok(sregs.ttbr0_el1 & ((1 << 48) - 2)) } pub(crate) fn get_snapshot_sregs( &mut self, ) -> Result { - unimplemented!("get_snapshot_sregs") + let x = self.vm.sregs()?; + Ok(x) } pub(crate) fn reset_vcpu( &mut self, - _cr3: u64, - _sregs: &CommonSpecialRegisters, - ) -> std::result::Result<(), 
RegisterError> { - unimplemented!("reset_vcpu") + cr3: u64, + sregs: &CommonSpecialRegisters, + ) -> std::result::Result<(), ResetVcpuError> { + self.pending_tlb_flush = true; + debug_assert!( + self.vm_can_reset_vcpu, + "No fallback path for vcpu reset on aarch64" + ); + self.vm.reset_vcpu()?; + let mut sregs = *sregs; + sregs.ttbr0_el1 = cr3 & ((1 << 48) - 2); + + self.vm + .set_sregs(&sregs) + .map_err(ResetVcpuError::Register)?; + Ok(()) } } diff --git a/src/hyperlight_host/src/hypervisor/hyperlight_vm/mod.rs b/src/hyperlight_host/src/hypervisor/hyperlight_vm/mod.rs index 830b856c0..f4e8f983b 100644 --- a/src/hyperlight_host/src/hypervisor/hyperlight_vm/mod.rs +++ b/src/hyperlight_host/src/hypervisor/hyperlight_vm/mod.rs @@ -43,7 +43,7 @@ use crate::hypervisor::hyperlight_vm::x86_64::debug::ProcessDebugRequestError; #[cfg(not(gdb))] use crate::hypervisor::virtual_machine::VirtualMachine; use crate::hypervisor::virtual_machine::{ - MapMemoryError, RegisterError, RunVcpuError, UnmapMemoryError, VmError, VmExit, + MapMemoryError, RegisterError, ResetVcpuError, RunVcpuError, UnmapMemoryError, VmError, VmExit, }; use crate::hypervisor::{InterruptHandle, InterruptHandleImpl}; use crate::mem::memory_region::{MemoryRegion, MemoryRegionFlags, MemoryRegionType}; @@ -344,7 +344,7 @@ pub enum HyperlightVmError { #[error("Map region error: {0}")] MapRegion(#[from] MapRegionError), #[error("Restore VM (vcpu) error: {0}")] - Restore(#[from] RegisterError), + Restore(#[from] ResetVcpuError), #[error("Unmap region error: {0}")] UnmapRegion(#[from] UnmapRegionError), #[error("Update region error: {0}")] @@ -383,6 +383,7 @@ pub(crate) struct HyperlightVm { pub(super) mmap_regions: Vec<(u32, MemoryRegion)>, // Later mapped regions (slot number, region) + pub(self) vm_can_reset_vcpu: bool, pub(super) pending_tlb_flush: bool, #[cfg(gdb)] diff --git a/src/hyperlight_host/src/hypervisor/hyperlight_vm/x86_64.rs b/src/hyperlight_host/src/hypervisor/hyperlight_vm/x86_64.rs index 
16ac55ad3..281696092 100644 --- a/src/hyperlight_host/src/hypervisor/hyperlight_vm/x86_64.rs +++ b/src/hyperlight_host/src/hypervisor/hyperlight_vm/x86_64.rs @@ -162,6 +162,7 @@ impl HyperlightVm { mmap_regions: Vec::new(), + vm_can_reset_vcpu: false, pending_tlb_flush: false, #[cfg(gdb)] @@ -202,7 +203,6 @@ impl HyperlightVm { &mut self, peb_addr: RawPtr, seed: u64, - page_size: u32, mem_mgr: &mut SandboxMemoryManager, host_funcs: &Arc>, guest_max_log_level: Option, @@ -225,7 +225,7 @@ impl HyperlightVm { // function args rdi: peb_addr.into(), rsi: seed, - rdx: page_size.into(), + rdx: self.page_size as u64, rcx: get_guest_log_filter(guest_max_log_level), rflags: 1 << 1, @@ -338,7 +338,7 @@ impl HyperlightVm { &mut self, cr3: u64, sregs: &CommonSpecialRegisters, - ) -> std::result::Result<(), RegisterError> { + ) -> std::result::Result<(), ResetVcpuError> { self.vm.set_regs(&CommonRegisters { rflags: 1 << 1, // Reserved bit always set ..Default::default() @@ -346,7 +346,9 @@ impl HyperlightVm { self.vm.set_debug_regs(&CommonDebugRegs::default())?; self.vm.reset_xsave()?; - self.apply_sregs(cr3, sregs) + self.apply_sregs(cr3, sregs)?; + + Ok(()) } /// Apply special registers and mark TLB for flush. @@ -1499,7 +1501,7 @@ mod tests { let (mut hshm, gshm) = mem_mgr.build().unwrap(); let peb_address = gshm.layout.peb_address(); - let stack_top_gva = hyperlight_common::layout::MAX_GVA as u64 + let stack_top_gva = hyperlight_common::layout::SCRATCH_TOP_GVA as u64 - hyperlight_common::layout::SCRATCH_TOP_EXN_STACK_OFFSET + 1; let mut vm = set_up_hypervisor_partition( @@ -1525,7 +1527,6 @@ mod tests { vm.initialise( peb_addr, seed, - page_size, &mut hshm, &host_funcs, None, @@ -2112,7 +2113,7 @@ mod tests { /// Get the stack top GVA, same as the regular codepath. 
fn stack_top_gva(&self) -> u64 { - hyperlight_common::layout::MAX_GVA as u64 + hyperlight_common::layout::SCRATCH_TOP_GVA as u64 - hyperlight_common::layout::SCRATCH_TOP_EXN_STACK_OFFSET + 1 } diff --git a/src/hyperlight_host/src/hypervisor/mod.rs b/src/hyperlight_host/src/hypervisor/mod.rs index be1a15c22..2b33000ea 100644 --- a/src/hyperlight_host/src/hypervisor/mod.rs +++ b/src/hyperlight_host/src/hypervisor/mod.rs @@ -487,7 +487,7 @@ pub(crate) mod tests { let sandbox = UninitializedSandbox::new(GuestBinary::FilePath(filename.clone()), Some(config))?; let (mut mem_mgr, gshm) = sandbox.mgr.build().unwrap(); - let exn_stack_top_gva = hyperlight_common::layout::MAX_GVA as u64 + let exn_stack_top_gva = hyperlight_common::layout::SCRATCH_TOP_GVA as u64 - hyperlight_common::layout::SCRATCH_TOP_EXN_STACK_OFFSET + 1; let mut vm = set_up_hypervisor_partition( @@ -514,7 +514,6 @@ pub(crate) mod tests { vm.initialise( peb_addr, seed, - page_size, &mut mem_mgr, &host_funcs, guest_max_log_level, diff --git a/src/hyperlight_host/src/hypervisor/regs/aarch64/common_fpu.rs b/src/hyperlight_host/src/hypervisor/regs/aarch64/common_fpu.rs new file mode 100644 index 000000000..8cecc067c --- /dev/null +++ b/src/hyperlight_host/src/hypervisor/regs/aarch64/common_fpu.rs @@ -0,0 +1,6 @@ +#[derive(Debug, Default, Copy, Clone, PartialEq)] +pub(crate) struct CommonFpu { + pub(crate) v: [u128; 32], + pub(crate) fpsr: u32, + pub(crate) fpcr: u32, +} diff --git a/src/hyperlight_host/src/hypervisor/regs/aarch64/common_regs.rs b/src/hyperlight_host/src/hypervisor/regs/aarch64/common_regs.rs new file mode 100644 index 000000000..e15dfc0c4 --- /dev/null +++ b/src/hyperlight_host/src/hypervisor/regs/aarch64/common_regs.rs @@ -0,0 +1,7 @@ +#[derive(Debug, Default, Copy, Clone, PartialEq)] +pub(crate) struct CommonRegisters { + pub(crate) x: [u64; 31], + pub(crate) sp: u64, + pub(crate) pc: u64, + pub(crate) pstate: u64, +} diff --git a/src/hyperlight_host/src/hypervisor/regs/aarch64/fpu.rs 
b/src/hyperlight_host/src/hypervisor/regs/aarch64/fpu.rs new file mode 100644 index 000000000..4c1757a36 --- /dev/null +++ b/src/hyperlight_host/src/hypervisor/regs/aarch64/fpu.rs @@ -0,0 +1,6 @@ +#[derive(Debug, Default, Copy, Clone, PartialEq1)] +pub(crate) struct CommonFpu { + pub(crate) v: [u128; 32], + pub(crate) fpsr: u32, + pub(crate) fpcr: u32, +} diff --git a/src/hyperlight_host/src/hypervisor/regs/aarch64/kvm_reg.rs b/src/hyperlight_host/src/hypervisor/regs/aarch64/kvm_reg.rs new file mode 100644 index 000000000..c6d5a51e9 --- /dev/null +++ b/src/hyperlight_host/src/hypervisor/regs/aarch64/kvm_reg.rs @@ -0,0 +1,162 @@ +use kvm_bindings::{ + KVM_REG_ARM64, KVM_REG_ARM64_SYSREG, KVM_REG_ARM64_SYSREG_CRM_MASK, + KVM_REG_ARM64_SYSREG_CRM_SHIFT, KVM_REG_ARM64_SYSREG_CRN_MASK, KVM_REG_ARM64_SYSREG_CRN_SHIFT, + KVM_REG_ARM64_SYSREG_OP0_MASK, KVM_REG_ARM64_SYSREG_OP0_SHIFT, KVM_REG_ARM64_SYSREG_OP1_MASK, + KVM_REG_ARM64_SYSREG_OP1_SHIFT, KVM_REG_ARM64_SYSREG_OP2_MASK, KVM_REG_ARM64_SYSREG_OP2_SHIFT, + KVM_REG_SIZE_U32, KVM_REG_SIZE_U64, KVM_REG_SIZE_U128, +}; +use kvm_ioctls::VcpuFd; + +enum Size { + U32, + U64, + U128, +} +const fn size_kvm_bits(s: Size) -> u64 { + match s { + Size::U32 => KVM_REG_SIZE_U32, + Size::U64 => KVM_REG_SIZE_U64, + Size::U128 => KVM_REG_SIZE_U128, + } +} +const fn kvm_sys_reg(op0: u8, op1: u8, crn: u8, crm: u8, op2: u8, s: Size) -> u64 { + KVM_REG_ARM64 + | (KVM_REG_ARM64_SYSREG as u64) + | (((op0 as u64) << KVM_REG_ARM64_SYSREG_OP0_SHIFT) & KVM_REG_ARM64_SYSREG_OP0_MASK as u64) + | (((op1 as u64) << KVM_REG_ARM64_SYSREG_OP1_SHIFT) & KVM_REG_ARM64_SYSREG_OP1_MASK as u64) + | (((crn as u64) << KVM_REG_ARM64_SYSREG_CRN_SHIFT) & KVM_REG_ARM64_SYSREG_CRN_MASK as u64) + | (((crm as u64) << KVM_REG_ARM64_SYSREG_CRM_SHIFT) & KVM_REG_ARM64_SYSREG_CRM_MASK as u64) + | (((op2 as u64) << KVM_REG_ARM64_SYSREG_OP2_SHIFT) & KVM_REG_ARM64_SYSREG_OP2_MASK as u64) + | size_kvm_bits(s) +} +macro_rules! 
decl_sys_reg { + ($name:ident, $op0:expr, $op1:expr, $crn:expr, $crm:expr, $op2:expr, $size:ident) => { + pub const $name: u64 = kvm_sys_reg($op0, $op1, $crn, $crm, $op2, Size::$size); + }; +} +decl_sys_reg!(TTBR0_EL1, 0b11, 0b000, 0b0010, 0b0000, 0b000, U64); +decl_sys_reg!(TCR_EL1, 0b11, 0b000, 0b0010, 0b0000, 0b010, U64); +decl_sys_reg!(MAIR_EL1, 0b11, 0b000, 0b1010, 0b0010, 0b000, U64); +decl_sys_reg!(SCTLR_EL1, 0b11, 0b000, 0b0001, 0b0000, 0b000, U64); +decl_sys_reg!(CPACR_EL1, 0b11, 0b000, 0b0001, 0b0000, 0b010, U64); +decl_sys_reg!(VBAR_EL1, 0b11, 0b000, 0b1100, 0b0000, 0b000, U64); + +const fn kvm_core_reg(offset: u8, s: Size) -> u64 { + KVM_REG_ARM64 | 0x10_0000u64 | offset as u64 | size_kvm_bits(s) +} +macro_rules! decl_core_reg { + ($name:ident, $offset:expr, $size:ident) => { + pub const $name: u64 = kvm_core_reg($offset, Size::$size); + }; +} +decl_core_reg!(X0, 0x00, U64); +decl_core_reg!(X1, 0x02, U64); +decl_core_reg!(X2, 0x04, U64); +decl_core_reg!(X3, 0x06, U64); +decl_core_reg!(X4, 0x08, U64); +decl_core_reg!(X5, 0x0A, U64); +decl_core_reg!(X6, 0x0C, U64); +decl_core_reg!(X7, 0x0E, U64); +decl_core_reg!(X8, 0x10, U64); +decl_core_reg!(X9, 0x12, U64); +decl_core_reg!(X10, 0x14, U64); +decl_core_reg!(X11, 0x16, U64); +decl_core_reg!(X12, 0x18, U64); +decl_core_reg!(X13, 0x1A, U64); +decl_core_reg!(X14, 0x1C, U64); +decl_core_reg!(X15, 0x1E, U64); +decl_core_reg!(X16, 0x20, U64); +decl_core_reg!(X17, 0x22, U64); +decl_core_reg!(X18, 0x24, U64); +decl_core_reg!(X19, 0x26, U64); +decl_core_reg!(X20, 0x28, U64); +decl_core_reg!(X21, 0x2A, U64); +decl_core_reg!(X22, 0x2C, U64); +decl_core_reg!(X23, 0x2E, U64); +decl_core_reg!(X24, 0x30, U64); +decl_core_reg!(X25, 0x32, U64); +decl_core_reg!(X26, 0x34, U64); +decl_core_reg!(X27, 0x36, U64); +decl_core_reg!(X28, 0x38, U64); +decl_core_reg!(X29, 0x3A, U64); +decl_core_reg!(X30, 0x3C, U64); +decl_core_reg!(SP, 0x3E, U64); +decl_core_reg!(PC, 0x40, U64); +decl_core_reg!(PSTATE, 0x42, U64); 
+decl_core_reg!(SP_EL1, 0x44, U64); +// ignore the other SPSRs that are just for AA32-compat +decl_core_reg!(V0, 0x54, U128); +decl_core_reg!(V1, 0x58, U128); +decl_core_reg!(V2, 0x5c, U128); +decl_core_reg!(V3, 0x60, U128); +decl_core_reg!(V4, 0x64, U128); +decl_core_reg!(V5, 0x68, U128); +decl_core_reg!(V6, 0x6c, U128); +decl_core_reg!(V7, 0x70, U128); +decl_core_reg!(V8, 0x74, U128); +decl_core_reg!(V9, 0x78, U128); +decl_core_reg!(V10, 0x7c, U128); +decl_core_reg!(V11, 0x80, U128); +decl_core_reg!(V12, 0x84, U128); +decl_core_reg!(V13, 0x88, U128); +decl_core_reg!(V14, 0x8c, U128); +decl_core_reg!(V15, 0x90, U128); +decl_core_reg!(V16, 0x94, U128); +decl_core_reg!(V17, 0x98, U128); +decl_core_reg!(V18, 0x9c, U128); +decl_core_reg!(V19, 0xa0, U128); +decl_core_reg!(V20, 0xa4, U128); +decl_core_reg!(V21, 0xa8, U128); +decl_core_reg!(V22, 0xac, U128); +decl_core_reg!(V23, 0xb0, U128); +decl_core_reg!(V24, 0xb4, U128); +decl_core_reg!(V25, 0xb8, U128); +decl_core_reg!(V26, 0xbc, U128); +decl_core_reg!(V27, 0xc0, U128); +decl_core_reg!(V28, 0xc4, U128); +decl_core_reg!(V29, 0xc8, U128); +decl_core_reg!(V30, 0xcc, U128); +decl_core_reg!(V31, 0xd0, U128); +decl_core_reg!(FPSR, 0xd4, U32); +decl_core_reg!(FPCR, 0xd4, U32); + +pub(crate) fn get_reg_bytes( + fd: &VcpuFd, + id: u64, + err: impl Fn(kvm_ioctls::Error) -> E, +) -> Result<[u8; N], E> { + let mut buf: [u8; N] = [0; N]; + fd.get_one_reg(id, &mut buf).map_err(err)?; + Ok(buf) +} +macro_rules! get_reg { + ($fd:expr, $err:expr, $reg:ident, $t:ident) => { + $crate::hypervisor::regs::kvm_reg::get_reg_bytes::<{ core::mem::size_of::<$t>() }, _>( + $fd, + $crate::hypervisor::regs::kvm_reg::$reg, + $err, + ) + .map($t::from_ne_bytes) + }; +} +pub(crate) use get_reg; +pub(crate) fn set_reg_bytes( + fd: &VcpuFd, + err: impl Fn(kvm_ioctls::Error) -> E, + id: u64, + bytes: [u8; N], +) -> Result<(), E> { + fd.set_one_reg(id, &bytes).map_err(err)?; + Ok(()) +} +macro_rules! 
set_reg { + ($fd:expr, $err:expr, $reg:ident, $t:ident, $val:expr) => { + $crate::hypervisor::regs::kvm_reg::set_reg_bytes::<{ core::mem::size_of::<$t>() }, _>( + $fd, + $err, + $crate::hypervisor::regs::kvm_reg::$reg, + $val.to_ne_bytes(), + ) + }; +} +pub(crate) use set_reg; diff --git a/src/hyperlight_host/src/hypervisor/regs/aarch64/mod.rs b/src/hyperlight_host/src/hypervisor/regs/aarch64/mod.rs index 8f91c634d..2f7331dbf 100644 --- a/src/hyperlight_host/src/hypervisor/regs/aarch64/mod.rs +++ b/src/hyperlight_host/src/hypervisor/regs/aarch64/mod.rs @@ -16,22 +16,19 @@ limitations under the License. // TODO(aarch64): implement real register definitions -#[derive(Debug, Default, Copy, Clone, PartialEq)] -pub(crate) struct CommonRegisters { - _placeholder: u64, -} +mod common_regs; +pub(crate) use common_regs::*; -#[derive(Debug, Default, Copy, Clone, PartialEq)] -pub(crate) struct CommonSpecialRegisters { - _placeholder: u64, -} +mod special_regs; +pub(crate) use special_regs::*; -#[derive(Debug, Default, Copy, Clone, PartialEq)] -pub(crate) struct CommonFpu { - _placeholder: u64, -} +mod common_fpu; +pub(crate) use common_fpu::*; #[derive(Debug, Default, Copy, Clone, PartialEq)] pub(crate) struct CommonDebugRegs { _placeholder: u64, } + +#[cfg(kvm)] +pub(crate) mod kvm_reg; diff --git a/src/hyperlight_host/src/hypervisor/regs/aarch64/special_regs.rs b/src/hyperlight_host/src/hypervisor/regs/aarch64/special_regs.rs new file mode 100644 index 000000000..159d544ec --- /dev/null +++ b/src/hyperlight_host/src/hypervisor/regs/aarch64/special_regs.rs @@ -0,0 +1,46 @@ +#[derive(Debug, Default, Copy, Clone, PartialEq)] +pub(crate) struct CommonSpecialRegisters { + pub(crate) ttbr0_el1: u64, + // todo: handle ttbr1 as well + pub(crate) tcr_el1: u64, + pub(crate) mair_el1: u64, + pub(crate) sctlr_el1: u64, + pub(crate) cpacr_el1: u64, + pub(crate) vbar_el1: u64, + pub(crate) sp_el1: u64, +} + +pub(crate) const TCR_EL1_PS_48: u64 = 0b101u64 << 32; +pub(crate) const 
TCR_EL1_TG0_4K: u64 = 0b00u64 << 14; +pub(crate) const TCR_EL1_TG1_4K: u64 = 0b00u64 << 30; +#[allow(clippy::identity_op)] +pub(crate) const TCR_EL1_T0SZ_48: u64 = 16u64 << 0; +pub(crate) const TCR_EL1_T1SZ_48: u64 = 16u64 << 16; + +pub(crate) const MAIR_NORMAL_OWT_NT_AA: u64 = 0b10111011; +pub(crate) const MAIR_ITEM_WIDTH: u8 = 8; + +pub(crate) const SCTLR_EL1_RES1: u64 = 0b11u64 << 28 | 0b11u64 << 22 | 0b1u64 << 20 | 0b1u64 << 11; +pub(crate) const SCTLR_EL1_M: u64 = 0b1u64 << 0; +pub(crate) const SCTLR_EL1_C: u64 = 0b1u64 << 2; + +pub(crate) const CPACR_EL1_FPEN_NO_TRAP: u64 = 0b11 << 20; + +impl CommonSpecialRegisters { + pub(crate) fn defaults(root_pt_addr: u64) -> Self { + CommonSpecialRegisters { + ttbr0_el1: root_pt_addr & !0xfff, + tcr_el1: TCR_EL1_PS_48 + | TCR_EL1_TG0_4K + | TCR_EL1_TG1_4K + | TCR_EL1_T0SZ_48 + | TCR_EL1_T1SZ_48, + mair_el1: MAIR_NORMAL_OWT_NT_AA + << (MAIR_ITEM_WIDTH * hyperlight_common::vmem::ATTR_INDEX_NORMAL), + sctlr_el1: SCTLR_EL1_RES1 | SCTLR_EL1_M | SCTLR_EL1_C, + cpacr_el1: CPACR_EL1_FPEN_NO_TRAP, + vbar_el1: 0, + sp_el1: 0, + } + } +} diff --git a/src/hyperlight_host/src/hypervisor/virtual_machine/kvm/aarch64.rs b/src/hyperlight_host/src/hypervisor/virtual_machine/kvm/aarch64.rs index 39ecb775d..d20b6fd8b 100644 --- a/src/hyperlight_host/src/hypervisor/virtual_machine/kvm/aarch64.rs +++ b/src/hyperlight_host/src/hypervisor/virtual_machine/kvm/aarch64.rs @@ -16,25 +16,532 @@ limitations under the License. 
// TODO(aarch64): implement KVM backend +use std::sync::LazyLock; + +use hyperlight_common::outb::VmAction; +use kvm_bindings::{ + KVM_CAP_ARM_NISV_TO_USER, KVM_EXIT_ARM_NISV, KVMIO, kvm_enable_cap, kvm_userspace_memory_region, +}; +use kvm_ioctls::{Kvm, VcpuExit, VcpuFd, VmFd}; use tracing::{Span, instrument}; -use crate::hypervisor::virtual_machine::CreateVmError; +use crate::hypervisor::regs::{CommonFpu, CommonRegisters, CommonSpecialRegisters}; +use crate::hypervisor::virtual_machine::{ + CreateVmError, HypervisorError, MapMemoryError, RegisterError, ResetVcpuError, RunVcpuError, + UnmapMemoryError, VirtualMachine, VmExit, +}; + +static KVM: LazyLock> = + LazyLock::new(|| Kvm::new().map_err(|e| CreateVmError::HypervisorNotAvailable(e.into()))); /// Return `true` if the KVM API is available #[instrument(skip_all, parent = Span::current(), level = "Trace")] pub(crate) fn is_hypervisor_present() -> bool { - // TODO(aarch64): implement KVM detection - false + if let Ok(kvm) = KVM.as_ref() { + let api_version = kvm.get_api_version(); + api_version == 12 + } else { + false + } } /// A KVM implementation of a single-vcpu VM #[derive(Debug)] pub(crate) struct KvmVm { - _placeholder: (), + vm_fd: VmFd, + vcpu_fd: VcpuFd, } impl KvmVm { + pub(self) fn vcpu_init(&mut self) -> Result<(), HypervisorError> { + let mut kvi = kvm_bindings::kvm_vcpu_init::default(); + self.vm_fd.get_preferred_target(&mut kvi)?; + self.vcpu_fd.vcpu_init(&kvi)?; + Ok(()) + } pub(crate) fn new() -> std::result::Result { - unimplemented!("KvmVm::new") + let hv = KVM.as_ref().map_err(|e| e.clone())?; + let vm_fd = hv + .create_vm_with_type(0) + .map_err(|e| CreateVmError::CreateVmFd(e.into()))?; + if vm_fd.check_extension_raw(KVM_CAP_ARM_NISV_TO_USER as u64) != 0 { + // Available since Linux 5.5. Needed for the workaround + // described below for KVM mis-behaviour when a cache + // maintenance operation is applied to a VA that is paged + // out at Stage 2. 
+ // + // When this cap is not available, there is a (small) + // chance that self-modifying code inside the VM will + // cause [`run_vcpu`] to fail, ultimately poisoning the + // sandbox. With this capability, the relevant code will + // instead be retried. + let cap: kvm_enable_cap = kvm_enable_cap { + cap: KVM_CAP_ARM_NISV_TO_USER, + ..Default::default() + }; + unsafe { + vmm_sys_util::ioctl_iow_nr!(KVM_ENABLE_CAP, KVMIO, 0xa3, kvm_enable_cap); + vmm_sys_util::ioctl::ioctl_with_ref(&vm_fd, KVM_ENABLE_CAP(), &cap); + } + } + + let vcpu_fd = vm_fd + .create_vcpu(0) + .map_err(|e| CreateVmError::CreateVcpuFd(e.into()))?; + + let mut to_ret = Self { vm_fd, vcpu_fd }; + to_ret + .vcpu_init() + .map_err(CreateVmError::SetPartitionProperty)?; + Ok(to_ret) + } + + fn run_immediate_exit(&mut self) -> Result<(), Result> { + self.vcpu_fd.set_kvm_immediate_exit(1u8); + let ret = loop { + let r = self.vcpu_fd.run(); + if let Err(e) = r { + match e.errno() { + libc::EINTR => break Ok(()), + libc::EAGAIN => continue, + _ => break Err(Ok(e.into())), + } + } else { + break Err(Err(format!( + "KVM run for state quiescence exited without EINTR: {:?}", + r + ))); + } + }; + self.vcpu_fd.set_kvm_immediate_exit(0u8); + ret + } +} + +impl VirtualMachine for KvmVm { + unsafe fn map_memory( + &mut self, + (slot, region): (u32, &crate::mem::memory_region::MemoryRegion), + ) -> std::result::Result<(), crate::hypervisor::virtual_machine::MapMemoryError> { + let mut kvm_region: kvm_userspace_memory_region = region.into(); + kvm_region.slot = slot; + unsafe { self.vm_fd.set_user_memory_region(kvm_region) } + .map_err(|e| MapMemoryError::Hypervisor(e.into())) + } + + fn unmap_memory( + &mut self, + (slot, region): (u32, &crate::mem::memory_region::MemoryRegion), + ) -> std::result::Result<(), crate::hypervisor::virtual_machine::UnmapMemoryError> { + let mut kvm_region: kvm_userspace_memory_region = region.into(); + kvm_region.slot = slot; + // Setting memory_size to 0 unmaps the slot's region 
+ // From https://docs.kernel.org/virt/kvm/api.html + // > Deleting a slot is done by passing zero for memory_size. + kvm_region.memory_size = 0; + unsafe { self.vm_fd.set_user_memory_region(kvm_region) } + .map_err(|e| UnmapMemoryError::Hypervisor(e.into())) + } + + fn run_vcpu( + &mut self, + #[cfg(feature = "trace_guest")] tc: &mut SandboxTraceContext, + ) -> std::result::Result< + crate::hypervisor::virtual_machine::VmExit, + crate::hypervisor::virtual_machine::RunVcpuError, + > { + let exit = loop { + let mut exit = self.vcpu_fd.run(); + if let Ok(VcpuExit::Unsupported(KVM_EXIT_ARM_NISV)) = exit { + // [`VcpuExit`] borrows the [`Vcpu`] which produced + // it, but that lifetime isn't used in this case. End + // the borrow early by re-constructing the value while + // preserving the possibility for more tests to be + // inserted after this one. + exit = Ok(VcpuExit::Unsupported(KVM_EXIT_ARM_NISV)); + // If a readonly-at-stage-2 page is paged out at stage + // 2, KVM does not correctly handle the page fault due + // to Stage 2 translation that occurs when cache + // maintenance operations must resolve the page + // address in order to execute. KVM incorrectly treats + // the fault as an indication that the guest is making + // an MMIO access the details of which are not + // captured in NISV. + // + // Guest code tries to reduce the chance of this + // happening by making a data access shortly before + // the cache cleaning instructions. However, this is + // possibly racy, since KVM could page out the + // relevant Stage 2 translation in between the data + // access and the cache maintenance operation. In + // order to account for this case, we detect it and + // cooperate with code inside the VM to re-fault-in + // the page and re-try the cache maintenance operation + // in question. + // + // The calling convention for this is: any cache + // maintenance operation should be executed with the + // Zero flag cleared. 
If it fails for this reason, + // Hyperlight will increment PC to the next + // instruction as usual, but set the Zero flag. The + // guest should detect this and attempt to fault in + // the page and re-try the operation. + use crate::hypervisor::regs::kvm_reg::{get_reg, set_reg}; + let pc = get_reg!( + &self.vcpu_fd, + |e| { RunVcpuError::Unknown(e.into()) }, + PC, + u64 + )?; + let pstate = get_reg!( + &self.vcpu_fd, + |e| { RunVcpuError::Unknown(e.into()) }, + PSTATE, + u64 + )?; + + const Z_BIT: u64 = 1 << 30; + // Because we got here from the NISV mmio exit path, + // we know that ESR_EL2.EC codes for a Data Abort, and + // we can assume the relevant encoding of ESR_EL2.ISS + const ESR_EL2_ISS_CM: u64 = 1 << 8; + + let esr_iss = unsafe { + // SAFETY: KVM_EXIT_ARM_NISV implies this is the arm_nisv variant. + self.vcpu_fd.get_kvm_run().__bindgen_anon_1.arm_nisv.esr_iss + }; + if esr_iss & ESR_EL2_ISS_CM != 0 && pstate & Z_BIT == 0 { + // if ESR_EL2.ISS.CM is set, the abort was caused + // by a Cache Maintenance instruction. Assume that + // any Cache Maintenance instruction in the VM is + // part of a Hyperlight-aware sequence and can + // deal with it. + set_reg!( + &self.vcpu_fd, + |e| { RunVcpuError::Unknown(e.into()) }, + PSTATE, + u64, + pstate | Z_BIT + )?; + set_reg!( + &self.vcpu_fd, + |e| { RunVcpuError::Unknown(e.into()) }, + PC, + u64, + pc + 4 + )?; + continue; + } + } + break exit; + }; + match exit { + Ok(VcpuExit::MmioWrite(addr, data)) => { + let io_page_gpa = const { hyperlight_common::layout::io_page().unwrap().0 }; + if addr > io_page_gpa + && let off = (addr - io_page_gpa) as usize + && off < hyperlight_common::vmem::PAGE_SIZE + { + let port = off / core::mem::size_of::(); + if port == VmAction::Halt as usize { + // As per [1]: + // > For KVM_EXIT_IO [...] the corresponding operations are complete + // > (and guest state is consistent) only after userspace has re-entered + // > the kernel with KVM_RUN. 
The kernel side will first finish + // > incomplete operations and then check for pending signals. + // > + // > The pending state of the operation is not preserved in state which + // > is visible to userspace, thus userspace should ensure that the + // > operation is completed before performing a live + // > migration. Userspace can re-enter the guest with an unmasked signal + // > pending or with the immediate_exit field set to complete pending + // > operations without allowing any further instructions to be + // > executed. + // + // On AArch64, the incomplete operation state includes incrementing the + // program counter past the faulting I/O instruction. Since a halt exit + // is used to logically end a thread of execution, we will likely start + // executing from somewhere else again after, in which case such a + // program counter increment would be undesirable. Therefore, in the hlt + // case, re-enter the kernel with immediate_exit set right away to clear + // that state. + // + // We assume that this pattern is not required in any other case, + // because any error that prevents the guest code from fully unwinding + // its stack and running "to completion" (i.e. to a halt exit) should + // poison the sandbox, and the vcpu reset on sandbox reset needed to + // un-poison it will take care of clearing the necessary state. 
+ self.run_immediate_exit() + .map_err(|e| RunVcpuError::FlushMmioPending(format!("{:?}", e)))?; + Ok(VmExit::Halt()) + } else { + Ok(VmExit::IoOut(port as u16, data.to_vec())) + } + } else { + Ok(VmExit::MmioWrite(addr)) + } + } + Ok(VcpuExit::MmioRead(addr, _)) => Ok(VmExit::MmioRead(addr)), + Err(e) => match e.errno() { + libc::EINTR => Ok(VmExit::Cancelled()), + libc::EAGAIN => Ok(VmExit::Retry()), + _ => Err(RunVcpuError::Unknown(e.into())), + }, + Ok(other) => Ok(VmExit::Unknown(format!( + "Unknown KVM VCPU exit: {:?}", + other + ))), + } + } + + fn regs(&self) -> std::result::Result { + use crate::hypervisor::regs::kvm_reg::get_reg; + fn err(e: kvm_ioctls::Error) -> RegisterError { + RegisterError::GetSregs(e.into()) + } + Ok(CommonRegisters { + x: [ + get_reg!(&self.vcpu_fd, err, X0, u64)?, + get_reg!(&self.vcpu_fd, err, X1, u64)?, + get_reg!(&self.vcpu_fd, err, X2, u64)?, + get_reg!(&self.vcpu_fd, err, X3, u64)?, + get_reg!(&self.vcpu_fd, err, X4, u64)?, + get_reg!(&self.vcpu_fd, err, X5, u64)?, + get_reg!(&self.vcpu_fd, err, X6, u64)?, + get_reg!(&self.vcpu_fd, err, X7, u64)?, + get_reg!(&self.vcpu_fd, err, X8, u64)?, + get_reg!(&self.vcpu_fd, err, X9, u64)?, + get_reg!(&self.vcpu_fd, err, X10, u64)?, + get_reg!(&self.vcpu_fd, err, X11, u64)?, + get_reg!(&self.vcpu_fd, err, X12, u64)?, + get_reg!(&self.vcpu_fd, err, X13, u64)?, + get_reg!(&self.vcpu_fd, err, X14, u64)?, + get_reg!(&self.vcpu_fd, err, X15, u64)?, + get_reg!(&self.vcpu_fd, err, X16, u64)?, + get_reg!(&self.vcpu_fd, err, X17, u64)?, + get_reg!(&self.vcpu_fd, err, X18, u64)?, + get_reg!(&self.vcpu_fd, err, X19, u64)?, + get_reg!(&self.vcpu_fd, err, X20, u64)?, + get_reg!(&self.vcpu_fd, err, X21, u64)?, + get_reg!(&self.vcpu_fd, err, X22, u64)?, + get_reg!(&self.vcpu_fd, err, X23, u64)?, + get_reg!(&self.vcpu_fd, err, X24, u64)?, + get_reg!(&self.vcpu_fd, err, X25, u64)?, + get_reg!(&self.vcpu_fd, err, X26, u64)?, + get_reg!(&self.vcpu_fd, err, X27, u64)?, + get_reg!(&self.vcpu_fd, err, X28, 
u64)?, + get_reg!(&self.vcpu_fd, err, X29, u64)?, + get_reg!(&self.vcpu_fd, err, X30, u64)?, + ], + sp: get_reg!(&self.vcpu_fd, err, SP, u64)?, + pc: get_reg!(&self.vcpu_fd, err, PC, u64)?, + pstate: get_reg!(&self.vcpu_fd, err, PSTATE, u64)?, + }) + } + + fn set_regs(&self, regs: &CommonRegisters) -> std::result::Result<(), RegisterError> { + use crate::hypervisor::regs::kvm_reg::set_reg; + fn err(e: kvm_ioctls::Error) -> RegisterError { + RegisterError::SetSregs(e.into()) + } + set_reg!(&self.vcpu_fd, err, X0, u64, regs.x[0])?; + set_reg!(&self.vcpu_fd, err, X1, u64, regs.x[1])?; + set_reg!(&self.vcpu_fd, err, X2, u64, regs.x[2])?; + set_reg!(&self.vcpu_fd, err, X3, u64, regs.x[3])?; + set_reg!(&self.vcpu_fd, err, X4, u64, regs.x[4])?; + set_reg!(&self.vcpu_fd, err, X5, u64, regs.x[5])?; + set_reg!(&self.vcpu_fd, err, X6, u64, regs.x[6])?; + set_reg!(&self.vcpu_fd, err, X7, u64, regs.x[7])?; + set_reg!(&self.vcpu_fd, err, X8, u64, regs.x[8])?; + set_reg!(&self.vcpu_fd, err, X9, u64, regs.x[9])?; + set_reg!(&self.vcpu_fd, err, X10, u64, regs.x[10])?; + set_reg!(&self.vcpu_fd, err, X11, u64, regs.x[11])?; + set_reg!(&self.vcpu_fd, err, X12, u64, regs.x[12])?; + set_reg!(&self.vcpu_fd, err, X13, u64, regs.x[13])?; + set_reg!(&self.vcpu_fd, err, X14, u64, regs.x[14])?; + set_reg!(&self.vcpu_fd, err, X15, u64, regs.x[15])?; + set_reg!(&self.vcpu_fd, err, X16, u64, regs.x[16])?; + set_reg!(&self.vcpu_fd, err, X17, u64, regs.x[17])?; + set_reg!(&self.vcpu_fd, err, X18, u64, regs.x[18])?; + set_reg!(&self.vcpu_fd, err, X19, u64, regs.x[19])?; + set_reg!(&self.vcpu_fd, err, X20, u64, regs.x[20])?; + set_reg!(&self.vcpu_fd, err, X21, u64, regs.x[21])?; + set_reg!(&self.vcpu_fd, err, X22, u64, regs.x[22])?; + set_reg!(&self.vcpu_fd, err, X23, u64, regs.x[23])?; + set_reg!(&self.vcpu_fd, err, X24, u64, regs.x[24])?; + set_reg!(&self.vcpu_fd, err, X25, u64, regs.x[25])?; + set_reg!(&self.vcpu_fd, err, X26, u64, regs.x[26])?; + set_reg!(&self.vcpu_fd, err, X27, u64, 
regs.x[27])?; + set_reg!(&self.vcpu_fd, err, X28, u64, regs.x[28])?; + set_reg!(&self.vcpu_fd, err, X29, u64, regs.x[29])?; + set_reg!(&self.vcpu_fd, err, X30, u64, regs.x[30])?; + set_reg!(&self.vcpu_fd, err, SP, u64, regs.sp)?; + set_reg!(&self.vcpu_fd, err, PC, u64, regs.pc)?; + set_reg!(&self.vcpu_fd, err, PSTATE, u64, regs.pstate)?; + + Ok(()) + } + + fn fpu(&self) -> Result { + use crate::hypervisor::regs::CommonFpu; + use crate::hypervisor::regs::kvm_reg::get_reg; + fn err(e: kvm_ioctls::Error) -> RegisterError { + RegisterError::GetFpu(e.into()) + } + Ok(CommonFpu { + v: [ + get_reg!(&self.vcpu_fd, err, V0, u128)?, + get_reg!(&self.vcpu_fd, err, V1, u128)?, + get_reg!(&self.vcpu_fd, err, V2, u128)?, + get_reg!(&self.vcpu_fd, err, V3, u128)?, + get_reg!(&self.vcpu_fd, err, V4, u128)?, + get_reg!(&self.vcpu_fd, err, V5, u128)?, + get_reg!(&self.vcpu_fd, err, V6, u128)?, + get_reg!(&self.vcpu_fd, err, V7, u128)?, + get_reg!(&self.vcpu_fd, err, V8, u128)?, + get_reg!(&self.vcpu_fd, err, V9, u128)?, + get_reg!(&self.vcpu_fd, err, V10, u128)?, + get_reg!(&self.vcpu_fd, err, V11, u128)?, + get_reg!(&self.vcpu_fd, err, V12, u128)?, + get_reg!(&self.vcpu_fd, err, V13, u128)?, + get_reg!(&self.vcpu_fd, err, V14, u128)?, + get_reg!(&self.vcpu_fd, err, V15, u128)?, + get_reg!(&self.vcpu_fd, err, V16, u128)?, + get_reg!(&self.vcpu_fd, err, V17, u128)?, + get_reg!(&self.vcpu_fd, err, V18, u128)?, + get_reg!(&self.vcpu_fd, err, V19, u128)?, + get_reg!(&self.vcpu_fd, err, V20, u128)?, + get_reg!(&self.vcpu_fd, err, V21, u128)?, + get_reg!(&self.vcpu_fd, err, V22, u128)?, + get_reg!(&self.vcpu_fd, err, V23, u128)?, + get_reg!(&self.vcpu_fd, err, V24, u128)?, + get_reg!(&self.vcpu_fd, err, V25, u128)?, + get_reg!(&self.vcpu_fd, err, V26, u128)?, + get_reg!(&self.vcpu_fd, err, V27, u128)?, + get_reg!(&self.vcpu_fd, err, V28, u128)?, + get_reg!(&self.vcpu_fd, err, V29, u128)?, + get_reg!(&self.vcpu_fd, err, V30, u128)?, + get_reg!(&self.vcpu_fd, err, V31, u128)?, + ], + fpsr: 
get_reg!(&self.vcpu_fd, err, FPSR, u32)?, + fpcr: get_reg!(&self.vcpu_fd, err, FPCR, u32)?, + }) + } + + fn set_fpu(&self, fpu: &CommonFpu) -> Result<(), RegisterError> { + use crate::hypervisor::regs::kvm_reg::set_reg; + fn err(e: kvm_ioctls::Error) -> RegisterError { + RegisterError::SetFpu(e.into()) + } + set_reg!(&self.vcpu_fd, err, V0, u128, fpu.v[0])?; + set_reg!(&self.vcpu_fd, err, V1, u128, fpu.v[1])?; + set_reg!(&self.vcpu_fd, err, V2, u128, fpu.v[2])?; + set_reg!(&self.vcpu_fd, err, V3, u128, fpu.v[3])?; + set_reg!(&self.vcpu_fd, err, V4, u128, fpu.v[4])?; + set_reg!(&self.vcpu_fd, err, V5, u128, fpu.v[5])?; + set_reg!(&self.vcpu_fd, err, V6, u128, fpu.v[6])?; + set_reg!(&self.vcpu_fd, err, V7, u128, fpu.v[7])?; + set_reg!(&self.vcpu_fd, err, V8, u128, fpu.v[8])?; + set_reg!(&self.vcpu_fd, err, V9, u128, fpu.v[9])?; + set_reg!(&self.vcpu_fd, err, V10, u128, fpu.v[10])?; + set_reg!(&self.vcpu_fd, err, V11, u128, fpu.v[11])?; + set_reg!(&self.vcpu_fd, err, V12, u128, fpu.v[12])?; + set_reg!(&self.vcpu_fd, err, V13, u128, fpu.v[13])?; + set_reg!(&self.vcpu_fd, err, V14, u128, fpu.v[14])?; + set_reg!(&self.vcpu_fd, err, V15, u128, fpu.v[15])?; + set_reg!(&self.vcpu_fd, err, V16, u128, fpu.v[16])?; + set_reg!(&self.vcpu_fd, err, V17, u128, fpu.v[17])?; + set_reg!(&self.vcpu_fd, err, V18, u128, fpu.v[18])?; + set_reg!(&self.vcpu_fd, err, V19, u128, fpu.v[19])?; + set_reg!(&self.vcpu_fd, err, V20, u128, fpu.v[20])?; + set_reg!(&self.vcpu_fd, err, V21, u128, fpu.v[21])?; + set_reg!(&self.vcpu_fd, err, V22, u128, fpu.v[22])?; + set_reg!(&self.vcpu_fd, err, V23, u128, fpu.v[23])?; + set_reg!(&self.vcpu_fd, err, V24, u128, fpu.v[24])?; + set_reg!(&self.vcpu_fd, err, V25, u128, fpu.v[25])?; + set_reg!(&self.vcpu_fd, err, V26, u128, fpu.v[26])?; + set_reg!(&self.vcpu_fd, err, V27, u128, fpu.v[27])?; + set_reg!(&self.vcpu_fd, err, V28, u128, fpu.v[28])?; + set_reg!(&self.vcpu_fd, err, V29, u128, fpu.v[29])?; + set_reg!(&self.vcpu_fd, err, V30, u128, fpu.v[30])?; + 
set_reg!(&self.vcpu_fd, err, V31, u128, fpu.v[31])?; + set_reg!(&self.vcpu_fd, err, FPSR, u32, fpu.fpsr)?; + set_reg!(&self.vcpu_fd, err, FPCR, u32, fpu.fpcr)?; + Ok(()) + } + + fn sregs(&self) -> Result { + use crate::hypervisor::regs::kvm_reg::get_reg; + fn err(e: kvm_ioctls::Error) -> RegisterError { + RegisterError::GetSregs(e.into()) + } + Ok(CommonSpecialRegisters { + ttbr0_el1: get_reg!(&self.vcpu_fd, err, TTBR0_EL1, u64)?, + tcr_el1: get_reg!(&self.vcpu_fd, err, TCR_EL1, u64)?, + mair_el1: get_reg!(&self.vcpu_fd, err, MAIR_EL1, u64)?, + sctlr_el1: get_reg!(&self.vcpu_fd, err, SCTLR_EL1, u64)?, + cpacr_el1: get_reg!(&self.vcpu_fd, err, CPACR_EL1, u64)?, + vbar_el1: get_reg!(&self.vcpu_fd, err, VBAR_EL1, u64)?, + sp_el1: get_reg!(&self.vcpu_fd, err, SP_EL1, u64)?, + }) + } + + fn set_sregs(&self, sregs: &CommonSpecialRegisters) -> Result<(), RegisterError> { + use crate::hypervisor::regs::kvm_reg::set_reg; + fn err(e: kvm_ioctls::Error) -> RegisterError { + RegisterError::SetSregs(e.into()) + } + set_reg!(&self.vcpu_fd, err, TTBR0_EL1, u64, sregs.ttbr0_el1)?; + set_reg!(&self.vcpu_fd, err, TCR_EL1, u64, sregs.tcr_el1)?; + set_reg!(&self.vcpu_fd, err, MAIR_EL1, u64, sregs.mair_el1)?; + set_reg!(&self.vcpu_fd, err, SCTLR_EL1, u64, sregs.sctlr_el1)?; + set_reg!(&self.vcpu_fd, err, CPACR_EL1, u64, sregs.cpacr_el1)?; + set_reg!(&self.vcpu_fd, err, VBAR_EL1, u64, sregs.vbar_el1)?; + set_reg!(&self.vcpu_fd, err, SP_EL1, u64, sregs.sp_el1)?; + Ok(()) + } + + fn debug_regs( + &self, + ) -> std::result::Result { + todo!() + } + + fn set_debug_regs( + &self, + _drs: &crate::hypervisor::regs::CommonDebugRegs, + ) -> std::result::Result<(), RegisterError> { + todo!() + } + + fn xsave(&self) -> std::result::Result, RegisterError> { + unimplemented!("aarch64 does not support XSAVE operations") + } + + fn reset_xsave(&self) -> std::result::Result<(), RegisterError> { + unimplemented!("aarch64 does not support XSAVE operations") + } + + #[cfg(test)] + fn set_xsave(&self, 
xsave: &[u32]) -> std::result::Result<(), RegisterError> { + unimplemented!("aarch64 does not support XSAVE operations") + } + + fn can_reset_vcpu(&self) -> bool { + true + } + fn reset_vcpu(&mut self) -> Result<(), ResetVcpuError> { + self.run_immediate_exit().map_err(|e| { + e.map(ResetVcpuError::Hypervisor) + .map_err(ResetVcpuError::Unknown) + .unwrap_or_else(|e| e) + })?; + self.vcpu_init().map_err(ResetVcpuError::Hypervisor)?; + self.run_immediate_exit().map_err(|e| { + e.map(ResetVcpuError::Hypervisor) + .map_err(ResetVcpuError::Unknown) + .unwrap_or_else(|e| e) + })?; + Ok(()) } } diff --git a/src/hyperlight_host/src/hypervisor/virtual_machine/kvm/x86_64.rs b/src/hyperlight_host/src/hypervisor/virtual_machine/kvm/x86_64.rs index db68dfdd0..fdaa1ab12 100644 --- a/src/hyperlight_host/src/hypervisor/virtual_machine/kvm/x86_64.rs +++ b/src/hyperlight_host/src/hypervisor/virtual_machine/kvm/x86_64.rs @@ -175,7 +175,7 @@ impl KvmVm { == CPUID_FUNCTION_PROCESSOR_CAPACITY_PARAMETERS_AND_EXTENDED_FEATURE_IDENTIFICATION { entry.eax &= !0xff; - entry.eax |= hyperlight_common::layout::MAX_GPA.ilog2() + 1; + entry.eax |= hyperlight_common::layout::SCRATCH_TOP_GPA.ilog2() + 1; } } vcpu_fd diff --git a/src/hyperlight_host/src/hypervisor/virtual_machine/mod.rs b/src/hyperlight_host/src/hypervisor/virtual_machine/mod.rs index ecb19a09f..55a5b5e0f 100644 --- a/src/hyperlight_host/src/hypervisor/virtual_machine/mod.rs +++ b/src/hyperlight_host/src/hypervisor/virtual_machine/mod.rs @@ -107,7 +107,7 @@ pub(crate) enum HypervisorType { /// Minimum XSAVE buffer size: 512 bytes legacy region + 64 bytes header. /// Only used by MSHV and WHP which use compacted XSAVE format and need to /// validate buffer size before accessing XCOMP_BV. -#[cfg(any(mshv3, target_os = "windows"))] +#[cfg(all(target_arch = "x86_64", any(mshv3, target_os = "windows")))] pub(crate) const XSAVE_MIN_SIZE: usize = 576; /// Standard XSAVE buffer size (4KB) used by KVM and MSHV. 
@@ -202,6 +202,9 @@ pub enum RunVcpuError { IncrementRip(HypervisorError), #[error("Parse GPA access info failed")] ParseGpaAccessInfo, + #[cfg(target_arch = "aarch64")] + #[error("Flush MMIO pending state failed: {0}")] + FlushMmioPending(String), #[error("Unknown error: {0}")] Unknown(HypervisorError), } @@ -246,6 +249,18 @@ pub enum RegisterError { ConversionFailed(String), } +#[derive(Debug, Clone, thiserror::Error)] +pub enum ResetVcpuError { + #[error("Single-operation vcpu reset not supported on this hypervisor")] + NotSupported, + #[error("Hypervisor operation failed: {0}")] + Hypervisor(HypervisorError), + #[error("Register operation failed: {0}")] + Register(#[from] RegisterError), + #[error("Operation failed: {0}")] + Unknown(String), +} + /// Map memory error #[derive(Debug, Clone, thiserror::Error)] pub enum MapMemoryError { @@ -353,6 +368,13 @@ pub(crate) trait VirtualMachine: Debug + Send { #[cfg(not(feature = "i686-guest"))] fn set_xsave(&self, xsave: &[u32]) -> std::result::Result<(), RegisterError>; + /// Single-operation vCPU reset + fn can_reset_vcpu(&self) -> bool { + false + } + fn reset_vcpu(&mut self) -> std::result::Result<(), ResetVcpuError> { + Err(ResetVcpuError::NotSupported) + } /// Get partition handle #[cfg(target_os = "windows")] fn partition_handle(&self) -> windows::Win32::System::Hypervisor::WHV_PARTITION_HANDLE; diff --git a/src/hyperlight_host/src/mem/mgr.rs b/src/hyperlight_host/src/mem/mgr.rs index 9e5d843d1..3c696a789 100644 --- a/src/hyperlight_host/src/mem/mgr.rs +++ b/src/hyperlight_host/src/mem/mgr.rs @@ -650,7 +650,7 @@ impl SandboxMemoryManager { ) -> Result> { use crate::sandbox::snapshot::SharedMemoryPageTableBuffer; - let len = hyperlight_common::layout::MAX_GVA; + let len = hyperlight_common::layout::SCRATCH_TOP_GVA; let regions = self.shared_mem.with_contents(|snapshot| { self.scratch_mem.with_contents(|scratch| { diff --git a/src/hyperlight_host/src/sandbox/initialized_multi_use.rs 
b/src/hyperlight_host/src/sandbox/initialized_multi_use.rs index 241622cab..fb8f28ecd 100644 --- a/src/hyperlight_host/src/sandbox/initialized_multi_use.rs +++ b/src/hyperlight_host/src/sandbox/initialized_multi_use.rs @@ -1215,12 +1215,12 @@ mod tests { match res.unwrap_err() { HyperlightError::GuestAborted(_, msg) => { // msg should indicate we got an invalid opcode exception + #[cfg(target_arch = "x86_64")] assert!(msg.contains("InvalidOpcode")); + #[cfg(target_arch = "aarch64")] + assert!(msg.contains("0x2000000")); } - e => panic!( - "Expected HyperlightError::GuestExecutionError but got {:?}", - e - ), + e => panic!("Expected HyperlightError::GuestAborted but got {:?}", e), } } @@ -1306,7 +1306,10 @@ mod tests { .evolve() .unwrap(); + #[cfg(target_arch = "x86_64")] let expected = &[0x90, 0x90, 0x90, 0xC3]; // NOOP slide to RET + #[cfg(target_arch = "aarch64")] + let expected = &[0x1f, 0x20, 0x03, 0xd5, 0xc0, 0x03, 0x5f, 0xd6]; let map_mem = page_aligned_memory(expected); let guest_base = 0x1_0000_0000; // Arbitrary guest base address @@ -1488,8 +1491,9 @@ mod tests { let dr0_initial: u64 = sandbox.call("GetDr0", ()).unwrap(); assert_eq!(dr0_initial, 0, "DR0 should initially be 0"); - // Dirty DR0 by setting it to a known non-zero value - const DIRTY_VALUE: u64 = 0xDEAD_BEEF_CAFE_BABE; + // Dirty DR0 by setting it to a known non-zero value, avoiding + // bits that are reserved in aarch64 DBGBVR0_EL1 + const DIRTY_VALUE: u64 = 0xFFFF_FEDC_7654_3210; sandbox.call::<()>("SetDr0", DIRTY_VALUE).unwrap(); let dr0_dirty: u64 = sandbox.call("GetDr0", ()).unwrap(); assert_eq!( diff --git a/src/hyperlight_host/src/sandbox/snapshot/mod.rs b/src/hyperlight_host/src/sandbox/snapshot/mod.rs index e4c7b1133..8579cdaa0 100644 --- a/src/hyperlight_host/src/sandbox/snapshot/mod.rs +++ b/src/hyperlight_host/src/sandbox/snapshot/mod.rs @@ -17,7 +17,7 @@ limitations under the License. 
use std::collections::{BTreeMap, HashMap}; use std::sync::atomic::{AtomicU64, Ordering}; -use hyperlight_common::layout::{scratch_base_gpa, scratch_base_gva}; +use hyperlight_common::layout::{io_page, scratch_base_gpa, scratch_base_gva}; use hyperlight_common::vmem; use hyperlight_common::vmem::{ BasicMapping, CowMapping, Mapping, MappingKind, PAGE_SIZE, SpaceAwareMapping, SpaceId, TableOps, @@ -296,6 +296,21 @@ unsafe fn guest_page<'a>( } fn map_specials(pt_buf: &GuestPageTableBuffer, scratch_size: usize) { + if let Some((phys_base, virt_base)) = io_page() { + // Map the IO page + let mapping = Mapping { + phys_base, + virt_base, + len: PAGE_SIZE as u64, + kind: MappingKind::Basic(BasicMapping { + readable: true, + writable: true, + executable: false, + }), + user_accessible: false, + }; + unsafe { vmem::map(pt_buf, mapping) }; + } // Map the scratch region let mapping = Mapping { phys_base: scratch_base_gpa(scratch_size), @@ -405,7 +420,7 @@ impl Snapshot { layout.set_pt_size(pt_bytes.len())?; memory.extend(&pt_bytes); - let exn_stack_top_gva = hyperlight_common::layout::MAX_GVA as u64 + let exn_stack_top_gva = hyperlight_common::layout::SCRATCH_TOP_GVA as u64 - hyperlight_common::layout::SCRATCH_TOP_EXN_STACK_OFFSET + 1; @@ -473,7 +488,7 @@ impl Snapshot { &op, root_pt_gpas, 0, - hyperlight_common::layout::MAX_GVA as u64, + hyperlight_common::layout::SCRATCH_TOP_GVA as u64, ) }; diff --git a/src/hyperlight_host/src/sandbox/uninitialized_evolve.rs b/src/hyperlight_host/src/sandbox/uninitialized_evolve.rs index c037af06e..8f5f2e763 100644 --- a/src/hyperlight_host/src/sandbox/uninitialized_evolve.rs +++ b/src/hyperlight_host/src/sandbox/uninitialized_evolve.rs @@ -122,7 +122,6 @@ pub(super) fn evolve_impl_multi_use(u_sbox: UninitializedSandbox) -> Result panic!("Expected GuestAborted error, got: {:?}", err), } diff --git a/src/hyperlight_libc/build.rs b/src/hyperlight_libc/build.rs index d80c66de5..495e20d5e 100644 --- a/src/hyperlight_libc/build.rs +++ 
b/src/hyperlight_libc/build.rs @@ -23,7 +23,9 @@ use std::{env, fs}; use anyhow::{Context, Result, bail}; use bindgen::Formatter::Prettyplease; use bindgen::RustEdition::Edition2021; -use build_files::{LIBC_FILES, LIBC_FILES_X86, LIBM_FILES, LIBM_FILES_X86}; +use build_files::{ + LIBC_FILES, LIBC_FILES_AARCH64, LIBC_FILES_X86, LIBM_FILES, LIBM_FILES_AARCH64, LIBM_FILES_X86, +}; fn copy_includes, Q: AsRef + std::fmt::Debug>( include_dir: P, @@ -121,6 +123,10 @@ fn cc_build(picolibc_dir: &PathBuf, target: &str) -> Result { build.include(picolibc_dir.join("libm/machine/x86")); build.include(picolibc_dir.join("libc/machine/x86")); } + "aarch64" => { + build.include(picolibc_dir.join("libc/machine/aarch64")); + build.include(picolibc_dir.join("libm/machine/aarch64")); + } arch => { bail!("Unsupported target architecture: {arch}"); } @@ -139,6 +145,7 @@ fn add_libc(build: &mut cc::Build, picolibc_dir: &Path, target: &str) -> Result< let base = LIBC_FILES.iter(); let files = match target { "x86" | "x86_64" => base.chain(LIBC_FILES_X86.iter()), + "aarch64" => base.chain(LIBC_FILES_AARCH64.iter()), arch => bail!("Unsupported target architecture: {arch}"), }; @@ -156,6 +163,7 @@ fn add_libm(build: &mut cc::Build, picolibc_dir: &Path, target: &str) -> Result< let base = LIBM_FILES.iter(); let files = match target { "x86" | "x86_64" => base.chain(LIBM_FILES_X86.iter()), + "aarch64" => base.chain(LIBM_FILES_AARCH64.iter()), arch => bail!("Unsupported target architecture: {arch}"), }; diff --git a/src/hyperlight_libc/build_files.rs b/src/hyperlight_libc/build_files.rs index ab4bf3535..23133071b 100644 --- a/src/hyperlight_libc/build_files.rs +++ b/src/hyperlight_libc/build_files.rs @@ -659,6 +659,29 @@ pub(crate) const LIBC_FILES_X86: &[&str] = &[ "machine/x86/tls.c", ]; +pub(crate) const LIBC_FILES_AARCH64: &[&str] = &[ + "machine/aarch64/interrupt.c", + "machine/aarch64/interrupt_vector.S", + "machine/aarch64/memchr.S", + "machine/aarch64/memcmp.S", + 
"machine/aarch64/memcpy.S", + "machine/aarch64/memmove.S", + "machine/aarch64/memrchr.S", + "machine/aarch64/memset.S", + "machine/aarch64/rawmemchr.S", + "machine/aarch64/setjmp.S", + "machine/aarch64/stpcpy.S", + "machine/aarch64/strchr.S", + "machine/aarch64/strchrnul.S", + "machine/aarch64/strcmp.S", + "machine/aarch64/strcpy.S", + "machine/aarch64/strlen.S", + "machine/aarch64/strncmp.S", + "machine/aarch64/strnlen.S", + "machine/aarch64/strrchr.S", + "machine/aarch64/tls.c", +]; + pub(crate) const LIBM_FILES: &[&str] = &[ "common/copysignl.c", "common/exp10l.c", @@ -942,3 +965,5 @@ pub(crate) const LIBM_FILES: &[&str] = &[ ]; pub(crate) const LIBM_FILES_X86: &[&str] = &["machine/x86/fenv.c"]; + +pub(crate) const LIBM_FILES_AARCH64: &[&str] = &[]; diff --git a/src/tests/rust_guests/dummyguest/src/main.rs b/src/tests/rust_guests/dummyguest/src/main.rs index 924bb686c..91a62edc3 100644 --- a/src/tests/rust_guests/dummyguest/src/main.rs +++ b/src/tests/rust_guests/dummyguest/src/main.rs @@ -33,6 +33,7 @@ fn halt() { // VmAction::Halt = 108; using raw constant to avoid pulling in // anyhow (via hyperlight_common's TryFrom impl) which requires alloc. 
unsafe { + #[cfg(target_arch = "x86_64")] asm!( "out dx, eax", "cli", @@ -40,18 +41,28 @@ fn halt() { in("dx") 108u16, in("eax") 0u32, ); + #[cfg(target_arch = "aarch64")] + asm!( + "str {val}, [{addr}]", + val = in(reg) 0, addr = in(reg) 0xffff_ffff_e000u64 + 108 * 8, + ); } } fn mmio_read() { unsafe { + #[cfg(target_arch = "x86_64")] asm!("mov al, [0x8000]"); + + let mut out: u8; + #[cfg(target_arch = "aarch64")] + asm!("ldr {0:x}, [{1}]", out(reg) out, in(reg) 0x8000); } } #[allow(non_snake_case)] #[no_mangle] -pub extern "win64" fn entrypoint(a: i64, b: i64, c: i32) -> i32 { +pub extern "C" fn entrypoint(a: i64, b: i64, c: i32) -> i32 { if a != 0x230000 || b != 1234567890 || c != 4096 { mmio_read(); } diff --git a/src/tests/rust_guests/simpleguest/src/main.rs b/src/tests/rust_guests/simpleguest/src/main.rs index b6844a716..d039519da 100644 --- a/src/tests/rust_guests/simpleguest/src/main.rs +++ b/src/tests/rust_guests/simpleguest/src/main.rs @@ -45,6 +45,7 @@ use hyperlight_common::log_level::GuestLogFilter; use hyperlight_common::vmem::{BasicMapping, MappingKind}; use hyperlight_guest::error::{HyperlightGuestError, Result}; use hyperlight_guest::exit::{abort_with_code, abort_with_code_and_message}; +#[cfg(target_arch = "x86_64")] use hyperlight_guest_bin::exception::arch::{Context, ExceptionInfo}; use hyperlight_guest_bin::guest_function::definition::{GuestFunc, GuestFunctionDefinition}; use hyperlight_guest_bin::guest_function::register::register_function; @@ -83,6 +84,7 @@ fn echo_double(value: f64) -> f64 { // Test exception handler that validates stack layout and records invocation // It is designed to interact with the trigger_int3 breakpoint exception function below +#[cfg(target_arch = "x86_64")] fn test_exception_handler( exception_number: u64, _exception_info: *mut ExceptionInfo, @@ -131,6 +133,7 @@ fn test_exception_handler( /// Install handler for a specific vector #[guest_function("InstallHandler")] +#[cfg(target_arch = "x86_64")] fn 
install_handler(vector: i32) { hyperlight_guest_bin::exception::arch::HANDLERS[vector as usize] .store(test_exception_handler as usize as u64, Ordering::Release); @@ -145,6 +148,7 @@ fn get_exception_handler_call_count() -> i32 { /// Trigger an INT3 breakpoint exception (vector 3) #[guest_function("TriggerInt3")] +#[cfg(target_arch = "x86_64")] fn trigger_int3() -> i32 { // Set up test value in R9 before triggering exception let test_value: u64 = TEST_R9_VALUE; @@ -346,7 +350,7 @@ fn fill_heap_and_cause_exception() { } // trigger an undefined instruction exception - unsafe { core::arch::asm!("ud2") }; + trigger_exception(); } #[guest_function("ExhaustHeap")] @@ -447,7 +451,13 @@ fn test_guest_panic(message: String) { fn execute_on_heap() -> String { unsafe { // NO-OP followed by RET - let heap_memory = Box::new([0x90u8, 0xC3]); + let mut heap_memory = Box::new( + #[cfg(target_arch = "x86_64")] + [0x90u8, 0xC3], + #[cfg(target_arch = "aarch64")] + [0x1f, 0x20, 0x03, 0xd5, 0xc0, 0x03, 0x5f, 0xd6], + ); + dicachesync(heap_memory.as_mut_ptr(), heap_memory.len()); let heap_fn: fn() = core::mem::transmute(Box::into_raw(heap_memory)); heap_fn(); black_box(heap_fn); // avoid optimization when running in release mode @@ -482,13 +492,21 @@ fn log_message(message: String, level: i32) { #[guest_function("TriggerException")] fn trigger_exception() { // trigger an undefined instruction exception - unsafe { core::arch::asm!("ud2") }; + #[cfg(target_arch = "x86_64")] + unsafe { + core::arch::asm!("ud2") + }; + #[cfg(target_arch = "aarch64")] + unsafe { + core::arch::asm!("udf #0") + }; } /// Execute an OUT instruction with an arbitrary port and value. /// This is used to test that invalid OUT ports cause errors. 
#[guest_function("OutbWithPort")] fn outb_with_port(port: u32, value: u32) { + #[cfg(target_arch = "x86_64")] unsafe { core::arch::asm!( "out dx, eax", @@ -497,6 +515,12 @@ fn outb_with_port(port: u32, value: u32) { options(preserves_flags, nomem, nostack) ); } + #[cfg(target_arch = "aarch64")] + unsafe { + (hyperlight_common::layout::io_page().unwrap().1 as *mut u64) + .wrapping_add(port as usize) + .write_volatile(value as u64); + } } // ============================================================================= @@ -513,6 +537,7 @@ static TIMER_IRQ_COUNT: AtomicU32 = AtomicU32::new(0); // for the atomic counter update, and sends a non-specific EOI to the master PIC. // // NOTE: global_asm! on x86_64 in Rust defaults to Intel syntax. +#[cfg(target_arch = "x86_64")] core::arch::global_asm!( ".globl _timer_irq_handler", "_timer_irq_handler:", @@ -551,6 +576,7 @@ struct IdtPtr { /// - `max_spin`: maximum busy-wait iterations before giving up /// /// Returns the number of timer interrupts received. 
+#[cfg(target_arch = "x86_64")] #[guest_function("TestTimerInterrupts")] fn test_timer_interrupts(period_us: i32, max_spin: i32) -> i32 { // Reset counter @@ -700,6 +726,7 @@ fn call_given_paramless_hostfunc_that_returns_i64(hostfuncname: String) -> Resul } #[guest_function("UseSSE2Registers")] +#[cfg(target_arch = "x86_64")] fn use_sse2_registers() { let val: f32 = 1.2f32; unsafe { core::arch::asm!("movss xmm1, DWORD PTR [{0}]", in(reg) &val) }; @@ -707,13 +734,27 @@ fn use_sse2_registers() { #[guest_function("SetDr0")] fn set_dr0(value: u64) { - unsafe { core::arch::asm!("mov dr0, {}", in(reg) value) }; + #[cfg(target_arch = "x86_64")] + unsafe { + core::arch::asm!("mov dr0, {}", in(reg) value) + }; + #[cfg(target_arch = "aarch64")] + unsafe { + core::arch::asm!("msr dbgbvr0_el1, {}", in(reg) value) + }; } #[guest_function("GetDr0")] fn get_dr0() -> u64 { let value: u64; - unsafe { core::arch::asm!("mov {}, dr0", out(reg) value) }; + #[cfg(target_arch = "x86_64")] + unsafe { + core::arch::asm!("mov {}, dr0", out(reg) value) + }; + #[cfg(target_arch = "aarch64")] + unsafe { + core::arch::asm!("mrs {}, dbgbvr0_el1", out(reg) value) + }; value } @@ -808,6 +849,60 @@ fn write_mapped_buffer(base: u64, len: u64) -> bool { true } +fn dicachesync(_base: *mut u8, _len: usize) { + #[cfg(target_arch = "aarch64")] + unsafe { + let ctr_el0: u64; + core::arch::asm!("mrs {}, ctr_el0", out(reg) ctr_el0); + let iminline = 4 * (1 << (ctr_el0 & 0xf)); + #[allow(unused)] + let dminline = 4 * (1 << ((ctr_el0 >> 16) & 0xf)); + // See the comment in the `KVM_EXIT_ARM_NISV` case of + // `run_vcpu` in + // src/hyperlight_host/src/hypervisor/virtual_machine/kvm.rs + // for an explanation of why this cache maintenance sequence + // is so complex. 
+ core::arch::asm!(" + ldr xzr, [{addr}] + msr nzcv, xzr + b 2f + + 0: ldr xzr, [{tmp}] + msr nzcv, xzr + b 3f + 1: ldr xzr, [{tmp}] + msr nzcv, xzr + b 4f + + 2: mov {tmp}, {addr} + + 3: dc cvau, {tmp} + b.eq 0b + add {tmp}, {tmp}, {dminline:x} + cmp {tmp}, {max} + b.lt 3b + + dsb ish + + mov {tmp}, {addr} + + 4: ic ivau, {tmp} + b.eq 1b + add {tmp}, {tmp}, {iminline:x} + cmp {tmp}, {max} + b.lt 4b + + dsb ish + isb + ", + iminline = in(reg) iminline, + dminline = in(reg) dminline, + addr = in(reg) _base as usize, + max = in(reg) _base as usize + _len, + tmp = out(reg) _); + } +} + #[guest_function("ExecMappedBuffer")] fn exec_mapped_buffer(base: u64, len: u64) -> bool { let base = base as usize as *mut u8; @@ -831,6 +926,9 @@ fn exec_mapped_buffer(base: u64, len: u64) -> bool { // Should be safe as long as data is something like a NOOP followed by a RET let func: fn() = unsafe { core::mem::transmute(data.as_ptr()) }; + + dicachesync(base, len); + func(); true @@ -987,15 +1085,8 @@ fn corrupt_output_size_prefix() -> i32 { buf[8..12].copy_from_slice(&0xFFFF_FFFBu32.to_le_bytes()); buf[12..16].copy_from_slice(&[0u8; 4]); buf[16..24].copy_from_slice(&8_u64.to_le_bytes()); - - core::arch::asm!( - "out dx, eax", - "cli", - "hlt", - in("dx") hyperlight_common::outb::VmAction::Halt as u16, - in("eax") 0u32, - options(noreturn), - ); + outb_with_port(hyperlight_common::outb::VmAction::Halt as u32, 0u32); + unreachable!(); } } @@ -1010,15 +1101,8 @@ fn corrupt_output_back_pointer() -> i32 { buf[0..8].copy_from_slice(&24_u64.to_le_bytes()); buf[8..16].copy_from_slice(&[0u8; 8]); buf[16..24].copy_from_slice(&0xDEAD_u64.to_le_bytes()); - - core::arch::asm!( - "out dx, eax", - "cli", - "hlt", - in("dx") hyperlight_common::outb::VmAction::Halt as u16, - in("eax") 0u32, - options(noreturn), - ); + outb_with_port(hyperlight_common::outb::VmAction::Halt as u32, 0u32); + unreachable!(); } }