diff --git a/Makefile b/Makefile index 8c22f623b..16c1b792b 100644 --- a/Makefile +++ b/Makefile @@ -87,14 +87,14 @@ opam-virtio-uninstall: opam-ukvm-install: solo5-kernel-ukvm.pc ukvm mkdir -p $(OPAM_UKVM_INCDIR) $(OPAM_UKVM_LIBDIR) cp kernel/solo5.h $(OPAM_UKVM_INCDIR)/solo5.h - cp ukvm/ukvm.h $(OPAM_UKVM_INCDIR)/ukvm.h + cp monitors/ukvm.h $(OPAM_UKVM_INCDIR)/ukvm.h mkdir -p $(OPAM_UKVM_INCDIR)/host cp -R include-host/. $(OPAM_UKVM_INCDIR)/host cp kernel/ukvm/solo5.o kernel/ukvm/solo5.lds $(OPAM_UKVM_LIBDIR) mkdir -p $(OPAM_BINDIR) mkdir -p $(OPAM_UKVM_LIBDIR)/src - cp -R ukvm $(OPAM_UKVM_LIBDIR)/src - cp ukvm/ukvm-configure $(OPAM_BINDIR) + cp -R monitors $(OPAM_UKVM_LIBDIR)/src + cp monitors/ukvm-configure $(OPAM_BINDIR) mkdir -p $(PREFIX)/lib/pkgconfig cp solo5-kernel-ukvm.pc $(PREFIX)/lib/pkgconfig diff --git a/Makefile.common b/Makefile.common index 5ea4737dd..a0ea50106 100644 --- a/Makefile.common +++ b/Makefile.common @@ -17,10 +17,9 @@ # CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. # Be clear about some common programs we use! -AS=as -CC=cc -LD=ld -OBJCOPY=objcopy +CC?=cc +LD?=ld +OBJCOPY?=objcopy # This Makefile defines global defaults for building Solo5 (kernel/) and the # in-tree test programs. These can be overriden either on the command line, or @@ -30,6 +29,7 @@ include $(TOP)/Makeconf # Exported to OPAM via pkg-config. MD_CFLAGS=$(HOST_CFLAGS) -ffreestanding -mno-red-zone # Likewise. -LDFLAGS=-nostdlib -z max-page-size=0x1000 -static +# LDFLAGS=-nostdlib -z max-page-size=0x1000 -static +LDFLAGS=-static -arch x86_64 # CFLAGS used for building kernel/ and in-tree tests. 
CFLAGS=$(MD_CFLAGS) -isystem $(TOP)/include-host -std=gnu99 -Wall -Wextra -Werror -O2 -g diff --git a/configure.sh b/configure.sh index 0f2056887..30536c58e 100755 --- a/configure.sh +++ b/configure.sh @@ -25,7 +25,7 @@ die() cc_is_clang() { - ${CC:-cc} -v 2>&1 | grep -q "clang version" + ${CC:-cc} -v 2>&1 | grep -q "clang" } cc_is_gcc() @@ -33,6 +33,16 @@ cc_is_gcc() ${CC:-cc} -v 2>&1 | grep -q "^gcc version" } +ld_is_x86_64_elf() +{ + ${LD:-ld} --help 2>&1 |grep "supported targets" 2>&1 | grep -q elf64-x86-64 +} + +objcopy_exists() +{ + ${OBJCOPY:-objcopy} --version 2>&1 | grep -q "GNU objcopy" +} + # Host-provided header files are installed here for in-tree builds. OPAM will # install these to $(OPAM_INCDIR)/host where they will be picked up by # pkg-config. @@ -76,6 +86,57 @@ case $(uname -s) in BUILD_UKVM= BUILD_VIRTIO="yes" ;; + Darwin) + # On MacOSX we use clang and approximate the FreeBSD header + # copying. While clang is good enough to build the kernel, we + # need a cross-linker that understands ELF, rather than + # Mach-O. As of Jan 2016, the llvm linker was not able to + # link the unikernel, so we require a GNU binutils linker (and + # objcopy). + # + # Binutils from Homebrew does not contain ld, so you can build + # from source. After downloading binutils source, do: + # + # ./configure --prefix=/usr/local/opt/binutils-x86_64 \ + # --target=x86_64-elf --disable-nls --disable-werror + # make + # make install + # + # Then add the newly compiled cross-binutils to your path + # export PATH="/usr/local/opt/binutils-x86_64/bin:$PATH" + cc_is_clang || die "Only 'clang' is supported on MacOSX" + # ld_is_x86_64_elf || die "LD must be a cross-linker for elf64-x86-64." \ + # "You may need to build GNU binutils from source." + # objcopy_exists || die "OBJCOPY should point to GNU objcopy." \ + # "You may need to build GNU binutils from source." 
+ INCDIR=/usr/include + SRCS_MACH="machine/_types.h machine/endian.h \ + machine/_limits.h" + SRCS_SYS="sys/cdefs.h sys/_symbol_aliasing.h sys/_posix_availability.h \ + sys/_endian.h sys/_types.h" + SRCS_SYS_PTHREAD="sys/_pthread/_pthread_types.h" + SRCS_X86="i386/endian.h i386/_types.h i386/_limits.h" + SRCS_LIBKERN="libkern/_OSByteOrder.h" + SRCS_LIBKERN_I386="libkern/i386/_OSByteOrder.h" + SRCS="_types.h" + + mkdir -p ${HOST_INCDIR} + mkdir -p ${HOST_INCDIR}/machine ${HOST_INCDIR}/sys ${HOST_INCDIR}/i386 \ + ${HOST_INCDIR}/libkern ${HOST_INCDIR}/libkern/i386 \ + ${HOST_INCDIR}/sys/_pthread + for f in ${SRCS_MACH}; do cp -f ${INCDIR}/$f ${HOST_INCDIR}/machine; done + for f in ${SRCS_SYS}; do cp -f ${INCDIR}/$f ${HOST_INCDIR}/sys; done + for f in ${SRCS_SYS_PTHREAD}; do cp -f ${INCDIR}/$f ${HOST_INCDIR}/sys/_pthread; done + for f in ${SRCS_X86}; do cp -f ${INCDIR}/$f ${HOST_INCDIR}/i386; done + for f in ${SRCS_LIBKERN}; do cp -f ${INCDIR}/$f ${HOST_INCDIR}/libkern; done + for f in ${SRCS_LIBKERN_I386}; do cp -f ${INCDIR}/$f ${HOST_INCDIR}/libkern/i386; done + for f in ${SRCS}; do cp -f ${INCDIR}/$f ${HOST_INCDIR}; done + + # Where is the stack protector library on OSX? 
+ HOST_CFLAGS="-nostdlibinc -arch x86_64 -fno-stack-protector" + BUILD_UKVM="yes" + BUILD_VIRTIO= + ;; *) die "Unsupported build OS: $(uname -s)" ;; diff --git a/kernel/Makefile b/kernel/Makefile index 22d38ab8c..bd3058001 100644 --- a/kernel/Makefile +++ b/kernel/Makefile @@ -20,10 +20,7 @@ include $(TOP)/Makefile.common COMMON_COBJS=\ abort.o \ -cpu.o \ -cpu_vectors.o \ ee_printf.o \ -intr.o \ lib.o \ malloc.o \ exit.o \ @@ -31,18 +28,20 @@ pvclock.o UKVM_COBJS=\ ukvm/kernel.o \ -ukvm/gdt.o \ ukvm/io.o \ ukvm/platform.o \ -ukvm/platform_intr.o \ ukvm/mem.o \ ukvm/time.o \ +ukvm/ukvmclock.o \ $(COMMON_COBJS) VIRTIO_COBJS=\ virtio/boot.o \ +virtio/cpu.o \ +virtio/cpu_vectors.o \ virtio/kernel.o \ virtio/platform.o \ +virtio/intr.o \ virtio/platform_intr.o \ virtio/mem.o \ virtio/pci.o \ @@ -78,7 +77,7 @@ virtio/solo5.o: $(VIRTIO_COBJS) virtio/solo5.lds ukvm/solo5.o: $(UKVM_COBJS) ukvm/solo5.lds $(LD) -r $(LDFLAGS) -o $@ $(UKVM_COBJS) - $(OBJCOPY) -w -G solo5_\* -G _start $@ $@ +# $(OBJCOPY) -w -G solo5_\* -G _start $@ $@ .PHONY: clean clean: diff --git a/kernel/abort.c b/kernel/abort.c index d3410e6d4..2c9409806 100644 --- a/kernel/abort.c +++ b/kernel/abort.c @@ -27,7 +27,7 @@ static void puts(const char *s) { - (void)platform_puts(s, strlen(s)); + (void)solo5_console_write(s, strlen(s)); } void _assert_fail(const char *file, const char *line, const char *e) diff --git a/kernel/cpu.h b/kernel/cpu.h new file mode 100644 index 000000000..05f57dccb --- /dev/null +++ b/kernel/cpu.h @@ -0,0 +1,85 @@ +/* + * Copyright (c) 2015-2017 Contributors as noted in the AUTHORS file + * + * This file is part of Solo5, a unikernel base layer. + * + * Permission to use, copy, modify, and/or distribute this software + * for any purpose with or without fee is hereby granted, provided + * that the above copyright notice and this permission notice appear + * in all copies. 
+ * + * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL + * WARRANTIES WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED + * WARRANTIES OF MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE + * AUTHOR BE LIABLE FOR ANY SPECIAL, DIRECT, INDIRECT, OR + * CONSEQUENTIAL DAMAGES OR ANY DAMAGES WHATSOEVER RESULTING FROM LOSS + * OF USE, DATA OR PROFITS, WHETHER IN AN ACTION OF CONTRACT, + * NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF OR IN + * CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. + */ + +#ifndef __CPU_H__ +#define __CPU_H__ + +/* compiler-only memory "barrier" */ +#define cpu_cc_barrier() __asm__ __volatile__("" : : : "memory") +#define cpu_mfence_memory() __asm__ ("mfence" ::: "memory"); + +static inline void cpu_halt(void) __attribute__((noreturn)); +static inline void cpu_halt(void) { + for (;;) + __asm__("hlt"); +} + +static inline uint64_t cpu_rdtsc(void) +{ + uint32_t edx_, eax_; + + __asm__("rdtsc" : "=a" (eax_), "=d" (edx_)); + return (uint64_t)eax_ + ((uint64_t)edx_ << 32); +} + +static inline void cpu_wrmsr(uint64_t a, uint32_t b, uint32_t c) +{ + __asm__ __volatile("wrmsr" :: + "c" (a), + "a" (b), + "d" (c) + ); +} + +static inline void cpu_x86_cpuid(uint32_t level, + uint32_t *eax_out, uint32_t *ebx_out, + uint32_t *ecx_out, uint32_t *edx_out) +{ + uint32_t eax_, ebx_, ecx_, edx_; + + __asm__( + "cpuid" + : "=a" (eax_), "=b" (ebx_), "=c" (ecx_), "=d" (edx_) + : "0" (level) + ); + *eax_out = eax_; + *ebx_out = ebx_; + *ecx_out = ecx_; + *edx_out = edx_; +} + + + +static inline uint64_t cpu_mul64_32(uint64_t a, uint32_t b) +{ + uint64_t prod; + + __asm__ ( + "mul %%rdx ; " + "shrd $32, %%rdx, %%rax" + : "=a" (prod) + : "0" (a), "d" ((uint64_t)b) + ); + + return prod; +} + + +#endif diff --git a/kernel/ee_printf.c b/kernel/ee_printf.c index b12bf2029..20cc74d41 100644 --- a/kernel/ee_printf.c +++ b/kernel/ee_printf.c @@ -709,7 +709,7 @@ static int ee_vprintf(const char *fmt, va_list args) n = vsnprintf(buf, 
PRINTF_BUF_LEN, fmt, args); assert(n < PRINTF_BUF_LEN); - return platform_puts(buf, n); + return solo5_console_write(buf, n); } int printf(const char *fmt, ...) diff --git a/kernel/kernel.h b/kernel/kernel.h index c718e79c3..ceec980a6 100644 --- a/kernel/kernel.h +++ b/kernel/kernel.h @@ -22,6 +22,7 @@ #define __KERNEL_H__ #include "solo5.h" +#include "cpu.h" /* This is the main header file for everything in the kernel */ @@ -32,31 +33,11 @@ #include #include -/* alignment macros */ -#define ALIGN_4K __attribute__((aligned(0x1000))) -#define ALIGN_64_BIT __attribute__((aligned(0x8))) - /* memory defines */ #define PAGE_SIZE 4096 #define PAGE_SHIFT 12 #define PAGE_MASK ~(0xfff) -/* We have already set up the GDT for the kernel. Here are the - * descriptor numbers (useful for when the kernel sets up the IDT) - */ -#define GDT_NUM_ENTRIES 6 -#define GDT_DESC_NULL 0 -#define GDT_DESC_CODE 1 -/* 2 == unused / 32-bit bootstrap */ -#define GDT_DESC_DATA 3 -#define GDT_DESC_TSS_LO 4 -#define GDT_DESC_TSS_HI 5 -#define GDT_DESC_TSS GDT_DESC_TSS_LO -#define GDT_DESC_OFFSET(n) ((n) * 0x8) - -/* We have already loaded a "known good" stack in the TSS */ -#define TSS_IST_INDEX 0x1 - /* convenient macro stringification */ #define STR_EXPAND(y) #y #define STR(x) STR_EXPAND(x) @@ -71,32 +52,12 @@ void _abort(const char *, const char *, const char *) do { \ _abort(__FILE__, STR(__LINE__), s); \ } while (0) - #define assert(e) \ do { \ if (!(e)) \ _assert_fail(__FILE__, STR(__LINE__), #e); \ } while (0) -/* cpu.S: low-level CPU functions */ -void cpu_halt(void) __attribute__((noreturn)); -void cpu_tss_load(uint16_t); -void cpu_idt_load(uint64_t); -void cpu_gdt_load(uint64_t); -void cpu_sse_enable(void); -uint64_t cpu_rdtsc(void); - -/* intr.c: interrupt handling */ -void intr_init(void); -void intr_enable(void); -void intr_disable(void); -void intr_register_irq(unsigned irq, int (*handler)(void *), void *arg); -extern int intr_depth; - -/* mem.c: low-level page alloc routines */ 
-uint64_t mem_max_addr(void); -void *sbrk(intptr_t increment); - /* malloc.c: memory allocation */ void *malloc(size_t bytes); void free(void *ptr); @@ -125,90 +86,12 @@ size_t strlen(const char *s); /* platform.c: specifics for ukvm or virito platform */ void platform_exit(void) __attribute__((noreturn)); -int platform_puts(const char *buf, int n); - -/* platform_intr.c: platform-specific interrupt handling */ -void platform_intr_init(void); -void platform_intr_clear_irq(unsigned irq); -void platform_intr_mask_irq(unsigned irq); -void platform_intr_ack_irq(unsigned irq); /* pvclock.c: KVM paravirtualized clock */ int pvclock_init(void); uint64_t pvclock_monotonic(void); uint64_t pvclock_epochoffset(void); -/* accessing devices via port space */ - -static inline void outb(uint16_t port, uint8_t v) -{ - __asm__ __volatile__("outb %0,%1" : : "a" (v), "dN" (port)); -} -static inline void outw(uint16_t port, uint16_t v) -{ - __asm__ __volatile__("outw %0,%1" : : "a" (v), "dN" (port)); -} -static inline void outl(uint16_t port, uint32_t v) -{ - __asm__ __volatile__("outl %0,%1" : : "a" (v), "dN" (port)); -} -static inline uint8_t inb(uint16_t port) -{ - uint8_t v; - - __asm__ __volatile__("inb %1,%0" : "=a" (v) : "dN" (port)); - return v; -} -static inline uint16_t inw(uint16_t port) -{ - uint16_t v; - - __asm__ __volatile__("inw %1,%0" : "=a" (v) : "dN" (port)); - return v; -} -static inline uint32_t inl(uint16_t port) -{ - uint32_t v; - - __asm__ __volatile__("inl %1,%0" : "=a" (v) : "dN" (port)); - return v; -} - -static inline uint64_t inq(uint16_t port_lo) -{ - uint16_t port_hi = port_lo + 4; - uint32_t lo, hi; - - __asm__ __volatile__("inl %1,%0" : "=a" (lo) : "dN" (port_lo)); - __asm__ __volatile__("inl %1,%0" : "=a" (hi) : "dN" (port_hi)); - - return ((uint64_t)lo) | ((uint64_t)hi << 32); -} - -static inline uint64_t mul64_32(uint64_t a, uint32_t b) -{ - uint64_t prod; - - __asm__ ( - "mul %%rdx ; " - "shrd $32, %%rdx, %%rax" - : "=a" (prod) - : "0" (a), "d" 
((uint64_t)b) - ); - - return prod; -} - -/* compiler-only memory "barrier" */ -#define cc_barrier() __asm__ __volatile__("" : : : "memory") - -/* should only use outside of interrupt context */ -#define atomic_printf(x...) do { \ - intr_disable(); \ - printf(x); \ - intr_enable(); \ - } while (0) - #define NSEC_PER_SEC 1000000000ULL #endif diff --git a/kernel/malloc.c b/kernel/malloc.c index b7f58b2e9..200995ef2 100644 --- a/kernel/malloc.c +++ b/kernel/malloc.c @@ -61,14 +61,6 @@ #define ENOMEM -1 /* Out of memory */ #define EINVAL -1 /* Invalid argument */ -/* - * Export Solo5 public interfaces defined in this file - */ -void *solo5_malloc(size_t) __attribute__ ((alias ("malloc"))); -void solo5_free(void *) __attribute__ ((alias ("free"))); -void *solo5_calloc(size_t, size_t) __attribute__ ((alias ("calloc"))); -void *solo5_realloc(void *, size_t) __attribute__ ((alias ("realloc"))); - /* This is a version (aka dlmalloc) of malloc/free/realloc written by Doug Lea and released to the public domain, as explained at @@ -655,26 +647,6 @@ MAX_RELEASE_CHECK_RATE default: 4095 unless not HAVE_MMAP /* The maximum possible size_t value has all bits set */ #define MAX_SIZE_T (~(size_t)0) -#ifndef USE_LOCKS /* ensure true if spin or recursive locks set */ -#define USE_LOCKS ((defined(USE_SPIN_LOCKS) && USE_SPIN_LOCKS != 0) || \ - (defined(USE_RECURSIVE_LOCKS) && USE_RECURSIVE_LOCKS != 0)) -#endif /* USE_LOCKS */ - -#if USE_LOCKS /* Spin locks for gcc >= 4.1, older gcc on x86, MSC >= 1310 */ -#if ((defined(__GNUC__) && \ - ((__GNUC__ > 4 || (__GNUC__ == 4 && __GNUC_MINOR__ >= 1)) || \ - defined(__i386__) || defined(__x86_64__))) || \ - (defined(_MSC_VER) && _MSC_VER>=1310)) -#ifndef USE_SPIN_LOCKS -#define USE_SPIN_LOCKS 1 -#endif /* USE_SPIN_LOCKS */ -#elif USE_SPIN_LOCKS -#error "USE_SPIN_LOCKS defined without implementation" -#endif /* ... locks available... 
*/ -#elif !defined(USE_SPIN_LOCKS) -#define USE_SPIN_LOCKS 0 -#endif /* USE_LOCKS */ - #ifndef ONLY_MSPACES #define ONLY_MSPACES 0 #endif /* ONLY_MSPACES */ @@ -884,12 +856,15 @@ extern "C" { /* ------------------- Declarations of public routines ------------------- */ #ifndef USE_DL_PREFIX -#define dlcalloc calloc -#define dlfree free -#define dlmalloc malloc + +/* Export Solo5 public interfaces defined in this file */ +#define dlcalloc solo5_calloc +#define dlfree solo5_free +#define dlmalloc solo5_malloc +#define dlrealloc solo5_realloc + #define dlmemalign memalign #define dlposix_memalign posix_memalign -#define dlrealloc realloc #define dlrealloc_in_place realloc_in_place #define dlvalloc valloc #define dlpvalloc pvalloc @@ -1561,39 +1536,6 @@ extern void* sbrk(ptrdiff_t); #endif /* LACKS_UNISTD_H */ /* Declarations for locking */ -#if USE_LOCKS -#ifndef WIN32 -#if defined (__SVR4) && defined (__sun) /* solaris */ -#include -#elif !defined(LACKS_SCHED_H) -#include -#endif /* solaris or LACKS_SCHED_H */ -#if (defined(USE_RECURSIVE_LOCKS) && USE_RECURSIVE_LOCKS != 0) || !USE_SPIN_LOCKS -#include -#endif /* USE_RECURSIVE_LOCKS ... 
*/ -#elif defined(_MSC_VER) -#ifndef _M_AMD64 -/* These are already defined on AMD64 builds */ -#ifdef __cplusplus -extern "C" { -#endif /* __cplusplus */ -LONG __cdecl _InterlockedCompareExchange(LONG volatile *Dest, LONG Exchange, LONG Comp); -LONG __cdecl _InterlockedExchange(LONG volatile *Target, LONG Value); -#ifdef __cplusplus -} -#endif /* __cplusplus */ -#endif /* _M_AMD64 */ -#pragma intrinsic (_InterlockedCompareExchange) -#pragma intrinsic (_InterlockedExchange) -#define interlockedcompareexchange _InterlockedCompareExchange -#define interlockedexchange _InterlockedExchange -#elif defined(WIN32) && defined(__GNUC__) -#define interlockedcompareexchange(a, b, c) __sync_val_compare_and_swap(a, c, b) -#define interlockedexchange __sync_lock_test_and_set -#endif /* Win32 */ -#else /* USE_LOCKS */ -#endif /* USE_LOCKS */ - #ifndef LOCK_AT_FORK #define LOCK_AT_FORK 0 #endif @@ -1871,249 +1813,13 @@ static FORCEINLINE int win32munmap(void* ptr, size_t size) { */ -#if !USE_LOCKS +/* For Solo5, we don't USE_LOCKS */ #define USE_LOCK_BIT (0U) #define INITIAL_LOCK(l) (0) #define DESTROY_LOCK(l) (0) #define ACQUIRE_MALLOC_GLOBAL_LOCK() #define RELEASE_MALLOC_GLOBAL_LOCK() -#else -#if USE_LOCKS > 1 -/* ----------------------- User-defined locks ------------------------ */ -/* Define your own lock implementation here */ -/* #define INITIAL_LOCK(lk) ... */ -/* #define DESTROY_LOCK(lk) ... */ -/* #define ACQUIRE_LOCK(lk) ... */ -/* #define RELEASE_LOCK(lk) ... */ -/* #define TRY_LOCK(lk) ... */ -/* static MLOCK_T malloc_global_mutex = ... 
*/ - -#elif USE_SPIN_LOCKS - -/* First, define CAS_LOCK and CLEAR_LOCK on ints */ -/* Note CAS_LOCK defined to return 0 on success */ - -#if defined(__GNUC__)&& (__GNUC__ > 4 || (__GNUC__ == 4 && __GNUC_MINOR__ >= 1)) -#define CAS_LOCK(sl) __sync_lock_test_and_set(sl, 1) -#define CLEAR_LOCK(sl) __sync_lock_release(sl) - -#elif (defined(__GNUC__) && (defined(__i386__) || defined(__x86_64__))) -/* Custom spin locks for older gcc on x86 */ -static FORCEINLINE int x86_cas_lock(int *sl) { - int ret; - int val = 1; - int cmp = 0; - __asm__ __volatile__ ("lock; cmpxchgl %1, %2" - : "=a" (ret) - : "r" (val), "m" (*(sl)), "0"(cmp) - : "memory", "cc"); - return ret; -} - -static FORCEINLINE void x86_clear_lock(int* sl) { - assert(*sl != 0); - int prev = 0; - int ret; - __asm__ __volatile__ ("lock; xchgl %0, %1" - : "=r" (ret) - : "m" (*(sl)), "0"(prev) - : "memory"); -} - -#define CAS_LOCK(sl) x86_cas_lock(sl) -#define CLEAR_LOCK(sl) x86_clear_lock(sl) - -#else /* Win32 MSC */ -#define CAS_LOCK(sl) interlockedexchange(sl, (LONG)1) -#define CLEAR_LOCK(sl) interlockedexchange (sl, (LONG)0) - -#endif /* ... gcc spins locks ... */ - -/* How to yield for a spin lock */ -#define SPINS_PER_YIELD 63 -#if defined(_MSC_VER) -#define SLEEP_EX_DURATION 50 /* delay for yield/sleep */ -#define SPIN_LOCK_YIELD SleepEx(SLEEP_EX_DURATION, FALSE) -#elif defined (__SVR4) && defined (__sun) /* solaris */ -#define SPIN_LOCK_YIELD thr_yield(); -#elif !defined(LACKS_SCHED_H) -#define SPIN_LOCK_YIELD sched_yield(); -#else -#define SPIN_LOCK_YIELD -#endif /* ... yield ... 
*/ - -#if !defined(USE_RECURSIVE_LOCKS) || USE_RECURSIVE_LOCKS == 0 -/* Plain spin locks use single word (embedded in malloc_states) */ -static int spin_acquire_lock(int *sl) { - int spins = 0; - while (*(volatile int *)sl != 0 || CAS_LOCK(sl)) { - if ((++spins & SPINS_PER_YIELD) == 0) { - SPIN_LOCK_YIELD; - } - } - return 0; -} - -#define MLOCK_T int -#define TRY_LOCK(sl) !CAS_LOCK(sl) -#define RELEASE_LOCK(sl) CLEAR_LOCK(sl) -#define ACQUIRE_LOCK(sl) (CAS_LOCK(sl)? spin_acquire_lock(sl) : 0) -#define INITIAL_LOCK(sl) (*sl = 0) -#define DESTROY_LOCK(sl) (0) -static MLOCK_T malloc_global_mutex = 0; - -#else /* USE_RECURSIVE_LOCKS */ -/* types for lock owners */ -#ifdef WIN32 -#define THREAD_ID_T DWORD -#define CURRENT_THREAD GetCurrentThreadId() -#define EQ_OWNER(X,Y) ((X) == (Y)) -#else -/* - Note: the following assume that pthread_t is a type that can be - initialized to (casted) zero. If this is not the case, you will need to - somehow redefine these or not use spin locks. -*/ -#define THREAD_ID_T pthread_t -#define CURRENT_THREAD pthread_self() -#define EQ_OWNER(X,Y) pthread_equal(X, Y) -#endif - -struct malloc_recursive_lock { - int sl; - unsigned int c; - THREAD_ID_T threadid; -}; - -#define MLOCK_T struct malloc_recursive_lock -static MLOCK_T malloc_global_mutex = { 0, 0, (THREAD_ID_T)0}; - -static FORCEINLINE void recursive_release_lock(MLOCK_T *lk) { - assert(lk->sl != 0); - if (--lk->c == 0) { - CLEAR_LOCK(&lk->sl); - } -} - -static FORCEINLINE int recursive_acquire_lock(MLOCK_T *lk) { - THREAD_ID_T mythreadid = CURRENT_THREAD; - int spins = 0; - for (;;) { - if (*((volatile int *)(&lk->sl)) == 0) { - if (!CAS_LOCK(&lk->sl)) { - lk->threadid = mythreadid; - lk->c = 1; - return 0; - } - } - else if (EQ_OWNER(lk->threadid, mythreadid)) { - ++lk->c; - return 0; - } - if ((++spins & SPINS_PER_YIELD) == 0) { - SPIN_LOCK_YIELD; - } - } -} - -static FORCEINLINE int recursive_try_lock(MLOCK_T *lk) { - THREAD_ID_T mythreadid = CURRENT_THREAD; - if (*((volatile int 
*)(&lk->sl)) == 0) { - if (!CAS_LOCK(&lk->sl)) { - lk->threadid = mythreadid; - lk->c = 1; - return 1; - } - } - else if (EQ_OWNER(lk->threadid, mythreadid)) { - ++lk->c; - return 1; - } - return 0; -} - -#define RELEASE_LOCK(lk) recursive_release_lock(lk) -#define TRY_LOCK(lk) recursive_try_lock(lk) -#define ACQUIRE_LOCK(lk) recursive_acquire_lock(lk) -#define INITIAL_LOCK(lk) ((lk)->threadid = (THREAD_ID_T)0, (lk)->sl = 0, (lk)->c = 0) -#define DESTROY_LOCK(lk) (0) -#endif /* USE_RECURSIVE_LOCKS */ - -#elif defined(WIN32) /* Win32 critical sections */ -#define MLOCK_T CRITICAL_SECTION -#define ACQUIRE_LOCK(lk) (EnterCriticalSection(lk), 0) -#define RELEASE_LOCK(lk) LeaveCriticalSection(lk) -#define TRY_LOCK(lk) TryEnterCriticalSection(lk) -#define INITIAL_LOCK(lk) (!InitializeCriticalSectionAndSpinCount((lk), 0x80000000|4000)) -#define DESTROY_LOCK(lk) (DeleteCriticalSection(lk), 0) -#define NEED_GLOBAL_LOCK_INIT - -static MLOCK_T malloc_global_mutex; -static volatile LONG malloc_global_mutex_status; - -/* Use spin loop to initialize global lock */ -static void init_malloc_global_mutex() { - for (;;) { - long stat = malloc_global_mutex_status; - if (stat > 0) - return; - /* transition to < 0 while initializing, then to > 0) */ - if (stat == 0 && - interlockedcompareexchange(&malloc_global_mutex_status, (LONG)-1, (LONG)0) == 0) { - InitializeCriticalSection(&malloc_global_mutex); - interlockedexchange(&malloc_global_mutex_status, (LONG)1); - return; - } - SleepEx(0, FALSE); - } -} - -#else /* pthreads-based locks */ -#define MLOCK_T pthread_mutex_t -#define ACQUIRE_LOCK(lk) pthread_mutex_lock(lk) -#define RELEASE_LOCK(lk) pthread_mutex_unlock(lk) -#define TRY_LOCK(lk) (!pthread_mutex_trylock(lk)) -#define INITIAL_LOCK(lk) pthread_init_lock(lk) -#define DESTROY_LOCK(lk) pthread_mutex_destroy(lk) - -#if defined(USE_RECURSIVE_LOCKS) && USE_RECURSIVE_LOCKS != 0 && defined(linux) && !defined(PTHREAD_MUTEX_RECURSIVE) -/* Cope with old-style linux recursive lock 
initialization by adding */ -/* skipped internal declaration from pthread.h */ -extern int pthread_mutexattr_setkind_np __P ((pthread_mutexattr_t *__attr, - int __kind)); -#define PTHREAD_MUTEX_RECURSIVE PTHREAD_MUTEX_RECURSIVE_NP -#define pthread_mutexattr_settype(x,y) pthread_mutexattr_setkind_np(x,y) -#endif /* USE_RECURSIVE_LOCKS ... */ - -static MLOCK_T malloc_global_mutex = PTHREAD_MUTEX_INITIALIZER; - -static int pthread_init_lock (MLOCK_T *lk) { - pthread_mutexattr_t attr; - if (pthread_mutexattr_init(&attr)) return 1; -#if defined(USE_RECURSIVE_LOCKS) && USE_RECURSIVE_LOCKS != 0 - if (pthread_mutexattr_settype(&attr, PTHREAD_MUTEX_RECURSIVE)) return 1; -#endif - if (pthread_mutex_init(lk, &attr)) return 1; - if (pthread_mutexattr_destroy(&attr)) return 1; - return 0; -} - -#endif /* ... lock types ... */ - -/* Common code for all lock types */ -#define USE_LOCK_BIT (2U) - -#ifndef ACQUIRE_MALLOC_GLOBAL_LOCK -#define ACQUIRE_MALLOC_GLOBAL_LOCK() ACQUIRE_LOCK(&malloc_global_mutex); -#endif - -#ifndef RELEASE_MALLOC_GLOBAL_LOCK -#define RELEASE_MALLOC_GLOBAL_LOCK() RELEASE_LOCK(&malloc_global_mutex); -#endif - -#endif /* USE_LOCKS */ - /* ----------------------- Chunk representations ------------------------ */ /* @@ -2663,9 +2369,6 @@ struct malloc_state { size_t max_footprint; size_t footprint_limit; /* zero means no limit */ flag_t mflags; -#if USE_LOCKS - MLOCK_T mutex; /* locate lock among fields that rarely change */ -#endif /* USE_LOCKS */ msegment seg; void* extp; /* Unused but available for extensions */ size_t exts; @@ -2713,11 +2416,8 @@ static struct malloc_state _gm_; #define use_lock(M) ((M)->mflags & USE_LOCK_BIT) #define enable_lock(M) ((M)->mflags |= USE_LOCK_BIT) -#if USE_LOCKS -#define disable_lock(M) ((M)->mflags &= ~USE_LOCK_BIT) -#else #define disable_lock(M) -#endif + #define use_mmap(M) ((M)->mflags & USE_MMAP_BIT) #define enable_mmap(M) ((M)->mflags |= USE_MMAP_BIT) @@ -2809,11 +2509,6 @@ static int has_segment_link(mstate m, 
msegmentptr ss) { anything you like. */ -#if USE_LOCKS -#define PREACTION(M) ((use_lock(M))? ACQUIRE_LOCK(&(M)->mutex) : 0) -#define POSTACTION(M) { if (use_lock(M)) RELEASE_LOCK(&(M)->mutex); } -#else /* USE_LOCKS */ - #ifndef PREACTION #define PREACTION(M) (0) #endif /* PREACTION */ @@ -2822,8 +2517,6 @@ static int has_segment_link(mstate m, msegmentptr ss) { #define POSTACTION(M) #endif /* POSTACTION */ -#endif /* USE_LOCKS */ - /* CORRUPTION_ERROR_ACTION is triggered upon detected bad addresses. USAGE_ERROR_ACTION is triggered on detected bad frees and @@ -4632,10 +4325,6 @@ void* dlmalloc(size_t bytes) { The ugly goto's here ensure that postaction occurs along all paths. */ -#if USE_LOCKS - ensure_initialization(); /* initialize in sys_alloc if not using locks */ -#endif - if (!PREACTION(gm)) { void* mem; size_t nb; @@ -6351,3 +6040,5 @@ int mspace_mallopt(int param_number, int value) { structure of old version, but most details differ.) */ + + diff --git a/kernel/pvclock.c b/kernel/pvclock.c index eb8d0ed99..761273601 100644 --- a/kernel/pvclock.c +++ b/kernel/pvclock.c @@ -77,38 +77,21 @@ struct pvclock_wall_clock { static volatile struct pvclock_vcpu_time_info pvclock_ti; static volatile struct pvclock_wall_clock pvclock_wc; -static inline void -x86_cpuid(uint32_t level, uint32_t *eax_out, uint32_t *ebx_out, - uint32_t *ecx_out, uint32_t *edx_out) -{ - uint32_t eax_, ebx_, ecx_, edx_; - - __asm__( - "cpuid" - : "=a" (eax_), "=b" (ebx_), "=c" (ecx_), "=d" (edx_) - : "0" (level) - ); - *eax_out = eax_; - *ebx_out = ebx_; - *ecx_out = ecx_; - *edx_out = edx_; -} - uint64_t pvclock_monotonic(void) { uint32_t version; uint64_t delta, time_now; do { version = pvclock_ti.version; - __asm__ ("mfence" ::: "memory"); + cpu_mfence_memory(); delta = cpu_rdtsc() - pvclock_ti.tsc_timestamp; if (pvclock_ti.tsc_shift < 0) delta >>= -pvclock_ti.tsc_shift; else delta <<= pvclock_ti.tsc_shift; - time_now = mul64_32(delta, pvclock_ti.tsc_to_system_mul) + + time_now = 
cpu_mul64_32(delta, pvclock_ti.tsc_to_system_mul) + pvclock_ti.system_time; - __asm__ ("mfence" ::: "memory"); + cpu_mfence_memory(); } while ((pvclock_ti.version & 1) || (pvclock_ti.version != version)); return time_now; @@ -124,10 +107,10 @@ static uint64_t pvclock_read_wall_clock(void) do { version = pvclock_wc.version; - __asm__ ("mfence" ::: "memory"); + cpu_mfence_memory(); wc_boot = pvclock_wc.sec * NSEC_PER_SEC; wc_boot += pvclock_wc.nsec; - __asm__ ("mfence" ::: "memory"); + cpu_mfence_memory(); } while ((pvclock_wc.version & 1) || (pvclock_wc.version != version)); return wc_boot; @@ -141,7 +124,7 @@ int pvclock_init(void) { * Prefer new-style MSRs, and bail entirely if neither is indicated as * available by CPUID. */ - x86_cpuid(0x40000001, &eax, &ebx, &ecx, &edx); + cpu_x86_cpuid(0x40000001, &eax, &ebx, &ecx, &edx); if (eax & (1 << 3)) { msr_kvm_system_time = 0x4b564d01; msr_kvm_wall_clock = 0x4b564d00; @@ -156,16 +139,13 @@ int pvclock_init(void) { printf("Solo5: Clock source: KVM paravirtualized clock\n"); - __asm__ __volatile("wrmsr" :: - "c" (msr_kvm_system_time), - "a" ((uint32_t)((uintptr_t)&pvclock_ti | 0x1)), - "d" ((uint32_t)((uintptr_t)&pvclock_ti >> 32)) - ); - __asm__ __volatile("wrmsr" :: - "c" (msr_kvm_wall_clock), - "a" ((uint32_t)((uintptr_t)&pvclock_wc)), - "d" ((uint32_t)((uintptr_t)&pvclock_wc >> 32)) - ); + cpu_wrmsr(msr_kvm_system_time, + (uint32_t)((uintptr_t)&pvclock_ti | 0x1), + (uint32_t)((uintptr_t)&pvclock_ti >> 32)); + cpu_wrmsr(msr_kvm_wall_clock, + (uint32_t)((uintptr_t)&pvclock_wc), + (uint32_t)((uintptr_t)&pvclock_wc >> 32)); + /* Initialise epoch offset using wall clock time */ wc_epochoffset = pvclock_read_wall_clock(); diff --git a/kernel/solo5.h b/kernel/solo5.h index d03c2e1fe..bb81d1770 100644 --- a/kernel/solo5.h +++ b/kernel/solo5.h @@ -65,4 +65,14 @@ uint64_t solo5_clock_wall(void); */ int solo5_poll(uint64_t until_nsecs); + +/* TODO: Replace lib stuff */ +void *memset(void *dest, int c, size_t n); +void 
*memcpy(void *restrict dest, const void *restrict src, size_t n); +void *memmove(void *dest, const void *src, size_t n); +int memcmp(const void *vl, const void *vr, size_t n); +int strcmp(const char *l, const char *r); +char *strcpy(char *restrict dest, const char *restrict src); +size_t strlen(const char *s); + #endif diff --git a/kernel/ukvm/gdt.c b/kernel/ukvm/gdt.c deleted file mode 100644 index 677fa093d..000000000 --- a/kernel/ukvm/gdt.c +++ /dev/null @@ -1,58 +0,0 @@ -/* - * Copyright (c) 2015-2017 Contributors as noted in the AUTHORS file - * - * This file is part of Solo5, a unikernel base layer. - * - * Permission to use, copy, modify, and/or distribute this software - * for any purpose with or without fee is hereby granted, provided - * that the above copyright notice and this permission notice appear - * in all copies. - * - * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL - * WARRANTIES WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED - * WARRANTIES OF MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE - * AUTHOR BE LIABLE FOR ANY SPECIAL, DIRECT, INDIRECT, OR - * CONSEQUENTIAL DAMAGES OR ANY DAMAGES WHATSOEVER RESULTING FROM LOSS - * OF USE, DATA OR PROFITS, WHETHER IN AN ACTION OF CONTRACT, - * NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF OR IN - * CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. - */ - -#include "kernel.h" - -/* granularity (23), long mode (21), present (15), - * always 1 (12, 11), readable (9), limit (16-19) - */ -#define GDT_DESC_CODE_VAL (0x00af9a000000ffff) -/* granularity (23), big data seg (22), present (15), - * type data rw (9), limit (16-19) - */ -#define GDT_DESC_DATA_VAL (0x00cf92000000ffff) - -struct __attribute__((__packed__)) gdtptr { - uint16_t limit; - uint64_t base; -}; - -uint64_t cpu_gdt64[GDT_NUM_ENTRIES] ALIGN_64_BIT; - -/* - * Ukvm starts up with a bootstrap GDT which is "invisible" to the guest, init - * and switch to our own GDT. 
- */ -void gdt_init(void) -{ - struct gdtptr gdtptr; - - /* initialize GDT "pointer" */ - gdtptr.limit = sizeof(cpu_gdt64) - 1; - gdtptr.base = (uint64_t)&cpu_gdt64; - - /* clear structures */ - memset(cpu_gdt64, 0, sizeof(cpu_gdt64)); - - cpu_gdt64[GDT_DESC_CODE] = GDT_DESC_CODE_VAL; - cpu_gdt64[GDT_DESC_DATA] = GDT_DESC_DATA_VAL; - - cpu_gdt_load((uint64_t)&gdtptr); -} diff --git a/kernel/ukvm/io.c b/kernel/ukvm/io.c index 90f6c12d5..2c9c81977 100644 --- a/kernel/ukvm/io.c +++ b/kernel/ukvm/io.c @@ -30,7 +30,7 @@ int solo5_net_write_sync(uint8_t *data, int n) wr.ret = 0; outl(UKVM_PORT_NETWRITE, ukvm_ptr(&wr)); - cc_barrier(); + cpu_cc_barrier(); return wr.ret; } @@ -44,7 +44,7 @@ int solo5_net_read_sync(uint8_t *data, int *n) rd.ret = 0; outl(UKVM_PORT_NETREAD, ukvm_ptr(&rd)); - cc_barrier(); + cpu_cc_barrier(); *n = rd.len; return rd.ret; @@ -56,7 +56,7 @@ char *solo5_net_mac_str(void) volatile struct ukvm_netinfo info; outl(UKVM_PORT_NETINFO, ukvm_ptr(&info)); - cc_barrier(); + cpu_cc_barrier(); memcpy(mac_str, (void *)&info, 18); return mac_str; @@ -73,7 +73,7 @@ int solo5_blk_write_sync(uint64_t sec, uint8_t *data, int n) wr.ret = 0; outl(UKVM_PORT_BLKWRITE, ukvm_ptr(&wr)); - cc_barrier(); + cpu_cc_barrier(); return wr.ret; } @@ -88,7 +88,7 @@ int solo5_blk_read_sync(uint64_t sec, uint8_t *data, int *n) rd.ret = 0; outl(UKVM_PORT_BLKREAD, ukvm_ptr(&rd)); - cc_barrier(); + cpu_cc_barrier(); *n = rd.len; return rd.ret; @@ -99,7 +99,7 @@ int solo5_blk_sector_size(void) volatile struct ukvm_blkinfo info; outl(UKVM_PORT_BLKINFO, ukvm_ptr(&info)); - cc_barrier(); + cpu_cc_barrier(); return info.sector_size; } @@ -109,7 +109,7 @@ uint64_t solo5_blk_sectors(void) volatile struct ukvm_blkinfo info; outl(UKVM_PORT_BLKINFO, ukvm_ptr(&info)); - cc_barrier(); + cpu_cc_barrier(); return info.num_sectors; } @@ -119,7 +119,7 @@ int solo5_blk_rw(void) volatile struct ukvm_blkinfo info; outl(UKVM_PORT_BLKINFO, ukvm_ptr(&info)); - cc_barrier(); + cpu_cc_barrier(); return 
info.rw; } diff --git a/kernel/ukvm/kernel.c b/kernel/ukvm/kernel.c index 89862644e..34f4fe27a 100644 --- a/kernel/ukvm/kernel.c +++ b/kernel/ukvm/kernel.c @@ -29,14 +29,23 @@ void _start(struct ukvm_boot_info *bi) printf("\\__ \\ ( | | ( | ) |\n"); printf("____/\\___/ _|\\___/____/\n"); - gdt_init(); - mem_init(bi->mem_size, bi->kernel_end); - intr_init(); + /* It appears that on macosx Hypervisor.framework, the mxcsr is + * not started with its default value of 0x1f80. This results in + * floating point related exceptions (e.g., denormal numbers) + * being raised to the guest which would normally be masked and + * dealt with by hardware (e.g., by rounding). It is unclear + * whether this is a problem with Hypervisor.framework or the way + * that `uhvf` sets up the VMX context. + * + * A workaround is for the guest to explicitly set the MXCSR to + * the default value (0x1f80). + */ + unsigned default_mxcsr = 0x1f80; + __asm__ __volatile__("ldmxcsr %0\n" : : "m"(default_mxcsr)); + mem_init(bi->mem_size, bi->kernel_end); time_init(); - intr_enable(); - ret = solo5_app_main((char *)bi->cmdline); printf("Solo5: solo5_app_main() returned with %d\n", ret); diff --git a/kernel/ukvm/kernel.h b/kernel/ukvm/kernel.h index a7c44acd4..7a0783f0f 100644 --- a/kernel/ukvm/kernel.h +++ b/kernel/ukvm/kernel.h @@ -24,12 +24,10 @@ #include "../kernel.h" #include "ukvm.h" -/* gdt.c: initialize segment descriptors */ -/* TODO: remove these, as they should be already done in ukvm */ -void gdt_init(void); -void gdt_load(uint64_t gdtptr); -void tss_load(uint16_t tss); - void mem_init(uint64_t size, uint64_t _kernel_end); +int ukvmclock_init(void); +uint64_t ukvmclock_monotonic(void); +uint64_t ukvmclock_epochoffset(void); + #endif diff --git a/kernel/ukvm/mem.c b/kernel/ukvm/mem.c index ba6f7f29b..d4df13a60 100644 --- a/kernel/ukvm/mem.c +++ b/kernel/ukvm/mem.c @@ -31,17 +31,17 @@ uint64_t mem_max_addr(void) void mem_init(uint64_t size, uint64_t kernel_end) { - extern char _stext[], 
_etext[], _erodata[], _end[]; + /* extern char _stext[], _etext[], _erodata[], _end[]; */ max_addr = size; heap_start = (kernel_end + PAGE_SIZE - 1) & PAGE_MASK; heap_top = heap_start; printf("Solo5: Memory map: %lu MB addressable:\n", max_addr >> 20); - printf("Solo5: unused @ (0x0 - 0x%lx)\n", &_stext[-1]); - printf("Solo5: text @ (0x%lx - 0x%lx)\n", &_stext, &_etext[-1]); - printf("Solo5: rodata @ (0x%lx - 0x%lx)\n", &_etext, &_erodata[-1]); - printf("Solo5: data @ (0x%lx - 0x%lx)\n", &_erodata, &_end[-1]); + /* printf("Solo5: unused @ (0x0 - 0x%lx)\n", &_stext[-1]); */ + /* printf("Solo5: text @ (0x%lx - 0x%lx)\n", &_stext, &_etext[-1]); */ + /* printf("Solo5: rodata @ (0x%lx - 0x%lx)\n", &_etext, &_erodata[-1]); */ + /* printf("Solo5: data @ (0x%lx - 0x%lx)\n", &_erodata, &_end[-1]); */ printf("Solo5: heap >= 0x%lx < stack < 0x%lx\n", heap_start, max_addr); } @@ -55,7 +55,7 @@ void mem_init(uint64_t size, uint64_t kernel_end) /* * Called by dlmalloc to allocate or free memory. */ -void *sbrk(intptr_t increment) +void *sbrk(ptrdiff_t increment) { uint64_t prev, brk; uint64_t heap_max = (uint64_t)&prev - STACK_GUARD_SIZE; diff --git a/kernel/ukvm/platform.c b/kernel/ukvm/platform.c index 6017b07d0..0500b8061 100644 --- a/kernel/ukvm/platform.c +++ b/kernel/ukvm/platform.c @@ -28,7 +28,7 @@ void platform_exit(void) cpu_halt(); } -int platform_puts(const char *buf, int n) +static inline int platform_puts(const char *buf, size_t n) { struct ukvm_puts str; @@ -40,5 +40,6 @@ int platform_puts(const char *buf, int n) return str.len; } -int solo5_console_write(const char *, size_t) - __attribute__ ((alias("platform_puts"))); +int solo5_console_write(const char *buf, size_t n) { + return platform_puts(buf, n); +} diff --git a/kernel/ukvm/time.c b/kernel/ukvm/time.c index 7042031be..46b76743d 100644 --- a/kernel/ukvm/time.c +++ b/kernel/ukvm/time.c @@ -23,18 +23,18 @@ void time_init(void) { - assert(pvclock_init() == 0); + assert(ukvmclock_init() == 0); } uint64_t 
solo5_clock_monotonic(void) { - return pvclock_monotonic(); + return ukvmclock_monotonic(); } /* return wall time in nsecs */ uint64_t solo5_clock_wall(void) { - return pvclock_monotonic() + pvclock_epochoffset(); + return ukvmclock_monotonic() + ukvmclock_epochoffset(); } int solo5_poll(uint64_t until_nsecs) @@ -42,12 +42,13 @@ int solo5_poll(uint64_t until_nsecs) struct ukvm_poll t; uint64_t now; + now = solo5_clock_monotonic(); if (until_nsecs <= now) t.timeout_nsecs = 0; else t.timeout_nsecs = until_nsecs - now; outl(UKVM_PORT_POLL, ukvm_ptr(&t)); - cc_barrier(); + cpu_cc_barrier(); return t.ret; } diff --git a/kernel/ukvm/ukvm.h b/kernel/ukvm/ukvm.h index 865a4029a..d6603e586 120000 --- a/kernel/ukvm/ukvm.h +++ b/kernel/ukvm/ukvm.h @@ -1 +1 @@ -../../ukvm/ukvm.h \ No newline at end of file +../../monitors/ukvm.h \ No newline at end of file diff --git a/kernel/ukvm/ukvmclock.c b/kernel/ukvm/ukvmclock.c new file mode 100644 index 000000000..38177c61f --- /dev/null +++ b/kernel/ukvm/ukvmclock.c @@ -0,0 +1,109 @@ +/*- + * Copyright (c) 2014, 2015 Antti Kantee. All Rights Reserved. + * Copyright (c) 2015 Martin Lucina. All Rights Reserved. + * Modified for solo5 by Ricardo Koller + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS + * OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED + * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE + * DISCLAIMED. 
IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR + * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + */ + +#include "kernel.h" + +/* RTC wall time offset at monotonic time base. */ +static uint64_t rtc_epochoffset; + +/* + * TSC clock specific. + */ + +/* Base time values at the last call to ukvmclock_monotonic(). */ +static uint64_t time_base; +static uint64_t tsc_base; + +/* Multiplier for converting TSC ticks to nsecs. (0.32) fixed point. */ +static uint32_t tsc_mult; + +/* + * Return monotonic time using TSC clock. + */ +uint64_t ukvmclock_monotonic(void) { + uint64_t tsc_now, tsc_delta; + + /* + * Update time_base (monotonic time) and tsc_base (TSC time). + */ + tsc_now = cpu_rdtsc(); + tsc_delta = tsc_now - tsc_base; + time_base += cpu_mul64_32(tsc_delta, tsc_mult); + tsc_base = tsc_now; + + return time_base; +} + +/* + * Initialise the TSC clock using the frequency supplied by the monitor. + */ +int ukvmclock_init(void) { + uint64_t tsc_freq, rtc_boot; + struct ukvm_time_init t; + + outl(UKVM_PORT_TIME_INIT, ukvm_ptr(&t)); + cpu_cc_barrier(); + + /* + * Read RTC "time at boot". This must be done just before tsc_base is + * initialised in order to get a correct offset below. + */ + rtc_boot = t.rtc_boot; + tsc_freq = t.freq; + tsc_base = cpu_rdtsc(); + printf("Solo5: Clock source: ukvm, frequency estimate is %lu Hz\n", + (unsigned long long)tsc_freq); + + /* + * Calculate TSC scaling multiplier.
+ * + * (0.32) tsc_mult = NSEC_PER_SEC (32.32) / tsc_freq (32.0) + */ + tsc_mult = (NSEC_PER_SEC << 32) / tsc_freq; + + /* + * Monotonic time begins at tsc_base (first read of TSC before + * calibration). + */ + time_base = cpu_mul64_32(tsc_base, tsc_mult); + + /* + * Compute RTC epoch offset by subtracting monotonic time_base from RTC + * time at boot. + */ + rtc_epochoffset = rtc_boot - time_base; + + return 0; +} + +/* + * Return epoch offset (wall time offset to monotonic clock start). + */ +uint64_t ukvmclock_epochoffset(void) { + return rtc_epochoffset; +} + diff --git a/kernel/cpu.S b/kernel/virtio/cpu.S similarity index 90% rename from kernel/cpu.S rename to kernel/virtio/cpu.S index 5b32e8577..9c46dd11e 100644 --- a/kernel/cpu.S +++ b/kernel/virtio/cpu.S @@ -24,19 +24,6 @@ .text .code64 -ENTRY(cpu_halt) - cli - hlt - jmp cpu_halt -END(cpu_halt) - -ENTRY(cpu_rdtsc) - rdtsc - shl $32, %rdx - add %rdx, %rax - ret -END(cpu_rdtsc) - ENTRY(cpu_idt_load) lidt 0(%rdi) ret diff --git a/kernel/cpu_vectors.S b/kernel/virtio/cpu_vectors.S similarity index 97% rename from kernel/cpu_vectors.S rename to kernel/virtio/cpu_vectors.S index f24856263..e2470a7e8 100644 --- a/kernel/cpu_vectors.S +++ b/kernel/virtio/cpu_vectors.S @@ -44,7 +44,7 @@ .macro TRAP_ENTRY trapno, has_ec .global cpu_trap_\trapno -.type cpu_trap_\trapno, @function + ## .type cpu_trap_\trapno, @function cpu_trap_\trapno: cld @@ -70,7 +70,7 @@ cpu_trap_\trapno: .macro IRQ_ENTRY irqno .global cpu_irq_\irqno -.type cpu_irq_\irqno, @function + ## .type cpu_irq_\irqno, @function cpu_irq_\irqno: cld diff --git a/kernel/intr.c b/kernel/virtio/intr.c similarity index 100% rename from kernel/intr.c rename to kernel/virtio/intr.c diff --git a/kernel/virtio/kernel.h b/kernel/virtio/kernel.h index f1e580b74..c97214789 100644 --- a/kernel/virtio/kernel.h +++ b/kernel/virtio/kernel.h @@ -24,7 +24,28 @@ #include "../kernel.h" #include "multiboot.h" +/* alignment macros */ +#define ALIGN_4K 
__attribute__((aligned(0x1000))) +#define ALIGN_64_BIT __attribute__((aligned(0x8))) + +/* We have already set up the GDT for the kernel. Here are the + * descriptor numbers (useful for when the kernel sets up the IDT) + */ +#define GDT_NUM_ENTRIES 6 +#define GDT_DESC_NULL 0 +#define GDT_DESC_CODE 1 +/* 2 == unused / 32-bit bootstrap */ +#define GDT_DESC_DATA 3 +#define GDT_DESC_TSS_LO 4 +#define GDT_DESC_TSS_HI 5 +#define GDT_DESC_TSS GDT_DESC_TSS_LO +#define GDT_DESC_OFFSET(n) ((n) * 0x8) + +/* We have already loaded a "known good" stack in the TSS */ +#define TSS_IST_INDEX 0x1 + void mem_init(struct multiboot_info *mb); +uint64_t mem_max_addr(void); /* serial.c: console output for debugging */ void serial_init(void); @@ -57,4 +78,76 @@ void virtio_net_pkt_put(void); /* we're done with recv'd data */ int virtio_net_xmit_packet(void *data, int len); int virtio_net_pkt_poll(void); /* test if packet(s) are available */ +/* cpu.S: low level cpu functions */ +void cpu_tss_load(uint16_t); +void cpu_idt_load(uint64_t); +void cpu_gdt_load(uint64_t); +void cpu_sse_enable(void); + +/* intr.c: interrupt handling */ +void intr_init(void); +void intr_enable(void); +void intr_disable(void); +void intr_register_irq(unsigned irq, int (*handler)(void *), void *arg); +extern int intr_depth; + +/* platform_intr.c: platform-specific interrupt handling */ +void platform_intr_init(void); +void platform_intr_clear_irq(unsigned irq); +void platform_intr_mask_irq(unsigned irq); +void platform_intr_ack_irq(unsigned irq); + +/* accessing devices via port space */ +static inline void outb(uint16_t port, uint8_t v) +{ + __asm__ __volatile__("outb %0,%1" : : "a" (v), "dN" (port)); +} +static inline void outw(uint16_t port, uint16_t v) +{ + __asm__ __volatile__("outw %0,%1" : : "a" (v), "dN" (port)); +} +static inline void outl(uint16_t port, uint32_t v) +{ + __asm__ __volatile__("outl %0,%1" : : "a" (v), "dN" (port)); +} +static inline uint8_t inb(uint16_t port) +{ + uint8_t v; + + __asm__ 
__volatile__("inb %1,%0" : "=a" (v) : "dN" (port)); + return v; +} +static inline uint16_t inw(uint16_t port) +{ + uint16_t v; + + __asm__ __volatile__("inw %1,%0" : "=a" (v) : "dN" (port)); + return v; +} +static inline uint32_t inl(uint16_t port) +{ + uint32_t v; + + __asm__ __volatile__("inl %1,%0" : "=a" (v) : "dN" (port)); + return v; +} + +static inline uint64_t inq(uint16_t port_lo) +{ + uint16_t port_hi = port_lo + 4; + uint32_t lo, hi; + + __asm__ __volatile__("inl %1,%0" : "=a" (lo) : "dN" (port_lo)); + __asm__ __volatile__("inl %1,%0" : "=a" (hi) : "dN" (port_hi)); + + return ((uint64_t)lo) | ((uint64_t)hi << 32); +} + +#define atomic_printf(x...) do { \ + intr_disable(); \ + printf(x); \ + intr_enable(); \ + } while (0) + + #endif diff --git a/kernel/queue.h b/kernel/virtio/queue.h similarity index 100% rename from kernel/queue.h rename to kernel/virtio/queue.h diff --git a/kernel/virtio/tscclock.c b/kernel/virtio/tscclock.c index 577ab2b1a..b52326675 100644 --- a/kernel/virtio/tscclock.c +++ b/kernel/virtio/tscclock.c @@ -179,7 +179,7 @@ uint64_t tscclock_monotonic(void) { */ tsc_now = cpu_rdtsc(); tsc_delta = tsc_now - tsc_base; - time_base += mul64_32(tsc_delta, tsc_mult); + time_base += cpu_mul64_32(tsc_delta, tsc_mult); tsc_base = tsc_now; return time_base; @@ -223,7 +223,7 @@ int tscclock_init(void) { * Monotonic time begins at tsc_base (first read of TSC before * calibration). */ - time_base = mul64_32(tsc_base, tsc_mult); + time_base = cpu_mul64_32(tsc_base, tsc_mult); /* * Compute RTC epoch offset by subtracting monotonic time_base from RTC @@ -279,7 +279,7 @@ void cpu_block(uint64_t until) { * the timeout. 
*/ delta_ns = until - now; - delta_ticks = mul64_32(delta_ns, pit_mult); + delta_ticks = cpu_mul64_32(delta_ns, pit_mult); if (delta_ticks < PIT_MIN_DELTA) { /* * Since we are "spinning", quickly enable interrupts in diff --git a/ukvm/ukvm-blk.c b/monitors/blk.c similarity index 89% rename from ukvm/ukvm-blk.c rename to monitors/blk.c index a4721d394..503f9ac3d 100644 --- a/ukvm/ukvm-blk.c +++ b/monitors/blk.c @@ -21,7 +21,6 @@ #define _GNU_SOURCE #include #include -#include #include #include #include @@ -98,30 +97,29 @@ static void ukvm_port_blkread(uint8_t *mem, uint64_t paddr) rd->ret = 0; } -static int handle_exit(struct kvm_run *run, int vcpufd, uint8_t *mem) +static int handle_exit(struct platform *p) { - if ((run->exit_reason != KVM_EXIT_IO) || - (run->io.direction != KVM_EXIT_IO_OUT) || - (run->io.size != 4)) + if (platform_get_exit_reason(p) != EXIT_IO) return -1; - uint64_t paddr = - GUEST_PIO32_TO_PADDR((uint8_t *)run + run->io.data_offset); + int port = platform_get_io_port(p); + uint64_t data = platform_get_io_data(p); - switch (run->io.port) { + switch (port) { case UKVM_PORT_BLKINFO: - ukvm_port_blkinfo(mem, paddr); + ukvm_port_blkinfo(p->mem, data); break; case UKVM_PORT_BLKWRITE: - ukvm_port_blkwrite(mem, paddr); + ukvm_port_blkwrite(p->mem, data); break; case UKVM_PORT_BLKREAD: - ukvm_port_blkread(mem, paddr); + ukvm_port_blkread(p->mem, data); break; default: return -1; } + platform_advance_rip(p); return 0; } @@ -134,7 +132,7 @@ static int handle_cmdarg(char *cmdarg) return 0; } -static int setup(int vcpufd, uint8_t *mem) +static int setup(struct platform *p) { if (diskfile == NULL) return -1; @@ -169,4 +167,3 @@ struct ukvm_module ukvm_blk = { .usage = usage, .name = "blk" }; - diff --git a/monitors/core.c b/monitors/core.c new file mode 100644 index 000000000..7c50cb594 --- /dev/null +++ b/monitors/core.c @@ -0,0 +1,515 @@ +/* Copyright (c) 2015, IBM + * Author(s): Dan Williams + * Ricardo Koller + * + * Permission to use, copy, modify, and/or 
distribute this software + * for any purpose with or without fee is hereby granted, provided + * that the above copyright notice and this permission notice appear + * in all copies. + * + * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL + * WARRANTIES WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED + * WARRANTIES OF MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE + * AUTHOR BE LIABLE FOR ANY SPECIAL, DIRECT, INDIRECT, OR + * CONSEQUENTIAL DAMAGES OR ANY DAMAGES WHATSOEVER RESULTING FROM LOSS + * OF USE, DATA OR PROFITS, WHETHER IN AN ACTION OF CONTRACT, + * NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF OR IN + * CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. + */ +#define _GNU_SOURCE +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +/* from ukvm */ +#include "ukvm-private.h" +#include "ukvm-modules.h" +#include "ukvm-cpu.h" +#include "ukvm.h" +#include "unikernel-monitor.h" + +struct ukvm_module *modules[] = { +#ifdef UKVM_MODULE_BLK + &ukvm_blk, +#endif +#ifdef UKVM_MODULE_NET + &ukvm_net, +#endif +#ifdef UKVM_MODULE_GDB + &ukvm_gdb, +#endif + NULL, +}; +#define NUM_MODULES ((sizeof(modules) / sizeof(struct ukvm_module *)) - 1) + +/* + * Memory map: + * + * 0x100000 loaded elf file (linker script dictates location) + * ######## unused + * 0x013000 + * 0x012000 bootstrap pde + * 0x011000 bootstrap pdpte + * 0x010000 bootstrap pml4 + * ######## command line arguments + * 0x002000 ukvm_boot_info + * 0x001000 bootstrap gdt (contains correct code/data/ but tss points to 0) + */ + +#define BOOT_GDT 0x1000 +#define BOOT_INFO 0x2000 +#define BOOT_PML4 0x10000 +#define BOOT_PDPTE 0x11000 +#define BOOT_PDE 0x12000 + +#define BOOT_GDT_NULL 0 +#define BOOT_GDT_CODE 1 +#define BOOT_GDT_CODE32 2 +#define BOOT_GDT_DATA 3 +#define BOOT_GDT_TSS1 4 +#define BOOT_GDT_TSS2 5 +#define BOOT_GDT_MAX 6 + +static uint64_t sleep_time_s; /* track unikernel 
sleeping time */ +static uint64_t sleep_time_ns; +static uint64_t tsc_freq; + +static void setup_boot_info(uint8_t *mem, + uint64_t size, + uint64_t kernel_end, + int argc, char **argv) +{ + struct ukvm_boot_info *bi = (struct ukvm_boot_info *)(mem + BOOT_INFO); + uint64_t cmdline = BOOT_INFO + sizeof(struct ukvm_boot_info); + size_t cmdline_free = BOOT_PML4 - cmdline - 1; + char *cmdline_p = (char *)(mem + cmdline); + + bi->mem_size = size; + bi->kernel_end = kernel_end; + bi->cmdline = cmdline; + cmdline_p[0] = 0; + + for (; *argv; argc--, argv++) { + size_t alen = snprintf(cmdline_p, cmdline_free, "%s%s", *argv, + (argc > 1) ? " " : ""); + if (alen >= cmdline_free) { + warnx("command line too long, truncated"); + break; + } + cmdline_free -= alen; + cmdline_p += alen; + } + +} + +static void setup_system_64bit(struct platform *p) +{ + uint64_t cr0 = (X86_CR0_NE | X86_CR0_PE | X86_CR0_PG) + & ~(X86_CR0_NW | X86_CR0_CD); + uint64_t cr4 = X86_CR4_PAE | X86_CR4_VMXE; + uint64_t efer = X86_EFER_LME | X86_EFER_LMA; + + /* enable sse */ + cr0 = (cr0 | X86_CR0_MP) & ~(X86_CR0_EM); + cr4 = cr4 | X86_CR4_FXSR | X86_CR4_XMM; /* OSFXSR and OSXMMEXCPT */ + + platform_setup_system_64bit(p, cr0, cr4, efer); +} + + +static void setup_system_page_tables(struct platform *p) +{ + uint64_t *pml4 = (uint64_t *) (p->mem + BOOT_PML4); + uint64_t *pdpte = (uint64_t *) (p->mem + BOOT_PDPTE); + uint64_t *pde = (uint64_t *) (p->mem + BOOT_PDE); + uint64_t paddr; + + /* + * For simplicity we currently use 2MB pages and only a single + * PML4/PDPTE/PDE. Sanity check that the guest size is a multiple of the + * page size and will fit in a single PDE (512 entries). 
+ */ + assert((GUEST_SIZE & (GUEST_PAGE_SIZE - 1)) == 0); + assert(GUEST_SIZE <= (GUEST_PAGE_SIZE * 512)); + + memset(pml4, 0, 4096); + memset(pdpte, 0, 4096); + memset(pde, 0, 4096); + + *pml4 = BOOT_PDPTE | (X86_PDPT_P | X86_PDPT_RW); + *pdpte = BOOT_PDE | (X86_PDPT_P | X86_PDPT_RW); + for (paddr = 0; paddr < GUEST_SIZE; paddr += GUEST_PAGE_SIZE, pde++) + *pde = paddr | (X86_PDPT_P | X86_PDPT_RW | X86_PDPT_PS); + + platform_setup_system_page_tables(p, BOOT_PML4); +} + +static void setup_system_gdt(struct platform *p, uint64_t off) +{ + uint64_t *gdt_entry; + + gdt_entry = ((uint64_t *) (p->mem + off)); + gdt_entry[0] = 0x0000000000000000; + gdt_entry[1] = 0x00af9b000000ffff; /* 64bit CS */ + gdt_entry[2] = 0x00cf9b000000ffff; /* 32bit CS */ + gdt_entry[3] = 0x00cf93000000ffff; /* DS */ + gdt_entry[4] = 0x0000000000000000; /* TSS part 1 (via C) */ + gdt_entry[5] = 0x0000000000000000; /* TSS part 2 (via C) */ + + platform_setup_system_gdt(p, BOOT_GDT_CODE, BOOT_GDT_DATA, + off, (sizeof(uint64_t) * BOOT_GDT_MAX) - 1); +} + +static void setup_system(struct platform *p, uint64_t entry) +{ + setup_system_gdt(p, BOOT_GDT); + setup_system_page_tables(p); + setup_system_64bit(p); + + platform_setup_system(p, entry, BOOT_INFO); +} + +void ukvm_port_puts(uint8_t *mem, uint64_t paddr) +{ + GUEST_CHECK_PADDR(paddr, GUEST_SIZE, sizeof (struct ukvm_puts)); + struct ukvm_puts *p = (struct ukvm_puts *)(mem + paddr); + + GUEST_CHECK_PADDR(p->data, GUEST_SIZE, p->len); + assert(write(1, mem + p->data, p->len) != -1); +} + +static void ukvm_port_time_init(uint8_t *mem, uint64_t paddr) +{ + GUEST_CHECK_PADDR(paddr, GUEST_SIZE, sizeof (struct ukvm_time_init)); + struct ukvm_time_init *p = (struct ukvm_time_init *) (mem + paddr); + struct timeval tv; + int ret; + + p->freq = tsc_freq; + ret = gettimeofday(&tv, NULL); + assert(ret == 0); + /* get ns since epoch */ + p->rtc_boot = (((uint64_t)tv.tv_sec * 1000000) + + (uint64_t)tv.tv_usec) * 1000; +} + +static void ukvm_port_poll(uint8_t 
*mem, uint64_t paddr) +{ + GUEST_CHECK_PADDR(paddr, GUEST_SIZE, sizeof (struct ukvm_poll)); + struct ukvm_poll *t = (struct ukvm_poll *)(mem + paddr); + uint64_t ts_s1, ts_ns1, ts_s2, ts_ns2; + + struct timespec ts; + int rc, i, max_fd = 0; + fd_set readfds; + + platform_get_timestamp(&ts_s1, &ts_ns1); + + FD_ZERO(&readfds); + for (i = 0; i < NUM_MODULES; i++) { + int fd = modules[i]->get_fd(); + + if (fd) { + FD_SET(fd, &readfds); + if (fd > max_fd) + max_fd = fd; + } + } + + ts.tv_sec = t->timeout_nsecs / 1000000000ULL; + ts.tv_nsec = t->timeout_nsecs % 1000000000ULL; + + /* + * Guest execution is blocked during the pselect() call, note that + * interrupts will not be injected. + */ + do { + rc = pselect(max_fd + 1, &readfds, NULL, NULL, &ts, NULL); + } while (rc == -1 && errno == EINTR); + assert(rc >= 0); + + platform_get_timestamp(&ts_s2, &ts_ns2); + sleep_time_s += ts_s2 - ts_s1; + sleep_time_ns += ts_ns2 - ts_ns1; + + t->ret = rc; +} + +static void tsc_init(void) +{ + platform_init_time(&tsc_freq); + printf("tsc_freq=0x%" PRIx64 "(%" PRIu64 ")\n", + tsc_freq, tsc_freq); +} + +static int dbg_rdtsc_cnt; + +static int vcpu_loop(struct platform *p) +{ + tsc_init(); + + /* Repeatedly run code and handle VM exits. */ + while (1) { + int i, handled = 0; + + if (platform_run(p)) + err(1, "Couldn't run vcpu"); + + for (i = 0; i < NUM_MODULES; i++) { + if (!modules[i]->handle_exit(p)) { + handled = 1; + break; + } + } + + if (handled) + continue; + + switch (platform_get_exit_reason(p)) { + case EXIT_HLT: + /* Guest has halted the CPU, this is considered as a normal exit. 
*/ + printf("RDTSC count is %d\n", dbg_rdtsc_cnt); + return 0; + + case EXIT_IO: { + int port = platform_get_io_port(p); + uint64_t paddr = platform_get_io_data(p); + + switch (port) { + case UKVM_PORT_PUTS: + ukvm_port_puts(p->mem, paddr); + break; + case UKVM_PORT_POLL: + ukvm_port_poll(p->mem, paddr); + break; + case UKVM_PORT_TIME_INIT: + ukvm_port_time_init(p->mem, paddr); + break; + default: + errx(1, "Invalid guest port access: port=0x%x", port); + }; + + platform_advance_rip(p); + + break; + } + + case EXIT_RDTSC: { + uint64_t exec_time; + uint64_t sleep_time; + uint64_t new_tsc; + double tsc_f; + int dbg_sanity_check_rdtsc = 0; + + dbg_rdtsc_cnt++; + exec_time = platform_get_exec_time(p); + + if (dbg_sanity_check_rdtsc) { + static uint64_t last_exec_time; + assert(exec_time > last_exec_time); + last_exec_time = exec_time; + } + + sleep_time = ((sleep_time_s * 1000000000ULL) + sleep_time_ns); + + if (dbg_sanity_check_rdtsc) { + static uint64_t last_sleep_time; + assert(sleep_time >= last_sleep_time); + last_sleep_time = sleep_time; + } + + tsc_f = (((double)exec_time + (double)sleep_time) + * (double)tsc_freq) / 1000000000ULL; + + new_tsc = (uint64_t)tsc_f; + + if (dbg_sanity_check_rdtsc) { + static uint64_t last_tsc; + assert(new_tsc > last_tsc); + last_tsc = new_tsc; + } + + platform_emul_rdtsc(p, new_tsc); + platform_advance_rip(p); + break; + } + + case EXIT_CPUID: { + uint32_t eax, ebx, ecx, edx; + uint64_t code = platform_get_reg(p, RAX); + + eax = ebx = ecx = edx = 0; + switch (code) { + case 0: /* genuine intel */ + case 1: /* family/model, etc. 
*/ + break; + default: + // XXX make sure all of these are OK + //printf("unsupported cpuid %llx\n", code); + //return -1; + break; + } + + printf("cpuid with code 0x%llx\n", code); + __asm__ volatile("cpuid" + :"=a"(eax),"=b"(ebx),"=c"(ecx),"=d"(edx) + :"a"((uint32_t)code)); + + printf("cpuid results are 0x%x 0x%x 0x%x 0x%x\n", eax, ebx, ecx, edx); + platform_set_reg(p, RAX, (uint64_t)eax & 0xffffffff); + platform_set_reg(p, RBX, (uint64_t)ebx & 0xffffffff); + platform_set_reg(p, RCX, (uint64_t)ecx & 0xffffffff); + platform_set_reg(p, RDX, (uint64_t)edx & 0xffffffff); + + platform_advance_rip(p); + break; + } + + case EXIT_RDRAND: { + uint64_t r; + __asm__ volatile ("rdrand %0;":"=r"(r)); + platform_emul_rdrand(p, r); + platform_advance_rip(p); + break; + } + + case EXIT_IGNORE: + break; + + case EXIT_FAIL: + return -1; + + default: + errx(1, "Unhandled exit"); + } + } +} + +int setup_modules(struct platform *p) +{ + int i; + + for (i = 0; i < NUM_MODULES; i++) { + if (modules[i]->setup(p)) { + warnx("Module `%s' setup failed", modules[i]->name); + warnx("Please check you have correctly specified:\n %s", + modules[i]->usage()); + return -1; + } + } + return 0; +} + +void sig_handler(int signo) +{ + printf("RDTSC count is %d\n", dbg_rdtsc_cnt); + errx(1, "Exiting on signal %d", signo); +} + +static void usage(const char *prog) +{ + int m; + + fprintf(stderr, "usage: %s [ CORE OPTIONS ] [ MODULE OPTIONS ] [ -- ] " + "KERNEL [ ARGS ]\n", prog); + fprintf(stderr, "KERNEL is the filename of the unikernel to run.\n"); + fprintf(stderr, "ARGS are optional arguments passed to the unikernel.\n"); + fprintf(stderr, "Core options:\n"); + fprintf(stderr, " --help (display this help)\n"); + fprintf(stderr, "Compiled-in module options:\n"); + for (m = 0; m < NUM_MODULES; m++) + fprintf(stderr, " %s\n", modules[m]->usage()); + if (!m) + fprintf(stderr, " (none)\n"); + exit(1); +} + +int main(int argc, char **argv) +{ + struct platform *p; + uint64_t entrypoint; + uint64_t 
kernel_end; + const char *prog; + const char *file; + struct sigaction sa; + int matched; + int rc; + + prog = basename(*argv); + argc--; + argv++; + + while (*argv && *argv[0] == '-') { + int j; + + if (strcmp("--help", *argv) == 0) + usage(prog); + + if (strcmp("--", *argv) == 0) { + /* Consume and stop arg processing */ + argc--; + argv++; + break; + } + + matched = 0; + for (j = 0; j < NUM_MODULES; j++) { + if (modules[j]->handle_cmdarg(*argv) == 0) { + /* Handled by module, consume and go on to next arg */ + matched = 1; + argc--; + argv++; + break; + } + } + if (!matched) { + warnx("Invalid option: `%s'", *argv); + usage(prog); + } + } + + /* At least one non-option argument required */ + if (*argv == NULL) { + warnx("Missing KERNEL operand"); + usage(prog); + } + file = *argv; + argc--; + argv++; + + memset(&sa, 0, sizeof (struct sigaction)); + sa.sa_handler = sig_handler; + sigfillset(&sa.sa_mask); + if (sigaction(SIGINT, &sa, NULL) == -1) + err(1, "Could not install signal handler"); + if (sigaction(SIGTERM, &sa, NULL) == -1) + err(1, "Could not install signal handler"); + + if (platform_init(&p)) + err(1, "platform init"); + + platform_load_code(p, file, &entrypoint, &kernel_end); + + /* Setup x86 registers and memory */ + setup_system(p, entrypoint); + /* Setup ukvm_boot_info and command line */ + setup_boot_info(p->mem, GUEST_SIZE, kernel_end, argc, argv); + + if (setup_modules(p)) + exit(1); + + rc = vcpu_loop(p); + platform_cleanup(p); + return rc; +} diff --git a/ukvm/ukvm-gdb.c b/monitors/gdb.c similarity index 88% rename from ukvm/ukvm-gdb.c rename to monitors/gdb.c index 3cbf20b41..6683b2710 100644 --- a/ukvm/ukvm-gdb.c +++ b/monitors/gdb.c @@ -123,11 +123,12 @@ #include #include #include -#include #include "ukvm-private.h" #include "ukvm-modules.h" +#include "ukvm-cpu.h" #include "ukvm.h" +#include "unikernel-monitor.h" static int use_gdb; @@ -248,22 +249,6 @@ int remote_debug; static const char hexchars[] = "0123456789abcdef"; -/* Number of 
registers. */ -#define NUMREGS 32 - -/* Number of bytes of registers. */ -#define NUMREGBYTES (NUMREGS * 8) - -/* list is here: gdb/amd64-linux-nat.c */ -enum regnames { - RAX, RBX, RCX, RDX, - RSI, RDI, RBP, RSP, - R8, R9, R10, R11, - R12, R13, R14, R15, - RIP, EFLAGS, CS, SS, - DS, ES, FS, GS -}; - /* * these should not be static cuz they can be used outside this module */ @@ -485,8 +470,7 @@ int gdb_remove_breakpoint(uint64_t addr) return 0; } - -void gdb_handle_exception(uint8_t *mem, int vcpufd, int sig) +void gdb_handle_exception(struct platform *p, int sig) { unsigned char *buffer; char obuf[4096]; @@ -525,7 +509,7 @@ void gdb_handle_exception(uint8_t *mem, int vcpufd, int sig) if ((addr + len) >= GUEST_SIZE) memset(obuf, '0', len); else - mem2hex((char *)mem + addr, obuf, len); + mem2hex((char *)p->mem + addr, obuf, len); putpacket(obuf); break; } @@ -534,39 +518,11 @@ void gdb_handle_exception(uint8_t *mem, int vcpufd, int sig) break; } case 'g': { - struct kvm_regs regs; int ret; - - ret = ioctl(vcpufd, KVM_GET_REGS, ®s); - if (ret == -1) - err(1, "KVM_GET_REGS"); - - registers[RAX] = regs.rax; - registers[RBX] = regs.rbx; - registers[RCX] = regs.rcx; - registers[RDX] = regs.rdx; - - registers[RSI] = regs.rsi; - registers[RDI] = regs.rdi; - registers[RBP] = regs.rbp; - registers[RSP] = regs.rsp; - - registers[R8] = regs.r8; - registers[R9] = regs.r9; - registers[R10] = regs.r10; - registers[R11] = regs.r11; - registers[R12] = regs.r12; - registers[R13] = regs.r13; - registers[R14] = regs.r14; - registers[R15] = regs.r15; - - registers[RIP] = regs.rip; - registers[EFLAGS] = regs.rflags; - - /* TODO what about others like cs and ss? 
*/ + ret = platform_get_regs(p, registers); + assert(ret == 0); mem2hex((char *) registers, obuf, NUMREGBYTES); - putpacket(obuf); break; } @@ -632,7 +588,7 @@ void gdb_handle_exception(uint8_t *mem, int vcpufd, int sig) return; } -static void gdb_stub_start(int vcpufd, uint8_t *mem) +static void gdb_stub_start(struct platform *p) { int i; @@ -640,39 +596,31 @@ static void gdb_stub_start(int vcpufd, uint8_t *mem) breakpoints[i] = 0; wait_for_connect(1234); - gdb_handle_exception(mem, vcpufd, 0); + gdb_handle_exception(p, 0); } -static int handle_exit(struct kvm_run *run, int vcpufd, uint8_t *mem) +static int handle_exit(struct platform *p) { - struct kvm_debug_exit_arch *arch_info; - - if (run->exit_reason != KVM_EXIT_DEBUG) + if (platform_get_exit_reason(p) != EXIT_DEBUG) return -1; - arch_info = &run->debug.arch; - if (gdb_is_pc_breakpointing(arch_info->pc)) - gdb_handle_exception(mem, vcpufd, 1); + if (gdb_is_pc_breakpointing(platform_get_rip(p))) + gdb_handle_exception(p, 1); return 0; } -static int setup(int vcpufd, uint8_t *mem) +static int setup(struct platform *p) { if (!use_gdb) return 0; - /* TODO check if we have the KVM_CAP_SET_GUEST_DEBUG capbility */ - struct kvm_guest_debug debug = { - .control = KVM_GUESTDBG_ENABLE | KVM_GUESTDBG_SINGLESTEP, - }; - - if (ioctl(vcpufd, KVM_SET_GUEST_DEBUG, &debug) < 0) - printf("KVM_SET_GUEST_DEBUG failed"); + if (platform_enable_debug(p)) + return -1; - gdb_stub_start(vcpufd, mem); + gdb_stub_start(p); return 0; } diff --git a/kernel/ukvm/platform_intr.c b/monitors/net.c similarity index 60% rename from kernel/ukvm/platform_intr.c rename to monitors/net.c index dd9376cad..fcdec04fe 100644 --- a/kernel/ukvm/platform_intr.c +++ b/monitors/net.c @@ -1,7 +1,6 @@ -/* - * Copyright (c) 2015-2017 Contributors as noted in the AUTHORS file - * - * This file is part of Solo5, a unikernel base layer. 
+/* Copyright (c) 2015, IBM + * Author(s): Dan Williams + * Ricardo Koller * * Permission to use, copy, modify, and/or distribute this software * for any purpose with or without fee is hereby granted, provided @@ -18,24 +17,3 @@ * CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. */ -#include "kernel.h" - -/* - * Interrupts are not used on the ukvm platform, hence these are stubs. - */ - -void platform_intr_init(void) -{ -} - -void platform_intr_ack_irq(unsigned irq __attribute__((unused))) -{ -} - -void platform_intr_mask_irq(unsigned irq __attribute__((unused))) -{ -} - -void platform_intr_clear_irq(unsigned irq __attribute__((unused))) -{ -} diff --git a/monitors/uhvf/README.md b/monitors/uhvf/README.md new file mode 100644 index 000000000..72443d2bf --- /dev/null +++ b/monitors/uhvf/README.md @@ -0,0 +1,68 @@ +This is the beginning of a port of ukvm to Hypervisor.framework on +MacOSX, so that Solo5/Mirage-based unikernels can run natively on that +system. The goal is for ukvm to end up with both a KVM-based bottom +half and Hypervisor.framework-based bottom half, whereas the top half +is shared. + +Solo5 doesn't build properly in OSX yet (although @hannesm has done a +bunch of work to make it build on FreeBSD, so I don't think it's far +off), so I build using Docker for Mac with a simple build container. I +also use containers to build Mirage unikernels. See +https://github.com/djwillia/dockerfiles. + +At the moment, uhvf can do the Solo5 hello test and ping_serve test +and also run the Mirage console, stackv4, and block test (from +mirage-skeleton). At this point, uhvf should have all the features of +ukvm. + +For networking, I'm using the `vmnet` framework. 
We can test ping by +running the test_ping_serve unikernel: + + sudo ./uhvf ../tests/test_ping_serve/test_ping_serve.ukvm + +Next, configure the host so that it can ping the unikernel (also in +`net-setup.bash`): + + BRIDGE=`ifconfig -l |grep -o bridge[0-9]* |tail -n 1` + IF=`ifconfig -l |grep -o en[0-9]* |tail -n 1` + sudo ifconfig $BRIDGE 10.0.0.1/24 -hostfilter $IF + +Then: + + ping 10.0.0.2 + +GDB also works, but it's a little bit weird because a gdb compiled and +configured for Mac doesn't necessarily understand the ELF format that +the .ukvm unikernel is in. For this reason, I use a gdb in a Docker +container as the gdb console. For example, this unikernel: + + sudo ./uhvf --disk=../tests/test_blk/disk.img --gdb ../tests/test_blk/test_blk.ukvm + +is debugged with (again see https://github.com/djwillia/dockerfiles): + + gdb-docker test_blk.ukvm + +with the gdb command: + + target remote $IP:1234 + +where IP is the *external* network interface's IP address. (localhost +doesn't work because of how the container networking is currently +done). + +Older notes: + +- KVM doesn't allow a trap on `rdtsc` but it should if we want to use + the same interface for ukvm and uhvf (e.g., for deterministic replay). 
+ +- It looks like the PVCLOCK can be completely removed from the ukvm + parts of Solo5, as long as we change the poll hypercall to send the + `until_nsecs` directly + +- All interrupt handlers can be removed from the solo5 parts of ukvm + because we get to see what exception happened in uhvf + + + + + diff --git a/monitors/uhvf/net-setup.bash b/monitors/uhvf/net-setup.bash new file mode 100644 index 000000000..97bf1a830 --- /dev/null +++ b/monitors/uhvf/net-setup.bash @@ -0,0 +1,6 @@ +#!/bin/bash + +BRIDGE=`ifconfig -l |grep -o bridge[0-9]* |tail -n 1` +IF=`ifconfig -l |grep -o en[0-9]* |tail -n 1` +sudo ifconfig $BRIDGE 10.0.0.1/24 -hostfilter $IF + diff --git a/monitors/uhvf/uhvf-blk.c b/monitors/uhvf/uhvf-blk.c new file mode 100644 index 000000000..e69de29bb diff --git a/monitors/uhvf/uhvf-core.c b/monitors/uhvf/uhvf-core.c new file mode 100644 index 000000000..a510cb157 --- /dev/null +++ b/monitors/uhvf/uhvf-core.c @@ -0,0 +1,617 @@ +/* Copyright (c) 2016, IBM + * Author(s): Dan Williams + * + * Permission to use, copy, modify, and/or distribute this software + * for any purpose with or without fee is hereby granted, provided + * that the above copyright notice and this permission notice appear + * in all copies. + * + * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL + * WARRANTIES WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED + * WARRANTIES OF MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE + * AUTHOR BE LIABLE FOR ANY SPECIAL, DIRECT, INDIRECT, OR + * CONSEQUENTIAL DAMAGES OR ANY DAMAGES WHATSOEVER RESULTING FROM LOSS + * OF USE, DATA OR PROFITS, WHETHER IN AN ACTION OF CONTRACT, + * NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF OR IN + * CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. 
+ */ + +/* We used several existing projects as guides + * hvdos: https://github.com/mist64/hvdos + * xhyve: https://github.com/mist64/xhyve + * ukvm: https://github.com/solo5/solo5 + */ +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include /* for `basename` */ +#include +#include +#include + +#include +#include +#include +#include +#include +#include + +#include "../ukvm-private.h" +#include "../ukvm-cpu.h" +#include "../ukvm.h" +#include "../unikernel-monitor.h" + +#include +#include + +#ifdef __MACH__ +#include +#include +#endif + +static struct platform platform; +static clock_serv_t cclock; + +/* + * Memory map: + * + * 0x100000 loaded elf file (linker script dictates location) + * ######## unused + * 0x013000 + * 0x012000 bootstrap pde + * 0x011000 bootstrap pdpte + * 0x010000 bootstrap pml4 + * ######## command line arguments + * 0x002000 ukvm_boot_info + * 0x001000 bootstrap gdt (contains correct code/data/ but tss points to 0) + */ + +#define GUEST_PAGE_SIZE 0x200000 /* 2 MB pages in guest */ + +#define BOOT_GDT 0x1000 +#define BOOT_INFO 0x2000 +#define BOOT_PML4 0x10000 +#define BOOT_PDPTE 0x11000 +#define BOOT_PDE 0x12000 + +/* read GPR */ +uint64_t rreg(hv_vcpuid_t vcpu, hv_x86_reg_t reg) +{ + uint64_t v; + + if (hv_vcpu_read_register(vcpu, reg, &v)) + abort(); + + return v; +} + +/* write GPR */ +void wreg(hv_vcpuid_t vcpu, hv_x86_reg_t reg, uint64_t v) +{ + if (hv_vcpu_write_register(vcpu, reg, v)) + abort(); +} + +/* read VMCS field */ +static uint64_t rvmcs(hv_vcpuid_t vcpu, uint32_t field) +{ + uint64_t v; + + if (hv_vmx_vcpu_read_vmcs(vcpu, field, &v)) + abort(); + + return v; +} + +/* write VMCS field */ +static void wvmcs(hv_vcpuid_t vcpu, uint32_t field, uint64_t v) +{ + if (hv_vmx_vcpu_write_vmcs(vcpu, field, v)) + abort(); +} + +/* desired control word constrained by hardware/hypervisor capabilities */ +static uint64_t cap2ctrl(uint64_t cap, uint64_t ctrl) +{ + return (ctrl | (cap & 0xffffffff)) & 
(cap >> 32); +} + + +void platform_load_code(struct platform *p, const char *file, /* IN */ + uint64_t *p_entry, uint64_t *p_end) /* OUT */ +{ + int fd_kernel; + uint32_t off; + int i; + uint8_t *macho; + struct stat fd_stat; + struct mach_header_64 *hdr; + int dbg = 0; + + fd_kernel = open(file, O_RDONLY); + if (fd_kernel == -1) + goto out_error; + fstat(fd_kernel, &fd_stat); + + macho = mmap(NULL, fd_stat.st_size, PROT_READ, MAP_SHARED, fd_kernel, 0); + if (macho == MAP_FAILED) + goto out_error; + + hdr = (struct mach_header_64 *)macho; + + if (hdr->magic != MH_MAGIC_64 + || hdr->cputype != CPU_TYPE_X86_64) + goto out_invalid; + + off = sizeof(struct mach_header_64); + if (dbg) printf("%d load commands\n", hdr->ncmds); + + for (i = 0; i < hdr->ncmds; i++) { + struct load_command *lc = (struct load_command *)(macho + off); + + if (dbg) printf("0x%08x ", off); + switch (lc->cmd) { + case LC_UNIXTHREAD: { + struct x86_thread_state *ts; + ts = (struct x86_thread_state *)(macho + off + + sizeof(struct load_command)); + + if (dbg) printf("LC_UNIXTHREAD [%d]\n", lc->cmdsize); + assert(ts->tsh.flavor == x86_THREAD_STATE64); + + *p_entry = ts->uts.ts64.__rip; + + if (dbg) printf(" entry point is 0x%llx\n", *p_entry); + break; + } + case LC_UUID: + if (dbg) printf("LC_UUID\n"); + break; + case LC_SOURCE_VERSION: + if (dbg) printf("LC_SOURCE_VERSION\n"); + break; + case LC_SYMTAB: + if (dbg) printf("LC_SYMTAB\n"); + break; + case LC_SEGMENT_64: { + struct segment_command_64 *sc; + int sects; + + sc = (struct segment_command_64 *)(macho + off); + if (dbg) + printf("LC_SEGMENT_64 [%08llx - %08llx] %s (%d sections)\n", + sc->vmaddr, sc->vmaddr + sc->vmsize, + sc->segname, sc->nsects); + + for (sects = 0; sects < sc->nsects; sects++) { + struct section_64 *s = (struct section_64 *)(macho + off + + sizeof(struct segment_command_64) + + sects * sizeof(struct section_64)); + + if (dbg) printf(" [%08llx - %08llx] (0x%x) %s:%s\n", + s->addr, s->addr + s->size, s->flags, + 
s->segname, s->sectname); + + if ((s->flags & 0x7) == S_ZEROFILL) { + if (dbg) printf("zeroing %lld bytes at 0x%llx\n", + s->size, s->addr); + memset(p->mem + s->addr, 0, s->size); + } else { + if (dbg) printf("copying %lld bytes from 0x%x to 0x%llx\n", + s->size, s->offset, s->addr); + memcpy(p->mem + s->addr, macho + s->offset, s->size); + } + } + + *p_end = sc->vmaddr + sc->vmsize; + break; + } + default: + printf("unknown %x (%d)\n", lc->cmd, lc->cmd); + } + + off += lc->cmdsize; + } + + return; + + out_error: + err(1, "%s", file); +out_invalid: + errx(1, "%s: Exec format error", file); +} + +void platform_setup_system_64bit(struct platform *p, uint64_t cr0, + uint64_t cr4, uint64_t efer) +{ + wvmcs(p->vcpu, VMCS_GUEST_CR0, cr0); + wvmcs(p->vcpu, VMCS_GUEST_CR4, cr4); + wvmcs(p->vcpu, VMCS_GUEST_IA32_EFER, efer); +} + +void platform_setup_system_page_tables(struct platform *p, uint64_t pml4) +{ + wvmcs(p->vcpu, VMCS_GUEST_CR3, pml4); +} + +void platform_setup_system_gdt(struct platform *p, + uint64_t cs_idx, + uint64_t ds_idx, + uint64_t off, + uint64_t limit) +{ + uint64_t *gdt_entry; + gdt_entry = ((uint64_t *) (p->mem + off)); + uint64_t cs_off = cs_idx * sizeof(uint64_t); + uint64_t ds_off = ds_idx * sizeof(uint64_t); + uint64_t cs_ar = (gdt_entry[cs_idx] & 0x0f0ff0000000000) >> 40; + uint64_t ds_ar = (gdt_entry[ds_idx] & 0x0f0ff0000000000) >> 40; + + wvmcs(p->vcpu, VMCS_GUEST_CS_BASE, 0); + wvmcs(p->vcpu, VMCS_GUEST_CS_LIMIT, 0xffffffff); + wvmcs(p->vcpu, VMCS_GUEST_CS_AR, cs_ar); + wvmcs(p->vcpu, VMCS_GUEST_SS_BASE, 0); + wvmcs(p->vcpu, VMCS_GUEST_SS_LIMIT, 0xffffffff); + wvmcs(p->vcpu, VMCS_GUEST_SS_AR, ds_ar); + wvmcs(p->vcpu, VMCS_GUEST_DS_BASE, 0); + wvmcs(p->vcpu, VMCS_GUEST_DS_LIMIT, 0xffffffff); + wvmcs(p->vcpu, VMCS_GUEST_DS_AR, ds_ar); + wvmcs(p->vcpu, VMCS_GUEST_ES_BASE, 0); + wvmcs(p->vcpu, VMCS_GUEST_ES_LIMIT, 0xffffffff); + wvmcs(p->vcpu, VMCS_GUEST_ES_AR, ds_ar); + wvmcs(p->vcpu, VMCS_GUEST_FS_BASE, 0); + wvmcs(p->vcpu, VMCS_GUEST_FS_LIMIT, 
0xffffffff); + wvmcs(p->vcpu, VMCS_GUEST_FS_AR, ds_ar); + wvmcs(p->vcpu, VMCS_GUEST_GS_BASE, 0); + wvmcs(p->vcpu, VMCS_GUEST_GS_LIMIT, 0xffffffff); + wvmcs(p->vcpu, VMCS_GUEST_GS_AR, ds_ar); + + wvmcs(p->vcpu, VMCS_GUEST_CS, cs_off); + wvmcs(p->vcpu, VMCS_GUEST_DS, ds_off); + wvmcs(p->vcpu, VMCS_GUEST_SS, ds_off); + wvmcs(p->vcpu, VMCS_GUEST_ES, ds_off); + wvmcs(p->vcpu, VMCS_GUEST_FS, ds_off); + wvmcs(p->vcpu, VMCS_GUEST_GS, ds_off); + + wvmcs(p->vcpu, VMCS_GUEST_GDTR_BASE, off); + wvmcs(p->vcpu, VMCS_GUEST_GDTR_LIMIT, limit); + + /* no IDT: all interrupts/exceptions exit */ + wvmcs(p->vcpu, VMCS_GUEST_IDTR_BASE, 0); + wvmcs(p->vcpu, VMCS_GUEST_IDTR_LIMIT, 0); + + wvmcs(p->vcpu, VMCS_GUEST_TR_BASE, 0); + wvmcs(p->vcpu, VMCS_GUEST_TR_LIMIT, 0); + wvmcs(p->vcpu, VMCS_GUEST_TR_AR, 0x0000008b); + wvmcs(p->vcpu, VMCS_GUEST_LDTR_BASE, 0); + wvmcs(p->vcpu, VMCS_GUEST_LDTR_LIMIT, 0xffff); + wvmcs(p->vcpu, VMCS_GUEST_LDTR_AR, 0x00000082); +} + +void platform_setup_system(struct platform *p, uint64_t entry, + uint64_t boot_info) +{ + wvmcs(p->vcpu, VMCS_GUEST_RFLAGS, 0x2); + wvmcs(p->vcpu, VMCS_GUEST_RIP, entry); + wvmcs(p->vcpu, VMCS_GUEST_RSP, GUEST_SIZE - 8); + wreg(p->vcpu, HV_X86_RDI, boot_info); + + /* trap everything for cr0 and cr4 */ + wvmcs(p->vcpu, VMCS_CTRL_CR0_MASK, 0xffffffff); + wvmcs(p->vcpu, VMCS_CTRL_CR4_MASK, 0xffffffff); + wvmcs(p->vcpu, VMCS_CTRL_CR0_SHADOW, rvmcs(p->vcpu, VMCS_GUEST_CR0)); + wvmcs(p->vcpu, VMCS_CTRL_CR4_SHADOW, rvmcs(p->vcpu, VMCS_GUEST_CR4)); +} + +#define VMX_CTRLS(v,c,t,f) do { \ + uint64_t cap; \ + if (hv_vmx_read_capability((c), &cap)) { \ + abort(); \ + } \ + \ + uint64_t zeros = cap & 0xffffffff; \ + uint64_t ones = (cap >> 32) & 0xffffffff; \ + uint64_t setting = cap2ctrl(cap, (f)); \ + if (0) { \ + printf("%s %s\n", #c, #t); \ + printf(" 0s: 0x%08llx\n", zeros); \ + printf(" 1s: 0x%08llx\n", ones); \ + printf(" setting: 0x%08llx\n", setting); \ + } \ + wvmcs((v), (t), setting); \ + } while (0) \ + +int platform_init(struct 
platform **pdata_p) +{ + hv_vcpuid_t vcpu; + uint8_t *mem; + + /* create a VM instance for the current task */ + if (hv_vm_create(HV_VM_DEFAULT)) + abort(); + + /* allocate some guest physical memory */ + mem = (uint8_t *)valloc(GUEST_SIZE); + if (!mem) + abort(); + + memset(mem, 0, GUEST_SIZE); + + /* map a segment of guest physical memory into the guest physical + * address space of the vm (at address 0) + */ + if (hv_vm_map(mem, 0, GUEST_SIZE, + HV_MEMORY_READ | HV_MEMORY_WRITE | HV_MEMORY_EXEC)) + abort(); + + /* create a vCPU instance for this thread */ + if (hv_vcpu_create(&vcpu, HV_VCPU_DEFAULT)) + abort(); + + /* + * From FreeBSD: + * + * It is safe to allow direct access to MSR_GSBASE and MSR_FSBASE. + * The guest FSBASE and GSBASE are saved and restored during + * vm-exit and vm-entry respectively. The host FSBASE and GSBASE are + * always restored from the vmcs host state area on vm-exit. + * + * The SYSENTER_CS/ESP/EIP MSRs are identical to FS/GSBASE in + * how they are saved/restored so can be directly accessed by the + * guest. + * + * MSR_EFER is saved and restored in the guest VMCS area on a + * VM exit and entry respectively. It is also restored from the + * host VMCS area on a VM exit. 
+ */ + if (hv_vcpu_enable_native_msr(vcpu, MSR_GSBASE, 1) || + hv_vcpu_enable_native_msr(vcpu, MSR_FSBASE, 1) || + hv_vcpu_enable_native_msr(vcpu, MSR_SYSENTER_CS_MSR, 1) || + hv_vcpu_enable_native_msr(vcpu, MSR_SYSENTER_ESP_MSR, 1) || + hv_vcpu_enable_native_msr(vcpu, MSR_SYSENTER_EIP_MSR, 1) || + hv_vcpu_enable_native_msr(vcpu, MSR_LSTAR, 1) || + hv_vcpu_enable_native_msr(vcpu, MSR_CSTAR, 1) || + hv_vcpu_enable_native_msr(vcpu, MSR_STAR, 1) || + hv_vcpu_enable_native_msr(vcpu, MSR_SF_MASK, 1) || + hv_vcpu_enable_native_msr(vcpu, MSR_KGSBASE, 1)) { + abort(); + } + + VMX_CTRLS(vcpu, HV_VMX_CAP_PINBASED, VMCS_CTRL_PIN_BASED, 0); + + /* It appears that bit 19 and 20 (CR8 load/store exiting) are + * necessary for a bunch of things to work, including + * CPU_BASED_HLT (bit 7) and MONITOR_TRAP_FLAG (bit 27) + */ + if (1) { + VMX_CTRLS(vcpu, HV_VMX_CAP_PROCBASED, VMCS_CTRL_CPU_BASED, 0 + | CPU_BASED_HLT | CPU_BASED_INVLPG + | CPU_BASED_MWAIT | CPU_BASED_RDPMC + | CPU_BASED_UNCOND_IO + | CPU_BASED_CR8_LOAD | CPU_BASED_CR8_STORE + | CPU_BASED_CR3_LOAD | CPU_BASED_CR3_STORE); + } else { + VMX_CTRLS(vcpu, HV_VMX_CAP_PROCBASED, VMCS_CTRL_CPU_BASED, 0 + | CPU_BASED_HLT | CPU_BASED_INVLPG + | CPU_BASED_MWAIT | CPU_BASED_RDPMC + | CPU_BASED_RDTSC | CPU_BASED_UNCOND_IO + | CPU_BASED_CR8_LOAD | CPU_BASED_CR8_STORE + | CPU_BASED_CR3_LOAD | CPU_BASED_CR3_STORE); + } + VMX_CTRLS(vcpu, HV_VMX_CAP_PROCBASED2, VMCS_CTRL_CPU_BASED2, 0 + | CPU_BASED2_DESC_TABLE | CPU_BASED2_RDRAND); + VMX_CTRLS(vcpu, HV_VMX_CAP_ENTRY, VMCS_CTRL_VMENTRY_CONTROLS, 0 + | VMENTRY_GUEST_IA32E | VMENTRY_LOAD_EFER); + VMX_CTRLS(vcpu, HV_VMX_CAP_EXIT, VMCS_CTRL_VMEXIT_CONTROLS, 0); + + wvmcs(vcpu, VMCS_CTRL_EXC_BITMAP, 0xffffffff); + + platform.mem = mem; + platform.vcpu = vcpu; + platform.priv = NULL; + + *pdata_p = &platform; + + return 0; +} + +void platform_cleanup(struct platform *p) +{ + mach_port_deallocate(mach_task_self(), cclock); + + /* destroy vCPU */ + if (hv_vcpu_destroy(p->vcpu)) + abort(); + + /* 
unmap memory segment at address 0 */ + if (hv_vm_unmap(0, GUEST_SIZE)) + abort(); + + /* destroy VM instance of this task */ + if (hv_vm_destroy()) + abort(); + + free(p->mem); +} + +int platform_run(struct platform *p) +{ + return !!hv_vcpu_run(p->vcpu); +} + +int platform_get_exit_reason(struct platform *p) +{ + uint64_t exit_reason = rvmcs(p->vcpu, VMCS_RO_EXIT_REASON); + + switch ((int)exit_reason) { + case VMX_REASON_HLT: + return EXIT_HLT; + case VMX_REASON_CPUID: + return EXIT_CPUID; + case VMX_REASON_RDTSC: + return EXIT_RDTSC; + case VMX_REASON_RDRAND: + return EXIT_RDRAND; + case VMX_REASON_IO: + return EXIT_IO; + + case VMX_REASON_IRQ: /* host interrupt */ + case VMX_REASON_EPT_VIOLATION: /* cold misses */ + return EXIT_IGNORE; + + case VMX_REASON_EXC_NMI: { + uint32_t idt_vector_info = rvmcs(p->vcpu, VMCS_RO_IDT_VECTOR_INFO); + uint32_t idt_vector_error = rvmcs(p->vcpu, VMCS_RO_IDT_VECTOR_ERROR); + uint32_t irq_info = rvmcs(p->vcpu, VMCS_RO_VMEXIT_IRQ_INFO); + uint32_t irq_error = rvmcs(p->vcpu, VMCS_RO_VMEXIT_IRQ_ERROR); + + /* irq && HW exception && #DB */ + if (irq_info + && (((irq_info >> 8) & 0x3) == 3) + && ((irq_info & 0xff) == 1)) + return EXIT_DEBUG; + + printf("EXIT_REASON_EXCEPTION\n"); + if (idt_vector_info) { + printf("idt_vector_info = 0x%x\n", idt_vector_info); + printf("idt_vector_error = 0x%x\n", idt_vector_error); + } + if (irq_info) { + printf("irq_info = 0x%x\n", irq_info); + printf(" vector = %d (0x%x)\n", + irq_info & 0xff, + irq_info & 0xff); + switch ((irq_info >> 8) & 0x3) { + case 0: + printf(" type = external\n"); + break; + case 2: + printf(" type = NMI\n"); + break; + case 3: + printf(" type = HW exception\n"); + break; + case 6: + printf(" type = SW exception\n"); + break; + default: + printf(" type = BOGUS!!!\n"); + } + if ((irq_info >> 11) & 0x1) + printf("irq_error = 0x%x\n", irq_error); + } + + printf("RIP was 0x%llx\n", rreg(p->vcpu, HV_X86_RIP)); + printf("RSP was 0x%llx\n", rreg(p->vcpu, HV_X86_RSP)); + return 
EXIT_FAIL; + } + case VMX_REASON_VMENTRY_GUEST: + fprintf(stderr, "Invalid VMCS!"); + return EXIT_FAIL; + default: + fprintf(stderr, "unhandled VMEXIT %lld (0x%llx)\n", + exit_reason, exit_reason); + fprintf(stderr, "RIP was 0x%llx\n", rreg(p->vcpu, HV_X86_RIP)); + return EXIT_FAIL; + } +} + +int platform_get_io_port(struct platform *p) +{ + uint64_t exit_qualification = rvmcs(p->vcpu, VMCS_RO_EXIT_QUALIFIC); + uint16_t port = (uint16_t)(exit_qualification >> 16); + + return port; +} + +uint64_t platform_get_io_data(struct platform *p) +{ + uint64_t rax = rreg(p->vcpu, HV_X86_RAX); + + return GUEST_PIO32_TO_PADDR(&rax); +} + +void platform_advance_rip(struct platform *p) +{ + uint64_t len = rvmcs(p->vcpu, VMCS_RO_VMEXIT_INSTR_LEN); + + wvmcs(p->vcpu, VMCS_GUEST_RIP, rreg(p->vcpu, HV_X86_RIP) + len); +} + +void platform_init_time(uint64_t *freq) +{ + size_t len = sizeof(*freq); + + host_get_clock_service(mach_host_self(), + CALENDAR_CLOCK, &cclock); + + sysctlbyname("machdep.tsc.frequency", freq, &len, NULL, 0); +} + +uint64_t platform_get_exec_time(struct platform *p) +{ + uint64_t exec_time; + + if (hv_vcpu_get_exec_time(p->vcpu, &exec_time)) + errx(1, "couldn't get exec time"); + + return exec_time; +} + +void platform_emul_rdtsc(struct platform *p, uint64_t new_tsc) +{ + wreg(p->vcpu, HV_X86_RAX, new_tsc & 0xffffffff); + wreg(p->vcpu, HV_X86_RDX, (new_tsc >> 32) & 0xffffffff); +} + +void platform_get_timestamp(uint64_t *s, uint64_t *ns) +{ + mach_timespec_t mts; + + clock_get_time(cclock, &mts); + *s = mts.tv_sec; + *ns = mts.tv_nsec; +} + +static int decode_reg(int reg) { + switch(reg) { + case RAX: + return HV_X86_RAX; + case RBX: + return HV_X86_RBX; + case RCX: + return HV_X86_RCX; + case RDX: + return HV_X86_RDX; + default: + errx(1, "Couldn't decode reg\n"); + } +} + +uint64_t platform_get_reg(struct platform *p, int reg) +{ + return rreg(p->vcpu, decode_reg(reg)); +} +void platform_set_reg(struct platform *p, int reg, uint64_t val) +{ + wreg(p->vcpu, 
decode_reg(reg), val); +} +void platform_emul_rdrand(struct platform *p, uint64_t r) +{ + uint32_t instr_info = rvmcs(p->vcpu, VMCS_RO_VMX_INSTR_INFO); + int reg = (instr_info >> 3) & 0xf; + int size = (instr_info >> 11) & 0x3; + + if (reg != 0) /* eax */ + errx(1, "rdrand to non-eax unimplemented\n"); + if (size != 2) /* 64-bit */ + errx(1, "non-64-bit rdrand unimplemented\n"); + + wreg(p->vcpu, HV_X86_RAX, r); +} diff --git a/monitors/uhvf/uhvf-gdb.c b/monitors/uhvf/uhvf-gdb.c new file mode 100644 index 000000000..eb388ef72 --- /dev/null +++ b/monitors/uhvf/uhvf-gdb.c @@ -0,0 +1,219 @@ +/*************************************************************************** + * + * THIS SOFTWARE IS NOT COPYRIGHTED + * + * HP offers the following for use in the public domain. HP makes no + * warranty with regard to the software or it's performance and the + * user accepts the software "AS IS" with all faults. + * + * HP DISCLAIMS ANY WARRANTIES, EXPRESS OR IMPLIED, WITH REGARD + * TO THIS SOFTWARE INCLUDING BUT NOT LIMITED TO THE WARRANTIES + * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE. + * + **************************************************************************/ + +/**************************************************************************** + * Header: remcom.c,v 1.34 91/03/09 12:29:49 glenne Exp $ + * + * Module name: remcom.c $ + * Revision: 1.34 $ + * Date: 91/03/09 12:29:49 $ + * Contributor: Lake Stevens Instrument Division$ + * + * Description: low level support for gdb debugger. $ + * + * Considerations: only works on target hardware $ + * + * Written by: Glenn Engel $ + * ModuleState: Experimental $ + * + * NOTES: See Below $ + * + * Modified for 386 by Jim Kingdon, Cygnus Support. + * + * To enable debugger support, two things need to happen. One, a + * call to set_debug_traps() is necessary in order to allow any breakpoints + * or error conditions to be properly intercepted and reported to gdb. 
+ * Two, a breakpoint needs to be generated to begin communication. This + * is most easily accomplished by a call to breakpoint(). Breakpoint() + * simulates a breakpoint by executing a trap #1. + * + * The external function exceptionHandler() is + * used to attach a specific handler to a specific 386 vector number. + * It should use the same privilege level it runs at. It should + * install it as an interrupt gate so that interrupts are masked + * while the handler runs. + * + * Because gdb will sometimes write to the stack area to execute function + * calls, this program cannot rely on using the supervisor stack so it + * uses it's own stack area reserved in the int array remcomStack. + * + ************* + * + * The following gdb commands are supported: + * + * command function Return value + * + * g return the value of the CPU registers hex data or ENN + * G set the value of the CPU registers OK or ENN + * + * mAA..AA,LLLL Read LLLL bytes at address AA..AA hex data or ENN + * MAA..AA,LLLL: Write LLLL bytes at address AA.AA OK or ENN + * + * c Resume at current address SNN ( signal NN) + * cAA..AA Continue at address AA..AA SNN + * + * s Step one instruction SNN + * sAA..AA Step one instruction from AA..AA SNN + * + * k kill + * + * ? What was the last sigval ? SNN (signal NN) + * + * All commands and responses are sent with a packet which includes a + * checksum. A packet consists of + * + * $#. + * + * where + * :: + * :: < two hex digits computed as modulo 256 sum of > + * + * When a packet is received, it is first acknowledged with either '+' or '-'. + * '+' indicates a successful transfer. '-' indicates a failed transfer. 
+ * + * Example: + * + * Host: Reply: + * $m0,10#2a +$00010203040506070809101112131415#42 + * + ****************************************************************************/ + +#include +#include +#include +#include +#include +#include + +#include +#include +#include +#include +#include +#include +#include + +#include +#include + +#include "../ukvm-private.h" +#include "../ukvm-modules.h" +#include "../ukvm-cpu.h" +#include "../ukvm.h" +#include "../unikernel-monitor.h" + +uint64_t platform_get_rip(struct platform *p) +{ + int ret; + uint64_t rip; + + ret = hv_vcpu_read_register(p->vcpu, HV_X86_RIP, &rip); + assert(ret == 0); + return rip; +} + +int platform_get_regs(struct platform *p, long *reg) +{ + int ret; + uint64_t v; + + ret = hv_vcpu_read_register(p->vcpu, HV_X86_RAX, &v); + assert(ret == 0); + reg[RAX] = v; + + ret = hv_vcpu_read_register(p->vcpu, HV_X86_RBX, &v); + assert(ret == 0); + reg[RBX] = v; + + ret = hv_vcpu_read_register(p->vcpu, HV_X86_RCX, &v); + assert(ret == 0); + reg[RCX] = v; + + ret = hv_vcpu_read_register(p->vcpu, HV_X86_RDX, &v); + assert(ret == 0); + reg[RDX] = v; + + ret = hv_vcpu_read_register(p->vcpu, HV_X86_RSI, &v); + assert(ret == 0); + reg[RSI] = v; + + ret = hv_vcpu_read_register(p->vcpu, HV_X86_RDI, &v); + assert(ret == 0); + reg[RDI] = v; + + ret = hv_vcpu_read_register(p->vcpu, HV_X86_RBP, &v); + assert(ret == 0); + reg[RBP] = v; + + ret = hv_vcpu_read_register(p->vcpu, HV_X86_RSP, &v); + assert(ret == 0); + reg[RSP] = v; + + ret = hv_vcpu_read_register(p->vcpu, HV_X86_R8, &v); + assert(ret == 0); + reg[R8] = v; + + ret = hv_vcpu_read_register(p->vcpu, HV_X86_R9, &v); + assert(ret == 0); + reg[R9] = v; + + ret = hv_vcpu_read_register(p->vcpu, HV_X86_R10, &v); + assert(ret == 0); + reg[R10] = v; + + ret = hv_vcpu_read_register(p->vcpu, HV_X86_R11, &v); + assert(ret == 0); + reg[R11] = v; + + ret = hv_vcpu_read_register(p->vcpu, HV_X86_R12, &v); + assert(ret == 0); + reg[R12] = v; + + ret = hv_vcpu_read_register(p->vcpu, 
HV_X86_R13, &v); + assert(ret == 0); + reg[R13] = v; + + ret = hv_vcpu_read_register(p->vcpu, HV_X86_R14, &v); + assert(ret == 0); + reg[R14] = v; + + ret = hv_vcpu_read_register(p->vcpu, HV_X86_R15, &v); + assert(ret == 0); + reg[R15] = v; + + ret = hv_vcpu_read_register(p->vcpu, HV_X86_RIP, &v); + assert(ret == 0); + reg[RIP] = v; + + ret = hv_vcpu_read_register(p->vcpu, HV_X86_RFLAGS, &v); + assert(ret == 0); + reg[EFLAGS] = v; + + return 0; +} + +int platform_enable_debug(struct platform *p) +{ + int ret; + uint64_t rflags; + + ret = hv_vcpu_read_register(p->vcpu, HV_X86_RFLAGS, &rflags); + assert(ret == 0); + ret = hv_vcpu_write_register(p->vcpu, HV_X86_RFLAGS, + rflags | X86_EFLAGS_TF); + assert(ret == 0); + + return 0; +} + diff --git a/monitors/uhvf/uhvf-net.c b/monitors/uhvf/uhvf-net.c new file mode 100644 index 000000000..f51a04e4d --- /dev/null +++ b/monitors/uhvf/uhvf-net.c @@ -0,0 +1,314 @@ +#include +#include +#include +#include +#include +#include +#include + +#include + +/* for net */ +#include +#include +#include +#include + +#include +#include + + +#include "../ukvm-private.h" +#include "../ukvm-modules.h" +#include "../ukvm.h" + +static char *netiface; +static int netfd; +static struct ukvm_netinfo netinfo; + +struct vmnet_state { + interface_ref iface; + const char *mac; + unsigned int mtu; + unsigned int max_packet_size; + + dispatch_queue_t if_q; + int write_fd; +}; + +static struct vmnet_state vms; + +static void vmn_enable_notifications(void) +{ + /* Whenever there are packets available we write to a pipe so that + * the generic poll on the pipe's fd can pick it up. This is not + * ideal. 
+ */ + vmnet_interface_set_event_callback(vms.iface, + VMNET_INTERFACE_PACKETS_AVAILABLE, + vms.if_q, + ^(interface_event_t event_id, xpc_object_t event) + { + size_t num_written; + + num_written = write(vms.write_fd, "x", 1); + assert(num_written == 1); + + /* Disable the notifications until we're ready to hear about more.*/ + vmnet_interface_set_event_callback(vms.iface, + VMNET_INTERFACE_PACKETS_AVAILABLE, + NULL, + NULL); + }); +} + +static int vmn_create(void) +{ + int pipefds[2]; + xpc_object_t iface_desc; + uuid_t uuid; + + __block interface_ref iface = NULL; + __block vmnet_return_t iface_status = 0; + + iface_desc = xpc_dictionary_create(NULL, NULL, 0); + xpc_dictionary_set_uint64(iface_desc, vmnet_operation_mode_key, + VMNET_SHARED_MODE); + +#ifdef USE_TEST_UUID + /* This will result in a test MAC address of 64:65:3a:31:64:3a */ + uint8_t test_uuid[] = {0x40, 0xab, 0xea, 0x25, + 0x95, 0x2f, 0x44, 0xe8, + 0x85, 0x79, 0xb7, 0x73, + 0x67, 0x3c, 0x2e, 0xb8}; + + memcpy(&uuid, test_uuid, sizeof(uuid)); +#else + uuid_generate_random(uuid); +#endif + xpc_dictionary_set_uuid(iface_desc, vmnet_interface_id_key, uuid); + + pipe(pipefds); + vms.write_fd = pipefds[1]; + + /* do vmnet_start_interface synchronously */ + { + dispatch_queue_t if_create_q; + dispatch_semaphore_t if_create_sema; + + if_create_q = dispatch_queue_create("uhvf.vmnet.create", + DISPATCH_QUEUE_SERIAL); + if_create_sema = dispatch_semaphore_create(0); + + iface = vmnet_start_interface(iface_desc, if_create_q, + ^(vmnet_return_t status, + xpc_object_t x) + { + iface_status = status; + + if (iface_status == VMNET_SUCCESS) { + vms.mtu = xpc_dictionary_get_uint64(x, + vmnet_mtu_key); + vms.max_packet_size = xpc_dictionary_get_uint64(x, + vmnet_max_packet_size_key); + vms.mac = strdup(xpc_dictionary_get_string(x, + vmnet_mac_address_key)); + } + dispatch_semaphore_signal(if_create_sema); + }); + dispatch_semaphore_wait(if_create_sema, DISPATCH_TIME_FOREVER); + dispatch_release(if_create_q); + 
dispatch_release(if_create_sema); + } + + if (!iface || iface_status != VMNET_SUCCESS) { + printf("vmnet: vmnet_start_interface failed\n"); + goto out; + } + + vms.iface = iface; + vms.if_q = dispatch_queue_create("uhvf.vmnet.iface_q", 0); + + vmn_enable_notifications(); + + return pipefds[0]; + + out: + close(pipefds[0]); + close(pipefds[1]); + return -1; +} + +static ssize_t vmn_read(uint8_t *data, int len) +{ + struct iovec iov; + vmnet_return_t r; + struct vmpktdesc v; + int pktcnt; + + v.vm_pkt_size = len; + + assert(v.vm_pkt_size >= vms.max_packet_size); + + iov.iov_base = data; + iov.iov_len = len; + v.vm_pkt_iov = &iov; + v.vm_pkt_iovcnt = 1; + v.vm_flags = 0; + pktcnt = 1; + + r = vmnet_read(vms.iface, &v, &pktcnt); + { + char throwaway; + size_t num_read; + + num_read = read(netfd, &throwaway, 1); + assert(num_read == 1); + + /* We're ready now for another notification. */ + vmn_enable_notifications(); + } + + assert(r == VMNET_SUCCESS); + + if (pktcnt < 1) + return 0; + + return (ssize_t)v.vm_pkt_size; +} + +static size_t vmn_write(uint8_t *data, int len) +{ + struct iovec iov; + vmnet_return_t r; + struct vmpktdesc v; + int pktcnt; + + v.vm_pkt_size = len; + assert(len <= vms.max_packet_size); + + iov.iov_base = data; + iov.iov_len = len; + v.vm_pkt_iov = &iov; + v.vm_pkt_iovcnt = 1; + v.vm_flags = 0; + pktcnt = 1; + + r = vmnet_write(vms.iface, &v, &pktcnt); + assert(r == VMNET_SUCCESS); + + return iov.iov_len; +} + +static void ukvm_port_netinfo(uint8_t *mem, uint64_t paddr) +{ + GUEST_CHECK_PADDR(paddr, GUEST_SIZE, sizeof (struct ukvm_netinfo)); + struct ukvm_netinfo *info = (struct ukvm_netinfo *)(mem + paddr); + + printf("netinfo!\n"); + memcpy(info->mac_str, netinfo.mac_str, sizeof(netinfo.mac_str)); +} + +static void ukvm_port_netwrite(uint8_t *mem, uint64_t paddr) +{ + GUEST_CHECK_PADDR(paddr, GUEST_SIZE, sizeof (struct ukvm_netwrite)); + struct ukvm_netwrite *wr = (struct ukvm_netwrite *)(mem + paddr); + int ret; + + GUEST_CHECK_PADDR(wr->data, 
GUEST_SIZE, wr->len); + ret = vmn_write(mem + wr->data, wr->len); + if (wr->len != ret) + printf("wr->len=%zu ret=%d\n", wr->len, ret); + assert(wr->len == ret); + wr->ret = 0; +} + +static void ukvm_port_netread(uint8_t *mem, uint64_t paddr) +{ + GUEST_CHECK_PADDR(paddr, GUEST_SIZE, sizeof (struct ukvm_netread)); + struct ukvm_netread *rd = (struct ukvm_netread *)(mem + paddr); + int ret; + + GUEST_CHECK_PADDR(rd->data, GUEST_SIZE, rd->len); + ret = vmn_read(mem + rd->data, rd->len); + if (ret == 0) { + rd->ret = -1; + return; + } + assert(ret > 0); + rd->len = ret; + rd->ret = 0; +} + +static int handle_exit(struct platform *p) +{ + if (platform_get_exit_reason(p) != EXIT_IO) + return -1; + + int port = platform_get_io_port(p); + uint64_t data = platform_get_io_data(p); + + switch (port) { + case UKVM_PORT_NETINFO: + ukvm_port_netinfo(p->mem, data); + break; + case UKVM_PORT_NETWRITE: + ukvm_port_netwrite(p->mem, data); + break; + case UKVM_PORT_NETREAD: + ukvm_port_netread(p->mem, data); + break; + default: + return -1; + } + + platform_advance_rip(p); + return 0; +} + +static int handle_cmdarg(char *cmdarg) +{ + if (strncmp("--net=", cmdarg, 6)) + return -1; + netiface = cmdarg + 6; + return 0; +} + +static int setup(struct platform *p) +{ + + /* set up virtual network */ + netfd = vmn_create(); + if (netfd <= 0) { + perror("Allocating interface"); + exit(1); + } + snprintf(netinfo.mac_str, sizeof(netinfo.mac_str), + "%02x:%02x:%02x:%02x:%02x:%02x", + vms.mac[0], vms.mac[1], vms.mac[2], + vms.mac[3], vms.mac[4], vms.mac[5]); + + printf("Providing network: guest address %s\n", + netinfo.mac_str); + + return 0; +} + +static int get_fd(void) +{ + return netfd; +} + +static char *usage(void) +{ + return "--net=TAP (host tap device for guest network interface)"; +} + +struct ukvm_module ukvm_net = { + .get_fd = get_fd, + .handle_exit = handle_exit, + .handle_cmdarg = handle_cmdarg, + .setup = setup, + .usage = usage +}; + diff --git a/monitors/ukvm-configure 
b/monitors/ukvm-configure new file mode 100755 index 000000000..5ee1629a7 --- /dev/null +++ b/monitors/ukvm-configure @@ -0,0 +1,84 @@ +#!/bin/bash + +die() +{ + echo "$0: $@" 1>&2 + exit 1 +} + +if [ "$#" -lt 1 ]; then + echo "Usage: ukvm-configure UKVM_SRC [MODULES]" + echo " UKVM_SRC is /path/to/ukvm" + echo " MODULES can be any combination of: net blk gdb" + exit 1 +fi + + +# Hopefully a more portable way to get absolute path... +UKVM_SRC=`(cd $(dirname $1)/$(basename $1) && pwd -P)` +# UKVM_SRC=`readlink -f $1` +if [ ! -d ${UKVM_SRC} -o ! -f ${UKVM_SRC}/core.c ]; then + echo "Error: Not a ukvm source directory: ${UKVM_SRC}" 1>&2 + exit 1 +fi +shift +UKVM_MODULES=$@ + +case $(uname -s) in + Linux) + MON="ukvm" + MONFLAGS= + ;; + FreeBSD) + MON="ukvm" + MONFLAGS= + ;; + Darwin) + MON="uhvf" + MONFLAGS="-framework Hypervisor -framework vmnet" + ;; + *) + die "Unsupported build OS: $(uname -s)" + ;; +esac + +cat < Makefile.ukvm +# Generated by ukvm-configure $@ + +COMMON_MODULE_OBJS=\$(addsuffix .o,\$(addprefix _build-ukvm/,${UKVM_MODULES})) +UKVM_MODULE_OBJS=\$(COMMON_MODULE_OBJS) \$(addsuffix .o,\$(addprefix _build-ukvm/${MON}-,${UKVM_MODULES})) + +UKVM_MODULE_FLAGS=\$(addprefix -DUKVM_MODULE_,\$(shell echo ${UKVM_MODULES}| tr '[:lower:]' '[:upper:]')) + +UKVM_CC?=cc +UKVM_FLAGS=-D__UKVM_HOST__ \$(UKVM_MODULE_FLAGS) +UKVM_CFLAGS=-Wall -Werror -std=c99 -O2 -g \$(UKVM_FLAGS) +UKVM_OBJS=_build-ukvm/core.o _build-ukvm/${MON}-core.o \$(UKVM_MODULE_OBJS) +ifdef UKVM_STATIC +UKVM_LDFLAGS=-static +endif +UKVM_HEADERS= \\ +$UKVM_SRC/ukvm-private.h \\ +$UKVM_SRC/ukvm-modules.h \\ +$UKVM_SRC/ukvm-cpu.h \\ +$UKVM_SRC/ukvm.h + +_build-ukvm: + mkdir -p _build-ukvm + +_build-ukvm/$MON-%.o: $UKVM_SRC/$MON/$MON-%.c \$(MAKEFILE_LIST) | _build-ukvm + \$(UKVM_CC) \$(UKVM_CFLAGS) -c \$< -o \$@ + +_build-ukvm/%.o: $UKVM_SRC/%.c \$(MAKEFILE_LIST) | _build-ukvm + \$(UKVM_CC) \$(UKVM_CFLAGS) -c \$< -o \$@ + +ukvm-bin: \$(UKVM_OBJS) \$(UKVM_HEADERS) \$(MAKEFILE_LIST) + \$(UKVM_CC) 
\$(UKVM_LDFLAGS) -o \$@ \$(UKVM_CFLAGS) \$(UKVM_OBJS) $MONFLAGS + +.PHONY: ukvm-clean +ukvm-clean: + \$(RM) -r _build-ukvm + \$(RM) ukvm-bin + +EOF + diff --git a/ukvm/ukvm-cpu.h b/monitors/ukvm-cpu.h similarity index 50% rename from ukvm/ukvm-cpu.h rename to monitors/ukvm-cpu.h index 747296dee..b3a4b8453 100644 --- a/ukvm/ukvm-cpu.h +++ b/monitors/ukvm-cpu.h @@ -1,23 +1,3 @@ -/* - * Copyright (c) 2015-2017 Contributors as noted in the AUTHORS file - * - * This file is part of ukvm, a unikernel monitor. - * - * Permission to use, copy, modify, and/or distribute this software - * for any purpose with or without fee is hereby granted, provided - * that the above copyright notice and this permission notice appear - * in all copies. - * - * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL - * WARRANTIES WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED - * WARRANTIES OF MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE - * AUTHOR BE LIABLE FOR ANY SPECIAL, DIRECT, INDIRECT, OR - * CONSEQUENTIAL DAMAGES OR ANY DAMAGES WHATSOEVER RESULTING FROM LOSS - * OF USE, DATA OR PROFITS, WHETHER IN AN ACTION OF CONTRACT, - * NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF OR IN - * CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. 
- */ - #ifndef __UKVM_CPU_H__ #define __UKVM_CPU_H__ @@ -41,28 +21,44 @@ * EFLAGS bits */ #define X86_EFLAGS_CF 0x00000001 /* Carry Flag */ +#define X86_EFLAGS_TF_BIT 8 /* Trap flag (single step) */ +#define X86_EFLAGS_TF _BITUL(X86_EFLAGS_TF_BIT) /* * Basic CPU control in CR0 */ -#define X86_CR0_PE_BIT 0 /* Protection Enable */ -#define X86_CR0_PE _BITUL(X86_CR0_PE_BIT) -#define X86_CR0_MP_BIT 1 /* Monitor Coprocessor */ -#define X86_CR0_MP _BITUL(X86_CR0_MP_BIT) -#define X86_CR0_EM_BIT 2 /* Emulation */ -#define X86_CR0_EM _BITUL(X86_CR0_EM_BIT) -#define X86_CR0_PG_BIT 31 /* Paging */ -#define X86_CR0_PG _BITUL(X86_CR0_PG_BIT) +#define X86_CR0_PE 0x00000001 /* Protected mode Enable */ +#define X86_CR0_NE 0x00000020 /* Numeric Error enable (EX16 vs IRQ13) */ +#define X86_CR0_PG 0x80000000 /* PaGing enable */ +#define X86_CR0_NW 0x20000000 /* Not Write-through */ +#define X86_CR0_CD 0x40000000 /* Cache Disable */ +#define X86_CR0_MP 0x00000002 /* "Math" (fpu) Present */ +#define X86_CR0_EM 0x00000004 /* EMulate FPU instructions. 
(trap ESC only) */ /* * Intel CPU features in CR4 */ -#define X86_CR4_PAE_BIT 5 /* enable physical address extensions */ -#define X86_CR4_PAE _BITUL(X86_CR4_PAE_BIT) -#define X86_CR4_OSFXSR_BIT 9 /* OS support for FXSAVE/FXRSTOR */ -#define X86_CR4_OSFXSR _BITUL(X86_CR4_OSFXSR_BIT) -#define X86_CR4_OSXMMEXCPT_BIT 10 /* OS support for FP exceptions */ -#define X86_CR4_OSXMMEXCPT _BITUL(X86_CR4_OSXMMEXCPT_BIT) +#define X86_CR4_PAE 0x00000020 /* Physical address extension */ +#define X86_CR4_VMXE 0x00002000 /* enable VMX operation (Intel-specific) */ +#define X86_CR4_FXSR 0x00000200 /* Fast FPU save/restore used by OS */ +#define X86_CR4_XMM 0x00000400 /* enable SIMD/MMX2 to use except 16 */ + + +#define X86_EFER_LME 0x000000100 /* Long mode enable (R/W) */ +#define X86_EFER_LMA 0x000000400 /* Long mode active (R) */ + +/* AMD64 MSR's */ +#define MSR_EFER 0xc0000080 /* extended features */ +#define MSR_STAR 0xc0000081 /* legacy mode SYSCALL target/cs/ss */ +#define MSR_LSTAR 0xc0000082 /* long mode SYSCALL target rip */ +#define MSR_CSTAR 0xc0000083 /* compat mode SYSCALL target rip */ +#define MSR_SF_MASK 0xc0000084 /* syscall flags mask */ +#define MSR_FSBASE 0xc0000100 /* base address of the %fs "segment" */ +#define MSR_GSBASE 0xc0000101 /* base address of the %gs "segment" */ +#define MSR_KGSBASE 0xc0000102 /* base address of the kernel %gs */ +#define MSR_SYSENTER_CS_MSR 0x174 +#define MSR_SYSENTER_ESP_MSR 0x175 +#define MSR_SYSENTER_EIP_MSR 0x176 /* * Intel long mode page directory/table entries @@ -99,27 +95,27 @@ (((limit) & _AC(0x0000ffff, ULL)))) struct _kvm_segment { - __u64 base; - __u32 limit; - __u16 selector; - __u8 type; - __u8 present, dpl, db, s, l, g, avl; - __u8 unusable; - __u8 padding; + uint64_t base; + uint32_t limit; + uint16_t selector; + uint8_t type; + uint8_t present, dpl, db, s, l, g, avl; + uint8_t unusable; + uint8_t padding; }; -#define GDT_GET_G(x) (__u8)(((x) & 0x0080000000000000) >> 55) -#define GDT_GET_DB(x) (__u8)(((x) & 
0x0040000000000000) >> 54) -#define GDT_GET_L(x) (__u8)(((x) & 0x0020000000000000) >> 53) -#define GDT_GET_AVL(x) (__u8)(((x) & 0x0010000000000000) >> 52) -#define GDT_GET_P(x) (__u8)(((x) & 0x0000800000000000) >> 47) -#define GDT_GET_DPL(x) (__u8)(((x) & 0x0000600000000000) >> 45) -#define GDT_GET_S(x) (__u8)(((x) & 0x0000100000000000) >> 44) -#define GDT_GET_TYPE(x)(__u8)(((x) & 0x00000F0000000000) >> 40) +#define GDT_GET_G(x) (uint8_t)(((x) & 0x0080000000000000) >> 55) +#define GDT_GET_DB(x) (uint8_t)(((x) & 0x0040000000000000) >> 54) +#define GDT_GET_L(x) (uint8_t)(((x) & 0x0020000000000000) >> 53) +#define GDT_GET_AVL(x) (uint8_t)(((x) & 0x0010000000000000) >> 52) +#define GDT_GET_P(x) (uint8_t)(((x) & 0x0000800000000000) >> 47) +#define GDT_GET_DPL(x) (uint8_t)(((x) & 0x0000600000000000) >> 45) +#define GDT_GET_S(x) (uint8_t)(((x) & 0x0000100000000000) >> 44) +#define GDT_GET_TYPE(x)(uint8_t)(((x) & 0x00000F0000000000) >> 40) #define GDT_TO_KVM_SEGMENT(seg, gdt_table, sel) \ do { \ - __u64 gdt_ent = gdt_table[sel]; \ + uint64_t gdt_ent = gdt_table[sel]; \ seg.base = GDT_GET_BASE(gdt_ent); \ seg.limit = GDT_GET_LIMIT(gdt_ent); \ seg.selector = sel * 8; \ @@ -134,3 +130,4 @@ struct _kvm_segment { } while (0) #endif + diff --git a/ukvm/ukvm-modules.h b/monitors/ukvm-modules.h similarity index 91% rename from ukvm/ukvm-modules.h rename to monitors/ukvm-modules.h index 756ad6a35..bad015879 100644 --- a/ukvm/ukvm-modules.h +++ b/monitors/ukvm-modules.h @@ -21,13 +21,15 @@ #ifndef __UKVM_MODULES_H__ #define __UKVM_MODULES_H__ +#include "unikernel-monitor.h" + /* hypercall interfaces exported by modules are in ukvm.h */ struct ukvm_module { int (*get_fd)(void); - int (*handle_exit)(struct kvm_run *run, int vcpufd, uint8_t *mem); + int (*handle_exit)(struct platform *p); int (*handle_cmdarg)(char *cmdarg); - int (*setup)(int vcpufd, uint8_t *mem); + int (*setup)(struct platform *p); char *(*usage)(void); const char *name; }; @@ -37,3 +39,4 @@ extern struct ukvm_module 
ukvm_net; extern struct ukvm_module ukvm_gdb; #endif + diff --git a/ukvm/ukvm-private.h b/monitors/ukvm-private.h similarity index 96% rename from ukvm/ukvm-private.h rename to monitors/ukvm-private.h index ded12a25c..d527b4fc3 100644 --- a/ukvm/ukvm-private.h +++ b/monitors/ukvm-private.h @@ -87,7 +87,7 @@ * Given a pointer to 32-bit guest I/O write data, dereference and return as * a guest physical address (uint64_t). */ -#define GUEST_PIO32_TO_PADDR(x) (uint64_t)(* (uint32_t *)(x)) +#define GUEST_PIO32_TO_PADDR(x) (uint64_t)(*(uint32_t *)(x)) /* * Given a guest physical address (p), validate that: @@ -97,7 +97,7 @@ * (p) and (l) must be of type uint64_t. (sz) must be of type size_t or * compatible. */ -#define GUEST_CHECK_PADDR(p, l, sz) \ +#define GUEST_CHECK_PADDR(p, l, sz) \ { \ uint64_t __e; \ if ((p >= l) || add_overflow(p, sz, __e) || (__e >= l)) \ @@ -107,3 +107,4 @@ } #endif + diff --git a/ukvm/ukvm.h b/monitors/ukvm.h similarity index 91% rename from ukvm/ukvm.h rename to monitors/ukvm.h index 2a321c1d5..e402b7171 100644 --- a/ukvm/ukvm.h +++ b/monitors/ukvm.h @@ -27,6 +27,12 @@ struct ukvm_boot_info { uint64_t cmdline; /* Address of command line (C string) */ }; +/* On x86, I/O ports are used for hypercalls to ukvm. */ +static inline void outl(uint16_t port, uint32_t v) +{ + __asm__ __volatile__("outl %0,%1" : : "a" (v), "dN" (port)); +} + /* * We can only send 32 bits via ports, so sending pointers will only * work for 32-bit addresses. 
If we have unikernels with more than @@ -52,6 +58,7 @@ static inline uint32_t ukvm_ptr(volatile void *p) /* was UKVM_PORT_DBG_STACK 0x508 */ #define UKVM_PORT_POLL 0x509 +#define UKVM_PORT_TIME_INIT 0x50a /* * Guest-provided pointers in UKVM I/O operations MUST be declared with @@ -147,4 +154,13 @@ struct ukvm_poll { int ret; }; + +/* UKVM_PORT_TIME_INIT */ +struct ukvm_time_init { + /* OUT */ + uint64_t freq; + uint64_t rtc_boot; +}; + #endif + diff --git a/monitors/ukvm/ukvm-blk.c b/monitors/ukvm/ukvm-blk.c new file mode 100644 index 000000000..e69de29bb diff --git a/monitors/ukvm/ukvm-core.c b/monitors/ukvm/ukvm-core.c new file mode 100644 index 000000000..c36149dea --- /dev/null +++ b/monitors/ukvm/ukvm-core.c @@ -0,0 +1,546 @@ +/* Copyright (c) 2015, IBM + * Author(s): Dan Williams + * Ricardo Koller + * + * Permission to use, copy, modify, and/or distribute this software + * for any purpose with or without fee is hereby granted, provided + * that the above copyright notice and this permission notice appear + * in all copies. + * + * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL + * WARRANTIES WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED + * WARRANTIES OF MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE + * AUTHOR BE LIABLE FOR ANY SPECIAL, DIRECT, INDIRECT, OR + * CONSEQUENTIAL DAMAGES OR ANY DAMAGES WHATSOEVER RESULTING FROM LOSS + * OF USE, DATA OR PROFITS, WHETHER IN AN ACTION OF CONTRACT, + * NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF OR IN + * CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. 
+ */ + +/* We used several existing projects as guides + * kvmtest.c: http://lwn.net/Articles/658512/ + * lkvm: http://github.com/clearlinux/kvmtool + */ +#define _GNU_SOURCE +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include "../unikernel-monitor.h" +#include "../ukvm-private.h" +#include "../ukvm-cpu.h" + +#define KVM_32BIT_MAX_MEM_SIZE (1ULL << 32) +#define KVM_32BIT_GAP_SIZE (768 << 20) +#define KVM_32BIT_GAP_START (KVM_32BIT_MAX_MEM_SIZE - KVM_32BIT_GAP_SIZE) + +static struct platform platform; + +static ssize_t pread_in_full(int fd, void *buf, size_t count, off_t offset) +{ + ssize_t total = 0; + char *p = buf; + + if (count > SSIZE_MAX) { + errno = E2BIG; + return -1; + } + + lseek(fd, 0, SEEK_SET); + while (count > 0) { + ssize_t nr; + + lseek(fd, offset, SEEK_SET); + nr = read(fd, p, count); + if (nr == 0) + return total; + else if (nr == -1 && errno == EINTR) + continue; + else if (nr == -1) + return -1; + + count -= nr; + total += nr; + p += nr; + offset += nr; + } + + return total; +} + +/* + * Load code from elf file into *mem and return the elf entry point + * and the last byte of the program when loaded into memory. This + * accounts not only for the last loaded piece of code from the elf, + * but also for the zeroed out pieces that are not loaded and sould be + * reserved. + * + * Memory will look like this after the elf is loaded: + * + * *mem *p_entry *p_end + * | | | | + * | ... 
| .text .rodata | .data .bss | + * | | code | 00000000000 | + * | | [PROT_EXEC|READ] | | + * + */ +static void load_code(const char *file, uint8_t *mem, /* IN */ + uint64_t *p_entry, uint64_t *p_end) /* OUT */ +{ + int fd_kernel; + ssize_t numb; + size_t buflen; + Elf64_Off ph_off; + Elf64_Half ph_entsz; + Elf64_Half ph_cnt; + Elf64_Half ph_i; + Elf64_Phdr *phdr = NULL; + Elf64_Ehdr hdr; + + /* elf entry point (on physical memory) */ + *p_entry = 0; + /* highest byte of the program (on physical memory) */ + *p_end = 0; + + fd_kernel = open(file, O_RDONLY); + if (fd_kernel == -1) + goto out_error; + + numb = pread_in_full(fd_kernel, &hdr, sizeof(Elf64_Ehdr), 0); + if (numb < 0) + goto out_error; + if (numb != sizeof(Elf64_Ehdr)) + goto out_invalid; + + /* + * Validate program is in ELF64 format: + * 1. EI_MAG fields 0, 1, 2, 3 spell ELFMAG('0x7f', 'E', 'L', 'F'), + * 2. File contains 64-bit objects, + * 3. Objects are Executable, + * 4. Target instruction set architecture is set to x86_64. + */ + if (hdr.e_ident[EI_MAG0] != ELFMAG0 + || hdr.e_ident[EI_MAG1] != ELFMAG1 + || hdr.e_ident[EI_MAG2] != ELFMAG2 + || hdr.e_ident[EI_MAG3] != ELFMAG3 + || hdr.e_ident[EI_CLASS] != ELFCLASS64 + || hdr.e_type != ET_EXEC + || hdr.e_machine != EM_X86_64) + goto out_invalid; + + ph_off = hdr.e_phoff; + ph_entsz = hdr.e_phentsize; + ph_cnt = hdr.e_phnum; + buflen = ph_entsz * ph_cnt; + + phdr = (Elf64_Phdr *)malloc(buflen); + if (!phdr) + goto out_error; + numb = pread_in_full(fd_kernel, phdr, buflen, ph_off); + if (numb < 0) + goto out_error; + if (numb != buflen) + goto out_invalid; + + /* + * Load all segments with the LOAD directive from the elf file at offset + * p_offset, and copy that into p_addr in memory. The amount of bytes + * copied is p_filesz. However, each segment should be given + * p_memsz aligned up to p_align bytes on memory. 
+ */ + for (ph_i = 0; ph_i < ph_cnt; ph_i++) { + uint8_t *daddr; + uint64_t _end; + size_t offset = phdr[ph_i].p_offset; + size_t filesz = phdr[ph_i].p_filesz; + size_t memsz = phdr[ph_i].p_memsz; + uint64_t paddr = phdr[ph_i].p_paddr; + uint64_t align = phdr[ph_i].p_align; + uint64_t result; + + if (phdr[ph_i].p_type != PT_LOAD) + continue; + + if ((paddr >= GUEST_SIZE) || add_overflow(paddr, filesz, result) + || (result >= GUEST_SIZE)) + goto out_invalid; + if (add_overflow(paddr, memsz, result) || (result >= GUEST_SIZE)) + goto out_invalid; + /* + * Verify that align is a non-zero power of 2 and safely compute + * ((_end + (align - 1)) & -align). + */ + if (align > 0 && (align & (align - 1)) == 0) { + if (add_overflow(result, (align - 1), _end)) + goto out_invalid; + _end = _end & -align; + } else { + _end = result; + } + if (_end > *p_end) + *p_end = _end; + + daddr = p->mem + paddr; + numb = pread_in_full(fd_kernel, daddr, filesz, offset); + if (numb < 0) + goto out_error; + if (numb != filesz) + goto out_invalid; + memset(daddr + filesz, 0, memsz - filesz); + + /* Write-protect the executable segment */ + if (phdr[ph_i].p_flags & PF_X) { + if (mprotect(daddr, _end - paddr, PROT_EXEC | PROT_READ) == -1) + goto out_error; + } + } + + free(phdr); + close(fd_kernel); + *p_entry = hdr.e_entry; + return; + +out_error: + err(1, "%s", file); + +out_invalid: + errx(1, "%s: Exec format error", file); +} + +void platform_setup_system_64bit(struct platform *p, uint64_t cr0, + uint64_t cr4, uint64_t efer) +{ + struct kvm_sregs sregs; + int ret; + + ret = ioctl(p->vcpu, KVM_GET_SREGS, &sregs); + if (ret == -1) + err(1, "KVM: ioctl (GET_SREGS) failed"); + + sregs.cr0 = cr0; + sregs.cr4 = cr4; + sregs.efer = efer; + + ret = ioctl(p->vcpu, KVM_SET_SREGS, &sregs); + if (ret == -1) + err(1, "KVM: ioctl (SET_SREGS) failed"); +} + +void platform_setup_system_page_tables(struct platform *p, + uint64_t pml4) +{ + struct kvm_sregs sregs; + int ret; + + ret = ioctl(p->vcpu, 
KVM_GET_SREGS, &sregs); + if (ret == -1) + err(1, "KVM: ioctl (GET_SREGS) failed"); + + sregs.cr3 = pml4; + + ret = ioctl(p->vcpu, KVM_SET_SREGS, &sregs); + if (ret == -1) + err(1, "KVM: ioctl (SET_SREGS) failed"); +} + +void platform_setup_system_gdt(struct platform *p, + uint64_t cs_idx, + uint64_t ds_idx, + uint64_t off, + uint64_t limit) +{ + struct kvm_sregs sregs; + struct kvm_segment data_seg, code_seg; + int ret; + uint64_t *gdt = (uint64_t *) (p->mem + off); + + /* Set all cpu/mem system structures */ + ret = ioctl(p->vcpu, KVM_GET_SREGS, &sregs); + if (ret == -1) + err(1, "KVM: ioctl (GET_SREGS) failed"); + + sregs.gdt.base = off; + sregs.gdt.limit = limit; + + GDT_TO_KVM_SEGMENT(code_seg, gdt, cs_idx); + GDT_TO_KVM_SEGMENT(data_seg, gdt, ds_idx); + + sregs.cs = code_seg; + sregs.ds = data_seg; + sregs.es = data_seg; + sregs.fs = data_seg; + sregs.gs = data_seg; + sregs.ss = data_seg; + + ret = ioctl(p->vcpu, KVM_SET_SREGS, &sregs); + if (ret == -1) + err(1, "KVM: ioctl (SET_SREGS) failed"); +} + +static void setup_cpuid(int kvm, int vcpufd) +{ + struct kvm_cpuid2 *kvm_cpuid; + int max_entries = 100; + + kvm_cpuid = calloc(1, sizeof(*kvm_cpuid) + + max_entries * sizeof(*kvm_cpuid->entries)); + kvm_cpuid->nent = max_entries; + + if (ioctl(kvm, KVM_GET_SUPPORTED_CPUID, kvm_cpuid) < 0) + err(1, "KVM: ioctl (GET_SUPPORTED_CPUID) failed"); + + if (ioctl(vcpufd, KVM_SET_CPUID2, kvm_cpuid) < 0) + err(1, "KVM: ioctl (SET_CPUID2) failed"); +} + +int platform_run(struct platform *p) +{ + while (1) { + int ret; + + ret = ioctl(p->vcpu, KVM_RUN, NULL); + if (ret == -1 && errno == EINTR) + continue; + if (ret == -1) { + if (errno == EFAULT) { + struct kvm_regs regs; + ret = ioctl(p->vcpu, KVM_GET_REGS, ®s); + if (ret == -1) + err(1, "KVM: ioctl (GET_REGS) failed after guest fault"); + errx(1, "KVM: host/guest translation fault: rip=0x%llx", + regs.rip); + } else { + err(1, "KVM: ioctl (RUN) failed"); + } + } + + return 0; + } +} + +int platform_get_io_port(struct 
platform *p) +{ + struct kvm_run *run = (struct kvm_run *)p->priv; + + if (run->io.direction != KVM_EXIT_IO_OUT + || run->io.size != 4) + errx(1, "Invalid guest port access: port=0x%x", run->io.port); + + return run->io.port; +} + +uint64_t platform_get_io_data(struct platform *p) +{ + struct kvm_run *run = (struct kvm_run *)p->priv; + + assert(run->io.direction == KVM_EXIT_IO_OUT); + assert(run->io.size == 4); + + return GUEST_PIO32_TO_PADDR((uint8_t *)run + run->io.data_offset); +} + + +int platform_get_exit_reason(struct platform *p) +{ + struct kvm_run *run = (struct kvm_run *)p->priv; + + switch (run->exit_reason) { + case KVM_EXIT_HLT: + return EXIT_HLT; + + case KVM_EXIT_IO: + return EXIT_IO; + + case KVM_EXIT_INTR: + return EXIT_IGNORE; + + case KVM_EXIT_DEBUG: + return EXIT_DEBUG; + + case KVM_EXIT_FAIL_ENTRY: + errx(1, "KVM: entry failure: hw_entry_failure_reason=0x%llx", + run->fail_entry.hardware_entry_failure_reason); + + case KVM_EXIT_INTERNAL_ERROR: + errx(1, "KVM: internal error exit: suberror=0x%x", + run->internal.suberror); + + default: + errx(1, "KVM: unhandled exit: exit_reason=0x%x", run->exit_reason); + } +} + +int platform_init(struct platform **pdata_p) +{ + int kvm, ret, vmfd, vcpufd; + uint8_t *mem; + struct kvm_run *run; + size_t mmap_size; + + kvm = open("/dev/kvm", O_RDWR | O_CLOEXEC); + if (kvm == -1) + err(1, "Could not open: /dev/kvm"); + + /* Make sure we have the stable version of the API */ + ret = ioctl(kvm, KVM_GET_API_VERSION, NULL); + if (ret == -1) + err(1, "KVM: ioctl (GET_API_VERSION) failed"); + if (ret != 12) + errx(1, "KVM: API version is %d, ukvm requires version 12", ret); + + vmfd = ioctl(kvm, KVM_CREATE_VM, 0); + if (vmfd == -1) + err(1, "KVM: ioctl (CREATE_VM) failed"); + + /* + * TODO If the guest size is larger than ~4GB, we need two region + * slots: one before the pci gap, and one after it. 
+ * Reference: kvmtool x86/kvm.c:kvm__init_ram() + */ + assert(GUEST_SIZE < KVM_32BIT_GAP_START); + + /* Allocate GUEST_SIZE page-aligned guest memory. */ + mem = mmap(NULL, GUEST_SIZE, PROT_READ | PROT_WRITE, + MAP_SHARED | MAP_ANONYMOUS, -1, 0); + if (mem == MAP_FAILED) + err(1, "Error allocating guest memory"); + + struct kvm_userspace_memory_region region = { + .slot = 0, + .guest_phys_addr = 0, + .memory_size = GUEST_SIZE, + .userspace_addr = (uint64_t) mem, + }; + + ret = ioctl(vmfd, KVM_SET_USER_MEMORY_REGION, ®ion); + if (ret == -1) + err(1, "KVM: ioctl (SET_USER_MEMORY_REGION) failed"); + + + /* enabling this seems to mess up our receiving of hlt instructions */ + /* ret = ioctl(vmfd, KVM_CREATE_IRQCHIP); */ + /* if (ret == -1) */ + /* err(1, "KVM_CREATE_IRQCHIP"); */ + + vcpufd = ioctl(vmfd, KVM_CREATE_VCPU, 0); + if (vcpufd == -1) + err(1, "KVM: ioctl (CREATE_VCPU) failed"); + + /* Map the shared kvm_run structure and following data. */ + ret = ioctl(kvm, KVM_GET_VCPU_MMAP_SIZE, NULL); + if (ret == -1) + err(1, "KVM: ioctl (GET_VCPU_MMAP_SIZE) failed"); + mmap_size = ret; + if (mmap_size < sizeof(*run)) + errx(1, "KVM: invalid VCPU_MMAP_SIZE: %zd", mmap_size); + run = + mmap(NULL, mmap_size, PROT_READ | PROT_WRITE, MAP_SHARED, vcpufd, + 0); + if (run == MAP_FAILED) + err(1, "KVM: VCPU mmap failed"); + + setup_cpuid(kvm, vcpufd); + + platform.mem = mem; + platform.vcpu = vcpufd; + platform.priv = run; + + *pdata_p = &platform; + + return 0; +} + +void platform_setup_system(struct platform *p, uint64_t entry, + uint64_t boot_info) +{ + int ret; + /* + * Initialize registers: instruction pointer for our code, addends, + * and initial flags required by x86 architecture. 
+ * Arguments to the kernel main are passed using the x86_64 calling + * convention: RDI, RSI, RDX, RCX, R8, and R9 + */ + struct kvm_regs regs = { + .rip = entry, + .rflags = 0x2, + .rsp = GUEST_SIZE - 8, /* x86_64 ABI requires ((rsp + 8) % 16) == 0 */ + .rdi = boot_info, /* size arg in kernel main */ + }; + ret = ioctl(p->vcpu, KVM_SET_REGS, ®s); + if (ret == -1) + err(1, "KVM: ioctl (SET_REGS) failed"); +} + +void platform_cleanup(struct platform *p) +{ + /* XXX */ +} + +void platform_advance_rip(struct platform *p) +{ + /* no-op: KVM automatically advances RIP after I/O */ +} + +/* XXX this is horrible */ +static uint64_t get_tsc_const(void) +{ + FILE *f = fopen("/proc/cpuinfo", "r"); + uint64_t mhz = 0, dec = 0; + int ret = 0; + + assert(f != NULL); + + while (ret == 0) { + ret = fscanf(f, "cpu MHz\t: %lu.%lu\n", &mhz, &dec); + if (ret == 0) { + while (fgetc(f) != '\n') + fgetc(f); + } + } + return (mhz * 1000000) + (dec * 1000); +} + +void platform_init_time(uint64_t *freq) +{ + *freq = get_tsc_const(); +} + +uint64_t platform_get_exec_time(struct platform *p) +{ + printf("unimplemented"); + assert(0); + return 0; +} + +void platform_emul_rdtsc(struct platform *p, uint64_t new_tsc) +{ + printf("unimplemented"); + assert(0); +} + +void platform_get_timestamp(uint64_t *s, uint64_t *ns) +{ + struct timespec tp; + + clock_gettime(CLOCK_REALTIME, &tp); + *s = tp.tv_sec; + *ns = tp.tv_nsec; +} diff --git a/monitors/ukvm/ukvm-gdb.c b/monitors/ukvm/ukvm-gdb.c new file mode 100644 index 000000000..766a48e5e --- /dev/null +++ b/monitors/ukvm/ukvm-gdb.c @@ -0,0 +1,81 @@ +#include +#include +#include +#include +#include +#include + +#include +#include +#include +#include +#include +#include +#include + +#include + +#include "../ukvm-private.h" +#include "../ukvm-modules.h" +#include "../ukvm-cpu.h" +#include "../ukvm.h" +#include "../unikernel-monitor.h" + +int platform_get_regs(struct platform *p, long *registers) +{ + struct kvm_regs regs; + int ret; + + ret = 
ioctl(p->vcpu, KVM_GET_REGS, ®s); + if (ret == -1) + err(1, "KVM_GET_REGS"); + + registers[RAX] = regs.rax; + registers[RBX] = regs.rbx; + registers[RCX] = regs.rcx; + registers[RDX] = regs.rdx; + + registers[RSI] = regs.rsi; + registers[RDI] = regs.rdi; + registers[RBP] = regs.rbp; + registers[RSP] = regs.rsp; + + registers[R8] = regs.r8; + registers[R9] = regs.r9; + registers[R10] = regs.r10; + registers[R11] = regs.r11; + registers[R12] = regs.r12; + registers[R13] = regs.r13; + registers[R14] = regs.r14; + registers[R15] = regs.r15; + + registers[RIP] = regs.rip; + registers[EFLAGS] = regs.rflags; + + /* TODO what about others like cs and ss? */ + return 0; +} + +uint64_t platform_get_rip(struct platform *p) +{ + struct kvm_run *run = (struct kvm_run *)p->priv; + struct kvm_debug_exit_arch *arch_info; + + arch_info = &run->debug.arch; + return arch_info->pc; +} + +int platform_enable_debug(struct platform *p) +{ + /* TODO check if we have the KVM_CAP_SET_GUEST_DEBUG capbility */ + struct kvm_guest_debug debug = { + .control = KVM_GUESTDBG_ENABLE | KVM_GUESTDBG_SINGLESTEP, + }; + + if (ioctl(p->vcpu, KVM_SET_GUEST_DEBUG, &debug) < 0) { + printf("KVM_SET_GUEST_DEBUG failed"); + return -1; + } + + return 0; +} diff --git a/ukvm/ukvm-net.c b/monitors/ukvm/ukvm-net.c similarity index 95% rename from ukvm/ukvm-net.c rename to monitors/ukvm/ukvm-net.c index 60121b14e..20dd563dd 100644 --- a/ukvm/ukvm-net.c +++ b/monitors/ukvm/ukvm-net.c @@ -38,9 +38,9 @@ #include #include -#include "ukvm-private.h" -#include "ukvm-modules.h" -#include "ukvm.h" +#include "../ukvm-private.h" +#include "../ukvm-modules.h" +#include "../ukvm.h" static char *netiface; static int netfd; @@ -104,8 +104,8 @@ static int tap_attach(const char *dev) } /* * If we got back a different device than the one requested, e.g. because - * the caller mistakenly passed in '%d' (yes, that's really in the Linux API) - * then fail. 
+ * the caller mistakenly passed in '%d' (yes, that's really in the Linux + * API) then fail. */ if (strncmp(ifr.ifr_name, dev, IFNAMSIZ) != 0) { close(fd); @@ -170,8 +170,10 @@ static void ukvm_port_netread(uint8_t *mem, uint64_t paddr) rd->ret = 0; } -static int handle_exit(struct kvm_run *run, int vcpufd, uint8_t *mem) +static int handle_exit(struct platform *p) { + struct kvm_run *run = (struct kvm_run *)p->priv; + if ((run->exit_reason != KVM_EXIT_IO) || (run->io.direction != KVM_EXIT_IO_OUT) || (run->io.size != 4)) @@ -182,13 +184,13 @@ static int handle_exit(struct kvm_run *run, int vcpufd, uint8_t *mem) switch (run->io.port) { case UKVM_PORT_NETINFO: - ukvm_port_netinfo(mem, paddr); + ukvm_port_netinfo(p->mem, paddr); break; case UKVM_PORT_NETWRITE: - ukvm_port_netwrite(mem, paddr); + ukvm_port_netwrite(p->mem, paddr); break; case UKVM_PORT_NETREAD: - ukvm_port_netread(mem, paddr); + ukvm_port_netread(p->mem, paddr); break; default: return -1; @@ -220,7 +222,7 @@ static int handle_cmdarg(char *cmdarg) } } -static int setup(int vcpufd, uint8_t *mem) +static int setup(struct platform *p) { if (netiface == NULL) return -1; diff --git a/monitors/unikernel-monitor.h b/monitors/unikernel-monitor.h new file mode 100644 index 000000000..857c002a6 --- /dev/null +++ b/monitors/unikernel-monitor.h @@ -0,0 +1,86 @@ +#ifndef __UNIKERNEL_MONITOR_H__ +#define __UNIKERNEL_MONITOR_H__ + + +enum { + EXIT_HLT, + EXIT_IO, + + EXIT_RDTSC, + EXIT_CPUID, + EXIT_RDRAND, + + EXIT_DEBUG, + EXIT_IGNORE, + EXIT_FAIL, +}; + + +#ifdef __APPLE__ +#include +typedef hv_vcpuid_t platform_vcpu_t; +#else +typedef uint64_t platform_vcpu_t; +#endif + +struct platform { + platform_vcpu_t vcpu; + uint8_t *mem; + void *priv; +}; + +/* in /-core.c */ +int platform_init(struct platform **p); + +void platform_load_code(struct platform *p, const char *file, /* IN */ + uint64_t *p_entry, uint64_t *p_end); /* OUT */ + +void platform_setup_system_64bit(struct platform *p, uint64_t cr0, + uint64_t cr4, 
uint64_t efer); +void platform_setup_system_page_tables(struct platform *p, uint64_t pml4); +void platform_setup_system_gdt(struct platform *p, + uint64_t cs_idx, uint64_t ds_idx, + uint64_t off, uint64_t limit); +void platform_setup_system(struct platform *p, uint64_t entry, + uint64_t boot_info); + +int platform_run(struct platform *p); +int platform_get_exit_reason(struct platform *p); +int platform_get_io_port(struct platform *p); +uint64_t platform_get_io_data(struct platform *p); + +void platform_advance_rip(struct platform *p); +void platform_cleanup(struct platform *p); + +void platform_init_time(uint64_t *freq); +void platform_get_timestamp(uint64_t *s, uint64_t *ns); +uint64_t platform_get_exec_time(struct platform *p); +void platform_emul_rdtsc(struct platform *p, uint64_t new_tsc); +void platform_emul_rdrand(struct platform *p, uint64_t r); + +uint64_t platform_get_reg(struct platform *p, int reg); +void platform_set_reg(struct platform *p, int reg, uint64_t val); + + +/* in /-gdb.c */ +int platform_enable_debug(struct platform *p); +uint64_t platform_get_rip(struct platform *p); +int platform_get_regs(struct platform *p, long *reg); + +/* XXX this doesn't belong here (for gdb) */ +/* Number of registers. */ +#define NUMREGS 32 +/* Number of bytes of registers. 
*/ +#define NUMREGBYTES (NUMREGS * 8) +/* list is here: gdb/amd64-linux-nat.c */ +enum regnames { + RAX, RBX, RCX, RDX, + RSI, RDI, RBP, RSP, + R8, R9, R10, R11, + R12, R13, R14, R15, + RIP, EFLAGS, CS, SS, + DS, ES, FS, GS +}; + +#endif + diff --git a/solo5-kernel-ukvm.opam b/solo5-kernel-ukvm.opam index 1d3e3ce1a..eaaef96ec 100644 --- a/solo5-kernel-ukvm.opam +++ b/solo5-kernel-ukvm.opam @@ -27,5 +27,5 @@ depexts: [ ] available: [ - ocaml-version >= "4.02.3" & arch = "x86_64" & os = "linux" + ocaml-version >= "4.02.3" & arch = "x86_64" ] diff --git a/solo5-kernel-ukvm.pc.in b/solo5-kernel-ukvm.pc.in index 690ce13be..b770154d2 100644 --- a/solo5-kernel-ukvm.pc.in +++ b/solo5-kernel-ukvm.pc.in @@ -2,7 +2,8 @@ prefix=${pcfiledir}/../.. exec_prefix=${prefix} includedir=${prefix}/include/solo5-kernel-ukvm/include libdir=${exec_prefix}/lib/solo5-kernel-ukvm -ldflags=!LDFLAGS! -T ${libdir}/solo5.lds ${libdir}/solo5.o +#ldflags=!LDFLAGS! -T ${libdir}/solo5.lds ${libdir}/solo5.o +ldflags=!LDFLAGS! -e __start -segalign 1000 -seg1addr -fff00000 ${libdir}/solo5.o Name: solo5-kernel-ukvm Version: 0.1 diff --git a/tests/Makefile b/tests/Makefile index be226fca3..27caba89b 100644 --- a/tests/Makefile +++ b/tests/Makefile @@ -16,7 +16,7 @@ # NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF OR IN # CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. 
-TESTDIRS=test_hello test_globals test_ping_serve test_blk test_exception test_fpu +TESTDIRS=test_hello test_globals test_ping_serve test_blk test_exception test_fpu test_rdtsc UKVM_TESTS=$(subst test, _test_ukvm, $(TESTDIRS)) VIRTIO_TESTS=$(subst test, _test_virtio, $(TESTDIRS)) diff --git a/tests/Makefile.tests b/tests/Makefile.tests index 3aca2ee87..10bd9367b 100644 --- a/tests/Makefile.tests +++ b/tests/Makefile.tests @@ -26,7 +26,7 @@ ukvm: $(UKVM_TARGETS) virtio: $(VIRTIO_TARGETS) SOLO5_DIR=$(TOP)/kernel -UKVM_SRC=$(TOP)/ukvm +UKVM_SRC=$(TOP)/monitors CFLAGS+=-I$(SOLO5_DIR) @@ -39,10 +39,10 @@ Makefile.ukvm: $(UKVM_SRC)/ukvm-configure $(UKVM_SRC)/ukvm-configure $(UKVM_SRC) $(UKVM_MODULES) -include Makefile.ukvm +# $(LD) -T $(SOLO5_DIR)/ukvm/solo5.lds \ %.ukvm: %.o $(SOLO5_DIR)/ukvm/solo5.lds $(SOLO5_DIR)/ukvm/solo5.o - $(LD) -T $(SOLO5_DIR)/ukvm/solo5.lds \ - $(LDFLAGS) -o $@ $(SOLO5_DIR)/ukvm/solo5.o $< $(LDLIBS) + $(LD) -e __start -segalign 1000 -seg1addr -fff00000 $(LDFLAGS) -o $@ $(SOLO5_DIR)/ukvm/solo5.o $< $(LDLIBS) $(SOLO5_DIR)/ukvm/solo5.o: $(MAKE) -C $(SOLO5_DIR) ukvm diff --git a/tests/test_blk/test_blk.c b/tests/test_blk/test_blk.c index db50a7739..bd37ad0c8 100644 --- a/tests/test_blk/test_blk.c +++ b/tests/test_blk/test_blk.c @@ -19,7 +19,7 @@ */ #include "solo5.h" -#include "../../kernel/lib.c" +//#include "../../kernel/lib.c" static void puts(const char *s) { diff --git a/tests/test_exception/test_exception.c b/tests/test_exception/test_exception.c index 07d84260e..939317cf2 100644 --- a/tests/test_exception/test_exception.c +++ b/tests/test_exception/test_exception.c @@ -19,7 +19,7 @@ */ #include "solo5.h" -#include "../../kernel/lib.c" +//#include "../../kernel/lib.c" static void puts(const char *s) { diff --git a/tests/test_fpu/test_fpu.c b/tests/test_fpu/test_fpu.c index 2f21ffc78..e39cfdace 100644 --- a/tests/test_fpu/test_fpu.c +++ b/tests/test_fpu/test_fpu.c @@ -19,7 +19,7 @@ */ #include "solo5.h" -#include "../../kernel/lib.c" 
+//#include "../../kernel/lib.c" static void puts(const char *s) { diff --git a/tests/test_globals/test_globals.c b/tests/test_globals/test_globals.c index e782363eb..6d480c29c 100644 --- a/tests/test_globals/test_globals.c +++ b/tests/test_globals/test_globals.c @@ -19,7 +19,7 @@ */ #include "solo5.h" -#include "../../kernel/lib.c" +//#include "../../kernel/lib.c" static void puts(const char *s) { diff --git a/tests/test_hello/test_hello.c b/tests/test_hello/test_hello.c index 6a7638d44..a5adde5fe 100644 --- a/tests/test_hello/test_hello.c +++ b/tests/test_hello/test_hello.c @@ -19,7 +19,7 @@ */ #include "solo5.h" -#include "../../kernel/lib.c" +//#include "../../kernel/lib.c" static void puts(const char *s) { diff --git a/tests/test_ping_serve/test_ping_serve.c b/tests/test_ping_serve/test_ping_serve.c index edfb81b43..ead520b89 100644 --- a/tests/test_ping_serve/test_ping_serve.c +++ b/tests/test_ping_serve/test_ping_serve.c @@ -19,7 +19,7 @@ */ #include "solo5.h" -#include "../../kernel/lib.c" +//#include "../../kernel/lib.c" static void puts(const char *s) { diff --git a/tests/test_rdtsc/Makefile b/tests/test_rdtsc/Makefile new file mode 100644 index 000000000..9017df130 --- /dev/null +++ b/tests/test_rdtsc/Makefile @@ -0,0 +1,23 @@ +# Copyright (c) 2015-2017 Contributors as noted in the AUTHORS file +# +# This file is part of Solo5, a unikernel base layer. +# +# Permission to use, copy, modify, and/or distribute this software +# for any purpose with or without fee is hereby granted, provided +# that the above copyright notice and this permission notice appear +# in all copies. +# +# THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL +# WARRANTIES WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED +# WARRANTIES OF MERCHANTABILITY AND FITNESS. 
IN NO EVENT SHALL THE +# AUTHOR BE LIABLE FOR ANY SPECIAL, DIRECT, INDIRECT, OR +# CONSEQUENTIAL DAMAGES OR ANY DAMAGES WHATSOEVER RESULTING FROM LOSS +# OF USE, DATA OR PROFITS, WHETHER IN AN ACTION OF CONTRACT, +# NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF OR IN +# CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. + +UKVM_TARGETS=test_rdtsc.ukvm ukvm-bin +VIRTIO_TARGETS=test_rdtsc.virtio +UKVM_MODULES= + +include ../Makefile.tests diff --git a/tests/test_rdtsc/test_rdtsc.c b/tests/test_rdtsc/test_rdtsc.c new file mode 100644 index 000000000..abc6d11a1 --- /dev/null +++ b/tests/test_rdtsc/test_rdtsc.c @@ -0,0 +1,71 @@ +/* + * Copyright (c) 2015-2017 Contributors as noted in the AUTHORS file + * + * This file is part of Solo5, a unikernel base layer. + * + * Permission to use, copy, modify, and/or distribute this software + * for any purpose with or without fee is hereby granted, provided + * that the above copyright notice and this permission notice appear + * in all copies. + * + * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL + * WARRANTIES WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED + * WARRANTIES OF MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE + * AUTHOR BE LIABLE FOR ANY SPECIAL, DIRECT, INDIRECT, OR + * CONSEQUENTIAL DAMAGES OR ANY DAMAGES WHATSOEVER RESULTING FROM LOSS + * OF USE, DATA OR PROFITS, WHETHER IN AN ACTION OF CONTRACT, + * NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF OR IN + * CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. 
+ */ + +#include "solo5.h" + +static const char *digits = "0123456789abcdef"; + +#define STRLEN 16 +static char strbuf[STRLEN + 1]; + +void hex_to_str(char *target, uint64_t val) { + int i; + for (i = 0; i < STRLEN; i++) { + int digit = val & 0xf; + target[STRLEN - i - 1] = digits[digit]; + val = val >> 4; + } + target[STRLEN] = '\n'; + /* No NUL terminator is written: the caller passes the length (STRLEN + 1) explicitly. */ +} + +static void puts(const char *s) +{ + solo5_console_write(s, strlen(s)); +} + +static inline uint64_t cpu_rdtsc(void) +{ + uint32_t edx_, eax_; + /* volatile: without it the compiler may merge or hoist the asm, so repeated reads could return one cached value. */ + __asm__ __volatile__("rdtsc" : "=a" (eax_), "=d" (edx_)); + return (uint64_t)eax_ + ((uint64_t)edx_ << 32); +} + +/* Number of back-to-back rdtsc reads performed by the test loop. */ +#define RDTSC_LOOPS 1000000 + +int solo5_app_main(char *cmdline __attribute__((unused))) +{ + volatile uint64_t start, end; + int i; + + puts("\n**** Solo5 standalone test_rdtsc ****\n\n"); + + start = cpu_rdtsc(); + for (i = 0; i < RDTSC_LOOPS; i++) { + end = cpu_rdtsc(); + } + puts("rdtsc diff: "); + hex_to_str(strbuf, end - start); + solo5_console_write(strbuf, STRLEN + 1); + + return 0; +} diff --git a/tools/run/solo5-run-virtio.sh b/tools/run/solo5-run-virtio.sh index 26574aa7b..d807901c9 100755 --- a/tools/run/solo5-run-virtio.sh +++ b/tools/run/solo5-run-virtio.sh @@ -17,6 +17,8 @@ # NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF OR IN # CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. +READLINK=${READLINK:-readlink} + usage () { cat <&2 @@ -44,6 +46,11 @@ die () exit 1 } +have_readlink_f () +{ + ${READLINK:-readlink} -f . > /dev/null 2>&1 +} + hv_addargs () { if [ -z "${HVCMD}" ]; then @@ -58,6 +65,9 @@ is_quiet () [ -n "${QUIET}" ] } +have_readlink_f || die "Please set the env var READLINK to a" \ + "'readlink' program that supports -f" + # Parse command line arguments. ARGS=$(getopt d:m:n:qH: $*) [ $? 
-ne 0 ] && usage @@ -70,7 +80,7 @@ QUIET= while true; do case "$1" in -d) - BLKIMG=$(readlink -f $2) + BLKIMG=$(${READLINK} -f $2) [ -f ${BLKIMG} ] || die "not found: ${BLKIMG}" shift; shift ;; @@ -104,7 +114,7 @@ while true; do done [ $# -lt 1 ] && usage -UNIKERNEL=$(readlink -f $1) +UNIKERNEL=$(${READLINK} -f $1) [ -n "${UNIKERNEL}" -a -f "${UNIKERNEL}" ] || die "not found: $1}" shift VMNAME=vm$$ @@ -126,6 +136,9 @@ if [ "${HV}" = "best" ]; then type grub-bhyve >/dev/null 2>&1 \ || die "Please install grub-bhyve from ports" ;; + Darwin) + HV=qemu + ;; *) die "unsupported os: ${SYS}" ;; diff --git a/ukvm/ukvm-configure b/ukvm/ukvm-configure deleted file mode 100755 index cacae0a72..000000000 --- a/ukvm/ukvm-configure +++ /dev/null @@ -1,69 +0,0 @@ -#!/bin/bash -# Copyright (c) 2015-2017 Contributors as noted in the AUTHORS file -# -# This file is part of ukvm, a unikernel monitor. -# -# Permission to use, copy, modify, and/or distribute this software -# for any purpose with or without fee is hereby granted, provided -# that the above copyright notice and this permission notice appear -# in all copies. -# -# THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL -# WARRANTIES WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED -# WARRANTIES OF MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE -# AUTHOR BE LIABLE FOR ANY SPECIAL, DIRECT, INDIRECT, OR -# CONSEQUENTIAL DAMAGES OR ANY DAMAGES WHATSOEVER RESULTING FROM LOSS -# OF USE, DATA OR PROFITS, WHETHER IN AN ACTION OF CONTRACT, -# NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF OR IN -# CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. - -if [ "$#" -lt 1 ]; then - echo "Usage: ukvm-configure UKVM_SRC [MODULES]" - echo " UKVM_SRC is /path/to/ukvm" - echo " MODULES can be any combination of: net blk gdb" - exit 1 -fi - -UKVM_SRC=`readlink -f $1` -if [ ! -d ${UKVM_SRC} -o ! 
-f ${UKVM_SRC}/ukvm-core.c ]; then - echo "Error: Not a ukvm source directory: ${UKVM_SRC}" 1>&2 - exit 1 -fi -shift -UKVM_MODULES=$@ - -cat < Makefile.ukvm -# Generated by ukvm-configure $@ - -UKVM_MODULE_OBJS=\$(addsuffix .o,\$(addprefix _build-ukvm/ukvm-,${UKVM_MODULES})) -UKVM_MODULE_FLAGS=\$(addprefix -DUKVM_MODULE_,\$(shell echo ${UKVM_MODULES}| tr '[:lower:]' '[:upper:]')) - -UKVM_CC?=cc -UKVM_FLAGS=-D__UKVM_HOST__ \$(UKVM_MODULE_FLAGS) -UKVM_CFLAGS=-Wall -Werror -std=c99 -O2 -g \$(UKVM_FLAGS) -UKVM_OBJS=_build-ukvm/ukvm-core.o \$(UKVM_MODULE_OBJS) -ifdef UKVM_STATIC -UKVM_LDFLAGS=-static -endif -UKVM_HEADERS= \\ -$UKVM_SRC/ukvm-private.h \\ -$UKVM_SRC/ukvm-modules.h \\ -$UKVM_SRC/ukvm-cpu.h \\ -$UKVM_SRC/ukvm.h - -_build-ukvm: - mkdir -p _build-ukvm - -_build-ukvm/ukvm-%.o: $UKVM_SRC/ukvm-%.c \$(MAKEFILE_LIST) | _build-ukvm - \$(UKVM_CC) \$(UKVM_CFLAGS) -c \$< -o \$@ - -ukvm-bin: \$(UKVM_OBJS) \$(UKVM_HEADERS) \$(MAKEFILE_LIST) - \$(UKVM_CC) \$(UKVM_LDFLAGS) -o \$@ \$(UKVM_CFLAGS) \$(UKVM_OBJS) - -.PHONY: ukvm-clean -ukvm-clean: - \$(RM) -r _build-ukvm - \$(RM) ukvm-bin - -EOF - diff --git a/ukvm/ukvm-core.c b/ukvm/ukvm-core.c deleted file mode 100644 index 0c2aebf6b..000000000 --- a/ukvm/ukvm-core.c +++ /dev/null @@ -1,714 +0,0 @@ -/* - * Copyright (c) 2015-2017 Contributors as noted in the AUTHORS file - * - * This file is part of ukvm, a unikernel monitor. - * - * Permission to use, copy, modify, and/or distribute this software - * for any purpose with or without fee is hereby granted, provided - * that the above copyright notice and this permission notice appear - * in all copies. - * - * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL - * WARRANTIES WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED - * WARRANTIES OF MERCHANTABILITY AND FITNESS. 
IN NO EVENT SHALL THE - * AUTHOR BE LIABLE FOR ANY SPECIAL, DIRECT, INDIRECT, OR - * CONSEQUENTIAL DAMAGES OR ANY DAMAGES WHATSOEVER RESULTING FROM LOSS - * OF USE, DATA OR PROFITS, WHETHER IN AN ACTION OF CONTRACT, - * NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF OR IN - * CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. - */ - -/* We used several existing projects as guides - * kvmtest.c: http://lwn.net/Articles/658512/ - * lkvm: http://github.com/clearlinux/kvmtool - */ -#define _GNU_SOURCE -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include - -#include "ukvm-private.h" -#include "ukvm-modules.h" -#include "ukvm-cpu.h" -#include "ukvm.h" - -struct ukvm_module *modules[] = { -#ifdef UKVM_MODULE_BLK - &ukvm_blk, -#endif -#ifdef UKVM_MODULE_NET - &ukvm_net, -#endif -#ifdef UKVM_MODULE_GDB - &ukvm_gdb, -#endif - NULL, -}; -#define NUM_MODULES ((sizeof(modules) / sizeof(struct ukvm_module *)) - 1) - -/* - * Memory map: - * - * 0x100000 loaded elf file (linker script dictates location) - * ######## unused - * 0x013000 - * 0x012000 bootstrap pde - * 0x011000 bootstrap pdpte - * 0x010000 bootstrap pml4 - * ######## command line arguments - * 0x002000 ukvm_boot_info - * 0x001000 bootstrap gdt (contains correct code/data/ but tss points to 0) - */ - -#define BOOT_GDT 0x1000 -#define BOOT_INFO 0x2000 -#define BOOT_PML4 0x10000 -#define BOOT_PDPTE 0x11000 -#define BOOT_PDE 0x12000 - -#define BOOT_GDT_NULL 0 -#define BOOT_GDT_CODE 1 -#define BOOT_GDT_DATA 2 -#define BOOT_GDT_MAX 3 - -#define KVM_32BIT_MAX_MEM_SIZE (1ULL << 32) -#define KVM_32BIT_GAP_SIZE (768 << 20) -#define KVM_32BIT_GAP_START (KVM_32BIT_MAX_MEM_SIZE - KVM_32BIT_GAP_SIZE) - -void setup_boot_info(uint8_t *mem, - uint64_t size, - uint64_t kernel_end, - int argc, char **argv) -{ - struct ukvm_boot_info *bi = (struct ukvm_boot_info *)(mem + 
BOOT_INFO); - uint64_t cmdline = BOOT_INFO + sizeof(struct ukvm_boot_info); - size_t cmdline_free = BOOT_PML4 - cmdline - 1; - char *cmdline_p = (char *)(mem + cmdline); - - bi->mem_size = size; - bi->kernel_end = kernel_end; - bi->cmdline = cmdline; - cmdline_p[0] = 0; - - for (; *argv; argc--, argv++) { - size_t alen = snprintf(cmdline_p, cmdline_free, "%s%s", *argv, - (argc > 1) ? " " : ""); - if (alen >= cmdline_free) { - warnx("command line too long, truncated"); - break; - } - cmdline_free -= alen; - cmdline_p += alen; - } - -} - -ssize_t pread_in_full(int fd, void *buf, size_t count, off_t offset) -{ - ssize_t total = 0; - char *p = buf; - - if (count > SSIZE_MAX) { - errno = E2BIG; - return -1; - } - - while (count > 0) { - ssize_t nr; - - nr = pread(fd, p, count, offset); - if (nr == 0) - return total; - else if (nr == -1 && errno == EINTR) - continue; - else if (nr == -1) - return -1; - - count -= nr; - total += nr; - p += nr; - offset += nr; - } - - return total; -} - - -/* - * Load code from elf file into *mem and return the elf entry point - * and the last byte of the program when loaded into memory. This - * accounts not only for the last loaded piece of code from the elf, - * but also for the zeroed out pieces that are not loaded and sould be - * reserved. - * - * Memory will look like this after the elf is loaded: - * - * *mem *p_entry *p_end - * | | | | - * | ... 
| .text .rodata | .data .bss | - * | | code | 00000000000 | - * | | [PROT_EXEC|READ] | | - * - */ -static void load_code(const char *file, uint8_t *mem, /* IN */ - uint64_t *p_entry, uint64_t *p_end) /* OUT */ -{ - int fd_kernel; - ssize_t numb; - size_t buflen; - Elf64_Off ph_off; - Elf64_Half ph_entsz; - Elf64_Half ph_cnt; - Elf64_Half ph_i; - Elf64_Phdr *phdr = NULL; - Elf64_Ehdr hdr; - - /* elf entry point (on physical memory) */ - *p_entry = 0; - /* highest byte of the program (on physical memory) */ - *p_end = 0; - - fd_kernel = open(file, O_RDONLY); - if (fd_kernel == -1) - goto out_error; - - numb = pread_in_full(fd_kernel, &hdr, sizeof(Elf64_Ehdr), 0); - if (numb < 0) - goto out_error; - if (numb != sizeof(Elf64_Ehdr)) - goto out_invalid; - - /* - * Validate program is in ELF64 format: - * 1. EI_MAG fields 0, 1, 2, 3 spell ELFMAG('0x7f', 'E', 'L', 'F'), - * 2. File contains 64-bit objects, - * 3. Objects are Executable, - * 4. Target instruction set architecture is set to x86_64. - */ - if (hdr.e_ident[EI_MAG0] != ELFMAG0 - || hdr.e_ident[EI_MAG1] != ELFMAG1 - || hdr.e_ident[EI_MAG2] != ELFMAG2 - || hdr.e_ident[EI_MAG3] != ELFMAG3 - || hdr.e_ident[EI_CLASS] != ELFCLASS64 - || hdr.e_type != ET_EXEC - || hdr.e_machine != EM_X86_64) - goto out_invalid; - - ph_off = hdr.e_phoff; - ph_entsz = hdr.e_phentsize; - ph_cnt = hdr.e_phnum; - buflen = ph_entsz * ph_cnt; - - phdr = malloc(buflen); - if (!phdr) - goto out_error; - numb = pread_in_full(fd_kernel, phdr, buflen, ph_off); - if (numb < 0) - goto out_error; - if (numb != buflen) - goto out_invalid; - - /* - * Load all segments with the LOAD directive from the elf file at offset - * p_offset, and copy that into p_addr in memory. The amount of bytes - * copied is p_filesz. However, each segment should be given - * p_memsz aligned up to p_align bytes on memory. 
- */ - for (ph_i = 0; ph_i < ph_cnt; ph_i++) { - uint8_t *daddr; - uint64_t _end; - size_t offset = phdr[ph_i].p_offset; - size_t filesz = phdr[ph_i].p_filesz; - size_t memsz = phdr[ph_i].p_memsz; - uint64_t paddr = phdr[ph_i].p_paddr; - uint64_t align = phdr[ph_i].p_align; - uint64_t result; - - if (phdr[ph_i].p_type != PT_LOAD) - continue; - - if ((paddr >= GUEST_SIZE) || add_overflow(paddr, filesz, result) - || (result >= GUEST_SIZE)) - goto out_invalid; - if (add_overflow(paddr, memsz, result) || (result >= GUEST_SIZE)) - goto out_invalid; - /* - * Verify that align is a non-zero power of 2 and safely compute - * ((_end + (align - 1)) & -align). - */ - if (align > 0 && (align & (align - 1)) == 0) { - if (add_overflow(result, (align - 1), _end)) - goto out_invalid; - _end = _end & -align; - } - else { - _end = result; - } - if (_end > *p_end) - *p_end = _end; - - daddr = mem + paddr; - numb = pread_in_full(fd_kernel, daddr, filesz, offset); - if (numb < 0) - goto out_error; - if (numb != filesz) - goto out_invalid; - memset(daddr + filesz, 0, memsz - filesz); - - /* Write-protect the executable segment */ - if (phdr[ph_i].p_flags & PF_X) { - if (mprotect(daddr, _end - paddr, PROT_EXEC | PROT_READ) == -1) - goto out_error; - } - } - - free (phdr); - close (fd_kernel); - *p_entry = hdr.e_entry; - return; - -out_error: - err(1, "%s", file); - -out_invalid: - errx(1, "%s: Exec format error", file); -} - - -static void setup_system_64bit(struct kvm_sregs *sregs) -{ - sregs->cr0 |= X86_CR0_PE; - sregs->efer |= EFER_LME; -} - -static void setup_system_sse(struct kvm_sregs *sregs) -{ - sregs->cr0 &= ~X86_CR0_EM; - sregs->cr0 |= X86_CR0_MP; - sregs->cr4 |= X86_CR4_OSFXSR; - sregs->cr4 |= X86_CR4_OSXMMEXCPT; -} - -static void setup_system_page_tables(struct kvm_sregs *sregs, uint8_t *mem) -{ - uint64_t *pml4 = (uint64_t *) (mem + BOOT_PML4); - uint64_t *pdpte = (uint64_t *) (mem + BOOT_PDPTE); - uint64_t *pde = (uint64_t *) (mem + BOOT_PDE); - uint64_t paddr; - - /* - * 
For simplicity we currently use 2MB pages and only a single - * PML4/PDPTE/PDE. Sanity check that the guest size is a multiple of the - * page size and will fit in a single PDE (512 entries). - */ - assert((GUEST_SIZE & (GUEST_PAGE_SIZE - 1)) == 0); - assert(GUEST_SIZE <= (GUEST_PAGE_SIZE * 512)); - - memset(pml4, 0, 4096); - memset(pdpte, 0, 4096); - memset(pde, 0, 4096); - - *pml4 = BOOT_PDPTE | (X86_PDPT_P | X86_PDPT_RW); - *pdpte = BOOT_PDE | (X86_PDPT_P | X86_PDPT_RW); - for (paddr = 0; paddr < GUEST_SIZE; paddr += GUEST_PAGE_SIZE, pde++) - *pde = paddr | (X86_PDPT_P | X86_PDPT_RW | X86_PDPT_PS); - - sregs->cr3 = BOOT_PML4; - sregs->cr4 |= X86_CR4_PAE; - sregs->cr0 |= X86_CR0_PG; -} - -static void setup_system_gdt(struct kvm_sregs *sregs, - uint8_t *mem, - uint64_t off) -{ - uint64_t *gdt = (uint64_t *) (mem + off); - struct kvm_segment data_seg, code_seg; - - /* flags, base, limit */ - gdt[BOOT_GDT_NULL] = GDT_ENTRY(0, 0, 0); - gdt[BOOT_GDT_CODE] = GDT_ENTRY(0xA09B, 0, 0xFFFFF); - gdt[BOOT_GDT_DATA] = GDT_ENTRY(0xC093, 0, 0xFFFFF); - - sregs->gdt.base = off; - sregs->gdt.limit = (sizeof(uint64_t) * BOOT_GDT_MAX) - 1; - - GDT_TO_KVM_SEGMENT(code_seg, gdt, BOOT_GDT_CODE); - GDT_TO_KVM_SEGMENT(data_seg, gdt, BOOT_GDT_DATA); - - sregs->cs = code_seg; - sregs->ds = data_seg; - sregs->es = data_seg; - sregs->fs = data_seg; - sregs->gs = data_seg; - sregs->ss = data_seg; -} - -static void setup_system(int vcpufd, uint8_t *mem) -{ - struct kvm_sregs sregs; - int ret; - - /* Set all cpu/mem system structures */ - ret = ioctl(vcpufd, KVM_GET_SREGS, &sregs); - if (ret == -1) - err(1, "KVM: ioctl (GET_SREGS) failed"); - - setup_system_gdt(&sregs, mem, BOOT_GDT); - setup_system_page_tables(&sregs, mem); - setup_system_64bit(&sregs); - setup_system_sse(&sregs); - - ret = ioctl(vcpufd, KVM_SET_SREGS, &sregs); - if (ret == -1) - err(1, "KVM: ioctl (SET_SREGS) failed"); -} - - -static void setup_cpuid(int kvm, int vcpufd) -{ - struct kvm_cpuid2 *kvm_cpuid; - int max_entries = 
100; - - kvm_cpuid = calloc(1, sizeof(*kvm_cpuid) + - max_entries * sizeof(*kvm_cpuid->entries)); - kvm_cpuid->nent = max_entries; - - if (ioctl(kvm, KVM_GET_SUPPORTED_CPUID, kvm_cpuid) < 0) - err(1, "KVM: ioctl (GET_SUPPORTED_CPUID) failed"); - - if (ioctl(vcpufd, KVM_SET_CPUID2, kvm_cpuid) < 0) - err(1, "KVM: ioctl (SET_CPUID2) failed"); -} - -void ukvm_port_puts(uint8_t *mem, uint64_t paddr) -{ - GUEST_CHECK_PADDR(paddr, GUEST_SIZE, sizeof (struct ukvm_puts)); - struct ukvm_puts *p = (struct ukvm_puts *)(mem + paddr); - - GUEST_CHECK_PADDR(p->data, GUEST_SIZE, p->len); - assert(write(1, mem + p->data, p->len) != -1); -} - -void ukvm_port_poll(uint8_t *mem, uint64_t paddr) -{ - GUEST_CHECK_PADDR(paddr, GUEST_SIZE, sizeof (struct ukvm_poll)); - struct ukvm_poll *t = (struct ukvm_poll *)(mem + paddr); - struct timespec ts; - int rc, i, num_fds = 0; - struct pollfd fds[NUM_MODULES]; /* we only support at most one - * instance per module for now - */ - - for (i = 0; i < NUM_MODULES; i++) { - int fd = modules[i]->get_fd(); - - if (fd) { - fds[num_fds].fd = fd; - fds[num_fds].events = POLLIN; - num_fds += 1; - } - } - - ts.tv_sec = t->timeout_nsecs / 1000000000ULL; - ts.tv_nsec = t->timeout_nsecs % 1000000000ULL; - - /* - * Guest execution is blocked during the ppoll() call, note that - * interrupts will not be injected. - */ - do { - rc = ppoll(fds, num_fds, &ts, NULL); - } while (rc == -1 && errno == EINTR); - assert(rc >= 0); - t->ret = rc; -} - -static int vcpu_loop(struct kvm_run *run, int vcpufd, uint8_t *mem) -{ - int ret; - - /* Repeatedly run code and handle VM exits. 
*/ - while (1) { - int i, handled = 0; - - ret = ioctl(vcpufd, KVM_RUN, NULL); - if (ret == -1 && errno == EINTR) - continue; - if (ret == -1) { - if (errno == EFAULT) { - struct kvm_regs regs; - ret = ioctl(vcpufd, KVM_GET_REGS, ®s); - if (ret == -1) - err(1, "KVM: ioctl (GET_REGS) failed after guest fault"); - errx(1, "KVM: host/guest translation fault: rip=0x%llx", - regs.rip); - } - else - err(1, "KVM: ioctl (RUN) failed"); - } - - for (i = 0; i < NUM_MODULES; i++) { - if (!modules[i]->handle_exit(run, vcpufd, mem)) { - handled = 1; - break; - } - } - - if (handled) - continue; - - switch (run->exit_reason) { - case KVM_EXIT_HLT: - /* Guest has halted the CPU, this is considered as a normal exit. */ - return 0; - - case KVM_EXIT_IO: { - if (run->io.direction != KVM_EXIT_IO_OUT - || run->io.size != 4) - errx(1, "Invalid guest port access: port=0x%x", run->io.port); - - uint64_t paddr = - GUEST_PIO32_TO_PADDR((uint8_t *)run + run->io.data_offset); - - switch (run->io.port) { - case UKVM_PORT_PUTS: - ukvm_port_puts(mem, paddr); - break; - case UKVM_PORT_POLL: - ukvm_port_poll(mem, paddr); - break; - default: - errx(1, "Invalid guest port access: port=0x%x", run->io.port); - } - break; - } - - case KVM_EXIT_FAIL_ENTRY: - errx(1, "KVM: entry failure: hw_entry_failure_reason=0x%llx", - run->fail_entry.hardware_entry_failure_reason); - - case KVM_EXIT_INTERNAL_ERROR: - errx(1, "KVM: internal error exit: suberror=0x%x", - run->internal.suberror); - - default: - errx(1, "KVM: unhandled exit: exit_reason=0x%x", run->exit_reason); - } - } -} - -int setup_modules(int vcpufd, uint8_t *mem) -{ - int i; - - for (i = 0; i < NUM_MODULES; i++) { - if (modules[i]->setup(vcpufd, mem)) { - warnx("Module `%s' setup failed", modules[i]->name); - warnx("Please check you have correctly specified:\n %s", - modules[i]->usage()); - return -1; - } - } - return 0; -} - -void sig_handler(int signo) -{ - errx(1, "Exiting on signal %d", signo); -} - -static void usage(const char *prog) -{ - 
int m; - - fprintf(stderr, "usage: %s [ CORE OPTIONS ] [ MODULE OPTIONS ] [ -- ] " - "KERNEL [ ARGS ]\n", prog); - fprintf(stderr, "KERNEL is the filename of the unikernel to run.\n"); - fprintf(stderr, "ARGS are optional arguments passed to the unikernel.\n"); - fprintf(stderr, "Core options:\n"); - fprintf(stderr, " --help (display this help)\n"); - fprintf(stderr, "Compiled-in module options:\n"); - for (m = 0; m < NUM_MODULES; m++) - fprintf(stderr, " %s\n", modules[m]->usage()); - if (!m) - fprintf(stderr, " (none)\n"); - exit(1); -} - -int main(int argc, char **argv) -{ - int kvm, vmfd, vcpufd, ret; - uint8_t *mem; - struct kvm_run *run; - size_t mmap_size; - uint64_t elf_entry; - uint64_t kernel_end; - const char *prog; - const char *elffile; - int matched; - - prog = basename(*argv); - argc--; - argv++; - - while (*argv && *argv[0] == '-') { - int j; - - if (strcmp("--help", *argv) == 0) - usage(prog); - - if (strcmp("--", *argv) == 0) { - /* Consume and stop arg processing */ - argc--; - argv++; - break; - } - - matched = 0; - for (j = 0; j < NUM_MODULES; j++) { - if (modules[j]->handle_cmdarg(*argv) == 0) { - /* Handled by module, consume and go on to next arg */ - matched = 1; - argc--; - argv++; - break; - } - } - if (!matched) { - warnx("Invalid option: `%s'", *argv); - usage(prog); - } - } - - /* At least one non-option argument required */ - if (*argv == NULL) { - warnx("Missing KERNEL operand"); - usage(prog); - } - elffile = *argv; - argc--; - argv++; - - struct sigaction sa; - memset (&sa, 0, sizeof (struct sigaction)); - sa.sa_handler = sig_handler; - sigfillset(&sa.sa_mask); - if (sigaction(SIGINT, &sa, NULL) == -1) - err(1, "Could not install signal handler"); - if (sigaction(SIGTERM, &sa, NULL) == -1) - err(1, "Could not install signal handler"); - - kvm = open("/dev/kvm", O_RDWR | O_CLOEXEC); - if (kvm == -1) - err(1, "Could not open: /dev/kvm"); - - /* Make sure we have the stable version of the API */ - ret = ioctl(kvm, KVM_GET_API_VERSION, 
NULL); - if (ret == -1) - err(1, "KVM: ioctl (GET_API_VERSION) failed"); - if (ret != 12) - errx(1, "KVM: API version is %d, ukvm requires version 12", ret); - - vmfd = ioctl(kvm, KVM_CREATE_VM, 0); - if (vmfd == -1) - err(1, "KVM: ioctl (CREATE_VM) failed"); - - /* - * TODO If the guest size is larger than ~4GB, we need two region - * slots: one before the pci gap, and one after it. - * Reference: kvmtool x86/kvm.c:kvm__init_ram() - */ - assert(GUEST_SIZE < KVM_32BIT_GAP_START); - - /* Allocate GUEST_SIZE page-aligned guest memory. */ - mem = mmap(NULL, GUEST_SIZE, PROT_READ | PROT_WRITE, - MAP_SHARED | MAP_ANONYMOUS, -1, 0); - if (mem == MAP_FAILED) - err(1, "Error allocating guest memory"); - - load_code(elffile, mem, &elf_entry, &kernel_end); - - struct kvm_userspace_memory_region region = { - .slot = 0, - .guest_phys_addr = 0, - .memory_size = GUEST_SIZE, - .userspace_addr = (uint64_t) mem, - }; - - ret = ioctl(vmfd, KVM_SET_USER_MEMORY_REGION, ®ion); - if (ret == -1) - err(1, "KVM: ioctl (SET_USER_MEMORY_REGION) failed"); - - - /* enabling this seems to mess up our receiving of hlt instructions */ - /* ret = ioctl(vmfd, KVM_CREATE_IRQCHIP); */ - /* if (ret == -1) */ - /* err(1, "KVM_CREATE_IRQCHIP"); */ - - vcpufd = ioctl(vmfd, KVM_CREATE_VCPU, 0); - if (vcpufd == -1) - err(1, "KVM: ioctl (CREATE_VCPU) failed"); - - /* Setup x86 system registers and memory. */ - setup_system(vcpufd, mem); - - /* Setup ukvm_boot_info and command line */ - setup_boot_info(mem, GUEST_SIZE, kernel_end, argc, argv); - - /* - * Initialize registers: instruction pointer for our code, addends, - * and initial flags required by x86 architecture. 
- * Arguments to the kernel main are passed using the x86_64 calling - * convention: RDI, RSI, RDX, RCX, R8, and R9 - */ - struct kvm_regs regs = { - .rip = elf_entry, - .rax = 2, - .rbx = 2, - .rflags = 0x2, - .rsp = GUEST_SIZE - 8, /* x86_64 ABI requires ((rsp + 8) % 16) == 0 */ - .rdi = BOOT_INFO, /* size arg in kernel main */ - }; - ret = ioctl(vcpufd, KVM_SET_REGS, ®s); - if (ret == -1) - err(1, "KVM: ioctl (SET_REGS) failed"); - - - /* Map the shared kvm_run structure and following data. */ - ret = ioctl(kvm, KVM_GET_VCPU_MMAP_SIZE, NULL); - if (ret == -1) - err(1, "KVM: ioctl (GET_VCPU_MMAP_SIZE) failed"); - mmap_size = ret; - if (mmap_size < sizeof(*run)) - errx(1, "KVM: invalid VCPU_MMAP_SIZE: %zd", mmap_size); - run = - mmap(NULL, mmap_size, PROT_READ | PROT_WRITE, MAP_SHARED, vcpufd, - 0); - if (run == MAP_FAILED) - err(1, "KVM: VCPU mmap failed"); - - setup_cpuid(kvm, vcpufd); - - if (setup_modules(vcpufd, mem)) - exit(1); - - return vcpu_loop(run, vcpufd, mem); -}